-
-
Notifications
You must be signed in to change notification settings - Fork 312
/
Copy pathDocument.php
152 lines (129 loc) · 4.37 KB
/
Document.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
<?php
declare(strict_types = 1);
namespace Embed;
use DOMDocument;
use DOMNode;
use DOMXPath;
use HtmlParser\Parser;
use Psr\Http\Message\UriInterface;
use RuntimeException;
use Symfony\Component\CssSelector\CssSelectorConverter;
/**
 * Wraps the HTML body of an extractor's response in a DOMDocument and exposes
 * XPath / CSS-selector based helpers to query and prune it.
 */
class Document
{
    /** Lazily created converter shared by all instances (see cssToXpath()). */
    private static CssSelectorConverter $cssConverter;
    private Extractor $extractor;
    private DOMDocument $document;
    private DOMXPath $xpath;

    /**
     * Reads the response body from the extractor, works out the charset
     * (Content-Type header first, then a "charset=" declaration found in the
     * markup) and parses the HTML. An empty body yields an empty DOMDocument.
     */
    public function __construct(Extractor $extractor)
    {
        $this->extractor = $extractor;
        $response = $extractor->getResponse();

        $html = (string) $response->getBody();
        // Insert a newline before every <br> tag (presumably so that text
        // extracted from the DOM later keeps its line breaks).
        $html = str_replace('<br>', "\n<br>", $html);
        $html = str_replace('<br ', "\n<br ", $html);
        $hasHtml = !empty($html);

        // The HTTP header wins; the markup is only inspected as a fallback.
        $encoding = self::detectCharset($response->getHeaderLine('content-type'));
        if ($encoding === null && $hasHtml) {
            $encoding = self::detectCharset($html);
        }

        $this->document = $hasHtml ? Parser::parse($html, $encoding) : new DOMDocument();
        $this->initXPath();
    }

    /**
     * Extracts the first "charset=..." value from $text and returns it only
     * when mbstring recognises it as an encoding name; otherwise null.
     */
    private static function detectCharset(string $text): ?string
    {
        preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $text, $match);
        if (empty($match[1])) {
            return null;
        }

        $encoding = trim($match[1], ',');
        if ($encoding === '') {
            // Nothing left after trimming: skip the mbstring lookup, which
            // would only warn (PHP 7) or throw (PHP 8) on an empty name.
            return null;
        }

        try {
            // PHP 7 signals an unknown encoding with false, PHP 8 with ValueError.
            if (mb_encoding_aliases($encoding) === false) {
                return null;
            }
        } catch (\ValueError $exception) {
            return null;
        }

        return $encoding;
    }

    /**
     * (Re)creates the XPath engine for the current document and enables the
     * php:* function namespace used by buildQuery().
     */
    private function initXPath(): void
    {
        $this->xpath = new DOMXPath($this->document);
        $this->xpath->registerNamespace('php', 'http://php.net/xpath');
        $this->xpath->registerPhpFunctions();
    }

    /**
     * Clones the underlying DOM as well, so the copy can be modified
     * independently, and binds a fresh XPath engine to the cloned document.
     */
    public function __clone()
    {
        $this->document = clone $this->document;
        $this->initXPath();
    }

    /**
     * Removes from the document every node matched by the XPath query.
     */
    public function remove(string $query): void
    {
        // Snapshot the node list first: detaching nodes while walking the
        // live list returned by query() would disturb the iteration.
        $nodes = iterator_to_array($this->xpath->query($query), false);

        foreach ($nodes as $node) {
            $parent = $node->parentNode;
            // Nodes without a parent (e.g. the document node matched by "/")
            // cannot be detached; skip them instead of failing on null.
            if ($parent !== null) {
                $parent->removeChild($node);
            }
        }
    }

    /**
     * Removes from the document every node matched by the CSS selector.
     */
    public function removeCss(string $query): void
    {
        $this->remove(self::cssToXpath($query));
    }

    /**
     * Returns the underlying DOMDocument (the live instance, not a copy).
     */
    public function getDocument(): DOMDocument
    {
        return $this->document;
    }

    /**
     * Helper to build XPath queries easily and case-insensitively.
     *
     * Each attribute becomes a predicate comparing the lower-cased attribute
     * of the node with the lower-cased expected value.
     *
     * @param array<string, string> $attributes attribute name => expected value
     */
    private static function buildQuery(string $startQuery, array $attributes): string
    {
        $selector = [$startQuery];

        foreach ($attributes as $name => $value) {
            $selector[] = sprintf(
                '[php:functionString("strtolower", @%s)=%s]',
                $name,
                self::xpathLiteral(mb_strtolower($value))
            );
        }

        return implode('', $selector);
    }

    /**
     * Renders $value as an XPath 1.0 string literal.
     *
     * XPath 1.0 has no escape sequences, so the quote style is chosen to fit
     * the value, and a value holding both quote kinds is assembled with concat().
     */
    private static function xpathLiteral(string $value): string
    {
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }

        $pieces = array_map(
            static fn (string $piece): string => '"' . $piece . '"',
            explode('"', $value)
        );

        // Re-insert each double quote as its own single-quoted literal.
        return 'concat(' . implode(', \'"\', ', $pieces) . ')';
    }

    /**
     * Selects elements in the DOM with an XPath query.
     *
     * @param array<string, string>|null $attributes optional attribute filters
     *                                               matched case-insensitively
     * @param DOMNode|null               $context    node the query is evaluated against
     */
    public function select(string $query, ?array $attributes = null, ?DOMNode $context = null): QueryResult
    {
        if (!empty($attributes)) {
            $query = self::buildQuery($query, $attributes);
        }

        return new QueryResult($this->xpath->query($query, $context), $this->extractor);
    }

    /**
     * Selects elements in the DOM using a CSS selector.
     */
    public function selectCss(string $query, ?DOMNode $context = null): QueryResult
    {
        return $this->select(self::cssToXpath($query), null, $context);
    }

    /**
     * Shortcut to select a <link> element and return its href.
     *
     * @param array<string, string> $extra additional attribute filters; a "rel"
     *                                     key here is ignored in favour of $rel
     */
    public function link(string $rel, array $extra = []): ?UriInterface
    {
        return $this->select('.//link', ['rel' => $rel] + $extra)->url('href');
    }

    /**
     * Serializes the current document through the HTML parser's stringify().
     */
    public function __toString(): string
    {
        return Parser::stringify($this->getDocument());
    }

    /**
     * Converts a CSS selector to XPath, creating the shared converter on first use.
     *
     * @throws RuntimeException when symfony/css-selector is not installed
     */
    private static function cssToXpath(string $selector): string
    {
        if (!isset(self::$cssConverter)) {
            if (!class_exists(CssSelectorConverter::class)) {
                throw new RuntimeException('You need to install "symfony/css-selector" to use css selectors');
            }

            self::$cssConverter = new CssSelectorConverter();
        }

        return self::$cssConverter->toXpath($selector);
    }
}