diff --git a/src/Readability.php b/src/Readability.php
index 1d2d2f5..05a6c9f 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -1396,7 +1396,10 @@ private function loadHtml()
         $this->logger->debug('Parsing URL: ' . $this->url);
 
         if ($this->url) {
-            $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
+            $host = parse_url($this->url, \PHP_URL_HOST);
+            if (null !== $host) {
+                $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', $host), ['.' => '\.']) . '/';
+            }
         }
 
         mb_internal_encoding('UTF-8');
@@ -1432,7 +1435,7 @@ private function loadHtml()
             unset($tidy);
         }
 
-        $this->html = '<meta charset="utf-8">' . (string) $this->html;
+        $this->html = self::ensureMetaCharset((string) $this->html);
 
         if ('html5lib' === $this->parser || 'html5' === $this->parser) {
             $this->dom = (new HTML5())->loadHTML($this->html);
@@ -1450,4 +1453,45 @@ private function loadHtml()
 
         $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
     }
+
+    /**
+     * Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
+     *
+     * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
+     * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
+     * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
+     *
+     * @param string $html UTF-8 encoded document
+     */
+    private static function ensureMetaCharset($html)
+    {
+        $charsetTag = '<meta charset="utf-8">';
+
+        // Only look at first 1024 bytes since, according to HTML5 specification,
+        // that’s where <meta> elements declaring a character encoding must be located.
+        // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
+        $start = substr($html, 0, 1000);
+
+        if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
+            // <meta> tag is already present, no need for modification.
+            return $html;
+        }
+
+        if (1 === preg_match('/<head[^>]*>/i', $start)) {
+            // <head> tag was located, <meta> tags go there.
+            $html = preg_replace('/<head[^>]*>/i', '$0' . $charsetTag, $html, 1);
+
+            return $html;
+        }
+
+        if (1 === preg_match('/<html[^>]*>/i', $start)) {
+            // <html> tag was located, let’s put it inside and have parser create <head>.
+            $html = preg_replace('/<html[^>]*>/i', '$0' . $charsetTag, $html, 1);
+
+            return $html;
+        }
+
+        // Fallback – just plop the <meta> at the start of the fragment.
+        return $charsetTag . $html;
+    }
 }
diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php
index b190a8e..fc9f6d5 100644
--- a/tests/ReadabilityTest.php
+++ b/tests/ReadabilityTest.php
@@ -19,7 +19,7 @@ public function testConstructDefault()
         $readability = $this->getReadability('');
 
         $this->assertNull($readability->url);
-        $this->assertInstanceOf('DomDocument', $readability->dom);
+        $this->assertInstanceOf(\DOMDocument::class, $readability->dom);
     }
 
     public function testConstructHtml5Parser()
@@ -27,7 +27,7 @@ public function testConstructHtml5Parser()
         $readability = $this->getReadability('