Skip to content

Commit efbbc86

Browse files
committed
Fix discarding html[lang]
`DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag. This means that UTF-8-encoded HTML fragments, such as those coming from the JSON-LD `articleBody` field, would be parsed with an incorrect encoding. In f14428e, we tried to resolve this by putting a `meta[charset]` tag at the start of the HTML fragment. Unfortunately, it turns out that this causes the parser to auto-insert an `html` element, losing the attributes of the original `html` tag. Let’s try to insert the `meta[charset]` tag into the proper place in the HTML document instead. We do not need to use the same trick with `JSLikeHTMLElement::__set`. That expects smaller HTML fragments, not `html` documents, so creating `html` and `head` elements will not be a problem.
1 parent 541fab3 commit efbbc86

File tree

2 files changed

+84
-1
lines changed

2 files changed

+84
-1
lines changed

src/Readability.php

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1419,7 +1419,7 @@ private function loadHtml(): void
14191419
unset($tidy);
14201420
}
14211421

1422-
$this->html = '<meta charset="utf-8">' . (string) $this->html;
1422+
$this->html = self::ensureMetaCharset((string) $this->html);
14231423

14241424
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
14251425
$this->dom = (new HTML5())->loadHTML($this->html);
@@ -1507,4 +1507,45 @@ private function isNodeVisible(\DOMElement $node): bool
15071507
)
15081508
&& !$node->hasAttribute('hidden');
15091509
}
1510+
1511+
/**
 * Tries to insert a `meta[charset]` tag into the proper place in the passed HTML document.
 *
 * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
 * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field
 * would be parsed with incorrect encoding.
 * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause
 * the parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
 *
 * Insertion point, in order of preference:
 *  1. nowhere, when a `<meta … charset` declaration is already present,
 *  2. right after the opening `<head>` tag,
 *  3. right after the opening `<html>` tag (the parser will create `<head>` itself),
 *  4. at the very start of the fragment.
 *
 * @param string $html UTF-8 encoded document
 *
 * @return string the document, guaranteed to carry a charset declaration near its start
 */
private static function ensureMetaCharset(string $html): string
{
    $charsetTag = '<meta charset="utf-8">';

    // Only look at first 1024 bytes since, according to HTML5 specification,
    // that’s where <meta> elements declaring a character encoding must be located.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
    $start = substr($html, 0, 1024);

    if (1 === preg_match('/<meta\s[^>]*charset/i', $start)) {
        // <meta> tag is already present, no need for modification.
        return $html;
    }

    // The tag name must be followed by whitespace, `/` or `>` so that look-alike
    // elements such as <header> are not mistaken for <head>.
    foreach (['head', 'html'] as $tagName) {
        $pattern = '/<' . $tagName . '(?:[\s\/][^>]*)?>/i';

        if (1 === preg_match($pattern, $start, $matches, \PREG_OFFSET_CAPTURE)) {
            [$openingTag, $offset] = $matches[0];

            // Splice the <meta> directly behind the opening tag; working with the byte
            // offset avoids a second regex pass over the whole document.
            return substr_replace($html, $charsetTag, $offset + strlen($openingTag), 0);
        }
    }

    // Fallback – just plop the <meta> at the start of the fragment.
    return $charsetTag . $html;
}
15101551
}

tests/ReadabilityTest.php

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,48 @@ public function testVisibleNode(string $content, bool $shouldBeVisible): void
529529
}
530530
}
531531

532+
/**
 * Data sets for testHtmlLang: input HTML, the expected `lang` attribute of the
 * parsed document element, and optionally whether tidy should be used.
 *
 * @return array<string, array{0: string, 1: string, 2?: bool}>
 */
public function dataForHtmlLang(): array
{
    // Every data set shares the same article markup; only the wrapping differs.
    $article = '<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>';

    $withMeta = [
        '<html lang="fr"><head><meta charset="utf-8"></head><body>' . $article . '</body></html>',
        'fr',
    ];

    $withHead = [
        '<html lang="fr"><head><title>Foo</title></head><body>' . $article . '</body></html>',
        'fr',
    ];

    $headless = [
        '<html lang="fr"><body>' . $article . '</body></html>',
        'fr',
        // tidy would add <head> tag.
        false,
    ];

    $fragment = [
        $article,
        '',
        // tidy would add <html>.
        false,
    ];

    return [
        'meta' => $withMeta,
        'head' => $withHead,
        'headless' => $headless,
        'fragment' => $fragment,
    ];
}
560+
561+
/**
 * The `lang` attribute of the source `html` element must survive parsing.
 *
 * @dataProvider dataForHtmlLang
 */
public function testHtmlLang(string $html, string $lang, bool $useTidy = true): void
{
    $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy);

    // Parsing has to succeed before the resulting DOM can be inspected.
    $this->assertTrue($readability->init());

    $dom = $readability->dom;
    $this->assertInstanceOf(\DOMDocument::class, $dom);

    $actualLang = $dom->documentElement->getAttribute('lang');
    $this->assertSame($lang, $actualLang);
}
573+
532574
private function getReadability(string $html, ?string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability
533575
{
534576
$readability = new Readability($html, $url, $parser, $useTidy);

0 commit comments

Comments
 (0)