Skip to content

Commit 3a979c9

Browse files
jtojnar authored and j0k3r committed
Fix character decoding regression when title precedes meta[charset]
Because of PHP 8.2 deprecation, in f14428e, we stopped converting non-ASCII characters to HTML entities. Instead, we started to explicitly insert `meta[charset]` tag at the start of the document. Later, we discovered that was breaking `html[lang]` so, in efbbc86, we made the insertion smarter. One of the improvements was that it would not insert the `meta[charset]` tag when it was already present. That, however, broke websites that had `title` tag before `meta[charset]`. On those, libxml2 would decode the `title` contents as ISO-8859-1. We could improve the logic (e.g. check that there is no text content before `meta[charset]`) or insert the tag unconditionally but it will probably be simplest to just go back to converting the non-ASCII characters to entities, just using a non-deprecated function variant.
1 parent 7f304d0 commit 3a979c9

File tree

2 files changed

+62
-31
lines changed

2 files changed

+62
-31
lines changed

src/Readability.php

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1435,7 +1435,7 @@ private function loadHtml()
14351435
unset($tidy);
14361436
}
14371437

1438-
$this->html = self::ensureMetaCharset((string) $this->html);
1438+
$this->html = self::entitizeNonAscii((string) $this->html);
14391439

14401440
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
14411441
$this->dom = (new HTML5())->loadHTML($this->html);
@@ -1455,43 +1455,19 @@ private function loadHtml()
14551455
}
14561456

14571457
/**
1458-
* Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
1458+
* Converts non-ASCII UTF-8 characters to numeric HTML entities.
14591459
*
14601460
* `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
14611461
* This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
1462-
* Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
14631462
*
14641463
* @param string $html UTF-8 encoded document
14651464
*/
1466-
private static function ensureMetaCharset($html)
1465+
private static function entitizeNonAscii($html)
14671466
{
1468-
$charsetTag = '<meta charset="utf-8">';
1467+
$convmap = [
1468+
0x80, 0x1FFFFF, 0, 0x10FFFF,
1469+
];
14691470

1470-
// Only look at first 1024 bytes since, according to HTML5 specification,
1471-
// that’s where <meta> elements declaring a character encoding must be located.
1472-
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
1473-
$start = substr($html, 0, 1000);
1474-
1475-
if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
1476-
// <meta> tag is already present, no need for modification.
1477-
return $html;
1478-
}
1479-
1480-
if (1 === preg_match('/<head[^>]*>/i', $start)) {
1481-
// <head> tag was located, <meta> tags go there.
1482-
$html = preg_replace('/<head[^>]*>/i', '$0' . $charsetTag, $html, 1);
1483-
1484-
return $html;
1485-
}
1486-
1487-
if (1 === preg_match('/<html[^>]*>/i', $start)) {
1488-
// <html> tag was located, let’s put it inside and have parser create <head>.
1489-
$html = preg_replace('/<html[^>]*>/i', '$0' . $charsetTag, $html, 1);
1490-
1491-
return $html;
1492-
}
1493-
1494-
// Fallback – just plop the <meta> at the start of the fragment.
1495-
return $charsetTag . $html;
1471+
return mb_encode_numericentity($html, $convmap, 'utf8', true);
14961472
}
14971473
}

tests/ReadabilityTest.php

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,61 @@ public function testWithWipedBody()
486486
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
487487
}
488488

489+
public function dataForVisibleNode(): array
490+
{
491+
return [
492+
'visible node' => [
493+
'<div>' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
494+
true,
495+
],
496+
'display=none' => [
497+
'<div style="display:none;">' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
498+
false,
499+
],
500+
'display=inline' => [
501+
'<div style="display:inline;">' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
502+
true,
503+
],
504+
'hidden attribute' => [
505+
'<div hidden>' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
506+
false,
507+
],
508+
'missing display' => [
509+
'<div style="color:#ccc;">' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>',
510+
true,
511+
],
512+
];
513+
}
514+
515+
/**
516+
* @dataProvider dataForVisibleNode
517+
*/
518+
public function testVisibleNode(string $content, bool $shouldBeVisible): void
519+
{
520+
$readability = $this->getReadability($content, 'http://0.0.0.0');
521+
$res = $readability->init();
522+
523+
if ($shouldBeVisible) {
524+
$this->assertStringContainsString('WONDERFUL content', $readability->getContent()->getInnerHtml());
525+
} else {
526+
$this->assertStringNotContainsString('WONDERFUL content', $readability->getContent()->getInnerHtml());
527+
}
528+
}
529+
530+
// https://github.com/wallabag/wallabag/issues/8158
531+
public function testCharsetAfterTitle(): void
532+
{
533+
$readability = $this->getReadability('<!DOCTYPE html><html lang="et"><head><title>Tõde ja õigus I</title> <meta charset="utf-8"></head><body><p>See oli läinud aastasaja kolmanda veerandi lõpul. Päike lähenes silmapiirile, seistes sedavõrd madalas, et enam ei ulatunud valgustama ei mäkke ronivat hobust, kes puutelgedega vankrit vedas, ei vankril istuvat noort naist ega ka ligi kolmekümnelist meest, kes kõndis vankri kõrval.</p></body></html>', 'https://et.wikisource.org/wiki/T%C3%B5de_ja_%C3%B5igus_I/I');
534+
$readability->convertLinksToFootnotes = true;
535+
$res = $readability->init();
536+
537+
$this->assertTrue($res);
538+
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
539+
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
540+
$this->assertSame('Tõde ja õigus I', $readability->getTitle()->getInnerHtml());
541+
$this->assertStringContainsString('Päike lähenes', $readability->getContent()->getInnerHtml());
542+
}
543+
489544
/**
490545
* @return array<string, array{0: string, 1: string, 2?: bool}>
491546
*/

0 commit comments

Comments
 (0)