Skip to content

Commit b9dde0f

Browse files
authored
Merge pull request #107 from j0k3r/backport/encode
[1.x] Backport character decoding regression
2 parents 109a226 + a21742b commit b9dde0f

File tree

4 files changed

+32
-40
lines changed

4 files changed

+32
-40
lines changed

.github/workflows/coding-standards.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ env:
1616
jobs:
1717
coding-standards:
1818
name: "CS Fixer & PHPStan"
19-
runs-on: "ubuntu-22.04"
19+
runs-on: "ubuntu-latest"
2020

2121
strategy:
2222
matrix:

.github/workflows/continuous-integration.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ env:
1616
jobs:
1717
phpunit:
1818
name: "PHPUnit (PHP ${{ matrix.php }})"
19-
runs-on: "ubuntu-22.04"
19+
runs-on: "ubuntu-latest"
2020

2121
strategy:
2222
matrix:
@@ -66,7 +66,7 @@ jobs:
6666

6767
phpunit-coverage:
6868
name: "PHPUnit coverage (PHP ${{ matrix.php }})"
69-
runs-on: "ubuntu-22.04"
69+
runs-on: "ubuntu-latest"
7070

7171
strategy:
7272
matrix:
@@ -117,7 +117,7 @@ jobs:
117117

118118
phpunit-lowest:
119119
name: "PHPUnit lowest deps (PHP ${{ matrix.php }})"
120-
runs-on: "ubuntu-22.04"
120+
runs-on: "ubuntu-latest"
121121

122122
strategy:
123123
matrix:
@@ -158,7 +158,7 @@ jobs:
158158

159159
phpunit-composerv2:
160160
name: "PHPUnit with Composer v1 (PHP ${{ matrix.php }})"
161-
runs-on: "ubuntu-20.04"
161+
runs-on: "ubuntu-latest"
162162

163163
strategy:
164164
matrix:

src/Readability.php

Lines changed: 7 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1435,7 +1435,7 @@ private function loadHtml()
14351435
unset($tidy);
14361436
}
14371437

1438-
$this->html = self::ensureMetaCharset((string) $this->html);
1438+
$this->html = self::entitizeNonAscii((string) $this->html);
14391439

14401440
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
14411441
$this->dom = (new HTML5())->loadHTML($this->html);
@@ -1455,43 +1455,19 @@ private function loadHtml()
14551455
}
14561456

14571457
/**
1458-
* Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
1458+
* Converts non-ASCII UTF-8 characters to numeric HTML entities.
14591459
*
14601460
* `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
14611461
* This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
1462-
* Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
14631462
*
14641463
* @param string $html UTF-8 encoded document
14651464
*/
1466-
private static function ensureMetaCharset($html)
1465+
private static function entitizeNonAscii($html)
14671466
{
1468-
$charsetTag = '<meta charset="utf-8">';
1467+
$convmap = [
1468+
0x80, 0x1FFFFF, 0, 0x10FFFF,
1469+
];
14691470

1470-
// Only look at first 1024 bytes since, according to HTML5 specification,
1471-
// that’s where <meta> elements declaring a character encoding must be located.
1472-
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
1473-
$start = substr($html, 0, 1000);
1474-
1475-
if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
1476-
// <meta> tag is already present, no need for modification.
1477-
return $html;
1478-
}
1479-
1480-
if (1 === preg_match('/<head[^>]*>/i', $start)) {
1481-
// <head> tag was located, <meta> tags go there.
1482-
$html = preg_replace('/<head[^>]*>/i', '$0' . $charsetTag, $html, 1);
1483-
1484-
return $html;
1485-
}
1486-
1487-
if (1 === preg_match('/<html[^>]*>/i', $start)) {
1488-
// <html> tag was located, let’s put it inside and have parser create <head>.
1489-
$html = preg_replace('/<html[^>]*>/i', '$0' . $charsetTag, $html, 1);
1490-
1491-
return $html;
1492-
}
1493-
1494-
// Fallback – just plop the <meta> at the start of the fragment.
1495-
return $charsetTag . $html;
1471+
return mb_encode_numericentity($html, $convmap, 'utf8', true);
14961472
}
14971473
}

tests/ReadabilityTest.php

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -486,28 +486,42 @@ public function testWithWipedBody()
486486
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
487487
}
488488

489+
// https://github.com/wallabag/wallabag/issues/8158
490+
public function testCharsetAfterTitle()
491+
{
492+
$readability = $this->getReadability('<!DOCTYPE html><html lang="et"><head><title>Tõde ja õigus I</title> <meta charset="utf-8"></head><body><p>See oli läinud aastasaja kolmanda veerandi lõpul. Päike lähenes silmapiirile, seistes sedavõrd madalas, et enam ei ulatunud valgustama ei mäkke ronivat hobust, kes puutelgedega vankrit vedas, ei vankril istuvat noort naist ega ka ligi kolmekümnelist meest, kes kõndis vankri kõrval.</p></body></html>', 'https://et.wikisource.org/wiki/T%C3%B5de_ja_%C3%B5igus_I/I');
493+
$readability->convertLinksToFootnotes = true;
494+
$res = $readability->init();
495+
496+
$this->assertTrue($res);
497+
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
498+
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
499+
$this->assertSame('Tõde ja õigus I', $readability->getTitle()->getInnerHtml());
500+
$this->assertStringContainsString('Päike lähenes', $readability->getContent()->getInnerHtml());
501+
}
502+
489503
/**
490504
* @return array<string, array{0: string, 1: string, 2?: bool}>
491505
*/
492506
public function dataForHtmlLang()
493507
{
494508
return [
495509
'meta' => [
496-
'<html lang="fr"><head><meta charset="utf-8"></head><body><article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article></body></html>',
510+
'<html lang="fr"><head><meta charset="utf-8"></head><body><article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article></body></html>',
497511
'fr',
498512
],
499513
'head' => [
500-
'<html lang="fr"><head><title>Foo</title></head><body><article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article></body></html>',
514+
'<html lang="fr"><head><title>Foo</title></head><body><article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article></body></html>',
501515
'fr',
502516
],
503517
'headless' => [
504-
'<html lang="fr"><body><article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article></body></html>',
518+
'<html lang="fr"><body><article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article></body></html>',
505519
'fr',
506520
// tidy would add <head> tag.
507521
false,
508522
],
509523
'fragment' => [
510-
'<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>',
524+
'<article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article>',
511525
'',
512526
// tidy would add <html>.
513527
false,
@@ -526,6 +540,8 @@ public function testHtmlLang($html, $lang, $useTidy = true)
526540
$this->assertTrue($res);
527541
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
528542
$this->assertSame($lang, $readability->dom->documentElement->getAttribute('lang'));
543+
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
544+
$this->assertStringContainsString('êtres', $readability->getContent()->getInnerHtml());
529545
}
530546

531547
private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)

0 commit comments

Comments
 (0)