Skip to content

Commit 7413a38

Browse files
authored
Merge pull request #104 from jtojnar/html-shadowing
Fix discarding `html[lang]`
2 parents a18cd0f + efbbc86 commit 7413a38

File tree

2 files changed

+89
-27
lines changed

2 files changed

+89
-27
lines changed

src/Readability.php

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1422,7 +1422,7 @@ private function loadHtml(): void
14221422
unset($tidy);
14231423
}
14241424

1425-
$this->html = '<meta charset="utf-8">' . (string) $this->html;
1425+
$this->html = self::ensureMetaCharset((string) $this->html);
14261426

14271427
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
14281428
$this->dom = (new HTML5())->loadHTML($this->html);
@@ -1510,4 +1510,45 @@ private function isNodeVisible(\DOMElement $node): bool
15101510
)
15111511
&& !$node->hasAttribute('hidden');
15121512
}
1513+
1514+
/**
 * Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
 *
 * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
 * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
 * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
 *
 * Placement rules, in order of preference:
 *  1. a `<meta ... charset` declaration already exists near the start: the document is returned untouched;
 *  2. a `<head>` opening tag exists: the tag is inserted right after it;
 *  3. an `<html>` opening tag exists: the tag is inserted right after it;
 *  4. otherwise the tag is prepended to the fragment.
 *
 * @param string $html UTF-8 encoded document
 *
 * @return string the document, containing a charset declaration
 */
private static function ensureMetaCharset(string $html): string
{
    $charsetTag = '<meta charset="utf-8">';

    // Only look at first 1024 bytes since, according to HTML5 specification,
    // that’s where <meta> elements declaring a character encoding must be located.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
    $start = substr($html, 0, 1024);

    if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
        // <meta> tag is already present, no need for modification.
        return $html;
    }

    // Prefer <head> (that is where <meta> tags belong); failing that, put the
    // tag right inside <html> and let the parser create <head> around it.
    foreach (['head', 'html'] as $tagName) {
        // The lookahead requires the tag name to end here, so that e.g.
        // <header> is not mistaken for <head>.
        $pattern = '/<' . $tagName . '(?=[\s\/>])[^>]*>/i';

        if (1 === preg_match($pattern, $start, $match, \PREG_OFFSET_CAPTURE)) {
            // $start is a prefix of $html, so byte offsets are valid for both.
            // Inserting by offset avoids a second regex pass whose failure
            // would yield null instead of a string.
            $insertAt = $match[0][1] + strlen($match[0][0]);

            return substr_replace($html, $charsetTag, $insertAt, 0);
        }
    }

    // Fallback – just plop the <meta> at the start of the fragment.
    return $charsetTag . $html;
}
15131554
}

tests/ReadabilityTest.php

Lines changed: 47 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public function testConstructDefault(): void
2424
$readability->init();
2525

2626
$this->assertNull($readability->url);
27-
$this->assertInstanceOf('DomDocument', $readability->dom);
27+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
2828
}
2929

3030
public function testConstructHtml5Parser(): void
@@ -33,7 +33,7 @@ public function testConstructHtml5Parser(): void
3333
$readability->init();
3434

3535
$this->assertSame('http://0.0.0.0', $readability->url);
36-
$this->assertInstanceOf('DomDocument', $readability->dom);
36+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
3737
$this->assertSame('<html/>', $readability->original_html);
3838
}
3939

@@ -46,7 +46,7 @@ public function testConstructSimple(): void
4646
$readability->init();
4747

4848
$this->assertSame('http://0.0.0.0', $readability->url);
49-
$this->assertInstanceOf('DomDocument', $readability->dom);
49+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
5050
$this->assertSame('<html/>', $readability->original_html);
5151
$this->assertTrue($readability->tidied);
5252
}
@@ -60,7 +60,7 @@ public function testConstructDefaultWithoutTidy(): void
6060
$this->assertSame('', $readability->original_html);
6161
$this->assertFalse($readability->tidied);
6262

63-
$this->assertInstanceOf('DomDocument', $readability->dom);
63+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
6464
}
6565

6666
public function testConstructSimpleWithoutTidy(): void
@@ -69,7 +69,7 @@ public function testConstructSimpleWithoutTidy(): void
6969
$readability->init();
7070

7171
$this->assertSame('http://0.0.0.0', $readability->url);
72-
$this->assertInstanceOf('DomDocument', $readability->dom);
72+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
7373
$this->assertSame('<html/>', $readability->original_html);
7474
$this->assertFalse($readability->tidied);
7575
}
@@ -115,7 +115,6 @@ public function testInitDivP(): void
115115
public function testInitDiv(): void
116116
{
117117
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
118-
$readability->debug = true;
119118
$res = $readability->init();
120119

121120
$this->assertTrue($res);
@@ -129,7 +128,6 @@ public function testInitDiv(): void
129128
public function testWithFootnotes(): void
130129
{
131130
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
132-
$readability->debug = true;
133131
$readability->convertLinksToFootnotes = true;
134132
$res = $readability->init();
135133

@@ -146,7 +144,6 @@ public function testWithFootnotes(): void
146144
public function testStandardClean(): void
147145
{
148146
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
149-
$readability->debug = true;
150147
$readability->lightClean = false;
151148
$res = $readability->init();
152149

@@ -163,7 +160,6 @@ public function testStandardClean(): void
163160
public function testWithIframe(): void
164161
{
165162
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
166-
$readability->debug = true;
167163
$res = $readability->init();
168164

169165
$this->assertTrue($res);
@@ -178,7 +174,6 @@ public function testWithIframe(): void
178174
public function testWithArticle(): void
179175
{
180176
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
181-
$readability->debug = true;
182177
$res = $readability->init();
183178

184179
$this->assertTrue($res);
@@ -193,7 +188,6 @@ public function testWithArticle(): void
193188
public function testWithAside(): void
194189
{
195190
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
196-
$readability->debug = true;
197191
$res = $readability->init();
198192

199193
$this->assertTrue($res);
@@ -208,7 +202,6 @@ public function testWithAside(): void
208202
public function testWithClasses(): void
209203
{
210204
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
211-
$readability->debug = true;
212205
$res = $readability->init();
213206

214207
$this->assertTrue($res);
@@ -223,7 +216,6 @@ public function testWithClasses(): void
223216
public function testWithClassesWithoutLightClean(): void
224217
{
225218
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
226-
$readability->debug = true;
227219
$readability->lightClean = false;
228220
$res = $readability->init();
229221

@@ -239,7 +231,6 @@ public function testWithClassesWithoutLightClean(): void
239231
public function testWithTd(): void
240232
{
241233
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
242-
$readability->debug = true;
243234
$res = $readability->init();
244235

245236
$this->assertTrue($res);
@@ -252,7 +243,6 @@ public function testWithTd(): void
252243
public function testWithSameClasses(): void
253244
{
254245
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
255-
$readability->debug = true;
256246
$res = $readability->init();
257247

258248
$this->assertTrue($res);
@@ -266,7 +256,6 @@ public function testWithSameClasses(): void
266256
public function testWithScript(): void
267257
{
268258
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
269-
$readability->debug = true;
270259
$res = $readability->init();
271260

272261
$this->assertTrue($res);
@@ -280,7 +269,6 @@ public function testWithScript(): void
280269
public function testTitle(): void
281270
{
282271
$readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
283-
$readability->debug = true;
284272
$res = $readability->init();
285273

286274
$this->assertTrue($res);
@@ -294,7 +282,6 @@ public function testTitle(): void
294282
public function testTitleWithDash(): void
295283
{
296284
$readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
297-
$readability->debug = true;
298285
$res = $readability->init();
299286

300287
$this->assertTrue($res);
@@ -308,7 +295,6 @@ public function testTitleWithDash(): void
308295
public function testTitleWithDoubleDot(): void
309296
{
310297
$readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
311-
$readability->debug = true;
312298
$res = $readability->init();
313299

314300
$this->assertTrue($res);
@@ -322,7 +308,6 @@ public function testTitleWithDoubleDot(): void
322308
public function testTitleTooShortUseH1(): void
323309
{
324310
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
325-
$readability->debug = true;
326311
$res = $readability->init();
327312

328313
$this->assertTrue($res);
@@ -369,7 +354,6 @@ public function testAutoClosingIframeNotThrowingException(): void
369354
</html>';
370355

371356
$readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030');
372-
$readability->debug = true;
373357

374358
$res = $readability->init();
375359

@@ -437,7 +421,6 @@ public function testAppendIdAlreadyHere(): void
437421
</html>';
438422

439423
$readability = $this->getReadability($data, 'http://0.0.0.0');
440-
$readability->debug = true;
441424

442425
$res = $readability->init();
443426

@@ -474,7 +457,6 @@ public function testChildNodeGoneNull(): void
474457
$html = (string) file_get_contents('tests/fixtures/childNodeGoesNull.html');
475458

476459
$readability = $this->getReadability($html, 'http://0.0.0.0');
477-
$readability->debug = true;
478460
$readability->convertLinksToFootnotes = true;
479461
$res = $readability->init();
480462

@@ -487,7 +469,6 @@ public function testKeepFootnotes(): void
487469
$html = (string) file_get_contents('tests/fixtures/keepFootnotes.html');
488470

489471
$readability = $this->getReadability($html, 'http://0.0.0.0');
490-
$readability->debug = true;
491472
$res = $readability->init();
492473

493474
$this->assertTrue($res);
@@ -501,7 +482,6 @@ public function testWithWipedBody(): void
501482
$html = (string) file_get_contents('tests/fixtures/wipedBody.html');
502483

503484
$readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false);
504-
$readability->debug = true;
505485
$res = $readability->init();
506486

507487
$this->assertTrue($res);
@@ -540,7 +520,6 @@ public function dataForVisibleNode(): array
540520
public function testVisibleNode(string $content, bool $shouldBeVisible): void
541521
{
542522
$readability = $this->getReadability($content, 'http://0.0.0.0');
543-
$readability->debug = true;
544523
$res = $readability->init();
545524

546525
if ($shouldBeVisible) {
@@ -550,6 +529,48 @@ public function testVisibleNode(string $content, bool $shouldBeVisible): void
550529
}
551530
}
552531

532+
/**
 * Data sets for testHtmlLang(): input HTML, the `lang` value expected on the
 * parsed document element and, optionally, whether tidy should be used.
 *
 * @return array<string, array{0: string, 1: string, 2?: bool}>
 */
public function dataForHtmlLang(): array
{
    // Every case wraps the same article body; build it once.
    $article = '<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>';

    $cases = [];

    // Document that already declares its charset.
    $cases['meta'] = [
        '<html lang="fr"><head><meta charset="utf-8"></head><body>' . $article . '</body></html>',
        'fr',
    ];

    // Document with a <head> but no charset declaration.
    $cases['head'] = [
        '<html lang="fr"><head><title>Foo</title></head><body>' . $article . '</body></html>',
        'fr',
    ];

    // Document without <head>; tidy is disabled because it would add a <head> tag.
    $cases['headless'] = [
        '<html lang="fr"><body>' . $article . '</body></html>',
        'fr',
        false,
    ];

    // Bare fragment; tidy is disabled because it would add <html>.
    $cases['fragment'] = [
        $article,
        '',
        false,
    ];

    return $cases;
}
560+
561+
/**
 * Checks that the `lang` attribute of the original `<html>` element is still
 * present on the document element after the document has been loaded.
 *
 * @param string $html    markup handed to Readability
 * @param string $lang    expected value of the `lang` attribute on the document element
 * @param bool   $useTidy whether tidy should be used when loading the markup
 *
 * @dataProvider dataForHtmlLang
 */
public function testHtmlLang(string $html, string $lang, bool $useTidy = true): void
{
    $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy);
    $res = $readability->init();

    $this->assertTrue($res);
    $this->assertInstanceOf(\DOMDocument::class, $readability->dom);

    // documentElement is nullable; fail with a readable assertion instead of
    // a fatal "member function on null" error when there is no root element.
    $root = $readability->dom->documentElement;
    $this->assertNotNull($root);
    $this->assertSame($lang, $root->getAttribute('lang'));
}
573+
553574
private function getReadability(string $html, ?string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability
554575
{
555576
$readability = new Readability($html, $url, $parser, $useTidy);

0 commit comments

Comments
 (0)