Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 46 additions & 2 deletions src/Readability.php
Original file line number Diff line number Diff line change
Expand Up @@ -1396,7 +1396,10 @@ private function loadHtml()
$this->logger->debug('Parsing URL: ' . $this->url);

if ($this->url) {
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
$host = parse_url($this->url, \PHP_URL_HOST);
if (null !== $host) {
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', $host), ['.' => '\.']) . '/';
}
}

mb_internal_encoding('UTF-8');
Expand Down Expand Up @@ -1432,7 +1435,7 @@ private function loadHtml()
unset($tidy);
}

$this->html = '<meta charset="utf-8">' . (string) $this->html;
$this->html = self::ensureMetaCharset((string) $this->html);

if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html);
Expand All @@ -1450,4 +1453,45 @@ private function loadHtml()

$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
}

/**
 * Makes sure the passed HTML carries a `meta … charset` declaration, inserting
 * `<meta charset="utf-8">` in the proper place when none is found.
 *
 * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
 * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
 * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
 *
 * Insertion point, in order of preference: right after the opening `<head>` tag,
 * right after the opening `<html>` tag, or at the very start of the markup.
 * The input is returned unchanged when a `<meta … charset` declaration is already present.
 *
 * @param string $html UTF-8 encoded document
 *
 * @return string the document, guaranteed to contain a `meta … charset` declaration in its first bytes
 */
private static function ensureMetaCharset($html)
{
    $charsetTag = '<meta charset="utf-8">';

    // Only look at first 1024 bytes since, according to HTML5 specification,
    // that’s where <meta> elements declaring a character encoding must be located.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
    $start = substr($html, 0, 1024);

    if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
        // <meta> tag is already present, no need for modification.
        return $html;
    }

    // Candidate opening tags, most preferred first: <meta> belongs in <head>;
    // failing that, put it right inside <html> and have the parser create <head>.
    // The lookahead requires the tag name to end right there, so that
    // e.g. <header> is not mistaken for <head>.
    $openingTags = [
        '/<head(?=[\s\/>])[^>]*>/i',
        '/<html(?=[\s\/>])[^>]*>/i',
    ];

    foreach ($openingTags as $pattern) {
        if (1 === preg_match($pattern, $start, $match, \PREG_OFFSET_CAPTURE)) {
            // Offsets found in the prefix are valid for the whole document,
            // so splice the tag in right after the matched opening tag.
            $insertAt = $match[0][1] + strlen($match[0][0]);

            return substr_replace($html, $charsetTag, $insertAt, 0);
        }
    }

    // Fallback – just plop the <meta> at the start of the fragment.
    return $charsetTag . $html;
}
}
72 changes: 47 additions & 25 deletions tests/ReadabilityTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ public function testConstructDefault()
$readability = $this->getReadability('');

$this->assertNull($readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
}

/**
 * Checks that constructing with the `html5lib` parser exposes the given URL,
 * a DOMDocument instance and the untouched original HTML.
 */
public function testConstructHtml5Parser()
{
    $readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'html5lib');

    $this->assertSame('http://0.0.0.0', $readability->url);
    // Single class-constant based check; the string-based 'DomDocument' duplicate was redundant.
    $this->assertInstanceOf(\DOMDocument::class, $readability->dom);
    $this->assertSame('<html/>', $readability->original_html);
}

Expand All @@ -39,7 +39,7 @@ public function testConstructSimple()
$readability = $this->getReadability('<html/>', 'http://0.0.0.0');

$this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
$this->assertSame('<html/>', $readability->original_html);
$this->assertTrue($readability->tidied);
}
Expand All @@ -52,15 +52,15 @@ public function testConstructDefaultWithoutTidy()
$this->assertSame('', $readability->original_html);
$this->assertFalse($readability->tidied);

$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
}

/**
 * Checks that constructing with the `libxml` parser and tidy disabled exposes the given URL,
 * a DOMDocument instance and the untouched original HTML, and reports the document as not tidied.
 */
public function testConstructSimpleWithoutTidy()
{
    $readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false);

    $this->assertSame('http://0.0.0.0', $readability->url);
    // Single class-constant based check; the string-based 'DomDocument' duplicate was redundant.
    $this->assertInstanceOf(\DOMDocument::class, $readability->dom);
    $this->assertSame('<html/>', $readability->original_html);
    $this->assertFalse($readability->tidied);
}
Expand Down Expand Up @@ -106,7 +106,6 @@ public function testInitDivP()
public function testInitDiv()
{
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -120,7 +119,6 @@ public function testInitDiv()
public function testWithFootnotes()
{
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->convertLinksToFootnotes = true;
$res = $readability->init();

Expand All @@ -137,7 +135,6 @@ public function testWithFootnotes()
public function testStandardClean()
{
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false;
$res = $readability->init();

Expand All @@ -154,7 +151,6 @@ public function testStandardClean()
public function testWithIframe()
{
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -169,7 +165,6 @@ public function testWithIframe()
public function testWithArticle()
{
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -184,7 +179,6 @@ public function testWithArticle()
public function testWithAside()
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -199,7 +193,6 @@ public function testWithAside()
public function testWithClasses()
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -214,7 +207,6 @@ public function testWithClasses()
public function testWithClassesWithoutLightClean()
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false;
$res = $readability->init();

Expand All @@ -230,7 +222,6 @@ public function testWithClassesWithoutLightClean()
public function testWithTd()
{
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -243,7 +234,6 @@ public function testWithTd()
public function testWithSameClasses()
{
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -257,7 +247,6 @@ public function testWithSameClasses()
public function testWithScript()
{
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -271,7 +260,6 @@ public function testWithScript()
public function testTitle()
{
$readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -285,7 +273,6 @@ public function testTitle()
public function testTitleWithDash()
{
$readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -299,7 +286,6 @@ public function testTitleWithDash()
public function testTitleWithDoubleDot()
{
$readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -313,7 +299,6 @@ public function testTitleWithDoubleDot()
public function testTitleTooShortUseH1()
{
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand Down Expand Up @@ -365,7 +350,6 @@ public function testAutoClosingIframeNotThrowingException()
</html>';

$readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030');
$readability->debug = true;

$res = $readability->init();

Expand Down Expand Up @@ -433,7 +417,6 @@ public function testAppendIdAlreadyHere()
</html>';

$readability = $this->getReadability($data, 'http://0.0.0.0');
$readability->debug = true;

$res = $readability->init();

Expand Down Expand Up @@ -472,7 +455,6 @@ public function testChildNodeGoneNull()
$html = file_get_contents('tests/fixtures/childNodeGoesNull.html');

$readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
$readability->convertLinksToFootnotes = true;
$res = $readability->init();

Expand All @@ -485,7 +467,6 @@ public function testKeepFootnotes()
$html = file_get_contents('tests/fixtures/keepFootnotes.html');

$readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
Expand All @@ -499,13 +480,54 @@ public function testWithWipedBody()
$html = file_get_contents('tests/fixtures/wipedBody.html');

$readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false);
$readability->debug = true;
$res = $readability->init();

$this->assertTrue($res);
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
}

/**
 * Data sets for testHtmlLang: an HTML input, the `lang` attribute expected on the
 * parsed document element, and optionally whether tidy should be used.
 *
 * Declared static because PHPUnit 10+ requires static data providers;
 * earlier PHPUnit versions accept static providers as well.
 *
 * @return array<string, array{0: string, 1: string, 2?: bool}>
 */
public static function dataForHtmlLang()
{
    // Shared article markup used by every data set.
    $article = '<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>';

    return [
        'meta' => [
            '<html lang="fr"><head><meta charset="utf-8"></head><body>' . $article . '</body></html>',
            'fr',
        ],
        'head' => [
            '<html lang="fr"><head><title>Foo</title></head><body>' . $article . '</body></html>',
            'fr',
        ],
        'headless' => [
            '<html lang="fr"><body>' . $article . '</body></html>',
            'fr',
            // tidy would add <head> tag.
            false,
        ],
        'fragment' => [
            $article,
            '',
            // tidy would add <html>.
            false,
        ],
    ];
}

/**
 * Parses each provided document with the `libxml` parser and checks that the
 * `lang` attribute of the resulting document element matches the expectation.
 *
 * @dataProvider dataForHtmlLang
 */
public function testHtmlLang($html, $lang, $useTidy = true)
{
    $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy);

    $this->assertTrue($readability->init());

    $this->assertInstanceOf(\DOMDocument::class, $readability->dom);

    $documentElement = $readability->dom->documentElement;
    $this->assertSame($lang, $documentElement->getAttribute('lang'));
}

private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
{
$readability = new Readability($html, $url, $parser, $useTidy);
Expand Down
Loading