You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
1428
1428
$this->dom = (newHTML5())->loadHTML($this->html);
@@ -1510,4 +1510,45 @@ private function isNodeVisible(\DOMElement $node): bool
1510
1510
)
1511
1511
&& !$node->hasAttribute('hidden');
1512
1512
}
1513
+
1514
+
/**
1515
+
* Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
1516
+
*
1517
+
* `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
1518
+
* This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
1519
+
* Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause the parser to auto-insert an `html` element, losing the attributes of the original `html` tag.
@@ -115,7 +115,6 @@ public function testInitDivP(): void
115
115
publicfunctiontestInitDiv(): void
116
116
{
117
117
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
118
-
$readability->debug = true;
119
118
$res = $readability->init();
120
119
121
120
$this->assertTrue($res);
@@ -129,7 +128,6 @@ public function testInitDiv(): void
129
128
publicfunctiontestWithFootnotes(): void
130
129
{
131
130
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
132
-
$readability->debug = true;
133
131
$readability->convertLinksToFootnotes = true;
134
132
$res = $readability->init();
135
133
@@ -146,7 +144,6 @@ public function testWithFootnotes(): void
146
144
publicfunctiontestStandardClean(): void
147
145
{
148
146
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
149
-
$readability->debug = true;
150
147
$readability->lightClean = false;
151
148
$res = $readability->init();
152
149
@@ -163,7 +160,6 @@ public function testStandardClean(): void
163
160
publicfunctiontestWithIframe(): void
164
161
{
165
162
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
166
-
$readability->debug = true;
167
163
$res = $readability->init();
168
164
169
165
$this->assertTrue($res);
@@ -178,7 +174,6 @@ public function testWithIframe(): void
178
174
publicfunctiontestWithArticle(): void
179
175
{
180
176
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
181
-
$readability->debug = true;
182
177
$res = $readability->init();
183
178
184
179
$this->assertTrue($res);
@@ -193,7 +188,6 @@ public function testWithArticle(): void
193
188
publicfunctiontestWithAside(): void
194
189
{
195
190
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
196
-
$readability->debug = true;
197
191
$res = $readability->init();
198
192
199
193
$this->assertTrue($res);
@@ -208,7 +202,6 @@ public function testWithAside(): void
208
202
publicfunctiontestWithClasses(): void
209
203
{
210
204
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
211
-
$readability->debug = true;
212
205
$res = $readability->init();
213
206
214
207
$this->assertTrue($res);
@@ -223,7 +216,6 @@ public function testWithClasses(): void
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
226
-
$readability->debug = true;
227
219
$readability->lightClean = false;
228
220
$res = $readability->init();
229
221
@@ -239,7 +231,6 @@ public function testWithClassesWithoutLightClean(): void
239
231
publicfunctiontestWithTd(): void
240
232
{
241
233
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
242
-
$readability->debug = true;
243
234
$res = $readability->init();
244
235
245
236
$this->assertTrue($res);
@@ -252,7 +243,6 @@ public function testWithTd(): void
252
243
publicfunctiontestWithSameClasses(): void
253
244
{
254
245
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
255
-
$readability->debug = true;
256
246
$res = $readability->init();
257
247
258
248
$this->assertTrue($res);
@@ -266,7 +256,6 @@ public function testWithSameClasses(): void
266
256
publicfunctiontestWithScript(): void
267
257
{
268
258
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
269
-
$readability->debug = true;
270
259
$res = $readability->init();
271
260
272
261
$this->assertTrue($res);
@@ -280,7 +269,6 @@ public function testWithScript(): void
280
269
publicfunctiontestTitle(): void
281
270
{
282
271
$readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
283
-
$readability->debug = true;
284
272
$res = $readability->init();
285
273
286
274
$this->assertTrue($res);
@@ -294,7 +282,6 @@ public function testTitle(): void
294
282
publicfunctiontestTitleWithDash(): void
295
283
{
296
284
$readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
297
-
$readability->debug = true;
298
285
$res = $readability->init();
299
286
300
287
$this->assertTrue($res);
@@ -308,7 +295,6 @@ public function testTitleWithDash(): void
308
295
publicfunctiontestTitleWithDoubleDot(): void
309
296
{
310
297
$readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
311
-
$readability->debug = true;
312
298
$res = $readability->init();
313
299
314
300
$this->assertTrue($res);
@@ -322,7 +308,6 @@ public function testTitleWithDoubleDot(): void
322
308
publicfunctiontestTitleTooShortUseH1(): void
323
309
{
324
310
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
325
-
$readability->debug = true;
326
311
$res = $readability->init();
327
312
328
313
$this->assertTrue($res);
@@ -369,7 +354,6 @@ public function testAutoClosingIframeNotThrowingException(): void
0 commit comments