Skip to content

Commit 6ce4e11

Browse files
authored
Merge pull request #88 from redapple/re-replace-entities-switch
Add replace_entities argument to re()/re_first()
2 parents ef0104d + 4df034c commit 6ce4e11

File tree

3 files changed

+63
-13
lines changed

3 files changed

+63
-13
lines changed

parsel/selector.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -89,21 +89,31 @@ def css(self, query):
8989
"""
9090
return self.__class__(flatten([x.css(query) for x in self]))
9191

92-
def re(self, regex):
92+
def re(self, regex, replace_entities=True):
9393
"""
9494
Call the ``.re()`` method for each element in this list and return
9595
their results flattened, as a list of unicode strings.
96+
97+
By default, character entity references are replaced by their
98+
corresponding character (except for ``&`` and ``<``).
99+
Passing ``replace_entities`` as ``False`` switches off these
100+
replacements.
96101
"""
97-
return flatten([x.re(regex) for x in self])
102+
return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
98103

99-
def re_first(self, regex, default=None):
104+
def re_first(self, regex, default=None, replace_entities=True):
100105
"""
101106
Call the ``.re()`` method for the first element in this list and
102107
return the result in a unicode string. If the list is empty or the
103108
regex doesn't match anything, return the default value (``None`` if
104109
the argument is not provided).
110+
111+
By default, character entity references are replaced by their
112+
corresponding character (except for ``&`` and ``<``).
113+
Passing ``replace_entities`` as ``False`` switches off these
114+
replacements.
105115
"""
106-
for el in iflatten(x.re(regex) for x in self):
116+
for el in iflatten(x.re(regex, replace_entities=replace_entities) for x in self):
107117
return el
108118
else:
109119
return default
@@ -238,23 +248,33 @@ def css(self, query):
238248
def _css2xpath(self, query):
239249
return self._csstranslator.css_to_xpath(query)
240250

241-
def re(self, regex):
251+
def re(self, regex, replace_entities=True):
242252
"""
243253
Apply the given regex and return a list of unicode strings with the
244254
matches.
245255
246256
``regex`` can be either a compiled regular expression or a string which
247-
will be compiled to a regular expression using ``re.compile(regex)``
257+
will be compiled to a regular expression using ``re.compile(regex)``.
258+
259+
By default, character entity references are replaced by their
260+
corresponding character (except for ``&`` and ``<``).
261+
Passing ``replace_entities`` as ``False`` switches off these
262+
replacements.
248263
"""
249-
return extract_regex(regex, self.extract())
264+
return extract_regex(regex, self.extract(), replace_entities=replace_entities)
250265

251-
def re_first(self, regex, default=None):
266+
def re_first(self, regex, default=None, replace_entities=True):
252267
"""
253268
Apply the given regex and return the first unicode string which
254269
matches. If there is no match, return the default value (``None`` if
255270
the argument is not provided).
271+
272+
By default, character entity references are replaced by their
273+
corresponding character (except for ``&`` and ``<``).
274+
Passing ``replace_entities`` as ``False`` switches off these
275+
replacements.
256276
"""
257-
return next(iflatten(self.re(regex)), default)
277+
return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)
258278

259279
def extract(self):
260280
"""

parsel/utils.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import re
22
import six
3-
from w3lib.html import replace_entities
3+
from w3lib.html import replace_entities as w3lib_replace_entities
44

55

66
def flatten(x):
@@ -56,7 +56,7 @@ def _is_listlike(x):
5656
return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
5757

5858

59-
def extract_regex(regex, text):
59+
def extract_regex(regex, text, replace_entities=True):
6060
"""Extract a list of unicode strings from the given text/encoding using the following policies:
6161
* if the regex contains a named group called "extract" that will be returned
6262
* if the regex contains multiple numbered groups, all those will be returned (flattened)
@@ -76,4 +76,8 @@ def extract_regex(regex, text):
7676
else:
7777
# full regex or numbered groups
7878
strings = regex.findall(text)
79-
return [replace_entities(s, keep=['lt', 'amp']) for s in flatten(strings)]
79+
80+
strings = flatten(strings)
81+
if not replace_entities:
82+
return strings
83+
return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]

tests/test_selector.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,7 @@ def test_re(self):
455455
["John", "Paul"])
456456
self.assertEqual(x.xpath("//ul/li").re("Age: (\d+)"),
457457
["10", "20"])
458-
458+
459459
# Test named group, hit and miss
460460
x = self.sscls(text=u'foobar')
461461
self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
@@ -465,6 +465,32 @@ def test_re(self):
465465
x = self.sscls(text=u'baz')
466466
self.assertEqual(x.re('(?P<extract>foo)|(?P<bar>baz)'), [])
467467

468+
def test_re_replace_entities(self):
469+
body = u"""<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>"""
470+
x = self.sscls(text=body)
471+
472+
name_re = re.compile('{"foo":(.*)}')
473+
474+
# by default, only &amp; and &lt; are preserved;
475+
# other entities are converted
476+
expected = u'"bar &amp; "baz""'
477+
self.assertEqual(x.xpath("//script/text()").re(name_re), [expected])
478+
self.assertEqual(x.xpath("//script").re(name_re), [expected])
479+
self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected])
480+
self.assertEqual(x.xpath("//script")[0].re(name_re), [expected])
481+
482+
# check that re_first() works the same way for single value output
483+
self.assertEqual(x.xpath("//script").re_first(name_re), expected)
484+
self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected)
485+
486+
# switching off replace_entities will preserve &quot; also
487+
expected = u'"bar &amp; &quot;baz&quot;"'
488+
self.assertEqual(x.xpath("//script/text()").re(name_re, replace_entities=False), [expected])
489+
self.assertEqual(x.xpath("//script")[0].re(name_re, replace_entities=False), [expected])
490+
491+
self.assertEqual(x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected)
492+
self.assertEqual(x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected)
493+
468494
def test_re_intl(self):
469495
body = u'<div>Evento: cumplea\xf1os</div>'
470496
x = self.sscls(text=body)

0 commit comments

Comments
 (0)