Merge pull request #88 from redapple/re-replace-entities-switch

dangra · web-flow · commit 6ce4e11ad8f6 · 2017-05-17T14:43:30.000-03:00
Add replace_entities argument to re()/re_first()
diff --git a/parsel/selector.py b/parsel/selector.py
@@ -89,21 +89,31 @@ def css(self, query):
         """
         return self.__class__(flatten([x.css(query) for x in self]))
 
-    def re(self, regex):
+    def re(self, regex, replace_entities=True):
         """
         Call the ``.re()`` method for each element in this list and return
         their results flattened, as a list of unicode strings.
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``).
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
         """
-        return flatten([x.re(regex) for x in self])
+        return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
 
-    def re_first(self, regex, default=None):
+    def re_first(self, regex, default=None, replace_entities=True):
         """
         Call the ``.re()`` method for the first element in this list and
         return the result in an unicode string. If the list is empty or the
         regex doesn't match anything, return the default value (``None`` if
         the argument is not provided).
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``).
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
         """
-        for el in iflatten(x.re(regex) for x in self):
+        for el in iflatten(x.re(regex, replace_entities=replace_entities) for x in self):
             return el
         else:
             return default
@@ -238,23 +248,33 @@ def css(self, query):
     def _css2xpath(self, query):
         return self._csstranslator.css_to_xpath(query)
 
-    def re(self, regex):
+    def re(self, regex, replace_entities=True):
         """
         Apply the given regex and return a list of unicode strings with the
         matches.
 
         ``regex`` can be either a compiled regular expression or a string which
-        will be compiled to a regular expression using ``re.compile(regex)``
+        will be compiled to a regular expression using ``re.compile(regex)``.
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``).
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
         """
-        return extract_regex(regex, self.extract())
+        return extract_regex(regex, self.extract(), replace_entities=replace_entities)
 
-    def re_first(self, regex, default=None):
+    def re_first(self, regex, default=None, replace_entities=True):
         """
         Apply the given regex and return the first unicode string which
         matches. If there is no match, return the default value (``None`` if
         the argument is not provided).
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``).
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
         """
-        return next(iflatten(self.re(regex)), default)
+        return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)
 
     def extract(self):
         """
diff --git a/parsel/utils.py b/parsel/utils.py
@@ -1,6 +1,6 @@
 import re
 import six
-from w3lib.html import replace_entities
+from w3lib.html import replace_entities as w3lib_replace_entities
 
 
 def flatten(x):
@@ -56,7 +56,7 @@ def _is_listlike(x):
     return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
 
 
-def extract_regex(regex, text):
+def extract_regex(regex, text, replace_entities=True):
     """Extract a list of unicode strings from the given text/encoding using the following policies:
     * if the regex contains a named group called "extract" that will be returned
     * if the regex contains multiple numbered groups, all those will be returned (flattened)
@@ -76,4 +76,8 @@ def extract_regex(regex, text):
     else:
         # full regex or numbered groups
         strings = regex.findall(text)
-    return [replace_entities(s, keep=['lt', 'amp']) for s in flatten(strings)]
+
+    strings = flatten(strings)
+    if not replace_entities:
+        return strings
+    return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]
diff --git a/tests/test_selector.py b/tests/test_selector.py
@@ -455,7 +455,7 @@ def test_re(self):
                          ["John", "Paul"])
         self.assertEqual(x.xpath("//ul/li").re("Age: (\d+)"),
                          ["10", "20"])
-
+        
         # Test named group, hit and miss
         x = self.sscls(text=u'foobar')
         self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
@@ -465,6 +465,32 @@ def test_re(self):
         x = self.sscls(text=u'baz')
         self.assertEqual(x.re('(?P<extract>foo)|(?P<bar>baz)'), [])
 
+    def test_re_replace_entities(self):
+        body = u"""<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>"""
+        x = self.sscls(text=body)
+        
+        name_re = re.compile('{"foo":(.*)}')
+
+        # by default, only &amp; and &lt; are preserved;
+        # other entities are converted
+        expected = u'"bar &amp; "baz""'
+        self.assertEqual(x.xpath("//script/text()").re(name_re), [expected])
+        self.assertEqual(x.xpath("//script").re(name_re), [expected])
+        self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected])
+        self.assertEqual(x.xpath("//script")[0].re(name_re), [expected])
+
+        # check that re_first() works the same way for single value output
+        self.assertEqual(x.xpath("//script").re_first(name_re), expected)
+        self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected)
+
+        # switching off replace_entities will preserve &quot; also
+        expected = u'"bar &amp; &quot;baz&quot;"'
+        self.assertEqual(x.xpath("//script/text()").re(name_re, replace_entities=False), [expected])
+        self.assertEqual(x.xpath("//script")[0].re(name_re, replace_entities=False), [expected])
+
+        self.assertEqual(x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected)
+        self.assertEqual(x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected)
+
     def test_re_intl(self):
         body = u'<div>Evento: cumplea\xf1os</div>'
         x = self.sscls(text=body)