Skip to content

Commit 4df034c

Browse files
authored
Merge branch 'master' into re-replace-entities-switch
2 parents 8bba20f + ef0104d commit 4df034c

File tree

2 files changed

+23
-7
lines changed

2 files changed

+23
-7
lines changed

parsel/utils.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,19 @@ def extract_regex(regex, text, replace_entities=True):
6565
if isinstance(regex, six.string_types):
6666
regex = re.compile(regex, re.UNICODE)
6767

68-
try:
69-
strings = [regex.search(text).group('extract')] # named group
70-
except:
71-
strings = regex.findall(text) # full regex or numbered groups
68+
if 'extract' in regex.groupindex:
69+
# named group
70+
try:
71+
extracted = regex.search(text).group('extract')
72+
except AttributeError:
73+
strings = []
74+
else:
75+
strings = [extracted] if extracted is not None else []
76+
else:
77+
# full regex or numbered groups
78+
strings = regex.findall(text)
79+
7280
strings = flatten(strings)
7381
if not replace_entities:
7482
return strings
75-
return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]
76-
83+
return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]

tests/test_selector.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -455,11 +455,20 @@ def test_re(self):
455455
["John", "Paul"])
456456
self.assertEqual(x.xpath("//ul/li").re("Age: (\d+)"),
457457
["10", "20"])
458+
459+
# Test named group, hit and miss
460+
x = self.sscls(text=u'foobar')
461+
self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
462+
self.assertEqual(x.re('(?P<extract>baz)'), [])
463+
464+
# A purposely constructed test for an edge case
465+
x = self.sscls(text=u'baz')
466+
self.assertEqual(x.re('(?P<extract>foo)|(?P<bar>baz)'), [])
458467

459468
def test_re_replace_entities(self):
460469
body = u"""<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>"""
461470
x = self.sscls(text=body)
462-
471+
463472
name_re = re.compile('{"foo":(.*)}')
464473

465474
# by default, only &amp; and &lt; are preserved ;

0 commit comments

Comments
 (0)