Author: Armin Rigo <[email protected]>
Branch: unicode-utf8-re
Changeset: r93291:fc5e50bec2b2
Date: 2017-12-07 09:01 +0100
http://bitbucket.org/pypy/pypy/changeset/fc5e50bec2b2/
Log: Refix and test the standard StrMatchContext
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -138,8 +138,40 @@
"""Similar to str()."""
raise NotImplementedError
- def debug_check_pos(self, pos):
- pass
+ # The following methods are provided to be overridden in
+ # Utf8MatchContext. The non-utf8 implementation is provided
+ # by the FixedMatchContext abstract subclass, in order to use
+ # the same @not_rpython safety trick as above.
+ @not_rpython
+ def next(self, position):
+ raise NotImplementedError
+ @not_rpython
+ def prev(self, position):
+ raise NotImplementedError
+ @not_rpython
+ def next_n(self, position, n):
+ raise NotImplementedError
+ @not_rpython
+ def prev_n(self, position, n, start_position):
+ raise NotImplementedError
+ @not_rpython
+ def slowly_convert_byte_pos_to_index(self, position):
+ raise NotImplementedError
+ @not_rpython
+ def debug_check_pos(self, position):
+ raise NotImplementedError
+ @not_rpython
+ def maximum_distance(self, position_low, position_high):
+ raise NotImplementedError
+ @not_rpython
+ def bytes_difference(self, position1, position2):
+ raise NotImplementedError
+ @not_rpython
+ def get_single_byte(self, base_position, index):
+ raise NotImplementedError
+ @not_rpython
+ def go_forward_by_bytes(self, base_position, index):
+ raise NotImplementedError
def get_mark(self, gid):
mark = find_mark(self.match_marks, gid)
@@ -186,13 +218,56 @@
def fresh_copy(self, start):
raise NotImplementedError
-class BufMatchContext(AbstractMatchContext):
+
+class FixedMatchContext(AbstractMatchContext):
+ """Abstract subclass to introduce the default implementation for
+ these position methods. The Utf8 subclass doesn't inherit from here."""
+
+ ZERO = 0
+
+ def next(self, position):
+ return position + 1
+
+ def prev(self, position):
+ if position == 0:
+ raise EndOfString
+ return position - 1
+
+ def next_n(self, position, n, end_position):
+ position += n
+ if position > end_position:
+ raise EndOfString
+ return position
+
+ def prev_n(self, position, n, start_position):
+ position -= n
+ if position < start_position:
+ raise EndOfString
+ return position
+
+ def slowly_convert_byte_pos_to_index(self, position):
+ return position
+
+ def debug_check_pos(self, position):
+ pass
+
+ def maximum_distance(self, position_low, position_high):
+ return position_high - position_low
+
+ def bytes_difference(self, position1, position2):
+ return position1 - position2
+
+ def go_forward_by_bytes(self, base_position, index):
+ return base_position + index
+
+
+class BufMatchContext(FixedMatchContext):
"""Concrete subclass for matching in a buffer."""
_immutable_fields_ = ["_buffer"]
def __init__(self, pattern, buf, match_start, end, flags):
- AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+ FixedMatchContext.__init__(self, pattern, match_start, end, flags)
self._buffer = buf
def str(self, index):
@@ -203,17 +278,20 @@
c = self.str(index)
return rsre_char.getlower(c, self.flags)
+ def get_single_byte(self, base_position, index):
+ return self.str(base_position + index)
+
def fresh_copy(self, start):
return BufMatchContext(self.pattern, self._buffer, start,
self.end, self.flags)
-class StrMatchContext(AbstractMatchContext):
+class StrMatchContext(FixedMatchContext):
"""Concrete subclass for matching in a plain string."""
_immutable_fields_ = ["_string"]
def __init__(self, pattern, string, match_start, end, flags):
- AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+ FixedMatchContext.__init__(self, pattern, match_start, end, flags)
self._string = string
if not we_are_translated() and isinstance(string, unicode):
self.flags |= rsre_char.SRE_FLAG_UNICODE # for rsre_re.py
@@ -226,17 +304,20 @@
c = self.str(index)
return rsre_char.getlower(c, self.flags)
+ def get_single_byte(self, base_position, index):
+ return self.str(base_position + index)
+
def fresh_copy(self, start):
return StrMatchContext(self.pattern, self._string, start,
self.end, self.flags)
-class UnicodeMatchContext(AbstractMatchContext):
+class UnicodeMatchContext(FixedMatchContext):
"""Concrete subclass for matching in a unicode string."""
_immutable_fields_ = ["_unicodestr"]
def __init__(self, pattern, unicodestr, match_start, end, flags):
- AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+ FixedMatchContext.__init__(self, pattern, match_start, end, flags)
self._unicodestr = unicodestr
def str(self, index):
@@ -247,6 +328,9 @@
c = self.str(index)
return rsre_char.getlower(c, self.flags)
+ def get_single_byte(self, base_position, index):
+ return self.str(base_position + index)
+
def fresh_copy(self, start):
return UnicodeMatchContext(self.pattern, self._unicodestr, start,
self.end, self.flags)
diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py
--- a/rpython/rlib/rsre/test/support.py
+++ b/rpython/rlib/rsre/test/support.py
@@ -25,7 +25,6 @@
"""Concrete subclass for matching in a plain string, tweaked for tests"""
ZERO = Position(0)
- EXACT_DISTANCE = False
def next(self, position):
assert isinstance(position, Position)
diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py
--- a/rpython/rlib/rsre/test/test_search.py
+++ b/rpython/rlib/rsre/test/test_search.py
@@ -1,44 +1,45 @@
import re, py
from rpython.rlib.rsre.test.test_match import get_code, get_code_and_re
-from rpython.rlib.rsre.test.support import search, match, Position
+from rpython.rlib.rsre.test import support
+from rpython.rlib.rsre import rsre_core
-class TestSearch:
+class BaseTestSearch:
def test_code1(self):
r_code1 = get_code(r'[abc][def][ghi]')
- res = search(r_code1, "fooahedixxx")
+ res = self.search(r_code1, "fooahedixxx")
assert res is None
- res = search(r_code1, "fooahcdixxx")
+ res = self.search(r_code1, "fooahcdixxx")
assert res is not None
assert res.span() == (5, 8)
def test_code2(self):
r_code2 = get_code(r'<item>\s*<title>(.*?)</title>')
- res = search(r_code2, "foo bar <item> <title>abc</title>def")
+ res = self.search(r_code2, "foo bar <item> <title>abc</title>def")
assert res is not None
assert res.span() == (8, 34)
def test_pure_literal(self):
r_code3 = get_code(r'foobar')
- res = search(r_code3, "foo bar foobar baz")
+ res = self.search(r_code3, "foo bar foobar baz")
assert res is not None
assert res.span() == (8, 14)
def test_code3(self):
r_code1 = get_code(r'<item>\s*<title>(.*?)</title>')
- res = match(r_code1, "<item> <title>abc</title>def")
+ res = self.match(r_code1, "<item> <title>abc</title>def")
assert res is not None
def test_max_until_0_65535(self):
r_code2 = get_code(r'<abc>(?:xy)*xy</abc>')
- #res = match(r_code2, '<abc></abc>def')
+ #res = self.match(r_code2, '<abc></abc>def')
#assert res is None
- #res = match(r_code2, '<abc>xy</abc>def')
+ #res = self.match(r_code2, '<abc>xy</abc>def')
#assert res is not None
- res = match(r_code2, '<abc>xyxyxy</abc>def')
+ res = self.match(r_code2, '<abc>xyxyxy</abc>def')
assert res is not None
- res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
+ res = self.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
assert res is not None
def test_max_until_3_5(self):
@@ -46,18 +47,18 @@
for i in range(8):
s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef'
assert (r.match(s) is not None) is (3 <= i-1 <= 5)
- res = match(r_code2, s)
+ res = self.match(r_code2, s)
assert (res is not None) is (3 <= i-1 <= 5)
def test_min_until_0_65535(self):
r_code2 = get_code(r'<abc>(?:xy)*?xy</abc>')
- res = match(r_code2, '<abc></abc>def')
+ res = self.match(r_code2, '<abc></abc>def')
assert res is None
- res = match(r_code2, '<abc>xy</abc>def')
+ res = self.match(r_code2, '<abc>xy</abc>def')
assert res is not None
- res = match(r_code2, '<abc>xyxyxy</abc>def')
+ res = self.match(r_code2, '<abc>xyxyxy</abc>def')
assert res is not None
- res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
+ res = self.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
assert res is not None
def test_min_until_3_5(self):
@@ -65,44 +66,44 @@
for i in range(8):
s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef'
assert (r.match(s) is not None) is (3 <= i-1 <= 5)
- res = match(r_code2, s)
+ res = self.match(r_code2, s)
assert (res is not None) is (3 <= i-1 <= 5)
def test_min_repeat_one(self):
r_code3 = get_code(r'<abc>.{3,5}?y')
for i in range(8):
- res = match(r_code3, '<abc>' + 'x'*i + 'y')
+ res = self.match(r_code3, '<abc>' + 'x'*i + 'y')
assert (res is not None) is (3 <= i <= 5)
def test_simple_group(self):
r_code4 = get_code(r'<abc>(x.)</abc>')
- res = match(r_code4, '<abc>xa</abc>def')
+ res = self.match(r_code4, '<abc>xa</abc>def')
assert res is not None
assert res.get_mark(0) == 5
assert res.get_mark(1) == 7
def test_max_until_groups(self):
r_code4 = get_code(r'<abc>(x.)*xy</abc>')
- res = match(r_code4, '<abc>xaxbxy</abc>def')
+ res = self.match(r_code4, '<abc>xaxbxy</abc>def')
assert res is not None
assert res.get_mark(0) == 7
assert res.get_mark(1) == 9
def test_group_branch(self):
r_code5 = get_code(r'<abc>(ab|c)</abc>')
- res = match(r_code5, '<abc>ab</abc>def')
+ res = self.match(r_code5, '<abc>ab</abc>def')
assert (res.get_mark(0), res.get_mark(1)) == (5, 7)
- res = match(r_code5, '<abc>c</abc>def')
+ res = self.match(r_code5, '<abc>c</abc>def')
assert (res.get_mark(0), res.get_mark(1)) == (5, 6)
- res = match(r_code5, '<abc>de</abc>def')
+ res = self.match(r_code5, '<abc>de</abc>def')
assert res is None
def test_group_branch_max_until(self):
r_code6 = get_code(r'<abc>(ab|c)*a</abc>')
- res = match(r_code6, '<abc>ccabcccaba</abc>def')
+ res = self.match(r_code6, '<abc>ccabcccaba</abc>def')
assert (res.get_mark(0), res.get_mark(1)) == (12, 14)
r_code7 = get_code(r'<abc>((ab)|(c))*a</abc>')
- res = match(r_code7, '<abc>ccabcccaba</abc>def')
+ res = self.match(r_code7, '<abc>ccabcccaba</abc>def')
assert (res.get_mark(0), res.get_mark(1)) == (12, 14)
assert (res.get_mark(2), res.get_mark(3)) == (12, 14)
assert (res.get_mark(4), res.get_mark(5)) == (11, 12)
@@ -113,7 +114,7 @@
assert m.span(1) == (12, 13)
assert m.span(3) == (12, 13)
assert m.span(2) == (8, 9)
- res = match(r_code7, '<abc>bbbabbbb</abc>')
+ res = self.match(r_code7, '<abc>bbbabbbb</abc>')
assert (res.get_mark(0), res.get_mark(1)) == (12, 13)
assert (res.get_mark(4), res.get_mark(5)) == (12, 13)
assert (res.get_mark(2), res.get_mark(3)) == (8, 9)
@@ -124,7 +125,7 @@
assert m.span(1) == (6, 7)
assert m.span(3) == (6, 7)
assert m.span(2) == (5, 6)
- res = match(r_code8, '<abc>ab</abc>')
+ res = self.match(r_code8, '<abc>ab</abc>')
assert (res.get_mark(0), res.get_mark(1)) == (6, 7)
assert (res.get_mark(4), res.get_mark(5)) == (6, 7)
assert (res.get_mark(2), res.get_mark(3)) == (5, 6)
@@ -134,7 +135,7 @@
m = r9.match('xyzxc')
assert m.span(1) == (3, 4)
assert m.span(2) == (-1, -1)
- res = match(r_code9, 'xyzxc')
+ res = self.match(r_code9, 'xyzxc')
assert (res.get_mark(0), res.get_mark(1)) == (3, 4)
assert (res.get_mark(2), res.get_mark(3)) == (-1, -1)
@@ -142,8 +143,8 @@
r_code9, r9 = get_code_and_re(r'((x|yz)+?(y)??c)*')
m = r9.match('xycxyzxc')
assert m.span(2) == (6, 7)
- #assert match.span(3) == (1, 2) --- bug of CPython
- res = match(r_code9, 'xycxyzxc')
+ #assert self.match.span(3) == (1, 2) --- bug of CPython
+ res = self.match(r_code9, 'xycxyzxc')
assert (res.get_mark(2), res.get_mark(3)) == (6, 7)
assert (res.get_mark(4), res.get_mark(5)) == (1, 2)
@@ -151,19 +152,19 @@
r_code, r = get_code_and_re(r'(a?)+y')
assert r.match('y')
assert r.match('aaayaaay').span() == (0, 4)
- res = match(r_code, 'y')
+ res = self.match(r_code, 'y')
assert res
- res = match(r_code, 'aaayaaay')
+ res = self.match(r_code, 'aaayaaay')
assert res and res.span() == (0, 4)
#
r_code, r = get_code_and_re(r'(a?){4,6}y')
assert r.match('y')
- res = match(r_code, 'y')
+ res = self.match(r_code, 'y')
assert res
#
r_code, r = get_code_and_re(r'(a?)*y')
assert r.match('y')
- res = match(r_code, 'y')
+ res = self.match(r_code, 'y')
assert res
def test_empty_maxuntil_2(self):
@@ -173,24 +174,24 @@
py.test.skip("older version of the stdlib: %s" % (e,))
assert r.match('XfooXbarX').span() == (0, 5)
assert r.match('XfooXbarX').span(1) == (4, 4)
- res = match(r_code, 'XfooXbarX')
+ res = self.match(r_code, 'XfooXbarX')
assert res.span() == (0, 5)
assert res.span(1) == (4, 4)
def test_empty_minuntil(self):
r_code, r = get_code_and_re(r'(a?)+?y')
#assert not r.match('z') -- CPython bug (at least 2.5) eats all memory
- res = match(r_code, 'z')
+ res = self.match(r_code, 'z')
assert not res
#
r_code, r = get_code_and_re(r'(a?){4,6}?y')
assert not r.match('z')
- res = match(r_code, 'z')
+ res = self.match(r_code, 'z')
assert not res
#
r_code, r = get_code_and_re(r'(a?)*?y')
#assert not r.match('z') -- CPython bug (at least 2.5) eats all memory
- res = match(r_code, 'z')
+ res = self.match(r_code, 'z')
assert not res
def test_empty_search(self):
@@ -198,15 +199,26 @@
for j in range(-2, 6):
for i in range(-2, 6):
match = r.search('abc', i, j)
- res = search(r_code, 'abc', i, j)
+ res = self.search(r_code, 'abc', i, j)
jk = min(max(j, 0), 3)
ik = min(max(i, 0), 3)
if ik <= jk:
assert match is not None
assert match.span() == (ik, ik)
assert res is not None
- assert res.match_start == Position(ik)
- assert res.match_end == Position(ik)
+ assert res.match_start == self.Position(ik)
+ assert res.match_end == self.Position(ik)
else:
assert match is None
assert res is None
+
+
+class TestSearchCustom(BaseTestSearch):
+ search = staticmethod(support.search)
+ match = staticmethod(support.match)
+ Position = support.Position
+
+class TestSearchStr(BaseTestSearch):
+ search = staticmethod(rsre_core.search)
+ match = staticmethod(rsre_core.match)
+ Position = staticmethod(lambda n: n)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit