Author: Armin Rigo <ar...@tunes.org> Branch: unicode-utf8-re Changeset: r93291:fc5e50bec2b2 Date: 2017-12-07 09:01 +0100 http://bitbucket.org/pypy/pypy/changeset/fc5e50bec2b2/
Log: Refix and test the standard StrMatchContext diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -138,8 +138,40 @@ """Similar to str().""" raise NotImplementedError - def debug_check_pos(self, pos): - pass + # The following methods are provided to be overridden in + # Utf8MatchContext. The non-utf8 implementation is provided + # by the FixedMatchContext abstract subclass, in order to use + # the same @not_rpython safety trick as above. + @not_rpython + def next(self, position): + raise NotImplementedError + @not_rpython + def prev(self, position): + raise NotImplementedError + @not_rpython + def next_n(self, position, n): + raise NotImplementedError + @not_rpython + def prev_n(self, position, n, start_position): + raise NotImplementedError + @not_rpython + def slowly_convert_byte_pos_to_index(self, position): + raise NotImplementedError + @not_rpython + def debug_check_pos(self, position): + raise NotImplementedError + @not_rpython + def maximum_distance(self, position_low, position_high): + raise NotImplementedError + @not_rpython + def bytes_difference(self, position1, position2): + raise NotImplementedError + @not_rpython + def get_single_byte(self, base_position, index): + raise NotImplementedError + @not_rpython + def go_forward_by_bytes(self, base_position, index): + raise NotImplementedError def get_mark(self, gid): mark = find_mark(self.match_marks, gid) @@ -186,13 +218,56 @@ def fresh_copy(self, start): raise NotImplementedError -class BufMatchContext(AbstractMatchContext): + +class FixedMatchContext(AbstractMatchContext): + """Abstract subclass to introduce the default implementation for + these position methods. 
The Utf8 subclass doesn't inherit from here.""" + + ZERO = 0 + + def next(self, position): + return position + 1 + + def prev(self, position): + if position == 0: + raise EndOfString + return position - 1 + + def next_n(self, position, n, end_position): + position += n + if position > end_position: + raise EndOfString + return position + + def prev_n(self, position, n, start_position): + position -= n + if position < start_position: + raise EndOfString + return position + + def slowly_convert_byte_pos_to_index(self, position): + return position + + def debug_check_pos(self, position): + pass + + def maximum_distance(self, position_low, position_high): + return position_high - position_low + + def bytes_difference(self, position1, position2): + return position1 - position2 + + def go_forward_by_bytes(self, base_position, index): + return base_position + index + + +class BufMatchContext(FixedMatchContext): """Concrete subclass for matching in a buffer.""" _immutable_fields_ = ["_buffer"] def __init__(self, pattern, buf, match_start, end, flags): - AbstractMatchContext.__init__(self, pattern, match_start, end, flags) + FixedMatchContext.__init__(self, pattern, match_start, end, flags) self._buffer = buf def str(self, index): @@ -203,17 +278,20 @@ c = self.str(index) return rsre_char.getlower(c, self.flags) + def get_single_byte(self, base_position, index): + return self.str(base_position + index) + def fresh_copy(self, start): return BufMatchContext(self.pattern, self._buffer, start, self.end, self.flags) -class StrMatchContext(AbstractMatchContext): +class StrMatchContext(FixedMatchContext): """Concrete subclass for matching in a plain string.""" _immutable_fields_ = ["_string"] def __init__(self, pattern, string, match_start, end, flags): - AbstractMatchContext.__init__(self, pattern, match_start, end, flags) + FixedMatchContext.__init__(self, pattern, match_start, end, flags) self._string = string if not we_are_translated() and isinstance(string, unicode): 
self.flags |= rsre_char.SRE_FLAG_UNICODE # for rsre_re.py @@ -226,17 +304,20 @@ c = self.str(index) return rsre_char.getlower(c, self.flags) + def get_single_byte(self, base_position, index): + return self.str(base_position + index) + def fresh_copy(self, start): return StrMatchContext(self.pattern, self._string, start, self.end, self.flags) -class UnicodeMatchContext(AbstractMatchContext): +class UnicodeMatchContext(FixedMatchContext): """Concrete subclass for matching in a unicode string.""" _immutable_fields_ = ["_unicodestr"] def __init__(self, pattern, unicodestr, match_start, end, flags): - AbstractMatchContext.__init__(self, pattern, match_start, end, flags) + FixedMatchContext.__init__(self, pattern, match_start, end, flags) self._unicodestr = unicodestr def str(self, index): @@ -247,6 +328,9 @@ c = self.str(index) return rsre_char.getlower(c, self.flags) + def get_single_byte(self, base_position, index): + return self.str(base_position + index) + def fresh_copy(self, start): return UnicodeMatchContext(self.pattern, self._unicodestr, start, self.end, self.flags) diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py --- a/rpython/rlib/rsre/test/support.py +++ b/rpython/rlib/rsre/test/support.py @@ -25,7 +25,6 @@ """Concrete subclass for matching in a plain string, tweaked for tests""" ZERO = Position(0) - EXACT_DISTANCE = False def next(self, position): assert isinstance(position, Position) diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py --- a/rpython/rlib/rsre/test/test_search.py +++ b/rpython/rlib/rsre/test/test_search.py @@ -1,44 +1,45 @@ import re, py from rpython.rlib.rsre.test.test_match import get_code, get_code_and_re -from rpython.rlib.rsre.test.support import search, match, Position +from rpython.rlib.rsre.test import support +from rpython.rlib.rsre import rsre_core -class TestSearch: +class BaseTestSearch: def test_code1(self): r_code1 = get_code(r'[abc][def][ghi]') - res = 
search(r_code1, "fooahedixxx") + res = self.search(r_code1, "fooahedixxx") assert res is None - res = search(r_code1, "fooahcdixxx") + res = self.search(r_code1, "fooahcdixxx") assert res is not None assert res.span() == (5, 8) def test_code2(self): r_code2 = get_code(r'<item>\s*<title>(.*?)</title>') - res = search(r_code2, "foo bar <item> <title>abc</title>def") + res = self.search(r_code2, "foo bar <item> <title>abc</title>def") assert res is not None assert res.span() == (8, 34) def test_pure_literal(self): r_code3 = get_code(r'foobar') - res = search(r_code3, "foo bar foobar baz") + res = self.search(r_code3, "foo bar foobar baz") assert res is not None assert res.span() == (8, 14) def test_code3(self): r_code1 = get_code(r'<item>\s*<title>(.*?)</title>') - res = match(r_code1, "<item> <title>abc</title>def") + res = self.match(r_code1, "<item> <title>abc</title>def") assert res is not None def test_max_until_0_65535(self): r_code2 = get_code(r'<abc>(?:xy)*xy</abc>') - #res = match(r_code2, '<abc></abc>def') + #res = self.match(r_code2, '<abc></abc>def') #assert res is None - #res = match(r_code2, '<abc>xy</abc>def') + #res = self.match(r_code2, '<abc>xy</abc>def') #assert res is not None - res = match(r_code2, '<abc>xyxyxy</abc>def') + res = self.match(r_code2, '<abc>xyxyxy</abc>def') assert res is not None - res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def') + res = self.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def') assert res is not None def test_max_until_3_5(self): @@ -46,18 +47,18 @@ for i in range(8): s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef' assert (r.match(s) is not None) is (3 <= i-1 <= 5) - res = match(r_code2, s) + res = self.match(r_code2, s) assert (res is not None) is (3 <= i-1 <= 5) def test_min_until_0_65535(self): r_code2 = get_code(r'<abc>(?:xy)*?xy</abc>') - res = match(r_code2, '<abc></abc>def') + res = self.match(r_code2, '<abc></abc>def') assert res is None - res = match(r_code2, '<abc>xy</abc>def') + res = 
self.match(r_code2, '<abc>xy</abc>def') assert res is not None - res = match(r_code2, '<abc>xyxyxy</abc>def') + res = self.match(r_code2, '<abc>xyxyxy</abc>def') assert res is not None - res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def') + res = self.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def') assert res is not None def test_min_until_3_5(self): @@ -65,44 +66,44 @@ for i in range(8): s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef' assert (r.match(s) is not None) is (3 <= i-1 <= 5) - res = match(r_code2, s) + res = self.match(r_code2, s) assert (res is not None) is (3 <= i-1 <= 5) def test_min_repeat_one(self): r_code3 = get_code(r'<abc>.{3,5}?y') for i in range(8): - res = match(r_code3, '<abc>' + 'x'*i + 'y') + res = self.match(r_code3, '<abc>' + 'x'*i + 'y') assert (res is not None) is (3 <= i <= 5) def test_simple_group(self): r_code4 = get_code(r'<abc>(x.)</abc>') - res = match(r_code4, '<abc>xa</abc>def') + res = self.match(r_code4, '<abc>xa</abc>def') assert res is not None assert res.get_mark(0) == 5 assert res.get_mark(1) == 7 def test_max_until_groups(self): r_code4 = get_code(r'<abc>(x.)*xy</abc>') - res = match(r_code4, '<abc>xaxbxy</abc>def') + res = self.match(r_code4, '<abc>xaxbxy</abc>def') assert res is not None assert res.get_mark(0) == 7 assert res.get_mark(1) == 9 def test_group_branch(self): r_code5 = get_code(r'<abc>(ab|c)</abc>') - res = match(r_code5, '<abc>ab</abc>def') + res = self.match(r_code5, '<abc>ab</abc>def') assert (res.get_mark(0), res.get_mark(1)) == (5, 7) - res = match(r_code5, '<abc>c</abc>def') + res = self.match(r_code5, '<abc>c</abc>def') assert (res.get_mark(0), res.get_mark(1)) == (5, 6) - res = match(r_code5, '<abc>de</abc>def') + res = self.match(r_code5, '<abc>de</abc>def') assert res is None def test_group_branch_max_until(self): r_code6 = get_code(r'<abc>(ab|c)*a</abc>') - res = match(r_code6, '<abc>ccabcccaba</abc>def') + res = self.match(r_code6, '<abc>ccabcccaba</abc>def') assert (res.get_mark(0), 
res.get_mark(1)) == (12, 14) r_code7 = get_code(r'<abc>((ab)|(c))*a</abc>') - res = match(r_code7, '<abc>ccabcccaba</abc>def') + res = self.match(r_code7, '<abc>ccabcccaba</abc>def') assert (res.get_mark(0), res.get_mark(1)) == (12, 14) assert (res.get_mark(2), res.get_mark(3)) == (12, 14) assert (res.get_mark(4), res.get_mark(5)) == (11, 12) @@ -113,7 +114,7 @@ assert m.span(1) == (12, 13) assert m.span(3) == (12, 13) assert m.span(2) == (8, 9) - res = match(r_code7, '<abc>bbbabbbb</abc>') + res = self.match(r_code7, '<abc>bbbabbbb</abc>') assert (res.get_mark(0), res.get_mark(1)) == (12, 13) assert (res.get_mark(4), res.get_mark(5)) == (12, 13) assert (res.get_mark(2), res.get_mark(3)) == (8, 9) @@ -124,7 +125,7 @@ assert m.span(1) == (6, 7) assert m.span(3) == (6, 7) assert m.span(2) == (5, 6) - res = match(r_code8, '<abc>ab</abc>') + res = self.match(r_code8, '<abc>ab</abc>') assert (res.get_mark(0), res.get_mark(1)) == (6, 7) assert (res.get_mark(4), res.get_mark(5)) == (6, 7) assert (res.get_mark(2), res.get_mark(3)) == (5, 6) @@ -134,7 +135,7 @@ m = r9.match('xyzxc') assert m.span(1) == (3, 4) assert m.span(2) == (-1, -1) - res = match(r_code9, 'xyzxc') + res = self.match(r_code9, 'xyzxc') assert (res.get_mark(0), res.get_mark(1)) == (3, 4) assert (res.get_mark(2), res.get_mark(3)) == (-1, -1) @@ -142,8 +143,8 @@ r_code9, r9 = get_code_and_re(r'((x|yz)+?(y)??c)*') m = r9.match('xycxyzxc') assert m.span(2) == (6, 7) - #assert match.span(3) == (1, 2) --- bug of CPython - res = match(r_code9, 'xycxyzxc') + #assert self.match.span(3) == (1, 2) --- bug of CPython + res = self.match(r_code9, 'xycxyzxc') assert (res.get_mark(2), res.get_mark(3)) == (6, 7) assert (res.get_mark(4), res.get_mark(5)) == (1, 2) @@ -151,19 +152,19 @@ r_code, r = get_code_and_re(r'(a?)+y') assert r.match('y') assert r.match('aaayaaay').span() == (0, 4) - res = match(r_code, 'y') + res = self.match(r_code, 'y') assert res - res = match(r_code, 'aaayaaay') + res = self.match(r_code, 
'aaayaaay') assert res and res.span() == (0, 4) # r_code, r = get_code_and_re(r'(a?){4,6}y') assert r.match('y') - res = match(r_code, 'y') + res = self.match(r_code, 'y') assert res # r_code, r = get_code_and_re(r'(a?)*y') assert r.match('y') - res = match(r_code, 'y') + res = self.match(r_code, 'y') assert res def test_empty_maxuntil_2(self): @@ -173,24 +174,24 @@ py.test.skip("older version of the stdlib: %s" % (e,)) assert r.match('XfooXbarX').span() == (0, 5) assert r.match('XfooXbarX').span(1) == (4, 4) - res = match(r_code, 'XfooXbarX') + res = self.match(r_code, 'XfooXbarX') assert res.span() == (0, 5) assert res.span(1) == (4, 4) def test_empty_minuntil(self): r_code, r = get_code_and_re(r'(a?)+?y') #assert not r.match('z') -- CPython bug (at least 2.5) eats all memory - res = match(r_code, 'z') + res = self.match(r_code, 'z') assert not res # r_code, r = get_code_and_re(r'(a?){4,6}?y') assert not r.match('z') - res = match(r_code, 'z') + res = self.match(r_code, 'z') assert not res # r_code, r = get_code_and_re(r'(a?)*?y') #assert not r.match('z') -- CPython bug (at least 2.5) eats all memory - res = match(r_code, 'z') + res = self.match(r_code, 'z') assert not res def test_empty_search(self): @@ -198,15 +199,26 @@ for j in range(-2, 6): for i in range(-2, 6): match = r.search('abc', i, j) - res = search(r_code, 'abc', i, j) + res = self.search(r_code, 'abc', i, j) jk = min(max(j, 0), 3) ik = min(max(i, 0), 3) if ik <= jk: assert match is not None assert match.span() == (ik, ik) assert res is not None - assert res.match_start == Position(ik) - assert res.match_end == Position(ik) + assert res.match_start == self.Position(ik) + assert res.match_end == self.Position(ik) else: assert match is None assert res is None + + +class TestSearchCustom(BaseTestSearch): + search = staticmethod(support.search) + match = staticmethod(support.match) + Position = support.Position + +class TestSearchStr(BaseTestSearch): + search = staticmethod(rsre_core.search) + match = 
staticmethod(rsre_core.match) + Position = staticmethod(lambda n: n) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit