Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8-re
Changeset: r93312:b58a53172e21
Date: 2017-12-08 12:44 +0100
http://bitbucket.org/pypy/pypy/changeset/b58a53172e21/
Log: Remove slowly_convert_byte_pos_to_index diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -159,9 +159,6 @@ def prev_n(self, position, n, start_position): raise NotImplementedError @not_rpython - def slowly_convert_byte_pos_to_index(self, position): - raise NotImplementedError - @not_rpython def debug_check_pos(self, position): raise NotImplementedError @not_rpython @@ -178,15 +175,13 @@ raise NotImplementedError def get_mark(self, gid): - mark = find_mark(self.match_marks, gid) - return self.slowly_convert_byte_pos_to_index(mark) + return find_mark(self.match_marks, gid) def flatten_marks(self): # for testing if self.match_marks_flat is None: self._compute_flattened_marks() - return [self.slowly_convert_byte_pos_to_index(i) - for i in self.match_marks_flat] + return self.match_marks_flat def _compute_flattened_marks(self): self.match_marks_flat = [self.match_start, self.match_end] @@ -249,9 +244,6 @@ raise EndOfString return position - def slowly_convert_byte_pos_to_index(self, position): - return position - def debug_check_pos(self, position): pass diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py --- a/rpython/rlib/rsre/rsre_utf8.py +++ b/rpython/rlib/rsre/rsre_utf8.py @@ -3,16 +3,19 @@ from rpython.rlib.rarithmetic import r_uint, intmask from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString from rpython.rlib.rsre import rsre_char +from rpython.rlib.objectmodel import we_are_translated from rpython.rlib import rutf8 class Utf8MatchContext(AbstractMatchContext): + """A context that matches unicode, but encoded in a utf8 string. + Be careful because most positions taken by, handled in, and returned + by this class are expressed in *bytes*, not in characters. 
+ """ - def __init__(self, pattern, utf8string, index_storage, - match_start, end, flags): + def __init__(self, pattern, utf8string, match_start, end, flags): AbstractMatchContext.__init__(self, pattern, match_start, end, flags) self._utf8 = utf8string - self._index_storage = index_storage def str(self, index): check_nonneg(index) @@ -58,16 +61,15 @@ assert position >= 0 return position - def slowly_convert_byte_pos_to_index(self, position): - return rutf8.codepoint_index_at_byte_position( - self._utf8, self._index_storage, position) - def debug_check_pos(self, position): + if we_are_translated(): + return + if position == len(self._utf8): + return # end of string is fine assert not (0x80 <= self._utf8[position] < 0xC0) # continuation byte -def utf8search(pattern, utf8string, index_storage=None, bytestart=0, - byteend=sys.maxint, flags=0): +def utf8search(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0): # bytestart and byteend must be valid byte positions inside the # utf8string. 
from rpython.rlib.rsre.rsre_core import search_context @@ -76,11 +78,9 @@ assert 0 <= byteend if byteend > len(utf8string): byteend = len(utf8string) - if index_storage is None: # should be restricted to tests only - length = rutf8.check_utf8(utf8string, allow_surrogates=True) - index_storage = rutf8.create_utf8_index_storage(utf8string, length) - ctx = Utf8MatchContext(pattern, utf8string, index_storage, - bytestart, byteend, flags) + ctx = Utf8MatchContext(pattern, utf8string, bytestart, byteend, flags) + ctx.debug_check_pos(bytestart) + ctx.debug_check_pos(byteend) if search_context(ctx): return ctx else: diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py --- a/rpython/rlib/rsre/test/test_search.py +++ b/rpython/rlib/rsre/test/test_search.py @@ -12,19 +12,22 @@ assert res is None res = self.search(r_code1, "fooahcdixxx") assert res is not None - assert res.span() == (5, 8) + P = self.P + assert res.span() == (P(5), P(8)) def test_code2(self): r_code2 = get_code(r'<item>\s*<title>(.*?)</title>') res = self.search(r_code2, "foo bar <item> <title>abc</title>def") assert res is not None - assert res.span() == (8, 34) + P = self.P + assert res.span() == (P(8), P(34)) def test_pure_literal(self): r_code3 = get_code(r'foobar') res = self.search(r_code3, "foo bar foobar baz") assert res is not None - assert res.span() == (8, 14) + P = self.P + assert res.span() == (P(8), P(14)) def test_code3(self): r_code1 = get_code(r'<item>\s*<title>(.*?)</title>') @@ -79,34 +82,38 @@ r_code4 = get_code(r'<abc>(x.)</abc>') res = self.match(r_code4, '<abc>xa</abc>def') assert res is not None - assert res.get_mark(0) == 5 - assert res.get_mark(1) == 7 + P = self.P + assert res.get_mark(0) == P(5) + assert res.get_mark(1) == P(7) def test_max_until_groups(self): r_code4 = get_code(r'<abc>(x.)*xy</abc>') res = self.match(r_code4, '<abc>xaxbxy</abc>def') assert res is not None - assert res.get_mark(0) == 7 - assert res.get_mark(1) == 9 + P = self.P 
+ assert res.get_mark(0) == P(7) + assert res.get_mark(1) == P(9) def test_group_branch(self): r_code5 = get_code(r'<abc>(ab|c)</abc>') res = self.match(r_code5, '<abc>ab</abc>def') - assert (res.get_mark(0), res.get_mark(1)) == (5, 7) + P = self.P + assert (res.get_mark(0), res.get_mark(1)) == (P(5), P(7)) res = self.match(r_code5, '<abc>c</abc>def') - assert (res.get_mark(0), res.get_mark(1)) == (5, 6) + assert (res.get_mark(0), res.get_mark(1)) == (P(5), P(6)) res = self.match(r_code5, '<abc>de</abc>def') assert res is None def test_group_branch_max_until(self): r_code6 = get_code(r'<abc>(ab|c)*a</abc>') res = self.match(r_code6, '<abc>ccabcccaba</abc>def') - assert (res.get_mark(0), res.get_mark(1)) == (12, 14) + P = self.P + assert (res.get_mark(0), res.get_mark(1)) == (P(12), P(14)) r_code7 = get_code(r'<abc>((ab)|(c))*a</abc>') res = self.match(r_code7, '<abc>ccabcccaba</abc>def') - assert (res.get_mark(0), res.get_mark(1)) == (12, 14) - assert (res.get_mark(2), res.get_mark(3)) == (12, 14) - assert (res.get_mark(4), res.get_mark(5)) == (11, 12) + assert (res.get_mark(0), res.get_mark(1)) == (P(12), P(14)) + assert (res.get_mark(2), res.get_mark(3)) == (P(12), P(14)) + assert (res.get_mark(4), res.get_mark(5)) == (P(11), P(12)) def test_group_7(self): r_code7, r7 = get_code_and_re(r'<abc>((a)?(b))*</abc>') @@ -115,9 +122,10 @@ assert m.span(3) == (12, 13) assert m.span(2) == (8, 9) res = self.match(r_code7, '<abc>bbbabbbb</abc>') - assert (res.get_mark(0), res.get_mark(1)) == (12, 13) - assert (res.get_mark(4), res.get_mark(5)) == (12, 13) - assert (res.get_mark(2), res.get_mark(3)) == (8, 9) + P = self.P + assert (res.get_mark(0), res.get_mark(1)) == (P(12), P(13)) + assert (res.get_mark(4), res.get_mark(5)) == (P(12), P(13)) + assert (res.get_mark(2), res.get_mark(3)) == (P(8), P(9)) def test_group_branch_repeat_complex_case(self): r_code8, r8 = get_code_and_re(r'<abc>((a)|(b))*</abc>') @@ -126,9 +134,10 @@ assert m.span(3) == (6, 7) assert m.span(2) == 
(5, 6) res = self.match(r_code8, '<abc>ab</abc>') - assert (res.get_mark(0), res.get_mark(1)) == (6, 7) - assert (res.get_mark(4), res.get_mark(5)) == (6, 7) - assert (res.get_mark(2), res.get_mark(3)) == (5, 6) + P = self.P + assert (res.get_mark(0), res.get_mark(1)) == (P(6), P(7)) + assert (res.get_mark(4), res.get_mark(5)) == (P(6), P(7)) + assert (res.get_mark(2), res.get_mark(3)) == (P(5), P(6)) def test_minuntil_lastmark_restore(self): r_code9, r9 = get_code_and_re(r'(x|yz)+?(y)??c') @@ -136,7 +145,8 @@ assert m.span(1) == (3, 4) assert m.span(2) == (-1, -1) res = self.match(r_code9, 'xyzxc') - assert (res.get_mark(0), res.get_mark(1)) == (3, 4) + P = self.P + assert (res.get_mark(0), res.get_mark(1)) == (P(3), P(4)) assert (res.get_mark(2), res.get_mark(3)) == (-1, -1) def test_minuntil_bug(self): @@ -145,8 +155,9 @@ assert m.span(2) == (6, 7) #assert self.match.span(3) == (1, 2) --- bug of CPython res = self.match(r_code9, 'xycxyzxc') - assert (res.get_mark(2), res.get_mark(3)) == (6, 7) - assert (res.get_mark(4), res.get_mark(5)) == (1, 2) + P = self.P + assert (res.get_mark(2), res.get_mark(3)) == (P(6), P(7)) + assert (res.get_mark(4), res.get_mark(5)) == (P(1), P(2)) def test_empty_maxuntil(self): r_code, r = get_code_and_re(r'(a?)+y') @@ -155,7 +166,8 @@ res = self.match(r_code, 'y') assert res res = self.match(r_code, 'aaayaaay') - assert res and res.span() == (0, 4) + P = self.P + assert res and res.span() == (P(0), P(4)) # r_code, r = get_code_and_re(r'(a?){4,6}y') assert r.match('y') @@ -175,8 +187,9 @@ assert r.match('XfooXbarX').span() == (0, 5) assert r.match('XfooXbarX').span(1) == (4, 4) res = self.match(r_code, 'XfooXbarX') - assert res.span() == (0, 5) - assert res.span(1) == (4, 4) + P = self.P + assert res.span() == (P(0), P(5)) + assert res.span(1) == (P(4), P(4)) def test_empty_minuntil(self): r_code, r = get_code_and_re(r'(a?)+?y') @@ -206,8 +219,8 @@ assert match is not None assert match.span() == (ik, ik) assert res is not None - 
assert res.match_start == self.Position(ik) - assert res.match_end == self.Position(ik) + assert res.match_start == self.P(ik) + assert res.match_end == self.P(ik) else: assert match is None assert res is None @@ -216,14 +229,14 @@ class TestSearchCustom(BaseTestSearch): search = staticmethod(support.search) match = staticmethod(support.match) - Position = support.Position + P = support.Position class TestSearchStr(BaseTestSearch): search = staticmethod(rsre_core.search) match = staticmethod(rsre_core.match) - Position = staticmethod(lambda n: n) + P = staticmethod(lambda n: n) class TestSearchUtf8(BaseTestSearch): search = staticmethod(rsre_utf8.utf8search) match = staticmethod(rsre_utf8.utf8match) - Position = staticmethod(lambda n: n) # NB. only for plain ascii + P = staticmethod(lambda n: n) # NB. only for plain ascii _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit