Author: Armin Rigo <ar...@tunes.org> Branch: unicode-utf8-re Changeset: r93244:4b6473b3ea05 Date: 2017-12-03 15:44 +0100 http://bitbucket.org/pypy/pypy/changeset/4b6473b3ea05/
Log: in-progress diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -1164,7 +1164,7 @@ if sre_match(ctx, base, start, None) is not None: ctx.match_start = start return True - start += 1 + start = ctx.next(start) return False install_jitdriver_spec('FastSearch', @@ -1183,6 +1183,8 @@ prefix_len = ctx.pat(5) assert prefix_len >= 0 i = 0 + j = 0 + past_start_positions = [0] * (prefix_len - 1) while True: ctx.jitdriver_FastSearch.jit_merge_point(ctx=ctx, string_position=string_position, i=i, prefix_len=prefix_len) @@ -1196,10 +1198,26 @@ i += 1 if i == prefix_len: # found a potential match - start = string_position + 1 - prefix_len - assert start >= 0 + + # This would be 'start = string_position + 1 - prefix_len' + # but it's probably faster to record the 'prefix_len' + # most recent locations, for utf8 + start = past_start_positions[j] + assert start >= ctx.ZERO prefix_skip = ctx.pat(6) - ptr = start + prefix_skip + if prefix_skip >= prefix_len - 1: + try: + ptr = ctx.next_n(string_position, + prefix_skip - (prefix_len - 1), + ctx.end) + except EndOfString: + ptr = -1 + else: + assert prefix_skip < prefix_len - 1 + j_prefix_skip = j + prefix_skip + if j_prefix_skip >= prefix_len - 1: + j_prefix_skip -= (prefix_len - 1) + ptr = past_start_positions[j_prefix_skip] #flags = ctx.pat(2) #if flags & rsre_char.SRE_INFO_LITERAL: # # matched all of pure literal pattern @@ -1209,11 +1227,16 @@ # return True pattern_offset = ctx.pat(1) + 1 ppos_start = pattern_offset + 2 * prefix_skip - if sre_match(ctx, ppos_start, ptr, None) is not None: + if (ptr >= ctx.ZERO and + sre_match(ctx, ppos_start, ptr, None) is not None): ctx.match_start = start return True overlap_offset = prefix_len + (7 - 1) i = ctx.pat(overlap_offset + i) - string_position += 1 + past_start_positions[j] = string_position + string_position = ctx.next(string_position) if string_position >= ctx.end: return False + j += 1 + if j == prefix_len - 1: + j = 0 diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py --- a/rpython/rlib/rsre/test/support.py +++ b/rpython/rlib/rsre/test/support.py @@ -1,6 +1,6 @@ import sys, random from rpython.rlib import debug -from rpython.rlib.rsre.rsre_core import _adjust, match_context +from rpython.rlib.rsre.rsre_core import _adjust, match_context, search_context from rpython.rlib.rsre.rsre_core import StrMatchContext, EndOfString @@ -112,3 +112,13 @@ def fullmatch(pattern, string, start=0, end=sys.maxint, flags=0): return match(pattern, string, start, end, flags, fullmatch=True) + +def search(pattern, string, start=0, end=sys.maxint, flags=0): + start, end = _adjust(start, end, len(string)) + start = Position(start) + end = Position(end) + ctx = MatchContextForTests(pattern, string, start, end, flags) + if search_context(ctx): + return ctx + else: + return None diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py --- a/rpython/rlib/rsre/test/test_search.py +++ b/rpython/rlib/rsre/test/test_search.py @@ -1,44 +1,44 @@ import re, py -from rpython.rlib.rsre import rsre_core from rpython.rlib.rsre.test.test_match import get_code, get_code_and_re +from rpython.rlib.rsre.test.support import search, match class TestSearch: def test_code1(self): r_code1 = get_code(r'[abc][def][ghi]') - res = rsre_core.search(r_code1, "fooahedixxx") + res = search(r_code1, "fooahedixxx") assert res is None - res = rsre_core.search(r_code1, "fooahcdixxx") + res = search(r_code1, "fooahcdixxx") assert res is not None assert res.span() == (5, 8) def test_code2(self): r_code2 = get_code(r'<item>\s*<title>(.*?)</title>') - res = rsre_core.search(r_code2, "foo bar <item> <title>abc</title>def") + res = search(r_code2, "foo bar <item> <title>abc</title>def") assert res is not None assert res.span() == (8, 34) def test_pure_literal(self): r_code3 = get_code(r'foobar') - res = rsre_core.search(r_code3, "foo bar foobar baz") + res = search(r_code3, "foo bar foobar baz") assert res is not None assert res.span() == (8, 14) def test_code3(self): r_code1 = get_code(r'<item>\s*<title>(.*?)</title>') - res = rsre_core.match(r_code1, "<item> <title>abc</title>def") + res = match(r_code1, "<item> <title>abc</title>def") assert res is not None def test_max_until_0_65535(self): r_code2 = get_code(r'<abc>(?:xy)*xy</abc>') - #res = rsre_core.match(r_code2, '<abc></abc>def') + #res = match(r_code2, '<abc></abc>def') #assert res is None - #res = rsre_core.match(r_code2, '<abc>xy</abc>def') + #res = match(r_code2, '<abc>xy</abc>def') #assert res is not None - res = rsre_core.match(r_code2, '<abc>xyxyxy</abc>def') + res = match(r_code2, '<abc>xyxyxy</abc>def') assert res is not None - res = rsre_core.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def') + res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def') assert res is not None def test_max_until_3_5(self): @@ -46,18 +46,18 @@ for i in range(8): s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef' assert (r.match(s) is not None) is (3 <= i-1 <= 5) - res = rsre_core.match(r_code2, s) + res = match(r_code2, s) assert (res is not None) is (3 <= i-1 <= 5) def test_min_until_0_65535(self): r_code2 = get_code(r'<abc>(?:xy)*?xy</abc>') - res = rsre_core.match(r_code2, '<abc></abc>def') + res = match(r_code2, '<abc></abc>def') assert res is None - res = rsre_core.match(r_code2, '<abc>xy</abc>def') + res = match(r_code2, '<abc>xy</abc>def') assert res is not None - res = rsre_core.match(r_code2, '<abc>xyxyxy</abc>def') + res = match(r_code2, '<abc>xyxyxy</abc>def') assert res is not None - res = rsre_core.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def') + res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def') assert res is not None def test_min_until_3_5(self): @@ -65,44 +65,44 @@ for i in range(8): s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef' assert (r.match(s) is not None) is (3 <= i-1 <= 5) - res = rsre_core.match(r_code2, s) + res = match(r_code2, s) assert (res is not None) is (3 <= i-1 <= 5) def test_min_repeat_one(self): r_code3 = get_code(r'<abc>.{3,5}?y') for i in range(8): - res = rsre_core.match(r_code3, '<abc>' + 'x'*i + 'y') + res = match(r_code3, '<abc>' + 'x'*i + 'y') assert (res is not None) is (3 <= i <= 5) def test_simple_group(self): r_code4 = get_code(r'<abc>(x.)</abc>') - res = rsre_core.match(r_code4, '<abc>xa</abc>def') + res = match(r_code4, '<abc>xa</abc>def') assert res is not None assert res.get_mark(0) == 5 assert res.get_mark(1) == 7 def test_max_until_groups(self): r_code4 = get_code(r'<abc>(x.)*xy</abc>') - res = rsre_core.match(r_code4, '<abc>xaxbxy</abc>def') + res = match(r_code4, '<abc>xaxbxy</abc>def') assert res is not None assert res.get_mark(0) == 7 assert res.get_mark(1) == 9 def test_group_branch(self): r_code5 = get_code(r'<abc>(ab|c)</abc>') - res = rsre_core.match(r_code5, '<abc>ab</abc>def') + res = match(r_code5, '<abc>ab</abc>def') assert (res.get_mark(0), res.get_mark(1)) == (5, 7) - res = rsre_core.match(r_code5, '<abc>c</abc>def') + res = match(r_code5, '<abc>c</abc>def') assert (res.get_mark(0), res.get_mark(1)) == (5, 6) - res = rsre_core.match(r_code5, '<abc>de</abc>def') + res = match(r_code5, '<abc>de</abc>def') assert res is None def test_group_branch_max_until(self): r_code6 = get_code(r'<abc>(ab|c)*a</abc>') - res = rsre_core.match(r_code6, '<abc>ccabcccaba</abc>def') + res = match(r_code6, '<abc>ccabcccaba</abc>def') assert (res.get_mark(0), res.get_mark(1)) == (12, 14) r_code7 = get_code(r'<abc>((ab)|(c))*a</abc>') - res = rsre_core.match(r_code7, '<abc>ccabcccaba</abc>def') + res = match(r_code7, '<abc>ccabcccaba</abc>def') assert (res.get_mark(0), res.get_mark(1)) == (12, 14) assert (res.get_mark(2), res.get_mark(3)) == (12, 14) assert (res.get_mark(4), res.get_mark(5)) == (11, 12) @@ -113,7 +113,7 @@ assert match.span(1) == (12, 13) assert match.span(3) == (12, 13) assert match.span(2) == (8, 9) - res = rsre_core.match(r_code7, '<abc>bbbabbbb</abc>') + res = match(r_code7, '<abc>bbbabbbb</abc>') assert (res.get_mark(0), res.get_mark(1)) == (12, 13) assert (res.get_mark(4), res.get_mark(5)) == (12, 13) assert (res.get_mark(2), res.get_mark(3)) == (8, 9) @@ -124,7 +124,7 @@ assert match.span(1) == (6, 7) assert match.span(3) == (6, 7) assert match.span(2) == (5, 6) - res = rsre_core.match(r_code8, '<abc>ab</abc>') + res = match(r_code8, '<abc>ab</abc>') assert (res.get_mark(0), res.get_mark(1)) == (6, 7) assert (res.get_mark(4), res.get_mark(5)) == (6, 7) assert (res.get_mark(2), res.get_mark(3)) == (5, 6) @@ -134,7 +134,7 @@ match = r9.match('xyzxc') assert match.span(1) == (3, 4) assert match.span(2) == (-1, -1) - res = rsre_core.match(r_code9, 'xyzxc') + res = match(r_code9, 'xyzxc') assert (res.get_mark(0), res.get_mark(1)) == (3, 4) assert (res.get_mark(2), res.get_mark(3)) == (-1, -1) @@ -143,7 +143,7 @@ match = r9.match('xycxyzxc') assert match.span(2) == (6, 7) #assert match.span(3) == (1, 2) --- bug of CPython - res = rsre_core.match(r_code9, 'xycxyzxc') + res = match(r_code9, 'xycxyzxc') assert (res.get_mark(2), res.get_mark(3)) == (6, 7) assert (res.get_mark(4), res.get_mark(5)) == (1, 2) @@ -151,19 +151,19 @@ r_code, r = get_code_and_re(r'(a?)+y') assert r.match('y') assert r.match('aaayaaay').span() == (0, 4) - res = rsre_core.match(r_code, 'y') + res = match(r_code, 'y') assert res - res = rsre_core.match(r_code, 'aaayaaay') + res = match(r_code, 'aaayaaay') assert res and res.span() == (0, 4) # r_code, r = get_code_and_re(r'(a?){4,6}y') assert r.match('y') - res = rsre_core.match(r_code, 'y') + res = match(r_code, 'y') assert res # r_code, r = get_code_and_re(r'(a?)*y') assert r.match('y') - res = rsre_core.match(r_code, 'y') + res = match(r_code, 'y') assert res def test_empty_maxuntil_2(self): @@ -173,24 +173,24 @@ py.test.skip("older version of the stdlib: %s" % (e,)) assert r.match('XfooXbarX').span() == (0, 5) assert r.match('XfooXbarX').span(1) == (4, 4) - res = rsre_core.match(r_code, 'XfooXbarX') + res = match(r_code, 'XfooXbarX') assert res.span() == (0, 5) assert res.span(1) == (4, 4) def test_empty_minuntil(self): r_code, r = get_code_and_re(r'(a?)+?y') #assert not r.match('z') -- CPython bug (at least 2.5) eats all memory - res = rsre_core.match(r_code, 'z') + res = match(r_code, 'z') assert not res # r_code, r = get_code_and_re(r'(a?){4,6}?y') assert not r.match('z') - res = rsre_core.match(r_code, 'z') + res = match(r_code, 'z') assert not res # r_code, r = get_code_and_re(r'(a?)*?y') #assert not r.match('z') -- CPython bug (at least 2.5) eats all memory - res = rsre_core.match(r_code, 'z') + res = match(r_code, 'z') assert not res def test_empty_search(self): @@ -198,7 +198,7 @@ for j in range(-2, 6): for i in range(-2, 6): match = r.search('abc', i, j) - res = rsre_core.search(r_code, 'abc', i, j) + res = search(r_code, 'abc', i, j) jk = min(max(j, 0), 3) ik = min(max(i, 0), 3) if ik <= jk: _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit