Author: Armin Rigo <ar...@tunes.org> Branch: unicode-utf8-re Changeset: r93335:2114fde9ada8 Date: 2017-12-09 19:38 +0100 http://bitbucket.org/pypy/pypy/changeset/2114fde9ada8/
Log: in-progress diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -34,11 +34,14 @@ def slice_w(space, ctx, start, end, w_default): - if 0 <= start <= end: + # 'start' and 'end' are byte positions + if ctx.ZERO <= start <= end: if isinstance(ctx, rsre_core.BufMatchContext): return space.newbytes(ctx._buffer.getslice(start, end, 1, end-start)) if isinstance(ctx, rsre_core.StrMatchContext): + start = ctx._real_pos(start) + end = ctx._real_pos(end) return space.newbytes(ctx._string[start:end]) elif isinstance(ctx, rsre_utf8.Utf8MatchContext): XXXXXXX @@ -60,6 +63,7 @@ return None result = [-1] * (2 * num_groups) mark = ctx.match_marks + XXX while mark is not None: index = jit.promote(mark.gid) if result[index] == -1: @@ -70,6 +74,7 @@ @jit.look_inside_iff(lambda space, ctx, fmarks, num_groups, w_default: jit.isconstant(num_groups)) def allgroups_w(space, ctx, fmarks, num_groups, w_default): + XXX grps = [slice_w(space, ctx, fmarks[i * 2], fmarks[i * 2 + 1], w_default) for i in range(num_groups)] return space.newtuple(grps) @@ -138,8 +143,7 @@ pos = len(str) if endpos > len(str): endpos = len(str) - return rsre_core.StrMatchContext(self.code, str, - pos, endpos, self.flags) + return self._make_str_match_context(str, pos, endpos) else: buf = space.readbuf_w(w_string) size = buf.getlength() @@ -151,6 +155,11 @@ return rsre_core.BufMatchContext(self.code, buf, pos, endpos, self.flags) + def _make_str_match_context(self, str, pos, endpos): + # for tests to override + return rsre_core.StrMatchContext(self.code, str, + pos, endpos, self.flags) + def getmatch(self, ctx, found): if found: return W_SRE_Match(self, ctx) @@ -191,6 +200,7 @@ matchlist_w.append(w_item) no_progress = (ctx.match_start == ctx.match_end) ctx.reset(ctx.match_end + no_progress) + XXX # ^^^ return space.newlist(matchlist_w) @unwrap_spec(pos=int, endpos=int) @@ -215,6 +225,7 @@ if ctx.match_start == ctx.end: # or end of string break ctx.reset(ctx.match_end + 1) + XXX # ^^^ continue splitlist.append(slice_w(space, ctx, last, ctx.match_start, space.w_None)) @@ -283,7 +294,7 @@ sublist_w = strbuilder = unicodebuilder = None if use_builder: if filter_as_unicode is not None: - unicodebuilder = Utf8StringBuilder(ctx.end) + unicodebuilder = XXX #Utf8StringBuilder(ctx.end) else: assert filter_as_string is not None strbuilder = StringBuilder(ctx.end) @@ -499,18 +510,30 @@ @unwrap_spec(w_groupnum=WrappedDefault(0)) def start_w(self, w_groupnum): - return self.space.newint(self.do_span(w_groupnum)[0]) + start, end = self.do_span(w_groupnum) + start = self.bytepos_to_charindex(start) + return self.space.newint(start) @unwrap_spec(w_groupnum=WrappedDefault(0)) def end_w(self, w_groupnum): - return self.space.newint(self.do_span(w_groupnum)[1]) + start, end = self.do_span(w_groupnum) + end = self.bytepos_to_charindex(end) + return self.space.newint(end) @unwrap_spec(w_groupnum=WrappedDefault(0)) def span_w(self, w_groupnum): start, end = self.do_span(w_groupnum) + start = self.bytepos_to_charindex(start) + end = self.bytepos_to_charindex(end) return self.space.newtuple([self.space.newint(start), self.space.newint(end)]) + def bytepos_to_charindex(self, bytepos): + # Transform a 'byte position', as returned by all methods from + # rsre_core, back into a 'character index'. This is for UTF8 + # handling. + XXXX + def flatten_marks(self): if self.flatten_cache is None: num_groups = self.srepat.num_groups diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py --- a/pypy/module/_sre/test/test_app_sre.py +++ b/pypy/module/_sre/test/test_app_sre.py @@ -4,6 +4,8 @@ import py from py.test import raises, skip from pypy.interpreter.gateway import app2interp_temp +from pypy.module._sre import interp_sre +from rpython.rlib.rsre.test import support def init_app_test(cls, space): @@ -20,6 +22,33 @@ sys.path.pop(0) """) +def _test_sre_ctx_(self, str, start, end): + # Use the MatchContextForTests class, which handles Position + # instances instead of plain integers. This is used to detect when + # we're accepting or escaping a Position to app-level, which we + # should not: Positions are meant to be byte indexes inside a + # possibly UTF8 string, not character indexes. + start = support.Position(start) + end = support.Position(end) + return support.MatchContextForTests(self.code, str, start, end, self.flags) + +def _bytepos_to_charindex(self, bytepos): + return self.ctx._real_pos(bytepos) + +def setup_module(mod): + mod._org_maker = ( + interp_sre.W_SRE_Pattern._make_str_match_context, + interp_sre.W_SRE_Match.bytepos_to_charindex, + ) + interp_sre.W_SRE_Pattern._make_str_match_context = _test_sre_ctx_ + interp_sre.W_SRE_Match.bytepos_to_charindex = _bytepos_to_charindex + +def teardown_module(mod): + ( + interp_sre.W_SRE_Pattern._make_str_match_context, + interp_sre.W_SRE_Match.bytepos_to_charindex, + ) = mod._org_maker + class AppTestSrePy: def test_magic(self): diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -296,6 +296,9 @@ def get_single_byte(self, base_position, index): return self.str(base_position + index) + def _real_pos(self, index): + return index # overridden by tests + def fresh_copy(self, start): return StrMatchContext(self.pattern, self._string, start, self.end, self.flags) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit