Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8-re
Changeset: r93337:6b113f6d5350
Date: 2017-12-09 20:30 +0100
http://bitbucket.org/pypy/pypy/changeset/6b113f6d5350/
Log: Tests and fixes

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -115,7 +115,9 @@
         if endpos < pos:
             endpos = pos
         if space.isinstance_w(w_string, space.w_unicode):
-            utf8str, length = space.utf8_len_w(w_string)
+            w_unicode_obj = space.convert_arg_to_w_unicode(w_string)
+            utf8str = w_unicode_obj._utf8
+            length = w_unicode_obj._len()
             if pos <= 0:
                 bytepos = 0
             elif pos >= length:
@@ -127,8 +129,12 @@
             else:
                 endbytepos = rutf8.codepoint_at_index(utf8str, index_storage,
                                                       endpos)
-            return rsre_utf8.Utf8MatchContext(
+            ctx = rsre_utf8.Utf8MatchContext(
                 self.code, utf8str, bytepos, endbytepos, self.flags)
+            # xxx we store the w_string on the ctx too, for
+            # W_SRE_Match.bytepos_to_charindex()
+            ctx.w_unicode_obj = w_unicode_obj
+            return ctx
         elif space.isinstance_w(w_string, space.w_bytes):
             str = space.bytes_w(w_string)
             if pos > len(str):
@@ -520,7 +526,13 @@
         # Transform a 'byte position', as returned by all methods from
         # rsre_core, back into a 'character index'. This is for UTF8
         # handling.
-        XXXX
+        ctx = self.ctx
+        if isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            index_storage = ctx.w_unicode_obj._get_index_storage()
+            return rutf8.codepoint_index_at_byte_position(
+                ctx.w_unicode_obj._utf8, index_storage, bytepos)
+        else:
+            return bytepos

     def flatten_marks(self):
         if self.flatten_cache is None:
@@ -603,9 +615,8 @@
         elif isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string)
         elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
-            XXXXXXXX
-            lgt = rutf8.check_utf8(ctx._unicodestr, True)
-            return space.newutf8(ctx._unicodestr, lgt)
+            lgt = rutf8.get_utf8_length(ctx._utf8)
+            return space.newutf8(ctx._utf8, lgt)
         else:
             raise SystemError

diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -35,7 +35,7 @@
 def _bytepos_to_charindex(self, bytepos):
     if isinstance(self.ctx, support.MatchContextForTests):
         return self.ctx._real_pos(bytepos)
-    return bytepos
+    return _org_maker[1](self, bytepos)

 def setup_module(mod):
     mod._org_maker = (
@@ -1037,3 +1037,15 @@
         import re
         assert re.search(".+ab", "wowowowawoabwowo")
         assert None == re.search(".+ab", "wowowaowowo")
+
+
+class AppTestUnicodeExtra:
+    def test_string_attribute(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.string == u"\u1233\u1234\u1235"
+
+    def test_match_start(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.start() == 1
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit