Author: Armin Rigo <[email protected]>
Branch: unicode-utf8-re
Changeset: r93337:6b113f6d5350
Date: 2017-12-09 20:30 +0100
http://bitbucket.org/pypy/pypy/changeset/6b113f6d5350/
Log: Tests and fixes
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -115,7 +115,9 @@
if endpos < pos:
endpos = pos
if space.isinstance_w(w_string, space.w_unicode):
- utf8str, length = space.utf8_len_w(w_string)
+ w_unicode_obj = space.convert_arg_to_w_unicode(w_string)
+ utf8str = w_unicode_obj._utf8
+ length = w_unicode_obj._len()
if pos <= 0:
bytepos = 0
elif pos >= length:
@@ -127,8 +129,12 @@
else:
endbytepos = rutf8.codepoint_at_index(utf8str, index_storage,
endpos)
- return rsre_utf8.Utf8MatchContext(
+ ctx = rsre_utf8.Utf8MatchContext(
self.code, utf8str, bytepos, endbytepos, self.flags)
+ # xxx we store the w_string on the ctx too, for
+ # W_SRE_Match.bytepos_to_charindex()
+ ctx.w_unicode_obj = w_unicode_obj
+ return ctx
elif space.isinstance_w(w_string, space.w_bytes):
str = space.bytes_w(w_string)
if pos > len(str):
@@ -520,7 +526,13 @@
# Transform a 'byte position', as returned by all methods from
# rsre_core, back into a 'character index'. This is for UTF8
# handling.
- XXXX
+ ctx = self.ctx
+ if isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ index_storage = ctx.w_unicode_obj._get_index_storage()
+ return rutf8.codepoint_index_at_byte_position(
+ ctx.w_unicode_obj._utf8, index_storage, bytepos)
+ else:
+ return bytepos
def flatten_marks(self):
if self.flatten_cache is None:
@@ -603,9 +615,8 @@
elif isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string)
elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
- XXXXXXXX
- lgt = rutf8.check_utf8(ctx._unicodestr, True)
- return space.newutf8(ctx._unicodestr, lgt)
+ lgt = rutf8.get_utf8_length(ctx._utf8)
+ return space.newutf8(ctx._utf8, lgt)
else:
raise SystemError
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -35,7 +35,7 @@
def _bytepos_to_charindex(self, bytepos):
if isinstance(self.ctx, support.MatchContextForTests):
return self.ctx._real_pos(bytepos)
- return bytepos
+ return _org_maker[1](self, bytepos)
def setup_module(mod):
mod._org_maker = (
@@ -1037,3 +1037,15 @@
import re
assert re.search(".+ab", "wowowowawoabwowo")
assert None == re.search(".+ab", "wowowaowowo")
+
+
+class AppTestUnicodeExtra:
+ def test_string_attribute(self):
+ import re
+ match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+ assert match.string == u"\u1233\u1234\u1235"
+
+ def test_match_start(self):
+ import re
+ match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+ assert match.start() == 1
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit