[pypy-commit] pypy unicode-utf8-re: Tests and fixes

arigo Sat, 09 Dec 2017 11:31:54 -0800

Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8-re
Changeset: r93337:6b113f6d5350
Date: 2017-12-09 20:30 +0100
http://bitbucket.org/pypy/pypy/changeset/6b113f6d5350/


Log:    Tests and fixes

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -115,7 +115,9 @@
         if endpos < pos:
             endpos = pos
         if space.isinstance_w(w_string, space.w_unicode):
-            utf8str, length = space.utf8_len_w(w_string)
+            w_unicode_obj = space.convert_arg_to_w_unicode(w_string)
+            utf8str = w_unicode_obj._utf8
+            length = w_unicode_obj._len()
             if pos <= 0:
                 bytepos = 0
             elif pos >= length:
@@ -127,8 +129,12 @@
             else:
                 endbytepos = rutf8.codepoint_at_index(utf8str, index_storage,
                                                       endpos)
-            return rsre_utf8.Utf8MatchContext(
+            ctx = rsre_utf8.Utf8MatchContext(
                 self.code, utf8str, bytepos, endbytepos, self.flags)
+            # xxx we store the w_string on the ctx too, for
+            # W_SRE_Match.bytepos_to_charindex()
+            ctx.w_unicode_obj = w_unicode_obj
+            return ctx
         elif space.isinstance_w(w_string, space.w_bytes):
             str = space.bytes_w(w_string)
             if pos > len(str):
@@ -520,7 +526,13 @@
         # Transform a 'byte position', as returned by all methods from
         # rsre_core, back into a 'character index'.  This is for UTF8
         # handling.
-        XXXX
+        ctx = self.ctx
+        if isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            index_storage = ctx.w_unicode_obj._get_index_storage()
+            return rutf8.codepoint_index_at_byte_position(
+                ctx.w_unicode_obj._utf8, index_storage, bytepos)
+        else:
+            return bytepos
 
     def flatten_marks(self):
         if self.flatten_cache is None:
@@ -603,9 +615,8 @@
         elif isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string)
         elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
-            XXXXXXXX
-            lgt = rutf8.check_utf8(ctx._unicodestr, True)
-            return space.newutf8(ctx._unicodestr, lgt)
+            lgt = rutf8.get_utf8_length(ctx._utf8)
+            return space.newutf8(ctx._utf8, lgt)
         else:
             raise SystemError
 
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -35,7 +35,7 @@
 def _bytepos_to_charindex(self, bytepos):
     if isinstance(self.ctx, support.MatchContextForTests):
         return self.ctx._real_pos(bytepos)
-    return bytepos
+    return _org_maker[1](self, bytepos)
 
 def setup_module(mod):
     mod._org_maker = (
@@ -1037,3 +1037,15 @@
         import re
         assert re.search(".+ab", "wowowowawoabwowo")
         assert None == re.search(".+ab", "wowowaowowo")
+
+
+class AppTestUnicodeExtra:
+    def test_string_attribute(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.string == u"\u1233\u1234\u1235"
+
+    def test_match_start(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.start() == 1
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-re: Tests and fixes

Reply via email to