[pypy-commit] pypy unicode-utf8-re: in-progress

arigo Sat, 09 Dec 2017 10:39:03 -0800

Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8-re
Changeset: r93335:2114fde9ada8
Date: 2017-12-09 19:38 +0100
http://bitbucket.org/pypy/pypy/changeset/2114fde9ada8/


Log:    in-progress

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -34,11 +34,14 @@
 
 
 def slice_w(space, ctx, start, end, w_default):
-    if 0 <= start <= end:
+    # 'start' and 'end' are byte positions
+    if ctx.ZERO <= start <= end:
         if isinstance(ctx, rsre_core.BufMatchContext):
             return space.newbytes(ctx._buffer.getslice(start, end, 1,
                                                         end-start))
         if isinstance(ctx, rsre_core.StrMatchContext):
+            start = ctx._real_pos(start)
+            end = ctx._real_pos(end)
             return space.newbytes(ctx._string[start:end])
         elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
             XXXXXXX
@@ -60,6 +63,7 @@
         return None
     result = [-1] * (2 * num_groups)
     mark = ctx.match_marks
+    XXX
     while mark is not None:
         index = jit.promote(mark.gid)
         if result[index] == -1:
@@ -70,6 +74,7 @@
 
 @jit.look_inside_iff(lambda space, ctx, fmarks, num_groups, w_default: jit.isconstant(num_groups))
 def allgroups_w(space, ctx, fmarks, num_groups, w_default):
+    XXX
     grps = [slice_w(space, ctx, fmarks[i * 2], fmarks[i * 2 + 1], w_default)
             for i in range(num_groups)]
     return space.newtuple(grps)
@@ -138,8 +143,7 @@
                 pos = len(str)
             if endpos > len(str):
                 endpos = len(str)
-            return rsre_core.StrMatchContext(self.code, str,
-                                             pos, endpos, self.flags)
+            return self._make_str_match_context(str, pos, endpos)
         else:
             buf = space.readbuf_w(w_string)
             size = buf.getlength()
@@ -151,6 +155,11 @@
             return rsre_core.BufMatchContext(self.code, buf,
                                              pos, endpos, self.flags)
 
+    def _make_str_match_context(self, str, pos, endpos):
+        # for tests to override
+        return rsre_core.StrMatchContext(self.code, str,
+                                         pos, endpos, self.flags)
+
     def getmatch(self, ctx, found):
         if found:
             return W_SRE_Match(self, ctx)
@@ -191,6 +200,7 @@
             matchlist_w.append(w_item)
             no_progress = (ctx.match_start == ctx.match_end)
             ctx.reset(ctx.match_end + no_progress)
+            XXX #                   ^^^
         return space.newlist(matchlist_w)
 
     @unwrap_spec(pos=int, endpos=int)
@@ -215,6 +225,7 @@
                 if ctx.match_start == ctx.end:       # or end of string
                     break
                 ctx.reset(ctx.match_end + 1)
+                XXX   #                 ^^^
                 continue
             splitlist.append(slice_w(space, ctx, last, ctx.match_start,
                                      space.w_None))
@@ -283,7 +294,7 @@
         sublist_w = strbuilder = unicodebuilder = None
         if use_builder:
             if filter_as_unicode is not None:
-                unicodebuilder = Utf8StringBuilder(ctx.end)
+                unicodebuilder = XXX  #Utf8StringBuilder(ctx.end)
             else:
                 assert filter_as_string is not None
                 strbuilder = StringBuilder(ctx.end)
@@ -499,18 +510,30 @@
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def start_w(self, w_groupnum):
-        return self.space.newint(self.do_span(w_groupnum)[0])
+        start, end = self.do_span(w_groupnum)
+        start = self.bytepos_to_charindex(start)
+        return self.space.newint(start)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def end_w(self, w_groupnum):
-        return self.space.newint(self.do_span(w_groupnum)[1])
+        start, end = self.do_span(w_groupnum)
+        end = self.bytepos_to_charindex(end)
+        return self.space.newint(end)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def span_w(self, w_groupnum):
         start, end = self.do_span(w_groupnum)
+        start = self.bytepos_to_charindex(start)
+        end = self.bytepos_to_charindex(end)
         return self.space.newtuple([self.space.newint(start),
                                     self.space.newint(end)])
 
+    def bytepos_to_charindex(self, bytepos):
+        # Transform a 'byte position', as returned by all methods from
+        # rsre_core, back into a 'character index'.  This is for UTF8
+        # handling.
+        XXXX
+
     def flatten_marks(self):
         if self.flatten_cache is None:
             num_groups = self.srepat.num_groups
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -4,6 +4,8 @@
 import py
 from py.test import raises, skip
 from pypy.interpreter.gateway import app2interp_temp
+from pypy.module._sre import interp_sre
+from rpython.rlib.rsre.test import support
 
 
 def init_app_test(cls, space):
@@ -20,6 +22,33 @@
             sys.path.pop(0)
         """)
 
+def _test_sre_ctx_(self, str, start, end):
+    # Use the MatchContextForTests class, which handles Position
+    # instances instead of plain integers.  This is used to detect when
+    # we're accepting or escaping a Position to app-level, which we
+    # should not: Positions are meant to be byte indexes inside a
+    # possibly UTF8 string, not character indexes.
+    start = support.Position(start)
+    end = support.Position(end)
+    return support.MatchContextForTests(self.code, str, start, end, self.flags)
+
+def _bytepos_to_charindex(self, bytepos):
+    return self.ctx._real_pos(bytepos)
+
+def setup_module(mod):
+    mod._org_maker = (
+        interp_sre.W_SRE_Pattern._make_str_match_context,
+        interp_sre.W_SRE_Match.bytepos_to_charindex,
+        )
+    interp_sre.W_SRE_Pattern._make_str_match_context = _test_sre_ctx_
+    interp_sre.W_SRE_Match.bytepos_to_charindex = _bytepos_to_charindex
+
+def teardown_module(mod):
+    (
+        interp_sre.W_SRE_Pattern._make_str_match_context,
+        interp_sre.W_SRE_Match.bytepos_to_charindex,
+    ) = mod._org_maker
+
 
 class AppTestSrePy:
     def test_magic(self):
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -296,6 +296,9 @@
     def get_single_byte(self, base_position, index):
         return self.str(base_position + index)
 
+    def _real_pos(self, index):
+        return index     # overridden by tests
+
     def fresh_copy(self, start):
         return StrMatchContext(self.pattern, self._string, start,
                                self.end, self.flags)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-re: in-progress

Reply via email to