[pypy-commit] pypy unicode-utf8: Tests and fixes

arigo Tue, 12 Dec 2017 00:47:10 -0800

Author: Armin Rigo <[email protected]>
Branch: unicode-utf8
Changeset: r93384:d978136e55f3
Date: 2017-12-12 09:46 +0100
http://bitbucket.org/pypy/pypy/changeset/d978136e55f3/


Log:    Tests and fixes

        * cannot access a position greater than ctx.end; need some small
        refactorings and added an assert

        * w_unicode_obj needs to be copied by fresh_copy() too, so move it
        to interp_sre where it really belongs

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -156,6 +156,19 @@
             return rsre_core.BufMatchContext(self.code, buf,
                                              pos, endpos, self.flags)
 
+    def fresh_copy(self, ctx, start):
+        if isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            result = rsre_utf8.Utf8MatchContext(
+                ctx.pattern, ctx._utf8, start, ctx.end, ctx.flags)
+            result.w_unicode_obj = ctx.w_unicode_obj
+            return result
+        if isinstance(ctx, rsre_core.StrMatchContext):
+            return self._make_str_match_context(ctx._string, start, ctx.end)
+        if isinstance(ctx, rsre_core.BufMatchContext):
+            return rsre_core.BufMatchContext(
+                ctx.pattern, ctx._buffer, start, ctx.end, ctx.flags)
+        raise AssertionError("bad ctx type")
+
     def _make_str_match_context(self, str, pos, endpos):
         # for tests to override
         return rsre_core.StrMatchContext(self.code, str,
@@ -182,7 +195,7 @@
         space = self.space
         matchlist_w = []
         ctx = self.make_ctx(w_string, pos, endpos)
-        while ctx.match_start <= ctx.end:
+        while True:
             if not searchcontext(space, ctx):
                 break
             num_groups = self.num_groups
@@ -201,6 +214,8 @@
             matchlist_w.append(w_item)
             reset_at = ctx.match_end
             if ctx.match_start == ctx.match_end:
+                if reset_at == ctx.end:
+                    break
                 reset_at = ctx.next_indirect(reset_at)
             ctx.reset(reset_at)
         return space.newlist(matchlist_w)
@@ -321,9 +336,6 @@
                 _sub_append_slice(
                     ctx, space, use_builder, sublist_w,
                     strbuilder, last_pos, ctx.match_start)
-            start = ctx.match_end
-            if start == ctx.match_start:
-                start = ctx.next_indirect(start)
             if not (last_pos == ctx.match_start
                              == ctx.match_end and n > 0):
                 # the above ignores empty matches on latest position
@@ -345,6 +357,12 @@
                 n += 1
             elif last_pos >= ctx.end:
                 break    # empty match at the end: finished
+
+            start = ctx.match_end
+            if start == ctx.match_start:
+                if start == ctx.end:
+                    break
+                start = ctx.next_indirect(start)
             ctx.reset(start)
 
         if last_pos < ctx.end:
@@ -663,40 +681,52 @@
         self.srepat = pattern
         self.ctx = ctx
         # 'self.ctx' is always a fresh context in which no searching
-        # or matching succeeded so far.
+        # or matching succeeded so far.  It is None when the iterator is
+        # exhausted.
 
     def iter_w(self):
         return self
 
     def next_w(self):
-        if self.ctx.match_start > self.ctx.end:
+        if self.ctx is None:
             raise OperationError(self.space.w_StopIteration, self.space.w_None)
         if not searchcontext(self.space, self.ctx):
             raise OperationError(self.space.w_StopIteration, self.space.w_None)
         return self.getmatch(True)
 
     def match_w(self):
-        if self.ctx.match_start > self.ctx.end:
+        if self.ctx is None:
             return self.space.w_None
         return self.getmatch(matchcontext(self.space, self.ctx))
 
     def search_w(self):
-        if self.ctx.match_start > self.ctx.end:
+        if self.ctx is None:
             return self.space.w_None
         return self.getmatch(searchcontext(self.space, self.ctx))
 
     def getmatch(self, found):
+        ctx = self.ctx
+        assert ctx is not None
         if found:
-            ctx = self.ctx
             nextstart = ctx.match_end
+            exhausted = False
             if ctx.match_start == nextstart:
-                nextstart = ctx.next_indirect(nextstart)
-            self.ctx = ctx.fresh_copy(nextstart)
+                if nextstart == ctx.end:
+                    exhausted = True
+                else:
+                    nextstart = ctx.next_indirect(nextstart)
+            if exhausted:
+                self.ctx = None
+            else:
+                self.ctx = self.srepat.fresh_copy(ctx, nextstart)
             match = W_SRE_Match(self.srepat, ctx)
             return match
         else:
             # obscure corner case
-            self.ctx.match_start = self.ctx.next_indirect(self.ctx.match_start)
+            if ctx.match_start == ctx.end:
+                self.ctx = None
+            else:
+                ctx.match_start = ctx.next_indirect(ctx.match_start)
             return None
 
 W_SRE_Scanner.typedef = TypeDef(
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -28,8 +28,10 @@
     # we're accepting or escaping a Position to app-level, which we
     # should not: Positions are meant to be byte indexes inside a
     # possibly UTF8 string, not character indexes.
-    start = support.Position(start)
-    end = support.Position(end)
+    if not isinstance(start, support.Position):
+        start = support.Position(start)
+    if not isinstance(end, support.Position):
+        end = support.Position(end)
     return support.MatchContextForTests(self.code, str, start, end, self.flags)
 
 def _bytepos_to_charindex(self, bytepos):
@@ -140,6 +142,9 @@
         assert ['', 'a', 'l', 'a', 'lla'] == re.split("b(a)", "balballa")
         assert ['', 'a', None, 'l', 'u', None, 'lla'] == (
             re.split("b([ua]|(s))", "balbulla"))
+        assert ["abc"] == re.split("", "abc")
+        assert ["abc"] == re.split("X?", "abc")
+        assert ["a", "c"] == re.split("b?", "abc")
 
     def test_weakref(self):
         import re, _weakref
@@ -253,6 +258,7 @@
         assert "rbd\nbr\n" == re.sub("a(.)", r"b\1\n", "radar")
         assert ("rbd\nbr\n", 2) == re.subn("a(.)", r"b\1\n", "radar")
         assert ("bbbba", 2) == re.subn("a", "b", "ababa", 2)
+        assert "XaXbXcX" == re.sub("", "X", "abc")
 
     def test_sub_unicode(self):
         import re
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -173,6 +173,7 @@
     def go_forward_by_bytes(self, base_position, index):
         return base_position + index
     def next_indirect(self, position):
+        assert position < self.end
         return position + 1     # like next(), but can be called indirectly
     def prev_indirect(self, position):
         position -= 1           # like prev(), but can be called indirectly
@@ -213,9 +214,6 @@
             return (-1, -1)
         return (fmarks[groupnum], fmarks[groupnum+1])
 
-    def fresh_copy(self, start):
-        raise NotImplementedError
-
 
 class FixedMatchContext(AbstractMatchContext):
     """Abstract subclass to introduce the default implementation for
@@ -264,9 +262,6 @@
     def get_single_byte(self, base_position, index):
         return self.str(base_position + index)
 
-    def fresh_copy(self, start):
-        return BufMatchContext(self.pattern, self._buffer, start,
-                               self.end, self.flags)
 
 class StrMatchContext(FixedMatchContext):
     """Concrete subclass for matching in a plain string."""
@@ -293,9 +288,6 @@
     def _real_pos(self, index):
         return index     # overridden by tests
 
-    def fresh_copy(self, start):
-        return StrMatchContext(self.pattern, self._string, start,
-                               self.end, self.flags)
 
 class UnicodeMatchContext(FixedMatchContext):
     """Concrete subclass for matching in a unicode string."""
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -27,10 +27,6 @@
     def get_single_byte(self, base_position, index):
         return self.str(base_position + index)
 
-    def fresh_copy(self, start):
-        return Utf8MatchContext(self.pattern, self._utf8, start,
-                                self.end, self.flags)
-
     def next(self, position):
         return rutf8.next_codepoint_pos(self._utf8, position)
     next_indirect = next
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: Tests and fixes

Reply via email to