Author: Armin Rigo <[email protected]>
Branch: unicode-utf8
Changeset: r93384:d978136e55f3
Date: 2017-12-12 09:46 +0100
http://bitbucket.org/pypy/pypy/changeset/d978136e55f3/
Log: Tests and fixes
* cannot access a position greater than ctx.end; this needed some small
refactorings and an added assert
* w_unicode_obj needs to be copied by fresh_copy() too, so move
fresh_copy() to interp_sre, where it really belongs
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -156,6 +156,19 @@
return rsre_core.BufMatchContext(self.code, buf,
pos, endpos, self.flags)
+ def fresh_copy(self, ctx, start):
+ if isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ result = rsre_utf8.Utf8MatchContext(
+ ctx.pattern, ctx._utf8, start, ctx.end, ctx.flags)
+ result.w_unicode_obj = ctx.w_unicode_obj
+ return result
+ if isinstance(ctx, rsre_core.StrMatchContext):
+ return self._make_str_match_context(ctx._string, start, ctx.end)
+ if isinstance(ctx, rsre_core.BufMatchContext):
+ return rsre_core.BufMatchContext(
+ ctx.pattern, ctx._buffer, start, ctx.end, ctx.flags)
+ raise AssertionError("bad ctx type")
+
def _make_str_match_context(self, str, pos, endpos):
# for tests to override
return rsre_core.StrMatchContext(self.code, str,
@@ -182,7 +195,7 @@
space = self.space
matchlist_w = []
ctx = self.make_ctx(w_string, pos, endpos)
- while ctx.match_start <= ctx.end:
+ while True:
if not searchcontext(space, ctx):
break
num_groups = self.num_groups
@@ -201,6 +214,8 @@
matchlist_w.append(w_item)
reset_at = ctx.match_end
if ctx.match_start == ctx.match_end:
+ if reset_at == ctx.end:
+ break
reset_at = ctx.next_indirect(reset_at)
ctx.reset(reset_at)
return space.newlist(matchlist_w)
@@ -321,9 +336,6 @@
_sub_append_slice(
ctx, space, use_builder, sublist_w,
strbuilder, last_pos, ctx.match_start)
- start = ctx.match_end
- if start == ctx.match_start:
- start = ctx.next_indirect(start)
if not (last_pos == ctx.match_start
== ctx.match_end and n > 0):
# the above ignores empty matches on latest position
@@ -345,6 +357,12 @@
n += 1
elif last_pos >= ctx.end:
break # empty match at the end: finished
+
+ start = ctx.match_end
+ if start == ctx.match_start:
+ if start == ctx.end:
+ break
+ start = ctx.next_indirect(start)
ctx.reset(start)
if last_pos < ctx.end:
@@ -663,40 +681,52 @@
self.srepat = pattern
self.ctx = ctx
# 'self.ctx' is always a fresh context in which no searching
- # or matching succeeded so far.
+ # or matching succeeded so far. It is None when the iterator is
+ # exhausted.
def iter_w(self):
return self
def next_w(self):
- if self.ctx.match_start > self.ctx.end:
+ if self.ctx is None:
raise OperationError(self.space.w_StopIteration, self.space.w_None)
if not searchcontext(self.space, self.ctx):
raise OperationError(self.space.w_StopIteration, self.space.w_None)
return self.getmatch(True)
def match_w(self):
- if self.ctx.match_start > self.ctx.end:
+ if self.ctx is None:
return self.space.w_None
return self.getmatch(matchcontext(self.space, self.ctx))
def search_w(self):
- if self.ctx.match_start > self.ctx.end:
+ if self.ctx is None:
return self.space.w_None
return self.getmatch(searchcontext(self.space, self.ctx))
def getmatch(self, found):
+ ctx = self.ctx
+ assert ctx is not None
if found:
- ctx = self.ctx
nextstart = ctx.match_end
+ exhausted = False
if ctx.match_start == nextstart:
- nextstart = ctx.next_indirect(nextstart)
- self.ctx = ctx.fresh_copy(nextstart)
+ if nextstart == ctx.end:
+ exhausted = True
+ else:
+ nextstart = ctx.next_indirect(nextstart)
+ if exhausted:
+ self.ctx = None
+ else:
+ self.ctx = self.srepat.fresh_copy(ctx, nextstart)
match = W_SRE_Match(self.srepat, ctx)
return match
else:
# obscure corner case
- self.ctx.match_start = self.ctx.next_indirect(self.ctx.match_start)
+ if ctx.match_start == ctx.end:
+ self.ctx = None
+ else:
+ ctx.match_start = ctx.next_indirect(ctx.match_start)
return None
W_SRE_Scanner.typedef = TypeDef(
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -28,8 +28,10 @@
# we're accepting or escaping a Position to app-level, which we
# should not: Positions are meant to be byte indexes inside a
# possibly UTF8 string, not character indexes.
- start = support.Position(start)
- end = support.Position(end)
+ if not isinstance(start, support.Position):
+ start = support.Position(start)
+ if not isinstance(end, support.Position):
+ end = support.Position(end)
return support.MatchContextForTests(self.code, str, start, end, self.flags)
def _bytepos_to_charindex(self, bytepos):
@@ -140,6 +142,9 @@
assert ['', 'a', 'l', 'a', 'lla'] == re.split("b(a)", "balballa")
assert ['', 'a', None, 'l', 'u', None, 'lla'] == (
re.split("b([ua]|(s))", "balbulla"))
+ assert ["abc"] == re.split("", "abc")
+ assert ["abc"] == re.split("X?", "abc")
+ assert ["a", "c"] == re.split("b?", "abc")
def test_weakref(self):
import re, _weakref
@@ -253,6 +258,7 @@
assert "rbd\nbr\n" == re.sub("a(.)", r"b\1\n", "radar")
assert ("rbd\nbr\n", 2) == re.subn("a(.)", r"b\1\n", "radar")
assert ("bbbba", 2) == re.subn("a", "b", "ababa", 2)
+ assert "XaXbXcX" == re.sub("", "X", "abc")
def test_sub_unicode(self):
import re
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -173,6 +173,7 @@
def go_forward_by_bytes(self, base_position, index):
return base_position + index
def next_indirect(self, position):
+ assert position < self.end
return position + 1 # like next(), but can be called indirectly
def prev_indirect(self, position):
position -= 1 # like prev(), but can be called indirectly
@@ -213,9 +214,6 @@
return (-1, -1)
return (fmarks[groupnum], fmarks[groupnum+1])
- def fresh_copy(self, start):
- raise NotImplementedError
-
class FixedMatchContext(AbstractMatchContext):
"""Abstract subclass to introduce the default implementation for
@@ -264,9 +262,6 @@
def get_single_byte(self, base_position, index):
return self.str(base_position + index)
- def fresh_copy(self, start):
- return BufMatchContext(self.pattern, self._buffer, start,
- self.end, self.flags)
class StrMatchContext(FixedMatchContext):
"""Concrete subclass for matching in a plain string."""
@@ -293,9 +288,6 @@
def _real_pos(self, index):
return index # overridden by tests
- def fresh_copy(self, start):
- return StrMatchContext(self.pattern, self._string, start,
- self.end, self.flags)
class UnicodeMatchContext(FixedMatchContext):
"""Concrete subclass for matching in a unicode string."""
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -27,10 +27,6 @@
def get_single_byte(self, base_position, index):
return self.str(base_position + index)
- def fresh_copy(self, start):
- return Utf8MatchContext(self.pattern, self._utf8, start,
- self.end, self.flags)
-
def next(self, position):
return rutf8.next_codepoint_pos(self._utf8, position)
next_indirect = next
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit