Author: Armin Rigo <[email protected]>
Branch: unicode-utf8-re
Changeset: r93335:2114fde9ada8
Date: 2017-12-09 19:38 +0100
http://bitbucket.org/pypy/pypy/changeset/2114fde9ada8/
Log: in-progress
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -34,11 +34,14 @@
def slice_w(space, ctx, start, end, w_default):
- if 0 <= start <= end:
+ # 'start' and 'end' are byte positions
+ if ctx.ZERO <= start <= end:
if isinstance(ctx, rsre_core.BufMatchContext):
return space.newbytes(ctx._buffer.getslice(start, end, 1,
end-start))
if isinstance(ctx, rsre_core.StrMatchContext):
+ start = ctx._real_pos(start)
+ end = ctx._real_pos(end)
return space.newbytes(ctx._string[start:end])
elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
XXXXXXX
@@ -60,6 +63,7 @@
return None
result = [-1] * (2 * num_groups)
mark = ctx.match_marks
+ XXX
while mark is not None:
index = jit.promote(mark.gid)
if result[index] == -1:
@@ -70,6 +74,7 @@
@jit.look_inside_iff(lambda space, ctx, fmarks, num_groups, w_default:
jit.isconstant(num_groups))
def allgroups_w(space, ctx, fmarks, num_groups, w_default):
+ XXX
grps = [slice_w(space, ctx, fmarks[i * 2], fmarks[i * 2 + 1], w_default)
for i in range(num_groups)]
return space.newtuple(grps)
@@ -138,8 +143,7 @@
pos = len(str)
if endpos > len(str):
endpos = len(str)
- return rsre_core.StrMatchContext(self.code, str,
- pos, endpos, self.flags)
+ return self._make_str_match_context(str, pos, endpos)
else:
buf = space.readbuf_w(w_string)
size = buf.getlength()
@@ -151,6 +155,11 @@
return rsre_core.BufMatchContext(self.code, buf,
pos, endpos, self.flags)
+ def _make_str_match_context(self, str, pos, endpos):
+ # for tests to override
+ return rsre_core.StrMatchContext(self.code, str,
+ pos, endpos, self.flags)
+
def getmatch(self, ctx, found):
if found:
return W_SRE_Match(self, ctx)
@@ -191,6 +200,7 @@
matchlist_w.append(w_item)
no_progress = (ctx.match_start == ctx.match_end)
ctx.reset(ctx.match_end + no_progress)
+ XXX # ^^^
return space.newlist(matchlist_w)
@unwrap_spec(pos=int, endpos=int)
@@ -215,6 +225,7 @@
if ctx.match_start == ctx.end: # or end of string
break
ctx.reset(ctx.match_end + 1)
+ XXX # ^^^
continue
splitlist.append(slice_w(space, ctx, last, ctx.match_start,
space.w_None))
@@ -283,7 +294,7 @@
sublist_w = strbuilder = unicodebuilder = None
if use_builder:
if filter_as_unicode is not None:
- unicodebuilder = Utf8StringBuilder(ctx.end)
+ unicodebuilder = XXX #Utf8StringBuilder(ctx.end)
else:
assert filter_as_string is not None
strbuilder = StringBuilder(ctx.end)
@@ -499,18 +510,30 @@
@unwrap_spec(w_groupnum=WrappedDefault(0))
def start_w(self, w_groupnum):
- return self.space.newint(self.do_span(w_groupnum)[0])
+ start, end = self.do_span(w_groupnum)
+ start = self.bytepos_to_charindex(start)
+ return self.space.newint(start)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def end_w(self, w_groupnum):
- return self.space.newint(self.do_span(w_groupnum)[1])
+ start, end = self.do_span(w_groupnum)
+ end = self.bytepos_to_charindex(end)
+ return self.space.newint(end)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def span_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
+ start = self.bytepos_to_charindex(start)
+ end = self.bytepos_to_charindex(end)
return self.space.newtuple([self.space.newint(start),
self.space.newint(end)])
+ def bytepos_to_charindex(self, bytepos):
+ # Transform a 'byte position', as returned by all methods from
+ # rsre_core, back into a 'character index'. This is for UTF8
+ # handling.
+ XXXX
+
def flatten_marks(self):
if self.flatten_cache is None:
num_groups = self.srepat.num_groups
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -4,6 +4,8 @@
import py
from py.test import raises, skip
from pypy.interpreter.gateway import app2interp_temp
+from pypy.module._sre import interp_sre
+from rpython.rlib.rsre.test import support
def init_app_test(cls, space):
@@ -20,6 +22,33 @@
sys.path.pop(0)
""")
+def _test_sre_ctx_(self, str, start, end):
+ # Use the MatchContextForTests class, which handles Position
+ # instances instead of plain integers. This is used to detect when
+ # we're accepting or escaping a Position to app-level, which we
+ # should not: Positions are meant to be byte indexes inside a
+ # possibly UTF8 string, not character indexes.
+ start = support.Position(start)
+ end = support.Position(end)
+ return support.MatchContextForTests(self.code, str, start, end, self.flags)
+
+def _bytepos_to_charindex(self, bytepos):
+ return self.ctx._real_pos(bytepos)
+
+def setup_module(mod):
+ mod._org_maker = (
+ interp_sre.W_SRE_Pattern._make_str_match_context,
+ interp_sre.W_SRE_Match.bytepos_to_charindex,
+ )
+ interp_sre.W_SRE_Pattern._make_str_match_context = _test_sre_ctx_
+ interp_sre.W_SRE_Match.bytepos_to_charindex = _bytepos_to_charindex
+
+def teardown_module(mod):
+ (
+ interp_sre.W_SRE_Pattern._make_str_match_context,
+ interp_sre.W_SRE_Match.bytepos_to_charindex,
+ ) = mod._org_maker
+
class AppTestSrePy:
def test_magic(self):
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -296,6 +296,9 @@
def get_single_byte(self, base_position, index):
return self.str(base_position + index)
+ def _real_pos(self, index):
+ return index # overridden by tests
+
def fresh_copy(self, start):
return StrMatchContext(self.pattern, self._string, start,
self.end, self.flags)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit