Author: fijal
Branch: unicode-utf8
Changeset: r93355:43e73aa47541
Date: 2017-12-11 08:05 +0200
http://bitbucket.org/pypy/pypy/changeset/43e73aa47541/
Log: merge unicode-utf8-re
diff too long, truncating to 2000 out of 2355 lines
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -8,13 +8,12 @@
from rpython.rlib.rarithmetic import intmask
from rpython.rlib import jit, rutf8
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import Utf8StringBuilder
# ____________________________________________________________
#
# Constants and exposed functions
-from rpython.rlib.rsre import rsre_core
+from rpython.rlib.rsre import rsre_core, rsre_utf8
from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, getlower,
set_unicode_db
@@ -35,15 +34,18 @@
def slice_w(space, ctx, start, end, w_default):
- if 0 <= start <= end:
+ # 'start' and 'end' are byte positions
+ if ctx.ZERO <= start <= end:
if isinstance(ctx, rsre_core.BufMatchContext):
return space.newbytes(ctx._buffer.getslice(start, end, 1,
end-start))
if isinstance(ctx, rsre_core.StrMatchContext):
+ start = ctx._real_pos(start)
+ end = ctx._real_pos(end)
return space.newbytes(ctx._string[start:end])
- elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- s = ctx._unicodestr[start:end]
- lgt = rutf8.check_utf8(s, True)
+ elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ s = ctx._utf8[start:end]
+ lgt = rutf8.get_utf8_length(s)
return space.newutf8(s, lgt)
else:
# unreachable
@@ -56,6 +58,7 @@
# Returns a list of RPython-level integers.
# Unlike the app-level groups() method, groups are numbered from 0
# and the returned list does not start with the whole match range.
+ # The integers are byte positions, not character indexes (for utf8).
if num_groups == 0:
return None
result = [-1] * (2 * num_groups)
@@ -104,7 +107,7 @@
raise oefmt(space.w_TypeError, "cannot copy this pattern object")
def make_ctx(self, w_string, pos=0, endpos=sys.maxint):
- """Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for
+ """Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for
searching in the given w_string object."""
space = self.space
if pos < 0:
@@ -112,23 +115,36 @@
if endpos < pos:
endpos = pos
if space.isinstance_w(w_string, space.w_unicode):
- unicodestr = space.utf8_w(w_string)
- # XXX will fail some tests, the length need to be adjusted for
- # real char len etc
- if pos > len(unicodestr):
- pos = len(unicodestr)
- if endpos > len(unicodestr):
- endpos = len(unicodestr)
- return rsre_core.UnicodeMatchContext(self.code, unicodestr,
- pos, endpos, self.flags)
+ w_unicode_obj = space.convert_arg_to_w_unicode(w_string)
+ utf8str = w_unicode_obj._utf8
+ length = w_unicode_obj._len()
+ if pos <= 0:
+ bytepos = 0
+ elif pos >= length:
+ bytepos = len(utf8str)
+ else:
+ index_storage = w_unicode_obj._get_index_storage()
+ bytepos = rutf8.codepoint_position_at_index(utf8str,
+ index_storage, pos)
+ if endpos >= length:
+ endbytepos = len(utf8str)
+ else:
+ index_storage = w_unicode_obj._get_index_storage()
+ endbytepos = rutf8.codepoint_position_at_index(utf8str,
+ index_storage, endpos)
+ ctx = rsre_utf8.Utf8MatchContext(
+ self.code, utf8str, bytepos, endbytepos, self.flags)
+ # xxx we store the w_unicode_obj on the ctx too, for
+ # W_SRE_Match.bytepos_to_charindex()
+ ctx.w_unicode_obj = w_unicode_obj
+ return ctx
elif space.isinstance_w(w_string, space.w_bytes):
str = space.bytes_w(w_string)
if pos > len(str):
pos = len(str)
if endpos > len(str):
endpos = len(str)
- return rsre_core.StrMatchContext(self.code, str,
- pos, endpos, self.flags)
+ return self._make_str_match_context(str, pos, endpos)
else:
buf = space.readbuf_w(w_string)
size = buf.getlength()
@@ -140,6 +156,11 @@
return rsre_core.BufMatchContext(self.code, buf,
pos, endpos, self.flags)
+ def _make_str_match_context(self, str, pos, endpos):
+ # for tests to override
+ return rsre_core.StrMatchContext(self.code, str,
+ pos, endpos, self.flags)
+
def getmatch(self, ctx, found):
if found:
return W_SRE_Match(self, ctx)
@@ -178,8 +199,10 @@
w_item = allgroups_w(space, ctx, fmarks, num_groups,
w_emptystr)
matchlist_w.append(w_item)
- no_progress = (ctx.match_start == ctx.match_end)
- ctx.reset(ctx.match_end + no_progress)
+ reset_at = ctx.match_end
+ if ctx.match_start == ctx.match_end:
+ reset_at = ctx.next_indirect(reset_at)
+ ctx.reset(reset_at)
return space.newlist(matchlist_w)
@unwrap_spec(pos=int, endpos=int)
@@ -195,15 +218,15 @@
space = self.space
splitlist = []
n = 0
- last = 0
ctx = self.make_ctx(w_string)
+ last = ctx.ZERO
while not maxsplit or n < maxsplit:
if not searchcontext(space, ctx):
break
if ctx.match_start == ctx.match_end: # zero-width match
if ctx.match_start == ctx.end: # or end of string
break
- ctx.reset(ctx.match_end + 1)
+ ctx.reset(ctx.next_indirect(ctx.match_end))
continue
splitlist.append(slice_w(space, ctx, last, ctx.match_start,
space.w_None))
@@ -232,20 +255,20 @@
def subx(self, w_ptemplate, w_string, count):
space = self.space
- # use a (much faster) string/unicode builder if w_ptemplate and
+ # use a (much faster) string builder (possibly utf8) if w_ptemplate and
# w_string are both string or both unicode objects, and if w_ptemplate
# is a literal
- use_builder = False
- filter_as_unicode = filter_as_string = None
+ use_builder = '\x00' # or 'S'tring or 'U'nicode/UTF8
+ filter_as_string = None
if space.is_true(space.callable(w_ptemplate)):
w_filter = w_ptemplate
filter_is_callable = True
else:
if space.isinstance_w(w_ptemplate, space.w_unicode):
- filter_as_unicode = space.utf8_w(w_ptemplate)
- literal = '\\' not in filter_as_unicode
- use_builder = (
- space.isinstance_w(w_string, space.w_unicode) and literal)
+ filter_as_string = space.utf8_w(w_ptemplate)
+ literal = '\\' not in filter_as_string
+ if space.isinstance_w(w_string, space.w_unicode) and literal:
+ use_builder = 'U'
else:
try:
filter_as_string = space.bytes_w(w_ptemplate)
@@ -255,8 +278,8 @@
literal = False
else:
literal = '\\' not in filter_as_string
- use_builder = (
- space.isinstance_w(w_string, space.w_bytes) and
literal)
+ if space.isinstance_w(w_string, space.w_bytes) and literal:
+ use_builder = 'S'
if literal:
w_filter = w_ptemplate
filter_is_callable = False
@@ -269,16 +292,14 @@
#
# XXX this is a bit of a mess, but it improves performance a lot
ctx = self.make_ctx(w_string)
- sublist_w = strbuilder = unicodebuilder = None
- if use_builder:
- if filter_as_unicode is not None:
- unicodebuilder = Utf8StringBuilder(ctx.end)
- else:
- assert filter_as_string is not None
- strbuilder = StringBuilder(ctx.end)
+ sublist_w = strbuilder = None
+ if use_builder != '\x00':
+ assert filter_as_string is not None
+ strbuilder = StringBuilder(ctx.end)
else:
sublist_w = []
- n = last_pos = 0
+ n = 0
+ last_pos = ctx.ZERO
while not count or n < count:
sub_jitdriver.jit_merge_point(
self=self,
@@ -288,9 +309,7 @@
ctx=ctx,
w_filter=w_filter,
strbuilder=strbuilder,
- unicodebuilder=unicodebuilder,
filter_as_string=filter_as_string,
- filter_as_unicode=filter_as_unicode,
count=count,
w_string=w_string,
n=n, last_pos=last_pos, sublist_w=sublist_w
@@ -301,10 +320,10 @@
if last_pos < ctx.match_start:
_sub_append_slice(
ctx, space, use_builder, sublist_w,
- strbuilder, unicodebuilder, last_pos, ctx.match_start)
+ strbuilder, last_pos, ctx.match_start)
start = ctx.match_end
if start == ctx.match_start:
- start += 1
+ start = ctx.next_indirect(start)
if not (last_pos == ctx.match_start
== ctx.match_end and n > 0):
# the above ignores empty matches on latest position
@@ -312,18 +331,14 @@
w_match = self.getmatch(ctx, True)
w_piece = space.call_function(w_filter, w_match)
if not space.is_w(w_piece, space.w_None):
- assert strbuilder is None and unicodebuilder is None
- assert not use_builder
+ assert strbuilder is None
+ assert use_builder == '\x00'
sublist_w.append(w_piece)
else:
- if use_builder:
- if strbuilder is not None:
- assert filter_as_string is not None
- strbuilder.append(filter_as_string)
- else:
- assert unicodebuilder is not None
- assert filter_as_unicode is not None
- unicodebuilder.append(filter_as_unicode)
+ if use_builder != '\x00':
+ assert filter_as_string is not None
+ assert strbuilder is not None
+ strbuilder.append(filter_as_string)
else:
sublist_w.append(w_filter)
last_pos = ctx.match_end
@@ -334,14 +349,19 @@
if last_pos < ctx.end:
_sub_append_slice(ctx, space, use_builder, sublist_w,
- strbuilder, unicodebuilder, last_pos, ctx.end)
- if use_builder:
- if strbuilder is not None:
- return space.newbytes(strbuilder.build()), n
+ strbuilder, last_pos, ctx.end)
+ if use_builder != '\x00':
+ assert strbuilder is not None
+ result_bytes = strbuilder.build()
+ if use_builder == 'S':
+ assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
+ return space.newbytes(result_bytes), n
+ elif use_builder == 'U':
+ assert isinstance(ctx, rsre_utf8.Utf8MatchContext)
+ return space.newutf8(result_bytes,
+ rutf8.get_utf8_length(result_bytes)), n
else:
- assert unicodebuilder is not None
- return space.newutf8(unicodebuilder.build(),
- unicodebuilder.get_length()), n
+ raise AssertionError(use_builder)
else:
if space.isinstance_w(w_string, space.w_unicode):
w_emptystr = space.newutf8('', 0)
@@ -354,26 +374,28 @@
sub_jitdriver = jit.JitDriver(
reds="""count n last_pos
ctx w_filter
- strbuilder unicodebuilder
+ strbuilder
filter_as_string
- filter_as_unicode
w_string sublist_w
self""".split(),
greens=["filter_is_callable", "use_builder", "filter_type", "ctx.pattern"])
def _sub_append_slice(ctx, space, use_builder, sublist_w,
- strbuilder, unicodebuilder, start, end):
- if use_builder:
+ strbuilder, start, end):
+ if use_builder != '\x00':
+ assert strbuilder is not None
if isinstance(ctx, rsre_core.BufMatchContext):
- assert strbuilder is not None
+ assert use_builder == 'S'
return strbuilder.append(ctx._buffer.getslice(start, end, 1,
end-start))
if isinstance(ctx, rsre_core.StrMatchContext):
- assert strbuilder is not None
+ assert use_builder == 'S'
+ start = ctx._real_pos(start)
+ end = ctx._real_pos(end)
return strbuilder.append_slice(ctx._string, start, end)
- elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- assert unicodebuilder is not None
- return unicodebuilder.append_slice(ctx._unicodestr, start, end)
+ elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ assert use_builder == 'U'
+ return strbuilder.append_slice(ctx._utf8, start, end)
assert 0, "unreachable"
else:
sublist_w.append(slice_w(space, ctx, start, end, space.w_None))
@@ -487,18 +509,39 @@
@unwrap_spec(w_groupnum=WrappedDefault(0))
def start_w(self, w_groupnum):
- return self.space.newint(self.do_span(w_groupnum)[0])
+ start, end = self.do_span(w_groupnum)
+ start = self.bytepos_to_charindex(start)
+ return self.space.newint(start)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def end_w(self, w_groupnum):
- return self.space.newint(self.do_span(w_groupnum)[1])
+ start, end = self.do_span(w_groupnum)
+ end = self.bytepos_to_charindex(end)
+ return self.space.newint(end)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def span_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
+ return self.new_charindex_tuple(start, end)
+
+ def new_charindex_tuple(self, start, end):
+ start = self.bytepos_to_charindex(start)
+ end = self.bytepos_to_charindex(end)
return self.space.newtuple([self.space.newint(start),
self.space.newint(end)])
+ def bytepos_to_charindex(self, bytepos):
+ # Transform a 'byte position', as returned by all methods from
+ # rsre_core, back into a 'character index'. This is for UTF8
+ # handling.
+ ctx = self.ctx
+ if isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ index_storage = ctx.w_unicode_obj._get_index_storage()
+ return rutf8.codepoint_index_at_byte_position(
+ ctx.w_unicode_obj._utf8, index_storage, bytepos)
+ else:
+ return bytepos
+
def flatten_marks(self):
if self.flatten_cache is None:
num_groups = self.srepat.num_groups
@@ -506,6 +549,8 @@
return self.flatten_cache
def do_span(self, w_arg):
+ # return a pair of integers, which are byte positions, not
+ # character indexes (for utf8)
space = self.space
try:
groupnum = space.int_w(w_arg)
@@ -553,10 +598,10 @@
return space.w_None
def fget_pos(self, space):
- return space.newint(self.ctx.original_pos)
+ return space.newint(self.bytepos_to_charindex(self.ctx.original_pos))
def fget_endpos(self, space):
- return space.newint(self.ctx.end)
+ return space.newint(self.bytepos_to_charindex(self.ctx.end))
def fget_regs(self, space):
space = self.space
@@ -564,11 +609,11 @@
num_groups = self.srepat.num_groups
result_w = [None] * (num_groups + 1)
ctx = self.ctx
- result_w[0] = space.newtuple([space.newint(ctx.match_start),
- space.newint(ctx.match_end)])
+ result_w[0] = self.new_charindex_tuple(ctx.match_start,
+ ctx.match_end)
for i in range(num_groups):
- result_w[i + 1] = space.newtuple([space.newint(fmarks[i*2]),
- space.newint(fmarks[i*2+1])])
+ result_w[i + 1] = self.new_charindex_tuple(fmarks[i*2],
+ fmarks[i*2+1])
return space.newtuple(result_w)
def fget_string(self, space):
@@ -577,9 +622,9 @@
return space.newbytes(ctx._buffer.as_str())
elif isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string)
- elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- lgt = rutf8.check_utf8(ctx._unicodestr, True)
- return space.newutf8(ctx._unicodestr, lgt)
+ elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ lgt = rutf8.get_utf8_length(ctx._utf8)
+ return space.newutf8(ctx._utf8, lgt)
else:
raise SystemError
@@ -644,12 +689,14 @@
if found:
ctx = self.ctx
nextstart = ctx.match_end
- nextstart += (ctx.match_start == nextstart)
+ if ctx.match_start == nextstart:
+ nextstart = ctx.next_indirect(nextstart)
self.ctx = ctx.fresh_copy(nextstart)
match = W_SRE_Match(self.srepat, ctx)
return match
else:
- self.ctx.match_start += 1 # obscure corner case
+ # obscure corner case
+ self.ctx.match_start = self.ctx.next_indirect(self.ctx.match_start)
return None
W_SRE_Scanner.typedef = TypeDef(
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -4,6 +4,8 @@
import py
from py.test import raises, skip
from pypy.interpreter.gateway import app2interp_temp
+from pypy.module._sre import interp_sre
+from rpython.rlib.rsre.test import support
def init_app_test(cls, space):
@@ -20,6 +22,35 @@
sys.path.pop(0)
""")
+def _test_sre_ctx_(self, str, start, end):
+ # Use the MatchContextForTests class, which handles Position
+ # instances instead of plain integers. This is used to detect when
+ # we're accepting or escaping a Position to app-level, which we
+ # should not: Positions are meant to be byte indexes inside a
+ # possibly UTF8 string, not character indexes.
+ start = support.Position(start)
+ end = support.Position(end)
+ return support.MatchContextForTests(self.code, str, start, end, self.flags)
+
+def _bytepos_to_charindex(self, bytepos):
+ if isinstance(self.ctx, support.MatchContextForTests):
+ return self.ctx._real_pos(bytepos)
+ return _org_maker[1](self, bytepos)
+
+def setup_module(mod):
+ mod._org_maker = (
+ interp_sre.W_SRE_Pattern._make_str_match_context,
+ interp_sre.W_SRE_Match.bytepos_to_charindex,
+ )
+ interp_sre.W_SRE_Pattern._make_str_match_context = _test_sre_ctx_
+ interp_sre.W_SRE_Match.bytepos_to_charindex = _bytepos_to_charindex
+
+def teardown_module(mod):
+ (
+ interp_sre.W_SRE_Pattern._make_str_match_context,
+ interp_sre.W_SRE_Match.bytepos_to_charindex,
+ ) = mod._org_maker
+
class AppTestSrePy:
def test_magic(self):
@@ -87,6 +118,13 @@
assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
+ def test_findall_unicode(self):
+ import re
+ assert [u"\u1234"] == re.findall(u"\u1234", u"\u1000\u1234\u2000")
+ assert ["a", "u"] == re.findall("b(.)", "abalbus")
+ assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
+ assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
+
def test_finditer(self):
import re
it = re.finditer("b(.)", "brabbel")
@@ -999,3 +1037,15 @@
import re
assert re.search(".+ab", "wowowowawoabwowo")
assert None == re.search(".+ab", "wowowaowowo")
+
+
+class AppTestUnicodeExtra:
+ def test_string_attribute(self):
+ import re
+ match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+ assert match.string == u"\u1233\u1234\u1235"
+
+ def test_match_start(self):
+ import re
+ match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+ assert match.start() == 1
diff --git a/rpython/rlib/debug.py b/rpython/rlib/debug.py
--- a/rpython/rlib/debug.py
+++ b/rpython/rlib/debug.py
@@ -316,14 +316,21 @@
class ExpectedRegularInt(Exception):
pass
+class NegativeArgumentNotAllowed(Exception):
+ pass
+
def check_nonneg(x):
"""Give a translation-time error if 'x' is not known to be non-negative.
To help debugging, this also gives a translation-time error if 'x' is
actually typed as an r_uint (in which case the call to check_nonneg()
is a bit strange and probably unexpected).
"""
- assert type(x)(-1) < 0 # otherwise, 'x' is a r_uint or similar
- assert x >= 0
+ try:
+ assert type(x)(-1) < 0 # otherwise, 'x' is a r_uint or similar
+ except NegativeArgumentNotAllowed:
+ pass
+ else:
+ assert x >= 0
return x
class Entry(ExtRegistryEntry):
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -55,6 +55,8 @@
specific subclass, calling 'func' is a direct call; if 'ctx' is only known
to be of class AbstractMatchContext, calling 'func' is an indirect call.
"""
+ from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
+
assert func.func_code.co_varnames[0] == 'ctx'
specname = '_spec_' + func.func_name
while specname in _seen_specname:
@@ -65,7 +67,8 @@
specialized_methods = []
for prefix, concreteclass in [('buf', BufMatchContext),
('str', StrMatchContext),
- ('uni', UnicodeMatchContext)]:
+ ('uni', UnicodeMatchContext),
+ ('utf8', Utf8MatchContext)]:
newfunc = func_with_new_name(func, prefix + specname)
assert not hasattr(concreteclass, specname)
setattr(concreteclass, specname, newfunc)
@@ -83,6 +86,9 @@
def __init__(self, msg):
self.msg = msg
+class EndOfString(Exception):
+ pass
+
class AbstractMatchContext(object):
"""Abstract base class"""
_immutable_fields_ = ['pattern[*]', 'flags', 'end']
@@ -135,6 +141,45 @@
"""Similar to str()."""
raise NotImplementedError
+ # The following methods are provided to be overridden in
+ # Utf8MatchContext. The non-utf8 implementation is provided
+ # by the FixedMatchContext abstract subclass, in order to use
+ # the same @not_rpython safety trick as above.
+ ZERO = 0
+ @not_rpython
+ def next(self, position):
+ raise NotImplementedError
+ @not_rpython
+ def prev(self, position):
+ raise NotImplementedError
+ @not_rpython
+ def next_n(self, position, n):
+ raise NotImplementedError
+ @not_rpython
+ def prev_n(self, position, n, start_position):
+ raise NotImplementedError
+ @not_rpython
+ def debug_check_pos(self, position):
+ raise NotImplementedError
+ @not_rpython
+ def maximum_distance(self, position_low, position_high):
+ raise NotImplementedError
+ @not_rpython
+ def get_single_byte(self, base_position, index):
+ raise NotImplementedError
+
+ def bytes_difference(self, position1, position2):
+ return position1 - position2
+ def go_forward_by_bytes(self, base_position, index):
+ return base_position + index
+ def next_indirect(self, position):
+ return position + 1 # like next(), but can be called indirectly
+ def prev_indirect(self, position):
+ position -= 1 # like prev(), but can be called indirectly
+ if position < 0:
+ raise EndOfString
+ return position
+
def get_mark(self, gid):
return find_mark(self.match_marks, gid)
@@ -168,23 +213,44 @@
return (-1, -1)
return (fmarks[groupnum], fmarks[groupnum+1])
- def group(self, groupnum=0):
- frm, to = self.span(groupnum)
- if 0 <= frm <= to:
- return self._string[frm:to]
- else:
- return None
-
def fresh_copy(self, start):
raise NotImplementedError
-class BufMatchContext(AbstractMatchContext):
+
+class FixedMatchContext(AbstractMatchContext):
+ """Abstract subclass to introduce the default implementation for
+ these position methods. The Utf8MatchContext subclass doesn't
+ inherit from here."""
+
+ next = AbstractMatchContext.next_indirect
+ prev = AbstractMatchContext.prev_indirect
+
+ def next_n(self, position, n, end_position):
+ position += n
+ if position > end_position:
+ raise EndOfString
+ return position
+
+ def prev_n(self, position, n, start_position):
+ position -= n
+ if position < start_position:
+ raise EndOfString
+ return position
+
+ def debug_check_pos(self, position):
+ pass
+
+ def maximum_distance(self, position_low, position_high):
+ return position_high - position_low
+
+
+class BufMatchContext(FixedMatchContext):
"""Concrete subclass for matching in a buffer."""
_immutable_fields_ = ["_buffer"]
def __init__(self, pattern, buf, match_start, end, flags):
- AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+ FixedMatchContext.__init__(self, pattern, match_start, end, flags)
self._buffer = buf
def str(self, index):
@@ -195,17 +261,20 @@
c = self.str(index)
return rsre_char.getlower(c, self.flags)
+ def get_single_byte(self, base_position, index):
+ return self.str(base_position + index)
+
def fresh_copy(self, start):
return BufMatchContext(self.pattern, self._buffer, start,
self.end, self.flags)
-class StrMatchContext(AbstractMatchContext):
+class StrMatchContext(FixedMatchContext):
"""Concrete subclass for matching in a plain string."""
_immutable_fields_ = ["_string"]
def __init__(self, pattern, string, match_start, end, flags):
- AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+ FixedMatchContext.__init__(self, pattern, match_start, end, flags)
self._string = string
if not we_are_translated() and isinstance(string, unicode):
self.flags |= rsre_char.SRE_FLAG_UNICODE # for rsre_re.py
@@ -218,17 +287,23 @@
c = self.str(index)
return rsre_char.getlower(c, self.flags)
+ def get_single_byte(self, base_position, index):
+ return self.str(base_position + index)
+
+ def _real_pos(self, index):
+ return index # overridden by tests
+
def fresh_copy(self, start):
return StrMatchContext(self.pattern, self._string, start,
self.end, self.flags)
-class UnicodeMatchContext(AbstractMatchContext):
+class UnicodeMatchContext(FixedMatchContext):
"""Concrete subclass for matching in a unicode string."""
_immutable_fields_ = ["_unicodestr"]
def __init__(self, pattern, unicodestr, match_start, end, flags):
- AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+ FixedMatchContext.__init__(self, pattern, match_start, end, flags)
self._unicodestr = unicodestr
def str(self, index):
@@ -239,6 +314,9 @@
c = self.str(index)
return rsre_char.getlower(c, self.flags)
+ def get_single_byte(self, base_position, index):
+ return self.str(base_position + index)
+
def fresh_copy(self, start):
return UnicodeMatchContext(self.pattern, self._unicodestr, start,
self.end, self.flags)
@@ -317,7 +395,10 @@
ctx.jitdriver_RepeatOne.jit_merge_point(
self=self, ptr=ptr, ctx=ctx, nextppos=nextppos)
result = sre_match(ctx, nextppos, ptr, self.start_marks)
- ptr -= 1
+ try:
+ ptr = ctx.prev_indirect(ptr)
+ except EndOfString:
+ ptr = -1
if result is not None:
self.subresult = result
self.start_ptr = ptr
@@ -328,37 +409,41 @@
class MinRepeatOneMatchResult(MatchResult):
install_jitdriver('MinRepeatOne',
greens=['nextppos', 'ppos3', 'ctx.pattern'],
- reds=['ptr', 'self', 'ctx'],
+ reds=['max_count', 'ptr', 'self', 'ctx'],
debugprint=(2, 0)) # indices in 'greens'
- def __init__(self, nextppos, ppos3, maxptr, ptr, marks):
+ def __init__(self, nextppos, ppos3, max_count, ptr, marks):
self.nextppos = nextppos
self.ppos3 = ppos3
- self.maxptr = maxptr
+ self.max_count = max_count
self.start_ptr = ptr
self.start_marks = marks
def find_first_result(self, ctx):
ptr = self.start_ptr
nextppos = self.nextppos
+ max_count = self.max_count
ppos3 = self.ppos3
- while ptr <= self.maxptr:
+ while max_count >= 0:
ctx.jitdriver_MinRepeatOne.jit_merge_point(
- self=self, ptr=ptr, ctx=ctx, nextppos=nextppos, ppos3=ppos3)
+ self=self, ptr=ptr, ctx=ctx, nextppos=nextppos, ppos3=ppos3,
+ max_count=max_count)
result = sre_match(ctx, nextppos, ptr, self.start_marks)
if result is not None:
self.subresult = result
self.start_ptr = ptr
+ self.max_count = max_count
return self
if not self.next_char_ok(ctx, ptr, ppos3):
break
- ptr += 1
+ ptr = ctx.next_indirect(ptr)
+ max_count -= 1
def find_next_result(self, ctx):
ptr = self.start_ptr
if not self.next_char_ok(ctx, ptr, self.ppos3):
return
- self.start_ptr = ptr + 1
+ self.start_ptr = ctx.next_indirect(ptr)
return self.find_first_result(ctx)
def next_char_ok(self, ctx, ptr, ppos):
@@ -430,12 +515,12 @@
min = ctx.pat(ppos+1)
if enum is not None:
# matched one more 'item'. record it and continue.
- last_match_length = ctx.match_end - ptr
+ last_match_zero_length = (ctx.match_end == ptr)
self.pending = Pending(ptr, marks, enum, self.pending)
self.num_pending += 1
ptr = ctx.match_end
marks = ctx.match_marks
- if last_match_length == 0 and self.num_pending >= min:
+ if last_match_zero_length and self.num_pending >= min:
# zero-width protection: after an empty match, if there
# are enough matches, don't try to match more. Instead,
# fall through to trying to match 'tail'.
@@ -520,6 +605,7 @@
need all results; in that case we use the method move_to_next_result()
of the MatchResult."""
while True:
+ ctx.debug_check_pos(ptr)
op = ctx.pat(ppos)
ppos += 1
@@ -551,22 +637,25 @@
# <ANY>
if ptr >= ctx.end or rsre_char.is_linebreak(ctx.str(ptr)):
return
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_ANY_ALL:
# match anything
# <ANY_ALL>
if ptr >= ctx.end:
return
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_ASSERT:
# assert subpattern
# <ASSERT> <0=skip> <1=back> <pattern>
- ptr1 = ptr - ctx.pat(ppos+1)
+ try:
+ ptr1 = ctx.prev_n(ptr, ctx.pat(ppos+1), ctx.ZERO)
+ except EndOfString:
+ return
saved = ctx.fullmatch_only
ctx.fullmatch_only = False
- stop = ptr1 < 0 or sre_match(ctx, ppos + 2, ptr1, marks) is None
+ stop = sre_match(ctx, ppos + 2, ptr1, marks) is None
ctx.fullmatch_only = saved
if stop:
return
@@ -576,14 +665,17 @@
elif op == OPCODE_ASSERT_NOT:
# assert not subpattern
# <ASSERT_NOT> <0=skip> <1=back> <pattern>
- ptr1 = ptr - ctx.pat(ppos+1)
- saved = ctx.fullmatch_only
- ctx.fullmatch_only = False
- stop = (ptr1 >= 0 and sre_match(ctx, ppos + 2, ptr1, marks)
- is not None)
- ctx.fullmatch_only = saved
- if stop:
- return
+ try:
+ ptr1 = ctx.prev_n(ptr, ctx.pat(ppos+1), ctx.ZERO)
+ except EndOfString:
+ pass
+ else:
+ saved = ctx.fullmatch_only
+ ctx.fullmatch_only = False
+ stop = sre_match(ctx, ppos + 2, ptr1, marks) is not None
+ ctx.fullmatch_only = saved
+ if stop:
+ return
ppos += ctx.pat(ppos)
elif op == OPCODE_AT:
@@ -606,36 +698,36 @@
if (ptr == ctx.end or
not rsre_char.category_dispatch(ctx.pat(ppos), ctx.str(ptr))):
return
- ptr += 1
+ ptr = ctx.next(ptr)
ppos += 1
elif op == OPCODE_GROUPREF:
# match backreference
# <GROUPREF> <groupnum>
- startptr, length = get_group_ref(marks, ctx.pat(ppos))
- if length < 0:
+ startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+ if length_bytes < 0:
return # group was not previously defined
- if not match_repeated(ctx, ptr, startptr, length):
+ if not match_repeated(ctx, ptr, startptr, length_bytes):
return # no match
- ptr += length
+ ptr = ctx.go_forward_by_bytes(ptr, length_bytes)
ppos += 1
elif op == OPCODE_GROUPREF_IGNORE:
# match backreference
# <GROUPREF> <groupnum>
- startptr, length = get_group_ref(marks, ctx.pat(ppos))
- if length < 0:
+ startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+ if length_bytes < 0:
return # group was not previously defined
- if not match_repeated_ignore(ctx, ptr, startptr, length):
+ ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes)
+ if ptr < ctx.ZERO:
return # no match
- ptr += length
ppos += 1
elif op == OPCODE_GROUPREF_EXISTS:
# conditional match depending on the existence of a group
# <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ...
- _, length = get_group_ref(marks, ctx.pat(ppos))
- if length >= 0:
+ _, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+ if length_bytes >= 0:
ppos += 2 # jump to 'codeyes'
else:
ppos += ctx.pat(ppos+1) # jump to 'codeno'
@@ -647,7 +739,7 @@
ctx.str(ptr)):
return
ppos += ctx.pat(ppos)
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_IN_IGNORE:
# match set member (or non_member), ignoring case
@@ -656,12 +748,12 @@
ctx.lowstr(ptr)):
return
ppos += ctx.pat(ppos)
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_INFO:
# optimization info block
# <INFO> <0=skip> <1=flags> <2=min> ...
- if (ctx.end - ptr) < ctx.pat(ppos+2):
+ if ctx.maximum_distance(ptr, ctx.end) < ctx.pat(ppos+2):
return
ppos += ctx.pat(ppos)
@@ -674,7 +766,7 @@
if ptr >= ctx.end or ctx.str(ptr) != ctx.pat(ppos):
return
ppos += 1
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_LITERAL_IGNORE:
# match literal string, ignoring case
@@ -682,7 +774,7 @@
if ptr >= ctx.end or ctx.lowstr(ptr) != ctx.pat(ppos):
return
ppos += 1
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_MARK:
# set mark
@@ -697,7 +789,7 @@
if ptr >= ctx.end or ctx.str(ptr) == ctx.pat(ppos):
return
ppos += 1
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_NOT_LITERAL_IGNORE:
# match if it's not a literal string, ignoring case
@@ -705,7 +797,7 @@
if ptr >= ctx.end or ctx.lowstr(ptr) == ctx.pat(ppos):
return
ppos += 1
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_REPEAT:
# general repeat. in this version of the re module, all the work
@@ -743,8 +835,9 @@
# use the MAX_REPEAT operator.
# <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail
start = ptr
- minptr = start + ctx.pat(ppos+1)
- if minptr > ctx.end:
+ try:
+ minptr = ctx.next_n(start, ctx.pat(ppos+1), ctx.end)
+ except EndOfString:
return # cannot match
ptr = find_repetition_end(ctx, ppos+3, start, ctx.pat(ppos+2),
marks)
@@ -765,22 +858,22 @@
start = ptr
min = ctx.pat(ppos+1)
if min > 0:
- minptr = ptr + min
- if minptr > ctx.end:
- return # cannot match
+ try:
+ minptr = ctx.next_n(ptr, min, ctx.end)
+ except EndOfString:
+ return # cannot match
# count using pattern min as the maximum
ptr = find_repetition_end(ctx, ppos+3, ptr, min, marks)
if ptr < minptr:
return # did not match minimum number of times
- maxptr = ctx.end
+ max_count = sys.maxint
max = ctx.pat(ppos+2)
if max != rsre_char.MAXREPEAT:
- maxptr1 = start + max
- if maxptr1 <= maxptr:
- maxptr = maxptr1
+ max_count = max - min
+ assert max_count >= 0
nextppos = ppos + ctx.pat(ppos)
- result = MinRepeatOneMatchResult(nextppos, ppos+3, maxptr,
+ result = MinRepeatOneMatchResult(nextppos, ppos+3, max_count,
ptr, marks)
return result.find_first_result(ctx)
@@ -788,37 +881,41 @@
raise Error("bad pattern code %d" % op)
-def get_group_ref(marks, groupnum):
+def get_group_ref(ctx, marks, groupnum):
gid = groupnum * 2
startptr = find_mark(marks, gid)
- if startptr < 0:
+ if startptr < ctx.ZERO:
return 0, -1
endptr = find_mark(marks, gid + 1)
- length = endptr - startptr # < 0 if endptr < startptr (or if endptr=-1)
- return startptr, length
+ length_bytes = ctx.bytes_difference(endptr, startptr)
+ # < 0 if endptr < startptr (or if endptr=-1)
+ return startptr, length_bytes
@specializectx
-def match_repeated(ctx, ptr, oldptr, length):
- if ptr + length > ctx.end:
+def match_repeated(ctx, ptr, oldptr, length_bytes):
+ if ctx.bytes_difference(ctx.end, ptr) < length_bytes:
return False
- for i in range(length):
- if ctx.str(ptr + i) != ctx.str(oldptr + i):
+ for i in range(length_bytes):
+ if ctx.get_single_byte(ptr, i) != ctx.get_single_byte(oldptr, i):
return False
return True
@specializectx
-def match_repeated_ignore(ctx, ptr, oldptr, length):
- if ptr + length > ctx.end:
- return False
- for i in range(length):
- if ctx.lowstr(ptr + i) != ctx.lowstr(oldptr + i):
- return False
- return True
+def match_repeated_ignore(ctx, ptr, oldptr, length_bytes):
+ oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
+ while oldptr < oldend:
+ if ptr >= ctx.end:
+ return -1
+ if ctx.lowstr(ptr) != ctx.lowstr(oldptr):
+ return -1
+ ptr = ctx.next(ptr)
+ oldptr = ctx.next(oldptr)
+ return ptr
@specializectx
def find_repetition_end(ctx, ppos, ptr, maxcount, marks):
end = ctx.end
- ptrp1 = ptr + 1
+ ptrp1 = ctx.next(ptr)
# First get rid of the cases where we don't have room for any match.
if maxcount <= 0 or ptrp1 > end:
return ptr
@@ -843,9 +940,10 @@
# Else we really need to count how many times it matches.
if maxcount != rsre_char.MAXREPEAT:
# adjust end
- end1 = ptr + maxcount
- if end1 <= end:
- end = end1
+ try:
+ end = ctx.next_n(ptr, maxcount, end)
+ except EndOfString:
+ pass
op = ctx.pat(ppos)
for op1, fre in unroll_fre_checker:
if op1 == op:
@@ -862,7 +960,7 @@
if end1 <= end:
end = end1
while ptr < end and sre_match(ctx, ppos, ptr, marks) is not None:
- ptr += 1
+ ptr = ctx.next(ptr)
return ptr
@specializectx
@@ -904,7 +1002,7 @@
ctx.jitdriver_MatchIn.jit_merge_point(ctx=ctx, ptr=ptr,
end=end, ppos=ppos)
if ptr < end and checkerfn(ctx, ptr, ppos):
- ptr += 1
+ ptr = ctx.next(ptr)
else:
return ptr
elif checkerfn == match_IN_IGNORE:
@@ -918,7 +1016,7 @@
ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr,
end=end, ppos=ppos)
if ptr < end and checkerfn(ctx, ptr, ppos):
- ptr += 1
+ ptr = ctx.next(ptr)
else:
return ptr
else:
@@ -927,7 +1025,7 @@
@specializectx
def fre(ctx, ptr, end, ppos):
while ptr < end and checkerfn(ctx, ptr, ppos):
- ptr += 1
+ ptr = ctx.next(ptr)
return ptr
fre = func_with_new_name(fre, 'fre_' + checkerfn.__name__)
return fre
@@ -967,11 +1065,14 @@
def sre_at(ctx, atcode, ptr):
if (atcode == AT_BEGINNING or
atcode == AT_BEGINNING_STRING):
- return ptr == 0
+ return ptr == ctx.ZERO
elif atcode == AT_BEGINNING_LINE:
- prevptr = ptr - 1
- return prevptr < 0 or rsre_char.is_linebreak(ctx.str(prevptr))
+ try:
+ prevptr = ctx.prev(ptr)
+ except EndOfString:
+ return True
+ return rsre_char.is_linebreak(ctx.str(prevptr))
elif atcode == AT_BOUNDARY:
return at_boundary(ctx, ptr)
@@ -980,9 +1081,8 @@
return at_non_boundary(ctx, ptr)
elif atcode == AT_END:
- remaining_chars = ctx.end - ptr
- return remaining_chars <= 0 or (
- remaining_chars == 1 and rsre_char.is_linebreak(ctx.str(ptr)))
+ return (ptr == ctx.end or
+ (ctx.next(ptr) == ctx.end and
rsre_char.is_linebreak(ctx.str(ptr))))
elif atcode == AT_END_LINE:
return ptr == ctx.end or rsre_char.is_linebreak(ctx.str(ptr))
@@ -1007,18 +1107,26 @@
def _make_boundary(word_checker):
@specializectx
def at_boundary(ctx, ptr):
- if ctx.end == 0:
+ if ctx.end == ctx.ZERO:
return False
- prevptr = ptr - 1
- that = prevptr >= 0 and word_checker(ctx.str(prevptr))
+ try:
+ prevptr = ctx.prev(ptr)
+ except EndOfString:
+ that = False
+ else:
+ that = word_checker(ctx.str(prevptr))
this = ptr < ctx.end and word_checker(ctx.str(ptr))
return this != that
@specializectx
def at_non_boundary(ctx, ptr):
- if ctx.end == 0:
+ if ctx.end == ctx.ZERO:
return False
- prevptr = ptr - 1
- that = prevptr >= 0 and word_checker(ctx.str(prevptr))
+ try:
+ prevptr = ctx.prev(ptr)
+ except EndOfString:
+ that = False
+ else:
+ that = word_checker(ctx.str(prevptr))
this = ptr < ctx.end and word_checker(ctx.str(ptr))
return this == that
return at_boundary, at_non_boundary
@@ -1100,7 +1208,7 @@
if sre_match(ctx, base, start, None) is not None:
ctx.match_start = start
return True
- start += 1
+ start = ctx.next_indirect(start)
return False
install_jitdriver_spec("LiteralSearch",
@@ -1117,11 +1225,12 @@
while start < ctx.end:
ctx.jitdriver_LiteralSearch.jit_merge_point(ctx=ctx, start=start,
base=base, character=character)
+ start1 = ctx.next(start)
if ctx.str(start) == character:
- if sre_match(ctx, base, start + 1, None) is not None:
+ if sre_match(ctx, base, start1, None) is not None:
ctx.match_start = start
return True
- start += 1
+ start = start1
return False
install_jitdriver_spec("CharsetSearch",
@@ -1139,7 +1248,7 @@
if sre_match(ctx, base, start, None) is not None:
ctx.match_start = start
return True
- start += 1
+ start = ctx.next(start)
return False
install_jitdriver_spec('FastSearch',
@@ -1156,7 +1265,7 @@
if string_position >= ctx.end:
return False
prefix_len = ctx.pat(5)
- assert prefix_len >= 0
+ assert prefix_len > 0
i = 0
while True:
ctx.jitdriver_FastSearch.jit_merge_point(ctx=ctx,
@@ -1171,10 +1280,14 @@
i += 1
if i == prefix_len:
# found a potential match
- start = string_position + 1 - prefix_len
- assert start >= 0
+ # start = string_position + 1 - prefix_len: computed later
+ ptr = string_position
prefix_skip = ctx.pat(6)
- ptr = start + prefix_skip
+ if prefix_skip == prefix_len:
+ ptr = ctx.next(ptr)
+ else:
+ assert prefix_skip < prefix_len
+ ptr = ctx.prev_n(ptr, prefix_len-1 - prefix_skip, ctx.ZERO)
#flags = ctx.pat(2)
#if flags & rsre_char.SRE_INFO_LITERAL:
# # matched all of pure literal pattern
@@ -1185,10 +1298,11 @@
pattern_offset = ctx.pat(1) + 1
ppos_start = pattern_offset + 2 * prefix_skip
if sre_match(ctx, ppos_start, ptr, None) is not None:
+ start = ctx.prev_n(ptr, prefix_skip, ctx.ZERO)
ctx.match_start = start
return True
overlap_offset = prefix_len + (7 - 1)
i = ctx.pat(overlap_offset + i)
- string_position += 1
+ string_position = ctx.next(string_position)
if string_position >= ctx.end:
return False
diff --git a/rpython/rlib/rsre/rsre_jit.py b/rpython/rlib/rsre/rsre_jit.py
--- a/rpython/rlib/rsre/rsre_jit.py
+++ b/rpython/rlib/rsre/rsre_jit.py
@@ -36,8 +36,10 @@
from rpython.rlib.rsre.rsre_core import BufMatchContext
from rpython.rlib.rsre.rsre_core import StrMatchContext
from rpython.rlib.rsre.rsre_core import UnicodeMatchContext
+ from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
for prefix, concreteclass in [('Buf', BufMatchContext),
('Str', StrMatchContext),
- ('Uni', UnicodeMatchContext)]:
+ ('Uni', UnicodeMatchContext),
+ ('Utf8', Utf8MatchContext)]:
jitdriver = RSreJitDriver(prefix + name, **kwds)
setattr(concreteclass, 'jitdriver_' + name, jitdriver)
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -0,0 +1,105 @@
+import sys
+from rpython.rlib.debug import check_nonneg
+from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString
+from rpython.rlib.rsre import rsre_char
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.rlib import rutf8
+
+
+class Utf8MatchContext(AbstractMatchContext):
+    """A context that matches unicode, but encoded in a utf8 string.
+    Be careful because most positions taken by, handled in, and returned
+    by this class are expressed in *bytes*, not in characters.
+    """
+
+    def __init__(self, pattern, utf8string, match_start, end, flags):
+        # 'utf8string' is kept as-is in self._utf8; the other arguments
+        # are forwarded unchanged to AbstractMatchContext.
+        AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+        self._utf8 = utf8string
+
+    def str(self, index):
+        """Return rutf8.codepoint_at_pos() for the byte position 'index'
+        (which must be non-negative)."""
+        check_nonneg(index)
+        return rutf8.codepoint_at_pos(self._utf8, index)
+
+    def lowstr(self, index):
+        """Like str(), but passed through rsre_char.getlower() with the
+        flags of this context."""
+        c = self.str(index)
+        return rsre_char.getlower(c, self.flags)
+
+    def get_single_byte(self, base_position, index):
+        # NOTE(review): this goes through str(), i.e. codepoint_at_pos(),
+        # at 'base_position + index'.  Callers that walk byte by byte may
+        # land inside a multi-byte sequence -- confirm that this is what
+        # is intended here.
+        return self.str(base_position + index)
+
+    def fresh_copy(self, start):
+        """Return a new context on the same pattern, string, end and
+        flags, but starting at the byte position 'start'."""
+        return Utf8MatchContext(self.pattern, self._utf8, start,
+                                self.end, self.flags)
+
+    def next(self, position):
+        """Return rutf8.next_codepoint_pos() of the byte 'position'."""
+        return rutf8.next_codepoint_pos(self._utf8, position)
+    next_indirect = next
+
+    def prev(self, position):
+        """Return rutf8.prev_codepoint_pos() of the byte 'position'.
+        Raises EndOfString if 'position' is <= 0."""
+        if position <= 0:
+            raise EndOfString
+        position = rutf8.prev_codepoint_pos(self._utf8, position)
+        assert position >= 0
+        return position
+    prev_indirect = prev
+
+    def next_n(self, position, n, end_position):
+        """Apply rutf8.next_codepoint_pos() 'n' times starting from
+        'position'.  Raises EndOfString if 'end_position' is reached
+        while there are still steps left to do."""
+        for i in range(n):
+            if position >= end_position:
+                raise EndOfString
+            position = rutf8.next_codepoint_pos(self._utf8, position)
+        return position
+
+    def prev_n(self, position, n, start_position):
+        """Apply rutf8.prev_codepoint_pos() 'n' times starting from
+        'position'.  Raises EndOfString if 'start_position' is reached
+        while there are still steps left to do."""
+        for i in range(n):
+            if position <= start_position:
+                raise EndOfString
+            position = rutf8.prev_codepoint_pos(self._utf8, position)
+        assert position >= 0
+        return position
+
+    def debug_check_pos(self, position):
+        """Untranslated-only sanity check: 'position' must not point to
+        a utf8 continuation byte."""
+        if we_are_translated():
+            return
+        if position == len(self._utf8):
+            return   # end of string is fine
+        # self._utf8[position] is a 1-character string: it needs ord()
+        # before being compared with integers, otherwise the chained
+        # comparison below is never true and the assert can never fail.
+        byte = ord(self._utf8[position])
+        assert not (0x80 <= byte < 0xC0)   # continuation byte
+
+    def maximum_distance(self, position_low, position_high):
+        # may overestimate if there are non-ascii chars
+        return position_high - position_low
+
+
+def make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags):
+    """Build a Utf8MatchContext for 'utf8string'.  Both byte positions
+    are first clamped into the range 0..len(utf8string), then checked
+    with debug_check_pos()."""
+    length = len(utf8string)
+    # clamp: anything below 0 becomes 0, anything above 'length'
+    # becomes 'length'
+    bytestart = max(0, min(bytestart, length))
+    byteend = max(0, min(byteend, length))
+    context = Utf8MatchContext(pattern, utf8string, bytestart, byteend,
+                               flags)
+    context.debug_check_pos(bytestart)
+    context.debug_check_pos(byteend)
+    return context
+
+def utf8search(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0):
+    """Run search_context() on a fresh utf8 context.  Returns the
+    context if it reports success, and None otherwise."""
+    # bytestart and byteend must be valid byte positions inside the
+    # utf8string.
+    from rpython.rlib.rsre.rsre_core import search_context
+
+    context = make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags)
+    if not search_context(context):
+        return None
+    return context
+
+def utf8match(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0,
+              fullmatch=False):
+    """Run match_context() on a fresh utf8 context whose 'fullmatch_only'
+    attribute is set from 'fullmatch'.  Returns the context if it reports
+    success, and None otherwise."""
+    # bytestart and byteend must be valid byte positions inside the
+    # utf8string.
+    from rpython.rlib.rsre.rsre_core import match_context
+
+    context = make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags)
+    context.fullmatch_only = fullmatch
+    if not match_context(context):
+        return None
+    return context
diff --git a/rpython/rlib/rsre/test/support.py
b/rpython/rlib/rsre/test/support.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rsre/test/support.py
@@ -0,0 +1,136 @@
+import sys, random
+from rpython.rlib import debug
+from rpython.rlib.rsre.rsre_core import _adjust, match_context, search_context
+from rpython.rlib.rsre.rsre_core import StrMatchContext, EndOfString
+
+
+class Position(object):
+    """Opaque wrapper around a non-negative integer index, used by the
+    tests.  It only compares with other Positions and with the plain
+    integer -1; any other comparison raises TypeError."""
+
+    def __init__(self, p):
+        assert isinstance(p, int)
+        if p < 0:
+            raise debug.NegativeArgumentNotAllowed(
+                "making a Position with byte index %r" % p)
+        self._p = p
+
+    def __repr__(self):
+        return '<Position %d>' % (self._p,)
+
+    def __cmp__(self, other):
+        # work out which integer we are compared against, or refuse
+        if isinstance(other, Position):
+            other_value = other._p
+        elif type(other) is int and other == -1:
+            other_value = -1
+        else:
+            raise TypeError("cannot compare %r with %r" % (self, other))
+        return cmp(self._p, other_value)
+
+
+class MatchContextForTests(StrMatchContext):
+    """Concrete subclass for matching in a plain string, tweaked for tests:
+    all positions are Position instances instead of plain integers."""
+
+    ZERO = Position(0)
+
+    def next(self, position):
+        """Return the Position one step after 'position'."""
+        assert isinstance(position, Position)
+        return Position(position._p + 1)
+    next_indirect = next
+
+    def prev(self, position):
+        """Return the Position one step before 'position'.  Raises
+        EndOfString if 'position' is at index 0."""
+        assert isinstance(position, Position)
+        if position._p == 0:
+            raise EndOfString
+        return Position(position._p - 1)
+    prev_indirect = prev
+
+    def next_n(self, position, n, end_position):
+        """Return the Position 'n' steps after 'position'.  Raises
+        EndOfString if that would be past 'end_position'."""
+        assert isinstance(position, Position)
+        assert isinstance(end_position, Position)
+        assert position._p <= end_position._p
+        target = position._p + n
+        if target > end_position._p:
+            raise EndOfString
+        return Position(target)
+
+    def prev_n(self, position, n, start_position):
+        """Return the Position 'n' steps before 'position'.  Raises
+        EndOfString if that would be before 'start_position'."""
+        assert isinstance(position, Position)
+        assert isinstance(start_position, Position)
+        assert position._p >= start_position._p
+        target = position._p - n
+        if target < start_position._p:
+            raise EndOfString
+        return Position(target)
+
+    def _real_pos(self, position):
+        """Unwrap a Position into its integer; the plain integer -1 is
+        passed through unchanged."""
+        if type(position) is int and position == -1:
+            return -1
+        assert isinstance(position, Position)
+        return position._p
+
+    def group(self, groupnum=0):
+        """Return the substring spanned by group 'groupnum', or None if
+        its span does not satisfy ZERO <= start <= end."""
+        frm, to = self.span(groupnum)
+        if not (self.ZERO <= frm <= to):
+            return None
+        lower = self._real_pos(frm)
+        upper = self._real_pos(to)
+        return self._string[lower:upper]
+
+    def str(self, position):
+        """Return the ordinal of the character at 'position'."""
+        assert isinstance(position, Position)
+        char = self._string[position._p]
+        return ord(char)
+
+    def debug_check_pos(self, position):
+        # here a valid position is simply any Position instance
+        assert isinstance(position, Position)
+
+    #def minimum_distance(self, position_low, position_high):
+    #    """Return an estimate.  The real value may be higher."""
+    #    assert isinstance(position_low, Position)
+    #    assert isinstance(position_high, Position)
+    #    dist = position_high._p - position_low._p
+    #    if dist == 0:
+    #        return 0
+    #    return random.randrange(1, dist + 1)
+
+    def maximum_distance(self, position_low, position_high):
+        """Return an estimate.  The real value may be lower."""
+        assert isinstance(position_low, Position)
+        assert isinstance(position_high, Position)
+        # the exact distance, plus a random extra in range(0, 10)
+        exact = position_high._p - position_low._p
+        return exact + random.randrange(0, 10)
+
+    def bytes_difference(self, position1, position2):
+        """Return the integer 'position1 - position2'."""
+        assert isinstance(position1, Position)
+        assert isinstance(position2, Position)
+        return position1._p - position2._p
+
+    def get_single_byte(self, base_position, index):
+        """Return the ordinal of the character found 'index' items after
+        'base_position'."""
+        assert isinstance(base_position, Position)
+        assert isinstance(index, int)
+        char = self._string[base_position._p + index]
+        return ord(char)
+
+    def go_forward_by_bytes(self, base_position, index):
+        """Return the Position 'index' items after 'base_position'."""
+        assert isinstance(base_position, Position)
+        assert isinstance(index, int)
+        return Position(base_position._p + index)
+
+    def fresh_copy(self, start):
+        """Return a new context on the same pattern, string, end and
+        flags, but starting at 'start'."""
+        return MatchContextForTests(self.pattern, self._string, start,
+                                    self.end, self.flags)
+
+
+def match(pattern, string, start=0, end=sys.maxint, flags=0, fullmatch=False):
+    """Run match_context() on a MatchContextForTests built for 'string'.
+    'start' and 'end' are plain integers: they go through _adjust() and
+    are then wrapped into Positions.  Returns the context if
+    match_context() reports success, and None otherwise."""
+    start, end = _adjust(start, end, len(string))
+    context = MatchContextForTests(pattern, string,
+                                   Position(start), Position(end), flags)
+    context.fullmatch_only = fullmatch
+    if not match_context(context):
+        return None
+    return context
+
+def fullmatch(pattern, string, start=0, end=sys.maxint, flags=0):
+    """Same as match(), with fullmatch=True."""
+    return match(pattern, string, start=start, end=end, flags=flags,
+                 fullmatch=True)
+
+def search(pattern, string, start=0, end=sys.maxint, flags=0):
+    """Run search_context() on a MatchContextForTests built for 'string'.
+    'start' and 'end' are plain integers: they go through _adjust() and
+    are then wrapped into Positions.  Returns the context if
+    search_context() reports success, and None otherwise."""
+    start, end = _adjust(start, end, len(string))
+    context = MatchContextForTests(pattern, string,
+                                   Position(start), Position(end), flags)
+    if not search_context(context):
+        return None
+    return context
diff --git a/rpython/rlib/rsre/test/test_ext_opcode.py
b/rpython/rlib/rsre/test/test_ext_opcode.py
--- a/rpython/rlib/rsre/test/test_ext_opcode.py
+++ b/rpython/rlib/rsre/test/test_ext_opcode.py
@@ -5,6 +5,7 @@
from rpython.rlib.rsre import rsre_core
from rpython.rlib.rsre.rsre_char import MAXREPEAT
+from rpython.rlib.rsre.test.support import match, Position
# import OPCODE_XX as XX
for name, value in rsre_core.__dict__.items():
@@ -17,10 +18,10 @@
# it's a valid optimization because \1 is always one character long
r = [MARK, 0, ANY, MARK, 1, REPEAT_ONE, 6, 0, MAXREPEAT,
GROUPREF, 0, SUCCESS, SUCCESS]
- assert rsre_core.match(r, "aaa").match_end == 3
+ assert match(r, "aaa").match_end == Position(3)
def test_min_repeat_one_with_backref():
# Python 3.5 compiles "(.)\1*?b" using MIN_REPEAT_ONE
r = [MARK, 0, ANY, MARK, 1, MIN_REPEAT_ONE, 6, 0, MAXREPEAT,
GROUPREF, 0, SUCCESS, LITERAL, 98, SUCCESS]
- assert rsre_core.match(r, "aaab").match_end == 4
+ assert match(r, "aaab").match_end == Position(4)
diff --git a/rpython/rlib/rsre/test/test_match.py
b/rpython/rlib/rsre/test/test_match.py
--- a/rpython/rlib/rsre/test/test_match.py
+++ b/rpython/rlib/rsre/test/test_match.py
@@ -1,6 +1,7 @@
import re, random, py
-from rpython.rlib.rsre import rsre_core, rsre_char
+from rpython.rlib.rsre import rsre_char
from rpython.rlib.rsre.rpy import get_code, VERSION
+from rpython.rlib.rsre.test.support import match, fullmatch, Position as P
def get_code_and_re(regexp):
@@ -16,234 +17,234 @@
def test_or(self):
r = get_code(r"a|bc|def")
- assert rsre_core.match(r, "a")
- assert rsre_core.match(r, "bc")
- assert rsre_core.match(r, "def")
- assert not rsre_core.match(r, "ghij")
+ assert match(r, "a")
+ assert match(r, "bc")
+ assert match(r, "def")
+ assert not match(r, "ghij")
def test_any(self):
r = get_code(r"ab.cd")
- assert rsre_core.match(r, "abXcdef")
- assert not rsre_core.match(r, "ab\ncdef")
- assert not rsre_core.match(r, "abXcDef")
+ assert match(r, "abXcdef")
+ assert not match(r, "ab\ncdef")
+ assert not match(r, "abXcDef")
def test_any_repetition(self):
r = get_code(r"ab.*cd")
- assert rsre_core.match(r, "abXXXXcdef")
- assert rsre_core.match(r, "abcdef")
- assert not rsre_core.match(r, "abX\nXcdef")
- assert not rsre_core.match(r, "abXXXXcDef")
+ assert match(r, "abXXXXcdef")
+ assert match(r, "abcdef")
+ assert not match(r, "abX\nXcdef")
+ assert not match(r, "abXXXXcDef")
def test_any_all(self):
r = get_code(r"(?s)ab.cd")
- assert rsre_core.match(r, "abXcdef")
- assert rsre_core.match(r, "ab\ncdef")
- assert not rsre_core.match(r, "ab\ncDef")
+ assert match(r, "abXcdef")
+ assert match(r, "ab\ncdef")
+ assert not match(r, "ab\ncDef")
def test_any_all_repetition(self):
r = get_code(r"(?s)ab.*cd")
- assert rsre_core.match(r, "abXXXXcdef")
- assert rsre_core.match(r, "abcdef")
- assert rsre_core.match(r, "abX\nXcdef")
- assert not rsre_core.match(r, "abX\nXcDef")
+ assert match(r, "abXXXXcdef")
+ assert match(r, "abcdef")
+ assert match(r, "abX\nXcdef")
+ assert not match(r, "abX\nXcDef")
def test_assert(self):
r = get_code(r"abc(?=def)(.)")
- res = rsre_core.match(r, "abcdefghi")
- assert res is not None and res.get_mark(1) == 4
- assert not rsre_core.match(r, "abcdeFghi")
+ res = match(r, "abcdefghi")
+ assert res is not None and res.get_mark(1) == P(4)
+ assert not match(r, "abcdeFghi")
def test_assert_not(self):
r = get_code(r"abc(?!def)(.)")
- res = rsre_core.match(r, "abcdeFghi")
- assert res is not None and res.get_mark(1) == 4
- assert not rsre_core.match(r, "abcdefghi")
+ res = match(r, "abcdeFghi")
+ assert res is not None and res.get_mark(1) == P(4)
+ assert not match(r, "abcdefghi")
def test_lookbehind(self):
r = get_code(r"([a-z]*)(?<=de)")
- assert rsre_core.match(r, "ade")
- res = rsre_core.match(r, "adefg")
- assert res is not None and res.get_mark(1) == 3
- assert not rsre_core.match(r, "abc")
- assert not rsre_core.match(r, "X")
- assert not rsre_core.match(r, "eX")
+ assert match(r, "ade")
+ res = match(r, "adefg")
+ assert res is not None and res.get_mark(1) == P(3)
+ assert not match(r, "abc")
+ assert not match(r, "X")
+ assert not match(r, "eX")
def test_negative_lookbehind(self):
def found(s):
- res = rsre_core.match(r, s)
+ res = match(r, s)
assert res is not None
return res.get_mark(1)
r = get_code(r"([a-z]*)(?<!dd)")
- assert found("ade") == 3
- assert found("adefg") == 5
- assert found("abcdd") == 4
- assert found("abddd") == 3
- assert found("adddd") == 2
- assert found("ddddd") == 1
- assert found("abXde") == 2
+ assert found("ade") == P(3)
+ assert found("adefg") == P(5)
+ assert found("abcdd") == P(4)
+ assert found("abddd") == P(3)
+ assert found("adddd") == P(2)
+ assert found("ddddd") == P(1)
+ assert found("abXde") == P(2)
def test_at(self):
r = get_code(r"abc$")
- assert rsre_core.match(r, "abc")
- assert not rsre_core.match(r, "abcd")
- assert not rsre_core.match(r, "ab")
+ assert match(r, "abc")
+ assert not match(r, "abcd")
+ assert not match(r, "ab")
def test_repeated_set(self):
r = get_code(r"[a0x]+f")
- assert rsre_core.match(r, "a0af")
- assert not rsre_core.match(r, "a0yaf")
+ assert match(r, "a0af")
+ assert not match(r, "a0yaf")
def test_category(self):
r = get_code(r"[\sx]")
- assert rsre_core.match(r, "x")
- assert rsre_core.match(r, " ")
- assert not rsre_core.match(r, "n")
+ assert match(r, "x")
+ assert match(r, " ")
+ assert not match(r, "n")
def test_groupref(self):
r = get_code(r"(xx+)\1+$") # match non-prime numbers of x
- assert not rsre_core.match(r, "xx")
- assert not rsre_core.match(r, "xxx")
- assert rsre_core.match(r, "xxxx")
- assert not rsre_core.match(r, "xxxxx")
- assert rsre_core.match(r, "xxxxxx")
- assert not rsre_core.match(r, "xxxxxxx")
- assert rsre_core.match(r, "xxxxxxxx")
- assert rsre_core.match(r, "xxxxxxxxx")
+ assert not match(r, "xx")
+ assert not match(r, "xxx")
+ assert match(r, "xxxx")
+ assert not match(r, "xxxxx")
+ assert match(r, "xxxxxx")
+ assert not match(r, "xxxxxxx")
+ assert match(r, "xxxxxxxx")
+ assert match(r, "xxxxxxxxx")
def test_groupref_ignore(self):
r = get_code(r"(?i)(xx+)\1+$") # match non-prime numbers of x
- assert not rsre_core.match(r, "xX")
- assert not rsre_core.match(r, "xxX")
- assert rsre_core.match(r, "Xxxx")
- assert not rsre_core.match(r, "xxxXx")
- assert rsre_core.match(r, "xXxxxx")
- assert not rsre_core.match(r, "xxxXxxx")
- assert rsre_core.match(r, "xxxxxxXx")
- assert rsre_core.match(r, "xxxXxxxxx")
+ assert not match(r, "xX")
+ assert not match(r, "xxX")
+ assert match(r, "Xxxx")
+ assert not match(r, "xxxXx")
+ assert match(r, "xXxxxx")
+ assert not match(r, "xxxXxxx")
+ assert match(r, "xxxxxxXx")
+ assert match(r, "xxxXxxxxx")
def test_groupref_exists(self):
r = get_code(r"((a)|(b))c(?(2)d)$")
- assert not rsre_core.match(r, "ac")
- assert rsre_core.match(r, "acd")
- assert rsre_core.match(r, "bc")
- assert not rsre_core.match(r, "bcd")
+ assert not match(r, "ac")
+ assert match(r, "acd")
+ assert match(r, "bc")
+ assert not match(r, "bcd")
#
r = get_code(r"((a)|(b))c(?(2)d|e)$")
- assert not rsre_core.match(r, "ac")
- assert rsre_core.match(r, "acd")
- assert not rsre_core.match(r, "ace")
- assert not rsre_core.match(r, "bc")
- assert not rsre_core.match(r, "bcd")
- assert rsre_core.match(r, "bce")
+ assert not match(r, "ac")
+ assert match(r, "acd")
+ assert not match(r, "ace")
+ assert not match(r, "bc")
+ assert not match(r, "bcd")
+ assert match(r, "bce")
def test_in_ignore(self):
r = get_code(r"(?i)[a-f]")
- assert rsre_core.match(r, "b")
- assert rsre_core.match(r, "C")
- assert not rsre_core.match(r, "g")
+ assert match(r, "b")
+ assert match(r, "C")
+ assert not match(r, "g")
r = get_code(r"(?i)[a-f]+$")
- assert rsre_core.match(r, "bCdEf")
- assert not rsre_core.match(r, "g")
- assert not rsre_core.match(r, "aaagaaa")
+ assert match(r, "bCdEf")
+ assert not match(r, "g")
+ assert not match(r, "aaagaaa")
def test_not_literal(self):
r = get_code(r"[^a]")
- assert rsre_core.match(r, "A")
- assert not rsre_core.match(r, "a")
+ assert match(r, "A")
+ assert not match(r, "a")
r = get_code(r"[^a]+$")
- assert rsre_core.match(r, "Bx123")
- assert not rsre_core.match(r, "--a--")
+ assert match(r, "Bx123")
+ assert not match(r, "--a--")
def test_not_literal_ignore(self):
r = get_code(r"(?i)[^a]")
- assert rsre_core.match(r, "G")
- assert not rsre_core.match(r, "a")
- assert not rsre_core.match(r, "A")
+ assert match(r, "G")
+ assert not match(r, "a")
+ assert not match(r, "A")
r = get_code(r"(?i)[^a]+$")
- assert rsre_core.match(r, "Gx123")
- assert not rsre_core.match(r, "--A--")
+ assert match(r, "Gx123")
+ assert not match(r, "--A--")
def test_repeated_single_character_pattern(self):
r = get_code(r"foo(?:(?<=foo)x)+$")
- assert rsre_core.match(r, "foox")
+ assert match(r, "foox")
def test_flatten_marks(self):
r = get_code(r"a(b)c((d)(e))+$")
- res = rsre_core.match(r, "abcdedede")
- assert res.flatten_marks() == [0, 9, 1, 2, 7, 9, 7, 8, 8, 9]
- assert res.flatten_marks() == [0, 9, 1, 2, 7, 9, 7, 8, 8, 9]
+ res = match(r, "abcdedede")
+ assert res.flatten_marks() == map(P, [0, 9, 1, 2, 7, 9, 7, 8, 8, 9])
+ assert res.flatten_marks() == map(P, [0, 9, 1, 2, 7, 9, 7, 8, 8, 9])
def test_bug1(self):
# REPEAT_ONE inside REPEAT
r = get_code(r"(?:.+)?B")
- assert rsre_core.match(r, "AB") is not None
+ assert match(r, "AB") is not None
r = get_code(r"(?:AA+?)+B")
- assert rsre_core.match(r, "AAAB") is not None
+ assert match(r, "AAAB") is not None
r = get_code(r"(?:AA+)+?B")
- assert rsre_core.match(r, "AAAB") is not None
+ assert match(r, "AAAB") is not None
r = get_code(r"(?:AA+?)+?B")
- assert rsre_core.match(r, "AAAB") is not None
+ assert match(r, "AAAB") is not None
# REPEAT inside REPEAT
r = get_code(r"(?:(?:xy)+)?B")
- assert rsre_core.match(r, "xyB") is not None
+ assert match(r, "xyB") is not None
r = get_code(r"(?:xy(?:xy)+?)+B")
- assert rsre_core.match(r, "xyxyxyB") is not None
+ assert match(r, "xyxyxyB") is not None
r = get_code(r"(?:xy(?:xy)+)+?B")
- assert rsre_core.match(r, "xyxyxyB") is not None
+ assert match(r, "xyxyxyB") is not None
r = get_code(r"(?:xy(?:xy)+?)+?B")
- assert rsre_core.match(r, "xyxyxyB") is not None
+ assert match(r, "xyxyxyB") is not None
def test_assert_group(self):
r = get_code(r"abc(?=(..)f)(.)")
- res = rsre_core.match(r, "abcdefghi")
+ res = match(r, "abcdefghi")
assert res is not None
- assert res.span(2) == (3, 4)
- assert res.span(1) == (3, 5)
+ assert res.span(2) == (P(3), P(4))
+ assert res.span(1) == (P(3), P(5))
def test_assert_not_group(self):
r = get_code(r"abc(?!(de)f)(.)")
- res = rsre_core.match(r, "abcdeFghi")
+ res = match(r, "abcdeFghi")
assert res is not None
- assert res.span(2) == (3, 4)
+ assert res.span(2) == (P(3), P(4))
# this I definitely classify as Horrendously Implementation Dependent.
# CPython answers (3, 5).
assert res.span(1) == (-1, -1)
def test_match_start(self):
r = get_code(r"^ab")
- assert rsre_core.match(r, "abc")
- assert not rsre_core.match(r, "xxxabc", start=3)
- assert not rsre_core.match(r, "xx\nabc", start=3)
+ assert match(r, "abc")
+ assert not match(r, "xxxabc", start=3)
+ assert not match(r, "xx\nabc", start=3)
#
r = get_code(r"(?m)^ab")
- assert rsre_core.match(r, "abc")
- assert not rsre_core.match(r, "xxxabc", start=3)
- assert rsre_core.match(r, "xx\nabc", start=3)
+ assert match(r, "abc")
+ assert not match(r, "xxxabc", start=3)
+ assert match(r, "xx\nabc", start=3)
def test_match_end(self):
r = get_code("ab")
- assert rsre_core.match(r, "abc")
- assert rsre_core.match(r, "abc", end=333)
- assert rsre_core.match(r, "abc", end=3)
- assert rsre_core.match(r, "abc", end=2)
- assert not rsre_core.match(r, "abc", end=1)
- assert not rsre_core.match(r, "abc", end=0)
- assert not rsre_core.match(r, "abc", end=-1)
+ assert match(r, "abc")
+ assert match(r, "abc", end=333)
+ assert match(r, "abc", end=3)
+ assert match(r, "abc", end=2)
+ assert not match(r, "abc", end=1)
+ assert not match(r, "abc", end=0)
+ assert not match(r, "abc", end=-1)
def test_match_bug1(self):
r = get_code(r'(x??)?$')
- assert rsre_core.match(r, "x")
+ assert match(r, "x")
def test_match_bug2(self):
r = get_code(r'(x??)??$')
- assert rsre_core.match(r, "x")
+ assert match(r, "x")
def test_match_bug3(self):
if VERSION == "2.7.5":
py.test.skip("pattern fails to compile with exactly 2.7.5 "
"(works on 2.7.3 and on 2.7.trunk though)")
r = get_code(r'([ax]*?x*)?$')
- assert rsre_core.match(r, "aaxaa")
+ assert match(r, "aaxaa")
def test_bigcharset(self):
for i in range(100):
@@ -252,10 +253,10 @@
pattern = u'[%s]' % (u''.join(chars),)
r = get_code(pattern)
for c in chars:
- assert rsre_core.match(r, c)
+ assert match(r, c)
for i in range(200):
c = unichr(random.randrange(0x0, 0xD000))
- res = rsre_core.match(r, c)
+ res = match(r, c)
if c in chars:
assert res is not None
else:
@@ -264,41 +265,41 @@
def test_simple_match_1(self):
r = get_code(r"ab*bbbbbbbc")
print r
- match = rsre_core.match(r, "abbbbbbbbbcdef")
- assert match
- assert match.match_end == 11
+ m = match(r, "abbbbbbbbbcdef")
+ assert m
+ assert m.match_end == P(11)
def test_empty_maxuntil(self):
r = get_code("\\{\\{((?:.*?)+)\\}\\}")
- match = rsre_core.match(r, "{{a}}{{b}}")
- assert match.group(1) == "a"
+ m = match(r, "{{a}}{{b}}")
+ assert m.group(1) == "a"
def test_fullmatch_1(self):
r = get_code(r"ab*c")
- assert not rsre_core.fullmatch(r, "abbbcdef")
- assert rsre_core.fullmatch(r, "abbbc")
+ assert not fullmatch(r, "abbbcdef")
+ assert fullmatch(r, "abbbc")
def test_fullmatch_2(self):
r = get_code(r"a(b*?)")
- match = rsre_core.fullmatch(r, "abbb")
+ match = fullmatch(r, "abbb")
assert match.group(1) == "bbb"
- assert not rsre_core.fullmatch(r, "abbbc")
+ assert not fullmatch(r, "abbbc")
def test_fullmatch_3(self):
r = get_code(r"a((bp)*?)c")
- match = rsre_core.fullmatch(r, "abpbpbpc")
+ match = fullmatch(r, "abpbpbpc")
assert match.group(1) == "bpbpbp"
def test_fullmatch_4(self):
r = get_code(r"a((bp)*)c")
- match = rsre_core.fullmatch(r, "abpbpbpc")
+ match = fullmatch(r, "abpbpbpc")
assert match.group(1) == "bpbpbp"
def test_fullmatch_assertion(self):
r = get_code(r"(?=a).b")
- assert rsre_core.fullmatch(r, "ab")
+ assert fullmatch(r, "ab")
r = get_code(r"(?!a)..")
- assert not rsre_core.fullmatch(r, "ab")
+ assert not fullmatch(r, "ab")
def test_range_ignore(self):
from rpython.rlib.unicodedata import unicodedb
@@ -307,4 +308,4 @@
r = get_code(u"[\U00010428-\U0001044f]", re.I)
assert r.count(27) == 1 # OPCODE_RANGE
r[r.index(27)] = 32 # => OPCODE_RANGE_IGNORE
- assert rsre_core.match(r, u"\U00010428")
+ assert match(r, u"\U00010428")
diff --git a/rpython/rlib/rsre/test/test_search.py
b/rpython/rlib/rsre/test/test_search.py
--- a/rpython/rlib/rsre/test/test_search.py
+++ b/rpython/rlib/rsre/test/test_search.py
@@ -1,44 +1,48 @@
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit