[pypy-commit] pypy unicode-utf8: merge unicode-utf8-re

fijal Sun, 10 Dec 2017 22:07:40 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r93355:43e73aa47541
Date: 2017-12-11 08:05 +0200
http://bitbucket.org/pypy/pypy/changeset/43e73aa47541/


Log:    merge unicode-utf8-re

diff too long, truncating to 2000 out of 2355 lines

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -8,13 +8,12 @@
 from rpython.rlib.rarithmetic import intmask
 from rpython.rlib import jit, rutf8
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import Utf8StringBuilder
 
 # ____________________________________________________________
 #
 # Constants and exposed functions
 
-from rpython.rlib.rsre import rsre_core
+from rpython.rlib.rsre import rsre_core, rsre_utf8
 from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, getlower, set_unicode_db
 
 
@@ -35,15 +34,18 @@
 
 
 def slice_w(space, ctx, start, end, w_default):
-    if 0 <= start <= end:
+    # 'start' and 'end' are byte positions
+    if ctx.ZERO <= start <= end:
         if isinstance(ctx, rsre_core.BufMatchContext):
             return space.newbytes(ctx._buffer.getslice(start, end, 1,
                                                         end-start))
         if isinstance(ctx, rsre_core.StrMatchContext):
+            start = ctx._real_pos(start)
+            end = ctx._real_pos(end)
             return space.newbytes(ctx._string[start:end])
-        elif isinstance(ctx, rsre_core.UnicodeMatchContext):
-            s = ctx._unicodestr[start:end]
-            lgt = rutf8.check_utf8(s, True)
+        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            s = ctx._utf8[start:end]
+            lgt = rutf8.get_utf8_length(s)
             return space.newutf8(s, lgt)
         else:
             # unreachable
@@ -56,6 +58,7 @@
     # Returns a list of RPython-level integers.
     # Unlike the app-level groups() method, groups are numbered from 0
     # and the returned list does not start with the whole match range.
+    # The integers are byte positions, not character indexes (for utf8).
     if num_groups == 0:
         return None
     result = [-1] * (2 * num_groups)
@@ -104,7 +107,7 @@
         raise oefmt(space.w_TypeError, "cannot copy this pattern object")
 
     def make_ctx(self, w_string, pos=0, endpos=sys.maxint):
-        """Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for
+        """Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for
         searching in the given w_string object."""
         space = self.space
         if pos < 0:
@@ -112,23 +115,36 @@
         if endpos < pos:
             endpos = pos
         if space.isinstance_w(w_string, space.w_unicode):
-            unicodestr = space.utf8_w(w_string)
-            # XXX will fail some tests, the length need to be adjusted for
-            #     real char len etc
-            if pos > len(unicodestr):
-                pos = len(unicodestr)
-            if endpos > len(unicodestr):
-                endpos = len(unicodestr)
-            return rsre_core.UnicodeMatchContext(self.code, unicodestr,
-                                                 pos, endpos, self.flags)
+            w_unicode_obj = space.convert_arg_to_w_unicode(w_string)
+            utf8str = w_unicode_obj._utf8
+            length = w_unicode_obj._len()
+            if pos <= 0:
+                bytepos = 0
+            elif pos >= length:
+                bytepos = len(utf8str)
+            else:
+                index_storage = w_unicode_obj._get_index_storage()
+                bytepos = rutf8.codepoint_position_at_index(utf8str,
+                                index_storage, pos)
+            if endpos >= length:
+                endbytepos = len(utf8str)
+            else:
+                index_storage = w_unicode_obj._get_index_storage()
+                endbytepos = rutf8.codepoint_position_at_index(utf8str,
+                                index_storage, endpos)
+            ctx = rsre_utf8.Utf8MatchContext(
+                self.code, utf8str, bytepos, endbytepos, self.flags)
+            # xxx we store the w_string on the ctx too, for
+            # W_SRE_Match.bytepos_to_charindex()
+            ctx.w_unicode_obj = w_unicode_obj
+            return ctx
         elif space.isinstance_w(w_string, space.w_bytes):
             str = space.bytes_w(w_string)
             if pos > len(str):
                 pos = len(str)
             if endpos > len(str):
                 endpos = len(str)
-            return rsre_core.StrMatchContext(self.code, str,
-                                             pos, endpos, self.flags)
+            return self._make_str_match_context(str, pos, endpos)
         else:
             buf = space.readbuf_w(w_string)
             size = buf.getlength()
@@ -140,6 +156,11 @@
             return rsre_core.BufMatchContext(self.code, buf,
                                              pos, endpos, self.flags)
 
+    def _make_str_match_context(self, str, pos, endpos):
+        # for tests to override
+        return rsre_core.StrMatchContext(self.code, str,
+                                         pos, endpos, self.flags)
+
     def getmatch(self, ctx, found):
         if found:
             return W_SRE_Match(self, ctx)
@@ -178,8 +199,10 @@
                     w_item = allgroups_w(space, ctx, fmarks, num_groups,
                                          w_emptystr)
             matchlist_w.append(w_item)
-            no_progress = (ctx.match_start == ctx.match_end)
-            ctx.reset(ctx.match_end + no_progress)
+            reset_at = ctx.match_end
+            if ctx.match_start == ctx.match_end:
+                reset_at = ctx.next_indirect(reset_at)
+            ctx.reset(reset_at)
         return space.newlist(matchlist_w)
 
     @unwrap_spec(pos=int, endpos=int)
@@ -195,15 +218,15 @@
         space = self.space
         splitlist = []
         n = 0
-        last = 0
         ctx = self.make_ctx(w_string)
+        last = ctx.ZERO
         while not maxsplit or n < maxsplit:
             if not searchcontext(space, ctx):
                 break
             if ctx.match_start == ctx.match_end:     # zero-width match
                 if ctx.match_start == ctx.end:       # or end of string
                     break
-                ctx.reset(ctx.match_end + 1)
+                ctx.reset(ctx.next_indirect(ctx.match_end))
                 continue
             splitlist.append(slice_w(space, ctx, last, ctx.match_start,
                                      space.w_None))
@@ -232,20 +255,20 @@
 
     def subx(self, w_ptemplate, w_string, count):
         space = self.space
-        # use a (much faster) string/unicode builder if w_ptemplate and
+        # use a (much faster) string builder (possibly utf8) if w_ptemplate and
         # w_string are both string or both unicode objects, and if w_ptemplate
         # is a literal
-        use_builder = False
-        filter_as_unicode = filter_as_string = None
+        use_builder = '\x00'   # or 'S'tring or 'U'nicode/UTF8
+        filter_as_string = None
         if space.is_true(space.callable(w_ptemplate)):
             w_filter = w_ptemplate
             filter_is_callable = True
         else:
             if space.isinstance_w(w_ptemplate, space.w_unicode):
-                filter_as_unicode = space.utf8_w(w_ptemplate)
-                literal = '\\' not in filter_as_unicode
-                use_builder = (
-                    space.isinstance_w(w_string, space.w_unicode) and literal)
+                filter_as_string = space.utf8_w(w_ptemplate)
+                literal = '\\' not in filter_as_string
+                if space.isinstance_w(w_string, space.w_unicode) and literal:
+                    use_builder = 'U'
             else:
                 try:
                     filter_as_string = space.bytes_w(w_ptemplate)
@@ -255,8 +278,8 @@
                     literal = False
                 else:
                     literal = '\\' not in filter_as_string
-                    use_builder = (
-                        space.isinstance_w(w_string, space.w_bytes) and literal)
+                    if space.isinstance_w(w_string, space.w_bytes) and literal:
+                        use_builder = 'S'
             if literal:
                 w_filter = w_ptemplate
                 filter_is_callable = False
@@ -269,16 +292,14 @@
         #
         # XXX this is a bit of a mess, but it improves performance a lot
         ctx = self.make_ctx(w_string)
-        sublist_w = strbuilder = unicodebuilder = None
-        if use_builder:
-            if filter_as_unicode is not None:
-                unicodebuilder = Utf8StringBuilder(ctx.end)
-            else:
-                assert filter_as_string is not None
-                strbuilder = StringBuilder(ctx.end)
+        sublist_w = strbuilder = None
+        if use_builder != '\x00':
+            assert filter_as_string is not None
+            strbuilder = StringBuilder(ctx.end)
         else:
             sublist_w = []
-        n = last_pos = 0
+        n = 0
+        last_pos = ctx.ZERO
         while not count or n < count:
             sub_jitdriver.jit_merge_point(
                 self=self,
@@ -288,9 +309,7 @@
                 ctx=ctx,
                 w_filter=w_filter,
                 strbuilder=strbuilder,
-                unicodebuilder=unicodebuilder,
                 filter_as_string=filter_as_string,
-                filter_as_unicode=filter_as_unicode,
                 count=count,
                 w_string=w_string,
                 n=n, last_pos=last_pos, sublist_w=sublist_w
@@ -301,10 +320,10 @@
             if last_pos < ctx.match_start:
                 _sub_append_slice(
                     ctx, space, use_builder, sublist_w,
-                    strbuilder, unicodebuilder, last_pos, ctx.match_start)
+                    strbuilder, last_pos, ctx.match_start)
             start = ctx.match_end
             if start == ctx.match_start:
-                start += 1
+                start = ctx.next_indirect(start)
             if not (last_pos == ctx.match_start
                              == ctx.match_end and n > 0):
                 # the above ignores empty matches on latest position
@@ -312,18 +331,14 @@
                     w_match = self.getmatch(ctx, True)
                     w_piece = space.call_function(w_filter, w_match)
                     if not space.is_w(w_piece, space.w_None):
-                        assert strbuilder is None and unicodebuilder is None
-                        assert not use_builder
+                        assert strbuilder is None
+                        assert use_builder == '\x00'
                         sublist_w.append(w_piece)
                 else:
-                    if use_builder:
-                        if strbuilder is not None:
-                            assert filter_as_string is not None
-                            strbuilder.append(filter_as_string)
-                        else:
-                            assert unicodebuilder is not None
-                            assert filter_as_unicode is not None
-                            unicodebuilder.append(filter_as_unicode)
+                    if use_builder != '\x00':
+                        assert filter_as_string is not None
+                        assert strbuilder is not None
+                        strbuilder.append(filter_as_string)
                     else:
                         sublist_w.append(w_filter)
                 last_pos = ctx.match_end
@@ -334,14 +349,19 @@
 
         if last_pos < ctx.end:
             _sub_append_slice(ctx, space, use_builder, sublist_w,
-                              strbuilder, unicodebuilder, last_pos, ctx.end)
-        if use_builder:
-            if strbuilder is not None:
-                return space.newbytes(strbuilder.build()), n
+                              strbuilder, last_pos, ctx.end)
+        if use_builder != '\x00':
+            assert strbuilder is not None
+            result_bytes = strbuilder.build()
+            if use_builder == 'S':
+                assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
+                return space.newbytes(result_bytes), n
+            elif use_builder == 'U':
+                assert isinstance(ctx, rsre_utf8.Utf8MatchContext)
+                return space.newutf8(result_bytes,
+                                     rutf8.get_utf8_length(result_bytes)), n
             else:
-                assert unicodebuilder is not None
-                return space.newutf8(unicodebuilder.build(),
-                                     unicodebuilder.get_length()), n
+                raise AssertionError(use_builder)
         else:
             if space.isinstance_w(w_string, space.w_unicode):
                 w_emptystr = space.newutf8('', 0)
@@ -354,26 +374,28 @@
 sub_jitdriver = jit.JitDriver(
     reds="""count n last_pos
             ctx w_filter
-            strbuilder unicodebuilder
+            strbuilder
             filter_as_string
-            filter_as_unicode
             w_string sublist_w
             self""".split(),
     greens=["filter_is_callable", "use_builder", "filter_type", "ctx.pattern"])
 
 
 def _sub_append_slice(ctx, space, use_builder, sublist_w,
-                      strbuilder, unicodebuilder, start, end):
-    if use_builder:
+                      strbuilder, start, end):
+    if use_builder != '\x00':
+        assert strbuilder is not None
         if isinstance(ctx, rsre_core.BufMatchContext):
-            assert strbuilder is not None
+            assert use_builder == 'S'
             return strbuilder.append(ctx._buffer.getslice(start, end, 1, end-start))
         if isinstance(ctx, rsre_core.StrMatchContext):
-            assert strbuilder is not None
+            assert use_builder == 'S'
+            start = ctx._real_pos(start)
+            end = ctx._real_pos(end)
             return strbuilder.append_slice(ctx._string, start, end)
-        elif isinstance(ctx, rsre_core.UnicodeMatchContext):
-            assert unicodebuilder is not None
-            return unicodebuilder.append_slice(ctx._unicodestr, start, end)
+        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            assert use_builder == 'U'
+            return strbuilder.append_slice(ctx._utf8, start, end)
         assert 0, "unreachable"
     else:
         sublist_w.append(slice_w(space, ctx, start, end, space.w_None))
@@ -487,18 +509,39 @@
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def start_w(self, w_groupnum):
-        return self.space.newint(self.do_span(w_groupnum)[0])
+        start, end = self.do_span(w_groupnum)
+        start = self.bytepos_to_charindex(start)
+        return self.space.newint(start)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def end_w(self, w_groupnum):
-        return self.space.newint(self.do_span(w_groupnum)[1])
+        start, end = self.do_span(w_groupnum)
+        end = self.bytepos_to_charindex(end)
+        return self.space.newint(end)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def span_w(self, w_groupnum):
         start, end = self.do_span(w_groupnum)
+        return self.new_charindex_tuple(start, end)
+
+    def new_charindex_tuple(self, start, end):
+        start = self.bytepos_to_charindex(start)
+        end = self.bytepos_to_charindex(end)
         return self.space.newtuple([self.space.newint(start),
                                     self.space.newint(end)])
 
+    def bytepos_to_charindex(self, bytepos):
+        # Transform a 'byte position', as returned by all methods from
+        # rsre_core, back into a 'character index'.  This is for UTF8
+        # handling.
+        ctx = self.ctx
+        if isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            index_storage = ctx.w_unicode_obj._get_index_storage()
+            return rutf8.codepoint_index_at_byte_position(
+                ctx.w_unicode_obj._utf8, index_storage, bytepos)
+        else:
+            return bytepos
+
     def flatten_marks(self):
         if self.flatten_cache is None:
             num_groups = self.srepat.num_groups
@@ -506,6 +549,8 @@
         return self.flatten_cache
 
     def do_span(self, w_arg):
+        # return a pair of integers, which are byte positions, not
+        # character indexes (for utf8)
         space = self.space
         try:
             groupnum = space.int_w(w_arg)
@@ -553,10 +598,10 @@
         return space.w_None
 
     def fget_pos(self, space):
-        return space.newint(self.ctx.original_pos)
+        return space.newint(self.bytepos_to_charindex(self.ctx.original_pos))
 
     def fget_endpos(self, space):
-        return space.newint(self.ctx.end)
+        return space.newint(self.bytepos_to_charindex(self.ctx.end))
 
     def fget_regs(self, space):
         space = self.space
@@ -564,11 +609,11 @@
         num_groups = self.srepat.num_groups
         result_w = [None] * (num_groups + 1)
         ctx = self.ctx
-        result_w[0] = space.newtuple([space.newint(ctx.match_start),
-                                      space.newint(ctx.match_end)])
+        result_w[0] = self.new_charindex_tuple(ctx.match_start,
+                                               ctx.match_end)
         for i in range(num_groups):
-            result_w[i + 1] = space.newtuple([space.newint(fmarks[i*2]),
-                                              space.newint(fmarks[i*2+1])])
+            result_w[i + 1] = self.new_charindex_tuple(fmarks[i*2],
+                                                       fmarks[i*2+1])
         return space.newtuple(result_w)
 
     def fget_string(self, space):
@@ -577,9 +622,9 @@
             return space.newbytes(ctx._buffer.as_str())
         elif isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string)
-        elif isinstance(ctx, rsre_core.UnicodeMatchContext):
-            lgt = rutf8.check_utf8(ctx._unicodestr, True)
-            return space.newutf8(ctx._unicodestr, lgt)
+        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            lgt = rutf8.get_utf8_length(ctx._utf8)
+            return space.newutf8(ctx._utf8, lgt)
         else:
             raise SystemError
 
@@ -644,12 +689,14 @@
         if found:
             ctx = self.ctx
             nextstart = ctx.match_end
-            nextstart += (ctx.match_start == nextstart)
+            if ctx.match_start == nextstart:
+                nextstart = ctx.next_indirect(nextstart)
             self.ctx = ctx.fresh_copy(nextstart)
             match = W_SRE_Match(self.srepat, ctx)
             return match
         else:
-            self.ctx.match_start += 1     # obscure corner case
+            # obscure corner case
+            self.ctx.match_start = self.ctx.next_indirect(self.ctx.match_start)
             return None
 
 W_SRE_Scanner.typedef = TypeDef(
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -4,6 +4,8 @@
 import py
 from py.test import raises, skip
 from pypy.interpreter.gateway import app2interp_temp
+from pypy.module._sre import interp_sre
+from rpython.rlib.rsre.test import support
 
 
 def init_app_test(cls, space):
@@ -20,6 +22,35 @@
             sys.path.pop(0)
         """)
 
+def _test_sre_ctx_(self, str, start, end):
+    # Use the MatchContextForTests class, which handles Position
+    # instances instead of plain integers.  This is used to detect when
+    # we're accepting or escaping a Position to app-level, which we
+    # should not: Positions are meant to be byte indexes inside a
+    # possibly UTF8 string, not character indexes.
+    start = support.Position(start)
+    end = support.Position(end)
+    return support.MatchContextForTests(self.code, str, start, end, self.flags)
+
+def _bytepos_to_charindex(self, bytepos):
+    if isinstance(self.ctx, support.MatchContextForTests):
+        return self.ctx._real_pos(bytepos)
+    return _org_maker[1](self, bytepos)
+
+def setup_module(mod):
+    mod._org_maker = (
+        interp_sre.W_SRE_Pattern._make_str_match_context,
+        interp_sre.W_SRE_Match.bytepos_to_charindex,
+        )
+    interp_sre.W_SRE_Pattern._make_str_match_context = _test_sre_ctx_
+    interp_sre.W_SRE_Match.bytepos_to_charindex = _bytepos_to_charindex
+
+def teardown_module(mod):
+    (
+        interp_sre.W_SRE_Pattern._make_str_match_context,
+        interp_sre.W_SRE_Match.bytepos_to_charindex,
+    ) = mod._org_maker
+
 
 class AppTestSrePy:
     def test_magic(self):
@@ -87,6 +118,13 @@
         assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
         assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
 
+    def test_findall_unicode(self):
+        import re
+        assert [u"\u1234"] == re.findall(u"\u1234", u"\u1000\u1234\u2000")
+        assert ["a", "u"] == re.findall("b(.)", "abalbus")
+        assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
+        assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
+
     def test_finditer(self):
         import re
         it = re.finditer("b(.)", "brabbel")
@@ -999,3 +1037,15 @@
         import re
         assert re.search(".+ab", "wowowowawoabwowo")
         assert None == re.search(".+ab", "wowowaowowo")
+
+
+class AppTestUnicodeExtra:
+    def test_string_attribute(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.string == u"\u1233\u1234\u1235"
+
+    def test_match_start(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.start() == 1
diff --git a/rpython/rlib/debug.py b/rpython/rlib/debug.py
--- a/rpython/rlib/debug.py
+++ b/rpython/rlib/debug.py
@@ -316,14 +316,21 @@
 class ExpectedRegularInt(Exception):
     pass
 
+class NegativeArgumentNotAllowed(Exception):
+    pass
+
 def check_nonneg(x):
     """Give a translation-time error if 'x' is not known to be non-negative.
     To help debugging, this also gives a translation-time error if 'x' is
     actually typed as an r_uint (in which case the call to check_nonneg()
     is a bit strange and probably unexpected).
     """
-    assert type(x)(-1) < 0     # otherwise, 'x' is a r_uint or similar
-    assert x >= 0
+    try:
+        assert type(x)(-1) < 0     # otherwise, 'x' is a r_uint or similar
+    except NegativeArgumentNotAllowed:
+        pass
+    else:
+        assert x >= 0
     return x
 
 class Entry(ExtRegistryEntry):
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -55,6 +55,8 @@
     specific subclass, calling 'func' is a direct call; if 'ctx' is only known
     to be of class AbstractMatchContext, calling 'func' is an indirect call.
     """
+    from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
+
     assert func.func_code.co_varnames[0] == 'ctx'
     specname = '_spec_' + func.func_name
     while specname in _seen_specname:
@@ -65,7 +67,8 @@
     specialized_methods = []
     for prefix, concreteclass in [('buf', BufMatchContext),
                                   ('str', StrMatchContext),
-                                  ('uni', UnicodeMatchContext)]:
+                                  ('uni', UnicodeMatchContext),
+                                  ('utf8', Utf8MatchContext)]:
         newfunc = func_with_new_name(func, prefix + specname)
         assert not hasattr(concreteclass, specname)
         setattr(concreteclass, specname, newfunc)
@@ -83,6 +86,9 @@
     def __init__(self, msg):
         self.msg = msg
 
+class EndOfString(Exception):
+    pass
+
 class AbstractMatchContext(object):
     """Abstract base class"""
     _immutable_fields_ = ['pattern[*]', 'flags', 'end']
@@ -135,6 +141,45 @@
         """Similar to str()."""
         raise NotImplementedError
 
+    # The following methods are provided to be overridden in
+    # Utf8MatchContext.  The non-utf8 implementation is provided
+    # by the FixedMatchContext abstract subclass, in order to use
+    # the same @not_rpython safety trick as above.
+    ZERO = 0
+    @not_rpython
+    def next(self, position):
+        raise NotImplementedError
+    @not_rpython
+    def prev(self, position):
+        raise NotImplementedError
+    @not_rpython
+    def next_n(self, position, n):
+        raise NotImplementedError
+    @not_rpython
+    def prev_n(self, position, n, start_position):
+        raise NotImplementedError
+    @not_rpython
+    def debug_check_pos(self, position):
+        raise NotImplementedError
+    @not_rpython
+    def maximum_distance(self, position_low, position_high):
+        raise NotImplementedError
+    @not_rpython
+    def get_single_byte(self, base_position, index):
+        raise NotImplementedError
+
+    def bytes_difference(self, position1, position2):
+        return position1 - position2
+    def go_forward_by_bytes(self, base_position, index):
+        return base_position + index
+    def next_indirect(self, position):
+        return position + 1     # like next(), but can be called indirectly
+    def prev_indirect(self, position):
+        position -= 1           # like prev(), but can be called indirectly
+        if position < 0:
+            raise EndOfString
+        return position
+
     def get_mark(self, gid):
         return find_mark(self.match_marks, gid)
 
@@ -168,23 +213,44 @@
             return (-1, -1)
         return (fmarks[groupnum], fmarks[groupnum+1])
 
-    def group(self, groupnum=0):
-        frm, to = self.span(groupnum)
-        if 0 <= frm <= to:
-            return self._string[frm:to]
-        else:
-            return None
-
     def fresh_copy(self, start):
         raise NotImplementedError
 
-class BufMatchContext(AbstractMatchContext):
+
+class FixedMatchContext(AbstractMatchContext):
+    """Abstract subclass to introduce the default implementation for
+    these position methods.  The Utf8MatchContext subclass doesn't
+    inherit from here."""
+
+    next = AbstractMatchContext.next_indirect
+    prev = AbstractMatchContext.prev_indirect
+
+    def next_n(self, position, n, end_position):
+        position += n
+        if position > end_position:
+            raise EndOfString
+        return position
+
+    def prev_n(self, position, n, start_position):
+        position -= n
+        if position < start_position:
+            raise EndOfString
+        return position
+
+    def debug_check_pos(self, position):
+        pass
+
+    def maximum_distance(self, position_low, position_high):
+        return position_high - position_low
+
+
+class BufMatchContext(FixedMatchContext):
     """Concrete subclass for matching in a buffer."""
 
     _immutable_fields_ = ["_buffer"]
 
     def __init__(self, pattern, buf, match_start, end, flags):
-        AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+        FixedMatchContext.__init__(self, pattern, match_start, end, flags)
         self._buffer = buf
 
     def str(self, index):
@@ -195,17 +261,20 @@
         c = self.str(index)
         return rsre_char.getlower(c, self.flags)
 
+    def get_single_byte(self, base_position, index):
+        return self.str(base_position + index)
+
     def fresh_copy(self, start):
         return BufMatchContext(self.pattern, self._buffer, start,
                                self.end, self.flags)
 
-class StrMatchContext(AbstractMatchContext):
+class StrMatchContext(FixedMatchContext):
     """Concrete subclass for matching in a plain string."""
 
     _immutable_fields_ = ["_string"]
 
     def __init__(self, pattern, string, match_start, end, flags):
-        AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+        FixedMatchContext.__init__(self, pattern, match_start, end, flags)
         self._string = string
         if not we_are_translated() and isinstance(string, unicode):
             self.flags |= rsre_char.SRE_FLAG_UNICODE   # for rsre_re.py
@@ -218,17 +287,23 @@
         c = self.str(index)
         return rsre_char.getlower(c, self.flags)
 
+    def get_single_byte(self, base_position, index):
+        return self.str(base_position + index)
+
+    def _real_pos(self, index):
+        return index     # overridden by tests
+
     def fresh_copy(self, start):
         return StrMatchContext(self.pattern, self._string, start,
                                self.end, self.flags)
 
-class UnicodeMatchContext(AbstractMatchContext):
+class UnicodeMatchContext(FixedMatchContext):
     """Concrete subclass for matching in a unicode string."""
 
     _immutable_fields_ = ["_unicodestr"]
 
     def __init__(self, pattern, unicodestr, match_start, end, flags):
-        AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+        FixedMatchContext.__init__(self, pattern, match_start, end, flags)
         self._unicodestr = unicodestr
 
     def str(self, index):
@@ -239,6 +314,9 @@
         c = self.str(index)
         return rsre_char.getlower(c, self.flags)
 
+    def get_single_byte(self, base_position, index):
+        return self.str(base_position + index)
+
     def fresh_copy(self, start):
         return UnicodeMatchContext(self.pattern, self._unicodestr, start,
                                    self.end, self.flags)
@@ -317,7 +395,10 @@
             ctx.jitdriver_RepeatOne.jit_merge_point(
                 self=self, ptr=ptr, ctx=ctx, nextppos=nextppos)
             result = sre_match(ctx, nextppos, ptr, self.start_marks)
-            ptr -= 1
+            try:
+                ptr = ctx.prev_indirect(ptr)
+            except EndOfString:
+                ptr = -1
             if result is not None:
                 self.subresult = result
                 self.start_ptr = ptr
@@ -328,37 +409,41 @@
 class MinRepeatOneMatchResult(MatchResult):
     install_jitdriver('MinRepeatOne',
                       greens=['nextppos', 'ppos3', 'ctx.pattern'],
-                      reds=['ptr', 'self', 'ctx'],
+                      reds=['max_count', 'ptr', 'self', 'ctx'],
                       debugprint=(2, 0))   # indices in 'greens'
 
-    def __init__(self, nextppos, ppos3, maxptr, ptr, marks):
+    def __init__(self, nextppos, ppos3, max_count, ptr, marks):
         self.nextppos = nextppos
         self.ppos3 = ppos3
-        self.maxptr = maxptr
+        self.max_count = max_count
         self.start_ptr = ptr
         self.start_marks = marks
 
     def find_first_result(self, ctx):
         ptr = self.start_ptr
         nextppos = self.nextppos
+        max_count = self.max_count
         ppos3 = self.ppos3
-        while ptr <= self.maxptr:
+        while max_count >= 0:
             ctx.jitdriver_MinRepeatOne.jit_merge_point(
-                self=self, ptr=ptr, ctx=ctx, nextppos=nextppos, ppos3=ppos3)
+                self=self, ptr=ptr, ctx=ctx, nextppos=nextppos, ppos3=ppos3,
+                max_count=max_count)
             result = sre_match(ctx, nextppos, ptr, self.start_marks)
             if result is not None:
                 self.subresult = result
                 self.start_ptr = ptr
+                self.max_count = max_count
                 return self
             if not self.next_char_ok(ctx, ptr, ppos3):
                 break
-            ptr += 1
+            ptr = ctx.next_indirect(ptr)
+            max_count -= 1
 
     def find_next_result(self, ctx):
         ptr = self.start_ptr
         if not self.next_char_ok(ctx, ptr, self.ppos3):
             return
-        self.start_ptr = ptr + 1
+        self.start_ptr = ctx.next_indirect(ptr)
         return self.find_first_result(ctx)
 
     def next_char_ok(self, ctx, ptr, ppos):
@@ -430,12 +515,12 @@
             min = ctx.pat(ppos+1)
             if enum is not None:
                 # matched one more 'item'.  record it and continue.
-                last_match_length = ctx.match_end - ptr
+                last_match_zero_length = (ctx.match_end == ptr)
                 self.pending = Pending(ptr, marks, enum, self.pending)
                 self.num_pending += 1
                 ptr = ctx.match_end
                 marks = ctx.match_marks
-                if last_match_length == 0 and self.num_pending >= min:
+                if last_match_zero_length and self.num_pending >= min:
                     # zero-width protection: after an empty match, if there
                     # are enough matches, don't try to match more.  Instead,
                     # fall through to trying to match 'tail'.
@@ -520,6 +605,7 @@
     need all results; in that case we use the method move_to_next_result()
     of the MatchResult."""
     while True:
+        ctx.debug_check_pos(ptr)
         op = ctx.pat(ppos)
         ppos += 1
 
@@ -551,22 +637,25 @@
             # <ANY>
             if ptr >= ctx.end or rsre_char.is_linebreak(ctx.str(ptr)):
                 return
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_ANY_ALL:
             # match anything
             # <ANY_ALL>
             if ptr >= ctx.end:
                 return
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_ASSERT:
             # assert subpattern
             # <ASSERT> <0=skip> <1=back> <pattern>
-            ptr1 = ptr - ctx.pat(ppos+1)
+            try:
+                ptr1 = ctx.prev_n(ptr, ctx.pat(ppos+1), ctx.ZERO)
+            except EndOfString:
+                return
             saved = ctx.fullmatch_only
             ctx.fullmatch_only = False
-            stop = ptr1 < 0 or sre_match(ctx, ppos + 2, ptr1, marks) is None
+            stop = sre_match(ctx, ppos + 2, ptr1, marks) is None
             ctx.fullmatch_only = saved
             if stop:
                 return
@@ -576,14 +665,17 @@
         elif op == OPCODE_ASSERT_NOT:
             # assert not subpattern
             # <ASSERT_NOT> <0=skip> <1=back> <pattern>
-            ptr1 = ptr - ctx.pat(ppos+1)
-            saved = ctx.fullmatch_only
-            ctx.fullmatch_only = False
-            stop = (ptr1 >= 0 and sre_match(ctx, ppos + 2, ptr1, marks)
-                                      is not None)
-            ctx.fullmatch_only = saved
-            if stop:
-                return
+            try:
+                ptr1 = ctx.prev_n(ptr, ctx.pat(ppos+1), ctx.ZERO)
+            except EndOfString:
+                pass
+            else:
+                saved = ctx.fullmatch_only
+                ctx.fullmatch_only = False
+                stop = sre_match(ctx, ppos + 2, ptr1, marks) is not None
+                ctx.fullmatch_only = saved
+                if stop:
+                    return
             ppos += ctx.pat(ppos)
 
         elif op == OPCODE_AT:
@@ -606,36 +698,36 @@
             if (ptr == ctx.end or
                 not rsre_char.category_dispatch(ctx.pat(ppos), ctx.str(ptr))):
                 return
-            ptr += 1
+            ptr = ctx.next(ptr)
             ppos += 1
 
         elif op == OPCODE_GROUPREF:
             # match backreference
             # <GROUPREF> <groupnum>
-            startptr, length = get_group_ref(marks, ctx.pat(ppos))
-            if length < 0:
+            startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+            if length_bytes < 0:
                 return     # group was not previously defined
-            if not match_repeated(ctx, ptr, startptr, length):
+            if not match_repeated(ctx, ptr, startptr, length_bytes):
                 return     # no match
-            ptr += length
+            ptr = ctx.go_forward_by_bytes(ptr, length_bytes)
             ppos += 1
 
         elif op == OPCODE_GROUPREF_IGNORE:
             # match backreference
             # <GROUPREF> <groupnum>
-            startptr, length = get_group_ref(marks, ctx.pat(ppos))
-            if length < 0:
+            startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+            if length_bytes < 0:
                 return     # group was not previously defined
-            if not match_repeated_ignore(ctx, ptr, startptr, length):
+            ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes)
+            if ptr < ctx.ZERO:
                 return     # no match
-            ptr += length
             ppos += 1
 
         elif op == OPCODE_GROUPREF_EXISTS:
             # conditional match depending on the existence of a group
             # <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ...
-            _, length = get_group_ref(marks, ctx.pat(ppos))
-            if length >= 0:
+            _, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+            if length_bytes >= 0:
                 ppos += 2                  # jump to 'codeyes'
             else:
                 ppos += ctx.pat(ppos+1)    # jump to 'codeno'
@@ -647,7 +739,7 @@
                                                              ctx.str(ptr)):
                 return
             ppos += ctx.pat(ppos)
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_IN_IGNORE:
             # match set member (or non_member), ignoring case
@@ -656,12 +748,12 @@
                                                              ctx.lowstr(ptr)):
                 return
             ppos += ctx.pat(ppos)
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_INFO:
             # optimization info block
             # <INFO> <0=skip> <1=flags> <2=min> ...
-            if (ctx.end - ptr) < ctx.pat(ppos+2):
+            if ctx.maximum_distance(ptr, ctx.end) < ctx.pat(ppos+2):
                 return
             ppos += ctx.pat(ppos)
 
@@ -674,7 +766,7 @@
             if ptr >= ctx.end or ctx.str(ptr) != ctx.pat(ppos):
                 return
             ppos += 1
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_LITERAL_IGNORE:
             # match literal string, ignoring case
@@ -682,7 +774,7 @@
             if ptr >= ctx.end or ctx.lowstr(ptr) != ctx.pat(ppos):
                 return
             ppos += 1
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_MARK:
             # set mark
@@ -697,7 +789,7 @@
             if ptr >= ctx.end or ctx.str(ptr) == ctx.pat(ppos):
                 return
             ppos += 1
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_NOT_LITERAL_IGNORE:
             # match if it's not a literal string, ignoring case
@@ -705,7 +797,7 @@
             if ptr >= ctx.end or ctx.lowstr(ptr) == ctx.pat(ppos):
                 return
             ppos += 1
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_REPEAT:
             # general repeat.  in this version of the re module, all the work
@@ -743,8 +835,9 @@
             # use the MAX_REPEAT operator.
             # <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail
             start = ptr
-            minptr = start + ctx.pat(ppos+1)
-            if minptr > ctx.end:
+            try:
+                minptr = ctx.next_n(start, ctx.pat(ppos+1), ctx.end)
+            except EndOfString:
                 return    # cannot match
             ptr = find_repetition_end(ctx, ppos+3, start, ctx.pat(ppos+2),
                                       marks)
@@ -765,22 +858,22 @@
             start = ptr
             min = ctx.pat(ppos+1)
             if min > 0:
-                minptr = ptr + min
-                if minptr > ctx.end:
-                    return   # cannot match
+                try:
+                    minptr = ctx.next_n(ptr, min, ctx.end)
+                except EndOfString:
+                    return    # cannot match
                 # count using pattern min as the maximum
                 ptr = find_repetition_end(ctx, ppos+3, ptr, min, marks)
                 if ptr < minptr:
                     return   # did not match minimum number of times
 
-            maxptr = ctx.end
+            max_count = sys.maxint
             max = ctx.pat(ppos+2)
             if max != rsre_char.MAXREPEAT:
-                maxptr1 = start + max
-                if maxptr1 <= maxptr:
-                    maxptr = maxptr1
+                max_count = max - min
+                assert max_count >= 0
             nextppos = ppos + ctx.pat(ppos)
-            result = MinRepeatOneMatchResult(nextppos, ppos+3, maxptr,
+            result = MinRepeatOneMatchResult(nextppos, ppos+3, max_count,
                                              ptr, marks)
             return result.find_first_result(ctx)
 
@@ -788,37 +881,41 @@
             raise Error("bad pattern code %d" % op)
 
 
-def get_group_ref(marks, groupnum):
+def get_group_ref(ctx, marks, groupnum):
     gid = groupnum * 2
     startptr = find_mark(marks, gid)
-    if startptr < 0:
+    if startptr < ctx.ZERO:
         return 0, -1
     endptr = find_mark(marks, gid + 1)
-    length = endptr - startptr     # < 0 if endptr < startptr (or if endptr=-1)
-    return startptr, length
+    length_bytes = ctx.bytes_difference(endptr, startptr)
+    #        < 0 if endptr < startptr (or if endptr=-1)
+    return startptr, length_bytes
 
 @specializectx
-def match_repeated(ctx, ptr, oldptr, length):
-    if ptr + length > ctx.end:
+def match_repeated(ctx, ptr, oldptr, length_bytes):
+    if ctx.bytes_difference(ctx.end, ptr) < length_bytes:
         return False
-    for i in range(length):
-        if ctx.str(ptr + i) != ctx.str(oldptr + i):
+    for i in range(length_bytes):
+        if ctx.get_single_byte(ptr, i) != ctx.get_single_byte(oldptr, i):
             return False
     return True
 
 @specializectx
-def match_repeated_ignore(ctx, ptr, oldptr, length):
-    if ptr + length > ctx.end:
-        return False
-    for i in range(length):
-        if ctx.lowstr(ptr + i) != ctx.lowstr(oldptr + i):
-            return False
-    return True
+def match_repeated_ignore(ctx, ptr, oldptr, length_bytes):
+    oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
+    while oldptr < oldend:
+        if ptr >= ctx.end:
+            return -1
+        if ctx.lowstr(ptr) != ctx.lowstr(oldptr):
+            return -1
+        ptr = ctx.next(ptr)
+        oldptr = ctx.next(oldptr)
+    return ptr
 
 @specializectx
 def find_repetition_end(ctx, ppos, ptr, maxcount, marks):
     end = ctx.end
-    ptrp1 = ptr + 1
+    ptrp1 = ctx.next(ptr)
     # First get rid of the cases where we don't have room for any match.
     if maxcount <= 0 or ptrp1 > end:
         return ptr
@@ -843,9 +940,10 @@
     # Else we really need to count how many times it matches.
     if maxcount != rsre_char.MAXREPEAT:
         # adjust end
-        end1 = ptr + maxcount
-        if end1 <= end:
-            end = end1
+        try:
+            end = ctx.next_n(ptr, maxcount, end)
+        except EndOfString:
+            pass
     op = ctx.pat(ppos)
     for op1, fre in unroll_fre_checker:
         if op1 == op:
@@ -862,7 +960,7 @@
         if end1 <= end:
             end = end1
     while ptr < end and sre_match(ctx, ppos, ptr, marks) is not None:
-        ptr += 1
+        ptr = ctx.next(ptr)
     return ptr
 
 @specializectx
@@ -904,7 +1002,7 @@
                 ctx.jitdriver_MatchIn.jit_merge_point(ctx=ctx, ptr=ptr,
                                                       end=end, ppos=ppos)
                 if ptr < end and checkerfn(ctx, ptr, ppos):
-                    ptr += 1
+                    ptr = ctx.next(ptr)
                 else:
                     return ptr
     elif checkerfn == match_IN_IGNORE:
@@ -918,7 +1016,7 @@
                 ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr,
                                                             end=end, ppos=ppos)
                 if ptr < end and checkerfn(ctx, ptr, ppos):
-                    ptr += 1
+                    ptr = ctx.next(ptr)
                 else:
                     return ptr
     else:
@@ -927,7 +1025,7 @@
         @specializectx
         def fre(ctx, ptr, end, ppos):
             while ptr < end and checkerfn(ctx, ptr, ppos):
-                ptr += 1
+                ptr = ctx.next(ptr)
             return ptr
     fre = func_with_new_name(fre, 'fre_' + checkerfn.__name__)
     return fre
@@ -967,11 +1065,14 @@
 def sre_at(ctx, atcode, ptr):
     if (atcode == AT_BEGINNING or
         atcode == AT_BEGINNING_STRING):
-        return ptr == 0
+        return ptr == ctx.ZERO
 
     elif atcode == AT_BEGINNING_LINE:
-        prevptr = ptr - 1
-        return prevptr < 0 or rsre_char.is_linebreak(ctx.str(prevptr))
+        try:
+            prevptr = ctx.prev(ptr)
+        except EndOfString:
+            return True
+        return rsre_char.is_linebreak(ctx.str(prevptr))
 
     elif atcode == AT_BOUNDARY:
         return at_boundary(ctx, ptr)
@@ -980,9 +1081,8 @@
         return at_non_boundary(ctx, ptr)
 
     elif atcode == AT_END:
-        remaining_chars = ctx.end - ptr
-        return remaining_chars <= 0 or (
-            remaining_chars == 1 and rsre_char.is_linebreak(ctx.str(ptr)))
+        return (ptr == ctx.end or
+            (ctx.next(ptr) == ctx.end and rsre_char.is_linebreak(ctx.str(ptr))))
 
     elif atcode == AT_END_LINE:
         return ptr == ctx.end or rsre_char.is_linebreak(ctx.str(ptr))
@@ -1007,18 +1107,26 @@
 def _make_boundary(word_checker):
     @specializectx
     def at_boundary(ctx, ptr):
-        if ctx.end == 0:
+        if ctx.end == ctx.ZERO:
             return False
-        prevptr = ptr - 1
-        that = prevptr >= 0 and word_checker(ctx.str(prevptr))
+        try:
+            prevptr = ctx.prev(ptr)
+        except EndOfString:
+            that = False
+        else:
+            that = word_checker(ctx.str(prevptr))
         this = ptr < ctx.end and word_checker(ctx.str(ptr))
         return this != that
     @specializectx
     def at_non_boundary(ctx, ptr):
-        if ctx.end == 0:
+        if ctx.end == ctx.ZERO:
             return False
-        prevptr = ptr - 1
-        that = prevptr >= 0 and word_checker(ctx.str(prevptr))
+        try:
+            prevptr = ctx.prev(ptr)
+        except EndOfString:
+            that = False
+        else:
+            that = word_checker(ctx.str(prevptr))
         this = ptr < ctx.end and word_checker(ctx.str(ptr))
         return this == that
     return at_boundary, at_non_boundary
@@ -1100,7 +1208,7 @@
         if sre_match(ctx, base, start, None) is not None:
             ctx.match_start = start
             return True
-        start += 1
+        start = ctx.next_indirect(start)
     return False
 
 install_jitdriver_spec("LiteralSearch",
@@ -1117,11 +1225,12 @@
     while start < ctx.end:
         ctx.jitdriver_LiteralSearch.jit_merge_point(ctx=ctx, start=start,
                                           base=base, character=character)
+        start1 = ctx.next(start)
         if ctx.str(start) == character:
-            if sre_match(ctx, base, start + 1, None) is not None:
+            if sre_match(ctx, base, start1, None) is not None:
                 ctx.match_start = start
                 return True
-        start += 1
+        start = start1
     return False
 
 install_jitdriver_spec("CharsetSearch",
@@ -1139,7 +1248,7 @@
             if sre_match(ctx, base, start, None) is not None:
                 ctx.match_start = start
                 return True
-        start += 1
+        start = ctx.next(start)
     return False
 
 install_jitdriver_spec('FastSearch',
@@ -1156,7 +1265,7 @@
     if string_position >= ctx.end:
         return False
     prefix_len = ctx.pat(5)
-    assert prefix_len >= 0
+    assert prefix_len > 0
     i = 0
     while True:
         ctx.jitdriver_FastSearch.jit_merge_point(ctx=ctx,
@@ -1171,10 +1280,14 @@
             i += 1
             if i == prefix_len:
                 # found a potential match
-                start = string_position + 1 - prefix_len
-                assert start >= 0
+                # start = string_position + 1 - prefix_len: computed later
+                ptr = string_position
                 prefix_skip = ctx.pat(6)
-                ptr = start + prefix_skip
+                if prefix_skip == prefix_len:
+                    ptr = ctx.next(ptr)
+                else:
+                    assert prefix_skip < prefix_len
+                    ptr = ctx.prev_n(ptr, prefix_len-1 - prefix_skip, ctx.ZERO)
                 #flags = ctx.pat(2)
                 #if flags & rsre_char.SRE_INFO_LITERAL:
                 #    # matched all of pure literal pattern
@@ -1185,10 +1298,11 @@
                 pattern_offset = ctx.pat(1) + 1
                 ppos_start = pattern_offset + 2 * prefix_skip
                 if sre_match(ctx, ppos_start, ptr, None) is not None:
+                    start = ctx.prev_n(ptr, prefix_skip, ctx.ZERO)
                     ctx.match_start = start
                     return True
                 overlap_offset = prefix_len + (7 - 1)
                 i = ctx.pat(overlap_offset + i)
-        string_position += 1
+        string_position = ctx.next(string_position)
         if string_position >= ctx.end:
             return False
diff --git a/rpython/rlib/rsre/rsre_jit.py b/rpython/rlib/rsre/rsre_jit.py
--- a/rpython/rlib/rsre/rsre_jit.py
+++ b/rpython/rlib/rsre/rsre_jit.py
@@ -36,8 +36,10 @@
     from rpython.rlib.rsre.rsre_core import BufMatchContext
     from rpython.rlib.rsre.rsre_core import StrMatchContext
     from rpython.rlib.rsre.rsre_core import UnicodeMatchContext
+    from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
     for prefix, concreteclass in [('Buf', BufMatchContext),
                                   ('Str', StrMatchContext),
-                                  ('Uni', UnicodeMatchContext)]:
+                                  ('Uni', UnicodeMatchContext),
+                                  ('Utf8', Utf8MatchContext)]:
         jitdriver = RSreJitDriver(prefix + name, **kwds)
         setattr(concreteclass, 'jitdriver_' + name, jitdriver)
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -0,0 +1,105 @@
+import sys
+from rpython.rlib.debug import check_nonneg
+from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString
+from rpython.rlib.rsre import rsre_char
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.rlib import rutf8
+
+
+class Utf8MatchContext(AbstractMatchContext):
+    """A context that matches unicode, but encoded in a utf8 string.
+    Be careful because most positions taken by, handled in, and returned
+    by this class are expressed in *bytes*, not in characters.
+    """
+
+    def __init__(self, pattern, utf8string, match_start, end, flags):
+        AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+        self._utf8 = utf8string
+
+    def str(self, index):
+        check_nonneg(index)
+        return rutf8.codepoint_at_pos(self._utf8, index)
+
+    def lowstr(self, index):
+        c = self.str(index)
+        return rsre_char.getlower(c, self.flags)
+
+    def get_single_byte(self, base_position, index):
+        return self.str(base_position + index)
+
+    def fresh_copy(self, start):
+        return Utf8MatchContext(self.pattern, self._utf8, start,
+                                self.end, self.flags)
+
+    def next(self, position):
+        return rutf8.next_codepoint_pos(self._utf8, position)
+    next_indirect = next
+
+    def prev(self, position):
+        if position <= 0:
+            raise EndOfString
+        position = rutf8.prev_codepoint_pos(self._utf8, position)
+        assert position >= 0
+        return position
+    prev_indirect = prev
+
+    def next_n(self, position, n, end_position):
+        for i in range(n):
+            if position >= end_position:
+                raise EndOfString
+            position = rutf8.next_codepoint_pos(self._utf8, position)
+        return position
+
+    def prev_n(self, position, n, start_position):
+        for i in range(n):
+            if position <= start_position:
+                raise EndOfString
+            position = rutf8.prev_codepoint_pos(self._utf8, position)
+        assert position >= 0
+        return position
+
+    def debug_check_pos(self, position):
+        if we_are_translated():
+            return
+        if position == len(self._utf8):
+            return   # end of string is fine
+        assert not (0x80 <= self._utf8[position] < 0xC0)   # continuation byte
+
+    def maximum_distance(self, position_low, position_high):
+        # may overestimate if there are non-ascii chars
+        return position_high - position_low
+
+
+def make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags):
+    if bytestart < 0: bytestart = 0
+    elif bytestart > len(utf8string): bytestart = len(utf8string)
+    if byteend < 0: byteend = 0
+    elif byteend > len(utf8string): byteend = len(utf8string)
+    ctx = Utf8MatchContext(pattern, utf8string, bytestart, byteend, flags)
+    ctx.debug_check_pos(bytestart)
+    ctx.debug_check_pos(byteend)
+    return ctx
+
+def utf8search(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0):
+    # bytestart and byteend must be valid byte positions inside the
+    # utf8string.
+    from rpython.rlib.rsre.rsre_core import search_context
+
+    ctx = make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags)
+    if search_context(ctx):
+        return ctx
+    else:
+        return None
+
+def utf8match(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0,
+              fullmatch=False):
+    # bytestart and byteend must be valid byte positions inside the
+    # utf8string.
+    from rpython.rlib.rsre.rsre_core import match_context
+
+    ctx = make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags)
+    ctx.fullmatch_only = fullmatch
+    if match_context(ctx):
+        return ctx
+    else:
+        return None
diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rsre/test/support.py
@@ -0,0 +1,136 @@
+import sys, random
+from rpython.rlib import debug
+from rpython.rlib.rsre.rsre_core import _adjust, match_context, search_context
+from rpython.rlib.rsre.rsre_core import StrMatchContext, EndOfString
+
+
+class Position(object):
+    def __init__(self, p):
+        assert isinstance(p, int)
+        if p < 0:
+            raise debug.NegativeArgumentNotAllowed(
+                "making a Position with byte index %r" % p)
+        self._p = p
+    def __repr__(self):
+        return '<Position %d>' % (self._p)
+    def __cmp__(self, other):
+        if isinstance(other, Position):
+            return cmp(self._p, other._p)
+        if type(other) is int and other == -1:
+            return cmp(self._p, -1)
+        raise TypeError("cannot compare %r with %r" % (self, other))
+
+
+class MatchContextForTests(StrMatchContext):
+    """Concrete subclass for matching in a plain string, tweaked for tests"""
+
+    ZERO = Position(0)
+
+    def next(self, position):
+        assert isinstance(position, Position)
+        return Position(position._p + 1)
+    next_indirect = next
+
+    def prev(self, position):
+        assert isinstance(position, Position)
+        if position._p == 0:
+            raise EndOfString
+        return Position(position._p - 1)
+    prev_indirect = prev
+
+    def next_n(self, position, n, end_position):
+        assert isinstance(position, Position)
+        assert isinstance(end_position, Position)
+        assert position._p <= end_position._p
+        r = position._p + n
+        if r > end_position._p:
+            raise EndOfString
+        return Position(r)
+
+    def prev_n(self, position, n, start_position):
+        assert isinstance(position, Position)
+        assert isinstance(start_position, Position)
+        assert position._p >= start_position._p
+        r = position._p - n
+        if r < start_position._p:
+            raise EndOfString
+        return Position(r)
+
+    def _real_pos(self, position):
+        if type(position) is int and position == -1:
+            return -1
+        assert isinstance(position, Position)
+        return position._p
+
+    def group(self, groupnum=0):
+        frm, to = self.span(groupnum)
+        if self.ZERO <= frm <= to:
+            return self._string[self._real_pos(frm):self._real_pos(to)]
+        else:
+            return None
+
+    def str(self, position):
+        assert isinstance(position, Position)
+        return ord(self._string[position._p])
+
+    def debug_check_pos(self, position):
+        assert isinstance(position, Position)
+
+    #def minimum_distance(self, position_low, position_high):
+    #    """Return an estimate.  The real value may be higher."""
+    #    assert isinstance(position_low, Position)
+    #    assert isinstance(position_high, Position)
+    #    dist = position_high._p - position_low._p
+    #    if dist == 0:
+    #        return 0
+    #    return random.randrange(1, dist + 1)
+
+    def maximum_distance(self, position_low, position_high):
+        """Return an estimate.  The real value may be lower."""
+        assert isinstance(position_low, Position)
+        assert isinstance(position_high, Position)
+        return position_high._p - position_low._p + random.randrange(0, 10)
+
+    def bytes_difference(self, position1, position2):
+        assert isinstance(position1, Position)
+        assert isinstance(position2, Position)
+        return position1._p - position2._p
+
+    def get_single_byte(self, base_position, index):
+        assert isinstance(base_position, Position)
+        assert isinstance(index, int)
+        return ord(self._string[base_position._p + index])
+
+    def go_forward_by_bytes(self, base_position, index):
+        assert isinstance(base_position, Position)
+        assert isinstance(index, int)
+        return Position(base_position._p + index)
+
+    def fresh_copy(self, start):
+        return MatchContextForTests(self.pattern, self._string, start,
+                                    self.end, self.flags)
+
+
+def match(pattern, string, start=0, end=sys.maxint, flags=0, fullmatch=False):
+    start, end = _adjust(start, end, len(string))
+    start = Position(start)
+    end = Position(end)
+    ctx = MatchContextForTests(pattern, string, start, end, flags)
+    ctx.fullmatch_only = fullmatch
+    if match_context(ctx):
+        return ctx
+    else:
+        return None
+
+def fullmatch(pattern, string, start=0, end=sys.maxint, flags=0):
+    return match(pattern, string, start, end, flags, fullmatch=True)
+
+def search(pattern, string, start=0, end=sys.maxint, flags=0):
+    start, end = _adjust(start, end, len(string))
+    start = Position(start)
+    end = Position(end)
+    ctx = MatchContextForTests(pattern, string, start, end, flags)
+    if search_context(ctx):
+        return ctx
+    else:
+        return None
diff --git a/rpython/rlib/rsre/test/test_ext_opcode.py b/rpython/rlib/rsre/test/test_ext_opcode.py
--- a/rpython/rlib/rsre/test/test_ext_opcode.py
+++ b/rpython/rlib/rsre/test/test_ext_opcode.py
@@ -5,6 +5,7 @@
 
 from rpython.rlib.rsre import rsre_core
 from rpython.rlib.rsre.rsre_char import MAXREPEAT
+from rpython.rlib.rsre.test.support import match, Position
 
 # import OPCODE_XX as XX
 for name, value in rsre_core.__dict__.items():
@@ -17,10 +18,10 @@
     # it's a valid optimization because \1 is always one character long
     r = [MARK, 0, ANY, MARK, 1, REPEAT_ONE, 6, 0, MAXREPEAT, 
          GROUPREF, 0, SUCCESS, SUCCESS]
-    assert rsre_core.match(r, "aaa").match_end == 3
+    assert match(r, "aaa").match_end == Position(3)
 
 def test_min_repeat_one_with_backref():
     # Python 3.5 compiles "(.)\1*?b" using MIN_REPEAT_ONE
     r = [MARK, 0, ANY, MARK, 1, MIN_REPEAT_ONE, 6, 0, MAXREPEAT,
          GROUPREF, 0, SUCCESS, LITERAL, 98, SUCCESS]
-    assert rsre_core.match(r, "aaab").match_end == 4
+    assert match(r, "aaab").match_end == Position(4)
diff --git a/rpython/rlib/rsre/test/test_match.py b/rpython/rlib/rsre/test/test_match.py
--- a/rpython/rlib/rsre/test/test_match.py
+++ b/rpython/rlib/rsre/test/test_match.py
@@ -1,6 +1,7 @@
 import re, random, py
-from rpython.rlib.rsre import rsre_core, rsre_char
+from rpython.rlib.rsre import rsre_char
 from rpython.rlib.rsre.rpy import get_code, VERSION
+from rpython.rlib.rsre.test.support import match, fullmatch, Position as P
 
 
 def get_code_and_re(regexp):
@@ -16,234 +17,234 @@
 
     def test_or(self):
         r = get_code(r"a|bc|def")
-        assert rsre_core.match(r, "a")
-        assert rsre_core.match(r, "bc")
-        assert rsre_core.match(r, "def")
-        assert not rsre_core.match(r, "ghij")
+        assert match(r, "a")
+        assert match(r, "bc")
+        assert match(r, "def")
+        assert not match(r, "ghij")
 
     def test_any(self):
         r = get_code(r"ab.cd")
-        assert rsre_core.match(r, "abXcdef")
-        assert not rsre_core.match(r, "ab\ncdef")
-        assert not rsre_core.match(r, "abXcDef")
+        assert match(r, "abXcdef")
+        assert not match(r, "ab\ncdef")
+        assert not match(r, "abXcDef")
 
     def test_any_repetition(self):
         r = get_code(r"ab.*cd")
-        assert rsre_core.match(r, "abXXXXcdef")
-        assert rsre_core.match(r, "abcdef")
-        assert not rsre_core.match(r, "abX\nXcdef")
-        assert not rsre_core.match(r, "abXXXXcDef")
+        assert match(r, "abXXXXcdef")
+        assert match(r, "abcdef")
+        assert not match(r, "abX\nXcdef")
+        assert not match(r, "abXXXXcDef")
 
     def test_any_all(self):
         r = get_code(r"(?s)ab.cd")
-        assert rsre_core.match(r, "abXcdef")
-        assert rsre_core.match(r, "ab\ncdef")
-        assert not rsre_core.match(r, "ab\ncDef")
+        assert match(r, "abXcdef")
+        assert match(r, "ab\ncdef")
+        assert not match(r, "ab\ncDef")
 
     def test_any_all_repetition(self):
         r = get_code(r"(?s)ab.*cd")
-        assert rsre_core.match(r, "abXXXXcdef")
-        assert rsre_core.match(r, "abcdef")
-        assert rsre_core.match(r, "abX\nXcdef")
-        assert not rsre_core.match(r, "abX\nXcDef")
+        assert match(r, "abXXXXcdef")
+        assert match(r, "abcdef")
+        assert match(r, "abX\nXcdef")
+        assert not match(r, "abX\nXcDef")
 
     def test_assert(self):
         r = get_code(r"abc(?=def)(.)")
-        res = rsre_core.match(r, "abcdefghi")
-        assert res is not None and res.get_mark(1) == 4
-        assert not rsre_core.match(r, "abcdeFghi")
+        res = match(r, "abcdefghi")
+        assert res is not None and res.get_mark(1) == P(4)
+        assert not match(r, "abcdeFghi")
 
     def test_assert_not(self):
         r = get_code(r"abc(?!def)(.)")
-        res = rsre_core.match(r, "abcdeFghi")
-        assert res is not None and res.get_mark(1) == 4
-        assert not rsre_core.match(r, "abcdefghi")
+        res = match(r, "abcdeFghi")
+        assert res is not None and res.get_mark(1) == P(4)
+        assert not match(r, "abcdefghi")
 
     def test_lookbehind(self):
         r = get_code(r"([a-z]*)(?<=de)")
-        assert rsre_core.match(r, "ade")
-        res = rsre_core.match(r, "adefg")
-        assert res is not None and res.get_mark(1) == 3
-        assert not rsre_core.match(r, "abc")
-        assert not rsre_core.match(r, "X")
-        assert not rsre_core.match(r, "eX")
+        assert match(r, "ade")
+        res = match(r, "adefg")
+        assert res is not None and res.get_mark(1) == P(3)
+        assert not match(r, "abc")
+        assert not match(r, "X")
+        assert not match(r, "eX")
 
     def test_negative_lookbehind(self):
         def found(s):
-            res = rsre_core.match(r, s)
+            res = match(r, s)
             assert res is not None
             return res.get_mark(1)
         r = get_code(r"([a-z]*)(?<!dd)")
-        assert found("ade") == 3
-        assert found("adefg") == 5
-        assert found("abcdd") == 4
-        assert found("abddd") == 3
-        assert found("adddd") == 2
-        assert found("ddddd") == 1
-        assert found("abXde") == 2
+        assert found("ade") == P(3)
+        assert found("adefg") == P(5)
+        assert found("abcdd") == P(4)
+        assert found("abddd") == P(3)
+        assert found("adddd") == P(2)
+        assert found("ddddd") == P(1)
+        assert found("abXde") == P(2)
 
     def test_at(self):
         r = get_code(r"abc$")
-        assert rsre_core.match(r, "abc")
-        assert not rsre_core.match(r, "abcd")
-        assert not rsre_core.match(r, "ab")
+        assert match(r, "abc")
+        assert not match(r, "abcd")
+        assert not match(r, "ab")
 
     def test_repeated_set(self):
         r = get_code(r"[a0x]+f")
-        assert rsre_core.match(r, "a0af")
-        assert not rsre_core.match(r, "a0yaf")
+        assert match(r, "a0af")
+        assert not match(r, "a0yaf")
 
     def test_category(self):
         r = get_code(r"[\sx]")
-        assert rsre_core.match(r, "x")
-        assert rsre_core.match(r, " ")
-        assert not rsre_core.match(r, "n")
+        assert match(r, "x")
+        assert match(r, " ")
+        assert not match(r, "n")
 
     def test_groupref(self):
         r = get_code(r"(xx+)\1+$")     # match non-prime numbers of x
-        assert not rsre_core.match(r, "xx")
-        assert not rsre_core.match(r, "xxx")
-        assert     rsre_core.match(r, "xxxx")
-        assert not rsre_core.match(r, "xxxxx")
-        assert     rsre_core.match(r, "xxxxxx")
-        assert not rsre_core.match(r, "xxxxxxx")
-        assert     rsre_core.match(r, "xxxxxxxx")
-        assert     rsre_core.match(r, "xxxxxxxxx")
+        assert not match(r, "xx")
+        assert not match(r, "xxx")
+        assert     match(r, "xxxx")
+        assert not match(r, "xxxxx")
+        assert     match(r, "xxxxxx")
+        assert not match(r, "xxxxxxx")
+        assert     match(r, "xxxxxxxx")
+        assert     match(r, "xxxxxxxxx")
 
     def test_groupref_ignore(self):
         r = get_code(r"(?i)(xx+)\1+$")     # match non-prime numbers of x
-        assert not rsre_core.match(r, "xX")
-        assert not rsre_core.match(r, "xxX")
-        assert     rsre_core.match(r, "Xxxx")
-        assert not rsre_core.match(r, "xxxXx")
-        assert     rsre_core.match(r, "xXxxxx")
-        assert not rsre_core.match(r, "xxxXxxx")
-        assert     rsre_core.match(r, "xxxxxxXx")
-        assert     rsre_core.match(r, "xxxXxxxxx")
+        assert not match(r, "xX")
+        assert not match(r, "xxX")
+        assert     match(r, "Xxxx")
+        assert not match(r, "xxxXx")
+        assert     match(r, "xXxxxx")
+        assert not match(r, "xxxXxxx")
+        assert     match(r, "xxxxxxXx")
+        assert     match(r, "xxxXxxxxx")
 
     def test_groupref_exists(self):
         r = get_code(r"((a)|(b))c(?(2)d)$")
-        assert not rsre_core.match(r, "ac")
-        assert     rsre_core.match(r, "acd")
-        assert     rsre_core.match(r, "bc")
-        assert not rsre_core.match(r, "bcd")
+        assert not match(r, "ac")
+        assert     match(r, "acd")
+        assert     match(r, "bc")
+        assert not match(r, "bcd")
         #
         r = get_code(r"((a)|(b))c(?(2)d|e)$")
-        assert not rsre_core.match(r, "ac")
-        assert     rsre_core.match(r, "acd")
-        assert not rsre_core.match(r, "ace")
-        assert not rsre_core.match(r, "bc")
-        assert not rsre_core.match(r, "bcd")
-        assert     rsre_core.match(r, "bce")
+        assert not match(r, "ac")
+        assert     match(r, "acd")
+        assert not match(r, "ace")
+        assert not match(r, "bc")
+        assert not match(r, "bcd")
+        assert     match(r, "bce")
 
     def test_in_ignore(self):
         r = get_code(r"(?i)[a-f]")
-        assert rsre_core.match(r, "b")
-        assert rsre_core.match(r, "C")
-        assert not rsre_core.match(r, "g")
+        assert match(r, "b")
+        assert match(r, "C")
+        assert not match(r, "g")
         r = get_code(r"(?i)[a-f]+$")
-        assert rsre_core.match(r, "bCdEf")
-        assert not rsre_core.match(r, "g")
-        assert not rsre_core.match(r, "aaagaaa")
+        assert match(r, "bCdEf")
+        assert not match(r, "g")
+        assert not match(r, "aaagaaa")
 
     def test_not_literal(self):
         r = get_code(r"[^a]")
-        assert rsre_core.match(r, "A")
-        assert not rsre_core.match(r, "a")
+        assert match(r, "A")
+        assert not match(r, "a")
         r = get_code(r"[^a]+$")
-        assert rsre_core.match(r, "Bx123")
-        assert not rsre_core.match(r, "--a--")
+        assert match(r, "Bx123")
+        assert not match(r, "--a--")
 
     def test_not_literal_ignore(self):
         r = get_code(r"(?i)[^a]")
-        assert rsre_core.match(r, "G")
-        assert not rsre_core.match(r, "a")
-        assert not rsre_core.match(r, "A")
+        assert match(r, "G")
+        assert not match(r, "a")
+        assert not match(r, "A")
         r = get_code(r"(?i)[^a]+$")
-        assert rsre_core.match(r, "Gx123")
-        assert not rsre_core.match(r, "--A--")
+        assert match(r, "Gx123")
+        assert not match(r, "--A--")
 
     def test_repeated_single_character_pattern(self):
         r = get_code(r"foo(?:(?<=foo)x)+$")
-        assert rsre_core.match(r, "foox")
+        assert match(r, "foox")
 
     def test_flatten_marks(self):
         r = get_code(r"a(b)c((d)(e))+$")
-        res = rsre_core.match(r, "abcdedede")
-        assert res.flatten_marks() == [0, 9, 1, 2, 7, 9, 7, 8, 8, 9]
-        assert res.flatten_marks() == [0, 9, 1, 2, 7, 9, 7, 8, 8, 9]
+        res = match(r, "abcdedede")
+        assert res.flatten_marks() == map(P, [0, 9, 1, 2, 7, 9, 7, 8, 8, 9])
+        assert res.flatten_marks() == map(P, [0, 9, 1, 2, 7, 9, 7, 8, 8, 9])
 
     def test_bug1(self):
         # REPEAT_ONE inside REPEAT
         r = get_code(r"(?:.+)?B")
-        assert rsre_core.match(r, "AB") is not None
+        assert match(r, "AB") is not None
         r = get_code(r"(?:AA+?)+B")
-        assert rsre_core.match(r, "AAAB") is not None
+        assert match(r, "AAAB") is not None
         r = get_code(r"(?:AA+)+?B")
-        assert rsre_core.match(r, "AAAB") is not None
+        assert match(r, "AAAB") is not None
         r = get_code(r"(?:AA+?)+?B")
-        assert rsre_core.match(r, "AAAB") is not None
+        assert match(r, "AAAB") is not None
         # REPEAT inside REPEAT
         r = get_code(r"(?:(?:xy)+)?B")
-        assert rsre_core.match(r, "xyB") is not None
+        assert match(r, "xyB") is not None
         r = get_code(r"(?:xy(?:xy)+?)+B")
-        assert rsre_core.match(r, "xyxyxyB") is not None
+        assert match(r, "xyxyxyB") is not None
         r = get_code(r"(?:xy(?:xy)+)+?B")
-        assert rsre_core.match(r, "xyxyxyB") is not None
+        assert match(r, "xyxyxyB") is not None
         r = get_code(r"(?:xy(?:xy)+?)+?B")
-        assert rsre_core.match(r, "xyxyxyB") is not None
+        assert match(r, "xyxyxyB") is not None
 
     def test_assert_group(self):
         r = get_code(r"abc(?=(..)f)(.)")
-        res = rsre_core.match(r, "abcdefghi")
+        res = match(r, "abcdefghi")
         assert res is not None
-        assert res.span(2) == (3, 4)
-        assert res.span(1) == (3, 5)
+        assert res.span(2) == (P(3), P(4))
+        assert res.span(1) == (P(3), P(5))
 
     def test_assert_not_group(self):
         r = get_code(r"abc(?!(de)f)(.)")
-        res = rsre_core.match(r, "abcdeFghi")
+        res = match(r, "abcdeFghi")
         assert res is not None
-        assert res.span(2) == (3, 4)
+        assert res.span(2) == (P(3), P(4))
         # this I definitely classify as Horrendously Implementation Dependent.
         # CPython answers (3, 5).
         assert res.span(1) == (-1, -1)
 
     def test_match_start(self):
         r = get_code(r"^ab")
-        assert     rsre_core.match(r, "abc")
-        assert not rsre_core.match(r, "xxxabc", start=3)
-        assert not rsre_core.match(r, "xx\nabc", start=3)
+        assert     match(r, "abc")
+        assert not match(r, "xxxabc", start=3)
+        assert not match(r, "xx\nabc", start=3)
         #
         r = get_code(r"(?m)^ab")
-        assert     rsre_core.match(r, "abc")
-        assert not rsre_core.match(r, "xxxabc", start=3)
-        assert     rsre_core.match(r, "xx\nabc", start=3)
+        assert     match(r, "abc")
+        assert not match(r, "xxxabc", start=3)
+        assert     match(r, "xx\nabc", start=3)
 
     def test_match_end(self):
         r = get_code("ab")
-        assert     rsre_core.match(r, "abc")
-        assert     rsre_core.match(r, "abc", end=333)
-        assert     rsre_core.match(r, "abc", end=3)
-        assert     rsre_core.match(r, "abc", end=2)
-        assert not rsre_core.match(r, "abc", end=1)
-        assert not rsre_core.match(r, "abc", end=0)
-        assert not rsre_core.match(r, "abc", end=-1)
+        assert     match(r, "abc")
+        assert     match(r, "abc", end=333)
+        assert     match(r, "abc", end=3)
+        assert     match(r, "abc", end=2)
+        assert not match(r, "abc", end=1)
+        assert not match(r, "abc", end=0)
+        assert not match(r, "abc", end=-1)
 
     def test_match_bug1(self):
         r = get_code(r'(x??)?$')
-        assert rsre_core.match(r, "x")
+        assert match(r, "x")
 
     def test_match_bug2(self):
         r = get_code(r'(x??)??$')
-        assert rsre_core.match(r, "x")
+        assert match(r, "x")
 
     def test_match_bug3(self):
         if VERSION == "2.7.5":
             py.test.skip("pattern fails to compile with exactly 2.7.5 "
                          "(works on 2.7.3 and on 2.7.trunk though)")
         r = get_code(r'([ax]*?x*)?$')
-        assert rsre_core.match(r, "aaxaa")
+        assert match(r, "aaxaa")
 
     def test_bigcharset(self):
         for i in range(100):
@@ -252,10 +253,10 @@
             pattern = u'[%s]' % (u''.join(chars),)
             r = get_code(pattern)
             for c in chars:
-                assert rsre_core.match(r, c)
+                assert match(r, c)
             for i in range(200):
                 c = unichr(random.randrange(0x0, 0xD000))
-                res = rsre_core.match(r, c)
+                res = match(r, c)
                 if c in chars:
                     assert res is not None
                 else:
@@ -264,41 +265,41 @@
     def test_simple_match_1(self):
         r = get_code(r"ab*bbbbbbbc")
         print r
-        match = rsre_core.match(r, "abbbbbbbbbcdef")
-        assert match
-        assert match.match_end == 11
+        m = match(r, "abbbbbbbbbcdef")
+        assert m
+        assert m.match_end == P(11)
 
     def test_empty_maxuntil(self):
         r = get_code("\\{\\{((?:.*?)+)\\}\\}")
-        match = rsre_core.match(r, "{{a}}{{b}}")
-        assert match.group(1) == "a"
+        m = match(r, "{{a}}{{b}}")
+        assert m.group(1) == "a"
 
     def test_fullmatch_1(self):
         r = get_code(r"ab*c")
-        assert not rsre_core.fullmatch(r, "abbbcdef")
-        assert rsre_core.fullmatch(r, "abbbc")
+        assert not fullmatch(r, "abbbcdef")
+        assert fullmatch(r, "abbbc")
 
     def test_fullmatch_2(self):
         r = get_code(r"a(b*?)")
-        match = rsre_core.fullmatch(r, "abbb")
+        match = fullmatch(r, "abbb")
         assert match.group(1) == "bbb"
-        assert not rsre_core.fullmatch(r, "abbbc")
+        assert not fullmatch(r, "abbbc")
 
     def test_fullmatch_3(self):
         r = get_code(r"a((bp)*?)c")
-        match = rsre_core.fullmatch(r, "abpbpbpc")
+        match = fullmatch(r, "abpbpbpc")
         assert match.group(1) == "bpbpbp"
 
     def test_fullmatch_4(self):
         r = get_code(r"a((bp)*)c")
-        match = rsre_core.fullmatch(r, "abpbpbpc")
+        match = fullmatch(r, "abpbpbpc")
         assert match.group(1) == "bpbpbp"
 
     def test_fullmatch_assertion(self):
         r = get_code(r"(?=a).b")
-        assert rsre_core.fullmatch(r, "ab")
+        assert fullmatch(r, "ab")
         r = get_code(r"(?!a)..")
-        assert not rsre_core.fullmatch(r, "ab")
+        assert not fullmatch(r, "ab")
 
     def test_range_ignore(self):
         from rpython.rlib.unicodedata import unicodedb
@@ -307,4 +308,4 @@
         r = get_code(u"[\U00010428-\U0001044f]", re.I)
         assert r.count(27) == 1       # OPCODE_RANGE
         r[r.index(27)] = 32           # => OPCODE_RANGE_IGNORE
-        assert rsre_core.match(r, u"\U00010428")
+        assert match(r, u"\U00010428")
diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py
--- a/rpython/rlib/rsre/test/test_search.py
+++ b/rpython/rlib/rsre/test/test_search.py
@@ -1,44 +1,48 @@
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: merge unicode-utf8-re

Reply via email to