[pypy-commit] pypy unicode-utf8: fix merge

mattip Mon, 30 Apr 2018 09:28:36 -0700

Author: Matti Picus <[email protected]>
Branch: unicode-utf8
Changeset: r94459:ab5ac9802e14
Date: 2018-04-29 23:28 +0300
http://bitbucket.org/pypy/pypy/changeset/ab5ac9802e14/


Log:    fix merge

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -133,7 +133,7 @@
                 endbytepos = rutf8.codepoint_position_at_index(utf8str,
                                 index_storage, endpos)
             ctx = rsre_utf8.Utf8MatchContext(
-                self.code, utf8str, bytepos, endbytepos, self.flags)
+                utf8str, bytepos, endbytepos, self.flags)
             # xxx we store the w_string on the ctx too, for
             # W_SRE_Match.bytepos_to_charindex()
             ctx.w_unicode_obj = w_unicode_obj
@@ -159,14 +159,14 @@
     def fresh_copy(self, ctx):
         if isinstance(ctx, rsre_utf8.Utf8MatchContext):
             result = rsre_utf8.Utf8MatchContext(
-                ctx.pattern, ctx._utf8, ctx.match_start, ctx.end, ctx.flags)
+                ctx._utf8, ctx.match_start, ctx.end, ctx.flags)
             result.w_unicode_obj = ctx.w_unicode_obj
         elif isinstance(ctx, rsre_core.StrMatchContext):
             result = self._make_str_match_context(
                 ctx._string, ctx.match_start, ctx.end)
         elif isinstance(ctx, rsre_core.BufMatchContext):
             result = rsre_core.BufMatchContext(
-                ctx.pattern, ctx._buffer, ctx.match_start, ctx.end, ctx.flags)
+                ctx._buffer, ctx.match_start, ctx.end, ctx.flags)
         else:
             raise AssertionError("bad ctx type")
         result.match_end = ctx.match_end
@@ -174,7 +174,7 @@
 
     def _make_str_match_context(self, str, pos, endpos):
         # for tests to override
-        return rsre_core.StrMatchContext(self.code, str,
+        return rsre_core.StrMatchContext(str,
                                          pos, endpos, self.flags)
 
     def getmatch(self, ctx, found):
@@ -319,7 +319,7 @@
         n = 0
         last_pos = ctx.ZERO
         while not count or n < count:
-            pattern = ctx.pattern
+            pattern = self.code
             sub_jitdriver.jit_merge_point(
                 self=self,
                 use_builder=use_builder,
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -32,7 +32,7 @@
         start = support.Position(start)
     if not isinstance(end, support.Position):
         end = support.Position(end)
-    return support.MatchContextForTests(self.code, str, start, end, self.flags)
+    return support.MatchContextForTests(str, start, end, self.flags)
 
 def _bytepos_to_charindex(self, bytepos):
     if isinstance(self.ctx, support.MatchContextForTests):
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -55,6 +55,8 @@
     specific subclass, calling 'func' is a direct call; if 'ctx' is only known
     to be of class AbstractMatchContext, calling 'func' is an indirect call.
     """
+    from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
+
     assert func.func_code.co_varnames[0] == 'ctx'
     specname = '_spec_' + func.func_name
     while specname in _seen_specname:
@@ -65,7 +67,9 @@
     specialized_methods = []
     for prefix, concreteclass in [('buf', BufMatchContext),
                                   ('str', StrMatchContext),
-                                  ('uni', UnicodeMatchContext)]:
+                                  ('uni', UnicodeMatchContext),
+                                  ('utf8', Utf8MatchContext),
+                                  ]:
         newfunc = func_with_new_name(func, prefix + specname)
         assert not hasattr(concreteclass, specname)
         setattr(concreteclass, specname, newfunc)
@@ -83,6 +87,8 @@
     def __init__(self, msg):
         self.msg = msg
 
+class EndOfString(Exception):
+    pass
 
 class CompiledPattern(object):
     _immutable_fields_ = ['pattern[*]']
@@ -142,6 +148,46 @@
         """Similar to str()."""
         raise NotImplementedError
 
+    # The following methods are provided to be overriden in
+    # Utf8MatchContext.  The non-utf8 implementation is provided
+    # by the FixedMatchContext abstract subclass, in order to use
+    # the same @not_rpython safety trick as above.
+    ZERO = 0
+    @not_rpython
+    def next(self, position):
+        raise NotImplementedError
+    @not_rpython
+    def prev(self, position):
+        raise NotImplementedError
+    @not_rpython
+    def next_n(self, position, n):
+        raise NotImplementedError
+    @not_rpython
+    def prev_n(self, position, n, start_position):
+        raise NotImplementedError
+    @not_rpython
+    def debug_check_pos(self, position):
+        raise NotImplementedError
+    @not_rpython
+    def maximum_distance(self, position_low, position_high):
+        raise NotImplementedError
+    @not_rpython
+    def get_single_byte(self, base_position, index):
+        raise NotImplementedError
+
+    def bytes_difference(self, position1, position2):
+        return position1 - position2
+    def go_forward_by_bytes(self, base_position, index):
+        return base_position + index
+    def next_indirect(self, position):
+        assert position < self.end
+        return position + 1     # like next(), but can be called indirectly
+    def prev_indirect(self, position):
+        position -= 1           # like prev(), but can be called indirectly
+        if position < 0:
+            raise EndOfString
+        return position
+
     def get_mark(self, gid):
         return find_mark(self.match_marks, gid)
 
@@ -185,13 +231,40 @@
     def fresh_copy(self, start):
         raise NotImplementedError
 
-class BufMatchContext(AbstractMatchContext):
+class FixedMatchContext(AbstractMatchContext):
+    """Abstract subclass to introduce the default implementation for
+    these position methods.  The Utf8MatchContext subclass doesn't
+    inherit from here."""
+
+    next = AbstractMatchContext.next_indirect
+    prev = AbstractMatchContext.prev_indirect
+
+    def next_n(self, position, n, end_position):
+        position += n
+        if position > end_position:
+            raise EndOfString
+        return position
+
+    def prev_n(self, position, n, start_position):
+        position -= n
+        if position < start_position:
+            raise EndOfString
+        return position
+
+    def debug_check_pos(self, position):
+        pass
+
+    def maximum_distance(self, position_low, position_high):
+        return position_high - position_low
+
+
+class BufMatchContext(FixedMatchContext):
     """Concrete subclass for matching in a buffer."""
 
     _immutable_fields_ = ["_buffer"]
 
     def __init__(self, buf, match_start, end, flags):
-        AbstractMatchContext.__init__(self, match_start, end, flags)
+        FixedMatchContext.__init__(self, match_start, end, flags)
         self._buffer = buf
 
     def str(self, index):
@@ -206,13 +279,17 @@
         return BufMatchContext(self._buffer, start,
                                self.end, self.flags)
 
-class StrMatchContext(AbstractMatchContext):
+    def get_single_byte(self, base_position, index):
+        return self.str(base_position + index)
+
+
+class StrMatchContext(FixedMatchContext):
     """Concrete subclass for matching in a plain string."""
 
     _immutable_fields_ = ["_string"]
 
     def __init__(self, string, match_start, end, flags):
-        AbstractMatchContext.__init__(self, match_start, end, flags)
+        FixedMatchContext.__init__(self, match_start, end, flags)
         self._string = string
         if not we_are_translated() and isinstance(string, unicode):
             self.flags |= rsre_char.SRE_FLAG_UNICODE   # for rsre_re.py
@@ -229,13 +306,20 @@
         return StrMatchContext(self._string, start,
                                self.end, self.flags)
 
-class UnicodeMatchContext(AbstractMatchContext):
+    def get_single_byte(self, base_position, index):
+        return self.str(base_position + index)
+
+    def _real_pos(self, index):
+        return index     # overridden by tests
+
+
+class UnicodeMatchContext(FixedMatchContext):
     """Concrete subclass for matching in a unicode string."""
 
     _immutable_fields_ = ["_unicodestr"]
 
     def __init__(self, unicodestr, match_start, end, flags):
-        AbstractMatchContext.__init__(self, match_start, end, flags)
+        FixedMatchContext.__init__(self, match_start, end, flags)
         self._unicodestr = unicodestr
 
     def str(self, index):
@@ -250,6 +334,9 @@
         return UnicodeMatchContext(self._unicodestr, start,
                                    self.end, self.flags)
 
+    def get_single_byte(self, base_position, index):
+        return self.str(base_position + index)
+
 # ____________________________________________________________
 
 class Mark(object):
@@ -325,7 +412,10 @@
                 self=self, ptr=ptr, ctx=ctx, nextppos=nextppos,
                 pattern=pattern)
             result = sre_match(ctx, pattern, nextppos, ptr, self.start_marks)
-            ptr -= 1
+            try:
+                ptr = ctx.prev_indirect(ptr)
+            except EndOfString:
+                ptr = -1
             if result is not None:
                 self.subresult = result
                 self.start_ptr = ptr
@@ -336,32 +426,35 @@
 class MinRepeatOneMatchResult(MatchResult):
     install_jitdriver('MinRepeatOne',
                       greens=['nextppos', 'ppos3', 'pattern'],
-                      reds=['ptr', 'self', 'ctx'],
+                      reds=['max_count', 'ptr', 'self', 'ctx'],
                       debugprint=(2, 0))   # indices in 'greens'
 
-    def __init__(self, nextppos, ppos3, maxptr, ptr, marks):
+    def __init__(self, nextppos, ppos3, max_count, ptr, marks):
         self.nextppos = nextppos
         self.ppos3 = ppos3
-        self.maxptr = maxptr
+        self.max_count = max_count
         self.start_ptr = ptr
         self.start_marks = marks
 
     def find_first_result(self, ctx, pattern):
         ptr = self.start_ptr
         nextppos = self.nextppos
+        max_count = self.max_count
         ppos3 = self.ppos3
-        while ptr <= self.maxptr:
+        while max_count >= 0:
             ctx.jitdriver_MinRepeatOne.jit_merge_point(
                 self=self, ptr=ptr, ctx=ctx, nextppos=nextppos, ppos3=ppos3,
-                pattern=pattern)
+                max_count=max_count, pattern=pattern)
             result = sre_match(ctx, pattern, nextppos, ptr, self.start_marks)
             if result is not None:
                 self.subresult = result
                 self.start_ptr = ptr
+                self.max_count = max_count
                 return self
             if not self.next_char_ok(ctx, pattern, ptr, ppos3):
                 break
-            ptr += 1
+            ptr = ctx.next_indirect(ptr)
+            max_count -= 1
 
     def find_next_result(self, ctx, pattern):
         ptr = self.start_ptr
@@ -440,12 +533,12 @@
             min = pattern.pat(ppos+1)
             if enum is not None:
                 # matched one more 'item'.  record it and continue.
-                last_match_length = ctx.match_end - ptr
+                last_match_zero_length = (ctx.match_end == ptr)
                 self.pending = Pending(ptr, marks, enum, self.pending)
                 self.num_pending += 1
                 ptr = ctx.match_end
                 marks = ctx.match_marks
-                if last_match_length == 0 and self.num_pending >= min:
+                if last_match_zero_length and self.num_pending >= min:
                     # zero-width protection: after an empty match, if there
                     # are enough matches, don't try to match more.  Instead,
                     # fall through to trying to match 'tail'.
@@ -561,22 +654,25 @@
             # <ANY>
             if ptr >= ctx.end or rsre_char.is_linebreak(ctx.str(ptr)):
                 return
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_ANY_ALL:
             # match anything
             # <ANY_ALL>
             if ptr >= ctx.end:
                 return
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_ASSERT:
             # assert subpattern
             # <ASSERT> <0=skip> <1=back> <pattern>
-            ptr1 = ptr - pattern.pat(ppos+1)
+            try:
+                ptr1 = ctx.prev_n(ptr, pattern.pat(ppos+1), ctx.ZERO)
+            except EndOfString:
+                return
             saved = ctx.fullmatch_only
             ctx.fullmatch_only = False
-            stop = ptr1 < 0 or sre_match(ctx, pattern, ppos + 2, ptr1, marks) is None
+            stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is None
             ctx.fullmatch_only = saved
             if stop:
                 return
@@ -586,14 +682,18 @@
         elif op == OPCODE_ASSERT_NOT:
             # assert not subpattern
             # <ASSERT_NOT> <0=skip> <1=back> <pattern>
-            ptr1 = ptr - pattern.pat(ppos+1)
-            saved = ctx.fullmatch_only
-            ctx.fullmatch_only = False
-            stop = (ptr1 >= 0 and sre_match(ctx, pattern, ppos + 2, ptr1, marks)
-                                      is not None)
-            ctx.fullmatch_only = saved
-            if stop:
-                return
+
+            try:
+                ptr1 = ctx.prev_n(ptr, pattern.pat(ppos+1), ctx.ZERO)
+            except EndOfString:
+                pass
+            else:
+                saved = ctx.fullmatch_only
+                ctx.fullmatch_only = False
+                stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is not None
+                ctx.fullmatch_only = saved
+                if stop:
+                    return
             ppos += pattern.pat(ppos)
 
         elif op == OPCODE_AT:
@@ -616,36 +716,36 @@
             if (ptr == ctx.end or
                 not rsre_char.category_dispatch(pattern.pat(ppos), ctx.str(ptr))):
                 return
-            ptr += 1
+            ptr = ctx.next(ptr)
             ppos += 1
 
         elif op == OPCODE_GROUPREF:
             # match backreference
             # <GROUPREF> <groupnum>
-            startptr, length = get_group_ref(marks, pattern.pat(ppos))
-            if length < 0:
+            startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos))
+            if length_bytes < 0:
                 return     # group was not previously defined
-            if not match_repeated(ctx, ptr, startptr, length):
+            if not match_repeated(ctx, ptr, startptr, length_bytes):
                 return     # no match
-            ptr += length
+            ptr = ctx.go_forward_by_bytes(ptr, length_bytes)
             ppos += 1
 
         elif op == OPCODE_GROUPREF_IGNORE:
             # match backreference
             # <GROUPREF> <groupnum>
-            startptr, length = get_group_ref(marks, pattern.pat(ppos))
-            if length < 0:
+            startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos))
+            if length_bytes < 0:
                 return     # group was not previously defined
-            if not match_repeated_ignore(ctx, ptr, startptr, length):
+            if not match_repeated_ignore(ctx, ptr, startptr, length_bytes):
                 return     # no match
-            ptr += length
+            ptr = ctx.go_forward_by_bytes(ptr, length_bytes)
             ppos += 1
 
         elif op == OPCODE_GROUPREF_EXISTS:
             # conditional match depending on the existence of a group
             # <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ...
-            _, length = get_group_ref(marks, pattern.pat(ppos))
-            if length >= 0:
+            _, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos))
+            if length_bytes >= 0:
                 ppos += 2                  # jump to 'codeyes'
             else:
                 ppos += pattern.pat(ppos+1)    # jump to 'codeno'
@@ -657,7 +757,7 @@
                                                              ctx.str(ptr)):
                 return
             ppos += pattern.pat(ppos)
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_IN_IGNORE:
             # match set member (or non_member), ignoring case
@@ -666,12 +766,12 @@
                                                              ctx.lowstr(ptr)):
                 return
             ppos += pattern.pat(ppos)
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_INFO:
             # optimization info block
             # <INFO> <0=skip> <1=flags> <2=min> ...
-            if (ctx.end - ptr) < pattern.pat(ppos+2):
+            if ctx.maximum_distance(ptr, ctx.end) < pattern.pat(ppos+2):
                 return
             ppos += pattern.pat(ppos)
 
@@ -684,7 +784,7 @@
             if ptr >= ctx.end or ctx.str(ptr) != pattern.pat(ppos):
                 return
             ppos += 1
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_LITERAL_IGNORE:
             # match literal string, ignoring case
@@ -692,7 +792,7 @@
             if ptr >= ctx.end or ctx.lowstr(ptr) != pattern.pat(ppos):
                 return
             ppos += 1
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_MARK:
             # set mark
@@ -707,7 +807,7 @@
             if ptr >= ctx.end or ctx.str(ptr) == pattern.pat(ppos):
                 return
             ppos += 1
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_NOT_LITERAL_IGNORE:
             # match if it's not a literal string, ignoring case
@@ -715,7 +815,7 @@
             if ptr >= ctx.end or ctx.lowstr(ptr) == pattern.pat(ppos):
                 return
             ppos += 1
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_REPEAT:
             # general repeat.  in this version of the re module, all the work
@@ -753,8 +853,10 @@
             # use the MAX_REPEAT operator.
             # <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail
             start = ptr
-            minptr = start + pattern.pat(ppos+1)
-            if minptr > ctx.end:
+
+            try:
+                minptr = ctx.next_n(start, pattern.pat(ppos+1), ctx.end)
+            except EndOfString:
                 return    # cannot match
             ptr = find_repetition_end(ctx, pattern, ppos+3, start,
                                       pattern.pat(ppos+2),
@@ -776,22 +878,22 @@
             start = ptr
             min = pattern.pat(ppos+1)
             if min > 0:
-                minptr = ptr + min
-                if minptr > ctx.end:
-                    return   # cannot match
+                try:
+                    minptr = ctx.next_n(ptr, min, ctx.end)
+                except EndOfString:
+                    return    # cannot match
                 # count using pattern min as the maximum
                 ptr = find_repetition_end(ctx, pattern, ppos+3, ptr, min, marks)
                 if ptr < minptr:
                     return   # did not match minimum number of times
 
-            maxptr = ctx.end
+            max_count = sys.maxint
             max = pattern.pat(ppos+2)
             if max != rsre_char.MAXREPEAT:
-                maxptr1 = start + max
-                if maxptr1 <= maxptr:
-                    maxptr = maxptr1
+                max_count = max - min
+                assert max_count >= 0
             nextppos = ppos + pattern.pat(ppos)
-            result = MinRepeatOneMatchResult(nextppos, ppos+3, maxptr,
+            result = MinRepeatOneMatchResult(nextppos, ppos+3, max_count,
                                              ptr, marks)
             return result.find_first_result(ctx, pattern)
 
@@ -799,40 +901,43 @@
             raise Error("bad pattern code %d" % op)
 
 
-def get_group_ref(marks, groupnum):
+def get_group_ref(ctx, marks, groupnum):
     gid = groupnum * 2
     startptr = find_mark(marks, gid)
-    if startptr < 0:
+    if startptr < ctx.ZERO:
         return 0, -1
     endptr = find_mark(marks, gid + 1)
-    length = endptr - startptr     # < 0 if endptr < startptr (or if endptr=-1)
-    return startptr, length
+    length_bytes = ctx.bytes_difference(endptr, startptr)
+    return startptr, length_bytes
 
 @specializectx
-def match_repeated(ctx, ptr, oldptr, length):
-    if ptr + length > ctx.end:
+def match_repeated(ctx, ptr, oldptr, length_bytes):
+    if ctx.bytes_difference(ctx.end, ptr) < length_bytes:
         return False
-    for i in range(length):
-        if ctx.str(ptr + i) != ctx.str(oldptr + i):
+    for i in range(length_bytes):
+        if ctx.get_single_byte(ptr, i) != ctx.get_single_byte(oldptr, i):
             return False
     return True
 
 @specializectx
-def match_repeated_ignore(ctx, ptr, oldptr, length):
-    if ptr + length > ctx.end:
-        return False
-    for i in range(length):
-        if ctx.lowstr(ptr + i) != ctx.lowstr(oldptr + i):
-            return False
-    return True
+def match_repeated_ignore(ctx, ptr, oldptr, length_bytes):
+    oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
+    while oldptr < oldend:
+        if ptr >= ctx.end:
+            return -1
+        if ctx.lowstr(ptr) != ctx.lowstr(oldptr):
+            return -1
+        ptr = ctx.next(ptr)
+        oldptr = ctx.next(oldptr)
+    return ptr
 
 @specializectx
 def find_repetition_end(ctx, pattern, ppos, ptr, maxcount, marks):
     end = ctx.end
-    ptrp1 = ptr + 1
     # First get rid of the cases where we don't have room for any match.
-    if maxcount <= 0 or ptrp1 > end:
+    if maxcount <= 0 or ptr >= end:
         return ptr
+    ptrp1 = ctx.next(ptr)
     # Check the first character directly.  If it doesn't match, we are done.
     # The idea is to be fast for cases like re.search("b+"), where we expect
     # the common case to be a non-match.  It's much faster with the JIT to
@@ -854,9 +959,10 @@
     # Else we really need to count how many times it matches.
     if maxcount != rsre_char.MAXREPEAT:
         # adjust end
-        end1 = ptr + maxcount
-        if end1 <= end:
-            end = end1
+        try:
+            end = ctx.next_n(ptr, maxcount, end)
+        except EndOfString:
+            pass
     op = pattern.pat(ppos)
     for op1, fre in unroll_fre_checker:
         if op1 == op:
@@ -873,7 +979,7 @@
         if end1 <= end:
             end = end1
     while ptr < end and sre_match(ctx, patern, ppos, ptr, marks) is not None:
-        ptr += 1
+        ptr = ctx.next(ptr)
     return ptr
 
 @specializectx
@@ -916,7 +1022,7 @@
                                                       end=end, ppos=ppos,
                                                       pattern=pattern)
                 if ptr < end and checkerfn(ctx, pattern, ptr, ppos):
-                    ptr += 1
+                    ptr = ctx.next(ptr)
                 else:
                     return ptr
     elif checkerfn == match_IN_IGNORE:
@@ -931,7 +1037,7 @@
                                                             end=end, ppos=ppos,
                                                             pattern=pattern)
                 if ptr < end and checkerfn(ctx, pattern, ptr, ppos):
-                    ptr += 1
+                    ptr = ctx.next(ptr)
                 else:
                     return ptr
     else:
@@ -940,7 +1046,7 @@
         @specializectx
         def fre(ctx, pattern, ptr, end, ppos):
             while ptr < end and checkerfn(ctx, pattern, ptr, ppos):
-                ptr += 1
+                ptr = ctx.next(ptr)
             return ptr
     fre = func_with_new_name(fre, 'fre_' + checkerfn.__name__)
     return fre
@@ -980,11 +1086,14 @@
 def sre_at(ctx, atcode, ptr):
     if (atcode == AT_BEGINNING or
         atcode == AT_BEGINNING_STRING):
-        return ptr == 0
+        return ptr == ctx.ZERO
 
     elif atcode == AT_BEGINNING_LINE:
-        prevptr = ptr - 1
-        return prevptr < 0 or rsre_char.is_linebreak(ctx.str(prevptr))
+        try:
+            prevptr = ctx.prev(ptr)
+        except EndOfString:
+            return True
+        return rsre_char.is_linebreak(ctx.str(prevptr))
 
     elif atcode == AT_BOUNDARY:
         return at_boundary(ctx, ptr)
@@ -993,9 +1102,8 @@
         return at_non_boundary(ctx, ptr)
 
     elif atcode == AT_END:
-        remaining_chars = ctx.end - ptr
-        return remaining_chars <= 0 or (
-            remaining_chars == 1 and rsre_char.is_linebreak(ctx.str(ptr)))
+        return (ptr == ctx.end or
+            (ctx.next(ptr) == ctx.end and rsre_char.is_linebreak(ctx.str(ptr))))
 
     elif atcode == AT_END_LINE:
         return ptr == ctx.end or rsre_char.is_linebreak(ctx.str(ptr))
@@ -1020,18 +1128,26 @@
 def _make_boundary(word_checker):
     @specializectx
     def at_boundary(ctx, ptr):
-        if ctx.end == 0:
+        if ctx.end == ctx.ZERO:
             return False
-        prevptr = ptr - 1
-        that = prevptr >= 0 and word_checker(ctx.str(prevptr))
+        try:
+            prevptr = ctx.prev(ptr)
+        except EndOfString:
+            that = False
+        else:
+            that = word_checker(ctx.str(prevptr))
         this = ptr < ctx.end and word_checker(ctx.str(ptr))
         return this != that
     @specializectx
     def at_non_boundary(ctx, ptr):
-        if ctx.end == 0:
+        if ctx.end == ctx.ZERO:
             return False
-        prevptr = ptr - 1
-        that = prevptr >= 0 and word_checker(ctx.str(prevptr))
+        try:
+            prevptr = ctx.prev(ptr)
+        except EndOfString:
+            that = False
+        else:
+            that = word_checker(ctx.str(prevptr))
         this = ptr < ctx.end and word_checker(ctx.str(ptr))
         return this == that
     return at_boundary, at_non_boundary
@@ -1109,13 +1225,15 @@
 
 def regular_search(ctx, pattern, base):
     start = ctx.match_start
-    while start <= ctx.end:
-        ctx.jitdriver_RegularSearch.jit_merge_point(ctx=ctx, start=start,
-                                                    base=base, pattern=pattern)
+    while True:
+        ctx.jitdriver_RegularSearch.jit_merge_point(ctx=ctx, pattern=pattern,
+                                                    start=start, base=base)
         if sre_match(ctx, pattern, base, start, None) is not None:
             ctx.match_start = start
             return True
-        start += 1
+        if start >= ctx.end:
+            break
+        start = ctx.next_indirect(start)
     return False
 
 install_jitdriver_spec("LiteralSearch",
@@ -1132,11 +1250,12 @@
     while start < ctx.end:
         ctx.jitdriver_LiteralSearch.jit_merge_point(ctx=ctx, start=start,
                                           base=base, character=character, pattern=pattern)
+        start1 = ctx.next(start)
         if ctx.str(start) == character:
-            if sre_match(ctx, pattern, base, start + 1, None) is not None:
+            if sre_match(ctx, pattern, base, start1, None) is not None:
                 ctx.match_start = start
                 return True
-        start += 1
+        start = start1
     return False
 
 install_jitdriver_spec("CharsetSearch",
@@ -1154,7 +1273,7 @@
             if sre_match(ctx, pattern, base, start, None) is not None:
                 ctx.match_start = start
                 return True
-        start += 1
+        start = ctx.next(start)
     return False
 
 install_jitdriver_spec('FastSearch',
@@ -1186,11 +1305,14 @@
         else:
             i += 1
             if i == prefix_len:
-                # found a potential match
-                start = string_position + 1 - prefix_len
-                assert start >= 0
+                # start = string_position + 1 - prefix_len: computed later
+                ptr = string_position
                 prefix_skip = pattern.pat(6)
-                ptr = start + prefix_skip
+                if prefix_skip == prefix_len:
+                    ptr = ctx.next(ptr)
+                else:
+                    assert prefix_skip < prefix_len
+                    ptr = ctx.prev_n(ptr, prefix_len-1 - prefix_skip, ctx.ZERO)
                 #flags = pattern.pat(2)
                 #if flags & rsre_char.SRE_INFO_LITERAL:
                 #    # matched all of pure literal pattern
@@ -1201,10 +1323,11 @@
                 pattern_offset = pattern.pat(1) + 1
                 ppos_start = pattern_offset + 2 * prefix_skip
                 if sre_match(ctx, pattern, ppos_start, ptr, None) is not None:
+                    start = ctx.prev_n(ptr, prefix_skip, ctx.ZERO)
                     ctx.match_start = start
                     return True
                 overlap_offset = prefix_len + (7 - 1)
                 i = pattern.pat(overlap_offset + i)
-        string_position += 1
+        string_position = ctx.next(string_position)
         if string_position >= ctx.end:
             return False
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -12,8 +12,8 @@
     by this class are expressed in *bytes*, not in characters.
     """
 
-    def __init__(self, pattern, utf8string, match_start, end, flags):
-        AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+    def __init__(self, utf8string, match_start, end, flags):
+        AbstractMatchContext.__init__(self, match_start, end, flags)
         self._utf8 = utf8string
 
     def str(self, index):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: fix merge

Reply via email to