Author: Armin Rigo <ar...@tunes.org> Branch: unicode-utf8-re Changeset: r93241:87a98889b109 Date: 2017-12-03 15:12 +0100 http://bitbucket.org/pypy/pypy/changeset/87a98889b109/
Log: in-progress diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -324,7 +324,10 @@ ctx.jitdriver_RepeatOne.jit_merge_point( self=self, ptr=ptr, ctx=ctx, nextppos=nextppos) result = sre_match(ctx, nextppos, ptr, self.start_marks) - ptr = ctx.prev_or_minus1(ptr) + try: + ptr = ctx.prev(ptr) + except EndOfString: + ptr = -1 if result is not None: self.subresult = result self.start_ptr = ptr @@ -440,12 +443,12 @@ min = ctx.pat(ppos+1) if enum is not None: # matched one more 'item'. record it and continue. - last_match_length = ctx.match_end - ptr + last_match_zero_length = (ctx.match_end == ptr) self.pending = Pending(ptr, marks, enum, self.pending) self.num_pending += 1 ptr = ctx.match_end marks = ctx.match_marks - if last_match_length == 0 and self.num_pending >= min: + if last_match_zero_length and self.num_pending >= min: # zero-width protection: after an empty match, if there # are enough matches, don't try to match more. Instead, # fall through to trying to match 'tail'. @@ -629,30 +632,30 @@ elif op == OPCODE_GROUPREF: # match backreference # <GROUPREF> <groupnum> - startptr, length = get_group_ref(marks, ctx.pat(ppos)) - if length < 0: + startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos)) + if length_bytes < 0: return # group was not previously defined - if not match_repeated(ctx, ptr, startptr, length): + if not match_repeated(ctx, ptr, startptr, length_bytes): return # no match - ptr += length + ptr = ctx.go_forward_by_bytes(ptr, length_bytes) ppos += 1 elif op == OPCODE_GROUPREF_IGNORE: # match backreference # <GROUPREF> <groupnum> - startptr, length = get_group_ref(marks, ctx.pat(ppos)) - if length < 0: + startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos)) + if length_bytes < 0: return # group was not previously defined - if not match_repeated_ignore(ctx, ptr, startptr, length): + ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes) + if ptr < ctx.ZERO: return # no match - ptr += length ppos += 1 elif op == OPCODE_GROUPREF_EXISTS: # conditional match depending on the existence of a group # <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... - _, length = get_group_ref(marks, ctx.pat(ppos)) - if length >= 0: + _, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos)) + if length_bytes >= 0: ppos += 2 # jump to 'codeyes' else: ppos += ctx.pat(ppos+1) # jump to 'codeno' @@ -664,7 +667,7 @@ ctx.str(ptr)): return ppos += ctx.pat(ppos) - ptr += 1 + ptr = ctx.next(ptr) elif op == OPCODE_IN_IGNORE: # match set member (or non_member), ignoring case @@ -673,7 +676,7 @@ ctx.lowstr(ptr)): return ppos += ctx.pat(ppos) - ptr += 1 + ptr = ctx.next(ptr) elif op == OPCODE_INFO: # optimization info block @@ -699,7 +702,7 @@ if ptr >= ctx.end or ctx.lowstr(ptr) != ctx.pat(ppos): return ppos += 1 - ptr += 1 + ptr = ctx.next(ptr) elif op == OPCODE_MARK: # set mark @@ -804,32 +807,36 @@ raise Error("bad pattern code %d" % op) -def get_group_ref(marks, groupnum): +def get_group_ref(ctx, marks, groupnum): gid = groupnum * 2 startptr = find_mark(marks, gid) - if startptr < 0: + if startptr < ctx.ZERO: return 0, -1 endptr = find_mark(marks, gid + 1) - length = endptr - startptr # < 0 if endptr < startptr (or if endptr=-1) - return startptr, length + length_bytes = ctx.bytes_difference(endptr, startptr) + # < 0 if endptr < startptr (or if endptr=-1) + return startptr, length_bytes @specializectx -def match_repeated(ctx, ptr, oldptr, length): - if ptr + length > ctx.end: +def match_repeated(ctx, ptr, oldptr, length_bytes): + if ctx.bytes_difference(ctx.end, ptr) < length_bytes: return False - for i in range(length): - if ctx.str(ptr + i) != ctx.str(oldptr + i): + for i in range(length_bytes): + if ctx.get_single_byte(ptr, i) != ctx.get_single_byte(oldptr, i): return False return True @specializectx -def match_repeated_ignore(ctx, ptr, oldptr, length): - if ptr + length > ctx.end: - return False - for i in range(length): - if ctx.lowstr(ptr + i) != ctx.lowstr(oldptr + i): - return False - return True +def match_repeated_ignore(ctx, ptr, oldptr, length_bytes): + oldend = ctx.go_forward_by_bytes(oldptr, length_bytes) + while oldptr < oldend: + if ptr >= ctx.end: + return -1 + if ctx.lowstr(ptr) != ctx.lowstr(oldptr): + return -1 + ptr = ctx.next(ptr) + oldptr = ctx.next(oldptr) + return ptr @specializectx def find_repetition_end(ctx, ppos, ptr, maxcount, marks): @@ -934,7 +941,7 @@ ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr, end=end, ppos=ppos) if ptr < end and checkerfn(ctx, ptr, ppos): - ptr += 1 + ptr = ctx.next(ptr) else: return ptr else: @@ -996,9 +1003,8 @@ return at_non_boundary(ctx, ptr) elif atcode == AT_END: - remaining_chars = ctx.end - ptr - return remaining_chars <= 0 or ( - remaining_chars == 1 and rsre_char.is_linebreak(ctx.str(ptr))) + return (ptr == ctx.end or + (ctx.next(ptr) == ctx.end and rsre_char.is_linebreak(ctx.str(ptr)))) elif atcode == AT_END_LINE: return ptr == ctx.end or rsre_char.is_linebreak(ctx.str(ptr)) diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py --- a/rpython/rlib/rsre/test/support.py +++ b/rpython/rlib/rsre/test/support.py @@ -14,35 +14,27 @@ def __repr__(self): return '<Position %d>' % (self._p) def __cmp__(self, other): - if not isinstance(other, (Position, MinusOnePosition)): - raise TypeError("cannot compare %r with %r" % (self, other)) - return cmp(self._p, other._p) - -class MinusOnePosition(object): - _p = -1 - def __repr__(self): - return '<MinusOnePosition>' - def __cmp__(self, other): - if not isinstance(other, (Position, MinusOnePosition)): - raise TypeError("cannot compare %r with %r" % (self, other)) - return cmp(self._p, other._p) + if isinstance(other, Position): + return cmp(self._p, other._p) + if type(other) is int and other == -1: + return cmp(self._p, -1) + raise TypeError("cannot compare %r with %r" % (self, other)) class MatchContextForTests(StrMatchContext): """Concrete subclass for matching in a plain string, tweaked for tests""" ZERO = Position(0) - MINUS1 = MinusOnePosition() EXACT_DISTANCE = False def next(self, position): assert isinstance(position, Position) return Position(position._p + 1) - def prev_or_minus1(self, position): + def prev(self, position): assert isinstance(position, Position) if position._p == 0: - return self.MINUS1 + raise EndOfString return Position(position._p - 1) def next_n(self, position, n, end_position): @@ -89,6 +81,21 @@ assert isinstance(position_high, Position) return position_high._p - position_low._p + random.randrange(0, 10) + def bytes_difference(self, position1, position2): + assert isinstance(position1, Position) + assert isinstance(position2, Position) + return position1._p - position2._p + + def get_single_byte(self, base_position, index): + assert isinstance(base_position, Position) + assert isinstance(index, int) + return ord(self._string[base_position._p + index]) + + def go_forward_by_bytes(self, base_position, index): + assert isinstance(base_position, Position) + assert isinstance(index, int) + return Position(base_position._p + index) + def match(pattern, string, start=0, end=sys.maxint, flags=0, fullmatch=False): start, end = _adjust(start, end, len(string)) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit