Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8-re
Changeset: r93311:336fb075d139
Date: 2017-12-08 12:22 +0100
http://bitbucket.org/pypy/pypy/changeset/336fb075d139/
Log: in-progress diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -13,7 +13,7 @@ # # Constants and exposed functions -from rpython.rlib.rsre import rsre_core +from rpython.rlib.rsre import rsre_core, rsre_utf8 from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, getlower, set_unicode_db @@ -40,7 +40,8 @@ end-start)) if isinstance(ctx, rsre_core.StrMatchContext): return space.newbytes(ctx._string[start:end]) - elif isinstance(ctx, rsre_core.UnicodeMatchContext): + elif isinstance(ctx, rsre_utf8.Utf8MatchContext): + XXXXXXX s = ctx._unicodestr[start:end] lgt = rutf8.check_utf8(s, True) return space.newutf8(s, lgt) @@ -103,7 +104,7 @@ raise oefmt(space.w_TypeError, "cannot copy this pattern object") def make_ctx(self, w_string, pos=0, endpos=sys.maxint): - """Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for + """Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for searching in the given w_string object.""" space = self.space if pos < 0: @@ -111,17 +112,26 @@ if endpos < pos: endpos = pos if space.isinstance_w(w_string, space.w_unicode): - utf8str, length = space.utf8_len_w(w_string) - if pos >= length: + # xxx fish for the _index_storage + w_string = space.convert_arg_to_w_unicode(w_string) + utf8str = w_string._utf8 + length = w_string._len() + index_storage = w_string._get_index_storage() + # + if pos <= 0: + bytepos = 0 + elif pos >= length: bytepos = len(utf8str) else: - bytepos = rutf8.codepoint_at_index(..) 
- - pos = length + bytepos = rutf8.codepoint_at_index(utf8str, index_storage, pos) if endpos >= length: - endpos = length - return rsre_core.UnicodeMatchContext(self.code, unicodestr, - pos, endpos, self.flags) + endbytepos = len(utf8str) + else: + endbytepos = rutf8.codepoint_at_index(utf8str, index_storage, + endpos) + return rsre_utf8.Utf8MatchContext( + self.code, unicodestr, index_storage, + bytepos, endbytepos, self.flags) elif space.isinstance_w(w_string, space.w_bytes): str = space.bytes_w(w_string) if pos > len(str): @@ -372,7 +382,8 @@ if isinstance(ctx, rsre_core.StrMatchContext): assert strbuilder is not None return strbuilder.append_slice(ctx._string, start, end) - elif isinstance(ctx, rsre_core.UnicodeMatchContext): + elif isinstance(ctx, rsre_utf8.Utf8MatchContext): + XXXXXXX assert unicodebuilder is not None return unicodebuilder.append_slice(ctx._unicodestr, start, end) assert 0, "unreachable" @@ -578,7 +589,8 @@ return space.newbytes(ctx._buffer.as_str()) elif isinstance(ctx, rsre_core.StrMatchContext): return space.newbytes(ctx._string) - elif isinstance(ctx, rsre_core.UnicodeMatchContext): + elif isinstance(ctx, rsre_utf8.Utf8MatchContext): + XXXXXXXX lgt = rutf8.check_utf8(ctx._unicodestr, True) return space.newutf8(ctx._unicodestr, lgt) else: diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -55,6 +55,8 @@ specific subclass, calling 'func' is a direct call; if 'ctx' is only known to be of class AbstractMatchContext, calling 'func' is an indirect call. 
""" + from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext + assert func.func_code.co_varnames[0] == 'ctx' specname = '_spec_' + func.func_name while specname in _seen_specname: @@ -65,7 +67,8 @@ specialized_methods = [] for prefix, concreteclass in [('buf', BufMatchContext), ('str', StrMatchContext), - ('uni', UnicodeMatchContext)]: + ('uni', UnicodeMatchContext), + ('utf8', Utf8MatchContext)]: newfunc = func_with_new_name(func, prefix + specname) assert not hasattr(concreteclass, specname) setattr(concreteclass, specname, newfunc) diff --git a/rpython/rlib/rsre/rsre_jit.py b/rpython/rlib/rsre/rsre_jit.py --- a/rpython/rlib/rsre/rsre_jit.py +++ b/rpython/rlib/rsre/rsre_jit.py @@ -36,8 +36,10 @@ from rpython.rlib.rsre.rsre_core import BufMatchContext from rpython.rlib.rsre.rsre_core import StrMatchContext from rpython.rlib.rsre.rsre_core import UnicodeMatchContext + from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext for prefix, concreteclass in [('Buf', BufMatchContext), ('Str', StrMatchContext), - ('Uni', UnicodeMatchContext)]: + ('Uni', UnicodeMatchContext), + ('Utf8', Utf8MatchContext)]: jitdriver = RSreJitDriver(prefix + name, **kwds) setattr(concreteclass, 'jitdriver_' + name, jitdriver) diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py --- a/rpython/rlib/rsre/rsre_utf8.py +++ b/rpython/rlib/rsre/rsre_utf8.py @@ -1,3 +1,4 @@ +import sys from rpython.rlib.debug import check_nonneg from rpython.rlib.rarithmetic import r_uint, intmask from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString @@ -7,9 +8,11 @@ class Utf8MatchContext(AbstractMatchContext): - def __init__(self, pattern, utf8string, match_start, end, flags): + def __init__(self, pattern, utf8string, index_storage, + match_start, end, flags): AbstractMatchContext.__init__(self, pattern, match_start, end, flags) self._utf8 = utf8string + self._index_storage = index_storage def str(self, index): check_nonneg(index) @@ -56,4 +59,32 @@ return position 
def slowly_convert_byte_pos_to_index(self, position): - + return rutf8.codepoint_index_at_byte_position( + self._utf8, self._index_storage, position) + + def debug_check_pos(self, position): + assert not (0x80 <= self._utf8[position] < 0xC0) # continuation byte + + +def utf8search(pattern, utf8string, index_storage=None, bytestart=0, + byteend=sys.maxint, flags=0): + # bytestart and byteend must be valid byte positions inside the + # utf8string. + from rpython.rlib.rsre.rsre_core import search_context + + assert 0 <= bytestart <= len(utf8string) + assert 0 <= byteend + if byteend > len(utf8string): + byteend = len(utf8string) + if index_storage is None: # should be restricted to tests only + length = rutf8.check_utf8(utf8string, allow_surrogates=True) + index_storage = rutf8.create_utf8_index_storage(utf8string, length) + ctx = Utf8MatchContext(pattern, utf8string, index_storage, + bytestart, byteend, flags) + if search_context(ctx): + return ctx + else: + return None + +def utf8match(*args, **kwds): + NOT_IMPLEMENTED _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit