Author: Armin Rigo <[email protected]>
Branch: unicode-utf8-re
Changeset: r93311:336fb075d139
Date: 2017-12-08 12:22 +0100
http://bitbucket.org/pypy/pypy/changeset/336fb075d139/
Log: in-progress
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -13,7 +13,7 @@
#
# Constants and exposed functions
-from rpython.rlib.rsre import rsre_core
+from rpython.rlib.rsre import rsre_core, rsre_utf8
from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, getlower,
set_unicode_db
@@ -40,7 +40,8 @@
end-start))
if isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string[start:end])
- elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+ elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ XXXXXXX
s = ctx._unicodestr[start:end]
lgt = rutf8.check_utf8(s, True)
return space.newutf8(s, lgt)
@@ -103,7 +104,7 @@
raise oefmt(space.w_TypeError, "cannot copy this pattern object")
def make_ctx(self, w_string, pos=0, endpos=sys.maxint):
- """Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for
+ """Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for
searching in the given w_string object."""
space = self.space
if pos < 0:
@@ -111,17 +112,26 @@
if endpos < pos:
endpos = pos
if space.isinstance_w(w_string, space.w_unicode):
- utf8str, length = space.utf8_len_w(w_string)
- if pos >= length:
+ # xxx fish for the _index_storage
+ w_string = space.convert_arg_to_w_unicode(w_string)
+ utf8str = w_string._utf8
+ length = w_string._len()
+ index_storage = w_string._get_index_storage()
+ #
+ if pos <= 0:
+ bytepos = 0
+ elif pos >= length:
bytepos = len(utf8str)
else:
- bytepos = rutf8.codepoint_at_index(..)
-
- pos = length
+ bytepos = rutf8.codepoint_at_index(utf8str, index_storage, pos)
if endpos >= length:
- endpos = length
- return rsre_core.UnicodeMatchContext(self.code, unicodestr,
- pos, endpos, self.flags)
+ endbytepos = len(utf8str)
+ else:
+ endbytepos = rutf8.codepoint_at_index(utf8str, index_storage,
+ endpos)
+ return rsre_utf8.Utf8MatchContext(
+ self.code, unicodestr, index_storage,
+ bytepos, endbytepos, self.flags)
elif space.isinstance_w(w_string, space.w_bytes):
str = space.bytes_w(w_string)
if pos > len(str):
@@ -372,7 +382,8 @@
if isinstance(ctx, rsre_core.StrMatchContext):
assert strbuilder is not None
return strbuilder.append_slice(ctx._string, start, end)
- elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+ elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ XXXXXXX
assert unicodebuilder is not None
return unicodebuilder.append_slice(ctx._unicodestr, start, end)
assert 0, "unreachable"
@@ -578,7 +589,8 @@
return space.newbytes(ctx._buffer.as_str())
elif isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string)
- elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+ elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ XXXXXXXX
lgt = rutf8.check_utf8(ctx._unicodestr, True)
return space.newutf8(ctx._unicodestr, lgt)
else:
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -55,6 +55,8 @@
specific subclass, calling 'func' is a direct call; if 'ctx' is only known
to be of class AbstractMatchContext, calling 'func' is an indirect call.
"""
+ from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
+
assert func.func_code.co_varnames[0] == 'ctx'
specname = '_spec_' + func.func_name
while specname in _seen_specname:
@@ -65,7 +67,8 @@
specialized_methods = []
for prefix, concreteclass in [('buf', BufMatchContext),
('str', StrMatchContext),
- ('uni', UnicodeMatchContext)]:
+ ('uni', UnicodeMatchContext),
+ ('utf8', Utf8MatchContext)]:
newfunc = func_with_new_name(func, prefix + specname)
assert not hasattr(concreteclass, specname)
setattr(concreteclass, specname, newfunc)
diff --git a/rpython/rlib/rsre/rsre_jit.py b/rpython/rlib/rsre/rsre_jit.py
--- a/rpython/rlib/rsre/rsre_jit.py
+++ b/rpython/rlib/rsre/rsre_jit.py
@@ -36,8 +36,10 @@
from rpython.rlib.rsre.rsre_core import BufMatchContext
from rpython.rlib.rsre.rsre_core import StrMatchContext
from rpython.rlib.rsre.rsre_core import UnicodeMatchContext
+ from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
for prefix, concreteclass in [('Buf', BufMatchContext),
('Str', StrMatchContext),
- ('Uni', UnicodeMatchContext)]:
+ ('Uni', UnicodeMatchContext),
+ ('Utf8', Utf8MatchContext)]:
jitdriver = RSreJitDriver(prefix + name, **kwds)
setattr(concreteclass, 'jitdriver_' + name, jitdriver)
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -1,3 +1,4 @@
+import sys
from rpython.rlib.debug import check_nonneg
from rpython.rlib.rarithmetic import r_uint, intmask
from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString
@@ -7,9 +8,11 @@
class Utf8MatchContext(AbstractMatchContext):
- def __init__(self, pattern, utf8string, match_start, end, flags):
+ def __init__(self, pattern, utf8string, index_storage,
+ match_start, end, flags):
AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
self._utf8 = utf8string
+ self._index_storage = index_storage
def str(self, index):
check_nonneg(index)
@@ -56,4 +59,32 @@
return position
def slowly_convert_byte_pos_to_index(self, position):
-
+ return rutf8.codepoint_index_at_byte_position(
+ self._utf8, self._index_storage, position)
+
+ def debug_check_pos(self, position):
+ assert not (0x80 <= self._utf8[position] < 0xC0) # continuation byte
+
+
+def utf8search(pattern, utf8string, index_storage=None, bytestart=0,
+ byteend=sys.maxint, flags=0):
+ # bytestart and byteend must be valid byte positions inside the
+ # utf8string.
+ from rpython.rlib.rsre.rsre_core import search_context
+
+ assert 0 <= bytestart <= len(utf8string)
+ assert 0 <= byteend
+ if byteend > len(utf8string):
+ byteend = len(utf8string)
+ if index_storage is None: # should be restricted to tests only
+ length = rutf8.check_utf8(utf8string, allow_surrogates=True)
+ index_storage = rutf8.create_utf8_index_storage(utf8string, length)
+ ctx = Utf8MatchContext(pattern, utf8string, index_storage,
+ bytestart, byteend, flags)
+ if search_context(ctx):
+ return ctx
+ else:
+ return None
+
+def utf8match(*args, **kwds):
+ NOT_IMPLEMENTED
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit