[pypy-commit] pypy unicode-utf8-re: in-progress

arigo Fri, 08 Dec 2017 03:24:07 -0800

Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8-re
Changeset: r93311:336fb075d139
Date: 2017-12-08 12:22 +0100
http://bitbucket.org/pypy/pypy/changeset/336fb075d139/


Log:    in-progress

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -13,7 +13,7 @@
 #
 # Constants and exposed functions
 
-from rpython.rlib.rsre import rsre_core
+from rpython.rlib.rsre import rsre_core, rsre_utf8
 from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, getlower, set_unicode_db
 
 
@@ -40,7 +40,8 @@
                                                         end-start))
         if isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string[start:end])
-        elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            XXXXXXX
             s = ctx._unicodestr[start:end]
             lgt = rutf8.check_utf8(s, True)
             return space.newutf8(s, lgt)
@@ -103,7 +104,7 @@
         raise oefmt(space.w_TypeError, "cannot copy this pattern object")
 
     def make_ctx(self, w_string, pos=0, endpos=sys.maxint):
-        """Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for
+        """Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for
         searching in the given w_string object."""
         space = self.space
         if pos < 0:
@@ -111,17 +112,26 @@
         if endpos < pos:
             endpos = pos
         if space.isinstance_w(w_string, space.w_unicode):
-            utf8str, length = space.utf8_len_w(w_string)
-            if pos >= length:
+            # xxx fish for the _index_storage
+            w_string = space.convert_arg_to_w_unicode(w_string)
+            utf8str = w_string._utf8
+            length = w_string._len()
+            index_storage = w_string._get_index_storage()
+            #
+            if pos <= 0:
+                bytepos = 0
+            elif pos >= length:
                 bytepos = len(utf8str)
             else:
-                bytepos = rutf8.codepoint_at_index(..)
-
-                pos = length
+                bytepos = rutf8.codepoint_at_index(utf8str, index_storage, pos)
             if endpos >= length:
-                endpos = length
-            return rsre_core.UnicodeMatchContext(self.code, unicodestr,
-                                                 pos, endpos, self.flags)
+                endbytepos = len(utf8str)
+            else:
+                endbytepos = rutf8.codepoint_at_index(utf8str, index_storage,
+                                                      endpos)
+            return rsre_utf8.Utf8MatchContext(
+                self.code, unicodestr, index_storage,
+                bytepos, endbytepos, self.flags)
         elif space.isinstance_w(w_string, space.w_bytes):
             str = space.bytes_w(w_string)
             if pos > len(str):
@@ -372,7 +382,8 @@
         if isinstance(ctx, rsre_core.StrMatchContext):
             assert strbuilder is not None
             return strbuilder.append_slice(ctx._string, start, end)
-        elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            XXXXXXX
             assert unicodebuilder is not None
             return unicodebuilder.append_slice(ctx._unicodestr, start, end)
         assert 0, "unreachable"
@@ -578,7 +589,8 @@
             return space.newbytes(ctx._buffer.as_str())
         elif isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string)
-        elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+            XXXXXXXX
             lgt = rutf8.check_utf8(ctx._unicodestr, True)
             return space.newutf8(ctx._unicodestr, lgt)
         else:
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -55,6 +55,8 @@
     specific subclass, calling 'func' is a direct call; if 'ctx' is only known
     to be of class AbstractMatchContext, calling 'func' is an indirect call.
     """
+    from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
+
     assert func.func_code.co_varnames[0] == 'ctx'
     specname = '_spec_' + func.func_name
     while specname in _seen_specname:
@@ -65,7 +67,8 @@
     specialized_methods = []
     for prefix, concreteclass in [('buf', BufMatchContext),
                                   ('str', StrMatchContext),
-                                  ('uni', UnicodeMatchContext)]:
+                                  ('uni', UnicodeMatchContext),
+                                  ('utf8', Utf8MatchContext)]:
         newfunc = func_with_new_name(func, prefix + specname)
         assert not hasattr(concreteclass, specname)
         setattr(concreteclass, specname, newfunc)
diff --git a/rpython/rlib/rsre/rsre_jit.py b/rpython/rlib/rsre/rsre_jit.py
--- a/rpython/rlib/rsre/rsre_jit.py
+++ b/rpython/rlib/rsre/rsre_jit.py
@@ -36,8 +36,10 @@
     from rpython.rlib.rsre.rsre_core import BufMatchContext
     from rpython.rlib.rsre.rsre_core import StrMatchContext
     from rpython.rlib.rsre.rsre_core import UnicodeMatchContext
+    from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
     for prefix, concreteclass in [('Buf', BufMatchContext),
                                   ('Str', StrMatchContext),
-                                  ('Uni', UnicodeMatchContext)]:
+                                  ('Uni', UnicodeMatchContext),
+                                  ('Utf8', Utf8MatchContext)]:
         jitdriver = RSreJitDriver(prefix + name, **kwds)
         setattr(concreteclass, 'jitdriver_' + name, jitdriver)
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -1,3 +1,4 @@
+import sys
 from rpython.rlib.debug import check_nonneg
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString
@@ -7,9 +8,11 @@
 
 class Utf8MatchContext(AbstractMatchContext):
 
-    def __init__(self, pattern, utf8string, match_start, end, flags):
+    def __init__(self, pattern, utf8string, index_storage,
+                 match_start, end, flags):
         AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
         self._utf8 = utf8string
+        self._index_storage = index_storage
 
     def str(self, index):
         check_nonneg(index)
@@ -56,4 +59,32 @@
         return position
 
     def slowly_convert_byte_pos_to_index(self, position):
-        
+        return rutf8.codepoint_index_at_byte_position(
+            self._utf8, self._index_storage, position)
+
+    def debug_check_pos(self, position):
+        assert not (0x80 <= self._utf8[position] < 0xC0)   # continuation byte
+
+
+def utf8search(pattern, utf8string, index_storage=None, bytestart=0,
+               byteend=sys.maxint, flags=0):
+    # bytestart and byteend must be valid byte positions inside the
+    # utf8string.
+    from rpython.rlib.rsre.rsre_core import search_context
+
+    assert 0 <= bytestart <= len(utf8string)
+    assert 0 <= byteend
+    if byteend > len(utf8string):
+        byteend = len(utf8string)
+    if index_storage is None:     # should be restricted to tests only
+        length = rutf8.check_utf8(utf8string, allow_surrogates=True)
+        index_storage = rutf8.create_utf8_index_storage(utf8string, length)
+    ctx = Utf8MatchContext(pattern, utf8string, index_storage,
+                           bytestart, byteend, flags)
+    if search_context(ctx):
+        return ctx
+    else:
+        return None
+
+def utf8match(*args, **kwds):
+    NOT_IMPLEMENTED
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-re: in-progress

Reply via email to