Author: Armin Rigo <[email protected]>
Branch: unicode-utf8-re
Changeset: r93302:cb5b89596a2f
Date: 2017-12-08 11:44 +0100
http://bitbucket.org/pypy/pypy/changeset/cb5b89596a2f/
Log: in-progress
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -142,6 +142,7 @@
# Utf8MatchContext. The non-utf8 implementation is provided
# by the FixedMatchContext abstract subclass, in order to use
# the same @not_rpython safety trick as above.
+ ZERO = 0
@not_rpython
def next(self, position):
raise NotImplementedError
@@ -221,9 +222,8 @@
class FixedMatchContext(AbstractMatchContext):
"""Abstract subclass to introduce the default implementation for
- these position methods. The Utf8 subclass doesn't inherit from here."""
-
- ZERO = 0
+ these position methods. The Utf8MatchContext subclass doesn't
+ inherit from here."""
def next(self, position):
return position + 1
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -0,0 +1,59 @@
+from rpython.rlib.debug import check_nonneg
+from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString
+from rpython.rlib.rsre import rsre_char
+from rpython.rlib import rutf8
+
+
+class Utf8MatchContext(AbstractMatchContext):
+
+ def __init__(self, pattern, utf8string, match_start, end, flags):
+ AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+ self._utf8 = utf8string
+
+ def str(self, index):
+ check_nonneg(index)
+ return rutf8.codepoint_at_pos(self._utf8, index)
+
+ def lowstr(self, index):
+ c = self.str(index)
+ return rsre_char.getlower(c, self.flags)
+
+ def get_single_byte(self, base_position, index):
+ return self.str(base_position + index)
+
+ def fresh_copy(self, start):
+ return Utf8MatchContext(self.pattern, self._utf8, start,
+ self.end, self.flags)
+
+ def next(self, position):
+ return rutf8.next_codepoint_pos(self._utf8, position)
+
+ def prev(self, position):
+ if position <= 0:
+ raise EndOfString
+ upos = r_uint(position)
+ upos = rutf8.prev_codepoint_pos(self._utf8, upos)
+ position = intmask(upos)
+ assert position >= 0
+ return position
+
+ def next_n(self, position, n, end_position):
+ for i in range(n):
+ if position >= end_position:
+ raise EndOfString
+ position = rutf8.next_codepoint_pos(self._utf8, position)
+ return position
+
+ def prev_n(self, position, n, start_position):
+ upos = r_uint(position)
+ for i in range(n):
+ if upos <= r_uint(start_position):
+ raise EndOfString
+ upos = rutf8.prev_codepoint_pos(self._utf8, upos)  # step backwards, as prev() does
+ position = intmask(upos)
+ assert position >= 0
+ return position
+
+ def slowly_convert_byte_pos_to_index(self, position):
+
diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py
--- a/rpython/rlib/rsre/test/test_search.py
+++ b/rpython/rlib/rsre/test/test_search.py
@@ -1,7 +1,7 @@
import re, py
from rpython.rlib.rsre.test.test_match import get_code, get_code_and_re
from rpython.rlib.rsre.test import support
-from rpython.rlib.rsre import rsre_core
+from rpython.rlib.rsre import rsre_core, rsre_utf8
class BaseTestSearch:
@@ -222,3 +222,8 @@
search = staticmethod(rsre_core.search)
match = staticmethod(rsre_core.match)
Position = staticmethod(lambda n: n)
+
+class TestSearchUtf8(BaseTestSearch):
+ search = staticmethod(rsre_utf8.utf8search)
+ match = staticmethod(rsre_utf8.utf8match)
+ Position = staticmethod(lambda n: n) # NB. only for plain ascii
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit