Author: fijal
Branch: unicode-utf8
Changeset: r92258:d2735187e72f
Date: 2017-08-24 18:39 +0200
http://bitbucket.org/pypy/pypy/changeset/d2735187e72f/
Log: (arigo, fijal) implement fast skipping technique in RPython
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -19,6 +19,7 @@
from rpython.rlib.rstring import StringBuilder
from rpython.rlib import jit
from rpython.rlib.rarithmetic import r_uint
+from rpython.rtyper.lltypesystem import lltype
def unichr_as_utf8(code, allow_surrogates=False):
@@ -307,3 +308,65 @@
assert pos == len(s)
return pos - continuation_bytes
+
+
+# Lookup table used to turn a codepoint index into a byte position of a
+# utf8 encoded string.  Each array item covers 64 codepoints: 'index' is
+# the byte position the item starts from and 'ofs' holds 16 one-char byte
+# offsets relative to 'index', one recorded for every 4th codepoint
+# (filled in by create_utf8_index_storage).
+UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct(
+ 'utf8_loc',
+ ('index', lltype.Signed),
+ ('ofs', lltype.FixedSizeArray(lltype.Char, 16))
+ ))
+
+# Shared zero-length storage; create_utf8_index_storage returns it when
+# utf8len is 0.
+EMPTY_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE, 0, immortal=True)
+
+def create_utf8_index_storage(utf8, utf8len):
+    """ Build the UTF8_INDEX_STORAGE lookup table for a utf8 encoded string.
+
+    utf8 is the encoded string and utf8len its length in codepoints.  One
+    array item is allocated per 64 codepoints; every item gets a base byte
+    position plus up to 16 one-char offsets, one per group of 4 codepoints.
+    Returns the shared EMPTY_INDEX_STORAGE when utf8len is 0.
+    """
+    if utf8len == 0:
+        return EMPTY_INDEX_STORAGE
+    storage = lltype.malloc(UTF8_INDEX_STORAGE, (utf8len + 63) // 64)
+    remaining = utf8len     # codepoints not yet covered by a recorded slot
+    block = 0               # array item currently being filled
+    base = 0                # byte position the current item is relative to
+    pos = 0                 # byte position of the scan through utf8
+    while True:
+        storage[block].index = base
+        for slot in range(16):
+            # step once before recording: the stored position is the one
+            # codepoint_position_at_index() returns as-is for index & 3 == 1
+            pos = next_codepoint_pos(utf8, pos)
+            storage[block].ofs[slot] = chr(pos - base)
+            remaining -= 4
+            if remaining <= 0:
+                # the slot just written covers the end of the string
+                return storage
+            # step over the remaining three codepoints of this group
+            pos = next_codepoint_pos(utf8, pos)
+            pos = next_codepoint_pos(utf8, pos)
+            pos = next_codepoint_pos(utf8, pos)
+        # all 16 slots are used: carry on with the next item, whose offsets
+        # are relative to the current scan position
+        block += 1
+        base = pos
+
+def codepoint_position_at_index(utf8, storage, index):
+    """ Map a codepoint index to its byte position inside a utf8 encoded
+    string.
+
+    storage is the UTF8_INDEX_STORAGE describing utf8.  The array item is
+    selected with index >> 6 and the slot inside it with (index >> 2) & 15;
+    the recorded byte position is then adjusted by at most two codepoint
+    steps, depending on index & 3.
+    """
+    block = index >> 6
+    slot = (index >> 2) & 15
+    anchor = storage[block].index + ord(storage[block].ofs[slot])
+    step = index & 0x3
+    if step == 1:
+        # the recorded position is used unchanged
+        return anchor
+    if step == 0:
+        return prev_codepoint_pos(utf8, anchor)
+    following = next_codepoint_pos(utf8, anchor)
+    if step == 2:
+        return following
+    return next_codepoint_pos(utf8, following)
+
+def codepoint_at_index(utf8, storage, index):
+    """ Look up the codepoint found at the given codepoint index of a utf8
+    encoded string; storage (of type UTF8_INDEX_STORAGE) is used to find
+    the byte position to read from
+    """
+    return codepoint_at_pos(
+        utf8, codepoint_position_at_index(utf8, storage, index))
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -84,3 +84,9 @@
response = rutf8.utf8_in_chars(ch.encode('utf8'), 0, txt.encode('utf8'))
r = (ch in txt)
assert r == response
+
+@given(strategies.text())
+def test_utf8_index_storage(u):
+    """ Every codepoint of u must be retrievable by its index through the
+    index storage built for the utf8 encoding of u.
+    """
+    # encode once; the same bytes are used to build and to query the storage
+    utf8 = u.encode('utf8')
+    index = rutf8.create_utf8_index_storage(utf8, len(u))
+    for i, item in enumerate(u):
+        # codepoint_at_index is documented as returning the codepoint, so
+        # compare against ord() and actually assert the result
+        assert rutf8.codepoint_at_index(utf8, index, i) == ord(item)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit