Author: fijal
Branch: unicode-utf8
Changeset: r92604:f06c4111345a
Date: 2017-10-05 10:28 +0200
http://bitbucket.org/pypy/pypy/changeset/f06c4111345a/

Log:    merge

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -95,6 +95,8 @@
     """
     pos = r_uint(pos)
     pos -= 1
+    if pos >= len(code):     # for the case where pos - 1 == len(code):
+        return pos           # assume there is an extra '\x00' character
     chr1 = ord(code[pos])
     if chr1 <= 0x7F:
         return pos
@@ -347,6 +349,16 @@
     assert pos == len(s)
     return pos - continuation_bytes
 
+@jit.elidable
+def surrogate_in_utf8(value):
+    """Check if the UTF-8 byte string 'value' contains a surrogate.
+    The 'value' argument must be otherwise correctly formed for UTF-8.
+    """
+    for i in range(len(value) - 2):
+        if value[i] == '\xed' and value[i + 1] >= '\xa0':
+            return True
+    return False
+
 
 UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct(
     'utf8_loc',
@@ -367,9 +379,9 @@
     """ Create an index storage which stores index of each 4th character
     in utf8 encoded unicode string.
     """
-    if len(utf8) == utf8len <= ASCII_INDEX_STORAGE_BLOCKS * 64:
+    if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64:
         return ASCII_INDEX_STORAGE
-    arraysize = (utf8len + 63) // 64
+    arraysize = utf8len // 64 + 1
     storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize)
     baseindex = 0
     current = 0
@@ -377,10 +389,14 @@
         storage[current].baseindex = baseindex
         next = baseindex
         for i in range(16):
-            next = next_codepoint_pos(utf8, next)
+            if utf8len == 0:
+                next += 1      # assume there is an extra '\x00' character
+            else:
+                next = next_codepoint_pos(utf8, next)
             storage[current].ofs[i] = chr(next - baseindex)
             utf8len -= 4
-            if utf8len <= 0:
+            if utf8len < 0:
+                assert current + 1 == len(storage)
                 break
             next = next_codepoint_pos(utf8, next)
             next = next_codepoint_pos(utf8, next)
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -93,8 +93,17 @@
                 ord(item))
 
 @given(strategies.text())
+@example(u'x' * 64 * 5)
+@example(u'x' * (64 * 5 - 1))
 def test_codepoint_position_at_index(u):
     index = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
-    for i in range(len(u)):
+    for i in range(len(u) + 1):
         assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i) ==
                 len(u[:i].encode('utf8')))
+
+@given(strategies.lists(strategies.characters()))
+def test_surrogate_in_utf8(unichars):
+    uni = u''.join(unichars).encode('utf-8')
+    result = rutf8.surrogate_in_utf8(uni)
+    expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
+    assert result == expected
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to