[pypy-commit] pypy unicode-utf8: * Return a flag from check_utf8.

fijal Sat, 04 Nov 2017 15:17:30 -0700

Author: fijal
Branch: unicode-utf8
Changeset: r92933:a6e6ba074a22
Date: 2017-11-04 10:31 +0100
http://bitbucket.org/pypy/pypy/changeset/a6e6ba074a22/


Log:    * Return a flag from check_utf8.
        * Improve the tests and run it for more examples

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -194,14 +194,14 @@
         self.pos = pos
 
 def check_ascii(s):
-    res = _check_ascii(s)
+    res = first_non_ascii_char(s)
     if res < 0:
         return
     raise CheckError(res)
 
 
 @jit.elidable
-def _check_ascii(s):
+def first_non_ascii_char(s):
     for i in range(len(s)):
         if ord(s[i]) > 0x7F:
             return i
@@ -286,6 +286,9 @@
 _invalid_byte_3_of_4 = _invalid_cont_byte
 _invalid_byte_4_of_4 = _invalid_cont_byte
 
+def _surrogate_bytes(ch1, ch2):
+    return ch1 == 0xed and ch2 > 0x9f  # ED A0..BF xx encodes U+D800..U+DFFF
+
 @enforceargs(allow_surrogates=bool)
 def _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
     return (ordch2>>6 != 0x2 or    # 0b10
@@ -301,20 +304,22 @@
 
 def check_utf8(s, allow_surrogates, start=0, stop=-1):
     """Check that 's' is a utf-8-encoded byte string.
-    Returns the length (number of chars) or raise CheckError.
+
+    Returns the length (number of chars) and flags or raise CheckError.
     If allow_surrogates is False, then also raise if we see any.
     Note also codepoints_in_utf8(), which also computes the length
     faster by assuming that 's' is valid utf-8.
     """
-    res = _check_utf8(s, allow_surrogates, start, stop)
+    res, flags = _check_utf8(s, allow_surrogates, start, stop)
     if res >= 0:
-        return res
+        return res, flags
     raise CheckError(~res)
 
 @jit.elidable
 def _check_utf8(s, allow_surrogates, start, stop):
     pos = start
     continuation_bytes = 0
+    flag = FLAG_ASCII
     if stop < 0:
         end = len(s)
     else:
@@ -326,38 +331,44 @@
         if ordch1 <= 0x7F:
             continue
 
+        if flag == FLAG_ASCII:
+            flag = FLAG_REGULAR
+
         if ordch1 <= 0xC1:
-            return ~(pos - 1)
+            return ~(pos - 1), 0
 
         if ordch1 <= 0xDF:
             if pos >= end:
-                return ~(pos - 1)
+                return ~(pos - 1), 0
             ordch2 = ord(s[pos])
             pos += 1
 
             if _invalid_byte_2_of_2(ordch2):
-                return ~(pos - 2)
+                return ~(pos - 2), 0
             # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
             continuation_bytes += 1
             continue
 
         if ordch1 <= 0xEF:
             if (pos + 2) > end:
-                return ~(pos - 1)
+                return ~(pos - 1), 0
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
             pos += 2
 
             if (_invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates) or
                 _invalid_byte_3_of_3(ordch3)):
-                return ~(pos - 3)
+                return ~(pos - 3), 0
+
+            if allow_surrogates and _surrogate_bytes(ordch1, ordch2):
+                flag = FLAG_HAS_SURROGATES
             # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
             continuation_bytes += 2
             continue
 
         if ordch1 <= 0xF4:
             if (pos + 3) > end:
-                return ~(pos - 1)
+                return ~(pos - 1), 0
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
             ordch4 = ord(s[pos + 2])
@@ -366,16 +377,16 @@
             if (_invalid_byte_2_of_4(ordch1, ordch2) or
                 _invalid_byte_3_of_4(ordch3) or
                 _invalid_byte_4_of_4(ordch4)):
-                return ~(pos - 4)
+                return ~(pos - 4), 0
             # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
             continuation_bytes += 3
             continue
 
-        return ~(pos - 1)
+        return ~(pos - 1), 0
 
     assert pos == end
     assert pos - continuation_bytes >= 0
-    return pos - continuation_bytes
+    return pos - continuation_bytes, flag
 
 @jit.elidable
 def codepoints_in_utf8(value, start=0, end=sys.maxint):
@@ -408,9 +419,16 @@
 UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct(
     'utf8_loc',
     ('baseindex', lltype.Signed),
+    ('flag', lltype.Signed),
     ('ofs', lltype.FixedSizeArray(lltype.Char, 16))
     ))
 
+FLAG_REGULAR = 0
+FLAG_HAS_SURROGATES = 1
+FLAG_ASCII = 2
+# note that we never need index storage if we're pure ascii, but it's useful
+# for passing into W_UnicodeObject.__init__
+
 ASCII_INDEX_STORAGE_BLOCKS = 5
 ASCII_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE,
                                     ASCII_INDEX_STORAGE_BLOCKS,
@@ -423,6 +441,9 @@
 def null_storage():
     return lltype.nullptr(UTF8_INDEX_STORAGE)
 
+UTF8_IS_ASCII = lltype.malloc(UTF8_INDEX_STORAGE, 0, immortal=True)
+UTF8_HAS_SURROGATES = lltype.malloc(UTF8_INDEX_STORAGE, 0, immortal=True)
+
 def create_utf8_index_storage(utf8, utf8len):
     """ Create an index storage which stores index of each 4th character
     in utf8 encoded unicode string.
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -28,6 +28,7 @@
     else:
         assert not raised
 
+@settings(max_examples=10000)
 @given(strategies.binary(), strategies.booleans())
 def test_check_utf8(s, allow_surrogates):
     _test_check_utf8(s, allow_surrogates)
@@ -37,19 +38,32 @@
     _test_check_utf8(u.encode('utf-8'), allow_surrogates)
 
 def _test_check_utf8(s, allow_surrogates):
+    def _has_surrogates(s):
+        # Surrogates are the code points U+D800..U+DFFF: high (lead)
+        # halves are U+D800..U+DBFF, low (trail) halves U+DC00..U+DFFF.
+        for u in s.decode('utf8'):
+            if 0xD800 <= ord(u) <= 0xDFFF:
+                return True
+        return False
+
     try:
         u, _ = runicode.str_decode_utf_8(s, len(s), None, final=True,
                                          allow_surrogates=allow_surrogates)
         valid = True
     except UnicodeDecodeError as e:
         valid = False
-    try:
-        length = rutf8.check_utf8(s, allow_surrogates)
-    except rutf8.CheckError:
+    length, flag = rutf8._check_utf8(s, allow_surrogates, 0, len(s))
+    if length < 0:
         assert not valid
+        assert ~(length) == e.start
     else:
         assert valid
         assert length == len(u)
+        if flag == rutf8.FLAG_ASCII:
+            s.decode('ascii') # assert did not raise
+        elif flag == rutf8.FLAG_HAS_SURROGATES:
+            assert allow_surrogates
+            assert _has_surrogates(s)
 
 @given(strategies.characters())
 def test_next_pos(uni):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: * Return a flag from check_utf8.

Reply via email to