[pypy-commit] pypy unicode-utf8: remove flag handling from rutf8

fijal Thu, 07 Dec 2017 07:45:50 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r93296:1a4e2f08f746
Date: 2017-12-07 17:02 +0200
http://bitbucket.org/pypy/pypy/changeset/1a4e2f08f746/


Log:    remove flag handling from rutf8

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -328,18 +328,18 @@
 def check_utf8(s, allow_surrogates, start=0, stop=-1):
     """Check that 's' is a utf-8-encoded byte string.
 
-    Returns the length (number of chars) and flag or raise CheckError.
+    Returns the length (number of chars) or raise CheckError.
     If allow_surrogates is False, then also raise if we see any.
     Note also codepoints_in_utf8(), which also computes the length
     faster by assuming that 's' is valid utf-8.
     """
-    res, flag = _check_utf8(s, allow_surrogates, start, stop)
+    res = _check_utf8(s, allow_surrogates, start, stop)
     if res >= 0:
-        return res, flag
+        return res
     raise CheckError(~res)
 
-def get_utf8_length_flag(s, start=0, end=-1):
-    """ Get the length and flag out of valid utf8. For now just calls check_utf8
+def get_utf8_length(s, start=0, end=-1):
+    """ Get the length out of valid utf8. For now just calls check_utf8
     """
     return check_utf8(s, True, start, end)
 
@@ -347,7 +347,6 @@
 def _check_utf8(s, allow_surrogates, start, stop):
     pos = start
     continuation_bytes = 0
-    flag = FLAG_ASCII
     if stop < 0:
         end = len(s)
     else:
@@ -359,44 +358,39 @@
         if ordch1 <= 0x7F:
             continue
 
-        if flag == FLAG_ASCII:
-            flag = FLAG_REGULAR
-
         if ordch1 <= 0xC1:
-            return ~(pos - 1), 0
+            return ~(pos - 1)
 
         if ordch1 <= 0xDF:
             if pos >= end:
-                return ~(pos - 1), 0
+                return ~(pos - 1)
             ordch2 = ord(s[pos])
             pos += 1
 
             if _invalid_byte_2_of_2(ordch2):
-                return ~(pos - 2), 0
+                return ~(pos - 2)
             # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
             continuation_bytes += 1
             continue
 
         if ordch1 <= 0xEF:
             if (pos + 2) > end:
-                return ~(pos - 1), 0
+                return ~(pos - 1)
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
             pos += 2
 
             if (_invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates) or
                 _invalid_byte_3_of_3(ordch3)):
-                return ~(pos - 3), 0
+                return ~(pos - 3)
 
-            if allow_surrogates and _surrogate_bytes(ordch1, ordch2):
-                flag = FLAG_HAS_SURROGATES
             # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
             continuation_bytes += 2
             continue
 
         if ordch1 <= 0xF4:
             if (pos + 3) > end:
-                return ~(pos - 1), 0
+                return ~(pos - 1)
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
             ordch4 = ord(s[pos + 2])
@@ -405,16 +399,16 @@
             if (_invalid_byte_2_of_4(ordch1, ordch2) or
                 _invalid_byte_3_of_4(ordch3) or
                 _invalid_byte_4_of_4(ordch4)):
-                return ~(pos - 4), 0
+                return ~(pos - 4)
             # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
             continuation_bytes += 3
             continue
 
-        return ~(pos - 1), 0
+        return ~(pos - 1)
 
     assert pos == end
     assert pos - continuation_bytes >= 0
-    return pos - continuation_bytes, flag
+    return pos - continuation_bytes
 
 def reencode_utf8_with_surrogates(utf8):
     """ Receiving valid UTF8 which contains surrogates, combine surrogate
@@ -472,47 +466,14 @@
     return False
 
 
-UTF8_INDEX_STORAGE = lltype.GcStruct('utf8_loc',
-    ('flag', lltype.Signed),
-    ('contents', lltype.Ptr(lltype.GcArray(lltype.Struct('utf8_loc_elem',
+UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct('utf8_loc_elem',
         ('baseindex', lltype.Signed),
         ('ofs', lltype.FixedSizeArray(lltype.Char, 16)),
-    )))))
-
-def get_flag_from_code(oc):
-    assert isinstance(oc, int)
-    if oc <= 0x7F:
-        return FLAG_ASCII
-    if 0xD800 <= oc <= 0xDFFF:
-        return FLAG_HAS_SURROGATES
-    return FLAG_REGULAR
-
-def combine_flags(one, two):
-    return one | two
-
-FLAG_ASCII          = 0     # no bits
-FLAG_REGULAR        = 1     # bit 0
-FLAG_HAS_SURROGATES = 3     # bit 0 and bit 1
-# note that we never need index storage if we're pure ascii, but it's useful
-# for passing into W_UnicodeObject.__init__
-
-#ASCII_INDEX_STORAGE_BLOCKS = 5
-#ASCII_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO,
-#                                    ASCII_INDEX_STORAGE_BLOCKS,
-#                                    immortal=True)
-#for _i in range(ASCII_INDEX_STORAGE_BLOCKS):
-#    ASCII_INDEX_STORAGE[_i].baseindex = _i * 64
-#    for _j in range(16):
-#        ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1)
+    ))
 
 def null_storage():
     return lltype.nullptr(UTF8_INDEX_STORAGE)
 
-UTF8_IS_ASCII = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True)
-UTF8_IS_ASCII.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO)
-UTF8_HAS_SURROGATES = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True)
-UTF8_HAS_SURROGATES.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO)
-
 def create_utf8_index_storage(utf8, utf8len):
     """ Create an index storage which stores index of each 4th character
     in utf8 encoded unicode string.
@@ -520,23 +481,21 @@
 #    if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64:
 #        return ASCII_INDEX_STORAGE
     arraysize = utf8len // 64 + 1
-    storage = lltype.malloc(UTF8_INDEX_STORAGE)
-    contents = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO, arraysize)
-    storage.contents = contents
+    storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize)
     baseindex = 0
     current = 0
     while True:
-        contents[current].baseindex = baseindex
+        storage[current].baseindex = baseindex
         next = baseindex
         for i in range(16):
             if utf8len == 0:
                 next += 1      # assume there is an extra '\x00' character
             else:
                 next = next_codepoint_pos(utf8, next)
-            contents[current].ofs[i] = chr(next - baseindex)
+            storage[current].ofs[i] = chr(next - baseindex)
             utf8len -= 4
             if utf8len < 0:
-                assert current + 1 == len(contents)
+                assert current + 1 == len(storage)
                 break
             next = next_codepoint_pos(utf8, next)
             next = next_codepoint_pos(utf8, next)
@@ -556,8 +515,8 @@
     this function.
     """
     current = index >> 6
-    ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F])
-    bytepos = storage.contents[current].baseindex + ofs
+    ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
+    bytepos = storage[current].baseindex + ofs
     index &= 0x3
     if index == 0:
         return prev_codepoint_pos(utf8, bytepos)
@@ -575,8 +534,8 @@
     storage of type UTF8_INDEX_STORAGE
     """
     current = index >> 6
-    ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F])
-    bytepos = storage.contents[current].baseindex + ofs
+    ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
+    bytepos = storage[current].baseindex + ofs
     index &= 0x3
     if index == 0:
         return codepoint_before_pos(utf8, bytepos)
@@ -596,15 +555,15 @@
     is not tiny either.
     """
     index_min = 0
-    index_max = len(storage.contents) - 1
+    index_max = len(storage) - 1
     while index_min < index_max:
         index_middle = (index_min + index_max + 1) // 2
-        base_bytepos = storage.contents[index_middle].baseindex
+        base_bytepos = storage[index_middle].baseindex
         if bytepos < base_bytepos:
             index_max = index_middle - 1
         else:
             index_min = index_middle
-    bytepos1 = storage.contents[index_min].baseindex
+    bytepos1 = storage[index_min].baseindex
     result = index_min << 6
     while bytepos1 < bytepos:
         bytepos1 = next_codepoint_pos(utf8, bytepos1)
@@ -713,22 +672,19 @@
     def __init__(self, size=0):
         self._s = StringBuilder(size)
         self._lgt = 0
-        self._flag = FLAG_ASCII
 
     @always_inline
     def append(self, s):
         # for strings
         self._s.append(s)
-        newlgt, newflag = get_utf8_length_flag(s)
+        newlgt = get_utf8_length(s)
         self._lgt += newlgt
-        self._flag = combine_flags(self._flag, newflag)
 
     @always_inline
     def append_slice(self, s, start, end):
         self._s.append_slice(s, start, end)
-        newlgt, newflag = get_utf8_length_flag(s, start, end)
+        newlgt = get_utf8_length(s, start, end)
         self._lgt += newlgt
-        self._flag = combine_flags(self._flag, newflag)
 
     @signature(char(), returns=none())
     @always_inline
@@ -739,13 +695,11 @@
 
     @try_inline
     def append_code(self, code):
-        self._flag = combine_flags(self._flag, get_flag_from_code(code))
         self._lgt += 1
         unichr_as_utf8_append(self._s, code, True)
 
     @always_inline
-    def append_utf8(self, utf8, length, flag):
-        self._flag = combine_flags(self._flag, flag)
+    def append_utf8(self, utf8, length):
         self._lgt += length
         self._s.append(utf8)
 
@@ -754,10 +708,6 @@
         return self._s.build()
 
     @always_inline
-    def get_flag(self):
-        return self._flag
-
-    @always_inline
     def get_length(self):
         return self._lgt
 
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -38,30 +38,25 @@
 def test_check_utf8_valid(u, allow_surrogates):
     _test_check_utf8(u.encode('utf-8'), allow_surrogates)
 
+def _has_surrogates(s):
+    for u in s.decode('utf8'):
+        if 0xD800 <= ord(u) <= 0xDFFF:
+            return True
+    return False
+
 def _test_check_utf8(s, allow_surrogates):
-    def _has_surrogates(s):
-        for u in s.decode('utf8'):
-            if 0xD800 <= ord(u) <= 0xDFFF:
-                return True
-        return False
-
     try:
         u, _ = runicode.str_decode_utf_8(s, len(s), None, final=True,
                                          allow_surrogates=allow_surrogates)
         valid = True
     except UnicodeDecodeError as e:
         valid = False
-    length, flag = rutf8._check_utf8(s, allow_surrogates, 0, len(s))
+    length = rutf8._check_utf8(s, allow_surrogates, 0, len(s))
     if length < 0:
         assert not valid
         assert ~(length) == e.start
     else:
         assert valid
-        if flag == rutf8.FLAG_ASCII:
-            s.decode('ascii') # assert did not raise
-        elif flag == rutf8.FLAG_HAS_SURROGATES:
-            assert allow_surrogates
-            assert _has_surrogates(s)
         if sys.maxunicode == 0x10FFFF or not _has_surrogates(s):
             assert length == len(u)
 
@@ -155,60 +150,45 @@
     assert result == expected
 
 @given(strategies.lists(strategies.characters()))
-def test_get_utf8_length_flag(unichars):
+def test_get_utf8_length(unichars):
     u = u''.join(unichars)
     exp_lgt = len(u)
-    exp_flag = rutf8.FLAG_ASCII
-    for c in u:
-        if ord(c) > 0x7F:
-            exp_flag = rutf8.FLAG_REGULAR
-        if 0xD800 <= ord(c) <= 0xDFFF:
-            exp_flag = rutf8.FLAG_HAS_SURROGATES
-            break
-    lgt, flag = rutf8.get_utf8_length_flag(''.join([c.encode('utf8') for c in u]))
-    if exp_flag != rutf8.FLAG_HAS_SURROGATES or sys.maxunicode > 0xffff:
+    s = ''.join([c.encode('utf8') for c in u])
+    lgt = rutf8.get_utf8_length(s)
+    if not _has_surrogates(s) or sys.maxunicode > 0xffff:
         assert lgt == exp_lgt
-    assert flag == exp_flag
 
 def test_utf8_string_builder():
     s = rutf8.Utf8StringBuilder()
     s.append("foo")
     s.append_char("x")
-    assert s.get_flag() == rutf8.FLAG_ASCII
     assert s.get_length() == 4
     assert s.build() == "foox"
     s.append(u"\u1234".encode("utf8"))
-    assert s.get_flag() == rutf8.FLAG_REGULAR
     assert s.get_length() == 5
     assert s.build().decode("utf8") == u"foox\u1234"
     s.append("foo")
     s.append_char("x")
-    assert s.get_flag() == rutf8.FLAG_REGULAR
     assert s.get_length() == 9
     assert s.build().decode("utf8") == u"foox\u1234foox"
 
     s = rutf8.Utf8StringBuilder()
     s.append_code(0x1234)
     assert s.build().decode("utf8") == u"\u1234"
-    assert s.get_flag() == rutf8.FLAG_REGULAR
     assert s.get_length() == 1
     s.append_code(0xD800)
-    assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
     assert s.get_length() == 2
 
     s = rutf8.Utf8StringBuilder()
-    s.append_utf8("abc", 3, rutf8.FLAG_ASCII)
-    assert s.get_flag() == rutf8.FLAG_ASCII
+    s.append_utf8("abc", 3)
     assert s.get_length() == 3
     assert s.build().decode("utf8") == u"abc"
 
-    s.append_utf8(u"\u1234".encode("utf8"), 1, rutf8.FLAG_REGULAR)
+    s.append_utf8(u"\u1234".encode("utf8"), 1)
     assert s.build().decode("utf8") == u"abc\u1234"
-    assert s.get_flag() == rutf8.FLAG_REGULAR
     assert s.get_length() == 4
 
     s.append_code(0xD800)
-    assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
     assert s.get_length() == 5
 
 @given(strategies.text())
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: remove flag handling from rutf8

Reply via email to