Author: fijal Branch: unicode-utf8 Changeset: r93296:1a4e2f08f746 Date: 2017-12-07 17:02 +0200 http://bitbucket.org/pypy/pypy/changeset/1a4e2f08f746/
Log: remove flag handling from rutf8 diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -328,18 +328,18 @@ def check_utf8(s, allow_surrogates, start=0, stop=-1): """Check that 's' is a utf-8-encoded byte string. - Returns the length (number of chars) and flag or raise CheckError. + Returns the length (number of chars) or raise CheckError. If allow_surrogates is False, then also raise if we see any. Note also codepoints_in_utf8(), which also computes the length faster by assuming that 's' is valid utf-8. """ - res, flag = _check_utf8(s, allow_surrogates, start, stop) + res = _check_utf8(s, allow_surrogates, start, stop) if res >= 0: - return res, flag + return res raise CheckError(~res) -def get_utf8_length_flag(s, start=0, end=-1): - """ Get the length and flag out of valid utf8. For now just calls check_utf8 +def get_utf8_length(s, start=0, end=-1): + """ Get the length out of valid utf8. For now just calls check_utf8 """ return check_utf8(s, True, start, end) @@ -347,7 +347,6 @@ def _check_utf8(s, allow_surrogates, start, stop): pos = start continuation_bytes = 0 - flag = FLAG_ASCII if stop < 0: end = len(s) else: @@ -359,44 +358,39 @@ if ordch1 <= 0x7F: continue - if flag == FLAG_ASCII: - flag = FLAG_REGULAR - if ordch1 <= 0xC1: - return ~(pos - 1), 0 + return ~(pos - 1) if ordch1 <= 0xDF: if pos >= end: - return ~(pos - 1), 0 + return ~(pos - 1) ordch2 = ord(s[pos]) pos += 1 if _invalid_byte_2_of_2(ordch2): - return ~(pos - 2), 0 + return ~(pos - 2) # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz continuation_bytes += 1 continue if ordch1 <= 0xEF: if (pos + 2) > end: - return ~(pos - 1), 0 + return ~(pos - 1) ordch2 = ord(s[pos]) ordch3 = ord(s[pos + 1]) pos += 2 if (_invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates) or _invalid_byte_3_of_3(ordch3)): - return ~(pos - 3), 0 + return ~(pos - 3) - if allow_surrogates and _surrogate_bytes(ordch1, ordch2): - flag = FLAG_HAS_SURROGATES # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz continuation_bytes += 2 continue if ordch1 <= 0xF4: if (pos + 3) > end: - return ~(pos - 1), 0 + return ~(pos - 1) ordch2 = ord(s[pos]) ordch3 = ord(s[pos + 1]) ordch4 = ord(s[pos + 2]) @@ -405,16 +399,16 @@ if (_invalid_byte_2_of_4(ordch1, ordch2) or _invalid_byte_3_of_4(ordch3) or _invalid_byte_4_of_4(ordch4)): - return ~(pos - 4), 0 + return ~(pos - 4) # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz continuation_bytes += 3 continue - return ~(pos - 1), 0 + return ~(pos - 1) assert pos == end assert pos - continuation_bytes >= 0 - return pos - continuation_bytes, flag + return pos - continuation_bytes def reencode_utf8_with_surrogates(utf8): """ Receiving valid UTF8 which contains surrogates, combine surrogate @@ -472,47 +466,14 @@ return False -UTF8_INDEX_STORAGE = lltype.GcStruct('utf8_loc', - ('flag', lltype.Signed), - ('contents', lltype.Ptr(lltype.GcArray(lltype.Struct('utf8_loc_elem', +UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct('utf8_loc_elem', ('baseindex', lltype.Signed), ('ofs', lltype.FixedSizeArray(lltype.Char, 16)), - ))))) - -def get_flag_from_code(oc): - assert isinstance(oc, int) - if oc <= 0x7F: - return FLAG_ASCII - if 0xD800 <= oc <= 0xDFFF: - return FLAG_HAS_SURROGATES - return FLAG_REGULAR - -def combine_flags(one, two): - return one | two - -FLAG_ASCII = 0 # no bits -FLAG_REGULAR = 1 # bit 0 -FLAG_HAS_SURROGATES = 3 # bit 0 and bit 1 -# note that we never need index storage if we're pure ascii, but it's useful -# for passing into W_UnicodeObject.__init__ - -#ASCII_INDEX_STORAGE_BLOCKS = 5 -#ASCII_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO, -# ASCII_INDEX_STORAGE_BLOCKS, -# immortal=True) -#for _i in range(ASCII_INDEX_STORAGE_BLOCKS): -# ASCII_INDEX_STORAGE[_i].baseindex = _i * 64 -# for _j in range(16): -# ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1) + )) def null_storage(): return lltype.nullptr(UTF8_INDEX_STORAGE) -UTF8_IS_ASCII = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True) -UTF8_IS_ASCII.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO) -UTF8_HAS_SURROGATES = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True) -UTF8_HAS_SURROGATES.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO) - def create_utf8_index_storage(utf8, utf8len): """ Create an index storage which stores index of each 4th character in utf8 encoded unicode string. @@ -520,23 +481,21 @@ # if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64: # return ASCII_INDEX_STORAGE arraysize = utf8len // 64 + 1 - storage = lltype.malloc(UTF8_INDEX_STORAGE) - contents = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO, arraysize) - storage.contents = contents + storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize) baseindex = 0 current = 0 while True: - contents[current].baseindex = baseindex + storage[current].baseindex = baseindex next = baseindex for i in range(16): if utf8len == 0: next += 1 # assume there is an extra '\x00' character else: next = next_codepoint_pos(utf8, next) - contents[current].ofs[i] = chr(next - baseindex) + storage[current].ofs[i] = chr(next - baseindex) utf8len -= 4 if utf8len < 0: - assert current + 1 == len(contents) + assert current + 1 == len(storage) break next = next_codepoint_pos(utf8, next) next = next_codepoint_pos(utf8, next) @@ -556,8 +515,8 @@ this function. """ current = index >> 6 - ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F]) - bytepos = storage.contents[current].baseindex + ofs + ofs = ord(storage[current].ofs[(index >> 2) & 0x0F]) + bytepos = storage[current].baseindex + ofs index &= 0x3 if index == 0: return prev_codepoint_pos(utf8, bytepos) @@ -575,8 +534,8 @@ storage of type UTF8_INDEX_STORAGE """ current = index >> 6 - ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F]) - bytepos = storage.contents[current].baseindex + ofs + ofs = ord(storage[current].ofs[(index >> 2) & 0x0F]) + bytepos = storage[current].baseindex + ofs index &= 0x3 if index == 0: return codepoint_before_pos(utf8, bytepos) @@ -596,15 +555,15 @@ is not tiny either. """ index_min = 0 - index_max = len(storage.contents) - 1 + index_max = len(storage) - 1 while index_min < index_max: index_middle = (index_min + index_max + 1) // 2 - base_bytepos = storage.contents[index_middle].baseindex + base_bytepos = storage[index_middle].baseindex if bytepos < base_bytepos: index_max = index_middle - 1 else: index_min = index_middle - bytepos1 = storage.contents[index_min].baseindex + bytepos1 = storage[index_min].baseindex result = index_min << 6 while bytepos1 < bytepos: bytepos1 = next_codepoint_pos(utf8, bytepos1) @@ -713,22 +672,19 @@ def __init__(self, size=0): self._s = StringBuilder(size) self._lgt = 0 - self._flag = FLAG_ASCII @always_inline def append(self, s): # for strings self._s.append(s) - newlgt, newflag = get_utf8_length_flag(s) + newlgt = get_utf8_length(s) self._lgt += newlgt - self._flag = combine_flags(self._flag, newflag) @always_inline def append_slice(self, s, start, end): self._s.append_slice(s, start, end) - newlgt, newflag = get_utf8_length_flag(s, start, end) + newlgt = get_utf8_length(s, start, end) self._lgt += newlgt - self._flag = combine_flags(self._flag, newflag) @signature(char(), returns=none()) @always_inline @@ -739,13 +695,11 @@ @try_inline def append_code(self, code): - self._flag = combine_flags(self._flag, get_flag_from_code(code)) self._lgt += 1 unichr_as_utf8_append(self._s, code, True) @always_inline - def append_utf8(self, utf8, length, flag): - self._flag = combine_flags(self._flag, flag) + def append_utf8(self, utf8, length): self._lgt += length self._s.append(utf8) @@ -754,10 +708,6 @@ return self._s.build() @always_inline - def get_flag(self): - return self._flag - - @always_inline def get_length(self): return self._lgt diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py --- a/rpython/rlib/test/test_rutf8.py +++ b/rpython/rlib/test/test_rutf8.py @@ -38,30 +38,25 @@ def test_check_utf8_valid(u, allow_surrogates): _test_check_utf8(u.encode('utf-8'), allow_surrogates) +def _has_surrogates(s): + for u in s.decode('utf8'): + if 0xD800 <= ord(u) <= 0xDFFF: + return True + return False + def _test_check_utf8(s, allow_surrogates): - def _has_surrogates(s): - for u in s.decode('utf8'): - if 0xD800 <= ord(u) <= 0xDFFF: - return True - return False - try: u, _ = runicode.str_decode_utf_8(s, len(s), None, final=True, allow_surrogates=allow_surrogates) valid = True except UnicodeDecodeError as e: valid = False - length, flag = rutf8._check_utf8(s, allow_surrogates, 0, len(s)) + length = rutf8._check_utf8(s, allow_surrogates, 0, len(s)) if length < 0: assert not valid assert ~(length) == e.start else: assert valid - if flag == rutf8.FLAG_ASCII: - s.decode('ascii') # assert did not raise - elif flag == rutf8.FLAG_HAS_SURROGATES: - assert allow_surrogates - assert _has_surrogates(s) if sys.maxunicode == 0x10FFFF or not _has_surrogates(s): assert length == len(u) @@ -155,60 +150,45 @@ assert result == expected @given(strategies.lists(strategies.characters())) -def test_get_utf8_length_flag(unichars): +def test_get_utf8_length(unichars): u = u''.join(unichars) exp_lgt = len(u) - exp_flag = rutf8.FLAG_ASCII - for c in u: - if ord(c) > 0x7F: - exp_flag = rutf8.FLAG_REGULAR - if 0xD800 <= ord(c) <= 0xDFFF: - exp_flag = rutf8.FLAG_HAS_SURROGATES - break - lgt, flag = rutf8.get_utf8_length_flag(''.join([c.encode('utf8') for c in u])) - if exp_flag != rutf8.FLAG_HAS_SURROGATES or sys.maxunicode > 0xffff: + s = ''.join([c.encode('utf8') for c in u]) + lgt = rutf8.get_utf8_length(s) + if not _has_surrogates(s) or sys.maxunicode > 0xffff: assert lgt == exp_lgt - assert flag == exp_flag def test_utf8_string_builder(): s = rutf8.Utf8StringBuilder() s.append("foo") s.append_char("x") - assert s.get_flag() == rutf8.FLAG_ASCII assert s.get_length() == 4 assert s.build() == "foox" s.append(u"\u1234".encode("utf8")) - assert s.get_flag() == rutf8.FLAG_REGULAR assert s.get_length() == 5 assert s.build().decode("utf8") == u"foox\u1234" s.append("foo") s.append_char("x") - assert s.get_flag() == rutf8.FLAG_REGULAR assert s.get_length() == 9 assert s.build().decode("utf8") == u"foox\u1234foox" s = rutf8.Utf8StringBuilder() s.append_code(0x1234) assert s.build().decode("utf8") == u"\u1234" - assert s.get_flag() == rutf8.FLAG_REGULAR assert s.get_length() == 1 s.append_code(0xD800) - assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES assert s.get_length() == 2 s = rutf8.Utf8StringBuilder() - s.append_utf8("abc", 3, rutf8.FLAG_ASCII) - assert s.get_flag() == rutf8.FLAG_ASCII + s.append_utf8("abc", 3) assert s.get_length() == 3 assert s.build().decode("utf8") == u"abc" - s.append_utf8(u"\u1234".encode("utf8"), 1, rutf8.FLAG_REGULAR) + s.append_utf8(u"\u1234".encode("utf8"), 1) assert s.build().decode("utf8") == u"abc\u1234" - assert s.get_flag() == rutf8.FLAG_REGULAR assert s.get_length() == 4 s.append_code(0xD800) - assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES assert s.get_length() == 5 @given(strategies.text()) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit