Author: fijal
Branch: unicode-utf8
Changeset: r93296:1a4e2f08f746
Date: 2017-12-07 17:02 +0200
http://bitbucket.org/pypy/pypy/changeset/1a4e2f08f746/
Log: remove flag handling from rutf8
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -328,18 +328,18 @@
def check_utf8(s, allow_surrogates, start=0, stop=-1):
"""Check that 's' is a utf-8-encoded byte string.
- Returns the length (number of chars) and flag or raise CheckError.
+ Returns the length (number of chars) or raise CheckError.
If allow_surrogates is False, then also raise if we see any.
Note also codepoints_in_utf8(), which also computes the length
faster by assuming that 's' is valid utf-8.
"""
- res, flag = _check_utf8(s, allow_surrogates, start, stop)
+ res = _check_utf8(s, allow_surrogates, start, stop)
if res >= 0:
- return res, flag
+ return res
raise CheckError(~res)
-def get_utf8_length_flag(s, start=0, end=-1):
- """ Get the length and flag out of valid utf8. For now just calls
check_utf8
+def get_utf8_length(s, start=0, end=-1):
+ """ Get the length out of valid utf8. For now just calls check_utf8
"""
return check_utf8(s, True, start, end)
@@ -347,7 +347,6 @@
def _check_utf8(s, allow_surrogates, start, stop):
pos = start
continuation_bytes = 0
- flag = FLAG_ASCII
if stop < 0:
end = len(s)
else:
@@ -359,44 +358,39 @@
if ordch1 <= 0x7F:
continue
- if flag == FLAG_ASCII:
- flag = FLAG_REGULAR
-
if ordch1 <= 0xC1:
- return ~(pos - 1), 0
+ return ~(pos - 1)
if ordch1 <= 0xDF:
if pos >= end:
- return ~(pos - 1), 0
+ return ~(pos - 1)
ordch2 = ord(s[pos])
pos += 1
if _invalid_byte_2_of_2(ordch2):
- return ~(pos - 2), 0
+ return ~(pos - 2)
# 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
continuation_bytes += 1
continue
if ordch1 <= 0xEF:
if (pos + 2) > end:
- return ~(pos - 1), 0
+ return ~(pos - 1)
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
pos += 2
if (_invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates) or
_invalid_byte_3_of_3(ordch3)):
- return ~(pos - 3), 0
+ return ~(pos - 3)
- if allow_surrogates and _surrogate_bytes(ordch1, ordch2):
- flag = FLAG_HAS_SURROGATES
# 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
continuation_bytes += 2
continue
if ordch1 <= 0xF4:
if (pos + 3) > end:
- return ~(pos - 1), 0
+ return ~(pos - 1)
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
ordch4 = ord(s[pos + 2])
@@ -405,16 +399,16 @@
if (_invalid_byte_2_of_4(ordch1, ordch2) or
_invalid_byte_3_of_4(ordch3) or
_invalid_byte_4_of_4(ordch4)):
- return ~(pos - 4), 0
+ return ~(pos - 4)
# 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
continuation_bytes += 3
continue
- return ~(pos - 1), 0
+ return ~(pos - 1)
assert pos == end
assert pos - continuation_bytes >= 0
- return pos - continuation_bytes, flag
+ return pos - continuation_bytes
def reencode_utf8_with_surrogates(utf8):
""" Receiving valid UTF8 which contains surrogates, combine surrogate
@@ -472,47 +466,14 @@
return False
-UTF8_INDEX_STORAGE = lltype.GcStruct('utf8_loc',
- ('flag', lltype.Signed),
- ('contents', lltype.Ptr(lltype.GcArray(lltype.Struct('utf8_loc_elem',
+UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct('utf8_loc_elem',
('baseindex', lltype.Signed),
('ofs', lltype.FixedSizeArray(lltype.Char, 16)),
- )))))
-
-def get_flag_from_code(oc):
- assert isinstance(oc, int)
- if oc <= 0x7F:
- return FLAG_ASCII
- if 0xD800 <= oc <= 0xDFFF:
- return FLAG_HAS_SURROGATES
- return FLAG_REGULAR
-
-def combine_flags(one, two):
- return one | two
-
-FLAG_ASCII = 0 # no bits
-FLAG_REGULAR = 1 # bit 0
-FLAG_HAS_SURROGATES = 3 # bit 0 and bit 1
-# note that we never need index storage if we're pure ascii, but it's useful
-# for passing into W_UnicodeObject.__init__
-
-#ASCII_INDEX_STORAGE_BLOCKS = 5
-#ASCII_INDEX_STORAGE = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO,
-# ASCII_INDEX_STORAGE_BLOCKS,
-# immortal=True)
-#for _i in range(ASCII_INDEX_STORAGE_BLOCKS):
-# ASCII_INDEX_STORAGE[_i].baseindex = _i * 64
-# for _j in range(16):
-# ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1)
+ ))
def null_storage():
return lltype.nullptr(UTF8_INDEX_STORAGE)
-UTF8_IS_ASCII = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True)
-UTF8_IS_ASCII.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO)
-UTF8_HAS_SURROGATES = lltype.malloc(UTF8_INDEX_STORAGE, immortal=True)
-UTF8_HAS_SURROGATES.contents = lltype.nullptr(UTF8_INDEX_STORAGE.contents.TO)
-
def create_utf8_index_storage(utf8, utf8len):
""" Create an index storage which stores index of each 4th character
in utf8 encoded unicode string.
@@ -520,23 +481,21 @@
# if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64:
# return ASCII_INDEX_STORAGE
arraysize = utf8len // 64 + 1
- storage = lltype.malloc(UTF8_INDEX_STORAGE)
- contents = lltype.malloc(UTF8_INDEX_STORAGE.contents.TO, arraysize)
- storage.contents = contents
+ storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize)
baseindex = 0
current = 0
while True:
- contents[current].baseindex = baseindex
+ storage[current].baseindex = baseindex
next = baseindex
for i in range(16):
if utf8len == 0:
next += 1 # assume there is an extra '\x00' character
else:
next = next_codepoint_pos(utf8, next)
- contents[current].ofs[i] = chr(next - baseindex)
+ storage[current].ofs[i] = chr(next - baseindex)
utf8len -= 4
if utf8len < 0:
- assert current + 1 == len(contents)
+ assert current + 1 == len(storage)
break
next = next_codepoint_pos(utf8, next)
next = next_codepoint_pos(utf8, next)
@@ -556,8 +515,8 @@
this function.
"""
current = index >> 6
- ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F])
- bytepos = storage.contents[current].baseindex + ofs
+ ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
+ bytepos = storage[current].baseindex + ofs
index &= 0x3
if index == 0:
return prev_codepoint_pos(utf8, bytepos)
@@ -575,8 +534,8 @@
storage of type UTF8_INDEX_STORAGE
"""
current = index >> 6
- ofs = ord(storage.contents[current].ofs[(index >> 2) & 0x0F])
- bytepos = storage.contents[current].baseindex + ofs
+ ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
+ bytepos = storage[current].baseindex + ofs
index &= 0x3
if index == 0:
return codepoint_before_pos(utf8, bytepos)
@@ -596,15 +555,15 @@
is not tiny either.
"""
index_min = 0
- index_max = len(storage.contents) - 1
+ index_max = len(storage) - 1
while index_min < index_max:
index_middle = (index_min + index_max + 1) // 2
- base_bytepos = storage.contents[index_middle].baseindex
+ base_bytepos = storage[index_middle].baseindex
if bytepos < base_bytepos:
index_max = index_middle - 1
else:
index_min = index_middle
- bytepos1 = storage.contents[index_min].baseindex
+ bytepos1 = storage[index_min].baseindex
result = index_min << 6
while bytepos1 < bytepos:
bytepos1 = next_codepoint_pos(utf8, bytepos1)
@@ -713,22 +672,19 @@
def __init__(self, size=0):
self._s = StringBuilder(size)
self._lgt = 0
- self._flag = FLAG_ASCII
@always_inline
def append(self, s):
# for strings
self._s.append(s)
- newlgt, newflag = get_utf8_length_flag(s)
+ newlgt = get_utf8_length(s)
self._lgt += newlgt
- self._flag = combine_flags(self._flag, newflag)
@always_inline
def append_slice(self, s, start, end):
self._s.append_slice(s, start, end)
- newlgt, newflag = get_utf8_length_flag(s, start, end)
+ newlgt = get_utf8_length(s, start, end)
self._lgt += newlgt
- self._flag = combine_flags(self._flag, newflag)
@signature(char(), returns=none())
@always_inline
@@ -739,13 +695,11 @@
@try_inline
def append_code(self, code):
- self._flag = combine_flags(self._flag, get_flag_from_code(code))
self._lgt += 1
unichr_as_utf8_append(self._s, code, True)
@always_inline
- def append_utf8(self, utf8, length, flag):
- self._flag = combine_flags(self._flag, flag)
+ def append_utf8(self, utf8, length):
self._lgt += length
self._s.append(utf8)
@@ -754,10 +708,6 @@
return self._s.build()
@always_inline
- def get_flag(self):
- return self._flag
-
- @always_inline
def get_length(self):
return self._lgt
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -38,30 +38,25 @@
def test_check_utf8_valid(u, allow_surrogates):
_test_check_utf8(u.encode('utf-8'), allow_surrogates)
+def _has_surrogates(s):
+ for u in s.decode('utf8'):
+ if 0xD800 <= ord(u) <= 0xDFFF:
+ return True
+ return False
+
def _test_check_utf8(s, allow_surrogates):
- def _has_surrogates(s):
- for u in s.decode('utf8'):
- if 0xD800 <= ord(u) <= 0xDFFF:
- return True
- return False
-
try:
u, _ = runicode.str_decode_utf_8(s, len(s), None, final=True,
allow_surrogates=allow_surrogates)
valid = True
except UnicodeDecodeError as e:
valid = False
- length, flag = rutf8._check_utf8(s, allow_surrogates, 0, len(s))
+ length = rutf8._check_utf8(s, allow_surrogates, 0, len(s))
if length < 0:
assert not valid
assert ~(length) == e.start
else:
assert valid
- if flag == rutf8.FLAG_ASCII:
- s.decode('ascii') # assert did not raise
- elif flag == rutf8.FLAG_HAS_SURROGATES:
- assert allow_surrogates
- assert _has_surrogates(s)
if sys.maxunicode == 0x10FFFF or not _has_surrogates(s):
assert length == len(u)
@@ -155,60 +150,45 @@
assert result == expected
@given(strategies.lists(strategies.characters()))
-def test_get_utf8_length_flag(unichars):
+def test_get_utf8_length(unichars):
u = u''.join(unichars)
exp_lgt = len(u)
- exp_flag = rutf8.FLAG_ASCII
- for c in u:
- if ord(c) > 0x7F:
- exp_flag = rutf8.FLAG_REGULAR
- if 0xD800 <= ord(c) <= 0xDFFF:
- exp_flag = rutf8.FLAG_HAS_SURROGATES
- break
- lgt, flag = rutf8.get_utf8_length_flag(''.join([c.encode('utf8') for c in
u]))
- if exp_flag != rutf8.FLAG_HAS_SURROGATES or sys.maxunicode > 0xffff:
+ s = ''.join([c.encode('utf8') for c in u])
+ lgt = rutf8.get_utf8_length(s)
+ if not _has_surrogates(s) or sys.maxunicode > 0xffff:
assert lgt == exp_lgt
- assert flag == exp_flag
def test_utf8_string_builder():
s = rutf8.Utf8StringBuilder()
s.append("foo")
s.append_char("x")
- assert s.get_flag() == rutf8.FLAG_ASCII
assert s.get_length() == 4
assert s.build() == "foox"
s.append(u"\u1234".encode("utf8"))
- assert s.get_flag() == rutf8.FLAG_REGULAR
assert s.get_length() == 5
assert s.build().decode("utf8") == u"foox\u1234"
s.append("foo")
s.append_char("x")
- assert s.get_flag() == rutf8.FLAG_REGULAR
assert s.get_length() == 9
assert s.build().decode("utf8") == u"foox\u1234foox"
s = rutf8.Utf8StringBuilder()
s.append_code(0x1234)
assert s.build().decode("utf8") == u"\u1234"
- assert s.get_flag() == rutf8.FLAG_REGULAR
assert s.get_length() == 1
s.append_code(0xD800)
- assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
assert s.get_length() == 2
s = rutf8.Utf8StringBuilder()
- s.append_utf8("abc", 3, rutf8.FLAG_ASCII)
- assert s.get_flag() == rutf8.FLAG_ASCII
+ s.append_utf8("abc", 3)
assert s.get_length() == 3
assert s.build().decode("utf8") == u"abc"
- s.append_utf8(u"\u1234".encode("utf8"), 1, rutf8.FLAG_REGULAR)
+ s.append_utf8(u"\u1234".encode("utf8"), 1)
assert s.build().decode("utf8") == u"abc\u1234"
- assert s.get_flag() == rutf8.FLAG_REGULAR
assert s.get_length() == 4
s.append_code(0xD800)
- assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
assert s.get_length() == 5
@given(strategies.text())
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit