Author: Armin Rigo <ar...@tunes.org>
Branch:
Changeset: r96454:1f16a5e43952
Date: 2019-04-13 15:36 +0200
http://bitbucket.org/pypy/pypy/changeset/1f16a5e43952/
Log: Fix the general testing for newstr(utf8, length_in_number_of_chars), which *now* should work and complain if we give an invalid number of chars. Fix array.array for a place where invalid utf8 strings were still being made, found by the above. diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py --- a/pypy/module/array/interp_array.py +++ b/pypy/module/array/interp_array.py @@ -1053,21 +1053,17 @@ code = r_uint(ord(item)) # cpython will allow values > sys.maxunicode # while silently truncating the top bits - if code <= r_uint(0x7F): - # Encode ASCII - item = chr(code) - elif code <= r_uint(0x07FF): - item = (chr((0xc0 | (code >> 6))) + - chr((0x80 | (code & 0x3f)))) - elif code <= r_uint(0xFFFF): - item = (chr((0xe0 | (code >> 12))) + - chr((0x80 | ((code >> 6) & 0x3f))) + - chr((0x80 | (code & 0x3f)))) - else: - item = (chr((0xf0 | (code >> 18)) & 0xff) + - chr((0x80 | ((code >> 12) & 0x3f))) + - chr((0x80 | ((code >> 6) & 0x3f))) + - chr((0x80 | (code & 0x3f)))) + # For now I (arigo) am going to ignore that and + # raise a ValueError always here, instead of getting + # some invalid utf8-encoded string which makes things + # potentially explode left and right. 
+ try: + item = rutf8.unichr_as_utf8(code) + except rutf8.OutOfRange: + raise oefmt(space.w_ValueError, + "cannot operate on this array('u') because it contains" + " character %s not in range [U+0000; U+10ffff]" + " at index %d", 'U+%x' % code, idx) return space.newutf8(item, 1) assert 0, "unreachable" diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py --- a/pypy/module/array/test/test_array.py +++ b/pypy/module/array/test/test_array.py @@ -851,7 +851,13 @@ a = self.array('u', input_unicode) b = self.array('u', input_unicode) b.byteswap() - assert a != b + assert b[2] == u'\u0000' + raises(ValueError, "b[1]") # doesn't work + e = raises(ValueError, "a != b") # doesn't work + assert str(e.value) == ( + "cannot operate on this array('u') because it contains" + " character U+1000000 not in range [U+0000; U+10ffff]" + " at index 0") assert str(a) == "array('u', %r)" % (input_unicode,) assert str(b) == ("array('u', <character U+1000000 is not in" " range [U+0000; U+10ffff]>)") diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -42,13 +42,10 @@ self._length = length self._index_storage = rutf8.null_storage() if not we_are_translated(): - try: - # best effort, too expensive to handle surrogates - ulength = rutf8.codepoints_in_utf(utf8str) - except: - ulength = length - assert ulength == length - + # utf8str must always be a valid utf8 string, except maybe with + # explicit surrogate characters---which .decode('utf-8') doesn't + # special-case in Python 2, which is exactly what we want here + assert length == len(utf8str.decode('utf-8')) @staticmethod _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit