Author: Ronan Lamy <ronan.l...@gmail.com> Branch: unicode-utf8-test Changeset: r93344:1665df77270e Date: 2017-12-10 05:27 +0000 http://bitbucket.org/pypy/pypy/changeset/1665df77270e/
Log: hg merge unicode-utf8 diff --git a/TODO b/TODO --- a/TODO +++ b/TODO @@ -12,3 +12,4 @@ * improve performance of splitlines * fix _pypyjson to not use a wrapped dict when decoding an object +* make sure we review all the places that call ord(unichr) to check for ValueErrors \ No newline at end of file diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1098,22 +1098,19 @@ elif ch >= 0xE000 or allow_surrogates: _STORECHAR(result, ch, byteorder) else: - ru, newindex = errorhandler(errors, public_encoding_name, - 'surrogates not allowed', - s, pos-1, pos) - for j in range(newindex - index): - pos = rutf8.next_codepoint_pos(s, pos) - j = 0 - while j < len(ru): - ch = rutf8.codepoint_at_pos(ru, j) - if ord(ch) < 0xD800: - _STORECHAR(result, ord(ch), byteorder) + res_8, newindex = errorhandler( + errors, public_encoding_name, 'surrogates not allowed', + s, pos - 1, pos) + for cp in rutf8.Utf8StringIterator(res_8): + if cp < 0xD800: + _STORECHAR(result, cp, byteorder) else: errorhandler('strict', public_encoding_name, 'surrogates not allowed', s, pos-1, pos) - j = rutf8.next_codepoint_pos(ru, j) - index = newindex + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) continue pos = rutf8.next_codepoint_pos(s, pos) @@ -1282,22 +1279,19 @@ ch = rutf8.codepoint_at_pos(s, pos) pos = rutf8.next_codepoint_pos(s, pos) if not allow_surrogates and 0xD800 <= ch < 0xE000: - ru, newindex = errorhandler(errors, public_encoding_name, - 'surrogates not allowed', - s, pos-1, pos) - for j in range(newindex - index): - pos = rutf8.next_codepoint_pos(s, pos) - j = 0 - while j < len(ru): - ch = rutf8.codepoint_at_pos(ru, j) - if ord(ch) < 0xD800: - _STORECHAR32(result, ord(ch), byteorder) + res_8, newindex = errorhandler( + errors, public_encoding_name, 'surrogates not allowed', + s, pos - 1, pos) + for ch in 
rutf8.Utf8StringIterator(res_8): + if ch < 0xD800: + _STORECHAR32(result, ch, byteorder) else: - errorhandler('strict', public_encoding_name, - 'surrogates not allowed', - s, pos-1, pos) - j = rutf8.next_codepoint_pos(ru, j) - index = newindex + errorhandler( + 'strict', public_encoding_name, 'surrogates not allowed', + s, pos - 1, pos) + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) continue _STORECHAR32(result, ch, byteorder) index += 1 @@ -1425,8 +1419,7 @@ lgt = rutf8.check_utf8(r, True) return r, pos, lgt -def utf8_encode_charmap(s, errors, errorhandler=None, - mapping=None): +def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None): size = len(s) if mapping is None: return utf8_encode_latin_1(s, errors, errorhandler=errorhandler) @@ -1438,31 +1431,29 @@ index = 0 while pos < size: ch = rutf8.codepoint_at_pos(s, pos) - c = mapping.get(ch, '') if len(c) == 0: - # collect all unencodable chars. Important for narrow builds. - collend = rutf8.next_codepoint_pos(s, pos) - endindex = index + 1 - while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '': - collend = rutf8.next_codepoint_pos(s, collend) - endindex += 1 - rs, endindex = errorhandler(errors, "charmap", + # collect all unencodable chars. 
+ startindex = index + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + while (pos < size and + mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''): + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + res_8, newindex = errorhandler(errors, "charmap", "character maps to <undefined>", - s, index, endindex) - j = 0 - for _ in range(endindex - index): - ch2 = rutf8.codepoint_at_pos(rs, j) - ch2 = mapping.get(ch2, '') + s, startindex, index) + for cp2 in rutf8.Utf8StringIterator(res_8): + ch2 = mapping.get(cp2, '') if not ch2: errorhandler( - "strict", "charmap", - "character maps to <undefined>", - s, index, index + 1) + "strict", "charmap", "character maps to <undefined>", + s, startindex, index) result.append(ch2) - index += 1 - j = rutf8.next_codepoint_pos(rs, j) - pos = rutf8.next_codepoint_pos(s, pos) + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) continue result.append(c) index += 1 diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -537,8 +537,12 @@ assert '\xff'.decode('utf-7', 'ignore') == '' assert '\x00'.decode('unicode-internal', 'ignore') == '' - def test_backslahreplace(self): - assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000' + def test_backslashreplace(self): + sin = u"a\xac\u1234\u20ac\u8000\U0010ffff" + expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff" + assert sin.encode('ascii', 'backslashreplace') == expected + expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff" + assert sin.encode("iso-8859-15", "backslashreplace") == expected def test_badhandler(self): import codecs diff --git a/pypy/module/_locale/interp_locale.py b/pypy/module/_locale/interp_locale.py --- a/pypy/module/_locale/interp_locale.py +++ b/pypy/module/_locale/interp_locale.py @@ -133,10 +133,11 @@ rffi.free_charp(s1_c) 
rffi.free_charp(s2_c) - s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2) + s1, l1 = space.utf8_len_w(w_s1) + s2, l2 = space.utf8_len_w(w_s2) - s1_c = rffi.unicode2wcharp(s1) - s2_c = rffi.unicode2wcharp(s2) + s1_c = rffi.utf82wcharp(s1, l1) + s2_c = rffi.utf82wcharp(s2, l2) try: result = _wcscoll(s1_c, s2_c) finally: diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py --- a/pypy/module/_rawffi/alt/type_converter.py +++ b/pypy/module/_rawffi/alt/type_converter.py @@ -227,8 +227,8 @@ ucharval = self.get_char(w_ffitype) return space.newbytes(chr(ucharval)) elif w_ffitype.is_unichar(): - wcharval = self.get_unichar(w_ffitype) - return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1) + wcharval = r_uint(self.get_unichar(w_ffitype)) + return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1) elif w_ffitype.is_double(): return self._float(w_ffitype) elif w_ffitype.is_singlefloat(): diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py --- a/pypy/module/_rawffi/interp_rawffi.py +++ b/pypy/module/_rawffi/interp_rawffi.py @@ -448,7 +448,8 @@ elif c == 'c': return space.newbytes(func(add_arg, argdesc, ll_type)) elif c == 'u': - return space.newunicode(func(add_arg, argdesc, ll_type)) + return space.newutf8(rutf8.unichr_as_utf8( + ord(func(add_arg, argdesc, ll_type))), 1) elif c == 'f' or c == 'd' or c == 'g': return space.newfloat(float(func(add_arg, argdesc, ll_type))) else: @@ -596,10 +597,10 @@ return space.w_None wcharp_addr = rffi.cast(rffi.CWCHARP, address) if maxlength == -1: - s = rffi.wcharp2unicode(wcharp_addr) + s, lgt = rffi.wcharp2utf8(wcharp_addr) else: - s = rffi.wcharp2unicoden(wcharp_addr, maxlength) - return space.newunicode(s) + s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength) + return space.newutf8(s, lgt) @unwrap_spec(address=r_uint, maxlength=int) def charp2rawstring(space, address, maxlength=-1): @@ -612,8 +613,8 @@ def wcharp2rawunicode(space, address, 
maxlength=-1): if maxlength == -1: return wcharp2unicode(space, address) - s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength) - return space.newunicode(s) + s = rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, address), maxlength) + return space.newutf8(s, maxlength) @unwrap_spec(address=r_uint, newcontent='bufferstr') def rawstring2charp(space, address, newcontent): diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py --- a/rpython/annotator/unaryop.py +++ b/rpython/annotator/unaryop.py @@ -792,7 +792,7 @@ def ord(self): # warning, on 32-bit with 32-bit unichars, this might return # negative numbers - return SomeInteger() + return SomeInteger(nonneg=True) class __extend__(SomeIterator): diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -19,7 +19,7 @@ from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize from rpython.rlib.objectmodel import always_inline, dont_inline, try_inline from rpython.rlib.rstring import StringBuilder -from rpython.rlib import jit +from rpython.rlib import jit, types from rpython.rlib.signature import signature from rpython.rlib.types import char, none from rpython.rlib.rarithmetic import r_uint @@ -27,6 +27,8 @@ from rpython.rtyper.lltypesystem import lltype, rffi +# we need a way to accept both r_uint and int(nonneg=True) +#@signature(types.int_nonneg(), types.bool(), returns=types.str()) def unichr_as_utf8(code, allow_surrogates=False): """Encode code (numeric value) as utf8 encoded string """ @@ -437,7 +439,7 @@ low = codepoint_at_pos(utf8, i) if 0xDC00 <= low <= 0xDFFF: uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00) - i = next_codepoint_pos(utf8, i) + i = next_codepoint_pos(utf8, i) # else not really a surrogate pair, just append high else: i = next_codepoint_pos(utf8, i) @@ -535,6 +537,13 @@ else: return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos)) +def _pos_at_index(utf8, index): 
+ # Slow! + pos = 0 + for _ in range(index): + pos = next_codepoint_pos(utf8, pos) + return pos + @jit.dont_look_inside def codepoint_at_index(utf8, storage, index): """ Return codepoint of a character inside utf8 encoded string, given diff --git a/rpython/rlib/types.py b/rpython/rlib/types.py --- a/rpython/rlib/types.py +++ b/rpython/rlib/types.py @@ -26,6 +26,8 @@ def int(): return model.SomeInteger() +def int_nonneg(): + return model.SomeInteger(nonneg=True) def bool(): return model.SomeBool() diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py --- a/rpython/rtyper/lltypesystem/rffi.py +++ b/rpython/rtyper/lltypesystem/rffi.py @@ -1019,7 +1019,27 @@ s = StringBuilder(size) for i in range(size): rutf8.unichr_as_utf8_append(s, ord(w[i])) - return s.build() + return s.build() + +def wcharp2utf8(w): + from rpython.rlib import rutf8 + + s = rutf8.Utf8StringBuilder() + i = 0 + while ord(w[i]): + s.append_code(ord(w[i])) + i += 1 + return s.build(), i + +def wcharp2utf8n(w, maxlen): + from rpython.rlib import rutf8 + + s = rutf8.Utf8StringBuilder(maxlen) + i = 0 + while i < maxlen and ord(w[i]): + s.append_code(ord(w[i])) + i += 1 + return s.build(), i def utf82wcharp(utf8, utf8len): from rpython.rlib import rutf8 _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit