Author: Armin Rigo <ar...@tunes.org> Branch: unicode-utf8-re Changeset: r93347:58b6fedc39bc Date: 2017-12-10 08:27 +0100 http://bitbucket.org/pypy/pypy/changeset/58b6fedc39bc/
Log: hg merge unicode-utf8 diff --git a/TODO b/TODO --- a/TODO +++ b/TODO @@ -12,3 +12,4 @@ * improve performance of splitlines * fix _pypyjson to not use a wrapped dict when decoding an object +* make sure we review all the places that call ord(unichr) to check for ValueErrors \ No newline at end of file diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -1,3 +1,4 @@ +import pytest from hypothesis import given, strategies from rpython.rlib import rutf8 @@ -5,6 +6,7 @@ from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii from pypy.interpreter import unicodehelper as uh +from pypy.module._codecs.interp_codecs import CodecState def decode_utf8(u): return str_decode_utf8(u, True, "strict", None) @@ -68,3 +70,16 @@ def test_unicode_escape(u): r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None) assert r == u.encode("unicode-escape") + +def test_encode_decimal(space): + assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 ' + with pytest.raises(ValueError): + uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None) + state = space.fromcache(CodecState) + handler = state.encode_error_handler + assert uh.unicode_encode_decimal( + u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v' + + result = uh.unicode_encode_decimal( + u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler) + assert result == '12ሴ' diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1,11 +1,13 @@ import sys -from pypy.interpreter.error import OperationError +from pypy.interpreter.error import OperationError, oefmt from rpython.rlib.objectmodel import specialize from rpython.rlib import rutf8 from rpython.rlib.rarithmetic import r_uint, intmask from rpython.rlib.rstring import StringBuilder +from rpython.rtyper.lltypesystem import rffi from pypy.module._codecs import interp_codecs +from pypy.module.unicodedata import unicodedb @specialize.memo() def decode_error_handler(space): @@ -34,6 +36,16 @@ space.newtext(msg)])) return raise_unicode_exception_encode +def default_error_encode( + errors, encoding, msg, u, startingpos, endingpos): + """A default handler, for tests""" + assert endingpos >= 0 + if errors == 'replace': + return '?', endingpos + if errors == 'ignore': + return '', endingpos + raise ValueError + def convert_arg_to_w_unicode(space, w_arg, strict=None): return space.convert_arg_to_w_unicode(w_arg) @@ -204,7 +216,7 @@ if c > 0x7F: errorhandler("strict", 'ascii', 'ordinal not in range(128)', utf8, - pos, pos + 1) + pos, pos + 1) j = rutf8.next_codepoint_pos(r, j) pos = newpos res.append(r) @@ -530,6 +542,19 @@ return builder.build(), pos, outsize +def wcharpsize2utf8(space, wcharp, size): + """Safe version of rffi.wcharpsize2utf8. + + Raises app-level ValueError if any wchar value is outside the valid + codepoint range. + """ + try: + return rffi.wcharpsize2utf8(wcharp, size) + except ValueError: + raise oefmt(space.w_ValueError, + "character is not in range [U+0000; U+10ffff]") + + # ____________________________________________________________ # Raw unicode escape @@ -575,8 +600,8 @@ digits = 4 if s[pos] == 'u' else 8 message = "truncated \\uXXXX" pos += 1 - pos, _, _ = hexescape(result, s, pos, digits, - "rawunicodeescape", errorhandler, message, errors) + pos, _ = hexescape(result, s, pos, digits, + "rawunicodeescape", errorhandler, message, errors) r = result.build() lgt = rutf8.check_utf8(r, True) @@ -1073,22 +1098,19 @@ elif ch >= 0xE000 or allow_surrogates: _STORECHAR(result, ch, byteorder) else: - ru, newindex = errorhandler(errors, public_encoding_name, - 'surrogates not allowed', - s, pos-1, pos) - for j in range(newindex - index): - pos = rutf8.next_codepoint_pos(s, pos) - j = 0 - while j < len(ru): - ch = rutf8.codepoint_at_pos(ru, j) - if ord(ch) < 0xD800: - _STORECHAR(result, ord(ch), byteorder) + res_8, newindex = errorhandler( + errors, public_encoding_name, 'surrogates not allowed', + s, pos - 1, pos) + for cp in rutf8.Utf8StringIterator(res_8): + if cp < 0xD800: + _STORECHAR(result, cp, byteorder) else: errorhandler('strict', public_encoding_name, 'surrogates not allowed', s, pos-1, pos) - j = rutf8.next_codepoint_pos(ru, j) - index = newindex + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) continue pos = rutf8.next_codepoint_pos(s, pos) @@ -1257,22 +1279,19 @@ ch = rutf8.codepoint_at_pos(s, pos) pos = rutf8.next_codepoint_pos(s, pos) if not allow_surrogates and 0xD800 <= ch < 0xE000: - ru, newindex = errorhandler(errors, public_encoding_name, - 'surrogates not allowed', - s, pos-1, pos) - for j in range(newindex - index): - pos = rutf8.next_codepoint_pos(s, pos) - j = 0 - while j < len(ru): - ch = rutf8.codepoint_at_pos(ru, j) - if ord(ch) < 0xD800: - _STORECHAR32(result, ord(ch), byteorder) + res_8, newindex = errorhandler( + errors, public_encoding_name, 'surrogates not allowed', + s, pos - 1, pos) + for ch in rutf8.Utf8StringIterator(res_8): + if ch < 0xD800: + _STORECHAR32(result, ch, byteorder) else: - errorhandler('strict', public_encoding_name, - 'surrogates not allowed', - s, pos-1, pos) - j = rutf8.next_codepoint_pos(ru, j) - index = newindex + errorhandler( + 'strict', public_encoding_name, 'surrogates not allowed', + s, pos - 1, pos) + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) continue _STORECHAR32(result, ch, byteorder) index += 1 @@ -1400,8 +1419,7 @@ lgt = rutf8.check_utf8(r, True) return r, pos, lgt -def utf8_encode_charmap(s, errors, errorhandler=None, - mapping=None): +def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None): size = len(s) if mapping is None: return utf8_encode_latin_1(s, errors, errorhandler=errorhandler) @@ -1413,34 +1431,99 @@ index = 0 while pos < size: ch = rutf8.codepoint_at_pos(s, pos) - c = mapping.get(ch, '') if len(c) == 0: - # collect all unencodable chars. Important for narrow builds. - collend = rutf8.next_codepoint_pos(s, pos) - endindex = index + 1 - while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '': - collend = rutf8.next_codepoint_pos(s, collend) - endindex += 1 - rs, endindex = errorhandler(errors, "charmap", + # collect all unencodable chars. + startindex = index + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + while (pos < size and + mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''): + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + res_8, newindex = errorhandler(errors, "charmap", "character maps to <undefined>", - s, index, endindex) - j = 0 - for _ in range(endindex - index): - ch2 = rutf8.codepoint_at_pos(rs, j) - ch2 = mapping.get(ch2, '') + s, startindex, index) + for cp2 in rutf8.Utf8StringIterator(res_8): + ch2 = mapping.get(cp2, '') if not ch2: errorhandler( - "strict", "charmap", - "character maps to <undefined>", - s, index, index + 1) + "strict", "charmap", "character maps to <undefined>", + s, startindex, index) result.append(ch2) - index += 1 - j = rutf8.next_codepoint_pos(rs, j) - pos = rutf8.next_codepoint_pos(s, pos) + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) continue result.append(c) index += 1 pos = rutf8.next_codepoint_pos(s, pos) return result.build() +# ____________________________________________________________ +# Decimal Encoder +def unicode_encode_decimal(s, errors, errorhandler=None): + """Converts whitespace to ' ', decimal characters to their + corresponding ASCII digit and all other Latin-1 characters except + \0 as-is. Characters outside this range (Unicode ordinals 1-256) + are treated as errors. This includes embedded NULL bytes. + """ + if errorhandler is None: + errorhandler = default_error_encode + result = StringBuilder(len(s)) + pos = 0 + i = 0 + it = rutf8.Utf8StringIterator(s) + for ch in it: + if unicodedb.isspace(ch): + result.append(' ') + i += 1 + continue + try: + decimal = unicodedb.decimal(ch) + except KeyError: + pass + else: + result.append(chr(48 + decimal)) + i += 1 + continue + if 0 < ch < 256: + result.append(chr(ch)) + i += 1 + continue + # All other characters are considered unencodable + start_index = i + i += 1 + while not it.done(): + ch = rutf8.codepoint_at_pos(s, it.get_pos()) + try: + if (0 < ch < 256 or unicodedb.isspace(ch) or + unicodedb.decimal(ch) >= 0): + break + except KeyError: + # not a decimal + pass + if it.done(): + break + ch = next(it) + i += 1 + end_index = i + msg = "invalid decimal Unicode string" + r, pos = errorhandler( + errors, 'decimal', msg, s, start_index, end_index) + for ch in rutf8.Utf8StringIterator(r): + if unicodedb.isspace(ch): + result.append(' ') + continue + try: + decimal = unicodedb.decimal(ch) + except KeyError: + pass + else: + result.append(chr(48 + decimal)) + continue + if 0 < ch < 256: + result.append(chr(ch)) + continue + errorhandler('strict', 'decimal', msg, s, start_index, end_index) + return result.build() diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -70,9 +70,6 @@ raise oefmt(space.w_IndexError, "position %d from error handler out of bounds", newpos) - if newpos < startpos: - raise oefmt(space.w_IndexError, - "position %d from error handler did not progress", newpos) w_replace = space.convert_to_w_unicode(w_replace) return w_replace._utf8, newpos return call_errorhandler @@ -226,7 +223,7 @@ w_end = space.getattr(w_exc, space.newtext('end')) end = space.int_w(w_end) start = w_obj._index_to_byte(start) - end = w_obj._index_to_byte(end) + end = w_obj._index_to_byte(end) builder = StringBuilder() pos = start obj = w_obj._utf8 @@ -460,22 +457,12 @@ # utf-8 functions are not regular, because we have to pass # "allow_surrogates=True" -@unwrap_spec(utf8='utf8', errors='text_or_none') -def utf_8_encode(space, utf8, errors="strict"): - length, _ = rutf8.check_utf8(utf8, allow_surrogates=True) - return space.newtuple([space.newbytes(utf8), space.newint(length)]) -#@unwrap_spec(uni=unicode, errors='text_or_none') -#def utf_8_encode(space, uni, errors="strict"): -# if errors is None: -# errors = 'strict' -# state = space.fromcache(CodecState) -# # NB. can't call unicode_encode_utf_8() directly because that's -# # an @elidable function nowadays. Instead, we need the _impl(). -# # (The problem is the errorhandler, which calls arbitrary Python.) -# result = runicode.unicode_encode_utf_8_impl( -# uni, len(uni), errors, state.encode_error_handler, -# allow_surrogates=True) -# return space.newtuple([space.newbytes(result), space.newint(len(uni))]) +@unwrap_spec(errors='text_or_none') +def utf_8_encode(space, w_obj, errors="strict"): + utf8, lgt = space.utf8_len_w(w_obj) + if rutf8.has_surrogates(utf8): + utf8 = rutf8.reencode_utf8_with_surrogates(utf8) + return space.newtuple([space.newbytes(utf8), space.newint(lgt)]) @unwrap_spec(string='bufferstr', errors='text_or_none', w_final = WrappedDefault(False)) diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -537,8 +537,12 @@ assert '\xff'.decode('utf-7', 'ignore') == '' assert '\x00'.decode('unicode-internal', 'ignore') == '' - def test_backslahreplace(self): - assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000' + def test_backslashreplace(self): + sin = u"a\xac\u1234\u20ac\u8000\U0010ffff" + expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff" + assert sin.encode('ascii', 'backslashreplace') == expected + expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff" + assert sin.encode("iso-8859-15", "backslashreplace") == expected def test_badhandler(self): import codecs diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py --- a/pypy/module/_io/interp_stringio.py +++ b/pypy/module/_io/interp_stringio.py @@ -1,3 +1,5 @@ +from rpython.rlib.rutf8 import get_utf8_length + from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.typedef import ( TypeDef, generic_new_descr, GetSetProperty) @@ -152,7 +154,7 @@ if self.readnl is None: w_readnl = space.w_None else: - w_readnl = space.str(space.new_from_utf8(self.readnl)) # YYY + w_readnl = space.str(space.newutf8(self.readnl, get_utf8_length(self.readnl))) # YYY return space.newtuple([ w_initialval, w_readnl, space.newint(self.buf.pos), w_dict ]) @@ -215,7 +217,8 @@ if self.writenl: w_decoded = space.call_method( w_decoded, "replace", - space.newtext("\n"), space.new_from_utf8(self.writenl)) + space.newtext("\n"), space.newutf8(self.writenl, + get_utf8_length(self.writenl))) string = space.utf8_w(w_decoded) if string: self.buf.write(string) @@ -225,7 +228,9 @@ def read_w(self, space, w_size=None): self._check_closed(space) size = convert_size(space, w_size) - return space.new_from_utf8(self.buf.read(size)) + v = self.buf.read(size) + lgt = get_utf8_length(v) + return space.newutf8(v, lgt) def readline_w(self, space, w_limit=None): self._check_closed(space) @@ -239,7 +244,8 @@ else: newline = self.readnl result = self.buf.readline(newline, limit) - return space.new_from_utf8(result) + resultlen = get_utf8_length(result) + return space.newutf8(result, resultlen) @unwrap_spec(pos=int, mode=int) @@ -276,7 +282,9 @@ def getvalue_w(self, space): self._check_closed(space) - return space.new_from_utf8(self.buf.getvalue()) + v = self.buf.getvalue() + lgt = get_utf8_length(v) + return space.newutf8(v, lgt) def readable_w(self, space): self._check_closed(space) diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -12,7 +12,8 @@ from rpython.rlib.rbigint import rbigint from rpython.rlib.rstring import StringBuilder from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos, - codepoints_in_utf8) + codepoints_in_utf8, get_utf8_length, + Utf8StringBuilder) STATE_ZERO, STATE_OK, STATE_DETACHED = range(3) @@ -684,13 +685,15 @@ w_bytes = space.call_method(self.w_buffer, "read") w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True) check_decoded(space, w_decoded) - w_result = space.new_from_utf8(self.decoded.get_chars(-1)) + chars = self.decoded.get_chars(-1) + lgt = get_utf8_length(chars) + w_result = space.newutf8(chars, lgt) w_final = space.add(w_result, w_decoded) self.snapshot = None return w_final remaining = size - builder = StringBuilder(size) + builder = Utf8StringBuilder(size) # Keep reading chunks until we have n characters to return while remaining > 0: @@ -700,7 +703,7 @@ builder.append(data) remaining -= len(data) - return space.new_from_utf8(builder.build()) + return space.newutf8(builder.build(), builder.get_length()) def _scan_line_ending(self, limit): if self.readuniversal: @@ -725,6 +728,7 @@ limit = convert_size(space, w_limit) remnant = None builder = StringBuilder() + # XXX maybe use Utf8StringBuilder instead? while True: # First, get some data if necessary has_data = self._ensure_data(space) @@ -771,7 +775,8 @@ self.decoded.reset() result = builder.build() - return space.new_from_utf8(result) + lgt = get_utf8_length(result) + return space.newutf8(result, lgt) # _____________________________________________________________ # write methods @@ -794,8 +799,8 @@ if text.find('\n') >= 0: haslf = True if haslf and self.writetranslate and self.writenl: - w_text = space.call_method(w_text, "replace", space.new_from_utf8('\n'), - space.new_from_utf8(self.writenl)) + w_text = space.call_method(w_text, "replace", space.newutf8('\n', 1), + space.newutf8(self.writenl, get_utf8_length(self.writenl))) text = space.utf8_w(w_text) needflush = False diff --git a/pypy/module/_locale/interp_locale.py b/pypy/module/_locale/interp_locale.py --- a/pypy/module/_locale/interp_locale.py +++ b/pypy/module/_locale/interp_locale.py @@ -133,10 +133,11 @@ rffi.free_charp(s1_c) rffi.free_charp(s2_c) - s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2) + s1, l1 = space.utf8_len_w(w_s1) + s2, l2 = space.utf8_len_w(w_s2) - s1_c = rffi.unicode2wcharp(s1) - s2_c = rffi.unicode2wcharp(s2) + s1_c = rffi.utf82wcharp(s1, l1) + s2_c = rffi.utf82wcharp(s2, l2) try: result = _wcscoll(s1_c, s2_c) finally: diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -157,7 +157,7 @@ replace, end = errorcb(errors, namecb, reason, stringdata, start, end) # 'replace' is RPython unicode here - lgt, _ = rutf8.check_utf8(replace, True) + lgt = rutf8.get_utf8_length(replace) inbuf = rffi.utf82wcharp(replace, lgt) try: r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end) @@ -268,7 +268,7 @@ rets, end = errorcb(errors, namecb, reason, unicodedata, start, end) codec = pypy_cjk_enc_getcodec(encodebuf) - lgt, _ = rutf8.get_utf8_length_flag(rets) + lgt = rutf8.get_utf8_length(rets) replace = encode(codec, rets, lgt, "strict", errorcb, namecb) with rffi.scoped_nonmovingbuffer(replace) as inbuf: r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end) diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py --- a/pypy/module/_multibytecodec/interp_incremental.py +++ b/pypy/module/_multibytecodec/interp_incremental.py @@ -66,7 +66,7 @@ pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf) assert 0 <= pos <= len(object) self.pending = object[pos:] - lgt = rutf8.get_utf8_length_flag(output) + lgt = rutf8.get_utf8_length(output) return space.newutf8(output, lgt) diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -27,8 +27,8 @@ raise wrap_unicodedecodeerror(space, e, input, self.name) except RuntimeError: raise wrap_runtimeerror(space) - lgt, flag = rutf8.check_utf8(utf8_output, True) - return space.newtuple([space.newutf8(utf8_output, lgt, flag), + lgt = rutf8.get_utf8_length(utf8_output) + return space.newtuple([space.newutf8(utf8_output, lgt), space.newint(len(input))]) @unwrap_spec(errors="text_or_none") diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py --- a/pypy/module/_multibytecodec/test/test_translation.py +++ b/pypy/module/_multibytecodec/test/test_translation.py @@ -14,7 +14,7 @@ codecname, string = argv[1], argv[2] c = c_codecs.getcodec(codecname) u = c_codecs.decode(c, string) - lgt, _ = rutf8.get_utf8_length_flag(u) + lgt = rutf8.get_utf8_length(u) r = c_codecs.encode(c, u, lgt) print r return 0 diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py --- a/pypy/module/_pypyjson/interp_decoder.py +++ b/pypy/module/_pypyjson/interp_decoder.py @@ -3,6 +3,7 @@ from rpython.rlib.objectmodel import specialize, always_inline, r_dict from rpython.rlib import rfloat, runicode, rutf8 from rpython.rtyper.lltypesystem import lltype, rffi +from rpython.rlib.rarithmetic import r_uint from pypy.interpreter.error import oefmt from pypy.interpreter import unicodehelper @@ -366,7 +367,7 @@ return # help the annotator to know that we'll never go beyond # this point # - utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True) + utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True) builder.append(utf8_ch) return i @@ -400,7 +401,7 @@ break elif ch == '\\' or ch < '\x20': self.pos = i-1 - return self.space.unicode_w(self.decode_string_escaped(start)) + return self.decode_string_escaped(start) strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i])) bits |= ord(ch) length = i - start - 1 diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py --- a/pypy/module/_rawffi/alt/type_converter.py +++ b/pypy/module/_rawffi/alt/type_converter.py @@ -128,7 +128,7 @@ intval: lltype.Signed """ self.error(w_ffitype, w_obj) - + def handle_unichar(self, w_ffitype, w_obj, intval): """ intval: lltype.Signed @@ -174,7 +174,7 @@ def handle_struct_rawffi(self, w_ffitype, w_structinstance): """ This method should be killed as soon as we remove support for _rawffi structures - + w_structinstance: W_StructureInstance """ self.error(w_ffitype, w_structinstance) @@ -227,7 +227,7 @@ ucharval = self.get_char(w_ffitype) return space.newbytes(chr(ucharval)) elif w_ffitype.is_unichar(): - wcharval = self.get_unichar(w_ffitype) + wcharval = r_uint(self.get_unichar(w_ffitype)) return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1) elif w_ffitype.is_double(): return self._float(w_ffitype) @@ -349,7 +349,7 @@ def get_struct_rawffi(self, w_ffitype, w_structdescr): """ This should be killed as soon as we kill support for _rawffi structures - + Return type: lltype.Unsigned (the address of the structure) """ diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py --- a/pypy/module/_rawffi/interp_rawffi.py +++ b/pypy/module/_rawffi/interp_rawffi.py @@ -448,7 +448,8 @@ elif c == 'c': return space.newbytes(func(add_arg, argdesc, ll_type)) elif c == 'u': - return space.newunicode(func(add_arg, argdesc, ll_type)) + return space.newutf8(rutf8.unichr_as_utf8( + ord(func(add_arg, argdesc, ll_type))), 1) elif c == 'f' or c == 'd' or c == 'g': return space.newfloat(float(func(add_arg, argdesc, ll_type))) else: @@ -596,10 +597,10 @@ return space.w_None wcharp_addr = rffi.cast(rffi.CWCHARP, address) if maxlength == -1: - s = rffi.wcharp2utf8(wcharp_addr) + s, lgt = rffi.wcharp2utf8(wcharp_addr) else: - s = rffi.wcharpsize2utf8(wcharp_addr, maxlength) - return space.newunicode(s) + s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength) + return space.newutf8(s, lgt) @unwrap_spec(address=r_uint, maxlength=int) def charp2rawstring(space, address, maxlength=-1): @@ -612,8 +613,8 @@ def wcharp2rawunicode(space, address, maxlength=-1): if maxlength == -1: return wcharp2unicode(space, address) - s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength) - return space.newunicode(s) + s = rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, address), maxlength) + return space.newutf8(s, maxlength) @unwrap_spec(address=r_uint, newcontent='bufferstr') def rawstring2charp(space, address, newcontent): diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py --- a/pypy/module/array/interp_array.py +++ b/pypy/module/array/interp_array.py @@ -1,7 +1,7 @@ from rpython.rlib import jit, rgc, rutf8 from rpython.rlib.buffer import RawBuffer from rpython.rlib.objectmodel import keepalive_until_here -from rpython.rlib.rarithmetic import ovfcheck, widen +from rpython.rlib.rarithmetic import ovfcheck, widen, r_uint from rpython.rlib.unroll import unrolling_iterable from rpython.rtyper.annlowlevel import llstr from rpython.rtyper.lltypesystem import lltype, rffi @@ -380,6 +380,7 @@ if len(s) % self.itemsize != 0: raise oefmt(self.space.w_ValueError, "string length not a multiple of item size") + self.check_valid_unicode(space, s) # empty for non-u arrays oldlen = self.len new = len(s) / self.itemsize if not new: @@ -451,7 +452,7 @@ """ if self.typecode == 'u': buf = rffi.cast(UNICODE_ARRAY, self._buffer_as_unsigned()) - return space.newutf8(rffi.wcharpsize2unicode(buf, self.len)) + return space.newutf8(rffi.wcharpsize2utf8(buf, self.len), self.len) else: raise oefmt(space.w_ValueError, "tounicode() may only be called on type 'u' arrays") @@ -710,6 +711,9 @@ s = "array('%s', %s)" % (self.typecode, space.text_w(r)) return space.newtext(s) + def check_valid_unicode(self, space, s): + pass # overwritten by u + W_ArrayBase.typedef = TypeDef( 'array.array', __new__ = interp2app(w_array), @@ -870,6 +874,18 @@ def get_buffer(self): return rffi.cast(mytype.arrayptrtype, self._buffer) + if mytype.unwrap == 'utf8_len_w': + def check_valid_unicode(self, space, s): + i = 0 + while i < len(s): + if s[i] != '\x00' or ord(s[i + 1]) > 0x10: + v = ((ord(s[i]) << 24) + (ord(s[i + 1]) << 16) + + (ord(s[i + 2]) << 8) + ord(s[i + 3])) + raise oefmt(space.w_ValueError, + "Character U+%s is not in range [U+0000, U+10ffff]", + hex(v)[2:]) + i += 4 + def item_w(self, w_item): space = self.space unwrap = getattr(space, mytype.unwrap) @@ -1013,7 +1029,7 @@ elif mytype.typecode == 'c': return space.newbytes(item) elif mytype.typecode == 'u': - code = ord(item) + code = r_uint(ord(item)) return space.newutf8(rutf8.unichr_as_utf8(code), 1) assert 0, "unreachable" diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py --- a/pypy/module/array/test/test_array.py +++ b/pypy/module/array/test/test_array.py @@ -844,13 +844,7 @@ import sys if sys.maxunicode == 0xffff: skip("test for 32-bit unicodes") - a = self.array('u', b'\xff\xff\xff\xff') - assert len(a) == 1 - assert repr(a[0]) == "u'\Uffffffff'" - if sys.maxint == 2147483647: - assert ord(a[0]) == -1 - else: - assert ord(a[0]) == 4294967295 + raises(ValueError, self.array, 'u', b'\xff\xff\xff\xff') def test_weakref(self): import weakref diff --git a/pypy/module/cpyext/longobject.py b/pypy/module/cpyext/longobject.py --- a/pypy/module/cpyext/longobject.py +++ b/pypy/module/cpyext/longobject.py @@ -4,6 +4,7 @@ CONST_STRING, ADDR, CANNOT_FAIL) from pypy.objspace.std.longobject import W_LongObject from pypy.interpreter.error import OperationError +from pypy.interpreter.unicodehelper import wcharpsize2utf8 from pypy.module.cpyext.intobject import PyInt_AsUnsignedLongMask from rpython.rlib.rbigint import rbigint @@ -191,7 +192,7 @@ string, length gives the number of characters, and base is the radix for the conversion. The radix must be in the range [2, 36]; if it is out of range, ValueError will be raised.""" - w_value = space.newunicode(rffi.wcharpsize2unicode(u, length)) + w_value = space.newutf8(wcharpsize2utf8(space, u, length), length) w_base = space.newint(rffi.cast(lltype.Signed, base)) return space.call_function(space.w_long, w_value, w_base) diff --git a/pypy/module/cpyext/object.py b/pypy/module/cpyext/object.py --- a/pypy/module/cpyext/object.py +++ b/pypy/module/cpyext/object.py @@ -246,7 +246,7 @@ the Python expression unicode(o). Called by the unicode() built-in function.""" if w_obj is None: - return space.newunicode(u"<NULL>") + return space.newutf8("<NULL>", 6) return space.call_function(space.w_unicode, w_obj) @cpython_api([PyObject, PyObject], rffi.INT_real, error=-1) @@ -302,7 +302,7 @@ if opid == Py_EQ: return 1 if opid == Py_NE: - return 0 + return 0 w_res = PyObject_RichCompare(space, w_o1, w_o2, opid_int) return int(space.is_true(w_res)) diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py --- a/pypy/module/cpyext/unicodeobject.py +++ b/pypy/module/cpyext/unicodeobject.py @@ -1,5 +1,11 @@ +from rpython.rtyper.lltypesystem import rffi, lltype +from rpython.rlib import rstring, runicode +from rpython.tool.sourcetools import func_renamer + from pypy.interpreter.error import OperationError, oefmt -from rpython.rtyper.lltypesystem import rffi, lltype +from pypy.interpreter.unicodehelper import ( + wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper, + unicode_encode_decimal) from pypy.module.unicodedata import unicodedb from pypy.module.cpyext.api import ( CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api, @@ -13,8 +19,6 @@ from pypy.module.sys.interp_encoding import setdefaultencoding from pypy.module._codecs.interp_codecs import CodecState from pypy.objspace.std import unicodeobject -from rpython.rlib import rstring, runicode -from rpython.tool.sourcetools import func_renamer import sys ## See comment in bytesobject.py. @@ -61,10 +65,10 @@ def unicode_attach(space, py_obj, w_obj, w_userdata=None): "Fills a newly allocated PyUnicodeObject with a unicode string" py_unicode = rffi.cast(PyUnicodeObject, py_obj) - s = space.unicode_w(w_obj) - py_unicode.c_length = len(s) + s, length = space.utf8_len_w(w_obj) + py_unicode.c_length = length py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO) - py_unicode.c_hash = space.hash_w(space.newunicode(s)) + py_unicode.c_hash = space.hash_w(space.newutf8(s, length)) py_unicode.c_defenc = lltype.nullptr(PyObject.TO) def unicode_realize(space, py_obj): @@ -73,11 +77,12 @@ be modified after this call. """ py_uni = rffi.cast(PyUnicodeObject, py_obj) - s = rffi.wcharpsize2unicode(py_uni.c_str, py_uni.c_length) + length = py_uni.c_length + s = wcharpsize2utf8(space, py_uni.c_str, length) w_type = from_ref(space, rffi.cast(PyObject, py_obj.c_ob_type)) w_obj = space.allocate_instance(unicodeobject.W_UnicodeObject, w_type) - w_obj.__init__(s) - py_uni.c_hash = space.hash_w(space.newunicode(s)) + w_obj.__init__(s, length) + py_uni.c_hash = space.hash_w(space.newutf8(s, length)) track_reference(space, py_obj, w_obj) return w_obj @@ -214,8 +219,8 @@ if not ref_unicode.c_str: # Copy unicode buffer w_unicode = from_ref(space, rffi.cast(PyObject, ref)) - u = space.unicode_w(w_unicode) - ref_unicode.c_str = rffi.unicode2wcharp(u) + u, length = space.utf8_len_w(w_unicode) + ref_unicode.c_str = rffi.utf82wcharp(u, length) return ref_unicode.c_str @cpython_api([PyObject], rffi.CWCHARP) @@ -335,8 +340,8 @@ Therefore, modification of the resulting Unicode object is only allowed when u is NULL.""" if wchar_p: - s = rffi.wcharpsize2unicode(wchar_p, length) - return make_ref(space, space.newunicode(s)) + s = wcharpsize2utf8(space, wchar_p, length) + return make_ref(space, space.newutf8(s, length)) else: return rffi.cast(PyObject, new_empty_unicode(space, length)) @@ -506,7 +511,8 @@ """Encode the Py_UNICODE buffer of the given size and return a Python string object. Return NULL if an exception was raised by the codec.""" - w_u = space.newunicode(rffi.wcharpsize2unicode(s, size)) + u = wcharpsize2utf8(space, s, size) + w_u = space.newutf8(u, size) if errors: w_errors = space.newtext(rffi.charp2str(errors)) else: @@ -564,15 +570,11 @@ else: errors = None - result, length, byteorder = runicode.str_decode_utf_16_helper( - string, size, errors, - True, # final ? false for multiple passes? - None, # errorhandler - byteorder) + result, _, length, byteorder = str_decode_utf_16_helper( + string, errors, final=True, errorhandler=None, byteorder=byteorder) if pbyteorder is not None: pbyteorder[0] = rffi.cast(rffi.INT, byteorder) - - return space.newunicode(result) + return space.newutf8(result, length) @cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, rffi.INTP], PyObject) def PyUnicode_DecodeUTF32(space, s, size, llerrors, pbyteorder): @@ -620,15 +622,11 @@ else: errors = None - result, length, byteorder = runicode.str_decode_utf_32_helper( - string, size, errors, - True, # final ? false for multiple passes? - None, # errorhandler - byteorder) + result, _, length, byteorder = str_decode_utf_32_helper( + string, errors, final=True, errorhandler=None, byteorder=byteorder) if pbyteorder is not None: pbyteorder[0] = rffi.cast(rffi.INT, byteorder) - - return space.newunicode(result) + return space.newutf8(result, length) @cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, CONST_STRING], rffi.INT_real, error=-1) @@ -646,14 +644,13 @@ Returns 0 on success, -1 on failure. """ - u = rffi.wcharpsize2unicode(s, length) + u = rffi.wcharpsize2utf8(s, length) if llerrors: errors = rffi.charp2str(llerrors) else: errors = None state = space.fromcache(CodecState) - result = runicode.unicode_encode_decimal(u, length, errors, - state.encode_error_handler) + result = unicode_encode_decimal(u, errors, state.encode_error_handler) i = len(result) output[i] = '\0' i -= 1 @@ -706,12 +703,17 @@ """Return 1 if substr matches str[start:end] at the given tail end (direction == -1 means to do a prefix match, direction == 1 a suffix match), 0 otherwise. Return -1 if an error occurred.""" - str = space.unicode_w(w_str) - substr = space.unicode_w(w_substr) + space.utf8_w(w_str) # type check + space.utf8_w(w_substr) + w_start = space.newint(start) + w_end = space.newint(end) if rffi.cast(lltype.Signed, direction) <= 0: - return rstring.startswith(str, substr, start, end) + w_result = space.call_method( + w_str, "startswith", w_substr, w_start, w_end) else: - return rstring.endswith(str, substr, start, end) + w_result = space.call_method( + w_str, "endswith", w_substr, w_start, w_end) + return space.int_w(w_result) @cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, error=-1) def PyUnicode_Count(space, w_str, w_substr, start, end): diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py --- a/pypy/module/pyexpat/interp_pyexpat.py +++ b/pypy/module/pyexpat/interp_pyexpat.py @@ -483,7 +483,7 @@ except rutf8.CheckError: from pypy.interpreter import unicodehelper # get the correct error msg - unicodehelper.str_decode_utf8(s, len(s), 'string', True, + unicodehelper.str_decode_utf8(s, 'string', True, unicodehelper.decode_error_handler(space)) assert False, "always raises" else: @@ -587,21 +587,22 @@ def UnknownEncodingHandler(self, space, name, info): # Yes, supports only 8bit encodings - translationmap = space.unicode_w( + translationmap, lgt = space.utf8_len_w( space.call_method( space.newbytes(self.all_chars), "decode", space.newtext(name), space.newtext("replace"))) - if len(translationmap) != 256: + if lgt != 256: raise oefmt(space.w_ValueError, "multi-byte encodings are not supported") - for i in range(256): - c = translationmap[i] - if c == u'\ufffd': + i = 0 + for c in rutf8.Utf8StringIterator(translationmap): + if c == 0xfffd: info.c_map[i] = rffi.cast(rffi.INT, -1) else: info.c_map[i] = rffi.cast(rffi.INT, c) + i += 1 info.c_data = lltype.nullptr(rffi.VOIDP.TO) info.c_convert = lltype.nullptr(rffi.VOIDP.TO) info.c_release = lltype.nullptr(rffi.VOIDP.TO) diff --git a/pypy/module/struct/formatiterator.py b/pypy/module/struct/formatiterator.py --- a/pypy/module/struct/formatiterator.py +++ b/pypy/module/struct/formatiterator.py @@ -1,6 +1,6 @@ from rpython.rlib.rarithmetic import (r_uint, r_ulonglong, r_longlong, maxint, intmask) -from rpython.rlib import jit +from rpython.rlib import jit, rutf8 from rpython.rlib.objectmodel import specialize from rpython.rlib.rstruct.error import StructError from rpython.rlib.rstruct.formatiterator import FormatIterator @@ -107,7 +107,7 @@ def accept_unicode_arg(self): w_obj = self.accept_obj_arg() - return self.space.unicode_w(w_obj) + return self.space.utf8_len_w(w_obj) def accept_float_arg(self): w_obj = self.accept_obj_arg() @@ -191,6 +191,10 @@ assert 0, "unreachable" self.result_w.append(w_value) + def append_utf8(self, value): + w_ch = self.space.newutf8(rutf8.unichr_as_utf8(r_uint(value)), 1) + self.result_w.append(w_ch) + def get_pos(self): return self.pos diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py --- a/pypy/module/unicodedata/interp_ucd.py +++ b/pypy/module/unicodedata/interp_ucd.py @@ -7,11 +7,8 @@ from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.typedef import TypeDef, interp_attrproperty from rpython.rlib.rarithmetic import r_longlong -from rpython.rlib.objectmodel import we_are_translated -from rpython.rlib.runicode import MAXUNICODE from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0 -from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate -import sys +from rpython.rlib.rutf8 import Utf8StringBuilder, unichr_as_utf8 # Contants for Hangul characters @@ -30,49 +27,17 @@ # unicode code point. -if MAXUNICODE > 0xFFFF: - # Target is wide build - def unichr_to_code_w(space, w_unichr): - if not space.isinstance_w(w_unichr, space.w_unicode): - raise oefmt( - space.w_TypeError, 'argument 1 must be unicode, not %T', - w_unichr) +# Target is wide build +def unichr_to_code_w(space, w_unichr): + if not space.isinstance_w(w_unichr, space.w_unicode): + raise oefmt( + space.w_TypeError, 'argument 1 must be unicode, not %T', + w_unichr) - if not we_are_translated() and sys.maxunicode == 0xFFFF: - # Host CPython is narrow build, accept surrogates - try: - return ord_accepts_surrogate(space.unicode_w(w_unichr)) - except TypeError: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") - else: - if not space.len_w(w_unichr) == 1: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") - return space.int_w(space.ord(w_unichr)) - -else: - # Target is narrow build - def unichr_to_code_w(space, w_unichr): - if not space.isinstance_w(w_unichr, space.w_unicode): - raise oefmt( - space.w_TypeError, 'argument 1 must be unicode, not %T', - w_unichr) - - if not we_are_translated() and sys.maxunicode > 0xFFFF: - # Host CPython is wide build, forbid surrogates - if not space.len_w(w_unichr) == 1: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") - return space.int_w(space.ord(w_unichr)) - - else: - # Accept surrogates - try: - return ord_accepts_surrogate(space.unicode_w(w_unichr)) - except TypeError: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") + if not space.len_w(w_unichr) == 1: + raise oefmt(space.w_TypeError, + "need a single Unicode character as parameter") + return space.int_w(space.ord(w_unichr)) class UCD(W_Root): @@ -110,7 +75,8 @@ except KeyError: msg = space.mod(space.newtext("undefined character name '%s'"), space.newtext(name)) raise OperationError(space.w_KeyError, msg) - return space.newunicode(code_to_unichr(code)) + assert code >= 0 + return space.newutf8(unichr_as_utf8(code), 1) def name(self, space, w_unichr, w_default=None): code = unichr_to_code_w(space, w_unichr) @@ -259,10 +225,10 @@ result[0] = ch if not composed: # If decomposed normalization we are done - return space.newunicode(u''.join([unichr(i) for i in result[:j]])) + return self.build(space, result, stop=j) if j <= 1: - return space.newunicode(u''.join([unichr(i) for i in result[:j]])) + return self.build(space, result, stop=j) current = result[0] starter_pos = 0 @@ -310,7 +276,13 @@ result[starter_pos] = current - return space.newunicode(u''.join([unichr(i) for i in result[:next_insert]])) + return self.build(space, result, stop=next_insert) + + def build(self, space, r, stop): + builder = Utf8StringBuilder(stop * 3) + for i in range(stop): + builder.append_code(r[i]) + return space.newutf8(builder.build(), stop) methods = {} diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py --- a/pypy/module/unicodedata/test/test_hyp.py +++ b/pypy/module/unicodedata/test/test_hyp.py @@ -1,3 +1,4 @@ + import pytest try: from hypothesis import given, strategies as st, example, settings @@ -5,12 +6,14 @@ pytest.skip("hypothesis required") from pypy.module.unicodedata.interp_ucd import ucd +from rpython.rlib.rutf8 import get_utf8_length def make_normalization(space, NF_code): def normalize(s): - w_s = space.newunicode(s) + u = s.encode('utf8') + w_s = space.newutf8(u, get_utf8_length(u)) w_res = ucd.normalize(space, NF_code, w_s) - return space.unicode_w(w_res) + return space.utf8_w(w_res).decode('utf8') return normalize all_forms = ['NFC', 'NFD', 'NFKC', 'NFKD'] diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py --- a/pypy/objspace/fake/objspace.py +++ b/pypy/objspace/fake/objspace.py @@ -212,9 +212,6 @@ def newutf8(self, x, l): return w_some_obj() - def new_from_utf8(self, a): - return w_some_obj() - def newunicode(self, a): return w_some_obj() diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py --- a/pypy/objspace/std/formatting.py +++ b/pypy/objspace/std/formatting.py @@ -3,7 +3,7 @@ from rpython.rlib import jit, rutf8 from rpython.rlib.objectmodel import specialize -from rpython.rlib.rarithmetic import INT_MAX +from rpython.rlib.rarithmetic import INT_MAX, r_uint from rpython.rlib.rfloat import DTSF_ALT, formatd, isnan, isinf from rpython.rlib.rstring import StringBuilder from rpython.rlib.unroll import unrolling_iterable @@ -330,7 +330,7 @@ space = self.space if do_unicode: cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1) - w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1) + w_s = space.newutf8(rutf8.unichr_as_utf8(r_uint(cp)), 1) else: cp = ord(self.fmt[self.fmtpos - 1]) w_s = space.newbytes(chr(cp)) @@ -466,7 +466,7 @@ n = space.int_w(w_value) if do_unicode: try: - c = rutf8.unichr_as_utf8(n) + c = rutf8.unichr_as_utf8(r_uint(n)) except ValueError: raise oefmt(space.w_OverflowError, "unicode character code out of range") diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -741,6 +741,8 @@ assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac' assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82' assert u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96' + assert u'\ud800\udc02'.encode('uTf-8') == '\xf0\x90\x80\x82' + assert u'\ud84d\udc56'.encode('Utf8') == '\xf0\xa3\x91\x96' assert u'\ud800'.encode('utf-8') == '\xed\xa0\x80' assert u'\udc00'.encode('utf-8') == '\xed\xb0\x80' assert (u'\ud800\udc02'*1000).encode('utf-8') == '\xf0\x90\x80\x82'*1000 diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py --- a/rpython/annotator/unaryop.py +++ b/rpython/annotator/unaryop.py @@ -792,7 +792,7 @@ def ord(self): # warning, on 32-bit with 32-bit unichars, this might return # negative numbers - return SomeInteger() + return SomeInteger(nonneg=True) class __extend__(SomeIterator): diff --git a/rpython/rlib/rstruct/nativefmttable.py b/rpython/rlib/rstruct/nativefmttable.py --- a/rpython/rlib/rstruct/nativefmttable.py +++ b/rpython/rlib/rstruct/nativefmttable.py @@ -4,7 +4,7 @@ """ import struct -from rpython.rlib import jit, longlong2float +from rpython.rlib import rutf8, longlong2float from rpython.rlib.objectmodel import specialize from rpython.rlib.rarithmetic import r_singlefloat, widen, intmask from rpython.rlib.rstruct import standardfmttable as std @@ -139,17 +139,17 @@ from rpython.rlib.rstruct import unichar def pack_unichar(fmtiter): - unistr = fmtiter.accept_unicode_arg() - if len(unistr) != 1: + utf8, lgt = fmtiter.accept_unicode_arg() + if lgt != 1: raise StructError("expected a unicode string of length 1") - c = unistr[0] # string->char conversion for the annotator - unichar.pack_unichar(c, fmtiter.wbuf, fmtiter.pos) + uchr = rutf8.codepoint_at_pos(utf8, 0) + unichar.pack_codepoint(uchr, fmtiter.wbuf, fmtiter.pos) fmtiter.advance(unichar.UNICODE_SIZE) @specialize.argtype(0) def unpack_unichar(fmtiter): data = fmtiter.read(unichar.UNICODE_SIZE) - fmtiter.appendobj(unichar.unpack_unichar(data)) + fmtiter.append_utf8(unichar.unpack_codepoint(data)) native_fmttable['u'] = {'size': unichar.UNICODE_SIZE, 'alignment': unichar.UNICODE_SIZE, diff --git a/rpython/rlib/rstruct/unichar.py b/rpython/rlib/rstruct/unichar.py --- a/rpython/rlib/rstruct/unichar.py +++ b/rpython/rlib/rstruct/unichar.py @@ -3,12 +3,8 @@ """ import sys -from rpython.rlib.runicode import MAXUNICODE -if MAXUNICODE <= 65535: - UNICODE_SIZE = 2 -else: - UNICODE_SIZE = 4 +UNICODE_SIZE = 4 BIGENDIAN = sys.byteorder == "big" def pack_unichar(unich, buf, pos): @@ -34,7 +30,7 @@ buf.setitem(pos+2, chr((unich >> 16) & 0xFF)) buf.setitem(pos+3, chr(unich >> 24)) -def unpack_unichar(rawstring): +def unpack_codepoint(rawstring): assert len(rawstring) == UNICODE_SIZE if UNICODE_SIZE == 2: if BIGENDIAN: @@ -54,4 +50,7 @@ ord(rawstring[1]) << 8 | ord(rawstring[2]) << 16 | ord(rawstring[3]) << 24) - return unichr(n) + return n + +def unpack_unichar(rawstring): + return unichr(unpack_codepoint(rawstring)) diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -19,7 +19,7 @@ from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize from rpython.rlib.objectmodel import always_inline, dont_inline, try_inline from rpython.rlib.rstring import StringBuilder -from rpython.rlib import jit +from rpython.rlib import jit, types from rpython.rlib.signature import signature from rpython.rlib.types import char, none from rpython.rlib.rarithmetic import r_uint @@ -27,6 +27,8 @@ from rpython.rtyper.lltypesystem import lltype, rffi +# we need a way to accept both r_uint and int(nonneg=True) +#@signature(types.int_nonneg(), types.bool(), returns=types.str()) def unichr_as_utf8(code, allow_surrogates=False): """Encode code (numeric value) as utf8 encoded string """ @@ -437,7 +439,7 @@ low = codepoint_at_pos(utf8, i) if 0xDC00 <= low <= 0xDFFF: uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00) - i = next_codepoint_pos(utf8, i) + i = next_codepoint_pos(utf8, i) # else not really a surrogate pair, just append high else: i = next_codepoint_pos(utf8, i) @@ -535,6 +537,13 @@ else: return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos)) +def _pos_at_index(utf8, index): + # Slow! + pos = 0 + for _ in range(index): + pos = next_codepoint_pos(utf8, pos) + return pos + @jit.dont_look_inside def codepoint_at_index(utf8, storage, index): """ Return codepoint of a character inside utf8 encoded string, given diff --git a/rpython/rlib/types.py b/rpython/rlib/types.py --- a/rpython/rlib/types.py +++ b/rpython/rlib/types.py @@ -26,6 +26,8 @@ def int(): return model.SomeInteger() +def int_nonneg(): + return model.SomeInteger(nonneg=True) def bool(): return model.SomeBool() diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py --- a/rpython/rtyper/lltypesystem/rffi.py +++ b/rpython/rtyper/lltypesystem/rffi.py @@ -1019,7 +1019,27 @@ s = StringBuilder(size) for i in range(size): rutf8.unichr_as_utf8_append(s, ord(w[i])) - return s.build() + return s.build() + +def wcharp2utf8(w): + from rpython.rlib import rutf8 + + s = rutf8.Utf8StringBuilder() + i = 0 + while ord(w[i]): + s.append_code(ord(w[i])) + i += 1 + return s.build(), i + +def wcharp2utf8n(w, maxlen): + from rpython.rlib import rutf8 + + s = rutf8.Utf8StringBuilder(maxlen) + i = 0 + while i < maxlen and w[i]: + s.append_code(ord(w[i])) + i += 1 + return s.build(), i def utf82wcharp(utf8, utf8len): from rpython.rlib import rutf8 _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit