Author: fijal Branch: unicode-utf8 Changeset: r93101:f074b4987d57 Date: 2017-11-20 16:54 +0100 http://bitbucket.org/pypy/pypy/changeset/f074b4987d57/
Log: fixes until we get to formatting problems diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -1759,20 +1759,6 @@ def utf8_w(self, w_obj): return w_obj.utf8_w(self) - - @specialize.argtype(1) - def unicode_w(self, w_obj): - return self.utf8_w(w_obj).decode('utf8') - - def realunicode_w(self, w_obj): - return self.realutf8_w(w_obj).decode('utf8') - - def newunicode(self, u): - from pypy.interpreter import unicodehelper - assert isinstance(u, unicode) - # XXX let's disallow that - return self.newutf8(u.encode("utf8"), len(u), unicodehelper._get_flag(u)) - def convert_to_w_unicode(self, w_obj): return w_obj.convert_to_w_unicode(self) diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -61,10 +61,10 @@ @given(strategies.text()) def test_unicode_raw_escape(u): - r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict') + r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None) assert r == u.encode("raw-unicode-escape") @given(strategies.text()) def test_unicode_escape(u): - r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict") + r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None) assert r == u.encode("unicode-escape") diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -60,14 +60,12 @@ return True return False -def _get_flag(u): - flag = rutf8.FLAG_ASCII - for c in u: - if 0xD800 <= ord(c) <= 0xDFFF: - return rutf8.FLAG_HAS_SURROGATES - if ord(c) >= 0x80: - flag = rutf8.FLAG_REGULAR - return flag +def get_flag_from_code(oc): + if oc <= 0x7F: + return rutf8.FLAG_ASCII + if 0xD800 <= oc <= 0xDFFF: + return rutf8.FLAG_HAS_SURROGATES + return rutf8.FLAG_REGULAR # These functions take and return unwrapped rpython strings def decode_unicode_escape(space, string): @@ -134,7 +132,11 @@ return ress, len(s), lgt, flag def str_decode_latin_1(s, errors, final, errorhandler): - xxx + try: + rutf8.check_ascii(s) + return s, len(s), len(s), rutf8.FLAG_ASCII + except rutf8.CheckError: + return _str_decode_latin_1_slowpath(s, errors, final, errorhandler) def utf8_encode_latin_1(s, errors, errorhandler): try: @@ -208,7 +210,6 @@ slen = len(s) res = StringBuilder(slen) pos = 0 - continuation_bytes = 0 end = len(s) while pos < end: ordch1 = ord(s[pos]) @@ -229,6 +230,7 @@ if ordch1 <= 0xDF: if pos >= end: if not final: + pos -= 1 break r, pos = errorhandler(errors, "utf8", "unexpected end of data", s, pos - 1, pos) @@ -243,7 +245,6 @@ continue # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz pos += 1 - continuation_bytes += 1 res.append(chr(ordch1)) res.append(chr(ordch2)) continue @@ -251,6 +252,7 @@ if ordch1 <= 0xEF: if (pos + 2) > end: if not final: + pos -= 1 break r, pos = errorhandler(errors, "utf8", "unexpected end of data", s, pos - 1, pos + 1) @@ -272,7 +274,6 @@ pos += 2 # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz - continuation_bytes += 2 res.append(chr(ordch1)) res.append(chr(ordch2)) res.append(chr(ordch3)) @@ -281,6 +282,7 @@ if ordch1 <= 0xF4: if (pos + 3) > end: if not final: + pos -= 1 break r, pos = errorhandler(errors, "utf8", "unexpected end of data", s, pos - 1, pos) @@ -312,15 +314,12 @@ res.append(chr(ordch2)) res.append(chr(ordch3)) res.append(chr(ordch4)) - continuation_bytes += 3 continue r, pos = errorhandler(errors, "utf8", "invalid start byte", s, pos - 1, pos) res.append(r) - assert pos == end - assert pos - continuation_bytes >= 0 r = res.build() lgt, flag = rutf8.check_utf8(r, True) return r, pos, lgt, flag @@ -352,19 +351,14 @@ else: # when we get here, chr is a 32-bit unicode character if chr > 0x10ffff: - UUU message = "illegal Unicode character" res, pos = errorhandler(errors, encoding, message, s, pos-2, pos+digits) + size, flag = rutf8.check_utf8(res) builder.append(res) else: rutf8.unichr_as_utf8_append(builder, chr, True) - if chr <= 0x7f: - flag = rutf8.FLAG_ASCII - elif 0xd800 <= chr <= 0xdfff: - flag = rutf8.FLAG_HAS_SURROGATES - else: - flag = rutf8.FLAG_REGULAR + flag = get_flag_from_code(chr) pos += digits size = 1 @@ -508,22 +502,22 @@ builder.append(res) continue pos = look + 1 - XXX - if code <= MAXUNICODE: - builder.append(UNICHR(code)) - else: - code -= 0x10000L - builder.append(unichr(0xD800 + (code >> 10))) - builder.append(unichr(0xDC00 + (code & 0x03FF))) + outsize += 1 + flag = combine_flags(flag, get_flag_from_code(code)) + rutf8.unichr_as_utf8_append(builder, code) else: - YYY res, pos = errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) + newsize, newflag = rutf8.check_utf8(res, True) + flag = combine_flags(flag, newflag) + outsize += newsize builder.append(res) else: - AAA res, pos = errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) + newsize, newflag = rutf8.check_utf8(res, True) + flag = combine_flags(flag, newflag) + outsize += newsize builder.append(res) else: builder.append('\\') @@ -602,7 +596,7 @@ for i in range(zeros-1, -1, -1): result.append(TABLE[(char >> (4 * i)) & 0x0f]) -def utf8_encode_raw_unicode_escape(s, errors, errorhandler=None): +def utf8_encode_raw_unicode_escape(s, errors, errorhandler): # errorhandler is not used: this function cannot cause Unicode errors size = len(s) if size == 0: @@ -621,7 +615,7 @@ return result.build() -def utf8_encode_unicode_escape(s, errors): +def utf8_encode_unicode_escape(s, errors, errorhandler): return _utf8_encode_unicode_escape(s) # ____________________________________________________________ @@ -851,7 +845,7 @@ assert final_length >= 0 return result.build()[:final_length], pos, outsize, flag -def utf8_encode_utf_7(s, errors, errorhandler=None): +def utf8_encode_utf_7(s, errors, errorhandler): size = len(s) if size == 0: return '' @@ -1294,3 +1288,153 @@ errorhandler=None, allow_surrogates=True): return unicode_encode_utf_32_helper(s, errors, errorhandler, allow_surrogates, "little") + +# ____________________________________________________________ +# unicode-internal + +def str_decode_unicode_internal(s, errors, final=False, + errorhandler=None): + size = len(s) + if size == 0: + return '', 0, 0, rutf8.FLAG_ASCII + + unicode_bytes = 4 + if BYTEORDER == "little": + start = 0 + stop = unicode_bytes + step = 1 + else: + start = unicode_bytes - 1 + stop = -1 + step = -1 + + result = StringBuilder(size) + pos = 0 + while pos < size: + if pos > size - unicode_bytes: + res, pos = errorhandler(errors, "unicode_internal", + "truncated input", + s, pos, size) + result.append(res) + if pos > size - unicode_bytes: + break + continue + t = r_uint(0) + h = 0 + for j in range(start, stop, step): + t += r_uint(ord(s[pos + j])) << (h*8) + h += 1 + if t > 0x10ffff: + res, pos = errorhandler(errors, "unicode_internal", + "unichr(%d) not in range" % (t,), + s, pos, pos + unicode_bytes) + result.append(res) + continue + rutf8.unichr_as_utf8_append(result, intmask(t)) + pos += unicode_bytes + r = result.build() + lgt, flag = rutf8.check_utf8(r, True) + return r, pos, lgt, flag + +def utf8_encode_unicode_internal(s, errors, errorhandler): + size = len(s) + if size == 0: + return '' + + result = StringBuilder(size * 4) + pos = 0 + while pos < size: + oc = rutf8.codepoint_at_pos(s, pos) + if BYTEORDER == "little": + result.append(chr(oc & 0xFF)) + result.append(chr(oc >> 8 & 0xFF)) + result.append(chr(oc >> 16 & 0xFF)) + result.append(chr(oc >> 24 & 0xFF)) + else: + result.append(chr(oc >> 24 & 0xFF)) + result.append(chr(oc >> 16 & 0xFF)) + result.append(chr(oc >> 8 & 0xFF)) + result.append(chr(oc & 0xFF)) + pos = rutf8.next_codepoint_pos(s, pos) + + return result.build() + +# ____________________________________________________________ +# Charmap + +ERROR_CHAR = u'\ufffe'.encode('utf8') + +@specialize.argtype(4) +def str_decode_charmap(s, errors, final=False, + errorhandler=None, mapping=None): + "mapping can be a rpython dictionary, or a dict-like object." + + # Default to Latin-1 + if mapping is None: + return str_decode_latin_1(s, errors, final=final, + errorhandler=errorhandler) + size = len(s) + if size == 0: + return '', 0, 0, rutf8.FLAG_ASCII + + pos = 0 + result = StringBuilder(size) + while pos < size: + ch = s[pos] + + c = mapping.get(ch, ERROR_CHAR) + if c == ERROR_CHAR: + r, pos = errorhandler(errors, "charmap", + "character maps to <undefined>", + s, pos, pos + 1) + result.append(r) + continue + result.append(c) + pos += 1 + r = result.build() + lgt, flag = rutf8.check_utf8(r, True) + return r, pos, lgt, flag + +def utf8_encode_charmap(s, errors, errorhandler=None, + mapping=None): + YYY + if mapping is None: + return unicode_encode_latin_1(s, size, errors, + errorhandler=errorhandler) + + if errorhandler is None: + errorhandler = default_unicode_error_encode + + if size == 0: + return '' + result = StringBuilder(size) + pos = 0 + while pos < size: + ch = s[pos] + + c = mapping.get(ch, '') + if len(c) == 0: + # collect all unencodable chars. Important for narrow builds. + collend = pos + 1 + while collend < size and mapping.get(s[collend], '') == '': + collend += 1 + ru, rs, pos = errorhandler(errors, "charmap", + "character maps to <undefined>", + s, pos, collend) + if rs is not None: + # py3k only + result.append(rs) + continue + for ch2 in ru: + c2 = mapping.get(ch2, '') + if len(c2) == 0: + errorhandler( + "strict", "charmap", + "character maps to <undefined>", + s, pos, pos + 1) + result.append(c2) + continue + result.append(c) + pos += 1 + return result.build() + diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -1,7 +1,6 @@ from rpython.rlib import jit, rutf8 from rpython.rlib.objectmodel import we_are_translated, not_rpython from rpython.rlib.rstring import UnicodeBuilder -from rpython.rlib.runicode import code_to_unichr, MAXUNICODE from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault @@ -563,14 +562,14 @@ if space.isinstance_w(w_ch, space.w_unicode): # Charmap may return a unicode string - return space.unicode_w(w_ch) + return space.utf8_w(w_ch) elif space.isinstance_w(w_ch, space.w_int): # Charmap may return a number x = space.int_w(w_ch) if not 0 <= x <= 0x10FFFF: raise oefmt(space.w_TypeError, "character mapping must be in range(0x110000)") - return code_to_unichr(x) + return rutf8.unichr_as_utf8(x) elif space.is_w(w_ch, space.w_None): # Charmap may return None return errorchar @@ -614,12 +613,13 @@ @unwrap_spec(string='bufferstr', errors='text_or_none') def charmap_decode(space, string, errors="strict", w_mapping=None): - from pypy.interpreter.unicodehelper import DecodeWrapper + from pypy.interpreter import unicodehelper if errors is None: errors = 'strict' if len(string) == 0: - return space.newtuple([space.newunicode(u''), space.newint(0)]) + return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII), + space.newint(0)]) if space.is_none(w_mapping): mapping = None @@ -628,14 +628,14 @@ final = True state = space.fromcache(CodecState) - result, consumed = runicode.str_decode_charmap( - string, len(string), errors, - final, DecodeWrapper(state.decode_error_handler).handle, mapping) - return space.newtuple([space.newunicode(result), space.newint(consumed)]) + result, consumed, lgt, flag = unicodehelper.str_decode_charmap( + string, errors, final, state.decode_error_handler, mapping) + return space.newtuple([space.newutf8(result, lgt, flag), + space.newint(consumed)]) @unwrap_spec(utf8='utf8', errors='text_or_none') def charmap_encode(space, utf8, errors="strict", w_mapping=None): - from pypy.interpreter.unicodehelper import EncodeWrapper + from pypy.interpreter import unicodehelper if errors is None: errors = 'strict' @@ -645,10 +645,8 @@ mapping = Charmap_Encode(space, w_mapping) state = space.fromcache(CodecState) - uni = utf8.decode('utf8') - result = runicode.unicode_encode_charmap( - uni, len(uni), errors, - EncodeWrapper(state.encode_error_handler).handle, mapping) + result = unicodehelper.unicode_encode_charmap( + utf8, errors, state.encode_error_handler, mapping) return space.newtuple([space.newbytes(result), space.newint(len(uni))]) @@ -707,7 +705,7 @@ @unwrap_spec(errors='text_or_none') def unicode_internal_decode(space, w_string, errors="strict"): - from pypy.interpreter.unicodehelper import DecodeWrapper + from pypy.interpreter import unicodehelper if errors is None: errors = 'strict' @@ -718,14 +716,16 @@ string = space.readbuf_w(w_string).as_str() if len(string) == 0: - return space.newtuple([space.newunicode(u''), space.newint(0)]) + return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII), + space.newint(0)]) final = True state = space.fromcache(CodecState) - result, consumed = runicode.str_decode_unicode_internal( - string, len(string), errors, - final, DecodeWrapper(state.decode_error_handler).handle) - return space.newtuple([space.newunicode(result), space.newint(consumed)]) + result, consumed, lgt, flag = unicodehelper.str_decode_unicode_internal( + string, errors, + final, state.decode_error_handler) + return space.newtuple([space.newutf8(result, lgt, flag), + space.newint(consumed)]) # ____________________________________________________________ # support for the "string escape" codec diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -15,7 +15,6 @@ 'utf-32', 'utf-32-le', 'utf-32-be', 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'): - print encoding assert unicode(u.encode(encoding),encoding) == u def test_ucs4(self): diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py --- a/pypy/module/exceptions/interp_exceptions.py +++ b/pypy/module/exceptions/interp_exceptions.py @@ -285,7 +285,7 @@ def descr_init(self, space, w_object, w_start, w_end, w_reason): # typechecking - space.realunicode_w(w_object) + space.utf8_w(w_object) space.int_w(w_start) space.int_w(w_end) space.realtext_w(w_reason) @@ -719,7 +719,7 @@ def descr_init(self, space, w_encoding, w_object, w_start, w_end, w_reason): # typechecking space.realtext_w(w_encoding) - space.realunicode_w(w_object) # XXX realutf8()? + space.utf8_w(w_object) space.int_w(w_start) space.int_w(w_end) space.realtext_w(w_reason) diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py --- a/pypy/objspace/std/formatting.py +++ b/pypy/objspace/std/formatting.py @@ -432,8 +432,7 @@ def fmt_s(self, w_value): space = self.space - got_unicode = space.isinstance_w(w_value, - space.w_unicode) + got_unicode = space.isinstance_w(w_value, space.w_unicode) if not do_unicode: if got_unicode: raise NeedUnicodeFormattingError diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -164,9 +164,9 @@ if isinstance(x, str): return self.newtext(x) if isinstance(x, unicode): - from pypy.interpreter import unicodehelper - return self.newutf8(x.encode('utf8'), len(x), - unicodehelper._get_flag(x)) + x = x.encode('utf8') + lgt, flag = rutf8.check_utf8(x, True) + return self.newutf8(x, lgt, flag) if isinstance(x, float): return W_FloatObject(x) if isinstance(x, W_Root): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit