Author: fijal
Branch: unicode-utf8
Changeset: r93110:c7109cb7f6be
Date: 2017-11-21 14:03 +0100
http://bitbucket.org/pypy/pypy/changeset/c7109cb7f6be/
Log: fix all the tests in codecs until test_ztranslation diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -173,8 +173,13 @@ r, pos = errorhandler(errors, 'latin1', 'ordinal not in range(256)', s, cur, cur + 1) - res.append(r) for j in range(pos - cur): + c = rutf8.codepoint_at_pos(r, j) + if c > 0xFF: + errorhandler("strict", 'latin1', + 'ordinal not in range(256)', s, + cur, cur + 1) + res.append(chr(c)) i = rutf8.next_codepoint_pos(s, i) cur = pos cur += 1 @@ -200,7 +205,12 @@ msg = "ordinal not in range(128)" r, newpos = errorhandler(errors, 'ascii', msg, utf8, pos, endpos) - for _ in range(newpos - pos): + for j in range(newpos - pos): + c = rutf8.codepoint_at_pos(r, j) + if c > 0x7F: + errorhandler("strict", 'ascii', + 'ordinal not in range(128)', utf8, + pos, pos + 1) i = rutf8.next_codepoint_pos(utf8, i) pos = newpos res.append(r) @@ -364,7 +374,7 @@ message = "illegal Unicode character" res, pos = errorhandler(errors, encoding, message, s, pos-2, pos+digits) - size, flag = rutf8.check_utf8(res) + size, flag = rutf8.check_utf8(res, True) builder.append(res) else: rutf8.unichr_as_utf8_append(builder, chr, True) @@ -778,21 +788,25 @@ if base64bits > 0: # left-over bits if base64bits >= 6: # We've seen at least one base-64 character - aaa pos += 1 msg = "partial character in shift sequence" res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos) + reslen, resflags = rutf8.check_utf8(res, True) + outsize += reslen + flag = combine_flags(flag, resflags) result.append(res) continue else: # Some bits remain; they should be zero if base64buffer != 0: - bbb pos += 1 msg = "non-zero padding bits in shift sequence" res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos) + reslen, resflags = rutf8.check_utf8(res, True) + outsize += reslen + flag = combine_flags(flag, resflags) result.append(res) continue @@ -826,11 +840,13 @@ outsize += 1 pos += 1 
else: - yyy startinpos = pos pos += 1 msg = "unexpected special character" res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos) + reslen, resflags = rutf8.check_utf8(res, True) + outsize += reslen + flag = combine_flags(flag, resflags) result.append(res) # end of string @@ -973,7 +989,7 @@ else: bo = 1 if size == 0: - return u'', 0, bo + return '', 0, 0, rutf8.FLAG_ASCII, bo if bo == -1: # force little endian ihi = 1 @@ -1182,7 +1198,7 @@ else: bo = 1 if size == 0: - return u'', 0, bo + return '', 0, 0, rutf8.FLAG_ASCII, bo if bo == -1: # force little endian iorder = [0, 1, 2, 3] @@ -1409,40 +1425,43 @@ mapping=None): size = len(s) if mapping is None: - return utf8_encode_latin_1(s, size, errors, - errorhandler=errorhandler) + return utf8_encode_latin_1(s, errors, errorhandler=errorhandler) if size == 0: return '' result = StringBuilder(size) pos = 0 + index = 0 while pos < size: ch = rutf8.codepoint_at_pos(s, pos) c = mapping.get(ch, '') if len(c) == 0: # collect all unencodable chars. Important for narrow builds. 
- collend = pos + 1 - while collend < size and mapping.get(s[collend], '') == '': - collend += 1 - rs, pos = errorhandler(errors, "charmap", + collend = rutf8.next_codepoint_pos(s, pos) + endindex = index + 1 + while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '': + collend = rutf8.next_codepoint_pos(s, collend) + endindex += 1 + rs, endindex = errorhandler(errors, "charmap", "character maps to <undefined>", - s, pos, collend) - XXXX - if rs is not None: - # py3k only - result.append(rs) - continue - for ch2 in ru: - c2 = mapping.get(ch2, '') - if len(c2) == 0: + s, index, endindex) + j = 0 + for _ in range(endindex - index): + ch2 = rutf8.codepoint_at_pos(rs, j) + ch2 = mapping.get(ch2, '') + if not ch2: errorhandler( "strict", "charmap", "character maps to <undefined>", - s, pos, pos + 1) - result.append(c2) + s, index, index + 1) + result.append(ch2) + index += 1 + j = rutf8.next_codepoint_pos(rs, j) + pos = rutf8.next_codepoint_pos(s, pos) continue result.append(c) + index += 1 pos = rutf8.next_codepoint_pos(s, pos) return result.build() diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -1,6 +1,6 @@ from rpython.rlib import jit, rutf8 from rpython.rlib.objectmodel import we_are_translated, not_rpython -from rpython.rlib.rstring import UnicodeBuilder +from rpython.rlib.rstring import StringBuilder from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault @@ -241,33 +241,42 @@ "don't know how to handle %T in error callback", w_exc) def backslashreplace_errors(space, w_exc): + from pypy.interpreter import unicodehelper + check_exception(space, w_exc) if space.isinstance_w(w_exc, space.w_UnicodeEncodeError): - obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object'))) + w_obj = space.getattr(w_exc, 
space.newtext('object')) + space.realutf8_w(w_obj) # for errors + w_obj = unicodehelper.convert_arg_to_w_unicode(space, w_obj) start = space.int_w(space.getattr(w_exc, space.newtext('start'))) w_end = space.getattr(w_exc, space.newtext('end')) end = space.int_w(w_end) - builder = UnicodeBuilder() + start = w_obj._index_to_byte(start) + end = w_obj._index_to_byte(end) + builder = StringBuilder() + obj = w_obj._utf8 pos = start while pos < end: - oc = ord(obj[pos]) + oc = rutf8.codepoint_at_pos(obj, pos) num = hex(oc) if (oc >= 0x10000): - builder.append(u"\\U") + builder.append("\\U") zeros = 8 elif (oc >= 0x100): - builder.append(u"\\u") + builder.append("\\u") zeros = 4 else: - builder.append(u"\\x") + builder.append("\\x") zeros = 2 lnum = len(num) nb = zeros + 2 - lnum # num starts with '0x' if nb > 0: - builder.append_multiple_char(u'0', nb) - builder.append_slice(unicode(num), 2, lnum) - pos += 1 - return space.newtuple([space.newunicode(builder.build()), w_end]) + builder.append_multiple_char('0', nb) + builder.append_slice(num, 2, lnum) + pos = rutf8.next_codepoint_pos(obj, pos) + r = builder.build() + lgt, flag = rutf8.check_utf8(r, True) + return space.newtuple([space.newutf8(r, lgt, flag), w_end]) else: raise oefmt(space.w_TypeError, "don't know how to handle %T in error callback", w_exc) @@ -489,7 +498,7 @@ @unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int, w_final=WrappedDefault(False)) def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=None): - from pypy.interpreter.unicodehelper import DecodeWrapper + from pypy.interpreter.unicodehelper import str_decode_utf_16_helper if errors is None: errors = 'strict' @@ -504,16 +513,17 @@ consumed = len(data) if final: consumed = 0 - res, consumed, byteorder = runicode.str_decode_utf_16_helper( - data, len(data), errors, final, - DecodeWrapper(state.decode_error_handler).handle, byteorder) - return space.newtuple([space.newunicode(res), space.newint(consumed), + res, 
consumed, lgt, flag, byteorder = str_decode_utf_16_helper( + data, errors, final, + state.decode_error_handler, byteorder) + return space.newtuple([space.newutf8(res, lgt, flag), + space.newint(consumed), space.newint(byteorder)]) @unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int, w_final=WrappedDefault(False)) def utf_32_ex_decode(space, data, errors='strict', byteorder=0, w_final=None): - from pypy.interpreter.unicodehelper import DecodeWrapper + from pypy.interpreter.unicodehelper import str_decode_utf_32_helper final = space.is_true(w_final) state = space.fromcache(CodecState) @@ -526,10 +536,11 @@ consumed = len(data) if final: consumed = 0 - res, consumed, byteorder = runicode.str_decode_utf_32_helper( - data, len(data), errors, final, - DecodeWrapper(state.decode_error_handler).handle, byteorder) - return space.newtuple([space.newunicode(res), space.newint(consumed), + res, consumed, lgt, flag, byteorder = str_decode_utf_32_helper( + data, errors, final, + state.decode_error_handler, byteorder) + return space.newtuple([space.newutf8(res, lgt, flag), + space.newint(consumed), space.newint(byteorder)]) # ____________________________________________________________ diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -592,11 +592,11 @@ def handler_unicodeinternal(exc): if not isinstance(exc, UnicodeDecodeError): raise TypeError("don't know how to handle %r" % exc) - return (u"\x01", 1) + return (u"\x01", 4) codecs.register_error("test.hui", handler_unicodeinternal) res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui") if sys.maxunicode > 65535: - assert res == u"\u0000\u0001\u0000" # UCS4 build + assert res == u"\u0000\u0001" # UCS4 build else: assert res == u"\x00\x00\x01\x00\x00" # UCS2 build _______________________________________________ pypy-commit mailing list pypy-commit@python.org 
https://mail.python.org/mailman/listinfo/pypy-commit