Author: fijal
Branch: unicode-utf8
Changeset: r93087:3e5acb0a1e81
Date: 2017-11-20 11:13 +0100
http://bitbucket.org/pypy/pypy/changeset/3e5acb0a1e81/
Log:	start working on more obscure codecs and completely remove hacks
	that go via UCS4 from unicodehelper. Now unicodehelper no longer
	uses runicode

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,7 +1,9 @@
+import sys
+
 from pypy.interpreter.error import OperationError
 from rpython.rlib.objectmodel import specialize
-from rpython.rlib import runicode, rutf8
-from rpython.rlib.rarithmetic import r_uint
+from rpython.rlib import rutf8
+from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rstring import StringBuilder
 
 from pypy.module._codecs import interp_codecs
@@ -168,47 +170,6 @@
     r = res.build()
     return r
 
-class DecodeWrapper(object):
-    def __init__(self, handler):
-        self.orig = handler
-
-    def handle(self, errors, encoding, msg, s, pos, endpos):
-        return self.orig(errors, encoding, msg, s, pos, endpos)
-
-class EncodeWrapper(object):
-    def __init__(self, handler):
-        self.orig = handler
-
-    def handle(self, errors, encoding, msg, s, pos, endpos):
-        return self.orig(errors, encoding, msg, s.encode("utf8"), pos, endpos)
-
-def setup_new_encoders_legacy(encoding):
-    encoder_name = 'utf8_encode_' + encoding
-    encoder_call_name = 'unicode_encode_' + encoding
-    decoder_name = 'str_decode_' + encoding
-    def encoder(utf8, errors, errorhandler):
-        u = utf8.decode("utf8")
-        w = EncodeWrapper(errorhandler)
-        return getattr(runicode, encoder_call_name)(u, len(u), errors,
-                                                    w.handle)
-    def decoder(s, slen, errors, final, errorhandler):
-        w = DecodeWrapper((errorhandler))
-        u, pos = getattr(runicode, decoder_name)(s, slen, errors, final, w.handle)
-        return u.encode('utf8'), pos, len(u), _get_flag(u)
-    encoder.__name__ = encoder_name
-    decoder.__name__ = decoder_name
-    if encoder_name not in globals():
-        globals()[encoder_name] = encoder
-    if decoder_name not in globals():
-        globals()[decoder_name] = decoder
-
-def setup():
-    for encoding in ['utf_16', 'utf_16_le', 'utf_16_be', 'utf_32_le', 'utf_32',
-                     'utf_32_be', 'unicode_internal']:
-        setup_new_encoders_legacy(encoding)
-
-setup()
-
 def utf8_encode_ascii(utf8, errors, errorhandler):
     """ Don't be confused - this is a slowpath for errors e.g.
     "ignore" or an obscure errorhandler
@@ -618,6 +579,41 @@
     lgt, flag = rutf8.check_utf8(r, True)
     return r, pos, lgt, flag
 
+
+TABLE = '0123456789abcdef'
+
+def raw_unicode_escape_helper(result, char):
+    if char >= 0x10000 or char < 0:
+        result.append("\\U")
+        zeros = 8
+    elif char >= 0x100:
+        result.append("\\u")
+        zeros = 4
+    else:
+        result.append("\\x")
+        zeros = 2
+    for i in range(zeros-1, -1, -1):
+        result.append(TABLE[(char >> (4 * i)) & 0x0f])
+
+def utf8_encode_raw_unicode_escape(s, errors, errorhandler=None):
+    # errorhandler is not used: this function cannot cause Unicode errors
+    size = len(s)
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        oc = ord(s[pos])
+
+        if oc < 0x100:
+            result.append(chr(oc))
+        else:
+            raw_unicode_escape_helper(result, oc)
+        pos += 1
+
+    return result.build()
+
+
 # ____________________________________________________________
 # utf-7
@@ -896,3 +892,395 @@
         result.append('-')
     return result.build()
+
+# ____________________________________________________________
+# utf-16
+
+BYTEORDER = sys.byteorder
+BYTEORDER2 = BYTEORDER[0] + 'e' # either "le" or "be"
+assert BYTEORDER2 in ('le', 'be')
+
+def str_decode_utf_16(s, errors, final=True,
+                      errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+                                                       errorhandler, "native")
+    return result, c, lgt, flag
+
+def str_decode_utf_16_be(s, errors, final=True,
+                         errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+                                                       errorhandler, "big")
+    return result, c, lgt, flag
+
+def str_decode_utf_16_le(s, errors, final=True,
+                         errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+                                                       errorhandler, "little")
+    return result, c, lgt, flag
+
+def str_decode_utf_16_helper(s, errors, final=True,
+                             errorhandler=None,
+                             byteorder="native",
+                             public_encoding_name='utf16'):
+    size = len(s)
+    bo = 0
+
+    if BYTEORDER == 'little':
+        ihi = 1
+        ilo = 0
+    else:
+        ihi = 0
+        ilo = 1
+
+    # Check for BOM marks (U+FEFF) in the input and adjust current
+    # byte order setting accordingly. In native mode, the leading BOM
+    # mark is skipped, in all other modes, it is copied to the output
+    # stream as-is (giving a ZWNBSP character).
+    pos = 0
+    if byteorder == 'native':
+        if size >= 2:
+            bom = (ord(s[ihi]) << 8) | ord(s[ilo])
+            if BYTEORDER == 'little':
+                if bom == 0xFEFF:
+                    pos += 2
+                    bo = -1
+                elif bom == 0xFFFE:
+                    pos += 2
+                    bo = 1
+            else:
+                if bom == 0xFEFF:
+                    pos += 2
+                    bo = 1
+                elif bom == 0xFFFE:
+                    pos += 2
+                    bo = -1
+    elif byteorder == 'little':
+        bo = -1
+    else:
+        bo = 1
+    if size == 0:
+        return u'', 0, bo
+    if bo == -1:
+        # force little endian
+        ihi = 1
+        ilo = 0
+
+    elif bo == 1:
+        # force big endian
+        ihi = 0
+        ilo = 1
+
+    result = StringBuilder(size // 2)
+
+    #XXX I think the errors are not correctly handled here
+    while pos < size:
+        # remaining bytes at the end? (size should be even)
+        if len(s) - pos < 2:
+            if not final:
+                break
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "truncated data",
+                                  s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 2:
+                break
+        ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
+        pos += 2
+        if ch < 0xD800 or ch > 0xDFFF:
+            rutf8.unichr_as_utf8_append(result, ch)
+            continue
+        # UTF-16 code pair:
+        if len(s) - pos < 2:
+            pos -= 2
+            if not final:
+                break
+            errmsg = "unexpected end of data"
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  errmsg, s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 2:
+                break
+        elif 0xD800 <= ch <= 0xDBFF:
+            ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
+            pos += 2
+            if 0xDC00 <= ch2 <= 0xDFFF:
+                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
+                rutf8.unichr_as_utf8_append(result, ch)
+                continue
+            else:
+                r, pos = errorhandler(errors, public_encoding_name,
+                                      "illegal UTF-16 surrogate",
+                                      s, pos - 4, pos - 2)
+                result.append(r)
+        else:
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "illegal encoding",
+                                  s, pos - 2, pos)
+            result.append(r)
+    r = result.build()
+    lgt, flag = rutf8.check_utf8(r, True)
+    return result.build(), pos, lgt, flag, bo
+
+def _STORECHAR(result, CH, byteorder):
+    hi = chr(((CH) >> 8) & 0xff)
+    lo = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(lo)
+        result.append(hi)
+    else:
+        result.append(hi)
+        result.append(lo)
+
+def unicode_encode_utf_16_helper(s, errors,
+                                 errorhandler=None,
+                                 allow_surrogates=True,
+                                 byteorder='little',
+                                 public_encoding_name='utf16'):
+    size = len(s)
+    if size == 0:
+        if byteorder == 'native':
+            result = StringBuilder(2)
+            _STORECHAR(result, 0xFEFF, BYTEORDER)
+            return result.build()
+        return ""
+
+    result = StringBuilder(size * 2 + 2)
+    if byteorder == 'native':
+        _STORECHAR(result, 0xFEFF, BYTEORDER)
+        byteorder = BYTEORDER
+
+    pos = 0
+    while pos < size:
+        ch = rutf8.codepoint_at_pos(s, pos)
+        pos = rutf8.next_codepoint_pos(s, pos)
+
+        if ch < 0xD800:
+            _STORECHAR(result, ch, byteorder)
+        elif ch >= 0x10000:
+            _STORECHAR(result, 0xD800 | ((ch-0x10000) >> 10), byteorder)
+            _STORECHAR(result, 0xDC00 | ((ch-0x10000) & 0x3FF), byteorder)
+        elif ch >= 0xE000 or allow_surrogates:
+            _STORECHAR(result, ch, byteorder)
+        else:
+            ru, pos = errorhandler(errors, public_encoding_name,
+                                   'surrogates not allowed',
+                                   s, pos-1, pos)
+            xxx
+            #if rs is not None:
+            #    # py3k only
+            #    if len(rs) % 2 != 0:
+            #        errorhandler('strict', public_encoding_name,
+            #                     'surrogates not allowed',
+            #                     s, pos-1, pos)
+            #    result.append(rs)
+            #    continue
+            for ch in ru:
+                if ord(ch) < 0xD800:
+                    _STORECHAR(result, ord(ch), byteorder)
+                else:
+                    errorhandler('strict', public_encoding_name,
+                                 'surrogates not allowed',
+                                 s, pos-1, pos)
+            continue
+
+    return result.build()
+
+def utf8_encode_utf_16(s, errors,
+                       errorhandler=None,
+                       allow_surrogates=True):
+    return unicode_encode_utf_16_helper(s, errors, errorhandler,
+                                        allow_surrogates, "native")
+
+def utf8_encode_utf_16_be(s, errors,
+                          errorhandler=None,
+                          allow_surrogates=True):
+    return unicode_encode_utf_16_helper(s, errors, errorhandler,
+                                        allow_surrogates, "big")
+
+def utf8_encode_utf_16_le(s, errors,
+                          errorhandler=None,
+                          allow_surrogates=True):
+    return unicode_encode_utf_16_helper(s, errors, errorhandler,
+                                        allow_surrogates, "little")
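For orientation, a standalone worked example (plain Python, not part of the changeset) of the surrogate-pair arithmetic shared by str_decode_utf_16_helper and unicode_encode_utf_16_helper above, using U+1F600:

    # Encoding: a code point above 0xFFFF is split into two 16-bit units.
    ch = 0x1F600
    hi = 0xD800 | ((ch - 0x10000) >> 10)    # lead surrogate
    lo = 0xDC00 | ((ch - 0x10000) & 0x3FF)  # trail surrogate
    assert (hi, lo) == (0xD83D, 0xDE00)

    # _STORECHAR then emits each unit byte by byte; little-endian order
    # gives the byte values 3D D8 00 DE for this character.
    assert [hi & 0xFF, hi >> 8, lo & 0xFF, lo >> 8] == [0x3D, 0xD8, 0x00, 0xDE]

    # Decoding: the inverse formula used in str_decode_utf_16_helper.
    assert (((hi & 0x3FF) << 10) | (lo & 0x3FF)) + 0x10000 == ch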
+
+# ____________________________________________________________
+# utf-32
+
+def str_decode_utf_32(s, errors, final=True,
+                      errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+                                                       errorhandler, "native")
+    return result, c, lgt, flag
+
+def str_decode_utf_32_be(s, errors, final=True,
+                         errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+                                                       errorhandler, "big")
+    return result, c, lgt, flag
+
+def str_decode_utf_32_le(s, errors, final=True,
+                         errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+                                                       errorhandler, "little")
+    return result, c, lgt, flag
+
+BOM32_DIRECT = intmask(0x0000FEFF)
+BOM32_REVERSE = intmask(0xFFFE0000)
+
+def str_decode_utf_32_helper(s, errors, final=True,
+                             errorhandler=None,
+                             byteorder="native",
+                             public_encoding_name='utf32'):
+    bo = 0
+    size = len(s)
+
+    if BYTEORDER == 'little':
+        iorder = [0, 1, 2, 3]
+    else:
+        iorder = [3, 2, 1, 0]
+
+    # Check for BOM marks (U+FEFF) in the input and adjust current
+    # byte order setting accordingly. In native mode, the leading BOM
+    # mark is skipped, in all other modes, it is copied to the output
+    # stream as-is (giving a ZWNBSP character).
+    pos = 0
+    if byteorder == 'native':
+        if size >= 4:
+            bom = intmask(
+                (ord(s[iorder[3]]) << 24) | (ord(s[iorder[2]]) << 16) |
+                (ord(s[iorder[1]]) << 8) | ord(s[iorder[0]]))
+            if BYTEORDER == 'little':
+                if bom == BOM32_DIRECT:
+                    pos += 4
+                    bo = -1
+                elif bom == BOM32_REVERSE:
+                    pos += 4
+                    bo = 1
+            else:
+                if bom == BOM32_DIRECT:
+                    pos += 4
+                    bo = 1
+                elif bom == BOM32_REVERSE:
+                    pos += 4
+                    bo = -1
+    elif byteorder == 'little':
+        bo = -1
+    else:
+        bo = 1
+    if size == 0:
+        return u'', 0, bo
+    if bo == -1:
+        # force little endian
+        iorder = [0, 1, 2, 3]
+
+    elif bo == 1:
+        # force big endian
+        iorder = [3, 2, 1, 0]
+
+    result = StringBuilder(size // 4)
+
+    while pos < size:
+        # remaining bytes at the end? (size should be divisible by 4)
+        if len(s) - pos < 4:
+            if not final:
+                break
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "truncated data",
+                                  s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 4:
+                break
+            continue
+        ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
+              (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
+        if ch >= 0x110000:
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "codepoint not in range(0x110000)",
+                                  s, pos, len(s))
+            result.append(r)
+            continue
+
+        rutf8.unichr_as_utf8_append(result, ch)
+        pos += 4
+    r = result.build()
+    lgt, flag = rutf8.check_utf8(r, True)
+    return r, pos, lgt, flag, bo
+
+def _STORECHAR32(result, CH, byteorder):
+    c0 = chr(((CH) >> 24) & 0xff)
+    c1 = chr(((CH) >> 16) & 0xff)
+    c2 = chr(((CH) >> 8) & 0xff)
+    c3 = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(c3)
+        result.append(c2)
+        result.append(c1)
+        result.append(c0)
+    else:
+        result.append(c0)
+        result.append(c1)
+        result.append(c2)
+        result.append(c3)
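A standalone note on the byte-order handling (plain Python, not part of the changeset): a UTF-32 unit is four bytes, and the BOM constants above exist because reading a stream with the wrong byte order turns U+FEFF into 0xFFFE0000.

    # The BOM U+FEFF serialised in both byte orders, as _STORECHAR32 emits it.
    cp = 0xFEFF
    be = [(cp >> 24) & 0xFF, (cp >> 16) & 0xFF, (cp >> 8) & 0xFF, cp & 0xFF]
    le = list(reversed(be))
    assert be == [0x00, 0x00, 0xFE, 0xFF]
    assert le == [0xFF, 0xFE, 0x00, 0x00]

    # A big-endian read of the little-endian stream yields BOM32_REVERSE,
    # which is how str_decode_utf_32_helper detects that it must flip.
    assert ((le[0] << 24) | (le[1] << 16) | (le[2] << 8) | le[3]) == 0xFFFE0000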
+
+def unicode_encode_utf_32_helper(s, errors,
+                                 errorhandler=None,
+                                 allow_surrogates=True,
+                                 byteorder='little',
+                                 public_encoding_name='utf32'):
+    size = len(s)
+    if size == 0:
+        if byteorder == 'native':
+            result = StringBuilder(4)
+            _STORECHAR32(result, 0xFEFF, BYTEORDER)
+            return result.build()
+        return ""
+
+    result = StringBuilder(size * 4 + 4)
+    if byteorder == 'native':
+        _STORECHAR32(result, 0xFEFF, BYTEORDER)
+        byteorder = BYTEORDER
+
+    pos = 0
+    while pos < size:
+        ch = rutf8.codepoint_at_pos(s, pos)
+        pos = rutf8.next_codepoint_pos(s, pos)
+        ch2 = 0
+        if not allow_surrogates and 0xD800 <= ch < 0xE000:
+            ru, pos = errorhandler(errors, public_encoding_name,
+                                   'surrogates not allowed',
+                                   s, pos-1, pos)
+            XXX
+            if rs is not None:
+                # py3k only
+                if len(rs) % 4 != 0:
+                    errorhandler('strict', public_encoding_name,
+                                 'surrogates not allowed',
+                                 s, pos-1, pos)
+                result.append(rs)
+                continue
+            for ch in ru:
+                if ord(ch) < 0xD800:
+                    _STORECHAR32(result, ord(ch), byteorder)
+                else:
+                    errorhandler('strict', public_encoding_name,
+                                 'surrogates not allowed',
+                                 s, pos-1, pos)
+            continue
+        _STORECHAR32(result, ch, byteorder)
+
+    return result.build()
+
+def utf8_encode_utf_32(s, errors,
+                       errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, errors, errorhandler,
+                                        allow_surrogates, "native")
+
+def utf8_encode_utf_32_be(s, errors,
+                          errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, errors, errorhandler,
+                                        allow_surrogates, "big")
+
+def utf8_encode_utf_32_le(s, errors,
+                          errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, errors, errorhandler,
+                                        allow_surrogates, "little")
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -30,6 +30,10 @@
                   endpos):
         """Generic wrapper for calling into error handlers.
 
+        Note that error handler receives and returns position into
+        the unicode characters, not into the position of utf8 bytes,
+        so it needs to be converted by the codec
+
         Returns (unicode_or_none, str_or_none, newpos) as error
         handlers may return unicode or on Python 3, bytes.
         """
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -15,6 +15,7 @@
                          'utf-32', 'utf-32-le', 'utf-32-be',
                          'raw_unicode_escape',
                          'unicode_escape', 'unicode_internal'):
+            print encoding
             assert unicode(u.encode(encoding),encoding) == u
 
     def test_ucs4(self):
@@ -115,10 +116,10 @@
         raises(TypeError, charmap_decode, '\xff', "strict", {0xff: 0x110000})
         assert (charmap_decode("\x00\x01\x02", "strict",
                                {0: 0x10FFFF, 1: ord('b'), 2: ord('c')}) ==
-                u"\U0010FFFFbc", 3)
+                (u"\U0010FFFFbc", 3))
         assert (charmap_decode("\x00\x01\x02", "strict",
                                {0: u'\U0010FFFF', 1: u'b', 2: u'c'}) ==
-                u"\U0010FFFFbc", 3)
+                (u"\U0010FFFFbc", 3))
 
     def test_escape_decode_errors(self):
         from _codecs import escape_decode as decode
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1194,7 +1194,7 @@
             assert False, "always raises"
         return space.newbytes(s)
     if ((encoding is None and space.sys.defaultencoding == 'utf8') or
-         encoding == 'utf-8' or encoding == 'utf8'):
+         encoding == 'utf-8' or encoding == 'utf8' or encoding == 'UTF-8'):
         return space.newbytes(space.utf8_w(w_object))
     if w_encoder is None:
         from pypy.module._codecs.interp_codecs import lookup_codec
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit