Author: Ronan Lamy <ronan.l...@gmail.com> Branch: py3.5 Changeset: r94591:2396fb397495 Date: 2018-05-14 22:34 +0100 http://bitbucket.org/pypy/pypy/changeset/2396fb397495/
Log: hg merge default diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -168,6 +168,222 @@ return decode_utf8(space, string, allow_surrogates=True) # ____________________________________________________________ +# utf-16 + +def str_decode_utf_16(s, size, errors, final=True, + errorhandler=None): + result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final, + errorhandler, "native", + 'utf-16-' + BYTEORDER2) + return result, length + +def str_decode_utf_16_be(s, size, errors, final=True, + errorhandler=None): + result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final, + errorhandler, "big", + 'utf-16-be') + return result, length + +def str_decode_utf_16_le(s, size, errors, final=True, + errorhandler=None): + result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final, + errorhandler, "little", + 'utf-16-le') + return result, length + +def str_decode_utf_16_helper(s, size, errors, final=True, + errorhandler=None, + byteorder="native", + public_encoding_name='utf16'): + if errorhandler is None: + errorhandler = default_unicode_error_decode + bo = 0 + + if BYTEORDER == 'little': + ihi = 1 + ilo = 0 + else: + ihi = 0 + ilo = 1 + + # Check for BOM marks (U+FEFF) in the input and adjust current + # byte order setting accordingly. In native mode, the leading BOM + # mark is skipped, in all other modes, it is copied to the output + # stream as-is (giving a ZWNBSP character). + pos = 0 + if byteorder == 'native': + if size >= 2: + bom = (ord(s[ihi]) << 8) | ord(s[ilo]) + if BYTEORDER == 'little': + if bom == 0xFEFF: + pos += 2 + bo = -1 + elif bom == 0xFFFE: + pos += 2 + bo = 1 + else: + if bom == 0xFEFF: + pos += 2 + bo = 1 + elif bom == 0xFFFE: + pos += 2 + bo = -1 + elif byteorder == 'little': + bo = -1 + else: + bo = 1 + if size == 0: + return u'', 0, bo + if bo == -1: + # force little endian + ihi = 1 + ilo = 0 + + elif bo == 1: + # force big endian + ihi = 0 + ilo = 1 + + result = UnicodeBuilder(size // 2) + + #XXX I think the errors are not correctly handled here + while pos < size: + # remaining bytes at the end? (size should be even) + if len(s) - pos < 2: + if not final: + break + r, pos = errorhandler(errors, public_encoding_name, + "truncated data", + s, pos, len(s)) + result.append(r) + if len(s) - pos < 2: + break + ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo]) + pos += 2 + if ch < 0xD800 or ch > 0xDFFF: + result.append(unichr(ch)) + continue + # UTF-16 code pair: + if len(s) - pos < 2: + pos -= 2 + if not final: + break + errmsg = "unexpected end of data" + r, pos = errorhandler(errors, public_encoding_name, + errmsg, s, pos, len(s)) + result.append(r) + if len(s) - pos < 2: + break + elif 0xD800 <= ch <= 0xDBFF: + ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo]) + pos += 2 + if 0xDC00 <= ch2 <= 0xDFFF: + if MAXUNICODE < 65536: + result.append(unichr(ch)) + result.append(unichr(ch2)) + else: + result.append(UNICHR((((ch & 0x3FF)<<10) | + (ch2 & 0x3FF)) + 0x10000)) + continue + else: + r, pos = errorhandler(errors, public_encoding_name, + "illegal UTF-16 surrogate", + s, pos - 4, pos - 2) + result.append(r) + else: + r, pos = errorhandler(errors, public_encoding_name, + "illegal encoding", + s, pos - 2, pos) + result.append(r) + return result.build(), pos, bo + +def _STORECHAR(result, CH, byteorder): + hi = chr(((CH) >> 8) & 0xff) + lo = chr((CH) & 0xff) + if byteorder == 'little': + result.append(lo) + result.append(hi) + else: + result.append(hi) + result.append(lo) + +def unicode_encode_utf_16_helper(s, size, errors, + errorhandler=None, + allow_surrogates=True, + byteorder='little', + public_encoding_name='utf16'): + if errorhandler is None: + errorhandler = default_unicode_error_encode + if size == 0: + if byteorder == 'native': + result = StringBuilder(2) + _STORECHAR(result, 0xFEFF, BYTEORDER) + return result.build() + return "" + + result = StringBuilder(size * 2 + 2) + if byteorder == 'native': + _STORECHAR(result, 0xFEFF, BYTEORDER) + byteorder = BYTEORDER + + pos = 0 + while pos < size: + ch = ord(s[pos]) + pos += 1 + + if ch < 0xD800: + _STORECHAR(result, ch, byteorder) + elif ch >= 0x10000: + _STORECHAR(result, 0xD800 | ((ch-0x10000) >> 10), byteorder) + _STORECHAR(result, 0xDC00 | ((ch-0x10000) & 0x3FF), byteorder) + elif ch >= 0xE000 or allow_surrogates: + _STORECHAR(result, ch, byteorder) + else: + ru, rs, pos = errorhandler(errors, public_encoding_name, + 'surrogates not allowed', + s, pos-1, pos) + if rs is not None: + # py3k only + if len(rs) % 2 != 0: + errorhandler('strict', public_encoding_name, + 'surrogates not allowed', + s, pos-1, pos) + result.append(rs) + continue + for ch in ru: + if ord(ch) < 0xD800: + _STORECHAR(result, ord(ch), byteorder) + else: + errorhandler('strict', public_encoding_name, + 'surrogates not allowed', + s, pos-1, pos) + continue + + return result.build() + +def unicode_encode_utf_16(s, size, errors, + errorhandler=None, + allow_surrogates=True): + return unicode_encode_utf_16_helper(s, size, errors, errorhandler, + allow_surrogates, "native", + 'utf-16-' + BYTEORDER2) + +def unicode_encode_utf_16_be(s, size, errors, + errorhandler=None, + allow_surrogates=True): + return unicode_encode_utf_16_helper(s, size, errors, errorhandler, + allow_surrogates, "big", + 'utf-16-be') + +def unicode_encode_utf_16_le(s, size, errors, + errorhandler=None, + allow_surrogates=True): + return unicode_encode_utf_16_helper(s, size, errors, errorhandler, + allow_surrogates, "little", + 'utf-16-le') + + +# ____________________________________________________________ # utf-32 def str_decode_utf_32(s, size, errors, final=True, _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit