Author: Yusuke Tsutsumi <yus...@tsutsumi.io> Branch: fix_test_codecs Changeset: r94700:8f5146e6c44f Date: 2018-05-24 20:01 -0700 http://bitbucket.org/pypy/pypy/changeset/8f5146e6c44f/
Log: Copying code out of runicode into unicodehelper, further isolating the pypy code from rpython diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1,12 +1,13 @@ import sys from pypy.interpreter.error import OperationError, oefmt -from rpython.rlib.objectmodel import specialize +from rpython.rlib.objectmodel import specialize, we_are_translated from rpython.rlib.rarithmetic import intmask from rpython.rlib.rstring import StringBuilder, UnicodeBuilder -from rpython.rlib import runicode +from rpython.rlib import runicode, jit, nonconst from rpython.rlib.runicode import ( default_unicode_error_encode, default_unicode_error_decode, MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR) +from rpython.tool.sourcetools import func_with_new_name _WIN32 = sys.platform == 'win32' _MACOSX = sys.platform == 'darwin' @@ -85,7 +86,7 @@ force_replace=False) elif _MACOSX: uni = space.unicode_w(w_uni) - bytes = runicode.unicode_encode_utf_8_impl( + bytes = unicode_encode_utf_8_impl( uni, len(uni), 'surrogateescape', errorhandler=state.encode_error_handler, allow_surrogates=False) @@ -149,11 +150,102 @@ # allowed, either paired or lone. A paired surrogate is considered # like the non-BMP character it stands for. See also *_utf8sp(). assert isinstance(uni, unicode) - return runicode.unicode_encode_utf_8( + return unicode_encode_utf_8( uni, len(uni), "strict", errorhandler=encode_error_handler(space), allow_surrogates=allow_surrogates) +@jit.elidable +def unicode_encode_utf_8(s, size, errors, errorhandler=None, + allow_surrogates=False): + # In this function, allow_surrogates can be: + # + # * True: surrogates are always allowed. A valid surrogate pair + # is replaced with the non-BMP unicode char it stands for, + # which is then encoded as 4 bytes. + # + # * False: surrogates are always forbidden. + # + # See also unicode_encode_utf8sp(). + # + if errorhandler is None: + errorhandler = default_unicode_error_encode + return unicode_encode_utf_8_elidable(s, size, errors, errorhandler, + allow_surrogates=allow_surrogates) + +def unicode_encode_utf_8_impl(s, size, errors, errorhandler, + allow_surrogates=False): + assert(size >= 0) + result = StringBuilder(size) + pos = 0 + while pos < size: + ch = ord(s[pos]) + pos += 1 + if ch < 0x80: + # Encode ASCII + result.append(chr(ch)) + elif ch < 0x0800: + # Encode Latin-1 + result.append(chr((0xc0 | (ch >> 6)))) + result.append(chr((0x80 | (ch & 0x3f)))) + else: + # Encode UCS2 Unicode ordinals + if ch < 0x10000: + # Special case: check for high surrogate + if 0xD800 <= ch <= 0xDFFF: + if pos != size: + ch2 = ord(s[pos]) + # Check for low surrogate and combine the two to + # form a UCS4 value + if ((allow_surrogates or MAXUNICODE < 65536 + or is_narrow_host()) and + ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF): + ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000 + assert ch3 >= 0 + pos += 1 + _encodeUCS4(result, ch3) + continue + # note: if the program only ever calls this with + # allow_surrogates=True, then we'll never annotate + # the following block of code, and errorhandler() + # will never be called. This causes RPython + # problems. Avoid it with the nonconst hack. + if not allow_surrogates or nonconst.NonConstant(False): + ru, rs, pos = errorhandler(errors, 'utf8', + 'surrogates not allowed', + s, pos-1, pos) + if rs is not None: + # py3k only + result.append(rs) + continue + for ch in ru: + if ord(ch) < 0x80: + result.append(chr(ord(ch))) + else: + errorhandler('strict', 'utf8', + 'surrogates not allowed', + s, pos-1, pos) + continue + # else: Fall through and handles isolated high surrogates + result.append((chr((0xe0 | (ch >> 12))))) + result.append((chr((0x80 | ((ch >> 6) & 0x3f))))) + result.append((chr((0x80 | (ch & 0x3f))))) + else: + _encodeUCS4(result, ch) + return result.build() +unicode_encode_utf_8_elidable = jit.elidable( + func_with_new_name(unicode_encode_utf_8_impl, + "unicode_encode_utf_8_elidable")) + + +def _encodeUCS4(result, ch): + # Encode UCS4 Unicode ordinals + result.append((chr((0xf0 | (ch >> 18))))) + result.append((chr((0x80 | ((ch >> 12) & 0x3f))))) + result.append((chr((0x80 | ((ch >> 6) & 0x3f))))) + result.append((chr((0x80 | (ch & 0x3f))))) + + def encode_utf8sp(space, uni): # Surrogate-preserving utf-8 encoding. Any surrogate character # turns into its 3-bytes encoding, whether it is paired or not. @@ -370,3 +462,7 @@ return unicode_encode_utf_32_helper(s, size, errors, errorhandler, allow_surrogates, "little", 'utf-32-le') + + +def is_narrow_host(): + return not we_are_translated() and sys.maxunicode == 0xFFFF diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -10,6 +10,7 @@ from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault from pypy.interpreter import unicodehelper +from pypy.interpreter.unicodehelper import unicode_encode_utf_8_impl from pypy.module.unicodedata import unicodedb @@ -737,7 +738,7 @@ # NB. can't call unicode_encode_utf_8() directly because that's # an @elidable function nowadays. Instead, we need the _impl(). # (The problem is the errorhandler, which calls arbitrary Python.) - result = runicode.unicode_encode_utf_8_impl( + result = unicode_encode_utf_8_impl( uni, len(uni), errors, state.encode_error_handler, allow_surrogates=False) return space.newtuple([space.newbytes(result), space.newint(len(uni))]) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -6,7 +6,7 @@ from rpython.rlib.rstring import StringBuilder, UnicodeBuilder from rpython.rlib.runicode import ( make_unicode_escape_function, str_decode_ascii, str_decode_utf_8, - unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii, + unicode_encode_ascii, fast_str_decode_ascii, unicode_encode_utf8_forbid_surrogates, SurrogateError) from rpython.rlib import jit @@ -564,7 +564,7 @@ if encoding is None or encoding == 'utf-8': u = space.unicode_w(w_object) eh = unicodehelper.encode_error_handler(space) - return space.newbytes(unicode_encode_utf_8( + return space.newbytes(unicodehelper.unicode_encode_utf_8( u, len(u), errors, errorhandler=eh)) elif encoding == 'ascii': u = space.unicode_w(w_object) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit