Author: Yusuke Tsutsumi <[email protected]>
Branch: fix_test_codecs
Changeset: r94700:8f5146e6c44f
Date: 2018-05-24 20:01 -0700
http://bitbucket.org/pypy/pypy/changeset/8f5146e6c44f/
Log: Copying code out of runicode into unicodehelper, further isolating
the pypy code from rpython
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,12 +1,13 @@
import sys
from pypy.interpreter.error import OperationError, oefmt
-from rpython.rlib.objectmodel import specialize
+from rpython.rlib.objectmodel import specialize, we_are_translated
from rpython.rlib.rarithmetic import intmask
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
-from rpython.rlib import runicode
+from rpython.rlib import runicode, jit, nonconst
from rpython.rlib.runicode import (
default_unicode_error_encode, default_unicode_error_decode,
MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR)
+from rpython.tool.sourcetools import func_with_new_name
_WIN32 = sys.platform == 'win32'
_MACOSX = sys.platform == 'darwin'
@@ -85,7 +86,7 @@
force_replace=False)
elif _MACOSX:
uni = space.unicode_w(w_uni)
- bytes = runicode.unicode_encode_utf_8_impl(
+ bytes = unicode_encode_utf_8_impl(
uni, len(uni), 'surrogateescape',
errorhandler=state.encode_error_handler,
allow_surrogates=False)
@@ -149,11 +150,102 @@
# allowed, either paired or lone. A paired surrogate is considered
# like the non-BMP character it stands for. See also *_utf8sp().
assert isinstance(uni, unicode)
- return runicode.unicode_encode_utf_8(
+ return unicode_encode_utf_8(
uni, len(uni), "strict",
errorhandler=encode_error_handler(space),
allow_surrogates=allow_surrogates)
[email protected]
+def unicode_encode_utf_8(s, size, errors, errorhandler=None,
+ allow_surrogates=False):
+ # In this function, allow_surrogates can be:
+ #
+ # * True: surrogates are always allowed. A valid surrogate pair
+ # is replaced with the non-BMP unicode char it stands for,
+ # which is then encoded as 4 bytes.
+ #
+ # * False: surrogates are always forbidden.
+ #
+ # See also unicode_encode_utf8sp().
+ #
+ if errorhandler is None:
+ errorhandler = default_unicode_error_encode
+ return unicode_encode_utf_8_elidable(s, size, errors, errorhandler,
+ allow_surrogates=allow_surrogates)
+
+def unicode_encode_utf_8_impl(s, size, errors, errorhandler,
+ allow_surrogates=False):
+ assert(size >= 0)
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = ord(s[pos])
+ pos += 1
+ if ch < 0x80:
+ # Encode ASCII
+ result.append(chr(ch))
+ elif ch < 0x0800:
+ # Encode Latin-1
+ result.append(chr((0xc0 | (ch >> 6))))
+ result.append(chr((0x80 | (ch & 0x3f))))
+ else:
+ # Encode UCS2 Unicode ordinals
+ if ch < 0x10000:
+ # Special case: check for high surrogate
+ if 0xD800 <= ch <= 0xDFFF:
+ if pos != size:
+ ch2 = ord(s[pos])
+ # Check for low surrogate and combine the two to
+ # form a UCS4 value
+ if ((allow_surrogates or MAXUNICODE < 65536
+ or is_narrow_host()) and
+ ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF):
+ ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+ assert ch3 >= 0
+ pos += 1
+ _encodeUCS4(result, ch3)
+ continue
+ # note: if the program only ever calls this with
+ # allow_surrogates=True, then we'll never annotate
+ # the following block of code, and errorhandler()
+ # will never be called. This causes RPython
+ # problems. Avoid it with the nonconst hack.
+ if not allow_surrogates or nonconst.NonConstant(False):
+ ru, rs, pos = errorhandler(errors, 'utf8',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ if rs is not None:
+ # py3k only
+ result.append(rs)
+ continue
+ for ch in ru:
+ if ord(ch) < 0x80:
+ result.append(chr(ord(ch)))
+ else:
+ errorhandler('strict', 'utf8',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ continue
+ # else: Fall through and handles isolated high surrogates
+ result.append((chr((0xe0 | (ch >> 12)))))
+ result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+ result.append((chr((0x80 | (ch & 0x3f)))))
+ else:
+ _encodeUCS4(result, ch)
+ return result.build()
+unicode_encode_utf_8_elidable = jit.elidable(
+ func_with_new_name(unicode_encode_utf_8_impl,
+ "unicode_encode_utf_8_elidable"))
+
+
+def _encodeUCS4(result, ch):
+ # Encode UCS4 Unicode ordinals
+ result.append((chr((0xf0 | (ch >> 18)))))
+ result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
+ result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+ result.append((chr((0x80 | (ch & 0x3f)))))
+
+
def encode_utf8sp(space, uni):
# Surrogate-preserving utf-8 encoding. Any surrogate character
# turns into its 3-bytes encoding, whether it is paired or not.
@@ -370,3 +462,7 @@
return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
allow_surrogates, "little",
'utf-32-le')
+
+
+def is_narrow_host():
+ return not we_are_translated() and sys.maxunicode == 0xFFFF
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -10,6 +10,7 @@
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter import unicodehelper
+from pypy.interpreter.unicodehelper import unicode_encode_utf_8_impl
from pypy.module.unicodedata import unicodedb
@@ -737,7 +738,7 @@
# NB. can't call unicode_encode_utf_8() directly because that's
# an @elidable function nowadays. Instead, we need the _impl().
# (The problem is the errorhandler, which calls arbitrary Python.)
- result = runicode.unicode_encode_utf_8_impl(
+ result = unicode_encode_utf_8_impl(
uni, len(uni), errors, state.encode_error_handler,
allow_surrogates=False)
return space.newtuple([space.newbytes(result), space.newint(len(uni))])
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,7 +6,7 @@
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
from rpython.rlib.runicode import (
make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
- unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
+ unicode_encode_ascii, fast_str_decode_ascii,
unicode_encode_utf8_forbid_surrogates, SurrogateError)
from rpython.rlib import jit
@@ -564,7 +564,7 @@
if encoding is None or encoding == 'utf-8':
u = space.unicode_w(w_object)
eh = unicodehelper.encode_error_handler(space)
- return space.newbytes(unicode_encode_utf_8(
+ return space.newbytes(unicodehelper.unicode_encode_utf_8(
u, len(u), errors, errorhandler=eh))
elif encoding == 'ascii':
u = space.unicode_w(w_object)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit