Author: Yusuke Tsutsumi <yus...@tsutsumi.io>
Branch: fix_test_codecs
Changeset: r94700:8f5146e6c44f
Date: 2018-05-24 20:01 -0700
http://bitbucket.org/pypy/pypy/changeset/8f5146e6c44f/

Log:    Copying code out of runicode into unicodehelper, further isolating
        the pypy code from rpython

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,12 +1,13 @@
 import sys
 from pypy.interpreter.error import OperationError, oefmt
-from rpython.rlib.objectmodel import specialize
+from rpython.rlib.objectmodel import specialize, we_are_translated
 from rpython.rlib.rarithmetic import intmask
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
-from rpython.rlib import runicode
+from rpython.rlib import runicode, jit, nonconst
 from rpython.rlib.runicode import (
     default_unicode_error_encode, default_unicode_error_decode,
     MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR)
+from rpython.tool.sourcetools import func_with_new_name
 
 _WIN32 = sys.platform == 'win32'
 _MACOSX = sys.platform == 'darwin'
@@ -85,7 +86,7 @@
                                     force_replace=False)
     elif _MACOSX:
         uni = space.unicode_w(w_uni)
-        bytes = runicode.unicode_encode_utf_8_impl(
+        bytes = unicode_encode_utf_8_impl(
             uni, len(uni), 'surrogateescape',
             errorhandler=state.encode_error_handler,
             allow_surrogates=False)
@@ -149,11 +150,102 @@
     # allowed, either paired or lone.  A paired surrogate is considered
     # like the non-BMP character it stands for.  See also *_utf8sp().
     assert isinstance(uni, unicode)
-    return runicode.unicode_encode_utf_8(
+    return unicode_encode_utf_8(
         uni, len(uni), "strict",
         errorhandler=encode_error_handler(space),
         allow_surrogates=allow_surrogates)
 
+@jit.elidable
+def unicode_encode_utf_8(s, size, errors, errorhandler=None,
+                         allow_surrogates=False):
+    # In this function, allow_surrogates can be:
+    #
+    #  * True:  surrogates are always allowed.  A valid surrogate pair
+    #           is replaced with the non-BMP unicode char it stands for,
+    #           which is then encoded as 4 bytes.
+    #
+    #  * False: surrogates are always forbidden.
+    #
+    # See also unicode_encode_utf8sp().
+    #
+    if errorhandler is None:
+        errorhandler = default_unicode_error_encode
+    return unicode_encode_utf_8_elidable(s, size, errors, errorhandler,
+                                         allow_surrogates=allow_surrogates)
+
+def unicode_encode_utf_8_impl(s, size, errors, errorhandler,
+                              allow_surrogates=False):
+    assert(size >= 0)
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
+        if ch < 0x80:
+            # Encode ASCII
+            result.append(chr(ch))
+        elif ch < 0x0800:
+            # Encode Latin-1
+            result.append(chr((0xc0 | (ch >> 6))))
+            result.append(chr((0x80 | (ch & 0x3f))))
+        else:
+            # Encode UCS2 Unicode ordinals
+            if ch < 0x10000:
+                # Special case: check for high surrogate
+                if 0xD800 <= ch <= 0xDFFF:
+                    if pos != size:
+                        ch2 = ord(s[pos])
+                        # Check for low surrogate and combine the two to
+                        # form a UCS4 value
+                        if ((allow_surrogates or MAXUNICODE < 65536
+                             or is_narrow_host()) and
+                            ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF):
+                            ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+                            assert ch3 >= 0
+                            pos += 1
+                            _encodeUCS4(result, ch3)
+                            continue
+                    # note: if the program only ever calls this with
+                    # allow_surrogates=True, then we'll never annotate
+                    # the following block of code, and errorhandler()
+                    # will never be called.  This causes RPython
+                    # problems.  Avoid it with the nonconst hack.
+                    if not allow_surrogates or nonconst.NonConstant(False):
+                        ru, rs, pos = errorhandler(errors, 'utf8',
+                                                   'surrogates not allowed',
+                                                   s, pos-1, pos)
+                        if rs is not None:
+                            # py3k only
+                            result.append(rs)
+                            continue
+                        for ch in ru:
+                            if ord(ch) < 0x80:
+                                result.append(chr(ord(ch)))
+                            else:
+                                errorhandler('strict', 'utf8',
+                                             'surrogates not allowed',
+                                             s, pos-1, pos)
+                        continue
+                    # else: Fall through and handles isolated high surrogates
+                result.append((chr((0xe0 | (ch >> 12)))))
+                result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+                result.append((chr((0x80 | (ch & 0x3f)))))
+            else:
+                _encodeUCS4(result, ch)
+    return result.build()
+unicode_encode_utf_8_elidable = jit.elidable(
+    func_with_new_name(unicode_encode_utf_8_impl,
+                       "unicode_encode_utf_8_elidable"))
+
+
+def _encodeUCS4(result, ch):
+    # Encode UCS4 Unicode ordinals
+    result.append((chr((0xf0 | (ch >> 18)))))
+    result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
+    result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+    result.append((chr((0x80 | (ch & 0x3f)))))
+
+
 def encode_utf8sp(space, uni):
     # Surrogate-preserving utf-8 encoding.  Any surrogate character
     # turns into its 3-bytes encoding, whether it is paired or not.
@@ -370,3 +462,7 @@
     return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
                                         allow_surrogates, "little",
                                         'utf-32-le')
+
+
+def is_narrow_host():
+    return not we_are_translated() and sys.maxunicode == 0xFFFF
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -10,6 +10,7 @@
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter import unicodehelper
+from pypy.interpreter.unicodehelper import unicode_encode_utf_8_impl
 from pypy.module.unicodedata import unicodedb
 
 
@@ -737,7 +738,7 @@
     # NB. can't call unicode_encode_utf_8() directly because that's
     # an @elidable function nowadays.  Instead, we need the _impl().
     # (The problem is the errorhandler, which calls arbitrary Python.)
-    result = runicode.unicode_encode_utf_8_impl(
+    result = unicode_encode_utf_8_impl(
         uni, len(uni), errors, state.encode_error_handler,
         allow_surrogates=False)
     return space.newtuple([space.newbytes(result), space.newint(len(uni))])
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,7 +6,7 @@
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib.runicode import (
     make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
-    unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
+    unicode_encode_ascii, fast_str_decode_ascii,
     unicode_encode_utf8_forbid_surrogates, SurrogateError)
 from rpython.rlib import jit
 
@@ -564,7 +564,7 @@
         if encoding is None or encoding == 'utf-8':
             u = space.unicode_w(w_object)
             eh = unicodehelper.encode_error_handler(space)
-            return space.newbytes(unicode_encode_utf_8(
+            return space.newbytes(unicodehelper.unicode_encode_utf_8(
                     u, len(u), errors, errorhandler=eh))
         elif encoding == 'ascii':
             u = space.unicode_w(w_object)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to