[pypy-commit] pypy unicode-utf8-py3: fix imports. Tests start to run. str_decode_utf8 replaces decode_utf8 but args have changed

Matti Picus Sun, 10 Jun 2018 22:30:02 -0700

Author: Matti Picus <mtti.pi...@gmail.com>
Branch: unicode-utf8-py3
Changeset: r94753:40650baa7fd6
Date: 2018-06-10 22:20 -0700
http://bitbucket.org/pypy/pypy/changeset/40650baa7fd6/


Log:    fix imports. Tests start to run. str_decode_utf8 replaces
        decode_utf8 but args have changed

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -247,9 +247,7 @@
 
     def unicode_w(self, space):
         self._typed_unwrap_error(space, "string")
-
-    def text_w(self, space):
-        self._typed_unwrap_error(space, "string")
+    realunicode_w = unicode_w
 
     def utf8_w(self, space):
         self._typed_unwrap_error(space, "unicode")
@@ -1732,7 +1730,6 @@
         return rstring.assert_str0(result)
 
     realtext_w = text_w         # Python 2 compatibility
-    realunicode_w = unicode_w
 
     def fsencode(space, w_obj):
         from pypy.interpreter.unicodehelper import fsencode
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -4,7 +4,7 @@
 import struct
 import sys
 from pypy.interpreter.unicodehelper import (
-    encode_utf8, decode_utf8, unicode_encode_utf_32_be, str_decode_utf_32_be)
+    encode_utf8, str_decode_utf8, utf8_encode_utf_32_be, str_decode_utf_32_be)
 from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
 
 
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -304,11 +304,12 @@
                                            errorhandler=errorhandler)
         return res.encode('utf8'), size, len(res)
 
-def str_decode_utf8(s, errors, final, errorhandler):
+def str_decode_utf8(s, errors, final, errorhandler, allow_surrogates=False):
     """ Same as checking for the valid utf8, but we know the utf8 is not
     valid so we're trying to either raise or pack stuff with error handler.
     The key difference is that this is call_may_force
     """
+    # XXX need to handle allow_surrogates
     slen = len(s)
     res = StringBuilder(slen)
     pos = 0
@@ -967,6 +968,32 @@
 
     return result.build()
 
+def encode_utf8(space, uni, allow_surrogates=False):
+    # Note that Python3 tends to forbid *all* surrogates in utf-8.
+    # If allow_surrogates=True, then revert to the Python 2 behavior
+    # which never raises UnicodeEncodeError.  Surrogate pairs are then
+    # allowed, either paired or lone.  A paired surrogate is considered
+    # like the non-BMP character it stands for.  See also *_utf8sp().
+    assert isinstance(uni, unicode)
+    return runicode.unicode_encode_utf_8(
+        uni, len(uni), "strict",
+        errorhandler=encode_error_handler(space),
+        allow_surrogates=allow_surrogates)
+
+def encode_utf8sp(space, uni):
+    # Surrogate-preserving utf-8 encoding.  Any surrogate character
+    # turns into its 3-bytes encoding, whether it is paired or not.
+    # This should always be reversible, and the reverse is
+    # decode_utf8sp().
+    return runicode.unicode_encode_utf8sp(uni, len(uni))
+
+def decode_utf8sp(space, string):
+    # Surrogate-preserving utf-8 decoding.  Assuming there is no
+    # encoding error, it should always be reversible, and the reverse is
+    # encode_utf8sp().
+    return decode_utf8(space, string, allow_surrogates=True)
+
+
 # ____________________________________________________________
 # utf-16
 
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -86,7 +86,7 @@
                 newpos = -1
             else:
                 if newpos < 0:
-                newpos = length + newpos
+                    newpos = length + newpos
             if newpos < 0 or newpos > length:
                 raise oefmt(space.w_IndexError,
                             "position %d from error handler out of bounds",
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -12,7 +12,7 @@
 from pypy.interpreter.mixedmodule import MixedModule
 from pypy.interpreter.signature import Signature
 from pypy.interpreter.typedef import TypeDef
-from pypy.interpreter.unicodehelper import decode_utf8
+from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.objspace.std.util import negate
 
 
@@ -1184,7 +1184,7 @@
     # we should implement the same shortcuts as we do for BytesDictStrategy
 
     def decodekey_str(self, key):
-        return decode_utf8(self.space, key, allow_surrogates=True)
+        return str_decode_utf8(self.space, key, allow_surrogates=True)
 
     def setitem_str(self, w_dict, key, w_value):
         assert key is not None
diff --git a/pypy/objspace/std/mapdict.py b/pypy/objspace/std/mapdict.py
--- a/pypy/objspace/std/mapdict.py
+++ b/pypy/objspace/std/mapdict.py
@@ -4,7 +4,7 @@
 from rpython.rlib.rarithmetic import intmask, r_uint
 
 from pypy.interpreter.baseobjspace import W_Root
-from pypy.interpreter.unicodehelper import decode_utf8
+from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.objspace.std.dictmultiobject import (
     W_DictMultiObject, DictStrategy, ObjectDictStrategy, BaseKeyIterator,
     BaseValueIterator, BaseItemIterator, _never_equal_to_string,
@@ -433,7 +433,7 @@
     def materialize_str_dict(self, space, obj, str_dict):
         new_obj = self.back.materialize_str_dict(space, obj, str_dict)
         if self.index == DICT:
-            uni_name = decode_utf8(space, self.name)
+            uni_name = str_decode_utf8(space, self.name)
             str_dict[uni_name] = obj._mapdict_read_storage(self.storageindex)
         else:
             self._copy_attr(obj, new_obj)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -4,7 +4,7 @@
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.function import Function, Method, FunctionWithFixedCode
 from pypy.interpreter.typedef import get_unique_interplevel_subclass
-from pypy.interpreter.unicodehelper import decode_utf8
+from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.objspace.std import frame, transparent, callmethod
 from pypy.objspace.descroperation import (
     DescrOperation, get_attribute_name, raiseattrerror)
@@ -165,7 +165,7 @@
                 unicode_x = x.decode('ascii')
             except UnicodeDecodeError:
                 return self._wrap_string_old(x)
-            return self.newunicode(unicode_x)
+            return self.newtext(unicode_x)
         if isinstance(x, unicode):
             x = x.encode('utf8')
             lgt = rutf8.check_utf8(x, True)
@@ -192,7 +192,7 @@
             else:
                 lst.append(unichr(ch))
         unicode_x = u''.join(lst)
-        return self.newunicode(unicode_x)
+        return self.newtext(unicode_x)
 
     @not_rpython # only for tests
     def _wrap_not_rpython(self, x):
@@ -334,7 +334,7 @@
 
     def newlist_text(self, list_t):
         return self.newlist_unicode([
-            decode_utf8(self, s, allow_surrogates=True) for s in list_t])
+            str_decode_utf8(self, s, allow_surrogates=True) for s in list_t])
 
     def newlist_utf8(self, list_u, is_ascii):
         if is_ascii:
@@ -388,7 +388,7 @@
         return W_BytearrayObject(l)
 
     def newtext(self, s):
-        return self.newunicode(decode_utf8(self, s, allow_surrogates=True))
+        return self.newtext(str_decode_utf8(self, s, allow_surrogates=True))
 
     def newtext_or_none(self, s):
         if s is None:
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,8 +6,9 @@
 from rpython.rlib.rarithmetic import ovfcheck
 from rpython.rlib.rstring import (
     StringBuilder, split, rsplit, UnicodeBuilder, replace_count, startswith,
-    unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
-    unicode_encode_utf8_forbid_surrogates, SurrogateError, endswith)
+    endswith)
+from rpython.rlib.runicode import (
+    unicode_encode_utf8_forbid_surrogates, SurrogateError)
 from rpython.rlib import rutf8, jit
 
 from pypy.interpreter import unicodehelper
@@ -1851,4 +1852,4 @@
     return unicode_encode_utf8_forbid_surrogates(value, len(value))
 
 _repr_function = rutf8.make_utf8_escape_function(
-    pass_printable=True, unicode_output=True, quotes=True, prefix='')
+    pass_printable=True, quotes=True, prefix='')
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-py3: fix imports. Tests start to run. str_decode_utf8 replaces decode_utf8 but args have changed

Reply via email to