Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r94938:d7b217949b58
Date: 2018-08-03 20:18 -0700
http://bitbucket.org/pypy/pypy/changeset/d7b217949b58/

Log:    fixes for translation, surrogates

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1721,9 +1721,9 @@
         return w_obj.convert_to_w_unicode(self)
 
     def realunicode_w(self, w_obj):
-        from rpython.rlib.runicode import str_decode_utf_8
+        from pypy.interpreter.unicodehelper import decode_utf8sp
         utf8 = self.utf8_w(w_obj)
-        return str_decode_utf_8(utf8, len(utf8), 'strict', True)[0]
+        return decode_utf8sp(self, utf8)[0]
 
     def utf8_0_w(self, w_obj):
         "Like utf8_w, but rejects strings with NUL bytes."
@@ -1763,7 +1763,8 @@
     def realutf8_w(self, w_obj):
         # Like utf8_w(), but only works if w_obj is really of type
         # 'unicode'.  On Python 3 this is the same as utf8_w().
-        if not self.isinstance_w(w_obj, self.w_unicode):
+        from pypy.objspace.std.unicodeobject import W_UnicodeObject
+        if not isinstance(w_obj, W_UnicodeObject):
             raise oefmt(self.w_TypeError, "argument must be a unicode")
         return self.utf8_w(w_obj)
 
@@ -1784,7 +1785,7 @@
     def fsdecode_w(self, w_obj):
         if self.isinstance_w(w_obj, self.w_bytes):
             w_obj = self.fsdecode(w_obj)
-        return self.unicode0_w(w_obj)
+        return self.utf8_w(w_obj)
 
     def bool_w(self, w_obj):
         # Unwraps a bool, also accepting an int for compatibility.
diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -10,7 +10,6 @@
 from rpython.rlib.objectmodel import dont_inline, not_rpython
 from rpython.rlib import rstack, rstackovf
 from rpython.rlib import rwin32
-from rpython.rlib import runicode
 
 from pypy.interpreter import debug
 
@@ -518,7 +517,10 @@
                     elif fmt == 'N':
                         result = value.getname(space)
                     elif fmt == '8':
-                        result = _decode_utf8(value)
+                        try:
+                            result = value.decode('utf8')
+                        except UnicodeDecodeError:
+                            result = value.decode('unicode-escape')
                     else:
                         if isinstance(value, unicode):
                             result = value
diff --git a/pypy/interpreter/function.py b/pypy/interpreter/function.py
--- a/pypy/interpreter/function.py
+++ b/pypy/interpreter/function.py
@@ -45,7 +45,8 @@
                  closure=None, w_ann=None, forcename=None, qualname=None):
         self.space = space
         self.name = forcename or code.co_name
-        self.qualname = qualname or self.name.decode('utf-8')
+        self.qualname = qualname or self.name
+        assert isinstance(self.qualname, str)
         self.w_doc = None   # lazily read from code.getdocstring()
         self.code = code       # Code instance
         self.w_func_globals = w_globals  # the globals dictionary
@@ -434,7 +435,7 @@
 
     def fset_func_qualname(self, space, w_name):
         try:
-            self.qualname = space.utf8_w(w_name)
+            self.qualname = space.realutf8_w(w_name)
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 raise oefmt(space.w_TypeError,
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -30,6 +30,7 @@
     return raise_unicode_exception_decode
 
 def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+    assert startingpos >= 0
     ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
     return ''.join(ux), endingpos, endingpos
 
@@ -116,7 +117,7 @@
         # instead
         from pypy.module._codecs.locale import (
             unicode_encode_locale_surrogateescape)
-        uni = space.realunicode_w(w_uni)
+        uni = space.realunicode_w(w_uni).decode('utf8')
         if u'\x00' in uni:
             raise oefmt(space.w_ValueError, "embedded null character")
         bytes = unicode_encode_locale_surrogateescape(
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1164,7 +1164,8 @@
     unerase = staticmethod(unerase)
 
     def wrap(self, unwrapped):
-        return self.space.newutf8(unwrapped, len(unwrapped))
+        return self.space.newutf8(unwrapped,
+                                  rutf8.codepoints_in_utf8(unwrapped))
 
     def unwrap(self, wrapped):
         return self.space.utf8_w(wrapped)
@@ -1209,7 +1210,7 @@
     ##     return self.space.newlist_bytes(self.listview_bytes(w_dict))
 
     def wrapkey(space, key):
-        return space.newutf8(key, len(key))
+        return space.newutf8(key, rutf8.codepoints_in_utf8(key))
 
     @jit.look_inside_iff(lambda self, w_dict:
                          w_dict_unrolling_heuristic(w_dict))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to