Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r94938:d7b217949b58
Date: 2018-08-03 20:18 -0700
http://bitbucket.org/pypy/pypy/changeset/d7b217949b58/
Log: fixes for translation, surrogates
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1721,9 +1721,9 @@
return w_obj.convert_to_w_unicode(self)
def realunicode_w(self, w_obj):
- from rpython.rlib.runicode import str_decode_utf_8
+ from pypy.interpreter.unicodehelper import decode_utf8sp
utf8 = self.utf8_w(w_obj)
- return str_decode_utf_8(utf8, len(utf8), 'strict', True)[0]
+ return decode_utf8sp(self, utf8)[0]
def utf8_0_w(self, w_obj):
"Like utf8_w, but rejects strings with NUL bytes."
@@ -1763,7 +1763,8 @@
def realutf8_w(self, w_obj):
# Like utf8_w(), but only works if w_obj is really of type
# 'unicode'. On Python 3 this is the same as utf8_w().
- if not self.isinstance_w(w_obj, self.w_unicode):
+ from pypy.objspace.std.unicodeobject import W_UnicodeObject
+ if not isinstance(w_obj, W_UnicodeObject):
raise oefmt(self.w_TypeError, "argument must be a unicode")
return self.utf8_w(w_obj)
@@ -1784,7 +1785,7 @@
def fsdecode_w(self, w_obj):
if self.isinstance_w(w_obj, self.w_bytes):
w_obj = self.fsdecode(w_obj)
- return self.unicode0_w(w_obj)
+ return self.utf8_w(w_obj)
def bool_w(self, w_obj):
# Unwraps a bool, also accepting an int for compatibility.
diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -10,7 +10,6 @@
from rpython.rlib.objectmodel import dont_inline, not_rpython
from rpython.rlib import rstack, rstackovf
from rpython.rlib import rwin32
-from rpython.rlib import runicode
from pypy.interpreter import debug
@@ -518,7 +517,10 @@
elif fmt == 'N':
result = value.getname(space)
elif fmt == '8':
- result = _decode_utf8(value)
+ try:
+ result = value.decode('utf8')
+ except UnicodeDecodeError:
+ result = value.decode('unicode-escape')
else:
if isinstance(value, unicode):
result = value
diff --git a/pypy/interpreter/function.py b/pypy/interpreter/function.py
--- a/pypy/interpreter/function.py
+++ b/pypy/interpreter/function.py
@@ -45,7 +45,8 @@
closure=None, w_ann=None, forcename=None, qualname=None):
self.space = space
self.name = forcename or code.co_name
- self.qualname = qualname or self.name.decode('utf-8')
+ self.qualname = qualname or self.name
+ assert isinstance(self.qualname, str)
self.w_doc = None # lazily read from code.getdocstring()
self.code = code # Code instance
self.w_func_globals = w_globals # the globals dictionary
@@ -434,7 +435,7 @@
def fset_func_qualname(self, space, w_name):
try:
- self.qualname = space.utf8_w(w_name)
+ self.qualname = space.realutf8_w(w_name)
except OperationError as e:
if e.match(space, space.w_TypeError):
raise oefmt(space.w_TypeError,
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -30,6 +30,7 @@
return raise_unicode_exception_decode
def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+ assert startingpos >= 0
ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
return ''.join(ux), endingpos, endingpos
@@ -116,7 +117,7 @@
# instead
from pypy.module._codecs.locale import (
unicode_encode_locale_surrogateescape)
- uni = space.realunicode_w(w_uni)
+ uni = space.realunicode_w(w_uni).decode('utf8')
if u'\x00' in uni:
raise oefmt(space.w_ValueError, "embedded null character")
bytes = unicode_encode_locale_surrogateescape(
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1164,7 +1164,8 @@
unerase = staticmethod(unerase)
def wrap(self, unwrapped):
- return self.space.newutf8(unwrapped, len(unwrapped))
+ return self.space.newutf8(unwrapped,
+ rutf8.codepoints_in_utf8(unwrapped))
def unwrap(self, wrapped):
return self.space.utf8_w(wrapped)
@@ -1209,7 +1210,7 @@
## return self.space.newlist_bytes(self.listview_bytes(w_dict))
def wrapkey(space, key):
- return space.newutf8(key, len(key))
+ return space.newutf8(key, rutf8.codepoints_in_utf8(key))
@jit.look_inside_iff(lambda self, w_dict:
w_dict_unrolling_heuristic(w_dict))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit