Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r94753:40650baa7fd6
Date: 2018-06-10 22:20 -0700
http://bitbucket.org/pypy/pypy/changeset/40650baa7fd6/
Log: Fix imports; tests start to run. str_decode_utf8 replaces
decode_utf8, but its arguments have changed.
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -247,9 +247,7 @@
def unicode_w(self, space):
self._typed_unwrap_error(space, "string")
-
- def text_w(self, space):
- self._typed_unwrap_error(space, "string")
+ realunicode_w = unicode_w
def utf8_w(self, space):
self._typed_unwrap_error(space, "unicode")
@@ -1732,7 +1730,6 @@
return rstring.assert_str0(result)
realtext_w = text_w # Python 2 compatibility
- realunicode_w = unicode_w
def fsencode(space, w_obj):
from pypy.interpreter.unicodehelper import fsencode
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -4,7 +4,7 @@
import struct
import sys
from pypy.interpreter.unicodehelper import (
- encode_utf8, decode_utf8, unicode_encode_utf_32_be, str_decode_utf_32_be)
+ encode_utf8, str_decode_utf8, utf8_encode_utf_32_be, str_decode_utf_32_be)
from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -304,11 +304,12 @@
errorhandler=errorhandler)
return res.encode('utf8'), size, len(res)
-def str_decode_utf8(s, errors, final, errorhandler):
+def str_decode_utf8(s, errors, final, errorhandler, allow_surrogates=False):
""" Same as checking for the valid utf8, but we know the utf8 is not
valid so we're trying to either raise or pack stuff with error handler.
The key difference is that this is call_may_force
"""
+ # XXX need to handle allow_surrogates
slen = len(s)
res = StringBuilder(slen)
pos = 0
@@ -967,6 +968,32 @@
return result.build()
+def encode_utf8(space, uni, allow_surrogates=False):
+ # Note that Python3 tends to forbid *all* surrogates in utf-8.
+ # If allow_surrogates=True, then revert to the Python 2 behavior
+ # which never raises UnicodeEncodeError. Surrogate pairs are then
+ # allowed, either paired or lone. A paired surrogate is considered
+ # like the non-BMP character it stands for. See also *_utf8sp().
+ assert isinstance(uni, unicode)
+ return runicode.unicode_encode_utf_8(
+ uni, len(uni), "strict",
+ errorhandler=encode_error_handler(space),
+ allow_surrogates=allow_surrogates)
+
+def encode_utf8sp(space, uni):
+ # Surrogate-preserving utf-8 encoding. Any surrogate character
+ # turns into its 3-bytes encoding, whether it is paired or not.
+ # This should always be reversible, and the reverse is
+ # decode_utf8sp().
+ return runicode.unicode_encode_utf8sp(uni, len(uni))
+
+def decode_utf8sp(space, string):
+ # Surrogate-preserving utf-8 decoding. Assuming there is no
+ # encoding error, it should always be reversible, and the reverse is
+ # encode_utf8sp().
+ return decode_utf8(space, string, allow_surrogates=True)
+
+
# ____________________________________________________________
# utf-16
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -86,7 +86,7 @@
newpos = -1
else:
if newpos < 0:
- newpos = length + newpos
+ newpos = length + newpos
if newpos < 0 or newpos > length:
raise oefmt(space.w_IndexError,
"position %d from error handler out of bounds",
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -12,7 +12,7 @@
from pypy.interpreter.mixedmodule import MixedModule
from pypy.interpreter.signature import Signature
from pypy.interpreter.typedef import TypeDef
-from pypy.interpreter.unicodehelper import decode_utf8
+from pypy.interpreter.unicodehelper import str_decode_utf8
from pypy.objspace.std.util import negate
@@ -1184,7 +1184,7 @@
# we should implement the same shortcuts as we do for BytesDictStrategy
def decodekey_str(self, key):
- return decode_utf8(self.space, key, allow_surrogates=True)
+ return str_decode_utf8(self.space, key, allow_surrogates=True)
def setitem_str(self, w_dict, key, w_value):
assert key is not None
diff --git a/pypy/objspace/std/mapdict.py b/pypy/objspace/std/mapdict.py
--- a/pypy/objspace/std/mapdict.py
+++ b/pypy/objspace/std/mapdict.py
@@ -4,7 +4,7 @@
from rpython.rlib.rarithmetic import intmask, r_uint
from pypy.interpreter.baseobjspace import W_Root
-from pypy.interpreter.unicodehelper import decode_utf8
+from pypy.interpreter.unicodehelper import str_decode_utf8
from pypy.objspace.std.dictmultiobject import (
W_DictMultiObject, DictStrategy, ObjectDictStrategy, BaseKeyIterator,
BaseValueIterator, BaseItemIterator, _never_equal_to_string,
@@ -433,7 +433,7 @@
def materialize_str_dict(self, space, obj, str_dict):
new_obj = self.back.materialize_str_dict(space, obj, str_dict)
if self.index == DICT:
- uni_name = decode_utf8(space, self.name)
+ uni_name = str_decode_utf8(space, self.name)
str_dict[uni_name] = obj._mapdict_read_storage(self.storageindex)
else:
self._copy_attr(obj, new_obj)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -4,7 +4,7 @@
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.function import Function, Method, FunctionWithFixedCode
from pypy.interpreter.typedef import get_unique_interplevel_subclass
-from pypy.interpreter.unicodehelper import decode_utf8
+from pypy.interpreter.unicodehelper import str_decode_utf8
from pypy.objspace.std import frame, transparent, callmethod
from pypy.objspace.descroperation import (
DescrOperation, get_attribute_name, raiseattrerror)
@@ -165,7 +165,7 @@
unicode_x = x.decode('ascii')
except UnicodeDecodeError:
return self._wrap_string_old(x)
- return self.newunicode(unicode_x)
+ return self.newtext(unicode_x)
if isinstance(x, unicode):
x = x.encode('utf8')
lgt = rutf8.check_utf8(x, True)
@@ -192,7 +192,7 @@
else:
lst.append(unichr(ch))
unicode_x = u''.join(lst)
- return self.newunicode(unicode_x)
+ return self.newtext(unicode_x)
@not_rpython # only for tests
def _wrap_not_rpython(self, x):
@@ -334,7 +334,7 @@
def newlist_text(self, list_t):
return self.newlist_unicode([
- decode_utf8(self, s, allow_surrogates=True) for s in list_t])
+ str_decode_utf8(self, s, allow_surrogates=True) for s in list_t])
def newlist_utf8(self, list_u, is_ascii):
if is_ascii:
@@ -388,7 +388,7 @@
return W_BytearrayObject(l)
def newtext(self, s):
- return self.newunicode(decode_utf8(self, s, allow_surrogates=True))
+ return self.newtext(str_decode_utf8(self, s, allow_surrogates=True))
def newtext_or_none(self, s):
if s is None:
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,8 +6,9 @@
from rpython.rlib.rarithmetic import ovfcheck
from rpython.rlib.rstring import (
StringBuilder, split, rsplit, UnicodeBuilder, replace_count, startswith,
- unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
- unicode_encode_utf8_forbid_surrogates, SurrogateError, endswith)
+ endswith)
+from rpython.rlib.runicode import (
+ unicode_encode_utf8_forbid_surrogates, SurrogateError)
from rpython.rlib import rutf8, jit
from pypy.interpreter import unicodehelper
@@ -1851,4 +1852,4 @@
return unicode_encode_utf8_forbid_surrogates(value, len(value))
_repr_function = rutf8.make_utf8_escape_function(
- pass_printable=True, unicode_output=True, quotes=True, prefix='')
+ pass_printable=True, quotes=True, prefix='')
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit