Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r94846:9cf4fc74394c
Date: 2018-07-11 06:49 -0700
http://bitbucket.org/pypy/pypy/changeset/9cf4fc74394c/
Log: surrogate and illegal unicode handling
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -29,11 +29,17 @@
space.newtext(msg)]))
return raise_unicode_exception_decode
+def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+ ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
+ return ''.join(ux), endingpos
+
@specialize.memo()
def encode_error_handler(space):
# Fast version of the "strict" errors handler.
def raise_unicode_exception_encode(errors, encoding, msg, utf8,
startingpos, endingpos):
+ if isinstance(utf8, unicode):
+ utf8 = utf8.encode('utf8')
u_len = rutf8.get_utf8_length(utf8)
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([space.newtext(encoding),
@@ -993,7 +999,7 @@
# Surrogate-preserving utf-8 decoding. Assuming there is no
# encoding error, it should always be reversible, and the reverse is
# encode_utf8sp().
- return str_decode_utf8(string, "string", True, decode_error_handler(space),
+ return str_decode_utf8(string, "string", True, decode_never_raise,
allow_surrogates=True)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1187,22 +1187,26 @@
def encode_object(space, w_object, encoding, errors):
+ utf8 = space.utf8_w(w_object)
+ idx = rutf8.surrogate_in_utf8(utf8)
+ if idx >= 0:
+ eh = unicodehelper.encode_error_handler(space)
+ eh(None, "utf8", "surrogates not allowed", utf8,
+ idx, idx + 1)
if errors is None or errors == 'strict':
if encoding is None or encoding == 'utf-8':
- utf8 = space.utf8_w(w_object)
- if rutf8.has_surrogates(utf8):
- utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+ #if rutf8.has_surrogates(utf8):
+ # utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
return space.newbytes(utf8)
elif encoding == 'ascii':
- s = space.utf8_w(w_object)
try:
- rutf8.check_ascii(s)
+ rutf8.check_ascii(utf8)
except rutf8.CheckError as a:
eh = unicodehelper.encode_error_handler(space)
- eh(None, "ascii", "ordinal not in range(128)", s,
+ eh(None, "ascii", "ordinal not in range(128)", utf8,
a.pos, a.pos + 1)
assert False, "always raises"
- return space.newbytes(s)
+ return space.newbytes(utf8)
from pypy.module._codecs.interp_codecs import encode_text
if encoding is None:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit