Author: Amaury Forgeot d'Arc <[email protected]>
Branch: py3.5
Changeset: r88151:0e35b7ce07d1
Date: 2016-11-06 16:38 +0100
http://bitbucket.org/pypy/pypy/changeset/0e35b7ce07d1/
Log: Extend the 'surrogatepass' error handler to the utf-16 and utf-32
encodings (utf-8 was already supported). Any other encoding fails by
re-raising the original exception.
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,6 +1,7 @@
+import sys
from rpython.rlib import jit
from rpython.rlib.objectmodel import we_are_translated
-from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib.rstring import UnicodeBuilder, StringBuilder
from rpython.rlib.runicode import (
code_to_unichr, MAXUNICODE,
raw_unicode_escape_helper_unicode)
@@ -322,14 +323,55 @@
raise oefmt(space.w_TypeError,
"don't know how to handle %T in error callback", w_exc)
+
+(ENC_UNKNOWN, ENC_UTF8,
+ ENC_UTF16BE, ENC_UTF16LE,
+ ENC_UTF32BE, ENC_UTF32LE) = range(-1, 5)
+BIG_ENDIAN = sys.byteorder == 'big'
+
+STANDARD_ENCODINGS = {
+ 'utf8': (3, ENC_UTF8),
+ 'utf_8': (3, ENC_UTF8),
+ 'cp_utf8': (3, ENC_UTF8),
+ 'utf16': (2, ENC_UTF16BE) if BIG_ENDIAN else (2, ENC_UTF16LE),
+ 'utf_16': (2, ENC_UTF16BE) if BIG_ENDIAN else (2, ENC_UTF16LE),
+ 'utf16be': (2, ENC_UTF16BE),
+ 'utf_16be': (2, ENC_UTF16BE),
+ 'utf16_be': (2, ENC_UTF16BE),
+ 'utf_16_be': (2, ENC_UTF16BE),
+ 'utf16le': (2, ENC_UTF16LE),
+ 'utf_16le': (2, ENC_UTF16LE),
+ 'utf16_le': (2, ENC_UTF16LE),
+ 'utf_16_le': (2, ENC_UTF16LE),
+ 'utf32': (4, ENC_UTF32BE) if BIG_ENDIAN else (4, ENC_UTF32LE),
+ 'utf_32': (4, ENC_UTF32BE) if BIG_ENDIAN else (4, ENC_UTF32LE),
+ 'utf32be': (4, ENC_UTF32BE),
+ 'utf_32be': (4, ENC_UTF32BE),
+ 'utf32_be': (4, ENC_UTF32BE),
+ 'utf_32_be': (4, ENC_UTF32BE),
+ 'utf32le': (4, ENC_UTF32LE),
+ 'utf_32le': (4, ENC_UTF32LE),
+ 'utf32_le': (4, ENC_UTF32LE),
+ 'utf_32_le': (4, ENC_UTF32LE),
+}
+
+def get_standard_encoding(encoding):
+ encoding = encoding.lower().replace('-', '_')
+ return STANDARD_ENCODINGS.get(encoding, (0, ENC_UNKNOWN))
+
def surrogatepass_errors(space, w_exc):
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
start = space.int_w(space.getattr(w_exc, space.wrap('start')))
w_end = space.getattr(w_exc, space.wrap('end'))
+ encoding = space.str_w(space.getattr(w_exc, space.wrap('encoding')))
+ bytelength, code = get_standard_encoding(encoding)
+ if code == ENC_UNKNOWN:
+ # Not supported, fail with original exception
+ raise OperationError(space.type(w_exc), w_exc)
end = space.int_w(w_end)
- res = ''
+ builder = StringBuilder()
pos = start
while pos < end:
ch = ord(obj[pos])
@@ -337,31 +379,61 @@
if ch < 0xd800 or ch > 0xdfff:
# Not a surrogate, fail with original exception
raise OperationError(space.type(w_exc), w_exc)
- res += chr(0xe0 | (ch >> 12))
- res += chr(0x80 | ((ch >> 6) & 0x3f))
- res += chr(0x80 | (ch & 0x3f))
- return space.newtuple([space.newbytes(res), w_end])
+ if code == ENC_UTF8:
+ builder.append(chr(0xe0 | (ch >> 12)))
+ builder.append(chr(0x80 | ((ch >> 6) & 0x3f)))
+ builder.append(chr(0x80 | (ch & 0x3f)))
+ elif code == ENC_UTF16LE:
+ builder.append(chr(ch & 0xff))
+ builder.append(chr(ch >> 8))
+ elif code == ENC_UTF16BE:
+ builder.append(chr(ch >> 8))
+ builder.append(chr(ch & 0xff))
+ elif code == ENC_UTF32LE:
+ builder.append(chr(ch & 0xff))
+ builder.append(chr(ch >> 8))
+ builder.append(chr(0))
+ builder.append(chr(0))
+ elif code == ENC_UTF32BE:
+ builder.append(chr(0))
+ builder.append(chr(0))
+ builder.append(chr(ch >> 8))
+ builder.append(chr(ch & 0xff))
+ return space.newtuple([space.newbytes(builder.build()), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
start = space.int_w(space.getattr(w_exc, space.wrap('start')))
obj = space.bytes_w(space.getattr(w_exc, space.wrap('object')))
+ encoding = space.str_w(space.getattr(w_exc, space.wrap('encoding')))
+ bytelength, code = get_standard_encoding(encoding)
ch = 0
# Try decoding a single surrogate character. If there are more,
# let the codec call us again
- ch0 = ord(obj[start + 0])
- ch1 = ord(obj[start + 1])
+ ch0 = ord(obj[start + 0]) if len(obj) > start + 0 else -1
+ ch1 = ord(obj[start + 1]) if len(obj) > start + 1 else -1
ch2 = ord(obj[start + 2]) if len(obj) > start + 2 else -1
- if (ch2 != -1 and
- ch0 & 0xf0 == 0xe0 and
- ch1 & 0xc0 == 0x80 and
- ch2 & 0xc0 == 0x80):
- # it's a three-byte code
- ch = ((ch0 & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f)
- if ch < 0xd800 or ch > 0xdfff:
- # it's not a surrogate - fail
- ch = 0
+ ch3 = ord(obj[start + 3]) if len(obj) > start + 3 else -1
+ if code == ENC_UTF8:
+ if (ch1 != -1 and ch2 != -1 and
+ ch0 & 0xf0 == 0xe0 and
+ ch1 & 0xc0 == 0x80 and
+ ch2 & 0xc0 == 0x80):
+ # it's a three-byte code
+ ch = ((ch0 & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f)
+ elif code == ENC_UTF16LE:
+ ch = (ch1 << 8) | ch0
+ elif code == ENC_UTF16BE:
+ ch = (ch0 << 8) | ch1
+ elif code == ENC_UTF32LE:
+ ch = (ch3 << 24) | (ch2 << 16) | (ch1 << 8) | ch0
+ elif code == ENC_UTF32BE:
+ ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3
+ if ch < 0xd800 or ch > 0xdfff:
+ # it's not a surrogate - fail
+ ch = 0
if ch == 0:
raise OperationError(space.type(w_exc), w_exc)
- return space.newtuple([space.wrap(unichr(ch)), space.wrap(start + 3)])
+ return space.newtuple([space.wrap(unichr(ch)),
+ space.wrap(start + bytelength)])
else:
raise oefmt(space.w_TypeError,
"don't know how to handle %T in error callback", w_exc)
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -624,6 +624,51 @@
raises(UnicodeDecodeError, b"abc\xed\xa0z".decode, "utf-8",
"surrogatepass")
+ def test_badandgoodsurrogatepassexceptions(self):
+ import codecs
+ surrogatepass_errors = codecs.lookup_error('surrogatepass')
+ # "surrogatepass" complains about a non-exception passed in
+ raises(TypeError, surrogatepass_errors, 42)
+ # "surrogatepass" complains about the wrong exception types
+ raises(TypeError, surrogatepass_errors, UnicodeError("ouch"))
+ # "surrogatepass" can not be used for translating
+ raises(TypeError, surrogatepass_errors,
+ UnicodeTranslateError("\ud800", 0, 1, "ouch"))
+ # Use the correct exception
+ for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
+ raises(UnicodeEncodeError, surrogatepass_errors,
+ UnicodeEncodeError(enc, "a", 0, 1, "ouch"))
+ raises(UnicodeDecodeError, surrogatepass_errors,
+ UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch"))
+ for s in ("\ud800", "\udfff", "\ud800\udfff"):
+ raises(UnicodeEncodeError, surrogatepass_errors,
+ UnicodeEncodeError("ascii", s, 0, len(s), "ouch"))
+ tests = [
+ ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
+ ("utf-16le", "\ud800", b'\x00\xd8', 2),
+ ("utf-16be", "\ud800", b'\xd8\x00', 2),
+ ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
+ ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
+ ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
+ ("utf-16le", "\udfff", b'\xff\xdf', 2),
+ ("utf-16be", "\udfff", b'\xdf\xff', 2),
+ ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
+ ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
+ ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
+ ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
+ ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
+ ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00',
4),
+ ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff',
4),
+ ]
+ for enc, s, b, n in tests:
+ assert surrogatepass_errors(
+ UnicodeEncodeError(enc, "a" + s + "b", 1, 1 + len(s), "ouch")
+ ) == (b, 1 + len(s))
+ assert surrogatepass_errors(
+ UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
+ 1, 1 + n, "ouch")
+ ) == (s[:1], 1 + n)
+
def test_badhandler(self):
import codecs
results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None),
("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit