Author: Amaury Forgeot d'Arc <[email protected]>
Branch: py3.5
Changeset: r88057:08fdd579df29
Date: 2016-11-02 09:31 +0100
http://bitbucket.org/pypy/pypy/changeset/08fdd579df29/
Log: Fix the "namereplace" handler for unknown characters. Also convert the
other handlers to return a bytes string; Python 3 allows this now (no need
to encode the replacement string again).
diff --git a/pypy/module/_codecs/interp_codecs.py
b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,7 +1,8 @@
from rpython.rlib import jit
from rpython.rlib.objectmodel import we_are_translated
-from rpython.rlib.rstring import UnicodeBuilder, StringBuilder
-from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.runicode import (
+ code_to_unichr, MAXUNICODE, raw_unicode_escape_helper)
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -245,7 +246,7 @@
start = space.int_w(space.getattr(w_exc, space.wrap('start')))
w_end = space.getattr(w_exc, space.wrap('end'))
end = space.int_w(w_end)
- builder = UnicodeBuilder()
+ builder = StringBuilder()
pos = start
while pos < end:
code = ord(obj[pos])
@@ -255,9 +256,9 @@
code |= ord(obj[pos+1]) & 0x03FF
code += 0x10000
pos += 1
- builder.append(u"&#")
- builder.append(unicode(str(code)))
- builder.append(u";")
+ builder.append("&#")
+ builder.append(str(code))
+ builder.append(";")
pos += 1
return space.newtuple([space.wrap(builder.build()), w_end])
else:
@@ -271,25 +272,11 @@
start = space.int_w(space.getattr(w_exc, space.wrap('start')))
w_end = space.getattr(w_exc, space.wrap('end'))
end = space.int_w(w_end)
- builder = UnicodeBuilder()
+ builder = StringBuilder()
pos = start
while pos < end:
oc = ord(obj[pos])
- num = hex(oc)
- if (oc >= 0x10000):
- builder.append(u"\\U")
- zeros = 8
- elif (oc >= 0x100):
- builder.append(u"\\u")
- zeros = 4
- else:
- builder.append(u"\\x")
- zeros = 2
- lnum = len(num)
- nb = zeros + 2 - lnum # num starts with '0x'
- if nb > 0:
- builder.append_multiple_char(u'0', nb)
- builder.append_slice(unicode(num), 2, lnum)
+ raw_unicode_escape_helper(builder, oc)
pos += 1
return space.newtuple([space.wrap(builder.build()), w_end])
else:
@@ -307,9 +294,14 @@
pos = start
while pos < end:
oc = ord(obj[pos])
- builder.append('\\N{')
- builder.append(unicodedb.name(oc))
- builder.append('}')
+ try:
+ name = unicodedb.name(oc)
+ except KeyError:
+ raw_unicode_escape_helper(builder, oc)
+ else:
+ builder.append('\\N{')
+ builder.append(name)
+ builder.append('}')
pos += 1
return space.newtuple([space.newbytes(builder.build()), w_end])
else:
diff --git a/pypy/module/_codecs/test/test_codecs.py
b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -568,6 +568,7 @@
assert 'a\xac\u1234\u20ac\u8000'.encode('ascii', 'namereplace') == (
b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
b'\\N{CJK UNIFIED IDEOGRAPH-8000}')
+ assert '[\uDC80]'.encode('utf-8', 'namereplace') == b'[\\udc80]'
def test_surrogateescape(self):
assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b'
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit