Author: Amaury Forgeot d'Arc <amaur...@gmail.com>
Branch: py3.5
Changeset: r88057:08fdd579df29
Date: 2016-11-02 09:31 +0100
http://bitbucket.org/pypy/pypy/changeset/08fdd579df29/
Log: Fix "namereplace" handler for unknown characters. Also convert other handlers to return a bytes string, python3 allows this now (no need to encode the replacement string again) diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -1,7 +1,8 @@ from rpython.rlib import jit from rpython.rlib.objectmodel import we_are_translated -from rpython.rlib.rstring import UnicodeBuilder, StringBuilder -from rpython.rlib.runicode import code_to_unichr, MAXUNICODE +from rpython.rlib.rstring import StringBuilder +from rpython.rlib.runicode import ( + code_to_unichr, MAXUNICODE, raw_unicode_escape_helper) from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault @@ -245,7 +246,7 @@ start = space.int_w(space.getattr(w_exc, space.wrap('start'))) w_end = space.getattr(w_exc, space.wrap('end')) end = space.int_w(w_end) - builder = UnicodeBuilder() + builder = StringBuilder() pos = start while pos < end: code = ord(obj[pos]) @@ -255,9 +256,9 @@ code |= ord(obj[pos+1]) & 0x03FF code += 0x10000 pos += 1 - builder.append(u"&#") - builder.append(unicode(str(code))) - builder.append(u";") + builder.append("&#") + builder.append(str(code)) + builder.append(";") pos += 1 return space.newtuple([space.wrap(builder.build()), w_end]) else: @@ -271,25 +272,11 @@ start = space.int_w(space.getattr(w_exc, space.wrap('start'))) w_end = space.getattr(w_exc, space.wrap('end')) end = space.int_w(w_end) - builder = UnicodeBuilder() + builder = StringBuilder() pos = start while pos < end: oc = ord(obj[pos]) - num = hex(oc) - if (oc >= 0x10000): - builder.append(u"\\U") - zeros = 8 - elif (oc >= 0x100): - builder.append(u"\\u") - zeros = 4 - else: - builder.append(u"\\x") - zeros = 2 - lnum = len(num) - nb = zeros + 2 - lnum # num starts with '0x' - if nb > 0: - builder.append_multiple_char(u'0', nb) - 
builder.append_slice(unicode(num), 2, lnum) + raw_unicode_escape_helper(builder, oc) pos += 1 return space.newtuple([space.wrap(builder.build()), w_end]) else: @@ -307,9 +294,14 @@ pos = start while pos < end: oc = ord(obj[pos]) - builder.append('\\N{') - builder.append(unicodedb.name(oc)) - builder.append('}') + try: + name = unicodedb.name(oc) + except KeyError: + raw_unicode_escape_helper(builder, oc) + else: + builder.append('\\N{') + builder.append(name) + builder.append('}') pos += 1 return space.newtuple([space.newbytes(builder.build()), w_end]) else: diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -568,6 +568,7 @@ assert 'a\xac\u1234\u20ac\u8000'.encode('ascii', 'namereplace') == ( b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' b'\\N{CJK UNIFIED IDEOGRAPH-8000}') + assert '[\uDC80]'.encode('utf-8', 'namereplace') == b'[\\udc80]' def test_surrogateescape(self): assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b' _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit