[pypy-commit] pypy py3.5: Fix "namereplace" handler for unknown characters.

amauryfa Wed, 02 Nov 2016 01:33:19 -0700

Author: Amaury Forgeot d'Arc <amaur...@gmail.com>
Branch: py3.5
Changeset: r88057:08fdd579df29
Date: 2016-11-02 09:31 +0100
http://bitbucket.org/pypy/pypy/changeset/08fdd579df29/


Log:    Fix "namereplace" handler for unknown characters. Also convert other
        handlers to return a bytes string; Python 3 allows this now (no need
        to encode the replacement string again).

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,7 +1,8 @@
 from rpython.rlib import jit
 from rpython.rlib.objectmodel import we_are_translated
-from rpython.rlib.rstring import UnicodeBuilder, StringBuilder
-from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.runicode import (
+    code_to_unichr, MAXUNICODE, raw_unicode_escape_helper)
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -245,7 +246,7 @@
         start = space.int_w(space.getattr(w_exc, space.wrap('start')))
         w_end = space.getattr(w_exc, space.wrap('end'))
         end = space.int_w(w_end)
-        builder = UnicodeBuilder()
+        builder = StringBuilder()
         pos = start
         while pos < end:
             code = ord(obj[pos])
@@ -255,9 +256,9 @@
                 code |= ord(obj[pos+1]) & 0x03FF
                 code += 0x10000
                 pos += 1
-            builder.append(u"&#")
-            builder.append(unicode(str(code)))
-            builder.append(u";")
+            builder.append("&#")
+            builder.append(str(code))
+            builder.append(";")
             pos += 1
         return space.newtuple([space.wrap(builder.build()), w_end])
     else:
@@ -271,25 +272,11 @@
         start = space.int_w(space.getattr(w_exc, space.wrap('start')))
         w_end = space.getattr(w_exc, space.wrap('end'))
         end = space.int_w(w_end)
-        builder = UnicodeBuilder()
+        builder = StringBuilder()
         pos = start
         while pos < end:
             oc = ord(obj[pos])
-            num = hex(oc)
-            if (oc >= 0x10000):
-                builder.append(u"\\U")
-                zeros = 8
-            elif (oc >= 0x100):
-                builder.append(u"\\u")
-                zeros = 4
-            else:
-                builder.append(u"\\x")
-                zeros = 2
-            lnum = len(num)
-            nb = zeros + 2 - lnum # num starts with '0x'
-            if nb > 0:
-                builder.append_multiple_char(u'0', nb)
-            builder.append_slice(unicode(num), 2, lnum)
+            raw_unicode_escape_helper(builder, oc)
             pos += 1
         return space.newtuple([space.wrap(builder.build()), w_end])
     else:
@@ -307,9 +294,14 @@
         pos = start
         while pos < end:
             oc = ord(obj[pos])
-            builder.append('\\N{')
-            builder.append(unicodedb.name(oc))
-            builder.append('}')
+            try:
+                name = unicodedb.name(oc)
+            except KeyError:
+                raw_unicode_escape_helper(builder, oc)
+            else:
+                builder.append('\\N{')
+                builder.append(name)
+                builder.append('}')
             pos += 1
         return space.newtuple([space.newbytes(builder.build()), w_end])
     else:
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -568,6 +568,7 @@
         assert 'a\xac\u1234\u20ac\u8000'.encode('ascii', 'namereplace') == (
             b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
             b'\\N{CJK UNIFIED IDEOGRAPH-8000}')
+        assert '[\uDC80]'.encode('utf-8', 'namereplace') == b'[\\udc80]'
 
     def test_surrogateescape(self):
         assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b'
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3.5: Fix "namereplace" handler for unknown characters.

Reply via email to