Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: unicode-utf8
Changeset: r93341:3e5aa507f585
Date: 2017-12-10 04:47 +0000
http://bitbucket.org/pypy/pypy/changeset/3e5aa507f585/

Log:    Fix errorhandler use in utf8_encode_charmap()

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1425,8 +1425,7 @@
     lgt = rutf8.check_utf8(r, True)
     return r, pos, lgt
 
-def utf8_encode_charmap(s, errors, errorhandler=None,
-                           mapping=None):
+def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
     size = len(s)
     if mapping is None:
         return utf8_encode_latin_1(s, errors, errorhandler=errorhandler)
@@ -1438,31 +1437,31 @@
     index = 0
     while pos < size:
         ch = rutf8.codepoint_at_pos(s, pos)
-
         c = mapping.get(ch, '')
         if len(c) == 0:
-            # collect all unencodable chars. Important for narrow builds.
-            collend = rutf8.next_codepoint_pos(s, pos)
-            endindex = index + 1
-            while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '':
-                collend = rutf8.next_codepoint_pos(s, collend)
-                endindex += 1
-            rs, endindex = errorhandler(errors, "charmap",
+            # collect all unencodable chars.
+            startindex = index
+            pos = rutf8.next_codepoint_pos(s, pos)
+            index += 1
+            while (pos < size and
+                   mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''):
+                pos = rutf8.next_codepoint_pos(s, pos)
+                index += 1
+            res_8, newindex = errorhandler(errors, "charmap",
                                    "character maps to <undefined>",
-                                   s, index, endindex)
-            j = 0
-            for _ in range(endindex - index):
-                ch2 = rutf8.codepoint_at_pos(rs, j)
-                ch2 = mapping.get(ch2, '')
+                                   s, startindex, index)
+            for cp2 in rutf8.Utf8StringIterator(res_8):
+                ch2 = mapping.get(cp2, '')
                 if not ch2:
                     errorhandler(
-                        "strict", "charmap",
-                        "character maps to <undefined>",
-                        s,  index, index + 1)
+                        "strict", "charmap", "character maps to <undefined>",
+                        s,  startindex, index)
                 result.append(ch2)
-                index += 1
-                j = rutf8.next_codepoint_pos(rs, j)
-                pos = rutf8.next_codepoint_pos(s, pos)
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = 0
+                for _ in range(newindex):
+                    pos = rutf8.next_codepoint_pos(s, pos)
             continue
         result.append(c)
         index += 1
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -537,8 +537,12 @@
         assert '\xff'.decode('utf-7', 'ignore') == ''
         assert '\x00'.decode('unicode-internal', 'ignore') == ''
 
-    def test_backslahreplace(self):
-        assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000'
+    def test_backslashreplace(self):
+        sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
+        expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
+        assert sin.encode('ascii', 'backslashreplace') == expected
+        expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff"
+        assert sin.encode("iso-8859-15", "backslashreplace") == expected
 
     def test_badhandler(self):
         import codecs
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to