Author: Ronan Lamy <[email protected]>
Branch: unicode-utf8
Changeset: r93341:3e5aa507f585
Date: 2017-12-10 04:47 +0000
http://bitbucket.org/pypy/pypy/changeset/3e5aa507f585/
Log: Fix errorhandler use in utf8_encode_charmap()
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1425,8 +1425,7 @@
lgt = rutf8.check_utf8(r, True)
return r, pos, lgt
-def utf8_encode_charmap(s, errors, errorhandler=None,
- mapping=None):
+def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
size = len(s)
if mapping is None:
return utf8_encode_latin_1(s, errors, errorhandler=errorhandler)
@@ -1438,31 +1437,31 @@
index = 0
while pos < size:
ch = rutf8.codepoint_at_pos(s, pos)
-
c = mapping.get(ch, '')
if len(c) == 0:
- # collect all unencodable chars. Important for narrow builds.
- collend = rutf8.next_codepoint_pos(s, pos)
- endindex = index + 1
- while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '':
- collend = rutf8.next_codepoint_pos(s, collend)
- endindex += 1
- rs, endindex = errorhandler(errors, "charmap",
+ # collect all unencodable chars.
+ startindex = index
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ while (pos < size and
+ mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''):
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ res_8, newindex = errorhandler(errors, "charmap",
"character maps to <undefined>",
- s, index, endindex)
- j = 0
- for _ in range(endindex - index):
- ch2 = rutf8.codepoint_at_pos(rs, j)
- ch2 = mapping.get(ch2, '')
+ s, startindex, index)
+ for cp2 in rutf8.Utf8StringIterator(res_8):
+ ch2 = mapping.get(cp2, '')
if not ch2:
errorhandler(
- "strict", "charmap",
- "character maps to <undefined>",
- s, index, index + 1)
+ "strict", "charmap", "character maps to <undefined>",
+ s, startindex, index)
result.append(ch2)
- index += 1
- j = rutf8.next_codepoint_pos(rs, j)
- pos = rutf8.next_codepoint_pos(s, pos)
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = 0
+ for _ in range(newindex):
+ pos = rutf8.next_codepoint_pos(s, pos)
continue
result.append(c)
index += 1
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -537,8 +537,12 @@
assert '\xff'.decode('utf-7', 'ignore') == ''
assert '\x00'.decode('unicode-internal', 'ignore') == ''
- def test_backslahreplace(self):
- assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000'
+ def test_backslashreplace(self):
+ sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
+ expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
+ assert sin.encode('ascii', 'backslashreplace') == expected
+ expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff"
+ assert sin.encode("iso-8859-15", "backslashreplace") == expected
def test_badhandler(self):
import codecs
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit