Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: unicode-utf8
Changeset: r93341:3e5aa507f585
Date: 2017-12-10 04:47 +0000
http://bitbucket.org/pypy/pypy/changeset/3e5aa507f585/
Log: Fix errorhandler use in utf8_encode_charmap() diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1425,8 +1425,7 @@ lgt = rutf8.check_utf8(r, True) return r, pos, lgt -def utf8_encode_charmap(s, errors, errorhandler=None, - mapping=None): +def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None): size = len(s) if mapping is None: return utf8_encode_latin_1(s, errors, errorhandler=errorhandler) @@ -1438,31 +1437,31 @@ index = 0 while pos < size: ch = rutf8.codepoint_at_pos(s, pos) - c = mapping.get(ch, '') if len(c) == 0: - # collect all unencodable chars. Important for narrow builds. - collend = rutf8.next_codepoint_pos(s, pos) - endindex = index + 1 - while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '': - collend = rutf8.next_codepoint_pos(s, collend) - endindex += 1 - rs, endindex = errorhandler(errors, "charmap", + # collect all unencodable chars. 
+ startindex = index + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + while (pos < size and + mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''): + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + res_8, newindex = errorhandler(errors, "charmap", "character maps to <undefined>", - s, index, endindex) - j = 0 - for _ in range(endindex - index): - ch2 = rutf8.codepoint_at_pos(rs, j) - ch2 = mapping.get(ch2, '') + s, startindex, index) + for cp2 in rutf8.Utf8StringIterator(res_8): + ch2 = mapping.get(cp2, '') if not ch2: errorhandler( - "strict", "charmap", - "character maps to <undefined>", - s, index, index + 1) + "strict", "charmap", "character maps to <undefined>", + s, startindex, index) result.append(ch2) - index += 1 - j = rutf8.next_codepoint_pos(rs, j) - pos = rutf8.next_codepoint_pos(s, pos) + if index != newindex: # Should be uncommon + index = newindex + pos = 0 + for _ in range(newindex): + pos = rutf8.next_codepoint_pos(s, pos) continue result.append(c) index += 1 diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -537,8 +537,12 @@ assert '\xff'.decode('utf-7', 'ignore') == '' assert '\x00'.decode('unicode-internal', 'ignore') == '' - def test_backslahreplace(self): - assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000' + def test_backslashreplace(self): + sin = u"a\xac\u1234\u20ac\u8000\U0010ffff" + expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff" + assert sin.encode('ascii', 'backslashreplace') == expected + expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff" + assert sin.encode("iso-8859-15", "backslashreplace") == expected def test_badhandler(self): import codecs _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit