Author: fijal
Branch: unicode-utf8
Changeset: r93091:4668380f4c79
Date: 2017-11-20 13:56 +0100
http://bitbucket.org/pypy/pypy/changeset/4668380f4c79/

Log:    * Improve ascii/utf8 codecs and unicode escape
        * Raise instead of looping infinitely when errorhandler returns nonsense

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -33,25 +33,33 @@
     assert lst == [("??", "ascii", input, 0, 2),
                    ("??", "ascii", input, 5, 7)]
 
+@given(strategies.text())
+def test_utf8_encode_ascii_2(u):
+    def eh(errors, encoding, reason, p, start, end):
+        return "?" * (end - start), end
+
+    assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace")
+
 def test_str_decode_ascii():
-    assert str_decode_ascii("abc", 3, "??", True, "??") == ("abc", 3, 3)
+    assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3, rutf8.FLAG_ASCII)
     def eh(errors, encoding, reason, p, start, end):
         lst.append((errors, encoding, p, start, end))
-        return u"\u1234\u5678", end
+        return u"\u1234\u5678".encode("utf8"), end
     lst = []
     input = "\xe8"
     exp = u"\u1234\u5678".encode("utf8")
-    assert str_decode_ascii(input, 1, "??", True, eh) == (exp, 1, 2)
+    assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2, rutf8.FLAG_REGULAR)
     assert lst == [("??", "ascii", input, 0, 1)]
     lst = []
     input = "\xe8\xe9abc\xea\xeb"
-    assert str_decode_ascii(input, 7, "??", True, eh) == (
-        exp + exp + "abc" + exp + exp, 7, 11)
+    assert str_decode_ascii(input, "??", True, eh) == (
+        exp + exp + "abc" + exp + exp, 7, 11, rutf8.FLAG_REGULAR)
     assert lst == [("??", "ascii", input, 0, 1),
                    ("??", "ascii", input, 1, 2),
                    ("??", "ascii", input, 5, 6),
                    ("??", "ascii", input, 6, 7)]
 
-@given(strategies.binary())
-def test_unicode_raw_escape(s):
-    uh.utf8_encode_raw_unicode_escape(s, 'strict')
+@given(strategies.text())
+def test_unicode_raw_escape(u):
+    r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict')
+    assert r == u.encode("raw-unicode-escape")
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -158,6 +158,7 @@
                 res.append(chr(oc))
                 i += 1
             else:
+                XXX
                 r, pos = errorhandler(errors, 'latin1',
                                       'ordinal not in range(256)', s, cur,
                                       cur + 1)
@@ -179,10 +180,15 @@
     pos = 0
     while i < len(utf8):
         ch = rutf8.codepoint_at_pos(utf8, i)
-        if ch >= 0x7F:
+        if ch > 0x7F:
+            endpos = pos + 1
+            end_i = rutf8.next_codepoint_pos(utf8, i)
+            while end_i < len(utf8) and rutf8.codepoint_at_pos(utf8, end_i) > 0x7F:
+                endpos += 1
+                end_i = rutf8.next_codepoint_pos(utf8, end_i)
             msg = "ordinal not in range(128)"
             r, newpos = errorhandler(errors, 'ascii', msg, utf8,
-                pos, pos + 1)
+                pos, endpos)
             for _ in range(newpos - pos):
                 i = rutf8.next_codepoint_pos(utf8, i)
             pos = newpos
@@ -603,13 +609,13 @@
     result = StringBuilder(size)
     pos = 0
     while pos < size:
-        oc = ord(s[pos])
+        oc = rutf8.codepoint_at_pos(s, pos)
 
         if oc < 0x100:
             result.append(chr(oc))
         else:
             raw_unicode_escape_helper(result, oc)
-        pos += 1
+        pos = rutf8.next_codepoint_pos(s, pos)
 
     return result.build()
 
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -71,6 +71,9 @@
                 raise oefmt(space.w_IndexError,
                             "position %d from error handler out of bounds",
                             newpos)
+            if newpos < startpos:
+                raise oefmt(space.w_IndexError,
+                            "position %d from error handler did not progress", newpos)
             w_replace = space.convert_to_w_unicode(w_replace)
             return w_replace._utf8, newpos
         return call_errorhandler
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit