Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: unicode-utf8
Changeset: r93342:a4d68881a89d
Date: 2017-12-10 05:16 +0000
http://bitbucket.org/pypy/pypy/changeset/a4d68881a89d/

Log: Handle errorhandlers that go backward

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1101,19 +1101,16 @@
             ru, newindex = errorhandler(errors, public_encoding_name,
                                         'surrogates not allowed',
                                         s, pos-1, pos)
-            for j in range(newindex - index):
-                pos = rutf8.next_codepoint_pos(s, pos)
-            j = 0
-            while j < len(ru):
-                ch = rutf8.codepoint_at_pos(ru, j)
-                if ord(ch) < 0xD800:
-                    _STORECHAR(result, ord(ch), byteorder)
+            for cp in rutf8.Utf8StringIterator(res_8):
+                if cp < 0xD800:
+                    _STORECHAR(result, cp, byteorder)
                 else:
                     errorhandler('strict', public_encoding_name,
                                  'surrogates not allowed',
                                  s, pos-1, pos)
-                j = rutf8.next_codepoint_pos(ru, j)
-            index = newindex
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
 
         pos = rutf8.next_codepoint_pos(s, pos)
@@ -1282,22 +1279,19 @@
         ch = rutf8.codepoint_at_pos(s, pos)
         pos = rutf8.next_codepoint_pos(s, pos)
         if not allow_surrogates and 0xD800 <= ch < 0xE000:
-            ru, newindex = errorhandler(errors, public_encoding_name,
+            res_8, newindex = errorhandler(errors, public_encoding_name,
                                         'surrogates not allowed',
                                         s, pos-1, pos)
-            for j in range(newindex - index):
-                pos = rutf8.next_codepoint_pos(s, pos)
-            j = 0
-            while j < len(ru):
-                ch = rutf8.codepoint_at_pos(ru, j)
-                if ord(ch) < 0xD800:
-                    _STORECHAR32(result, ord(ch), byteorder)
+            for ch in rutf8.Utf8StringIterator(res_8):
+                if ch < 0xD800:
+                    _STORECHAR32(result, ch, byteorder)
                 else:
                     errorhandler('strict', public_encoding_name,
                                  'surrogates not allowed',
                                  s, pos-1, pos)
-                j = rutf8.next_codepoint_pos(ru, j)
-            index = newindex
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
         _STORECHAR32(result, ch, byteorder)
         index += 1
@@ -1459,9 +1453,7 @@
                     result.append(ch2)
                 if index != newindex:  # Should be uncommon
                     index = newindex
-                    pos = 0
-                    for _ in range(newindex):
-                        pos = rutf8.next_codepoint_pos(s, pos)
+                    pos = rutf8._pos_at_index(s, newindex)
                 continue
             result.append(c)
             index += 1
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -439,7 +439,7 @@
                 low = codepoint_at_pos(utf8, i)
                 if 0xDC00 <= low <= 0xDFFF:
                     uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
-                    i = next_codepoint_pos(utf8, i)
+                    i = next_codepoint_pos(utf8, i)
                 # else not really a surrogate pair, just append high
         else:
             i = next_codepoint_pos(utf8, i)
@@ -537,6 +537,13 @@
     else:
         return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos))
 
+def _pos_at_index(utf8, index):
+    # Slow!
+    pos = 0
+    for _ in range(index):
+        pos = next_codepoint_pos(utf8, pos)
+    return pos
+
 @jit.dont_look_inside
 def codepoint_at_index(utf8, storage, index):
     """ Return codepoint of a character inside utf8 encoded string, given
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit