Author: Matti Picus <matti.pi...@gmail.com> Branch: unicode-utf8-py3 Changeset: r95335:de06359bbf5c Date: 2018-11-16 12:28 -0800 http://bitbucket.org/pypy/pypy/changeset/de06359bbf5c/
Log: refactor builtin error handlers to use utf8 indices, add failing test diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -379,25 +379,23 @@ def surrogatepass_errors(space, w_exc): check_exception(space, w_exc) if space.isinstance_w(w_exc, space.w_UnicodeEncodeError): - utf8 = space.utf8_w(space.getattr(w_exc, space.newtext('object'))) + w_obj = space.getattr(w_exc, space.newtext('object')) start = space.int_w(space.getattr(w_exc, space.newtext('start'))) w_end = space.getattr(w_exc, space.newtext('end')) encoding = space.text_w(space.getattr(w_exc, space.newtext('encoding'))) - msg = space.text_w(space.getattr(w_exc, space.newtext('reason'))) bytelength, code = get_standard_encoding(encoding) if code == ENC_UNKNOWN: # Not supported, fail with original exception raise OperationError(space.type(w_exc), w_exc) end = space.int_w(w_end) builder = StringBuilder() + start = w_obj._index_to_byte(start) + end = w_obj._index_to_byte(end) + obj = w_obj._utf8 pos = start - # start, end are in codepoint indices - itr = rutf8.Utf8StringIterator(utf8) - for i in range(pos): - itr.next() while pos < end: - ch = itr.next() - pos += 1 + ch = rutf8.codepoint_at_pos(obj, pos) + pos = rutf8.next_codepoint_pos(obj, pos) if ch < 0xd800 or ch > 0xdfff: # Not a surrogate, fail with original exception raise OperationError(space.type(w_exc), w_exc) @@ -465,22 +463,22 @@ def surrogateescape_errors(space, w_exc): check_exception(space, w_exc) if space.isinstance_w(w_exc, space.w_UnicodeEncodeError): - utf8 = space.utf8_w(space.getattr(w_exc, space.newtext('object'))) + w_obj = space.getattr(w_exc, space.newtext('object')) start = space.int_w(space.getattr(w_exc, space.newtext('start'))) w_end = space.getattr(w_exc, space.newtext('end')) end = space.int_w(w_end) res = '' + start = w_obj._index_to_byte(start) + end = w_obj._index_to_byte(end) + obj = w_obj._utf8 pos 
= start - itr = rutf8.Utf8StringIterator(utf8) - for i in range(pos): - itr.next() while pos < end: - ch = itr.next() - pos += 1 - if ch < 0xdc80 or ch > 0xdcff: + code = rutf8.codepoint_at_pos(obj, pos) + if code < 0xdc80 or code > 0xdcff: # Not a UTF-8b surrogate, fail with original exception raise OperationError(space.type(w_exc), w_exc) - res += chr(ch - 0xdc00) + res += chr(code - 0xdc00) + pos = rutf8.next_codepoint_pos(obj, pos) return space.newtuple([space.newbytes(res), w_end]) elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError): consumed = 0 diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -625,6 +625,8 @@ assert '[\uDC80]'.encode('utf-8', 'namereplace') == b'[\\udc80]' def test_surrogateescape(self): + assert "\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", + "surrogateescape") == b"\xe4\xeb\xef\xf6\xfc" assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b' assert 'a\udc80b'.encode('utf-8', 'surrogateescape') == b'a\x80b' for enc in ('utf-8', 'ascii', 'latin-1', 'charmap'): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit