Author: Matti Picus <matti.pi...@gmail.com> Branch: unicode-utf8-py3 Changeset: r95318:83e8a364e912 Date: 2018-11-15 01:11 -0800 http://bitbucket.org/pypy/pypy/changeset/83e8a364e912/
Log: pos in encoding error handler is in unicode not utf8. refactor, use only utf8 diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -241,8 +241,9 @@ pos = e.pos assert pos >= 0 start = s[:pos] + upos = rutf8.codepoints_in_utf8(s, end=pos) ru, lgt = errorhandler(errors, 'utf8', - 'surrogates not allowed', s, pos, pos + 1) + 'surrogates not allowed', s, upos, upos + 1) end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler, allow_surrogates=allow_surrogates) s = start + ru + end diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -379,10 +379,11 @@ def surrogatepass_errors(space, w_exc): check_exception(space, w_exc) if space.isinstance_w(w_exc, space.w_UnicodeEncodeError): - obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object'))) + utf8 = space.utf8_w(space.getattr(w_exc, space.newtext('object'))) start = space.int_w(space.getattr(w_exc, space.newtext('start'))) w_end = space.getattr(w_exc, space.newtext('end')) encoding = space.text_w(space.getattr(w_exc, space.newtext('encoding'))) + msg = space.text_w(space.getattr(w_exc, space.newtext('reason'))) bytelength, code = get_standard_encoding(encoding) if code == ENC_UNKNOWN: # Not supported, fail with original exception @@ -390,8 +391,12 @@ end = space.int_w(w_end) builder = StringBuilder() pos = start + # start, end are in codepoint indices + itr = rutf8.Utf8StringIterator(utf8) + for i in range(pos): + itr.next() while pos < end: - ch = ord(obj[pos]) + ch = itr.next() pos += 1 if ch < 0xd800 or ch > 0xdfff: # Not a surrogate, fail with original exception @@ -466,8 +471,11 @@ end = space.int_w(w_end) res = '' pos = start + itr = rutf8.Utf8StringIterator(utf8) + for i in range(pos): + itr.next() while pos < end: - ch = rutf8.codepoint_at_pos(utf8, pos) + ch = itr.next() pos += 1 if ch < 0xdc80 or ch > 0xdcff: # Not a UTF-8b surrogate, fail with original exception diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -648,6 +648,8 @@ assert u'\ud8ae'.encode('utf_16_be', 'surrogatepass') == b'\xd8\xae' assert (u'\U0000d8ae'.encode('utf-32-be', 'surrogatepass') == b'\x00\x00\xd8\xae') + assert (u'\x80\ud800'.encode('utf8', 'surrogatepass') == + b'\xc2\x80\xed\xa0\x80') def test_badandgoodsurrogatepassexceptions(self): import codecs diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -1209,28 +1209,26 @@ if errors is None: errors = 'strict' pos = rutf8.surrogate_in_utf8(utf8) - if pos >= 0: - handled_error = True - else: - handled_error = False state = space.fromcache(CodecState) eh = state.encode_error_handler - while pos >= 0: - start = utf8[:pos] - ru, _pos = eh(errors, "utf8", "surrogates not allowed", utf8, - pos, pos + 1) - upos = rutf8.next_codepoint_pos(utf8, _pos) - end = utf8[upos:] - utf8 = start + ru + end - _pos = rutf8.surrogate_in_utf8(utf8) - if _pos <= pos: - # surrogatepass? - break - pos = _pos - if errors == 'surrogateescape' and handled_error: - #escape - return space.newbytes(utf8) - w_object = space.newtext(utf8) + if pos >= 0: + while pos >= 0: + start = utf8[:pos] + upos = rutf8.codepoints_in_utf8(utf8, end=pos) + ru, _pos = eh(errors, "utf8", "surrogates not allowed", utf8, + upos, upos + 1) + upos = rutf8.next_codepoint_pos(utf8, _pos) + end = utf8[upos:] + utf8 = start + ru + end + _pos = rutf8.surrogate_in_utf8(utf8) + if _pos <= pos: + # surrogatepass? + break + pos = _pos + if errors == 'surrogateescape': + #escape + return space.newbytes(utf8) + w_object = space.newtext(utf8) if errors is None or errors == 'strict': if encoding is None or encoding == 'utf-8': #if rutf8.has_surrogates(utf8): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit