Author: Matti Picus <matti.pi...@gmail.com>
Branch: py3.6
Changeset: r96061:74fc16b2e4b5
Date: 2019-02-17 20:08 +0200
http://bitbucket.org/pypy/pypy/changeset/74fc16b2e4b5/
Log: make utf8_encode_utf_8 non-recursive, and pass surrogate pairs to error handler

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -21,7 +21,7 @@
                                              space.newtext(msg)]))
     return raise_unicode_exception_decode
 
-def _decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
     assert startingpos >= 0
     ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
     return ''.join(ux), endingpos, 'b'
@@ -218,20 +218,38 @@
     return res.build(), len(s), len(s)
 
 def utf8_encode_utf_8(s, errors, errorhandler, allow_surrogates=False):
-    try:
-        lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates)
-    except rutf8.CheckError as e:
-        # XXX change this to non-recursive
-        pos = e.pos
-        assert pos >= 0
-        start = s[:pos]
-        upos = rutf8.codepoints_in_utf8(s, end=pos)
-        ru, lgt, rettype = errorhandler(errors, 'utf8',
-                    'surrogates not allowed', s, upos, upos + 1)
-        end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler,
-                                allow_surrogates=allow_surrogates)
-        s = start + ru + end
-    return s
+    size = len(s)
+    if size == 0:
+        return ''
+    pos = 0
+    upos = 0
+    result = StringBuilder(size)
+    while pos < size:
+        try:
+            lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates, start=pos)
+            if pos == 0:
+                # fast path
+                return s
+            for ch in s[pos:]:
+                result.append(ch)
+            break
+        except rutf8.CheckError as e:
+            for ch in s[pos:e.pos]:
+                result.append(ch)
+            upos += rutf8.codepoints_in_utf8(s, start=pos, end=e.pos)
+            pos = e.pos
+            assert pos >= 0
+            res, newindex, rettype = errorhandler(errors, 'utf8',
+                        'surrogates not allowed', s, upos, upos + 1)
+            if rettype == 'u':
+                for cp in rutf8.Utf8StringIterator(res):
+                    result.append(chr(cp))
+            else:
+                for ch in res:
+                    result.append(ch)
+            upos = newindex
+            pos = rutf8._pos_at_index(s, upos)
+    return result.build()
 
 def utf8_encode_latin_1(s, errors, errorhandler, allow_surrogates=False):
     try:
@@ -1017,7 +1035,7 @@
     # Surrogate-preserving utf-8 decoding. Assuming there is no
     # encoding error, it should always be reversible, and the reverse is
     # unused encode_utf8sp().
-    return str_decode_utf8(string, "string", True, _decode_never_raise,
+    return str_decode_utf8(string, "string", True, decode_never_raise,
                            allow_surrogates=True)
 
 # ____________________________________________________________
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -1149,7 +1149,6 @@
             backslashreplace = ''.join('\\x%02x' % b for b in ill_surrogate)
             assert test_sequence.decode(encoding, "backslashreplace") == (before +
                                                              backslashreplace + after)
-
 
     def test_lone_surrogates_utf_8(self):
         """
@@ -1158,6 +1157,8 @@
         """
         e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8",
                    "surrogateescape").value
+        assert e.start == 1
+        assert e.end == 3
         assert e.object[e.start:e.end] == u'\ud800\udfff'
 
     def test_charmap_encode(self):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit