Author: Matti Picus <[email protected]>
Branch: py3.6
Changeset: r96061:74fc16b2e4b5
Date: 2019-02-17 20:08 +0200
http://bitbucket.org/pypy/pypy/changeset/74fc16b2e4b5/
Log: make utf8_encode_utf_8 non-recursive, and pass surrogate pairs to
the error handler
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -21,7 +21,7 @@
space.newtext(msg)]))
return raise_unicode_exception_decode
-def _decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
assert startingpos >= 0
ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
return ''.join(ux), endingpos, 'b'
@@ -218,20 +218,38 @@
return res.build(), len(s), len(s)
def utf8_encode_utf_8(s, errors, errorhandler, allow_surrogates=False):
- try:
- lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates)
- except rutf8.CheckError as e:
- # XXX change this to non-recursive
- pos = e.pos
- assert pos >= 0
- start = s[:pos]
- upos = rutf8.codepoints_in_utf8(s, end=pos)
- ru, lgt, rettype = errorhandler(errors, 'utf8',
- 'surrogates not allowed', s, upos, upos + 1)
- end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler,
- allow_surrogates=allow_surrogates)
- s = start + ru + end
- return s
+ size = len(s)
+ if size == 0:
+ return ''
+ pos = 0
+ upos = 0
+ result = StringBuilder(size)
+ while pos < size:
+ try:
+ lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates,
start=pos)
+ if pos == 0:
+ # fast path
+ return s
+ for ch in s[pos:]:
+ result.append(ch)
+ break
+ except rutf8.CheckError as e:
+ for ch in s[pos:e.pos]:
+ result.append(ch)
+ upos += rutf8.codepoints_in_utf8(s, start=pos, end=e.pos)
+ pos = e.pos
+ assert pos >= 0
+ res, newindex, rettype = errorhandler(errors, 'utf8',
+ 'surrogates not allowed', s, upos, upos + 1)
+ if rettype == 'u':
+ for cp in rutf8.Utf8StringIterator(res):
+ result.append(chr(cp))
+ else:
+ for ch in res:
+ result.append(ch)
+ upos = newindex
+ pos = rutf8._pos_at_index(s, upos)
+ return result.build()
def utf8_encode_latin_1(s, errors, errorhandler, allow_surrogates=False):
try:
@@ -1017,7 +1035,7 @@
# Surrogate-preserving utf-8 decoding. Assuming there is no
# encoding error, it should always be reversible, and the reverse is
# unused encode_utf8sp().
- return str_decode_utf8(string, "string", True, _decode_never_raise,
+ return str_decode_utf8(string, "string", True, decode_never_raise,
allow_surrogates=True)
# ____________________________________________________________
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -1149,7 +1149,6 @@
backslashreplace = ''.join('\\x%02x' % b for b in
ill_surrogate)
assert test_sequence.decode(encoding, "backslashreplace") ==
(before +
backslashreplace
+ after)
-
def test_lone_surrogates_utf_8(self):
"""
@@ -1158,6 +1157,8 @@
"""
e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8",
"surrogateescape").value
+ assert e.start == 1
+ assert e.end == 3
assert e.object[e.start:e.end] == u'\ud800\udfff'
def test_charmap_encode(self):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit