[pypy-commit] pypy py3.6: make utf8_encode_utf_8 non-recursive, and pass surrogate pairs to error handler

mattip Mon, 18 Feb 2019 07:07:53 -0800

Author: Matti Picus <matti.pi...@gmail.com>
Branch: py3.6
Changeset: r96061:74fc16b2e4b5
Date: 2019-02-17 20:08 +0200
http://bitbucket.org/pypy/pypy/changeset/74fc16b2e4b5/


Log:    make utf8_encode_utf_8 non-recursive, and pass surrogate pairs to
        error handler

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -21,7 +21,7 @@
                                              space.newtext(msg)]))
     return raise_unicode_exception_decode
 
-def _decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
     assert startingpos >= 0
     ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
     return ''.join(ux), endingpos, 'b'
@@ -218,20 +218,38 @@
     return res.build(), len(s), len(s)
 
 def utf8_encode_utf_8(s, errors, errorhandler, allow_surrogates=False):
-    try:
-        lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates)
-    except rutf8.CheckError as e:
-        # XXX change this to non-recursive
-        pos = e.pos
-        assert pos >= 0
-        start = s[:pos]
-        upos = rutf8.codepoints_in_utf8(s, end=pos)
-        ru, lgt, rettype = errorhandler(errors, 'utf8',
-                    'surrogates not allowed', s, upos, upos + 1)
-        end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler,
-                                allow_surrogates=allow_surrogates)
-        s = start + ru + end
-    return s
+    size = len(s)
+    if size == 0:
+        return ''
+    pos = 0
+    upos = 0
+    result = StringBuilder(size)
+    while pos < size:
+        try:
+            lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates, start=pos)
+            if pos == 0:
+                # fast path
+                return s
+            for ch in s[pos:]:
+                result.append(ch)
+            break
+        except rutf8.CheckError as e:
+            for ch in s[pos:e.pos]:
+                result.append(ch)
+            upos += rutf8.codepoints_in_utf8(s, start=pos, end=e.pos)
+            pos = e.pos
+            assert pos >= 0
+            res, newindex, rettype = errorhandler(errors, 'utf8',
+                        'surrogates not allowed', s, upos, upos + 1)
+            if rettype == 'u':
+                for cp in rutf8.Utf8StringIterator(res):
+                    result.append(chr(cp))
+            else:
+                for ch in res:
+                    result.append(ch)
+            upos = newindex
+            pos = rutf8._pos_at_index(s, upos)
+    return result.build()
 
 def utf8_encode_latin_1(s, errors, errorhandler, allow_surrogates=False):
     try:
@@ -1017,7 +1035,7 @@
     # Surrogate-preserving utf-8 decoding.  Assuming there is no
     # encoding error, it should always be reversible, and the reverse is
     # unused encode_utf8sp().
-    return str_decode_utf8(string, "string", True, _decode_never_raise,
+    return str_decode_utf8(string, "string", True, decode_never_raise,
                            allow_surrogates=True)
 
 # ____________________________________________________________
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -1149,7 +1149,6 @@
                 backslashreplace = ''.join('\\x%02x' % b for b in ill_surrogate)
                 assert test_sequence.decode(encoding, "backslashreplace") == (before +
                                                              backslashreplace + after)
-                
 
     def test_lone_surrogates_utf_8(self):
         """
@@ -1158,6 +1157,8 @@
         """
         e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8",
                    "surrogateescape").value
+        assert e.start == 1
+        assert e.end == 3
         assert e.object[e.start:e.end] == u'\ud800\udfff'
 
     def test_charmap_encode(self):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3.6: make utf8_encode_utf_8 non-recursive, and pass surrogate pairs to error handler

Reply via email to