[pypy-commit] pypy unicode-utf8: fix encoding to operate on utf-8 encoded strings

cfbolz Fri, 24 Nov 2017 06:16:34 -0800

Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: unicode-utf8
Changeset: r93163:5b81f483c459
Date: 2017-11-24 15:14 +0100
http://bitbucket.org/pypy/pypy/changeset/5b81f483c459/


Log:    fix encoding to operate on utf-8 encoded strings

diff --git a/pypy/module/_pypyjson/interp_encoder.py 
b/pypy/module/_pypyjson/interp_encoder.py
--- a/pypy/module/_pypyjson/interp_encoder.py
+++ b/pypy/module/_pypyjson/interp_encoder.py
@@ -1,5 +1,5 @@
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.runicode import str_decode_utf_8
+from rpython.rlib import rutf8
 from pypy.interpreter import unicodehelper
 
 
@@ -30,11 +30,8 @@
             # the input is a string with only non-special ascii chars
             return w_string
 
-        eh = unicodehelper.decode_error_handler(space)
-        u = str_decode_utf_8(
-                s, len(s), None, final=True, errorhandler=eh,
-                allow_surrogates=True)[0]
-        sb = StringBuilder(len(u))
+        unicodehelper.check_utf8_or_raise(space, s)
+        sb = StringBuilder(len(s))
         sb.append_slice(s, 0, first)
     else:
         # We used to check if 'u' contains only safe characters, and return
@@ -44,29 +41,31 @@
         # a string (with the ascii encoding).  This requires two passes
         # over the characters.  So we may as well directly turn it into a
         # string here --- only one pass.
-        u = space.unicode_w(w_string)
-        sb = StringBuilder(len(u))
+        s = space.utf8_w(w_string)
+        sb = StringBuilder(len(s))
         first = 0
 
-    for i in range(first, len(u)):
-        c = u[i]
-        if c <= u'~':
-            if c == u'"' or c == u'\\':
+    it = rutf8.Utf8StringIterator(s)
+    for i in range(first):
+        it.next()
+    for c in it:
+        if c <= ord('~'):
+            if c == ord('"') or c == ord('\\'):
                 sb.append('\\')
-            elif c < u' ':
-                sb.append(ESCAPE_BEFORE_SPACE[ord(c)])
+            elif c < ord(' '):
+                sb.append(ESCAPE_BEFORE_SPACE[c])
                 continue
-            sb.append(chr(ord(c)))
+            sb.append(chr(c))
         else:
-            if c <= u'\uffff':
+            if c <= ord(u'\uffff'):
                 sb.append('\\u')
-                sb.append(HEX[ord(c) >> 12])
-                sb.append(HEX[(ord(c) >> 8) & 0x0f])
-                sb.append(HEX[(ord(c) >> 4) & 0x0f])
-                sb.append(HEX[ord(c) & 0x0f])
+                sb.append(HEX[c >> 12])
+                sb.append(HEX[(c >> 8) & 0x0f])
+                sb.append(HEX[(c >> 4) & 0x0f])
+                sb.append(HEX[c & 0x0f])
             else:
                 # surrogate pair
-                n = ord(c) - 0x10000
+                n = c - 0x10000
                 s1 = 0xd800 | ((n >> 10) & 0x3ff)
                 sb.append('\\ud')
                 sb.append(HEX[(s1 >> 8) & 0x0f])
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: fix encoding to operate on utf-8 encoded strings

Reply via email to