Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: unicode-utf8
Changeset: r93163:5b81f483c459
Date: 2017-11-24 15:14 +0100
http://bitbucket.org/pypy/pypy/changeset/5b81f483c459/
Log: fix encoding to operate on utf-8 encoded strings
diff --git a/pypy/module/_pypyjson/interp_encoder.py b/pypy/module/_pypyjson/interp_encoder.py
--- a/pypy/module/_pypyjson/interp_encoder.py
+++ b/pypy/module/_pypyjson/interp_encoder.py
@@ -1,5 +1,5 @@
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.runicode import str_decode_utf_8
+from rpython.rlib import rutf8
from pypy.interpreter import unicodehelper
@@ -30,11 +30,8 @@
# the input is a string with only non-special ascii chars
return w_string
- eh = unicodehelper.decode_error_handler(space)
- u = str_decode_utf_8(
- s, len(s), None, final=True, errorhandler=eh,
- allow_surrogates=True)[0]
- sb = StringBuilder(len(u))
+ unicodehelper.check_utf8_or_raise(space, s)
+ sb = StringBuilder(len(s))
sb.append_slice(s, 0, first)
else:
# We used to check if 'u' contains only safe characters, and return
@@ -44,29 +41,31 @@
# a string (with the ascii encoding). This requires two passes
# over the characters. So we may as well directly turn it into a
# string here --- only one pass.
- u = space.unicode_w(w_string)
- sb = StringBuilder(len(u))
+ s = space.utf8_w(w_string)
+ sb = StringBuilder(len(s))
first = 0
- for i in range(first, len(u)):
- c = u[i]
- if c <= u'~':
- if c == u'"' or c == u'\\':
+ it = rutf8.Utf8StringIterator(s)
+ for i in range(first):
+ it.next()
+ for c in it:
+ if c <= ord('~'):
+ if c == ord('"') or c == ord('\\'):
sb.append('\\')
- elif c < u' ':
- sb.append(ESCAPE_BEFORE_SPACE[ord(c)])
+ elif c < ord(' '):
+ sb.append(ESCAPE_BEFORE_SPACE[c])
continue
- sb.append(chr(ord(c)))
+ sb.append(chr(c))
else:
- if c <= u'\uffff':
+ if c <= ord(u'\uffff'):
sb.append('\\u')
- sb.append(HEX[ord(c) >> 12])
- sb.append(HEX[(ord(c) >> 8) & 0x0f])
- sb.append(HEX[(ord(c) >> 4) & 0x0f])
- sb.append(HEX[ord(c) & 0x0f])
+ sb.append(HEX[c >> 12])
+ sb.append(HEX[(c >> 8) & 0x0f])
+ sb.append(HEX[(c >> 4) & 0x0f])
+ sb.append(HEX[c & 0x0f])
else:
# surrogate pair
- n = ord(c) - 0x10000
+ n = c - 0x10000
s1 = 0xd800 | ((n >> 10) & 0x3ff)
sb.append('\\ud')
sb.append(HEX[(s1 >> 8) & 0x0f])
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit