Author: Armin Rigo <[email protected]>
Branch: unicode-utf8
Changeset: r92734:13a4b012e64e
Date: 2017-10-12 14:26 +0200
http://bitbucket.org/pypy/pypy/changeset/13a4b012e64e/
Log: Random in-progress clean-up
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,4 +1,7 @@
+import py
from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
+
class FakeSpace:
pass
@@ -16,11 +19,14 @@
def test_decode_utf8():
space = FakeSpace()
- assert decode_utf8(space, "abc") == u"abc"
- assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234"
- assert decode_utf8(space, "\xed\xa0\x80") == u"\ud800"
- assert decode_utf8(space, "\xed\xb0\x80") == u"\udc00"
- got = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80")
- assert map(ord, got) == [0xd800, 0xdc00]
- got = decode_utf8(space, "\xf0\x90\x80\x80")
- assert map(ord, got) == [0x10000]
+ assert decode_utf8(space, "abc") == ("abc", 3)
+ assert decode_utf8(space, "\xe1\x88\xb4") == ("\xe1\x88\xb4", 1)
+ assert decode_utf8(space, "\xed\xa0\x80") == ("\xed\xa0\x80", 1)
+ assert decode_utf8(space, "\xed\xb0\x80") == ("\xed\xb0\x80", 1)
+ assert decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80") == (
+ "\xed\xa0\x80\xed\xb0\x80", 2)
+ assert decode_utf8(space, "\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 1)
+
+def test_utf8_encode_ascii():
+ assert utf8_encode_ascii("abc", 3, "??", "??") == "abc"
+ py.test.skip("test me more...")
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -62,7 +62,16 @@
# XXX argh. we want each surrogate to be encoded separately
return ''.join([u.encode('utf8') for u in result_u]), len(result_u)
-def check_utf8(space, string):
+def check_ascii_or_raise(space, string):
+ try:
+ rutf8.check_ascii(string)
+ except rutf8.CheckError as e:
+ decode_error_handler(space)('strict', 'ascii',
+ 'ordinal not in range(128)', string,
+ e.pos, e.pos + 1)
+ assert False, "unreachable"
+
+def check_utf8_or_raise(space, string):
# Surrogates are accepted and not treated specially at all.
# If there happen to be two 3-bytes encoding a pair of surrogates,
# you still get two surrogate unicode characters in the result.
@@ -70,13 +79,13 @@
try:
length = rutf8.check_utf8(string, allow_surrogates=True)
except rutf8.CheckError as e:
- raise Exception("foo")
- decode_error_handler(space)('strict', 'utf8', e.msg, string, e.startpos,
- e.endpos)
- raise False, "unreachable"
+ decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string,
+ e.pos, e.pos + 1)
+ assert False, "unreachable"
return length
def encode_utf8(space, uni):
+ # DEPRECATED
# Note that this function never raises UnicodeEncodeError,
# since surrogates are allowed, either paired or lone.
# A paired surrogate is considered like the non-BMP character
@@ -87,11 +96,8 @@
allow_surrogates=True)
def decode_utf8(space, s):
- u, _ = runicode.str_decode_utf_8(s, len(s),
- "strict", final=True,
- errorhandler=decode_error_handler(space),
- allow_surrogates=True)
- return u.encode('utf8'), len(u)
+ # DEPRECATED
+ return (s, check_utf8_or_raise(space, s))
def utf8_encode_ascii(utf8, utf8len, errors, errorhandler):
if len(utf8) == utf8len:
@@ -109,7 +115,7 @@
else:
utf8_repl, newpos, length = errorhandler(errors, 'ascii',
'ordinal not in range (128)', utf8, lgt, lgt + 1)
- return b.build(), lgt
+ return b.build()
def str_decode_ascii(s, slen, errors, final, errorhandler):
try:
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -403,7 +403,7 @@
@unmarshaller(TYPE_UNICODE)
def unmarshal_unicode(space, u, tc):
arg = u.get_str()
- length = unicodehelper.check_utf8(space, arg)
+ length = unicodehelper.check_utf8_or_raise(space, arg)
return space.newutf8(arg, length)
@marshaller(W_SetObject)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1106,22 +1106,11 @@
if errors is None or errors == 'strict':
if encoding == 'ascii':
s = space.charbuf_w(w_obj)
- try:
- rutf8.check_ascii(s)
- except rutf8.CheckError as e:
- unicodehelper.decode_error_handler(space)(None,
- 'ascii', "ordinal not in range(128)", s, e.pos, e.pos+1)
- assert False
+ unicodehelper.check_ascii_or_raise(space, s)
return space.newutf8(s, len(s))
if encoding == 'utf-8':
s = space.charbuf_w(w_obj)
- try:
- lgt = rutf8.check_utf8(s, allow_surrogates=True)
- except rutf8.CheckError:
- assert False, "fix in the future"
- eh = unicodehelper.decode_error_handler(space)
- eh(None, 'utf8', e.msg, s, e.startpos, e.endpos)
- assert False, "has to raise"
+ lgt = unicodehelper.check_utf8_or_raise(space, s)
return space.newutf8(s, lgt)
w_codecs = space.getbuiltinmodule("_codecs")
w_decode = space.getattr(w_codecs, space.newtext("decode"))
@@ -1176,11 +1165,7 @@
if encoding != 'ascii':
return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
s = space.bytes_w(w_bytes)
- try:
- rutf8.check_ascii(s)
- except rutf8.CheckError:
- # raising UnicodeDecodeError is messy, "please crash for me"
- return unicode_from_encoded_object(space, w_bytes, "ascii", "strict")
+ unicodehelper.check_ascii_or_raise(space, s)
return W_UnicodeObject(s, len(s))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit