[pypy-commit] pypy unicode-utf8: Random in-progress clean-up

arigo Thu, 12 Oct 2017 05:27:37 -0700

Author: Armin Rigo <[email protected]>
Branch: unicode-utf8
Changeset: r92734:13a4b012e64e
Date: 2017-10-12 14:26 +0200
http://bitbucket.org/pypy/pypy/changeset/13a4b012e64e/


Log:    Random in-progress clean-up

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,4 +1,7 @@
+import py
 from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
+
 
 class FakeSpace:
     pass
@@ -16,11 +19,14 @@
 
 def test_decode_utf8():
     space = FakeSpace()
-    assert decode_utf8(space, "abc") == u"abc"
-    assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234"
-    assert decode_utf8(space, "\xed\xa0\x80") == u"\ud800"
-    assert decode_utf8(space, "\xed\xb0\x80") == u"\udc00"
-    got = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80")
-    assert map(ord, got) == [0xd800, 0xdc00]
-    got = decode_utf8(space, "\xf0\x90\x80\x80")
-    assert map(ord, got) == [0x10000]
+    assert decode_utf8(space, "abc") == ("abc", 3)
+    assert decode_utf8(space, "\xe1\x88\xb4") == ("\xe1\x88\xb4", 1)
+    assert decode_utf8(space, "\xed\xa0\x80") == ("\xed\xa0\x80", 1)
+    assert decode_utf8(space, "\xed\xb0\x80") == ("\xed\xb0\x80", 1)
+    assert decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80") == (
+        "\xed\xa0\x80\xed\xb0\x80", 2)
+    assert decode_utf8(space, "\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 1)
+
+def test_utf8_encode_ascii():
+    assert utf8_encode_ascii("abc", 3, "??", "??") == "abc"
+    py.test.skip("test me more...")
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -62,7 +62,16 @@
     # XXX argh.  we want each surrogate to be encoded separately
     return ''.join([u.encode('utf8') for u in result_u]), len(result_u)
 
-def check_utf8(space, string):
+def check_ascii_or_raise(space, string):
+    try:
+        rutf8.check_ascii(string)
+    except rutf8.CheckError as e:
+        decode_error_handler(space)('strict', 'ascii',
+                                    'ordinal not in range(128)', string,
+                                    e.pos, e.pos + 1)
+        assert False, "unreachable"
+
+def check_utf8_or_raise(space, string):
     # Surrogates are accepted and not treated specially at all.
     # If there happen to be two 3-bytes encoding a pair of surrogates,
     # you still get two surrogate unicode characters in the result.
@@ -70,13 +79,13 @@
     try:
         length = rutf8.check_utf8(string, allow_surrogates=True)
     except rutf8.CheckError as e:
-        raise Exception("foo")
-        decode_error_handler(space)('strict', 'utf8', e.msg, string, e.startpos,
-                                    e.endpos)
-        raise False, "unreachable"
+        decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string,
+                                    e.pos, e.pos + 1)
+        assert False, "unreachable"
     return length
 
 def encode_utf8(space, uni):
+    # DEPRECATED
     # Note that this function never raises UnicodeEncodeError,
     # since surrogates are allowed, either paired or lone.
     # A paired surrogate is considered like the non-BMP character
@@ -87,11 +96,8 @@
         allow_surrogates=True)
 
 def decode_utf8(space, s):
-    u, _ = runicode.str_decode_utf_8(s, len(s),
-        "strict", final=True,
-        errorhandler=decode_error_handler(space),
-        allow_surrogates=True)
-    return u.encode('utf8'), len(u)
+    # DEPRECATED
+    return (s, check_utf8_or_raise(space, s))
 
 def utf8_encode_ascii(utf8, utf8len, errors, errorhandler):
     if len(utf8) == utf8len:
@@ -109,7 +115,7 @@
         else:
             utf8_repl, newpos, length = errorhandler(errors, 'ascii', 
                 'ordinal not in range (128)', utf8, lgt, lgt + 1)
-    return b.build(), lgt
+    return b.build()
 
 def str_decode_ascii(s, slen, errors, final, errorhandler):
     try:
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -403,7 +403,7 @@
 @unmarshaller(TYPE_UNICODE)
 def unmarshal_unicode(space, u, tc):
     arg = u.get_str()
-    length = unicodehelper.check_utf8(space, arg)
+    length = unicodehelper.check_utf8_or_raise(space, arg)
     return space.newutf8(arg, length)
 
 @marshaller(W_SetObject)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1106,22 +1106,11 @@
     if errors is None or errors == 'strict':
         if encoding == 'ascii':
             s = space.charbuf_w(w_obj)
-            try:
-                rutf8.check_ascii(s)
-            except rutf8.CheckError as e:
-                unicodehelper.decode_error_handler(space)(None,
-                    'ascii', "ordinal not in range(128)", s, e.pos, e.pos+1)
-                assert False
+            unicodehelper.check_ascii_or_raise(space, s)
             return space.newutf8(s, len(s))
         if encoding == 'utf-8':
             s = space.charbuf_w(w_obj)
-            try:
-                lgt = rutf8.check_utf8(s, allow_surrogates=True)
-            except rutf8.CheckError:
-                assert False, "fix in the future"
-                eh = unicodehelper.decode_error_handler(space)
-                eh(None, 'utf8', e.msg, s, e.startpos, e.endpos)
-                assert False, "has to raise"
+            lgt = unicodehelper.check_utf8_or_raise(space, s)
             return space.newutf8(s, lgt)
     w_codecs = space.getbuiltinmodule("_codecs")
     w_decode = space.getattr(w_codecs, space.newtext("decode"))
@@ -1176,11 +1165,7 @@
     if encoding != 'ascii':
         return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
     s = space.bytes_w(w_bytes)
-    try:
-        rutf8.check_ascii(s)
-    except rutf8.CheckError:
-        # raising UnicodeDecodeError is messy, "please crash for me"
-        return unicode_from_encoded_object(space, w_bytes, "ascii", "strict")
+    unicodehelper.check_ascii_or_raise(space, s)
     return W_UnicodeObject(s, len(s))
 
 
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: Random in-progress clean-up

Reply via email to