Author: Armin Rigo <[email protected]>
Branch: 
Changeset: r88992:81769ca3299e
Date: 2016-12-10 15:09 +0100
http://bitbucket.org/pypy/pypy/changeset/81769ca3299e/

Log:    Test for unicodehelper.{decode_utf8,encode_utf8}

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
new file mode 100644
--- /dev/null
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -0,0 +1,26 @@
+from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+
+class FakeSpace:
+    pass
+
+def test_encode_utf8():
+    space = FakeSpace()
+    assert encode_utf8(space, u"abc") == "abc"
+    assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4"
+    assert encode_utf8(space, u"\ud800") == "\xed\xa0\x80"
+    assert encode_utf8(space, u"\udc00") == "\xed\xb0\x80"
+    # for the following test, go to lengths to avoid CPython's optimizer
+    # and .pyc file storage, which collapse the two surrogates into one
+    c = u"\udc00"
+    assert encode_utf8(space, u"\ud800" + c) == "\xf0\x90\x80\x80"
+
+def test_decode_utf8():
+    space = FakeSpace()
+    assert decode_utf8(space, "abc") == u"abc"
+    assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234"
+    assert decode_utf8(space, "\xed\xa0\x80") == u"\ud800"
+    assert decode_utf8(space, "\xed\xb0\x80") == u"\udc00"
+    got = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80")
+    assert map(ord, got) == [0xd800, 0xdc00]
+    got = decode_utf8(space, "\xf0\x90\x80\x80")
+    assert map(ord, got) == [0x10000]
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -61,6 +61,8 @@
     # Note that this function never raises UnicodeEncodeError,
     # since surrogate pairs are allowed.
     # This is not the case with Python3.
+    # Also, note that the two characters \ud800\udc00 are considered as
+    # a paired surrogate, and turn into a single 4-byte utf8 char.
     return runicode.unicode_encode_utf_8(
         uni, len(uni), "strict",
         errorhandler=raise_unicode_exception_encode,
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to