Author: Armin Rigo <ar...@tunes.org>
Branch: py3.5
Changeset: r88998:894e8d2f5df8
Date: 2016-12-10 15:58 +0100
http://bitbucket.org/pypy/pypy/changeset/894e8d2f5df8/

Log:    hg merge default

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -327,6 +327,16 @@
 
 def unicode_encode_utf_8(s, size, errors, errorhandler=None,
                          allow_surrogates=allow_surrogate_by_default):
+    # In this function, allow_surrogates can be:
+    #
+    #  * True:  surrogates are always allowed.  A valid surrogate pair
+    #           is replaced with the non-BMP unicode char it stands for,
+    #           which is then encoded as 4 bytes.
+    #
+    #  * False: surrogates are always forbidden.
+    #
+    # See also unicode_encode_utf8sp().
+    #
     if errorhandler is None:
         errorhandler = default_unicode_error_encode
     return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
@@ -391,6 +401,33 @@
                 _encodeUCS4(result, ch)
     return result.build()
 
+def unicode_encode_utf8sp(s, size):
+    # Surrogate-preserving utf-8 encoding.  Any surrogate character
+    # turns into its 3-bytes encoding, whether it is paired or not.
+    # This should always be reversible, and the reverse is the regular
+    # str_decode_utf_8() with allow_surrogates=True.
+    assert(size >= 0)
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
+        if ch < 0x80:
+            # Encode ASCII
+            result.append(chr(ch))
+        elif ch < 0x0800:
+            # Encode Latin-1
+            result.append(chr((0xc0 | (ch >> 6))))
+            result.append(chr((0x80 | (ch & 0x3f))))
+        elif ch < 0x10000:
+            # Encode UCS2 Unicode ordinals, and surrogates
+            result.append((chr((0xe0 | (ch >> 12)))))
+            result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+            result.append((chr((0x80 | (ch & 0x3f)))))
+        else:
+            _encodeUCS4(result, ch)
+    return result.build()
+
 # ____________________________________________________________
 # utf-16
 
diff --git a/rpython/rlib/test/test_runicode.py 
b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -812,6 +812,21 @@
         py.test.raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None)
         assert encoder(u'u\u1234', 2, 'replace') == 'u?'
 
+    def test_encode_utf8sp(self):
+        # for the following test, go to lengths to avoid CPython's optimizer
+        # and .pyc file storage, which collapse the two surrogates into one
+        c = u"\udc00"
+        for input, expected in [
+                (u"", ""),
+                (u"abc", "abc"),
+                (u"\u1234", "\xe1\x88\xb4"),
+                (u"\ud800", "\xed\xa0\x80"),
+                (u"\udc00", "\xed\xb0\x80"),
+                (u"\ud800" + c, "\xed\xa0\x80\xed\xb0\x80"),
+            ]:
+            got = runicode.unicode_encode_utf8sp(input, len(input))
+            assert got == expected
+
 
 class TestTranslation(object):
     def setup_class(cls):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to