Author: Armin Rigo <ar...@tunes.org> Branch: py3.5 Changeset: r88998:894e8d2f5df8 Date: 2016-12-10 15:58 +0100 http://bitbucket.org/pypy/pypy/changeset/894e8d2f5df8/
Log: hg merge default diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -327,6 +327,16 @@ def unicode_encode_utf_8(s, size, errors, errorhandler=None, allow_surrogates=allow_surrogate_by_default): + # In this function, allow_surrogates can be: + # + # * True: surrogates are always allowed. A valid surrogate pair + # is replaced with the non-BMP unicode char it stands for, + # which is then encoded as 4 bytes. + # + # * False: surrogates are always forbidden. + # + # See also unicode_encode_utf8sp(). + # if errorhandler is None: errorhandler = default_unicode_error_encode return unicode_encode_utf_8_impl(s, size, errors, errorhandler, @@ -391,6 +401,33 @@ _encodeUCS4(result, ch) return result.build() +def unicode_encode_utf8sp(s, size): + # Surrogate-preserving utf-8 encoding. Any surrogate character + # turns into its 3-bytes encoding, whether it is paired or not. + # This should always be reversible, and the reverse is the regular + # str_decode_utf_8() with allow_surrogates=True. + assert(size >= 0) + result = StringBuilder(size) + pos = 0 + while pos < size: + ch = ord(s[pos]) + pos += 1 + if ch < 0x80: + # Encode ASCII + result.append(chr(ch)) + elif ch < 0x0800: + # Encode Latin-1 + result.append(chr((0xc0 | (ch >> 6)))) + result.append(chr((0x80 | (ch & 0x3f)))) + elif ch < 0x10000: + # Encode UCS2 Unicode ordinals, and surrogates + result.append((chr((0xe0 | (ch >> 12))))) + result.append((chr((0x80 | ((ch >> 6) & 0x3f))))) + result.append((chr((0x80 | (ch & 0x3f))))) + else: + _encodeUCS4(result, ch) + return result.build() + # ____________________________________________________________ # utf-16 diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py --- a/rpython/rlib/test/test_runicode.py +++ b/rpython/rlib/test/test_runicode.py @@ -812,6 +812,21 @@ py.test.raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None) assert encoder(u'u\u1234', 2, 'replace') == 'u?' + def test_encode_utf8sp(self): + # for the following test, go to lengths to avoid CPython's optimizer + # and .pyc file storage, which collapse the two surrogates into one + c = u"\udc00" + for input, expected in [ + (u"", ""), + (u"abc", "abc"), + (u"\u1234", "\xe1\x88\xb4"), + (u"\ud800", "\xed\xa0\x80"), + (u"\udc00", "\xed\xb0\x80"), + (u"\ud800" + c, "\xed\xa0\x80\xed\xb0\x80"), + ]: + got = runicode.unicode_encode_utf8sp(input, len(input)) + assert got == expected + class TestTranslation(object): def setup_class(cls): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit