Author: Amaury Forgeot d'Arc <amaur...@gmail.com>
Branch: py3k
Changeset: r48163:8942f2c46162
Date: 2011-10-17 20:18 +0200
http://bitbucket.org/pypy/pypy/changeset/8942f2c46162/
Log: Fix utf-8 encoding; all test_runicode passes.

diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -253,6 +253,8 @@
 result.append((chr((0x80 | (ch & 0x3f)))))

 def unicode_encode_utf_8(s, size, errors, errorhandler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_encode
 assert(size >= 0)
 result = StringBuilder(size)
 pos = 0
@@ -279,11 +281,14 @@
 pos += 1
 _encodeUCS4(result, ch3)
 continue
- r, pos = errorhandler(errors, 'utf-8',
- 'surrogates not allowed',
- s, pos-1, pos)
- result.append(r)
- continue
+ r, pos = errorhandler(errors, 'utf-8',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ result.append(r)
+ continue
+ result.append((chr((0xe0 | (ch >> 12)))))
+ result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+ result.append((chr((0x80 | (ch & 0x3f)))))
 else:
 _encodeUCS4(result, ch)
 return result.build()
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -118,6 +118,9 @@
 for i in range(10000):
 for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
 "utf-32 utf-32-be utf-32-le").split():
+ if encoding == 'utf-8' and 0xd800 <= i <= 0xdfff:
+ # Don't try to encode lone surrogates
+ continue
 self.checkdecode(unichr(i), encoding)

 def test_random(self):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
http://mail.python.org/mailman/listinfo/pypy-commit