Author: Amaury Forgeot d'Arc <amaur...@gmail.com> Branch: py3k Changeset: r48241:e0cf3d8b87a2 Date: 2011-10-19 22:31 +0200 http://bitbucket.org/pypy/pypy/changeset/e0cf3d8b87a2/
Log: Lot of fixes in the _codecs module diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -49,7 +49,7 @@ "(unicode, int) tuple, not %s") raise operationerrfmt( space.w_TypeError, msg, - space.str_w(space.repr(w_res))) + space.unicode_w(space.repr(w_res))) w_replace, w_newpos = space.fixedview(w_res, 2) newpos = space.int_w(w_newpos) if newpos < 0: @@ -487,7 +487,7 @@ make_encoder_wrapper('mbcs_encode') make_decoder_wrapper('mbcs_decode') -@unwrap_spec(data=str, errors='str_or_None', byteorder=int) +@unwrap_spec(data="bufferstr", errors='str_or_None', byteorder=int) def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=False): if errors is None: errors = 'strict' @@ -507,7 +507,7 @@ return space.newtuple([space.wrap(res), space.wrap(consumed), space.wrap(byteorder)]) -@unwrap_spec(data=str, errors='str_or_None', byteorder=int) +@unwrap_spec(data="bufferstr", errors='str_or_None', byteorder=int) def utf_32_ex_decode(space, data, errors='strict', byteorder=0, w_final=False): final = space.is_true(w_final) state = space.fromcache(CodecState) @@ -599,7 +599,7 @@ # Charmap may return a string try: - x = space.realstr_w(w_ch) + x = space.bytes_w(w_ch) except OperationError, e: if not e.match(space, space.w_TypeError): raise @@ -626,7 +626,7 @@ raise OperationError(space.w_TypeError, space.wrap("invalid mapping")) -@unwrap_spec(string=str, errors='str_or_None') +@unwrap_spec(string="bufferstr", errors='str_or_None') def charmap_decode(space, string, errors="strict", w_mapping=None): if errors is None: errors = 'strict' @@ -658,7 +658,7 @@ result = runicode.unicode_encode_charmap( uni, len(uni), errors, state.encode_error_handler, mapping) - return space.newtuple([space.wrap(result), space.wrap(len(uni))]) + return space.newtuple([space.wrapbytes(result), space.wrap(len(uni))]) @unwrap_spec(chars=unicode) @@ -716,7 +716,7 @@ if space.isinstance_w(w_string, space.w_unicode): return space.newtuple([w_string, space.len(w_string)]) - string = space.str_w(w_string) + string = space.bytes_w(w_string) if len(string) == 0: return space.newtuple([space.wrap(u''), space.wrap(0)]) @@ -729,21 +729,21 @@ return space.newtuple([space.wrap(result), space.wrap(consumed)]) # ____________________________________________________________ -# support for the "string escape" codec +# support for the "string escape" translation # This is a bytes-to bytes transformation -@unwrap_spec(data=str, errors='str_or_None') +@unwrap_spec(data="bufferstr", errors='str_or_None') def escape_encode(space, data, errors='strict'): from pypy.objspace.std.stringobject import string_escape_encode result = string_escape_encode(data, quote="'") start = 1 end = len(result) - 1 assert end >= 0 - w_result = space.wrap(result[start:end]) + w_result = space.wrapbytes(result[start:end]) return space.newtuple([w_result, space.wrap(len(data))]) -@unwrap_spec(data=str, errors='str_or_None') +@unwrap_spec(data="bufferstr", errors='str_or_None') def escape_decode(space, data, errors='strict'): from pypy.interpreter.pyparser.parsestring import PyString_DecodeEscape result = PyString_DecodeEscape(space, data, None) - return space.newtuple([space.wrap(result), space.wrap(len(data))]) + return space.newtuple([space.wrapbytes(result), space.wrap(len(data))]) diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -17,7 +17,7 @@ 'utf-32', 'utf-32-le', 'utf-32-be', 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'): - assert unicode(u.encode(encoding),encoding) == u + assert str(u.encode(encoding),encoding) == u def test_ucs4(self): x = u'\U00100000' @@ -25,14 +25,14 @@ assert x == y def test_named_unicode(self): - assert unicode('\\N{SPACE}','unicode-escape') == u" " - raises( UnicodeDecodeError, unicode,'\\N{SPACE','unicode-escape') - raises( UnicodeDecodeError, unicode,'\\NSPACE}','unicode-escape') - raises( UnicodeDecodeError, unicode,'\\NSPACE','unicode-escape') - raises( UnicodeDecodeError, unicode,'\\N','unicode-escape') - assert unicode('\\N{SPACE}\\N{SPACE}','unicode-escape') == u" " - assert unicode('\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a " - assert "\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx" + assert str(b'\\N{SPACE}','unicode-escape') == u" " + raises( UnicodeDecodeError, str,b'\\N{SPACE','unicode-escape') + raises( UnicodeDecodeError, str,b'\\NSPACE}','unicode-escape') + raises( UnicodeDecodeError, str,b'\\NSPACE','unicode-escape') + raises( UnicodeDecodeError, str,b'\\N','unicode-escape') + assert str(b'\\N{SPACE}\\N{SPACE}','unicode-escape') == u" " + assert str(b'\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a " + assert b"\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx" assert 1 <= len(u"\N{CJK UNIFIED IDEOGRAPH-20000}") <= 2 def test_literals(self): @@ -40,26 +40,26 @@ def test_insecure_pickle(self): import pickle - insecure = ["abc", "2 + 2", # not quoted + insecure = [b"abc", b"2 + 2", # not quoted #"'abc' + 'def'", # not a single quoted string - "'abc", # quote is not closed - "'abc\"", # open quote and close quote don't match - "'abc' ?", # junk after close quote - "'\\'", # trailing backslash + b"'abc", # quote is not closed + b"'abc\"", # open quote and close quote don't match + b"'abc' ?", # junk after close quote + b"'\\'", # trailing backslash # some tests of the quoting rules #"'abc\"\''", #"'\\\\a\'\'\'\\\'\\\\\''", ] for s in insecure: - buf = "S" + s + "\012p0\012." + buf = b"S" + s + b"\012p0\012." raises (ValueError, pickle.loads, buf) def test_unicodedecodeerror(self): assert str(UnicodeDecodeError( - "ascii", "g\xfcrk", 1, 2, "ouch")) == "'ascii' codec can't decode byte 0xfc in position 1: ouch" + "ascii", b"g\xfcrk", 1, 2, "ouch")) == "'ascii' codec can't decode byte 0xfc in position 1: ouch" assert str(UnicodeDecodeError( - "ascii", "g\xfcrk", 1, 3, "ouch")) == "'ascii' codec can't decode bytes in position 1-2: ouch" + "ascii", b"g\xfcrk", 1, 3, "ouch")) == "'ascii' codec can't decode bytes in position 1-2: ouch" def test_unicodetranslateerror(self): @@ -73,7 +73,7 @@ assert str(UnicodeTranslateError( u"g\uffffrk", 1, 2, "ouch"))== "can't translate character u'\\uffff' in position 1: ouch" - if sys.maxunicode > 0xffff and len(unichr(0x10000)) == 1: + if sys.maxunicode > 0xffff and len(chr(0x10000)) == 1: assert str(UnicodeTranslateError( u"g\U00010000rk", 1, 2, "ouch"))== "can't translate character u'\\U00010000' in position 1: ouch" @@ -96,30 +96,31 @@ assert str(UnicodeEncodeError( "ascii", u"\uffffx", 0, 1, "ouch"))=="'ascii' codec can't encode character u'\\uffff' in position 0: ouch" - if sys.maxunicode > 0xffff and len(unichr(0x10000)) == 1: + if sys.maxunicode > 0xffff and len(chr(0x10000)) == 1: assert str(UnicodeEncodeError( "ascii", u"\U00010000x", 0, 1, "ouch")) =="'ascii' codec can't encode character u'\\U00010000' in position 0: ouch" def test_indexerror(self): - test = "\\" # trailing backslash - raises (ValueError, test.decode,'string-escape') + import _codecs + test = b"\\" # trailing backslash + raises (ValueError, _codecs.escape_decode, test) def test_charmap_decode(self): from _codecs import charmap_decode import sys - assert charmap_decode('', 'strict', 'blablabla') == ('', 0) - assert charmap_decode('xxx') == ('xxx', 3) - assert charmap_decode('xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3) - map = tuple([unichr(i) for i in range(256)]) - assert charmap_decode('xxx\xff', 'strict', map) == (u'xxx\xff', 4) + assert charmap_decode(b'', 'strict', 'blablabla') == ('', 0) + assert charmap_decode(b'xxx') == ('xxx', 3) + assert charmap_decode(b'xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3) + map = tuple([chr(i) for i in range(256)]) + assert charmap_decode(b'xxx\xff', 'strict', map) == (u'xxx\xff', 4) raises(TypeError, charmap_decode, '\xff', "replace", {0xff: 0x10001}) def test_unicode_escape(self): from _codecs import unicode_escape_encode, unicode_escape_decode assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3) - assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3) - assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12) + assert unicode_escape_decode(b'abc') == (b'abc'.decode('unicode_escape'), 3) + assert unicode_escape_decode(b'\\x61\\x62\\x63') == (u'abc', 12) class AppTestPartialEvaluation: @@ -144,13 +145,13 @@ u"\x00\xff\u07ff\u0800\uffff", ] - buffer = '' + buffer = b'' result = u"" for (c, partialresult) in zip(u"\x00\xff\u07ff\u0800\uffff".encode(encoding), check_partial): - buffer += c + buffer += bytes([c]) res = _codecs.utf_8_decode(buffer,'strict',False) if res[1] >0 : - buffer = '' + buffer = b'' result += res[0] assert result == partialresult @@ -169,26 +170,26 @@ u"\x00\xff\u0100", u"\x00\xff\u0100\uffff", ] - buffer = '' + buffer = b'' result = u"" for (c, partialresult) in zip(u"\x00\xff\u0100\uffff".encode(encoding), check_partial): - buffer += c + buffer += bytes([c]) res = _codecs.utf_16_decode(buffer,'strict',False) if res[1] >0 : - buffer = '' + buffer = b'' result += res[0] assert result == partialresult def test_bug1098990_a(self): - import codecs, StringIO + import codecs, io self.encoding = 'utf-8' s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n" s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n" s3 = u"next line.\r\n" s = (s1+s2+s3).encode(self.encoding) - stream = StringIO.StringIO(s) + stream = io.BytesIO(s) reader = codecs.getreader(self.encoding)(stream) assert reader.readline() == s1 assert reader.readline() == s2 @@ -196,7 +197,7 @@ assert reader.readline() == u"" def test_bug1098990_b(self): - import codecs, StringIO + import codecs, io self.encoding = 'utf-8' s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n" s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n" @@ -205,7 +206,7 @@ s5 = u"againokay.\r\n" s = (s1+s2+s3+s4+s5).encode(self.encoding) - stream = StringIO.StringIO(s) + stream = io.BytesIO(s) reader = codecs.getreader(self.encoding)(stream) assert reader.readline() == s1 assert reader.readline() == s2 @@ -216,11 +217,11 @@ def test_seek_utf16le(self): # all codecs should be able to encode these - import codecs, StringIO + import codecs, io encoding = 'utf-16-le' s = u"%s\n%s\n" % (10*u"abc123", 10*u"def456") - reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding))) - for t in xrange(5): + reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding))) + for t in range(5): # Test that calling seek resets the internal codec state and buffers reader.seek(0, 0) line = reader.readline() @@ -229,71 +230,75 @@ def test_unicode_internal_encode(self): import sys - class U(unicode): + class U(str): pass enc = U(u"a").encode("unicode_internal") if sys.maxunicode == 65535: # UCS2 build if sys.byteorder == "big": - assert enc == "\x00a" + assert enc == b"\x00a" else: - assert enc == "a\x00" + assert enc == b"a\x00" elif len(u"\U00010098") == 1: # UCS4 build on a UCS4 CPython enc2 = u"\U00010098".encode("unicode_internal") if sys.byteorder == "big": - assert enc == "\x00\x00\x00a" - assert enc2 == "\x00\x01\x00\x98" + assert enc == b"\x00\x00\x00a" + assert enc2 == b"\x00\x01\x00\x98" else: - assert enc == "a\x00\x00\x00" - assert enc2 == "\x98\x00\x01\x00" + assert enc == b"a\x00\x00\x00" + assert enc2 == b"\x98\x00\x01\x00" else: # UCS4 build on a UCS2 CPython if sys.byteorder == "big": - assert enc == "\x00\x00\x00a" + assert enc == b"\x00\x00\x00a" else: - assert enc == "a\x00\x00\x00" + assert enc == b"a\x00\x00\x00" def test_unicode_internal_decode(self): import sys if sys.maxunicode == 65535: # UCS2 build if sys.byteorder == "big": - bytes = "\x00a" + bytes = b"\x00a" else: - bytes = "a\x00" + bytes = b"a\x00" else: # UCS4 build if sys.byteorder == "big": - bytes = "\x00\x00\x00a" - bytes2 = "\x00\x01\x00\x98" + bytes = b"\x00\x00\x00a" + bytes2 = b"\x00\x01\x00\x98" else: - bytes = "a\x00\x00\x00" - bytes2 = "\x98\x00\x01\x00" + bytes = b"a\x00\x00\x00" + bytes2 = b"\x98\x00\x01\x00" assert bytes2.decode("unicode_internal") == u"\U00010098" assert bytes.decode("unicode_internal") == u"a" def test_raw_unicode_escape(self): - assert unicode("\u0663", "raw-unicode-escape") == u"\u0663" - assert u"\u0663".encode("raw-unicode-escape") == "\u0663" + assert str(b"\u0663", "raw-unicode-escape") == u"\u0663" + assert u"\u0663".encode("raw-unicode-escape") == b"\u0663" def test_escape_decode(self): - test = 'a\n\\b\x00c\td\u2045'.encode('string_escape') - assert test.decode('string_escape') =='a\n\\b\x00c\td\u2045' - assert '\\077'.decode('string_escape') == '?' - assert '\\100'.decode('string_escape') == '@' - assert '\\253'.decode('string_escape') == chr(0253) - assert '\\312'.decode('string_escape') == chr(0312) + import _codecs + test = _codecs.escape_encode(b'a\n\\b\x00c\td\u2045')[0] + assert _codecs.escape_decode(test)[0] == b'a\n\\b\x00c\td\u2045' + assert _codecs.escape_decode(b'\\077')[0] == b'?' + assert _codecs.escape_decode(b'\\100')[0] == b'@' + assert _codecs.escape_decode(b'\\253')[0] == bytes([0253]) + assert _codecs.escape_decode(b'\\312')[0] == bytes([0312]) def test_escape_decode_wrap_around(self): - assert '\\400'.decode('string_escape') == chr(0) + import _codecs + assert _codecs.escape_decode(b'\\400')[0] == b'\0' def test_escape_decode_ignore_invalid(self): - assert '\\9'.decode('string_escape') == '\\9' - assert '\\01'.decode('string_escape') == chr(01) - assert '\\0f'.decode('string_escape') == chr(0) + 'f' - assert '\\08'.decode('string_escape') == chr(0) + '8' + import _codecs + assert _codecs.escape_decode(b'\\9')[0] == b'\\9' + assert _codecs.escape_decode(b'\\01')[0] == b'\x01' + assert _codecs.escape_decode(b'\\0f')[0] == b'\0' + b'f' + assert _codecs.escape_decode(b'\\08')[0] == b'\0' + b'8' def test_escape_encode(self): - assert '"'.encode('string_escape') == '"' - assert "'".encode('string_escape') == "\\'" + import _codecs + assert _codecs.escape_encode(b'"')[0] == b'"' + assert _codecs.escape_encode(b"'")[0] == b"\\'" def test_decode_utf8_different_case(self): constant = u"a" @@ -304,35 +309,35 @@ def search_function(encoding): def f(input, errors="strict"): return 42 - print encoding + print(encoding) if encoding == 'test.mytestenc': return (f, f, None, None) return None _codecs.register(search_function) - raises(TypeError, "hello".decode, "test.mytestenc") + raises(TypeError, b"hello".decode, "test.mytestenc") raises(TypeError, u"hello".encode, "test.mytestenc") def test_cpytest_decode(self): import codecs - assert codecs.decode('\xe4\xf6\xfc', 'latin-1') == u'\xe4\xf6\xfc' + assert codecs.decode(b'\xe4\xf6\xfc', 'latin-1') == u'\xe4\xf6\xfc' raises(TypeError, codecs.decode) - assert codecs.decode('abc') == u'abc' - raises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii') + assert codecs.decode(b'abc') == u'abc' + raises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii') def test_bad_errorhandler_return(self): import codecs def baddecodereturn1(exc): return 42 codecs.register_error("test.baddecodereturn1", baddecodereturn1) - raises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1") - raises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1") - raises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1") - raises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1") - raises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1") - raises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1") + raises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1") + raises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1") + raises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1") + raises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1") + raises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1") + raises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1") def test_cpy_bug1175396(self): - import codecs, StringIO + import codecs, io s = [ '<%!--===================================================\r\n', ' BLOG index page: show recent articles,\r\n', @@ -364,15 +369,15 @@ ' log.error("Error loading articles: "+str(x))\r\n', ' self.abort("cannot load articles")\r\n', ] - stream = StringIO.StringIO("".join(s).encode("utf7")) - assert "aborrt" not in stream.getvalue() + stream = io.BytesIO("".join(s).encode("utf7")) + assert b"aborrt" not in stream.getvalue() reader = codecs.getreader("utf7")(stream) for (i, line) in enumerate(reader): assert line == s[i] def test_array(self): import _codecs, array - _codecs.readbuffer_encode(array.array('c', 'spam')) == ('spam', 4) + _codecs.readbuffer_encode(array.array('b', b'spam')) == ('spam', 4) def test_utf8sig(self): import codecs @@ -382,28 +387,28 @@ def test_escape_decode_escaped_newline(self): import _codecs - s = '\\\n' + s = b'\\\n' decoded = _codecs.unicode_escape_decode(s)[0] assert decoded == '' def test_charmap_decode_1(self): import codecs - assert codecs.charmap_encode(u'xxx') == ('xxx', 3) - assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3) + assert codecs.charmap_encode(u'xxx') == (b'xxx', 3) + assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): b'XX'}) == (b'XXXXXX', 3) - res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab") + res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab") assert res == (u"ab\ufffd", 3) - res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe") + res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab\ufffe") assert res == (u'ab\ufffd', 3) def test_decode_errors(self): import sys if sys.maxunicode > 0xffff: try: - "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal") + b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal") except UnicodeDecodeError, ex: assert "unicode_internal" == ex.encoding - assert "\x00\x00\x00\x00\x00\x11\x11\x00" == ex.object + assert b"\x00\x00\x00\x00\x00\x11\x11\x00" == ex.object assert ex.start == 4 assert ex.end == 8 else: @@ -414,14 +419,14 @@ assert codecs.replace_errors(UnicodeEncodeError( "ascii", u"\u3042", 0, 1, "ouch")) == (u"?", 1) assert codecs.replace_errors(UnicodeDecodeError( - "ascii", "\xff", 0, 1, "ouch")) == (u"\ufffd", 1) + "ascii", b"\xff", 0, 1, "ouch")) == (u"\ufffd", 1) assert codecs.replace_errors(UnicodeTranslateError( u"\u3042", 0, 1, "ouch")) == (u"\ufffd", 1) assert codecs.replace_errors(UnicodeEncodeError( "ascii", u"\u3042\u3042", 0, 2, "ouch")) == (u"??", 2) assert codecs.replace_errors(UnicodeDecodeError( - "ascii", "\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2) + "ascii", b"\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2) assert codecs.replace_errors(UnicodeTranslateError( u"\u3042\u3042", 0, 2, "ouch")) == (u"\ufffd\ufffd", 2) @@ -439,13 +444,13 @@ # A UnicodeDecodeError object without an end attribute class NoEndUnicodeDecodeError(UnicodeDecodeError): def __init__(self): - UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad") + UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad") del self.end # A UnicodeDecodeError object with a bad object attribute class BadObjectUnicodeDecodeError(UnicodeDecodeError): def __init__(self): - UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad") + UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad") self.object = [] # A UnicodeTranslateError object without a start attribute @@ -477,11 +482,11 @@ # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement def test_decode_ignore(self): - assert '\xff'.decode('utf-7', 'ignore') == '' - assert '\x00'.decode('unicode-internal', 'ignore') == '' + assert b'\xff'.decode('utf-7', 'ignore') == '' + assert b'\x00'.decode('unicode-internal', 'ignore') == '' def test_backslahreplace(self): - assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000' + assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == b'a\\xac\u1234\u20ac\u8000' def test_surrogateescape(self): assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b' @@ -502,10 +507,10 @@ "test.badhandler" ) for (enc, bytes) in ( - ("utf-8", "\xff"), - ("ascii", "\xff"), - ("utf-7", "+x-"), - ("unicode-internal", "\x00"), + ("utf-8", b"\xff"), + ("ascii", b"\xff"), + ("utf-7", b"+x-"), + ("unicode-internal", b"\x00"), ): raises( TypeError, @@ -518,19 +523,19 @@ import codecs import sys try: - '\x00'.decode('unicode-internal') + b'\x00'.decode('unicode-internal') except UnicodeDecodeError: pass else: raise Exception("DID NOT RAISE") - res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace") + res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace") if sys.maxunicode > 65535: assert res == u"\u0000\ufffd" # UCS4 build else: assert res == u"\x00\x00\ufffd" # UCS2 build - res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore") + res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore") if sys.maxunicode > 65535: assert res == u"\u0000" # UCS4 build else: @@ -541,7 +546,7 @@ raise TypeError("don't know how to handle %r" % exc) return (u"\x01", 1) codecs.register_error("test.hui", handler_unicodeinternal) - res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui") + res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui") if sys.maxunicode > 65535: assert res == u"\u0000\u0001\u0000" # UCS4 build else: @@ -550,31 +555,31 @@ def test_encode_error_bad_handler(self): import codecs codecs.register_error("test.bad_handler", lambda e: (repl, 1)) - assert u"xyz".encode("latin-1", "test.bad_handler") == "xyz" + assert u"xyz".encode("latin-1", "test.bad_handler") == b"xyz" repl = u"\u1234" raises(UnicodeEncodeError, u"\u5678".encode, "latin-1", "test.bad_handler") repl = u"\u00E9" s = u"\u5678".encode("latin-1", "test.bad_handler") - assert s == '\xe9' + assert s == b'\xe9' def test_charmap_encode(self): - assert 'xxx'.encode('charmap') == 'xxx' + assert 'xxx'.encode('charmap') == b'xxx' import codecs raises(TypeError, codecs.charmap_encode, u'\xff', "replace", {0xff: 300}) raises(UnicodeError, codecs.charmap_encode, u"\xff", "replace", {0xff: None}) def test_charmap_encode_replace(self): - charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"]) - charmap[ord("?")] = "XYZ" + charmap = dict([(c, bytes([c, c]).upper()) for c in b"abcdefgh"]) + charmap[ord("?")] = b"XYZ" import codecs sin = u"abcDEF" sout = codecs.charmap_encode(sin, "replace", charmap)[0] - assert sout == "AABBCCXYZXYZXYZ" + assert sout == b"AABBCCXYZXYZXYZ" def test_charmap_decode_2(self): - assert 'foo'.decode('charmap') == 'foo' + assert b'foo'.decode('charmap') == 'foo' def test_charmap_build(self): import codecs @@ -583,25 +588,25 @@ def test_utf7_start_end_in_exception(self): try: - '+IC'.decode('utf-7') + b'+IC'.decode('utf-7') except UnicodeDecodeError, exc: assert exc.start == 0 assert exc.end == 3 def test_utf7_surrogate(self): - raises(UnicodeDecodeError, '+3ADYAA-'.decode, 'utf-7') + raises(UnicodeDecodeError, b'+3ADYAA-'.decode, 'utf-7') def test_utf_16_encode_decode(self): import codecs x = u'123abc' - assert codecs.getencoder('utf-16')(x) == ('\xff\xfe1\x002\x003\x00a\x00b\x00c\x00', 6) - assert codecs.getdecoder('utf-16')('\xff\xfe1\x002\x003\x00a\x00b\x00c\x00') == (x, 14) + assert codecs.getencoder('utf-16')(x) == (b'\xff\xfe1\x002\x003\x00a\x00b\x00c\x00', 6) + assert codecs.getdecoder('utf-16')(b'\xff\xfe1\x002\x003\x00a\x00b\x00c\x00') == (x, 14) def test_unicode_escape(self): - assert u'\\'.encode('unicode-escape') == '\\\\' - assert '\\\\'.decode('unicode-escape') == u'\\' - assert u'\ud801'.encode('unicode-escape') == '\\ud801' - assert u'\u0013'.encode('unicode-escape') == '\\x13' + assert u'\\'.encode('unicode-escape') == b'\\\\' + assert b'\\\\'.decode('unicode-escape') == u'\\' + assert u'\ud801'.encode('unicode-escape') == b'\\ud801' + assert u'\u0013'.encode('unicode-escape') == b'\\x13' def test_mbcs(self): import sys @@ -611,11 +616,3 @@ assert u'caf\xe9'.encode('mbcs') == 'caf\xe9' assert u'\u040a'.encode('mbcs') == '?' # some cyrillic letter assert 'cafx\e9'.decode('mbcs') == u'cafx\e9' - - def test_bad_handler_string_result(self): - import _codecs - def f(exc): - return ('foo', exc.end) - _codecs.register_error("test.test_codecs_not_a_string", f) - raises(TypeError, u'\u1234'.encode, 'ascii', - 'test.test_codecs_not_a_string') diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py --- a/pypy/rlib/runicode.py +++ b/pypy/rlib/runicode.py @@ -272,11 +272,11 @@ # Encode UCS2 Unicode ordinals if ch < 0x10000: # Special case: check for high surrogate - if 0xD800 <= ch <= 0xDBFF and pos != size: + if 0xD800 <= ch <= 0xDFFF and pos != size: ch2 = ord(s[pos]) # Check for low surrogate and combine the two to # form a UCS4 value - if 0xDC00 <= ch2 <= 0xDFFF: + if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF: ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000 pos += 1 _encodeUCS4(result, ch3) _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit