Author: Amaury Forgeot d'Arc <amaur...@gmail.com>
Branch:
Changeset: r57996:ce4e0ff9862b
Date: 2012-10-11 00:51 +0200
http://bitbucket.org/pypy/pypy/changeset/ce4e0ff9862b/
Log: Issue1285: Python2 allows lone surrogates, also in string literals which appear in marshalled code. Also use more direct code for functions that are often used. diff --git a/pypy/interpreter/generator.py b/pypy/interpreter/generator.py --- a/pypy/interpreter/generator.py +++ b/pypy/interpreter/generator.py @@ -3,7 +3,6 @@ from pypy.interpreter.gateway import NoneNotWrapped from pypy.interpreter.pyopcode import LoopBlock from pypy.rlib import jit -from pypy.rlib.objectmodel import specialize class GeneratorIterator(Wrappable): diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1,10 +1,62 @@ +from pypy.interpreter.error import OperationError +from pypy.rlib.objectmodel import specialize +from pypy.rlib import runicode from pypy.module._codecs import interp_codecs +@specialize.memo() +def decode_error_handler(space): + def raise_unicode_exception_decode(errors, encoding, msg, s, + startingpos, endingpos): + raise OperationError(space.w_UnicodeDecodeError, + space.newtuple([space.wrap(encoding), + space.wrap(s), + space.wrap(startingpos), + space.wrap(endingpos), + space.wrap(msg)])) + return raise_unicode_exception_decode + +@specialize.memo() +def encode_error_handler(space): + def raise_unicode_exception_encode(errors, encoding, msg, u, + startingpos, endingpos): + raise OperationError(space.w_UnicodeEncodeError, + space.newtuple([space.wrap(encoding), + space.wrap(u), + space.wrap(startingpos), + space.wrap(endingpos), + space.wrap(msg)])) + return raise_unicode_exception_encode + +# ____________________________________________________________ + def PyUnicode_AsEncodedString(space, w_data, w_encoding): return interp_codecs.encode(space, w_data, w_encoding) # These functions take and return unwrapped rpython strings and unicodes -PyUnicode_DecodeUnicodeEscape = interp_codecs.make_raw_decoder('unicode_escape') -PyUnicode_DecodeRawUnicodeEscape = 
interp_codecs.make_raw_decoder('raw_unicode_escape') -PyUnicode_DecodeUTF8 = interp_codecs.make_raw_decoder('utf_8') -PyUnicode_EncodeUTF8 = interp_codecs.make_raw_encoder('utf_8') +def PyUnicode_DecodeUnicodeEscape(space, string): + state = space.fromcache(interp_codecs.CodecState) + unicodedata_handler = state.get_unicodedata_handler(space) + result, consumed = runicode.str_decode_unicode_escape( + string, len(string), "strict", + final=True, errorhandler=decode_error_handler(space), + unicodedata_handler=unicodedata_handler) + return result + +def PyUnicode_DecodeRawUnicodeEscape(space, string): + result, consumed = runicode.str_decode_raw_unicode_escape( + string, len(string), "strict", + final=True, errorhandler=decode_error_handler(space)) + return result + +def PyUnicode_DecodeUTF8(space, string): + result, consumed = runicode.str_decode_utf_8( + string, len(string), "strict", + final=True, errorhandler=decode_error_handler(space), + allow_surrogates=True) + return result + +def PyUnicode_EncodeUTF8(space, uni): + return runicode.unicode_encode_utf_8( + uni, len(uni), "strict", + errorhandler=encode_error_handler(space), + allow_surrogates=True) diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -339,38 +339,6 @@ from pypy.rlib import runicode -def make_raw_encoder(name): - rname = "unicode_encode_%s" % (name.replace("_encode", ""), ) - assert hasattr(runicode, rname) - def raw_encoder(space, uni): - state = space.fromcache(CodecState) - func = getattr(runicode, rname) - errors = "strict" - return func(uni, len(uni), errors, state.encode_error_handler) - raw_encoder.func_name = rname - return raw_encoder - -def make_raw_decoder(name): - rname = "str_decode_%s" % (name.replace("_decode", ""), ) - assert hasattr(runicode, rname) - def raw_decoder(space, string): - final = True - errors = "strict" - state = space.fromcache(CodecState) - 
func = getattr(runicode, rname) - kwargs = {} - if name == 'unicode_escape': - unicodedata_handler = state.get_unicodedata_handler(space) - result, consumed = func(string, len(string), errors, - final, state.decode_error_handler, - unicodedata_handler=unicodedata_handler) - else: - result, consumed = func(string, len(string), errors, - final, state.decode_error_handler) - return result - raw_decoder.func_name = rname - return raw_decoder - def make_encoder_wrapper(name): rname = "unicode_encode_%s" % (name.replace("_encode", ""), ) assert hasattr(runicode, rname) diff --git a/pypy/module/marshal/test/test_marshal.py b/pypy/module/marshal/test/test_marshal.py --- a/pypy/module/marshal/test/test_marshal.py +++ b/pypy/module/marshal/test/test_marshal.py @@ -163,6 +163,7 @@ def test_unicode(self): import marshal, sys self.marshal_check(u'\uFFFF') + self.marshal_check(u'\ud800') self.marshal_check(unichr(sys.maxunicode)) diff --git a/pypy/objspace/std/unicodetype.py b/pypy/objspace/std/unicodetype.py --- a/pypy/objspace/std/unicodetype.py +++ b/pypy/objspace/std/unicodetype.py @@ -1,5 +1,5 @@ from pypy.interpreter.error import OperationError, operationerrfmt -from pypy.interpreter import gateway +from pypy.interpreter import gateway, unicodehelper from pypy.objspace.std.stdtypedef import StdTypeDef, SMM from pypy.objspace.std.register_all import register_all from pypy.objspace.std.basestringtype import basestring_typedef @@ -186,32 +186,6 @@ # ____________________________________________________________ -def decode_error_handler(space): - def raise_unicode_exception_decode(errors, encoding, msg, s, - startingpos, endingpos): - raise OperationError(space.w_UnicodeDecodeError, - space.newtuple([space.wrap(encoding), - space.wrap(s), - space.wrap(startingpos), - space.wrap(endingpos), - space.wrap(msg)])) - return raise_unicode_exception_decode -decode_error_handler._annspecialcase_ = 'specialize:memo' - -def encode_error_handler(space): - def 
raise_unicode_exception_encode(errors, encoding, msg, u, - startingpos, endingpos): - raise OperationError(space.w_UnicodeEncodeError, - space.newtuple([space.wrap(encoding), - space.wrap(u), - space.wrap(startingpos), - space.wrap(endingpos), - space.wrap(msg)])) - return raise_unicode_exception_encode -encode_error_handler._annspecialcase_ = 'specialize:memo' - -# ____________________________________________________________ - def getdefaultencoding(space): return space.sys.defaultencoding @@ -235,12 +209,12 @@ if errors is None or errors == 'strict': if encoding == 'ascii': u = space.unicode_w(w_object) - eh = encode_error_handler(space) + eh = unicodehelper.encode_error_handler(space) return space.wrap(unicode_encode_ascii( u, len(u), None, errorhandler=eh)) if encoding == 'utf-8': u = space.unicode_w(w_object) - eh = encode_error_handler(space) + eh = unicodehelper.encode_error_handler(space) return space.wrap(unicode_encode_utf_8( u, len(u), None, errorhandler=eh, allow_surrogates=True)) @@ -265,12 +239,12 @@ if encoding == 'ascii': # XXX error handling s = space.bufferstr_w(w_obj) - eh = decode_error_handler(space) + eh = unicodehelper.decode_error_handler(space) return space.wrap(str_decode_ascii( s, len(s), None, final=True, errorhandler=eh)[0]) if encoding == 'utf-8': s = space.bufferstr_w(w_obj) - eh = decode_error_handler(space) + eh = unicodehelper.decode_error_handler(space) return space.wrap(str_decode_utf_8( s, len(s), None, final=True, errorhandler=eh, allow_surrogates=True)[0]) _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit