Author: Philip Jenvey <pjen...@underboss.org>
Branch: py3k
Changeset: r59270:b85a52c00034
Date: 2012-12-02 14:45 -0800
http://bitbucket.org/pypy/pypy/changeset/b85a52c00034/
Log: merge default diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -9,15 +9,19 @@ self.codec_search_cache = {} self.codec_error_registry = {} self.codec_need_encodings = True - self.decode_error_handler = self.make_errorhandler(space, True) - self.encode_error_handler = self.make_errorhandler(space, False) + self.decode_error_handler = self.make_decode_errorhandler(space) + self.encode_error_handler = self.make_encode_errorhandler(space) self.unicodedata_handler = None - def make_errorhandler(self, space, decode): - def unicode_call_errorhandler(errors, encoding, reason, input, - startpos, endpos): + def _make_errorhandler(self, space, decode): + def call_errorhandler(errors, encoding, reason, input, startpos, + endpos): + """Generic wrapper for calling into error handlers. + Returns (unicode_or_none, str_or_none, newpos) as error + handlers may return unicode or on Python 3, bytes. 
+ """ w_errorhandler = lookup_error(space, errors) if decode: w_cls = space.w_UnicodeDecodeError @@ -56,16 +60,21 @@ raise operationerrfmt( space.w_IndexError, "position %d from error handler out of bounds", newpos) - if decode: - replace = space.unicode_w(w_replace) - return replace, newpos - else: - from pypy.objspace.std.unicodetype import encode_object - if space.isinstance_w(w_replace, space.w_unicode): - w_replace = encode_object(space, w_replace, encoding, None) - replace = space.bytes_w(w_replace) - return replace, newpos - return unicode_call_errorhandler + replace = space.unicode_w(w_replace) + return replace, newpos + return call_errorhandler + + def make_decode_errorhandler(self, space): + return self._make_errorhandler(space, True) + + def make_encode_errorhandler(self, space): + errorhandler = self._make_errorhandler(space, False) + def encode_call_errorhandler(errors, encoding, reason, input, startpos, + endpos): + replace, newpos = errorhandler(errors, encoding, reason, input, + startpos, endpos) + return replace, None, newpos + return encode_call_errorhandler def get_unicodedata_handler(self, space): if self.unicodedata_handler: diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -2,7 +2,7 @@ class AppTestCodecs: - spaceconfig = dict(usemodules=('unicodedata', 'struct')) + spaceconfig = dict(usemodules=('binascii', 'struct', 'unicodedata')) def test_register_noncallable(self): import _codecs diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -279,8 +279,14 @@ replace = "?" 
else: assert errorcb - replace, end = errorcb(errors, namecb, reason, - unicodedata, start, end) + retu, rets, end = errorcb(errors, namecb, reason, + unicodedata, start, end) + if rets is not None: + # py3k only + replace = rets + else: + codec = pypy_cjk_enc_getcodec(encodebuf) + replace = encode(codec, retu, "strict", errorcb, namecb) inbuf = rffi.get_nonmovingbuffer(replace) try: r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end) diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py --- a/pypy/module/_multibytecodec/test/test_app_codecs.py +++ b/pypy/module/_multibytecodec/test/test_app_codecs.py @@ -102,3 +102,11 @@ repl = "\u2014" s = "\uDDA1".encode("gbk", "test.multi_bad_handler") assert s == b'\xA1\xAA' + + def test_encode_custom_error_handler_type(self): + import codecs + import sys + codecs.register_error("test.test_encode_custom_error_handler_type", + lambda e: ('\xc3', e.end)) + raises(TypeError, u"\uDDA1".encode, "gbk", + "test.test_encode_custom_error_handler_type") diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py --- a/pypy/module/_multibytecodec/test/test_c_codecs.py +++ b/pypy/module/_multibytecodec/test/test_c_codecs.py @@ -122,3 +122,10 @@ c = getcodec('iso2022_jp') s = encode(c, u'\u83ca\u5730\u6642\u592b') assert s == '\x1b$B5FCO;~IW\x1b(B' and type(s) is str + +def test_encode_custom_error_handler_bytes(): + c = getcodec("hz") + def errorhandler(errors, enc, msg, t, startingpos, endingpos): + return None, '\xc3', endingpos + s = encode(c, u'abc\u1234def', 'foo', errorhandler) + assert '\xc3' in s diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py --- a/pypy/rlib/runicode.py +++ b/pypy/rlib/runicode.py @@ -57,9 +57,9 @@ def default_unicode_error_encode(errors, encoding, msg, u, startingpos, endingpos): if errors == 'replace': - return u'?', endingpos + return u'?', None, endingpos if 
errors == 'ignore': - return u'', endingpos + return u'', None, endingpos raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg) # ____________________________________________________________ @@ -300,10 +300,20 @@ _encodeUCS4(result, ch3) continue if not allow_surrogates: - r, pos = errorhandler(errors, 'utf-8', - 'surrogates not allowed', - s, pos-1, pos) - result.append(r) + ru, rs, pos = errorhandler(errors, 'utf-8', + 'surrogates not allowed', + s, pos-1, pos) + if rs is not None: + # py3k only + result.append(rs) + continue + for ch in ru: + if ord(ch) < 0x80: + result.append(chr(ord(ch))) + else: + errorhandler('strict', 'utf-8', + 'surrogates not allowed', + s, pos-1, pos) continue # else: Fall through and handles isolated high surrogates result.append((chr((0xe0 | (ch >> 12))))) @@ -970,9 +980,13 @@ collend = pos+1 while collend < len(p) and ord(p[collend]) >= limit: collend += 1 - r, pos = errorhandler(errors, encoding, reason, p, - collstart, collend) - for ch in r: + ru, rs, pos = errorhandler(errors, encoding, reason, p, + collstart, collend) + if rs is not None: + # py3k only + result.append(rs) + continue + for ch in ru: if ord(ch) < limit: result.append(chr(ord(ch))) else: @@ -1042,10 +1056,14 @@ c = mapping.get(ch, '') if len(c) == 0: - res, pos = errorhandler(errors, "charmap", - "character maps to <undefined>", - s, pos, pos + 1) - for ch2 in res: + ru, rs, pos = errorhandler(errors, "charmap", + "character maps to <undefined>", + s, pos, pos + 1) + if rs is not None: + # py3k only + result.append(rs) + continue + for ch2 in ru: c2 = mapping.get(ch2, '') if len(c2) == 0: errorhandler( @@ -1644,9 +1662,12 @@ pass collend += 1 msg = "invalid decimal Unicode string" - r, pos = errorhandler(errors, 'decimal', - msg, s, collstart, collend) - for char in r: + ru, rs, pos = errorhandler(errors, 'decimal', + msg, s, collstart, collend) + if rs is not None: + # py3k only + errorhandler('strict', 'decimal', msg, s, collstart, collend) + for char 
in ru: ch = ord(char) if unicodedb.isspace(ch): result.append(' ') diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py --- a/pypy/rlib/test/test_runicode.py +++ b/pypy/rlib/test/test_runicode.py @@ -66,12 +66,19 @@ assert t is s assert start == startingpos assert stop == endingpos - return "42424242", stop + return u"42424242", None, stop encoder = self.getencoder(encoding) result = encoder(s, len(s), "foo!", errorhandler) assert called[0] assert "42424242" in result + # ensure bytes results passthru + def errorhandler_bytes(errors, enc, msg, t, startingpos, + endingpos): + return None, '\xc3', endingpos + result = encoder(s, len(s), "foo!", errorhandler_bytes) + assert '\xc3' in result + def checkdecodeerror(self, s, encoding, start, stop, addstuff=True, msg=None): called = [0] _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit