Author: Armin Rigo <ar...@tunes.org> Branch: Changeset: r44722:32f1f17883f4 Date: 2011-06-05 17:22 +0200 http://bitbucket.org/pypy/pypy/changeset/32f1f17883f4/
Log: Custom decode error handlers. diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -3,6 +3,8 @@ from pypy.translator.tool.cbuild import ExternalCompilationInfo from pypy.tool.autopath import pypydir +UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD' + class EncodeDecodeError(Exception): def __init__(self, start, end, reason): @@ -103,11 +105,12 @@ [DECODEBUF_P], rffi.SSIZE_T) pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed', [DECODEBUF_P], rffi.SSIZE_T) -pypy_cjk_dec_inbuf_add = llexternal('pypy_cjk_dec_inbuf_add', - [DECODEBUF_P, rffi.SSIZE_T, rffi.INT], - rffi.INT) +pypy_cjk_dec_replace_on_error = llexternal('pypy_cjk_dec_replace_on_error', + [DECODEBUF_P, rffi.CWCHARP, + rffi.SSIZE_T, rffi.SSIZE_T], + rffi.SSIZE_T) -def decode(codec, stringdata, errors="strict"): +def decode(codec, stringdata, errors="strict", errorcb=None, namecb=None): inleft = len(stringdata) inbuf = rffi.get_nonmovingbuffer(stringdata) try: @@ -119,7 +122,8 @@ r = pypy_cjk_dec_chunk(decodebuf) if r == 0: break - multibytecodec_decerror(decodebuf, r, errors) + multibytecodec_decerror(decodebuf, r, errors, + errorcb, namecb, stringdata) src = pypy_cjk_dec_outbuf(decodebuf) length = pypy_cjk_dec_outlen(decodebuf) return rffi.wcharpsize2unicode(src, length) @@ -130,7 +134,8 @@ finally: rffi.free_nonmovingbuffer(stringdata, inbuf) -def multibytecodec_decerror(decodebuf, e, errors): +def multibytecodec_decerror(decodebuf, e, errors, + errorcb, namecb, stringdata): if e > 0: reason = "illegal multibyte sequence" esize = e @@ -142,19 +147,27 @@ else: raise RuntimeError # - if errors == "ignore": - pypy_cjk_dec_inbuf_add(decodebuf, esize, rffi.cast(rffi.INT, 0)) - return # continue decoding - if errors == "replace": - e = pypy_cjk_dec_inbuf_add(decodebuf, esize, rffi.cast(rffi.INT, 1)) - if rffi.cast(lltype.Signed, e) == MBERR_NOMEMORY: - raise MemoryError - return # continue decoding + # compute the unicode to use as a replacement -> 'replace', and + # the current position in the input 'unicodedata' -> 'end' start = pypy_cjk_dec_inbuf_consumed(decodebuf) end = start + esize - if errors != "strict": - reason = "not implemented: custom error handlers" # XXX implement me - raise EncodeDecodeError(start, end, reason) + if errors == "strict": + raise EncodeDecodeError(start, end, reason) + elif errors == "ignore": + replace = u"" + elif errors == "replace": + replace = UNICODE_REPLACEMENT_CHARACTER + else: + assert errorcb != None + replace, end = errorcb(errors, namecb, reason, + stringdata, start, end) + inbuf = rffi.get_nonmoving_unicodebuffer(replace) + try: + r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end) + finally: + rffi.free_nonmoving_unicodebuffer(replace, inbuf) + if r == MBERR_NOMEMORY: + raise MemoryError # ____________________________________________________________ # Encoding diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -16,9 +16,11 @@ def decode(self, space, input, errors=None): if errors is None: errors = 'strict' + state = space.fromcache(CodecState) # try: - output = c_codecs.decode(self.codec, input, errors) + output = c_codecs.decode(self.codec, input, errors, + state.decode_error_handler, self.name) except c_codecs.EncodeDecodeError, e: raise OperationError( space.w_UnicodeDecodeError, diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py --- a/pypy/module/_multibytecodec/test/test_app_codecs.py +++ b/pypy/module/_multibytecodec/test/test_app_codecs.py @@ -52,6 +52,13 @@ r = codec.decode("def~{}abc", 'replace') assert r == (u'def\ufffd\u5fcf', 9) + def test_decode_custom_error_handler(self): + import codecs + codecs.register_error("test.decode_custom_error_handler", + lambda e: (u'\u1234\u5678', e.end)) + u = "abc\xDD".decode("hz", "test.decode_custom_error_handler") + assert u == u'abc\u1234\u5678' + def test_encode_hz(self): import _codecs_cn codec = _codecs_cn.getcodec("hz") diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c @@ -1,8 +1,7 @@ #include <stdlib.h> +#include <string.h> #include "src/cjkcodecs/multibytecodec.h" -#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) - struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec, char *inbuf, Py_ssize_t inlen) @@ -95,17 +94,19 @@ return d->inbuf - d->inbuf_start; } -int pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s* d, Py_ssize_t skip, - int add_replacement_character) +Py_ssize_t pypy_cjk_dec_replace_on_error(struct pypy_cjk_dec_s* d, + Py_UNICODE *newbuf, Py_ssize_t newlen, + Py_ssize_t in_offset) { - if (add_replacement_character) + if (newlen > 0) { - if (d->outbuf >= d->outbuf_end) - if (expand_decodebuffer(d, 1) == -1) + if (d->outbuf + newlen > d->outbuf_end) + if (expand_decodebuffer(d, newlen) == -1) return MBERR_NOMEMORY; - *d->outbuf++ = Py_UNICODE_REPLACEMENT_CHARACTER; + memcpy(d->outbuf, newbuf, newlen * sizeof(Py_UNICODE)); + d->outbuf += newlen; } - d->inbuf += skip; + d->inbuf = d->inbuf_start + in_offset; return 0; } diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h @@ -102,7 +102,8 @@ Py_ssize_t pypy_cjk_dec_outlen(struct pypy_cjk_dec_s *); Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d); Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d); -int pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s*, Py_ssize_t, int); +Py_ssize_t pypy_cjk_dec_replace_on_error(struct pypy_cjk_dec_s* d, + Py_UNICODE *, Py_ssize_t, Py_ssize_t); struct pypy_cjk_enc_s { const MultibyteCodec *codec; _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit