Author: Armin Rigo <ar...@tunes.org>
Branch:
Changeset: r44721:be2600cf63a3
Date: 2011-06-05 17:10 +0200
http://bitbucket.org/pypy/pypy/changeset/be2600cf63a3/
Log: Custom encode error handlers. diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -176,11 +176,12 @@ [ENCODEBUF_P], rffi.SSIZE_T) pypy_cjk_enc_inbuf_consumed = llexternal('pypy_cjk_enc_inbuf_consumed', [ENCODEBUF_P], rffi.SSIZE_T) -pypy_cjk_enc_inbuf_add = llexternal('pypy_cjk_enc_inbuf_add', - [ENCODEBUF_P, rffi.SSIZE_T, rffi.INT], - rffi.INT) +pypy_cjk_enc_replace_on_error = llexternal('pypy_cjk_enc_replace_on_error', + [ENCODEBUF_P, rffi.CCHARP, + rffi.SSIZE_T, rffi.SSIZE_T], + rffi.SSIZE_T) -def encode(codec, unicodedata, errors="strict"): +def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None): inleft = len(unicodedata) inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata) try: @@ -192,12 +193,14 @@ r = pypy_cjk_enc_chunk(encodebuf) if r == 0: break - multibytecodec_encerror(encodebuf, r, errors) + multibytecodec_encerror(encodebuf, r, errors, + codec, errorcb, namecb, unicodedata) while True: r = pypy_cjk_enc_reset(encodebuf) if r == 0: break - multibytecodec_encerror(encodebuf, r, errors) + multibytecodec_encerror(encodebuf, r, errors, + codec, errorcb, namecb, unicodedata) src = pypy_cjk_enc_outbuf(encodebuf) length = pypy_cjk_enc_outlen(encodebuf) return rffi.charpsize2str(src, length) @@ -208,7 +211,8 @@ finally: rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf) -def multibytecodec_encerror(encodebuf, e, errors): +def multibytecodec_encerror(encodebuf, e, errors, + codec, errorcb, namecb, unicodedata): if e > 0: reason = "illegal multibyte sequence" esize = e @@ -220,16 +224,27 @@ else: raise RuntimeError # - if errors == 'ignore': - pypy_cjk_enc_inbuf_add(encodebuf, esize, rffi.cast(rffi.INT, 0)) - return # continue encoding - if errors == "replace": - e = pypy_cjk_enc_inbuf_add(encodebuf, esize, rffi.cast(rffi.INT, 1)) - if rffi.cast(lltype.Signed, e) == MBERR_NOMEMORY: - raise MemoryError 
- return # continue decoding + # compute the string to use as a replacement -> 'replace', and + # the current position in the input 'unicodedata' -> 'end' start = pypy_cjk_enc_inbuf_consumed(encodebuf) end = start + esize - if errors != "strict": - reason = "not implemented: custom error handlers" # XXX implement me - raise EncodeDecodeError(start, end, reason) + if errors == "strict": + raise EncodeDecodeError(start, end, reason) + elif errors == "ignore": + replace = "" + elif errors == "replace": + try: + replace = encode(codec, u"?") + except EncodeDecodeError: + replace = "?" + else: + assert errorcb != None + replace, end = errorcb(errors, namecb, reason, + unicodedata, start, end) + inbuf = rffi.get_nonmovingbuffer(replace) + try: + r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end) + finally: + rffi.free_nonmovingbuffer(replace, inbuf) + if r == MBERR_NOMEMORY: + raise MemoryError diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -3,6 +3,7 @@ from pypy.interpreter.typedef import TypeDef from pypy.interpreter.error import OperationError from pypy.module._multibytecodec import c_codecs +from pypy.module._codecs.interp_codecs import CodecState class MultibyteCodec(Wrappable): @@ -37,9 +38,11 @@ def encode(self, space, input, errors=None): if errors is None: errors = 'strict' + state = space.fromcache(CodecState) # try: - output = c_codecs.encode(self.codec, input, errors) + output = c_codecs.encode(self.codec, input, errors, + state.encode_error_handler, self.name) except c_codecs.EncodeDecodeError, e: raise OperationError( space.w_UnicodeEncodeError, diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py --- a/pypy/module/_multibytecodec/test/test_app_codecs.py +++ 
b/pypy/module/_multibytecodec/test/test_app_codecs.py @@ -84,3 +84,10 @@ r = codec.encode(u'abc\u1234def', 'replace') assert r == ('abc?def', 7) assert type(r[0]) is str + + def test_encode_custom_error_handler(self): + import codecs + codecs.register_error("test.multi_bad_handler", lambda e: (repl, 1)) + repl = u"\u2014" + s = u"\uDDA1".encode("gbk", "test.multi_bad_handler") + assert s == '\xA1\xAA' diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py --- a/pypy/module/_multibytecodec/test/test_c_codecs.py +++ b/pypy/module/_multibytecodec/test/test_c_codecs.py @@ -46,14 +46,6 @@ u = decode(c, 'def~{}abc', 'replace') assert u == u'def\ufffd\u5fcf' -def test_decode_hz_foobar(): - # not implemented yet: custom error handlers - c = getcodec("hz") - e = py.test.raises(EncodeDecodeError, decode, c, "~{xyz}", "foobar").value - assert e.start == 2 - assert e.end == 4 - assert e.reason == "not implemented: custom error handlers" - def test_encode_hz(): c = getcodec("hz") s = encode(c, u'foobar') @@ -79,15 +71,6 @@ s = encode(c, u'abc\u1234def', 'replace') assert s == 'abc?def' -def test_encode_hz_foobar(): - # not implemented yet: custom error handlers - c = getcodec("hz") - e = py.test.raises(EncodeDecodeError, encode, - c, u'abc\u1234def', 'foobar').value - assert e.start == 3 - assert e.end == 4 - assert e.reason == "not implemented: custom error handlers" - def test_encode_jisx0208(): c = getcodec('iso2022_jp') s = encode(c, u'\u83ca\u5730\u6642\u592b') diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c @@ -226,33 +226,18 @@ return d->inbuf - d->inbuf_start; } -int pypy_cjk_enc_inbuf_add(struct pypy_cjk_enc_s* d, Py_ssize_t skip, - int add_replacement_character) +Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d, + char 
*newbuf, Py_ssize_t newlen, + Py_ssize_t in_offset) { - if (add_replacement_character) + if (newlen > 0) { - const Py_UNICODE replchar = '?', *inbuf = &replchar; - Py_ssize_t r; - - while (1) - { - Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf); - r = d->codec->encode(&d->state, d->codec->config, - &inbuf, 1, &d->outbuf, outleft, 0); - if (r != MBERR_TOOSMALL) - break; - /* output buffer too small; grow it and continue. */ - if (expand_encodebuffer(d, -1) == -1) - return MBERR_NOMEMORY; - } - if (r != 0) - { - if (d->outbuf >= d->outbuf_end) - if (expand_encodebuffer(d, 1) == -1) - return MBERR_NOMEMORY; - *d->outbuf++ = '?'; - } + if (d->outbuf + newlen > d->outbuf_end) + if (expand_encodebuffer(d, newlen) == -1) + return MBERR_NOMEMORY; + memcpy(d->outbuf, newbuf, newlen); + d->outbuf += newlen; } - d->inbuf += skip; + d->inbuf = d->inbuf_start + in_offset; return 0; } diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h @@ -120,7 +120,8 @@ Py_ssize_t pypy_cjk_enc_outlen(struct pypy_cjk_enc_s *); Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d); Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d); -int pypy_cjk_enc_inbuf_add(struct pypy_cjk_enc_s*, Py_ssize_t, int); +Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d, + char *, Py_ssize_t, Py_ssize_t); /* list of codecs defined in the .c files */ _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit