Author: Armin Rigo <ar...@tunes.org>
Branch:
Changeset: r44707:4ad72b733e1f
Date: 2011-06-05 10:52 +0200
http://bitbucket.org/pypy/pypy/changeset/4ad72b733e1f/
Log: decode(errors="ignore") at the C level diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -103,8 +103,10 @@ [DECODEBUF_P], rffi.SSIZE_T) pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed', [DECODEBUF_P], rffi.SSIZE_T) +pypy_cjk_dec_inbuf_add = llexternal('pypy_cjk_dec_inbuf_add', + [DECODEBUF_P, rffi.SSIZE_T], lltype.Void) -def decode(codec, stringdata): +def decode(codec, stringdata, errors="strict"): inleft = len(stringdata) inbuf = rffi.get_nonmovingbuffer(stringdata) try: @@ -112,10 +114,11 @@ if not decodebuf: raise MemoryError try: - r = pypy_cjk_dec_chunk(decodebuf) - if r != 0: - multibytecodec_decerror(decodebuf, r) - assert False + while True: + r = pypy_cjk_dec_chunk(decodebuf) + if r == 0: + break + multibytecodec_decerror(decodebuf, r, errors) src = pypy_cjk_dec_outbuf(decodebuf) length = pypy_cjk_dec_outlen(decodebuf) return rffi.wcharpsize2unicode(src, length) @@ -126,7 +129,7 @@ finally: rffi.free_nonmovingbuffer(stringdata, inbuf) -def multibytecodec_decerror(decodebuf, e): +def multibytecodec_decerror(decodebuf, e, errors): if e > 0: reason = "illegal multibyte sequence" esize = e @@ -139,7 +142,9 @@ raise RuntimeError # # if errors == ERROR_REPLACE:... - # if errors == ERROR_IGNORE or errors == ERROR_REPLACE:... 
+ if errors == "ignore": # or errors == ERROR_REPLACE + pypy_cjk_dec_inbuf_add(decodebuf, esize) + return # continue decoding start = pypy_cjk_dec_inbuf_consumed(decodebuf) end = start + esize if 1: # errors == ERROR_STRICT: diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py --- a/pypy/module/_multibytecodec/test/test_c_codecs.py +++ b/pypy/module/_multibytecodec/test/test_c_codecs.py @@ -36,6 +36,11 @@ assert e.end == 4 assert e.reason == "illegal multibyte sequence" +def test_decode_hz_ignore(): + c = getcodec("hz") + u = decode(c, 'def~{}abc', 'ignore') + assert u == u'def\u5fcf' + def test_encode_hz(): c = getcodec("hz") s = encode(c, u'foobar') diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c @@ -93,6 +93,11 @@ return d->inbuf - d->inbuf_start; } +void pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s* d, Py_ssize_t skip) +{ + d->inbuf += skip; +} + /************************************************************/ struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec, diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h @@ -102,6 +102,7 @@ Py_ssize_t pypy_cjk_dec_outlen(struct pypy_cjk_dec_s *); Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d); Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d); +void pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s*, Py_ssize_t); struct pypy_cjk_enc_s { const MultibyteCodec *codec; _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit