Author: Armin Rigo <ar...@tunes.org>
Branch:
Changeset: r46148:67f047ef8c7a
Date: 2011-08-01 13:16 +0200
http://bitbucket.org/pypy/pypy/changeset/67f047ef8c7a/
Log: Incremental encoder, first try. diff --git a/pypy/module/_multibytecodec/__init__.py b/pypy/module/_multibytecodec/__init__.py --- a/pypy/module/_multibytecodec/__init__.py +++ b/pypy/module/_multibytecodec/__init__.py @@ -10,11 +10,11 @@ 'MultibyteIncrementalDecoder': 'interp_incremental.MultibyteIncrementalDecoder', + 'MultibyteIncrementalEncoder': + 'interp_incremental.MultibyteIncrementalEncoder', } appleveldefs = { - 'MultibyteIncrementalEncoder': - 'app_multibytecodec.MultibyteIncrementalEncoder', 'MultibyteStreamReader': 'app_multibytecodec.MultibyteStreamReader', 'MultibyteStreamWriter': diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -183,9 +183,11 @@ # ____________________________________________________________ # Encoding ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci) +pypy_cjk_enc_new = llexternal('pypy_cjk_enc_new', + [MULTIBYTECODEC_P], ENCODEBUF_P) pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init', - [MULTIBYTECODEC_P, rffi.CWCHARP, rffi.SSIZE_T], - ENCODEBUF_P) + [ENCODEBUF_P, rffi.CWCHARP, rffi.SSIZE_T], + rffi.SSIZE_T) pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P], lltype.Void) pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk', [ENCODEBUF_P], @@ -204,39 +206,46 @@ [ENCODEBUF_P, rffi.CCHARP, rffi.SSIZE_T, rffi.SSIZE_T], rffi.SSIZE_T) +pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec', + [ENCODEBUF_P], MULTIBYTECODEC_P) def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None): + encodebuf = pypy_cjk_enc_new(codec) + if not encodebuf: + raise MemoryError + try: + return encodeex(encodebuf, unicodedata, errors, errorcb, namecb) + finally: + pypy_cjk_enc_free(encodebuf) + +def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None, + namecb=None, ignore_error=0): inleft = len(unicodedata) inbuf = 
rffi.get_nonmoving_unicodebuffer(unicodedata) try: - encodebuf = pypy_cjk_enc_init(codec, inbuf, inleft) - if not encodebuf: + if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0: raise MemoryError - try: - while True: - r = pypy_cjk_enc_chunk(encodebuf) - if r == 0: - break - multibytecodec_encerror(encodebuf, r, errors, - codec, errorcb, namecb, unicodedata) - while True: - r = pypy_cjk_enc_reset(encodebuf) - if r == 0: - break - multibytecodec_encerror(encodebuf, r, errors, - codec, errorcb, namecb, unicodedata) - src = pypy_cjk_enc_outbuf(encodebuf) - length = pypy_cjk_enc_outlen(encodebuf) - return rffi.charpsize2str(src, length) - # - finally: - pypy_cjk_enc_free(encodebuf) + while True: + r = pypy_cjk_enc_chunk(encodebuf) + if r == 0 or r == ignore_error: + break + multibytecodec_encerror(encodebuf, r, errors, + errorcb, namecb, unicodedata) + while True: + r = pypy_cjk_enc_reset(encodebuf) + if r == 0: + break + multibytecodec_encerror(encodebuf, r, errors, + errorcb, namecb, unicodedata) + src = pypy_cjk_enc_outbuf(encodebuf) + length = pypy_cjk_enc_outlen(encodebuf) + return rffi.charpsize2str(src, length) # finally: rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf) def multibytecodec_encerror(encodebuf, e, errors, - codec, errorcb, namecb, unicodedata): + errorcb, namecb, unicodedata): if e > 0: reason = "illegal multibyte sequence" esize = e @@ -257,6 +266,7 @@ elif errors == "ignore": replace = "" elif errors == "replace": + codec = pypy_cjk_enc_getcodec(encodebuf) try: replace = encode(codec, u"?") except EncodeDecodeError: diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py --- a/pypy/module/_multibytecodec/interp_incremental.py +++ b/pypy/module/_multibytecodec/interp_incremental.py @@ -1,14 +1,15 @@ from pypy.rpython.lltypesystem import lltype from pypy.module._multibytecodec import c_codecs from pypy.module._multibytecodec.interp_multibytecodec import ( - MultibyteCodec, 
wrap_unicodedecodeerror, wrap_runtimeerror) + MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror, + wrap_unicodeencodeerror) from pypy.interpreter.baseobjspace import Wrappable from pypy.interpreter.gateway import interp2app, unwrap_spec from pypy.interpreter.typedef import TypeDef, GetSetProperty from pypy.module._codecs.interp_codecs import CodecState -class MultibyteIncrementalDecoder(Wrappable): +class MultibyteIncrementalBase(Wrappable): def __init__(self, space, errors): if errors is None: @@ -21,6 +22,22 @@ self.name = codec.name self._initialize() + def __del__(self): + self._free() + + def reset_w(self): + self._free() + self._initialize() + + def fget_errors(self, space): + return space.wrap(self.errors) + + def fset_errors(self, space, w_errors): + self.errors = space.str_w(w_errors) + + +class MultibyteIncrementalDecoder(MultibyteIncrementalBase): + def _initialize(self): self.decodebuf = c_codecs.pypy_cjk_dec_new(self.codec) self.pending = "" @@ -31,13 +48,6 @@ c_codecs.pypy_cjk_dec_free(self.decodebuf) self.decodebuf = lltype.nullptr(c_codecs.DECODEBUF_P.TO) - def __del__(self): - self._free() - - def reset_w(self): - self._free() - self._initialize() - @unwrap_spec(object=str, final=bool) def decode_w(self, object, final=False): space = self.space @@ -57,12 +67,6 @@ self.pending = object[pos:] return space.wrap(output) - def fget_errors(self, space): - return space.wrap(self.errors) - - def fset_errors(self, space, w_errors): - self.errors = space.str_w(w_errors) - @unwrap_spec(errors="str_or_None") def mbidecoder_new(space, w_subtype, errors=None): @@ -81,6 +85,55 @@ ) +class MultibyteIncrementalEncoder(MultibyteIncrementalBase): + + def _initialize(self): + self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec) + self.pending = u"" + + def _free(self): + self.pending = None + if self.encodebuf: + c_codecs.pypy_cjk_enc_free(self.encodebuf) + self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO) + + @unwrap_spec(object=unicode, final=bool) 
+ def encode_w(self, object, final=False): + space = self.space + state = space.fromcache(CodecState) + if len(self.pending) > 0: + object = self.pending + object + try: + output = c_codecs.encodeex(self.encodebuf, object, self.errors, + state.encode_error_handler, self.name, + get_ignore_error(final)) + except c_codecs.EncodeDecodeError, e: + raise wrap_unicodeencodeerror(space, e, object, self.name) + except RuntimeError: + raise wrap_runtimeerror(space) + pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf) + assert 0 <= pos <= len(object) + self.pending = object[pos:] + return space.wrap(output) + + +@unwrap_spec(errors="str_or_None") +def mbiencoder_new(space, w_subtype, errors=None): + r = space.allocate_instance(MultibyteIncrementalEncoder, w_subtype) + r.__init__(space, errors) + return space.wrap(r) + +MultibyteIncrementalEncoder.typedef = TypeDef( + 'MultibyteIncrementalEncoder', + __module__ = '_multibytecodec', + __new__ = interp2app(mbiencoder_new), + encode = interp2app(MultibyteIncrementalEncoder.encode_w), + reset = interp2app(MultibyteIncrementalEncoder.reset_w), + errors = GetSetProperty(MultibyteIncrementalEncoder.fget_errors, + MultibyteIncrementalEncoder.fset_errors), + ) + + def get_ignore_error(final): if final: return 0 # don't ignore any error diff --git a/pypy/module/_multibytecodec/test/test_app_incremental.py b/pypy/module/_multibytecodec/test/test_app_incremental.py --- a/pypy/module/_multibytecodec/test/test_app_incremental.py +++ b/pypy/module/_multibytecodec/test/test_app_incremental.py @@ -13,6 +13,15 @@ return IncrementalHzDecoder """) + cls.w_IncrementalHzEncoder = cls.space.appexec([], """(): + import _codecs_cn + from _multibytecodec import MultibyteIncrementalEncoder + + class IncrementalHzEncoder(MultibyteIncrementalEncoder): + codec = _codecs_cn.getcodec('hz') + + return IncrementalHzEncoder + """) def test_decode_hz(self): d = self.IncrementalHzDecoder() @@ -74,3 +83,14 @@ d.errors = "replace" r = d.decode("~{abc", 
True) assert r == u'\u5f95\ufffd' + + def test_decode_hz_buffer_grow(self): + d = self.IncrementalHzDecoder() + for i in range(13): + r = d.decode("a" * (2**i)) + assert r == unicode("a" * (2**i)) + + def test_encode_hz(self): + e = self.IncrementalHzEncoder() + r = e.encode("abcd") + assert r == u'abcd' diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c @@ -119,34 +119,40 @@ /************************************************************/ -struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec, - Py_UNICODE *inbuf, Py_ssize_t inlen) +struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec) { - Py_ssize_t outlen; struct pypy_cjk_enc_s *d = malloc(sizeof(struct pypy_cjk_enc_s)); if (!d) return NULL; if (codec->encinit != NULL && codec->encinit(&d->state, codec->config) != 0) - goto errorexit; + { + free(d); + return NULL; + } + d->codec = codec; + d->outbuf_start = NULL; + return d; +} - d->codec = codec; +Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d, + Py_UNICODE *inbuf, Py_ssize_t inlen) +{ + Py_ssize_t outlen; d->inbuf_start = inbuf; d->inbuf = inbuf; d->inbuf_end = inbuf + inlen; - - if (inlen > (PY_SSIZE_T_MAX - 16) / 2) - goto errorexit; - outlen = inlen * 2 + 16; - d->outbuf_start = malloc(outlen); - if (!d->outbuf_start) - goto errorexit; + if (d->outbuf_start == NULL) + { + if (inlen > (PY_SSIZE_T_MAX - 16) / 2) + return -1; + outlen = inlen * 2 + 16; + d->outbuf_start = malloc(outlen); + if (d->outbuf_start == NULL) + return -1; + d->outbuf_end = d->outbuf_start + outlen; + } d->outbuf = d->outbuf_start; - d->outbuf_end = d->outbuf_start + outlen; - return d; - - errorexit: - free(d); - return NULL; + return 0; } void pypy_cjk_enc_free(struct pypy_cjk_enc_s *d) @@ -249,3 +255,8 @@ d->inbuf = d->inbuf_start + in_offset; return 0; } + +const MultibyteCodec 
*pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *d) +{ + return d->codec; +} diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h --- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h +++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h @@ -113,8 +113,9 @@ unsigned char *outbuf_start, *outbuf, *outbuf_end; }; -struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec, - Py_UNICODE *inbuf, Py_ssize_t inlen); +struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec); +Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d, + Py_UNICODE *inbuf, Py_ssize_t inlen); void pypy_cjk_enc_free(struct pypy_cjk_enc_s *); Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *); Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *); @@ -124,6 +125,7 @@ Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d); Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d, char *, Py_ssize_t, Py_ssize_t); +const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *); /* list of codecs defined in the .c files */ _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit