Author: Armin Rigo <ar...@tunes.org> Branch: Changeset: r98571:2ed84f7866b6 Date: 2020-01-23 11:37 +0100 http://bitbucket.org/pypy/pypy/changeset/2ed84f7866b6/
Log: Fix a corner case in multibytecodec: for stateful codecs, when encoding fails and we use replacement, the replacement string must be written in the output preserving the state. diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -194,17 +194,23 @@ rffi.SSIZE_T) pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec', [ENCODEBUF_P], MULTIBYTECODEC_P) +pypy_cjk_enc_copystate = llexternal('pypy_cjk_enc_copystate', + [ENCODEBUF_P, ENCODEBUF_P], lltype.Void) MBENC_FLUSH = 1 MBENC_RESET = 2 def encode(codec, unicodedata, length, errors="strict", errorcb=None, - namecb=None): + namecb=None, copystate=lltype.nullptr(ENCODEBUF_P.TO)): encodebuf = pypy_cjk_enc_new(codec) if not encodebuf: raise MemoryError + if copystate: + pypy_cjk_enc_copystate(encodebuf, copystate) try: return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb) finally: + if copystate: + pypy_cjk_enc_copystate(copystate, encodebuf) pypy_cjk_enc_free(encodebuf) def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None, @@ -258,18 +264,18 @@ elif errors == "ignore": replace = "" elif errors == "replace": - codec = pypy_cjk_enc_getcodec(encodebuf) - try: - replace = encode(codec, "?", 1) - except EncodeDecodeError: - replace = "?" + replace = "?" # utf-8 unicode else: assert errorcb - rets, end = errorcb(errors, namecb, reason, + replace, end = errorcb(errors, namecb, reason, unicodedata, start, end) + if len(replace) > 0: codec = pypy_cjk_enc_getcodec(encodebuf) - lgt = rutf8.codepoints_in_utf8(rets) - replace = encode(codec, rets, lgt, "strict", errorcb, namecb) + lgt = rutf8.codepoints_in_utf8(replace) + replace = encode(codec, replace, lgt, copystate=encodebuf) + #else: + # replace is an empty utf-8 unicode, which we directly consider to + # encode as an empty byte string. 
with rffi.scoped_nonmovingbuffer(replace) as inbuf: r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end) if r == MBERR_NOMEMORY: diff --git a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c --- a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c +++ b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c @@ -135,6 +135,11 @@ return d; } +void pypy_cjk_enc_copystate(struct pypy_cjk_enc_s *dst, struct pypy_cjk_enc_s *src) +{ + dst->state = src->state; +} + Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d, Py_UNICODE *inbuf, Py_ssize_t inlen) { diff --git a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h --- a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h +++ b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h @@ -146,6 +146,8 @@ char *, pypymbc_ssize_t, pypymbc_ssize_t); RPY_EXTERN const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *); +RPY_EXTERN +void pypy_cjk_enc_copystate(struct pypy_cjk_enc_s *dst, struct pypy_cjk_enc_s *src); /* list of codecs defined in the .c files */ diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py --- a/pypy/module/_multibytecodec/test/test_app_codecs.py +++ b/pypy/module/_multibytecodec/test/test_app_codecs.py @@ -110,3 +110,33 @@ lambda e: ('\xc3', e.end)) raises(TypeError, u"\uDDA1".encode, "gbk", "test.test_encode_custom_error_handler_type") + + def test_encode_replacement_with_state(self): + import codecs + s = u'\u4ee4\u477c\u4ee4'.encode("iso-2022-jp", errors="replace") + assert s == '\x1b$BNa\x1b(B?\x1b$BNa\x1b(B' + + def test_streaming_codec(self): + test_0 = u'\uc5fc\u76d0\u5869\u9e7d\u477c\u4e3d/\u3012' + test_1 = u'\u4ee4\u477c\u3080\u304b\u3057\u3080\u304b\u3057\u3042\u308b\u3068\u3053\u308d\u306b' + test_2 = u' foo = "Quoted string 
****\u4ee4\u477c" ' + + ereplace = {'errors': 'replace'} + exml = {'errors': 'xmlcharrefreplace'} + for codec in ("iso-2022-jp", "iso-2022-jp-ext", "iso-2022-jp-1", + "iso-2022-jp-2", "iso-2022-jp-3", "iso-2022-jp-2004", + "iso-2022-kr", + ): + + out_1 = test_1.encode(codec, **ereplace).decode(codec, **ereplace) + assert out_1.endswith(u'\u3080\u304b\u3057\u3080\u304b\u3057\u3042\u308b\u3068\u3053\u308d\u306b') + + out_0a = test_0.encode(codec, **ereplace).decode(codec, **ereplace) + for n, char in enumerate(out_0a): + assert char in (test_0[n], "?") + + out_0b = test_0.encode(codec, **exml).decode(codec, **ereplace) + assert "&#18300;" in out_0b + + out_2 = test_2.encode(codec, **ereplace).decode(codec, **ereplace) + assert out_2.count('"') == 2 _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit