Author: Armin Rigo <[email protected]>
Branch:
Changeset: r44058:6cb94685f116
Date: 2011-05-10 22:40 +0200
http://bitbucket.org/pypy/pypy/changeset/6cb94685f116/
Log: Encoding.
diff --git a/pypy/module/_multibytecodec/c_codecs.py
b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -1,7 +1,7 @@
import py, sys
from pypy.rpython.lltypesystem import lltype, llmemory, rffi
-from pypy.rpython.lltypesystem.rstr import UNICODE
-from pypy.rpython.annlowlevel import hlunicode
+from pypy.rpython.lltypesystem.rstr import STR, UNICODE
+from pypy.rpython.annlowlevel import hlstr, hlunicode
from pypy.rlib.objectmodel import keepalive_until_here
from pypy.translator.tool.cbuild import ExternalCompilationInfo
from pypy.tool.autopath import pypydir
@@ -78,6 +78,7 @@
return getter()
# ____________________________________________________________
+# Decoding
DECODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_dec_s', compilation_info=eci)
pypy_cjk_dec_init = llexternal('pypy_cjk_dec_init',
@@ -98,19 +99,16 @@
def decode(codec, stringdata):
inleft = len(stringdata)
- if inleft > sys.maxint // 4:
- raise MemoryError
inbuf = rffi.get_nonmovingbuffer(stringdata)
try:
decodebuf = pypy_cjk_dec_init(codec, inbuf, inleft)
if not decodebuf:
raise MemoryError
try:
- while True:
- r = pypy_cjk_dec_chunk(decodebuf)
- if r == 0:
- break
+ r = pypy_cjk_dec_chunk(decodebuf)
+ if r != 0:
multibytecodec_decerror(decodebuf, r)
+ assert False
src = pypy_cjk_dec_outbuf(decodebuf)
length = pypy_cjk_dec_outlen(decodebuf)
return unicode_from_raw(src, length)
@@ -140,8 +138,6 @@
if 1: # errors == ERROR_STRICT:
raise EncodeDecodeError(start, end, reason)
-# ____________________________________________________________
-
def unicode_from_raw(src, length):
result = lltype.malloc(UNICODE, length)
try:
@@ -154,3 +150,83 @@
return hlunicode(result)
finally:
keepalive_until_here(result)
+
+# ____________________________________________________________
+# Encoding
+
+ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci)
+pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init',
+ [MULTIBYTECODEC_P, rffi.CWCHARP, rffi.SSIZE_T],
+ ENCODEBUF_P)
+pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P],
+ lltype.Void)
+pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk', [ENCODEBUF_P],
+ rffi.SSIZE_T)
+pypy_cjk_enc_reset = llexternal('pypy_cjk_enc_reset', [ENCODEBUF_P],
+ rffi.SSIZE_T)
+pypy_cjk_enc_outbuf = llexternal('pypy_cjk_enc_outbuf', [ENCODEBUF_P],
+ rffi.CCHARP)
+pypy_cjk_enc_outlen = llexternal('pypy_cjk_enc_outlen', [ENCODEBUF_P],
+ rffi.SSIZE_T)
+pypy_cjk_enc_inbuf_remaining = llexternal('pypy_cjk_enc_inbuf_remaining',
+ [ENCODEBUF_P], rffi.SSIZE_T)
+pypy_cjk_enc_inbuf_consumed = llexternal('pypy_cjk_enc_inbuf_consumed',
+ [ENCODEBUF_P], rffi.SSIZE_T)
+
+def encode(codec, unicodedata):
+ inleft = len(unicodedata)
+ inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
+ try:
+ encodebuf = pypy_cjk_enc_init(codec, inbuf, inleft)
+ if not encodebuf:
+ raise MemoryError
+ try:
+ r = pypy_cjk_enc_chunk(encodebuf)
+ if r != 0:
+ multibytecodec_encerror(encodebuf, r)
+ assert False
+ r = pypy_cjk_enc_reset(encodebuf)
+ if r != 0:
+ multibytecodec_encerror(encodebuf, r)
+ assert False
+ src = pypy_cjk_enc_outbuf(encodebuf)
+ length = pypy_cjk_enc_outlen(encodebuf)
+ return string_from_raw(src, length)
+ #
+ finally:
+ pypy_cjk_enc_free(encodebuf)
+ #
+ finally:
+ rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
+
+def multibytecodec_encerror(encodebuf, e):
+ if e > 0:
+ reason = "illegal multibyte sequence"
+ esize = e
+ elif e == MBERR_TOOFEW:
+ reason = "incomplete multibyte sequence"
+ esize = pypy_cjk_enc_inbuf_remaining(encodebuf)
+ elif e == MBERR_NOMEMORY:
+ raise MemoryError
+ else:
+ raise RuntimeError
+ #
+ # if errors == ERROR_REPLACE:...
+ # if errors == ERROR_IGNORE or errors == ERROR_REPLACE:...
+ start = pypy_cjk_enc_inbuf_consumed(encodebuf)
+ end = start + esize
+ if 1: # errors == ERROR_STRICT:
+ raise EncodeDecodeError(start, end, reason)
+
+def string_from_raw(src, length):
+ result = lltype.malloc(STR, length)
+ try:
+ str_chars_offset = (rffi.offsetof(STR, 'chars') + \
+ rffi.itemoffsetof(STR.chars, 0))
+ dest = rffi.cast_ptr_to_adr(result) + str_chars_offset
+ src = rffi.cast_ptr_to_adr(src) + rffi.itemoffsetof(rffi.CCHARP.TO)
+ rffi.raw_memcopy(src, dest,
+ llmemory.sizeof(lltype.Char) * length)
+ return hlstr(result)
+ finally:
+ keepalive_until_here(result)
diff --git a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
--- a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
+++ b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
@@ -15,7 +15,9 @@
d->inbuf_start = inbuf;
d->inbuf = inbuf;
d->inbuf_end = inbuf + inlen;
- d->outbuf_start = malloc(inlen * sizeof(Py_UNICODE));
+ d->outbuf_start = (inlen <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) ?
+ malloc(inlen * sizeof(Py_UNICODE)) :
+ NULL);
if (!d->outbuf_start)
goto errorexit;
d->outbuf = d->outbuf_start;
@@ -40,13 +42,15 @@
orgpos = d->outbuf - d->outbuf_start;
orgsize = d->outbuf_end - d->outbuf_start;
- esize = orgsize + (esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize);
- newbuf = realloc(d->outbuf_start, esize * sizeof(Py_UNICODE));
+ esize = (esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize);
+ newbuf = (esize <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE) - orgsize) ?
+ realloc(d->outbuf_start, (orgsize + esize) * sizeof(Py_UNICODE)) :
+ NULL);
if (!newbuf)
return -1;
d->outbuf_start = newbuf;
d->outbuf = newbuf + orgpos;
- d->outbuf_end = newbuf + esize;
+ d->outbuf_end = newbuf + orgsize + esize;
return 0;
}
@@ -88,3 +92,120 @@
{
return d->inbuf - d->inbuf_start;
}
+
+/************************************************************/
+
+/* Allocate and set up the state for encoding the 'inlen' characters at
+   'inbuf' with 'codec'.  The input buffer is only referenced, not copied.
+   Returns NULL if an allocation fails, if the codec's encinit hook reports
+   an error, or if the initial output size would overflow Py_ssize_t. */
+struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
+                                         Py_UNICODE *inbuf, Py_ssize_t inlen)
+{
+    struct pypy_cjk_enc_s *enc;
+    Py_ssize_t capacity;
+
+    enc = malloc(sizeof(struct pypy_cjk_enc_s));
+    if (enc == NULL)
+        return NULL;
+
+    if (codec->encinit != NULL &&
+        codec->encinit(&enc->state, codec->config) != 0) {
+        free(enc);
+        return NULL;
+    }
+
+    /* initial output size: two bytes per input character plus 16 spare
+       bytes; refuse inputs for which that computation would overflow */
+    if (inlen > (PY_SSIZE_T_MAX - 16) / 2) {
+        free(enc);
+        return NULL;
+    }
+    capacity = inlen * 2 + 16;
+    enc->outbuf_start = malloc(capacity);
+    if (enc->outbuf_start == NULL) {
+        free(enc);
+        return NULL;
+    }
+
+    enc->codec = codec;
+    enc->inbuf_start = inbuf;
+    enc->inbuf = inbuf;
+    enc->inbuf_end = inbuf + inlen;
+    enc->outbuf = enc->outbuf_start;
+    enc->outbuf_end = enc->outbuf_start + capacity;
+    return enc;
+}
+
+/* Release the output buffer and the encoder state itself.  The input
+   buffer is not freed here. */
+void pypy_cjk_enc_free(struct pypy_cjk_enc_s *d)
+{
+ free(d->outbuf_start);
+ free(d);
+}
+
+/* Grow the output buffer by at least 'esize' bytes, and never by less than
+   roughly half of its current size.  Returns 0 on success and -1 if the new
+   size would overflow or realloc() fails; on failure the existing buffer
+   is left untouched. */
+static int expand_encodebuffer(struct pypy_cjk_enc_s *d, Py_ssize_t esize)
+{
+    const Py_ssize_t used = d->outbuf - d->outbuf_start;
+    const Py_ssize_t cursize = d->outbuf_end - d->outbuf_start;
+    Py_ssize_t growth = esize;
+    unsigned char *grown;
+
+    if (growth < (cursize >> 1))
+        growth = (cursize >> 1) | 1;
+    if (growth > PY_SSIZE_T_MAX - cursize)
+        return -1;
+
+    grown = realloc(d->outbuf_start, cursize + growth);
+    if (grown == NULL)
+        return -1;
+
+    /* the block may have moved: rebase all three output pointers */
+    d->outbuf_start = grown;
+    d->outbuf = grown + used;
+    d->outbuf_end = grown + cursize + growth;
+    return 0;
+}
+
+/* Flag bit placed just above MBENC_MAX.  The replacement list is
+   parenthesized so the shift cannot re-associate with neighbouring
+   operators when the macro is used inside a larger expression. */
+#define MBENC_RESET (MBENC_MAX << 1)
+
+/* Run the codec's encode function over the remaining input, enlarging the
+   output buffer whenever the codec reports MBERR_TOOSMALL.  Returns 0 once
+   all input is consumed, MBERR_NOMEMORY if the buffer cannot grow, and
+   otherwise whatever status the codec returned. */
+Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *d)
+{
+    const int flags = MBENC_FLUSH | MBENC_RESET;   /* XXX always, for now */
+    Py_ssize_t status;
+
+    for (;;) {
+        Py_ssize_t pending = (Py_ssize_t)(d->inbuf_end - d->inbuf);
+        Py_ssize_t room = (Py_ssize_t)(d->outbuf_end - d->outbuf);
+
+        if (pending == 0)
+            return 0;
+        status = d->codec->encode(&d->state, d->codec->config,
+                                  &d->inbuf, pending,
+                                  &d->outbuf, room, flags);
+        if (status != MBERR_TOOSMALL)
+            return status;
+        /* not enough room for the output: enlarge the buffer and retry */
+        if (expand_encodebuffer(d, -1) == -1)
+            return MBERR_NOMEMORY;
+    }
+}
+
+/* Call the codec's encreset hook, if it has one, enlarging the output
+   buffer whenever the hook reports MBERR_TOOSMALL.  Returns 0 when the
+   codec has no such hook, MBERR_NOMEMORY if the buffer cannot grow, and
+   otherwise the hook's own status. */
+Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *d)
+{
+    Py_ssize_t status;
+
+    if (d->codec->encreset == NULL)
+        return 0;
+
+    for (;;) {
+        Py_ssize_t room = (Py_ssize_t)(d->outbuf_end - d->outbuf);
+
+        status = d->codec->encreset(&d->state, d->codec->config,
+                                    &d->outbuf, room);
+        if (status != MBERR_TOOSMALL)
+            return status;
+        /* not enough room for the output: enlarge the buffer and retry */
+        if (expand_encodebuffer(d, -1) == -1)
+            return MBERR_NOMEMORY;
+    }
+}
+
+/* Return the start of the output buffer.  The buffer is stored as
+   'unsigned char *' while the declared return type is 'char *', so the
+   conversion is made explicit instead of relying on an implicit
+   pointer-sign conversion. */
+char *pypy_cjk_enc_outbuf(struct pypy_cjk_enc_s *d)
+{
+    return (char *)d->outbuf_start;
+}
+
+/* Number of bytes between the start of the output buffer and the current
+   write position. */
+Py_ssize_t pypy_cjk_enc_outlen(struct pypy_cjk_enc_s *d)
+{
+ return d->outbuf - d->outbuf_start;
+}
+
+/* Number of input characters between the current read position and the
+   end of the input. */
+Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d)
+{
+ return d->inbuf_end - d->inbuf;
+}
+
+/* Number of input characters between the start of the input and the
+   current read position. */
+Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d)
+{
+ return d->inbuf - d->inbuf_start;
+}
diff --git a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
--- a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
+++ b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
@@ -12,6 +12,7 @@
typedef uint32_t ucs4_t, Py_UNICODE;
typedef uint16_t ucs2_t, DBCHAR;
typedef ssize_t Py_ssize_t;
+#define PY_SSIZE_T_MAX ((Py_ssize_t)(((size_t) -1) >> 1))
typedef union {
@@ -81,4 +82,21 @@
Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d);
Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d);
+/* State of one encoding run: the codec and its private state, plus
+   cursors into the input buffer and into the malloc()ed output buffer. */
+struct pypy_cjk_enc_s {
+    const MultibyteCodec *codec;
+    MultibyteCodec_State state;
+    /* input: first character, next one to encode, one past the last */
+    const Py_UNICODE *inbuf_start;
+    const Py_UNICODE *inbuf;
+    const Py_UNICODE *inbuf_end;
+    /* output: first byte, next byte to write, one past the allocation */
+    unsigned char *outbuf_start;
+    unsigned char *outbuf;
+    unsigned char *outbuf_end;
+};
+
+/* Encoder entry points. */
+
+/* creation and release of the encoder state */
+struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
+                                         Py_UNICODE *inbuf,
+                                         Py_ssize_t inlen);
+void pypy_cjk_enc_free(struct pypy_cjk_enc_s *d);
+
+/* encoding steps */
+Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *d);
+Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *d);
+
+/* output produced so far */
+char *pypy_cjk_enc_outbuf(struct pypy_cjk_enc_s *d);
+Py_ssize_t pypy_cjk_enc_outlen(struct pypy_cjk_enc_s *d);
+
+/* position of the input cursor */
+Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d);
+Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s *d);
+
#endif
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py
b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -1,6 +1,7 @@
import py
from pypy.module._multibytecodec.c_codecs import getcodec, codecs
-from pypy.module._multibytecodec.c_codecs import decode, EncodeDecodeError
+from pypy.module._multibytecodec.c_codecs import decode, encode
+from pypy.module._multibytecodec.c_codecs import EncodeDecodeError
def test_codecs_existence():
@@ -34,3 +35,18 @@
assert e.start == 2
assert e.end == 4
assert e.reason == "illegal multibyte sequence"
+
+def test_encode_hz():
+ c = getcodec("hz")
+ s = encode(c, u'foobar')
+ assert s == 'foobar' and type(s) is str
+ s = encode(c, u'\u5f95\u6cef')
+ assert s == '~{abc}~}'
+
+def test_encode_hz_error():
+ # error
+ c = getcodec("hz")
+ e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value
+ assert e.start == 3
+ assert e.end == 4
+ assert e.reason == "illegal multibyte sequence"
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit