Author: Armin Rigo <[email protected]>
Branch: 
Changeset: r44058:6cb94685f116
Date: 2011-05-10 22:40 +0200
http://bitbucket.org/pypy/pypy/changeset/6cb94685f116/

Log:    Encoding.

diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -1,7 +1,7 @@
 import py, sys
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
-from pypy.rpython.lltypesystem.rstr import UNICODE
-from pypy.rpython.annlowlevel import hlunicode
+from pypy.rpython.lltypesystem.rstr import STR, UNICODE
+from pypy.rpython.annlowlevel import hlstr, hlunicode
 from pypy.rlib.objectmodel import keepalive_until_here
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
 from pypy.tool.autopath import pypydir
@@ -78,6 +78,7 @@
     return getter()
 
 # ____________________________________________________________
+# Decoding
 
 DECODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_dec_s', compilation_info=eci)
 pypy_cjk_dec_init = llexternal('pypy_cjk_dec_init',
@@ -98,19 +99,16 @@
 
 def decode(codec, stringdata):
     inleft = len(stringdata)
-    if inleft > sys.maxint // 4:
-        raise MemoryError
     inbuf = rffi.get_nonmovingbuffer(stringdata)
     try:
         decodebuf = pypy_cjk_dec_init(codec, inbuf, inleft)
         if not decodebuf:
             raise MemoryError
         try:
-            while True:
-                r = pypy_cjk_dec_chunk(decodebuf)
-                if r == 0:
-                    break
+            r = pypy_cjk_dec_chunk(decodebuf)
+            if r != 0:
                 multibytecodec_decerror(decodebuf, r)
+                assert False
             src = pypy_cjk_dec_outbuf(decodebuf)
             length = pypy_cjk_dec_outlen(decodebuf)
             return unicode_from_raw(src, length)
@@ -140,8 +138,6 @@
     if 1:  # errors == ERROR_STRICT:
         raise EncodeDecodeError(start, end, reason)
 
-# ____________________________________________________________
-
 def unicode_from_raw(src, length):
     result = lltype.malloc(UNICODE, length)
     try:
@@ -154,3 +150,83 @@
         return hlunicode(result)
     finally:
         keepalive_until_here(result)
+
+# ____________________________________________________________
+# Encoding
+
+ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci)
+pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init',
+                               [MULTIBYTECODEC_P, rffi.CWCHARP, rffi.SSIZE_T],
+                               ENCODEBUF_P)
+pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P],
+                               lltype.Void)
+pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk', [ENCODEBUF_P],
+                                rffi.SSIZE_T)
+pypy_cjk_enc_reset = llexternal('pypy_cjk_enc_reset', [ENCODEBUF_P],
+                                rffi.SSIZE_T)
+pypy_cjk_enc_outbuf = llexternal('pypy_cjk_enc_outbuf', [ENCODEBUF_P],
+                                 rffi.CCHARP)
+pypy_cjk_enc_outlen = llexternal('pypy_cjk_enc_outlen', [ENCODEBUF_P],
+                                 rffi.SSIZE_T)
+pypy_cjk_enc_inbuf_remaining = llexternal('pypy_cjk_enc_inbuf_remaining',
+                                          [ENCODEBUF_P], rffi.SSIZE_T)
+pypy_cjk_enc_inbuf_consumed = llexternal('pypy_cjk_enc_inbuf_consumed',
+                                         [ENCODEBUF_P], rffi.SSIZE_T)
+
+def encode(codec, unicodedata):
+    inleft = len(unicodedata)
+    inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
+    try:
+        encodebuf = pypy_cjk_enc_init(codec, inbuf, inleft)
+        if not encodebuf:
+            raise MemoryError
+        try:
+            r = pypy_cjk_enc_chunk(encodebuf)
+            if r != 0:
+                multibytecodec_encerror(encodebuf, r)
+                assert False
+            r = pypy_cjk_enc_reset(encodebuf)
+            if r != 0:
+                multibytecodec_encerror(encodebuf, r)
+                assert False
+            src = pypy_cjk_enc_outbuf(encodebuf)
+            length = pypy_cjk_enc_outlen(encodebuf)
+            return string_from_raw(src, length)
+        #
+        finally:
+            pypy_cjk_enc_free(encodebuf)
+    #
+    finally:
+        rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
+
+def multibytecodec_encerror(encodebuf, e):
+    if e > 0:
+        reason = "illegal multibyte sequence"
+        esize = e
+    elif e == MBERR_TOOFEW:
+        reason = "incomplete multibyte sequence"
+        esize = pypy_cjk_enc_inbuf_remaining(encodebuf)
+    elif e == MBERR_NOMEMORY:
+        raise MemoryError
+    else:
+        raise RuntimeError
+    #
+    # if errors == ERROR_REPLACE:...
+    # if errors == ERROR_IGNORE or errors == ERROR_REPLACE:...
+    start = pypy_cjk_enc_inbuf_consumed(encodebuf)
+    end = start + esize
+    if 1:  # errors == ERROR_STRICT:
+        raise EncodeDecodeError(start, end, reason)
+
+def string_from_raw(src, length):
+    result = lltype.malloc(STR, length)
+    try:
+        str_chars_offset = (rffi.offsetof(STR, 'chars') + \
+                            rffi.itemoffsetof(STR.chars, 0))
+        dest = rffi.cast_ptr_to_adr(result) + str_chars_offset
+        src = rffi.cast_ptr_to_adr(src) + rffi.itemoffsetof(rffi.CCHARP.TO)
+        rffi.raw_memcopy(src, dest,
+                         llmemory.sizeof(lltype.Char) * length)
+        return hlstr(result)
+    finally:
+        keepalive_until_here(result)
diff --git a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
--- a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
+++ b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
@@ -15,7 +15,9 @@
   d->inbuf_start = inbuf;
   d->inbuf = inbuf;
   d->inbuf_end = inbuf + inlen;
-  d->outbuf_start = malloc(inlen * sizeof(Py_UNICODE));
+  d->outbuf_start = (inlen <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) ?
+                     malloc(inlen * sizeof(Py_UNICODE)) :
+                     NULL);
   if (!d->outbuf_start)
     goto errorexit;
   d->outbuf = d->outbuf_start;
@@ -40,13 +42,15 @@
 
   orgpos = d->outbuf - d->outbuf_start;
   orgsize = d->outbuf_end - d->outbuf_start;
-  esize = orgsize + (esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize);
-  newbuf = realloc(d->outbuf_start, esize * sizeof(Py_UNICODE));
+  esize = (esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize);
+  newbuf = (esize <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE) - orgsize) ?
+            realloc(d->outbuf_start, (orgsize + esize) * sizeof(Py_UNICODE)) :
+            NULL);
   if (!newbuf)
     return -1;
   d->outbuf_start = newbuf;
   d->outbuf = newbuf + orgpos;
-  d->outbuf_end = newbuf + esize;
+  d->outbuf_end = newbuf + orgsize + esize;
   return 0;
 }
 
@@ -88,3 +92,120 @@
 {
   return d->inbuf - d->inbuf_start;
 }
+
+/************************************************************/
+
+struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
+                                         Py_UNICODE *inbuf, Py_ssize_t inlen)
+{
+  Py_ssize_t outlen;
+  struct pypy_cjk_enc_s *d = malloc(sizeof(struct pypy_cjk_enc_s));
+  if (!d)
+    return NULL;
+  if (codec->encinit != NULL && codec->encinit(&d->state, codec->config) != 0)
+    goto errorexit;
+
+  d->codec = codec;
+  d->inbuf_start = inbuf;
+  d->inbuf = inbuf;
+  d->inbuf_end = inbuf + inlen;
+
+  if (inlen > (PY_SSIZE_T_MAX - 16) / 2)
+    goto errorexit;
+  outlen = inlen * 2 + 16;
+  d->outbuf_start = malloc(outlen);
+  if (!d->outbuf_start)
+    goto errorexit;
+  d->outbuf = d->outbuf_start;
+  d->outbuf_end = d->outbuf_start + outlen;
+  return d;
+
+ errorexit:
+  free(d);
+  return NULL;
+}
+
+void pypy_cjk_enc_free(struct pypy_cjk_enc_s *d)
+{
+  free(d->outbuf_start);
+  free(d);
+}
+
+static int expand_encodebuffer(struct pypy_cjk_enc_s *d, Py_ssize_t esize)
+{
+  Py_ssize_t orgpos, orgsize;
+  unsigned char *newbuf;
+
+  orgpos = d->outbuf - d->outbuf_start;
+  orgsize = d->outbuf_end - d->outbuf_start;
+  esize = (esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize);
+  newbuf = (esize <= PY_SSIZE_T_MAX - orgsize ?
+            realloc(d->outbuf_start, orgsize + esize) :
+            NULL);
+  if (!newbuf)
+    return -1;
+  d->outbuf_start = newbuf;
+  d->outbuf = newbuf + orgpos;
+  d->outbuf_end = newbuf + orgsize + esize;
+  return 0;
+}
+
+#define MBENC_RESET     MBENC_MAX<<1
+
+Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *d)
+{
+  int flags = MBENC_FLUSH | MBENC_RESET;   /* XXX always, for now */
+  while (1)
+    {
+      Py_ssize_t r;
+      Py_ssize_t inleft = (Py_ssize_t)(d->inbuf_end - d->inbuf);
+      Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf);
+      if (inleft == 0)
+        return 0;
+      r = d->codec->encode(&d->state, d->codec->config,
+                           &d->inbuf, inleft, &d->outbuf, outleft, flags);
+      if (r != MBERR_TOOSMALL)
+        return r;
+      /* output buffer too small; grow it and continue. */
+      if (expand_encodebuffer(d, -1) == -1)
+        return MBERR_NOMEMORY;
+    }
+}
+
+Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *d)
+{
+  if (d->codec->encreset == NULL)
+    return 0;
+
+  while (1)
+    {
+      Py_ssize_t r;
+      Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf);
+      r = d->codec->encreset(&d->state, d->codec->config, &d->outbuf, outleft);
+      if (r != MBERR_TOOSMALL)
+        return r;
+      /* output buffer too small; grow it and continue. */
+      if (expand_encodebuffer(d, -1) == -1)
+        return MBERR_NOMEMORY;
+    }
+}
+
+char *pypy_cjk_enc_outbuf(struct pypy_cjk_enc_s *d)
+{
+  return d->outbuf_start;
+}
+
+Py_ssize_t pypy_cjk_enc_outlen(struct pypy_cjk_enc_s *d)
+{
+  return d->outbuf - d->outbuf_start;
+}
+
+Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d)
+{
+  return d->inbuf_end - d->inbuf;
+}
+
+Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d)
+{
+  return d->inbuf - d->inbuf_start;
+}
diff --git a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
--- a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
+++ b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
@@ -12,6 +12,7 @@
 typedef uint32_t ucs4_t, Py_UNICODE;
 typedef uint16_t ucs2_t, DBCHAR;
 typedef ssize_t Py_ssize_t;
+#define PY_SSIZE_T_MAX   ((Py_ssize_t)(((size_t) -1) >> 1))
 
 
 typedef union {
@@ -81,4 +82,21 @@
 Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d);
 Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d);
 
+struct pypy_cjk_enc_s {
+  const MultibyteCodec *codec;
+  MultibyteCodec_State state;
+  const Py_UNICODE *inbuf_start, *inbuf, *inbuf_end;
+  unsigned char *outbuf_start, *outbuf, *outbuf_end;
+};
+
+struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
+                                         Py_UNICODE *inbuf, Py_ssize_t inlen);
+void pypy_cjk_enc_free(struct pypy_cjk_enc_s *);
+Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *);
+Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *);
+char *pypy_cjk_enc_outbuf(struct pypy_cjk_enc_s *);
+Py_ssize_t pypy_cjk_enc_outlen(struct pypy_cjk_enc_s *);
+Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d);
+Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d);
+
 #endif
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -1,6 +1,7 @@
 import py
 from pypy.module._multibytecodec.c_codecs import getcodec, codecs
-from pypy.module._multibytecodec.c_codecs import decode, EncodeDecodeError
+from pypy.module._multibytecodec.c_codecs import decode, encode
+from pypy.module._multibytecodec.c_codecs import EncodeDecodeError
 
 
 def test_codecs_existence():
@@ -34,3 +35,18 @@
     assert e.start == 2
     assert e.end == 4
     assert e.reason == "illegal multibyte sequence"
+
+def test_encode_hz():
+    c = getcodec("hz")
+    s = encode(c, u'foobar')
+    assert s == 'foobar' and type(s) is str
+    s = encode(c, u'\u5f95\u6cef')
+    assert s == '~{abc}~}'
+
+def test_encode_hz_error():
+    # error
+    c = getcodec("hz")
+    e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value
+    assert e.start == 3
+    assert e.end == 4
+    assert e.reason == "illegal multibyte sequence"
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to