[pypy-commit] pypy unicode-from-unicode-in-c: Move some pyunicode functions to pure C

mattip Mon, 29 Oct 2018 14:50:48 -0700

Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-from-unicode-in-c
Changeset: r95260:d61f78777f2d
Date: 2018-10-28 19:35 +0200
http://bitbucket.org/pypy/pypy/changeset/d61f78777f2d/


Log:    Move some pyunicode functions to pure C

diff --git a/pypy/module/cpyext/api.py b/pypy/module/cpyext/api.py
--- a/pypy/module/cpyext/api.py
+++ b/pypy/module/cpyext/api.py
@@ -590,8 +590,10 @@
     'Py_FatalError', 'PyOS_snprintf', 'PyOS_vsnprintf', 'PyArg_Parse',
     'PyArg_ParseTuple', 'PyArg_UnpackTuple', 'PyArg_ParseTupleAndKeywords',
     'PyArg_VaParse', 'PyArg_VaParseTupleAndKeywords', '_PyArg_NoKeywords',
-    'PyString_FromFormat', 'PyString_FromFormatV',
-    'PyUnicode_FromFormat', 'PyUnicode_FromFormatV',
+    'PyString_FromFormat', 'PyString_FromFormatV', 
+    'PyUnicode_FromFormat', 'PyUnicode_FromFormatV', 'PyUnicode_FromUnicode',
+    'PyUnicode_FromWideChar', 'PyUnicode_AsUnicode', 'PyUnicode_GetSize',
+    'PyUnicode_AsWideChar',
     'PyModule_AddObject', 'PyModule_AddIntConstant', 
'PyModule_AddStringConstant',
     'Py_BuildValue', 'Py_VaBuildValue', 'PyTuple_Pack',
     '_PyArg_Parse_SizeT', '_PyArg_ParseTuple_SizeT',
@@ -1185,7 +1187,7 @@
     state.C.get_pyos_inputhook = rffi.llexternal(
         '_PyPy_get_PyOS_InputHook', [], FUNCPTR,
         compilation_info=eci, _nowrapper=True)
-
+        
 
 def init_function(func):
     INIT_FUNCTIONS.append(func)
diff --git a/pypy/module/cpyext/include/unicodeobject.h b/pypy/module/cpyext/include/unicodeobject.h
--- a/pypy/module/cpyext/include/unicodeobject.h
+++ b/pypy/module/cpyext/include/unicodeobject.h
@@ -9,11 +9,30 @@
 
 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char *format, va_list 
vargs);
 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char *format, ...);
+/* Unicode functions whose implementation lives in src/unicodeobject.c.
+ * Each prototype is wrapped explicitly so that every continuation line
+ * still carries the '+' marker of an added diff line.
+ */
+PyAPI_FUNC(PyObject *) PyUnicode_FromUnicode(const wchar_t *u,
+                                             Py_ssize_t size);
+PyAPI_FUNC(PyObject *) PyUnicode_FromWideChar(const wchar_t *u,
+                                              Py_ssize_t size);
+PyAPI_FUNC(wchar_t*)   PyUnicode_AsUnicode(PyObject *unicode);
+PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(PyObject *unicode);
+PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(PyUnicodeObject *unicode,
+                                             wchar_t *w, Py_ssize_t size);
 
+#define Py_UNICODE_COPY(target, source, length)                         \
+    Py_MEMCPY((target), (source), (length)*sizeof(wchar_t))
 #define PyUnicode_Check(op) \
                 PyType_FastSubclass((op)->ob_type, Py_TPFLAGS_UNICODE_SUBCLASS)
 #define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
 
+/* Fast access macros.
+ * None of them checks the type of op: the argument is cast to
+ * PyUnicodeObject * and the length / str fields are read directly.
+ *   PyUnicode_GET_SIZE       the length field
+ *   PyUnicode_GET_DATA_SIZE  the length field times sizeof(Py_UNICODE)
+ *   PyUnicode_AS_UNICODE     the str pointer
+ *   PyUnicode_AS_DATA        the same str pointer, viewed as const char *
+ */
+#define PyUnicode_GET_SIZE(op) \
+    (((PyUnicodeObject *)(op))->length)
+#define PyUnicode_GET_DATA_SIZE(op) \
+    (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
+#define PyUnicode_AS_UNICODE(op) \
+    (((PyUnicodeObject *)(op))->str)
+#define PyUnicode_AS_DATA(op) \
+    ((const char *)((PyUnicodeObject *)(op))->str)
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/pypy/module/cpyext/src/unicodeobject.c b/pypy/module/cpyext/src/unicodeobject.c
--- a/pypy/module/cpyext/src/unicodeobject.c
+++ b/pypy/module/cpyext/src/unicodeobject.c
@@ -420,4 +420,154 @@
     return ret;
 }
 
+/* The empty Unicode object is shared to improve performance. */
+static PyUnicodeObject *unicode_empty = NULL;
 
+/* Return, from the enclosing function, a new reference to the shared empty
+ * unicode object.  The object is created through _PyUnicode_New(0) the first
+ * time it is needed and kept in unicode_empty afterwards.  If that creation
+ * fails, unicode_empty stays NULL and NULL is returned.
+ */
+#define _Py_RETURN_UNICODE_EMPTY()                      \
+    do {                                                \
+        if (unicode_empty == NULL)                      \
+            unicode_empty = _PyUnicode_New(0);          \
+        if (unicode_empty != NULL)                      \
+            Py_INCREF(unicode_empty);                   \
+        return (PyObject *)unicode_empty;               \
+    } while (0)
+
+/* Allocate a unicode object with room for `length` characters.
+ *
+ * The buffer gets one extra Py_UNICODE element (not just one byte) so that
+ * the string is always terminated by a 0 code unit; some code relies on
+ * that.  Only str[0] and str[length] are initialised here, the characters
+ * in between are left for the caller to fill in.
+ *
+ * Returns NULL on failure.  PyErr_NoMemory() is raised when the requested
+ * length is too large or when the character buffer cannot be allocated.
+ */
+static PyUnicodeObject *
+_PyUnicode_New(Py_ssize_t length)
+{
+    PyUnicodeObject *result;
+    size_t nbytes;
+
+    /* Hand out the shared empty object once it exists. */
+    if (length == 0 && unicode_empty != NULL) {
+        Py_INCREF(unicode_empty);
+        return unicode_empty;
+    }
+
+    /* Refuse lengths whose byte size, terminator included, would overflow.
+     * NOTE(review): the right-hand side involves sizeof and is presumably
+     * unsigned, so a negative length would be rejected here too -- confirm.
+     */
+    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
+        return (PyUnicodeObject *)PyErr_NoMemory();
+    }
+
+    result = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
+    if (result == NULL)
+        return NULL;
+
+    nbytes = sizeof(Py_UNICODE) * ((size_t)length + 1);
+    result->str = (Py_UNICODE*) PyObject_MALLOC(nbytes);
+    if (!result->str) {
+        /* Release the half-built object before reporting the failure. */
+        PyErr_NoMemory();
+        PyObject_Del(result);
+        return NULL;
+    }
+
+    /* Initialise the first element as well as the terminator, so that a
+     * caller which fails before filling in str never leaves the start of
+     * the buffer uninitialised.
+     */
+    result->str[0] = 0;
+    result->str[length] = 0;
+    result->length = length;
+    result->hash = -1;
+    result->defenc = NULL;
+    return result;
+}
+
+
+/* Create a Unicode object from the Py_UNICODE buffer u of the given size.
+ *
+ * The buffer is copied into the new object.  u may be NULL, in which case
+ * the contents are undefined and it is the user's responsibility to fill
+ * in the needed data.  If the buffer is not NULL, the return value might
+ * be a shared object (a size of 0 always yields the shared empty object),
+ * so modifying the result is only allowed when u is NULL.
+ *
+ * Returns NULL if the object cannot be allocated.
+ */
+PyObject *
+PyUnicode_FromUnicode(const wchar_t *u, Py_ssize_t size)
+{
+    PyUnicodeObject *result;
+
+    /* Empty strings all share one object. */
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
+
+    result = _PyUnicode_New(size);
+    if (result == NULL)
+        return NULL;
+
+    /* Copy the caller's data, if any, into the new object. */
+    if (u != NULL)
+        Py_UNICODE_COPY(result->str, u, size);
+
+    return (PyObject *)result;
+}
+
+/* Create a Unicode object from the wchar_t buffer char_p holding `length`
+ * characters.  PyPy supposes Py_UNICODE == wchar_t, so the call is simply
+ * forwarded to PyUnicode_FromUnicode(); NULL is returned on failure.
+ */
+PyObject *
+PyUnicode_FromWideChar(const wchar_t *char_p, Py_ssize_t length)
+{
+    PyObject *result = PyUnicode_FromUnicode(char_p, length);
+
+    return result;
+}
+
+/* Return the Py_UNICODE buffer stored in the str field of unicode.
+ * If unicode does not pass PyUnicode_Check(), PyErr_BadArgument() is
+ * called and NULL is returned.
+ */
+wchar_t *
+PyUnicode_AsUnicode(PyObject *unicode)
+{
+    if (PyUnicode_Check(unicode))
+        return PyUnicode_AS_UNICODE(unicode);
+
+    PyErr_BadArgument();
+    return NULL;
+}
+
+/* Return the length field of unicode.
+ * If unicode does not pass PyUnicode_Check(), PyErr_BadArgument() is
+ * called and -1 is returned.
+ */
+Py_ssize_t
+PyUnicode_GetSize(PyObject *unicode)
+{
+    if (PyUnicode_Check(unicode))
+        return PyUnicode_GET_SIZE(unicode);
+
+    PyErr_BadArgument();
+    return -1;
+}
+
+/* Copy the contents of unicode into the wchar_t buffer w.
+ *
+ * At most `size` characters are copied.  When size is larger than the
+ * string, the terminating 0 is copied as well (length + 1 elements), but
+ * it is never counted in the result, so the copy may or may not end up
+ * 0-terminated.
+ *
+ * Returns the number of characters copied.  If unicode is NULL or size is
+ * negative, PyErr_BadInternalCall() is raised and -1 is returned.  w is not
+ * checked: the caller must pass a buffer with room for size elements.
+ */
+Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
+                                wchar_t *w,
+                                Py_ssize_t size)
+{
+    Py_ssize_t length;
+    Py_ssize_t ncopy;
+
+    /* A negative size would be converted to a huge size_t in the memcpy()
+     * below and read/write far outside both buffers, so reject it here.
+     */
+    if (unicode == NULL || size < 0) {
+        PyErr_BadInternalCall();
+        return -1;
+    }
+
+    length = PyUnicode_GET_SIZE(unicode);
+
+    /* If possible, try to copy the 0-termination as well */
+    ncopy = (size > length) ? length + 1 : size;
+    memcpy(w, unicode->str, (size_t)ncopy * sizeof(wchar_t));
+
+    /* The terminator is not part of the reported count. */
+    return (size > length) ? length : size;
+}
+
+
diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -9,6 +9,7 @@
 from rpython.rtyper.lltypesystem import rffi, lltype
 import sys, py
 from pypy.module.cpyext.unicodeobject import *
+from pypy.module.cpyext.state import State
 
 class AppTestUnicodeObject(AppTestCpythonExtensionBase):
     def test_unicodeobject(self):
@@ -116,16 +117,23 @@
              """
                 PyObject* o = PyUnicode_FromString("");
                 PyUnicodeObject* u = (PyUnicodeObject*)o;
+                Py_ssize_t n;
+                wchar_t * c;
 
-                PyUnicode_GET_SIZE(u);
-                PyUnicode_GET_SIZE(o);
+                n = PyUnicode_GET_SIZE(u);
+                n += PyUnicode_GET_SIZE(o);
 
-                PyUnicode_GET_DATA_SIZE(u);
-                PyUnicode_GET_DATA_SIZE(o);
+                n += PyUnicode_GET_DATA_SIZE(u);
+                n += PyUnicode_GET_DATA_SIZE(o);
 
-                PyUnicode_AS_UNICODE(o);
-                PyUnicode_AS_UNICODE(u);
-                return o;
+                c = PyUnicode_AS_UNICODE(o);
+                c = PyUnicode_AS_UNICODE(u);
+                if (c == NULL) {
+                    return o;
+                }
+                else {
+                    return o;
+                }
              """)])
         assert module.test_macro_invocations() == u''
 
@@ -159,11 +167,6 @@
 
 class TestUnicode(BaseApiTest):
     def test_unicodeobject(self, space):
-        assert PyUnicode_GET_SIZE(space, space.wrap(u'sp&#65533;m')) == 4
-        assert PyUnicode_GetSize(space, space.wrap(u'sp&#65533;m')) == 4
-        unichar = rffi.sizeof(Py_UNICODE)
-        assert PyUnicode_GET_DATA_SIZE(space, space.wrap(u'sp&#65533;m')) == 4 
* unichar
-
         encoding = rffi.charp2str(PyUnicode_GetDefaultEncoding(space, ))
         w_default_encoding = space.call_function(
             space.sys.get('getdefaultencoding')
@@ -184,46 +187,6 @@
         rffi.free_charp(utf_8)
         rffi.free_charp(prev_encoding)
 
-    def test_AS(self, space):
-        word = space.wrap(u'spam')
-        array = rffi.cast(rffi.CWCHARP, PyUnicode_AS_DATA(space, word))
-        array2 = PyUnicode_AS_UNICODE(space, word)
-        array3 = PyUnicode_AsUnicode(space, word)
-        for (i, char) in enumerate(space.unicode_w(word)):
-            assert array[i] == char
-            assert array2[i] == char
-            assert array3[i] == char
-        with raises_w(space, TypeError):
-            PyUnicode_AsUnicode(space, space.wrap('spam'))
-
-        utf_8 = rffi.str2charp('utf-8')
-        encoded = PyUnicode_AsEncodedString(space, space.wrap(u'sp&#65533;m'),
-                                                utf_8, None)
-        assert space.unwrap(encoded) == 'sp\xef\xbf\xbdm'
-        encoded_obj = PyUnicode_AsEncodedObject(space, 
space.wrap(u'sp&#65533;m'),
-                                                utf_8, None)
-        assert space.eq_w(encoded, encoded_obj)
-        with raises_w(space, TypeError):
-            PyUnicode_AsEncodedString(
-                space, space.newtuple([1, 2, 3]), None, None)
-        with raises_w(space, TypeError):
-            PyUnicode_AsEncodedString(space, space.wrap(''), None, None)
-        ascii = rffi.str2charp('ascii')
-        replace = rffi.str2charp('replace')
-        encoded = PyUnicode_AsEncodedString(space, space.wrap(u'sp&#65533;m'),
-                                                ascii, replace)
-        assert space.unwrap(encoded) == 'sp?m'
-        rffi.free_charp(utf_8)
-        rffi.free_charp(replace)
-        rffi.free_charp(ascii)
-
-        buf = rffi.unicode2wcharp(u"12345")
-        PyUnicode_AsWideChar(space, space.wrap(u'longword'), buf, 5)
-        assert rffi.wcharp2unicode(buf) == 'longw'
-        PyUnicode_AsWideChar(space, space.wrap(u'a'), buf, 5)
-        assert rffi.wcharp2unicode(buf) == 'a'
-        rffi.free_wcharp(buf)
-
     def test_fromstring(self, space):
         s = rffi.str2charp(u'sp\x09m'.encode("utf-8"))
         w_res = PyUnicode_FromString(space, s)
@@ -536,25 +499,6 @@
         w_res = PyUnicode_Concat(space, space.wrap(u'a'), space.wrap(u'b'))
         assert space.unicode_w(w_res) == u'ab'
 
-    def test_copy(self, space):
-        w_x = space.wrap(u"abcd\u0660")
-        count1 = space.int_w(space.len(w_x))
-        target_chunk = lltype.malloc(rffi.CWCHARP.TO, count1, flavor='raw')
-
-        x_chunk = PyUnicode_AS_UNICODE(space, w_x)
-        Py_UNICODE_COPY(space, target_chunk, x_chunk, 4)
-        w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, 4))
-
-        assert space.eq_w(w_y, space.wrap(u"abcd"))
-
-        size = PyUnicode_GET_SIZE(space, w_x)
-        Py_UNICODE_COPY(space, target_chunk, x_chunk, size)
-        w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, size))
-
-        assert space.eq_w(w_y, w_x)
-
-        lltype.free(target_chunk, flavor='raw')
-
     def test_ascii_codec(self, space):
         s = 'abcdefg'
         data = rffi.str2charp(s)
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -64,7 +64,7 @@
     py_unicode = rffi.cast(PyUnicodeObject, py_obj)
     s = space.unicode_w(w_obj)
     py_unicode.c_length = len(s)
-    py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO)
+    py_unicode.c_str = rffi.unicode2wcharp(s)
     py_unicode.c_hash = space.hash_w(space.newunicode(s))
     py_unicode.c_defenc = lltype.nullptr(PyObject.TO)
 
@@ -87,7 +87,7 @@
     py_unicode = rffi.cast(PyUnicodeObject, py_obj)
     decref(space, py_unicode.c_defenc)
     if py_unicode.c_str:
-        lltype.free(py_unicode.c_str, flavor="raw")
+        rffi.free_wcharp(py_unicode.c_str)
 
     from pypy.module.cpyext.object import _dealloc
     _dealloc(space, py_obj)
@@ -189,82 +189,6 @@
     """Get the maximum ordinal for a Unicode character."""
     return runicode.UNICHR(runicode.MAXUNICODE)
 
-@cpython_api([rffi.VOIDP], rffi.CCHARP, error=CANNOT_FAIL)
-def PyUnicode_AS_DATA(space, ref):
-    """Return a pointer to the internal buffer of the object. o has to be a
-    PyUnicodeObject (not checked)."""
-    return rffi.cast(rffi.CCHARP, PyUnicode_AS_UNICODE(space, ref))
-
-@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL)
-def PyUnicode_GET_DATA_SIZE(space, w_obj):
-    """Return the size of the object's internal buffer in bytes.  o has to be a
-    PyUnicodeObject (not checked)."""
-    return rffi.sizeof(Py_UNICODE) * PyUnicode_GET_SIZE(space, w_obj)
-
-@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL)
-def PyUnicode_GET_SIZE(space, w_obj):
-    """Return the size of the object.  obj is a PyUnicodeObject (not
-    checked)."""
-    return space.len_w(w_obj)
-
-@cpython_api([rffi.VOIDP], rffi.CWCHARP, error=CANNOT_FAIL)
-def PyUnicode_AS_UNICODE(space, ref):
-    """Return a pointer to the internal Py_UNICODE buffer of the object.  ref
-    has to be a PyUnicodeObject (not checked)."""
-    ref_unicode = rffi.cast(PyUnicodeObject, ref)
-    if not ref_unicode.c_str:
-        # Copy unicode buffer
-        w_unicode = from_ref(space, rffi.cast(PyObject, ref))
-        u = space.unicode_w(w_unicode)
-        ref_unicode.c_str = rffi.unicode2wcharp(u)
-    return ref_unicode.c_str
-
-@cpython_api([PyObject], rffi.CWCHARP)
-def PyUnicode_AsUnicode(space, ref):
-    """Return a read-only pointer to the Unicode object's internal Py_UNICODE
-    buffer, NULL if unicode is not a Unicode object."""
-    # Don't use PyUnicode_Check, it will realize the object :-(
-    w_type = from_ref(space, rffi.cast(PyObject, ref.c_ob_type))
-    if not space.issubtype_w(w_type, space.w_unicode):
-        raise oefmt(space.w_TypeError, "expected unicode object")
-    return PyUnicode_AS_UNICODE(space, rffi.cast(rffi.VOIDP, ref))
-
-@cpython_api([PyObject], Py_ssize_t, error=-1)
-def PyUnicode_GetSize(space, ref):
-    if from_ref(space, rffi.cast(PyObject, ref.c_ob_type)) is space.w_unicode:
-        ref = rffi.cast(PyUnicodeObject, ref)
-        return ref.c_length
-    else:
-        w_obj = from_ref(space, ref)
-        return space.len_w(w_obj)
-
-@cpython_api([PyUnicodeObject, rffi.CWCHARP, Py_ssize_t], Py_ssize_t, error=-1)
-def PyUnicode_AsWideChar(space, ref, buf, size):
-    """Copy the Unicode object contents into the wchar_t buffer w.  At most
-    size wchar_t characters are copied (excluding a possibly trailing
-    0-termination character).  Return the number of wchar_t characters
-    copied or -1 in case of an error.  Note that the resulting wchar_t
-    string may or may not be 0-terminated.  It is the responsibility of the 
caller
-    to make sure that the wchar_t string is 0-terminated in case this is
-    required by the application."""
-    c_str = PyUnicode_AS_UNICODE(space, rffi.cast(rffi.VOIDP, ref))
-    c_length = ref.c_length
-
-    # If possible, try to copy the 0-termination as well
-    if size > c_length:
-        size = c_length + 1
-
-
-    i = 0
-    while i < size:
-        buf[i] = c_str[i]
-        i += 1
-
-    if size > c_length:
-        return c_length
-    else:
-        return size
-
 @cpython_api([], rffi.CCHARP, error=CANNOT_FAIL)
 def PyUnicode_GetDefaultEncoding(space):
     """Returns the currently active default encoding."""
@@ -327,27 +251,6 @@
 
     return unicodeobject.encode_object(space, w_unicode, 'unicode-escape', 
'strict')
 
-@cpython_api([CONST_WSTRING, Py_ssize_t], PyObject, result_is_ll=True)
-def PyUnicode_FromUnicode(space, wchar_p, length):
-    """Create a Unicode Object from the Py_UNICODE buffer u of the given size. 
u
-    may be NULL which causes the contents to be undefined. It is the user's
-    responsibility to fill in the needed data.  The buffer is copied into the 
new
-    object. If the buffer is not NULL, the return value might be a shared 
object.
-    Therefore, modification of the resulting Unicode object is only allowed 
when u
-    is NULL."""
-    if wchar_p:
-        s = rffi.wcharpsize2unicode(wchar_p, length)
-        return make_ref(space, space.newunicode(s))
-    else:
-        return rffi.cast(PyObject, new_empty_unicode(space, length))
-
-@cpython_api([CONST_WSTRING, Py_ssize_t], PyObject, result_is_ll=True)
-def PyUnicode_FromWideChar(space, wchar_p, length):
-    """Create a Unicode object from the wchar_t buffer w of the given size.
-    Return NULL on failure."""
-    # PyPy supposes Py_UNICODE == wchar_t
-    return PyUnicode_FromUnicode(space, wchar_p, length)
-
 @cpython_api([PyObject, CONST_STRING], PyObject, result_is_ll=True)
 def _PyUnicode_AsDefaultEncodedString(space, ref, errors):
     # Returns a borrowed reference.
@@ -679,13 +582,6 @@
     """Concat two strings giving a new Unicode string."""
     return space.add(w_left, w_right)
 
-@cpython_api([rffi.CWCHARP, rffi.CWCHARP, Py_ssize_t], lltype.Void)
-def Py_UNICODE_COPY(space, target, source, length):
-    """Roughly equivalent to memcpy() only the base size is Py_UNICODE
-    copies sizeof(Py_UNICODE) * length bytes from source to target"""
-    for i in range(0, length):
-        target[i] = source[i]
-
 @cpython_api([PyObject, PyObject], PyObject)
 def PyUnicode_Format(space, w_format, w_args):
     """Return a new string object from format and args; this is analogous to
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-from-unicode-in-c: Move some pyunicode functions to pure C

Reply via email to