Author: Matti Picus <[email protected]>
Branch: unicode-from-unicode-in-c
Changeset: r95260:d61f78777f2d
Date: 2018-10-28 19:35 +0200
http://bitbucket.org/pypy/pypy/changeset/d61f78777f2d/
Log: Move some pyunicode functions to pure C
diff --git a/pypy/module/cpyext/api.py b/pypy/module/cpyext/api.py
--- a/pypy/module/cpyext/api.py
+++ b/pypy/module/cpyext/api.py
@@ -590,8 +590,10 @@
'Py_FatalError', 'PyOS_snprintf', 'PyOS_vsnprintf', 'PyArg_Parse',
'PyArg_ParseTuple', 'PyArg_UnpackTuple', 'PyArg_ParseTupleAndKeywords',
'PyArg_VaParse', 'PyArg_VaParseTupleAndKeywords', '_PyArg_NoKeywords',
- 'PyString_FromFormat', 'PyString_FromFormatV',
- 'PyUnicode_FromFormat', 'PyUnicode_FromFormatV',
+ 'PyString_FromFormat', 'PyString_FromFormatV',
+ 'PyUnicode_FromFormat', 'PyUnicode_FromFormatV', 'PyUnicode_FromUnicode',
+ 'PyUnicode_FromWideChar', 'PyUnicode_AsUnicode', 'PyUnicode_GetSize',
+ 'PyUnicode_AsWideChar',
'PyModule_AddObject', 'PyModule_AddIntConstant',
'PyModule_AddStringConstant',
'Py_BuildValue', 'Py_VaBuildValue', 'PyTuple_Pack',
'_PyArg_Parse_SizeT', '_PyArg_ParseTuple_SizeT',
@@ -1185,7 +1187,7 @@
state.C.get_pyos_inputhook = rffi.llexternal(
'_PyPy_get_PyOS_InputHook', [], FUNCPTR,
compilation_info=eci, _nowrapper=True)
-
+
def init_function(func):
INIT_FUNCTIONS.append(func)
diff --git a/pypy/module/cpyext/include/unicodeobject.h b/pypy/module/cpyext/include/unicodeobject.h
--- a/pypy/module/cpyext/include/unicodeobject.h
+++ b/pypy/module/cpyext/include/unicodeobject.h
@@ -9,11 +9,30 @@
PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char *format, va_list vargs);
PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char *format, ...);
+PyAPI_FUNC(PyObject *) PyUnicode_FromUnicode(const wchar_t *u, Py_ssize_t size);
+PyAPI_FUNC(PyObject *) PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size);
+PyAPI_FUNC(wchar_t*) PyUnicode_AsUnicode(PyObject *unicode);
+PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(PyObject *unicode);
+PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(PyUnicodeObject *unicode,
+ wchar_t *w, Py_ssize_t size);
+#define Py_UNICODE_COPY(target, source, length) \
+ Py_MEMCPY((target), (source), (length)*sizeof(wchar_t))
#define PyUnicode_Check(op) \
PyType_FastSubclass((op)->ob_type, Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
+/* Fast access macros */
+#define PyUnicode_GET_SIZE(op) \
+ (((PyUnicodeObject *)(op))->length)
+#define PyUnicode_GET_DATA_SIZE(op) \
+ (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
+#define PyUnicode_AS_UNICODE(op) \
+ (((PyUnicodeObject *)(op))->str)
+#define PyUnicode_AS_DATA(op) \
+ ((const char *)((PyUnicodeObject *)(op))->str)
+
+
#ifdef __cplusplus
}
#endif
diff --git a/pypy/module/cpyext/src/unicodeobject.c b/pypy/module/cpyext/src/unicodeobject.c
--- a/pypy/module/cpyext/src/unicodeobject.c
+++ b/pypy/module/cpyext/src/unicodeobject.c
@@ -420,4 +420,154 @@
return ret;
}
+/* The empty Unicode object is shared to improve performance. */
+static PyUnicodeObject *unicode_empty = NULL;
+#define _Py_RETURN_UNICODE_EMPTY() \
+ do { \
+ if (unicode_empty != NULL) \
+ Py_INCREF(unicode_empty); \
+ else { \
+ unicode_empty = _PyUnicode_New(0); \
+ if (unicode_empty != NULL) \
+ Py_INCREF(unicode_empty); \
+ } \
+ return (PyObject *)unicode_empty; \
+ } while (0)
+
+/* We allocate one extra Py_UNICODE element to make sure the string is
+ U+0000 terminated; some code relies on that.
+*/
+
+static
+PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
+{
+ register PyUnicodeObject *unicode;
+ size_t new_size;
+
+ /* Optimization for empty strings */
+ if (length == 0 && unicode_empty != NULL) {
+ Py_INCREF(unicode_empty);
+ return unicode_empty;
+ }
+
+ /* Ensure we won't overflow the size. */
+ if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
+ return (PyUnicodeObject *)PyErr_NoMemory();
+ }
+
+ unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
+ if (unicode == NULL)
+ return NULL;
+ new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
+ unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
+
+ if (!unicode->str) {
+ PyErr_NoMemory();
+ goto onError;
+ }
+ /* Initialize the first element to guard against cases where
+ * the caller fails before initializing str -- unicode_resize()
+ * reads str[0], and the Keep-Alive optimization can keep memory
+ * allocated for str alive across a call to unicode_dealloc(unicode).
+ * We don't want unicode_resize to read uninitialized memory in
+ * that case.
+ */
+ unicode->str[0] = 0;
+ unicode->str[length] = 0;
+ unicode->length = length;
+ unicode->hash = -1;
+ unicode->defenc = NULL;
+ return unicode;
+
+ onError:
+ /* XXX UNREF/NEWREF interface should be more symmetrical */
+ PyObject_Del(unicode);
+ return NULL;
+}
+
+
+PyObject*
+PyUnicode_FromUnicode(const wchar_t *u, Py_ssize_t size)
+{
+ /* Create a Unicode Object from the Py_UNICODE buffer u of the given size. u
+ * may be NULL which causes the contents to be undefined. It is the user's
+ * responsibility to fill in the needed data. The buffer is copied into the new
+ * object. If the buffer is not NULL, the return value might be a shared object.
+ * Therefore, modification of the resulting Unicode object is only allowed when u
+ * is NULL.
+ */
+ PyUnicodeObject *unicode;
+
+ /* Optimization for empty strings */
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
+
+
+ unicode = _PyUnicode_New(size);
+ if (!unicode)
+ return NULL;
+
+ /* Copy the Unicode data into the new object */
+ if (u != NULL)
+ Py_UNICODE_COPY(unicode->str, (wchar_t*)u, size);
+
+ return (PyObject *)unicode;
+}
+
+PyObject*
+PyUnicode_FromWideChar(const wchar_t *char_p, Py_ssize_t length)
+{
+ /*
+ * Create a Unicode object from the wchar_t buffer w of the given size.
+ * Return NULL on failure.
+ * PyPy supposes Py_UNICODE == wchar_t
+ */
+ return PyUnicode_FromUnicode(char_p, length);
+}
+
+wchar_t *PyUnicode_AsUnicode(PyObject *unicode)
+{
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadArgument();
+ goto onError;
+ }
+ return PyUnicode_AS_UNICODE(unicode);
+
+ onError:
+ return NULL;
+}
+
+Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
+{
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadArgument();
+ goto onError;
+ }
+ return PyUnicode_GET_SIZE(unicode);
+
+ onError:
+ return -1;
+}
+
+Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
+ wchar_t *w,
+ Py_ssize_t size)
+{
+ if (unicode == NULL) {
+ PyErr_BadInternalCall();
+ return -1;
+ }
+
+ /* If possible, try to copy the 0-termination as well */
+ if (size > PyUnicode_GET_SIZE(unicode))
+ size = PyUnicode_GET_SIZE(unicode) + 1;
+
+ memcpy(w, unicode->str, size * sizeof(wchar_t));
+ if (size > PyUnicode_GET_SIZE(unicode))
+ return PyUnicode_GET_SIZE(unicode);
+ else
+ return size;
+}
+
+
diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -9,6 +9,7 @@
from rpython.rtyper.lltypesystem import rffi, lltype
import sys, py
from pypy.module.cpyext.unicodeobject import *
+from pypy.module.cpyext.state import State
class AppTestUnicodeObject(AppTestCpythonExtensionBase):
def test_unicodeobject(self):
@@ -116,16 +117,23 @@
"""
PyObject* o = PyUnicode_FromString("");
PyUnicodeObject* u = (PyUnicodeObject*)o;
+ Py_ssize_t n;
+ wchar_t * c;
- PyUnicode_GET_SIZE(u);
- PyUnicode_GET_SIZE(o);
+ n = PyUnicode_GET_SIZE(u);
+ n += PyUnicode_GET_SIZE(o);
- PyUnicode_GET_DATA_SIZE(u);
- PyUnicode_GET_DATA_SIZE(o);
+ n += PyUnicode_GET_DATA_SIZE(u);
+ n += PyUnicode_GET_DATA_SIZE(o);
- PyUnicode_AS_UNICODE(o);
- PyUnicode_AS_UNICODE(u);
- return o;
+ c = PyUnicode_AS_UNICODE(o);
+ c = PyUnicode_AS_UNICODE(u);
+ if (c == NULL) {
+ return o;
+ }
+ else {
+ return o;
+ }
""")])
assert module.test_macro_invocations() == u''
@@ -159,11 +167,6 @@
class TestUnicode(BaseApiTest):
def test_unicodeobject(self, space):
- assert PyUnicode_GET_SIZE(space, space.wrap(u'sp�m')) == 4
- assert PyUnicode_GetSize(space, space.wrap(u'sp�m')) == 4
- unichar = rffi.sizeof(Py_UNICODE)
- assert PyUnicode_GET_DATA_SIZE(space, space.wrap(u'sp�m')) == 4 * unichar
-
encoding = rffi.charp2str(PyUnicode_GetDefaultEncoding(space, ))
w_default_encoding = space.call_function(
space.sys.get('getdefaultencoding')
@@ -184,46 +187,6 @@
rffi.free_charp(utf_8)
rffi.free_charp(prev_encoding)
- def test_AS(self, space):
- word = space.wrap(u'spam')
- array = rffi.cast(rffi.CWCHARP, PyUnicode_AS_DATA(space, word))
- array2 = PyUnicode_AS_UNICODE(space, word)
- array3 = PyUnicode_AsUnicode(space, word)
- for (i, char) in enumerate(space.unicode_w(word)):
- assert array[i] == char
- assert array2[i] == char
- assert array3[i] == char
- with raises_w(space, TypeError):
- PyUnicode_AsUnicode(space, space.wrap('spam'))
-
- utf_8 = rffi.str2charp('utf-8')
- encoded = PyUnicode_AsEncodedString(space, space.wrap(u'sp�m'),
- utf_8, None)
- assert space.unwrap(encoded) == 'sp\xef\xbf\xbdm'
- encoded_obj = PyUnicode_AsEncodedObject(space, space.wrap(u'sp�m'),
- utf_8, None)
- assert space.eq_w(encoded, encoded_obj)
- with raises_w(space, TypeError):
- PyUnicode_AsEncodedString(
- space, space.newtuple([1, 2, 3]), None, None)
- with raises_w(space, TypeError):
- PyUnicode_AsEncodedString(space, space.wrap(''), None, None)
- ascii = rffi.str2charp('ascii')
- replace = rffi.str2charp('replace')
- encoded = PyUnicode_AsEncodedString(space, space.wrap(u'sp�m'),
- ascii, replace)
- assert space.unwrap(encoded) == 'sp?m'
- rffi.free_charp(utf_8)
- rffi.free_charp(replace)
- rffi.free_charp(ascii)
-
- buf = rffi.unicode2wcharp(u"12345")
- PyUnicode_AsWideChar(space, space.wrap(u'longword'), buf, 5)
- assert rffi.wcharp2unicode(buf) == 'longw'
- PyUnicode_AsWideChar(space, space.wrap(u'a'), buf, 5)
- assert rffi.wcharp2unicode(buf) == 'a'
- rffi.free_wcharp(buf)
-
def test_fromstring(self, space):
s = rffi.str2charp(u'sp\x09m'.encode("utf-8"))
w_res = PyUnicode_FromString(space, s)
@@ -536,25 +499,6 @@
w_res = PyUnicode_Concat(space, space.wrap(u'a'), space.wrap(u'b'))
assert space.unicode_w(w_res) == u'ab'
- def test_copy(self, space):
- w_x = space.wrap(u"abcd\u0660")
- count1 = space.int_w(space.len(w_x))
- target_chunk = lltype.malloc(rffi.CWCHARP.TO, count1, flavor='raw')
-
- x_chunk = PyUnicode_AS_UNICODE(space, w_x)
- Py_UNICODE_COPY(space, target_chunk, x_chunk, 4)
- w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, 4))
-
- assert space.eq_w(w_y, space.wrap(u"abcd"))
-
- size = PyUnicode_GET_SIZE(space, w_x)
- Py_UNICODE_COPY(space, target_chunk, x_chunk, size)
- w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, size))
-
- assert space.eq_w(w_y, w_x)
-
- lltype.free(target_chunk, flavor='raw')
-
def test_ascii_codec(self, space):
s = 'abcdefg'
data = rffi.str2charp(s)
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -64,7 +64,7 @@
py_unicode = rffi.cast(PyUnicodeObject, py_obj)
s = space.unicode_w(w_obj)
py_unicode.c_length = len(s)
- py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO)
+ py_unicode.c_str = rffi.unicode2wcharp(s)
py_unicode.c_hash = space.hash_w(space.newunicode(s))
py_unicode.c_defenc = lltype.nullptr(PyObject.TO)
@@ -87,7 +87,7 @@
py_unicode = rffi.cast(PyUnicodeObject, py_obj)
decref(space, py_unicode.c_defenc)
if py_unicode.c_str:
- lltype.free(py_unicode.c_str, flavor="raw")
+ rffi.free_wcharp(py_unicode.c_str)
from pypy.module.cpyext.object import _dealloc
_dealloc(space, py_obj)
@@ -189,82 +189,6 @@
"""Get the maximum ordinal for a Unicode character."""
return runicode.UNICHR(runicode.MAXUNICODE)
-@cpython_api([rffi.VOIDP], rffi.CCHARP, error=CANNOT_FAIL)
-def PyUnicode_AS_DATA(space, ref):
- """Return a pointer to the internal buffer of the object. o has to be a
- PyUnicodeObject (not checked)."""
- return rffi.cast(rffi.CCHARP, PyUnicode_AS_UNICODE(space, ref))
-
-@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL)
-def PyUnicode_GET_DATA_SIZE(space, w_obj):
- """Return the size of the object's internal buffer in bytes. o has to be a
- PyUnicodeObject (not checked)."""
- return rffi.sizeof(Py_UNICODE) * PyUnicode_GET_SIZE(space, w_obj)
-
-@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL)
-def PyUnicode_GET_SIZE(space, w_obj):
- """Return the size of the object. obj is a PyUnicodeObject (not
- checked)."""
- return space.len_w(w_obj)
-
-@cpython_api([rffi.VOIDP], rffi.CWCHARP, error=CANNOT_FAIL)
-def PyUnicode_AS_UNICODE(space, ref):
- """Return a pointer to the internal Py_UNICODE buffer of the object. ref
- has to be a PyUnicodeObject (not checked)."""
- ref_unicode = rffi.cast(PyUnicodeObject, ref)
- if not ref_unicode.c_str:
- # Copy unicode buffer
- w_unicode = from_ref(space, rffi.cast(PyObject, ref))
- u = space.unicode_w(w_unicode)
- ref_unicode.c_str = rffi.unicode2wcharp(u)
- return ref_unicode.c_str
-
-@cpython_api([PyObject], rffi.CWCHARP)
-def PyUnicode_AsUnicode(space, ref):
- """Return a read-only pointer to the Unicode object's internal Py_UNICODE
- buffer, NULL if unicode is not a Unicode object."""
- # Don't use PyUnicode_Check, it will realize the object :-(
- w_type = from_ref(space, rffi.cast(PyObject, ref.c_ob_type))
- if not space.issubtype_w(w_type, space.w_unicode):
- raise oefmt(space.w_TypeError, "expected unicode object")
- return PyUnicode_AS_UNICODE(space, rffi.cast(rffi.VOIDP, ref))
-
-@cpython_api([PyObject], Py_ssize_t, error=-1)
-def PyUnicode_GetSize(space, ref):
- if from_ref(space, rffi.cast(PyObject, ref.c_ob_type)) is space.w_unicode:
- ref = rffi.cast(PyUnicodeObject, ref)
- return ref.c_length
- else:
- w_obj = from_ref(space, ref)
- return space.len_w(w_obj)
-
-@cpython_api([PyUnicodeObject, rffi.CWCHARP, Py_ssize_t], Py_ssize_t, error=-1)
-def PyUnicode_AsWideChar(space, ref, buf, size):
- """Copy the Unicode object contents into the wchar_t buffer w. At most
- size wchar_t characters are copied (excluding a possibly trailing
- 0-termination character). Return the number of wchar_t characters
- copied or -1 in case of an error. Note that the resulting wchar_t
- string may or may not be 0-terminated. It is the responsibility of the caller
- to make sure that the wchar_t string is 0-terminated in case this is
- required by the application."""
- c_str = PyUnicode_AS_UNICODE(space, rffi.cast(rffi.VOIDP, ref))
- c_length = ref.c_length
-
- # If possible, try to copy the 0-termination as well
- if size > c_length:
- size = c_length + 1
-
-
- i = 0
- while i < size:
- buf[i] = c_str[i]
- i += 1
-
- if size > c_length:
- return c_length
- else:
- return size
-
@cpython_api([], rffi.CCHARP, error=CANNOT_FAIL)
def PyUnicode_GetDefaultEncoding(space):
"""Returns the currently active default encoding."""
@@ -327,27 +251,6 @@
return unicodeobject.encode_object(space, w_unicode, 'unicode-escape',
'strict')
-@cpython_api([CONST_WSTRING, Py_ssize_t], PyObject, result_is_ll=True)
-def PyUnicode_FromUnicode(space, wchar_p, length):
- """Create a Unicode Object from the Py_UNICODE buffer u of the given size. u
- may be NULL which causes the contents to be undefined. It is the user's
- responsibility to fill in the needed data. The buffer is copied into the new
- object. If the buffer is not NULL, the return value might be a shared object.
- Therefore, modification of the resulting Unicode object is only allowed when u
- is NULL."""
- if wchar_p:
- s = rffi.wcharpsize2unicode(wchar_p, length)
- return make_ref(space, space.newunicode(s))
- else:
- return rffi.cast(PyObject, new_empty_unicode(space, length))
-
-@cpython_api([CONST_WSTRING, Py_ssize_t], PyObject, result_is_ll=True)
-def PyUnicode_FromWideChar(space, wchar_p, length):
- """Create a Unicode object from the wchar_t buffer w of the given size.
- Return NULL on failure."""
- # PyPy supposes Py_UNICODE == wchar_t
- return PyUnicode_FromUnicode(space, wchar_p, length)
-
@cpython_api([PyObject, CONST_STRING], PyObject, result_is_ll=True)
def _PyUnicode_AsDefaultEncodedString(space, ref, errors):
# Returns a borrowed reference.
@@ -679,13 +582,6 @@
"""Concat two strings giving a new Unicode string."""
return space.add(w_left, w_right)
-@cpython_api([rffi.CWCHARP, rffi.CWCHARP, Py_ssize_t], lltype.Void)
-def Py_UNICODE_COPY(space, target, source, length):
- """Roughly equivalent to memcpy() only the base size is Py_UNICODE
- copies sizeof(Py_UNICODE) * length bytes from source to target"""
- for i in range(0, length):
- target[i] = source[i]
-
@cpython_api([PyObject, PyObject], PyObject)
def PyUnicode_Format(space, w_format, w_args):
"""Return a new string object from format and args; this is analogous to
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit