Author: Matti Picus <matti.pi...@gmail.com> Branch: unicode-from-unicode-in-c Changeset: r95260:d61f78777f2d Date: 2018-10-28 19:35 +0200 http://bitbucket.org/pypy/pypy/changeset/d61f78777f2d/
Log: Move some pyunicode functions to pure C diff --git a/pypy/module/cpyext/api.py b/pypy/module/cpyext/api.py --- a/pypy/module/cpyext/api.py +++ b/pypy/module/cpyext/api.py @@ -590,8 +590,10 @@ 'Py_FatalError', 'PyOS_snprintf', 'PyOS_vsnprintf', 'PyArg_Parse', 'PyArg_ParseTuple', 'PyArg_UnpackTuple', 'PyArg_ParseTupleAndKeywords', 'PyArg_VaParse', 'PyArg_VaParseTupleAndKeywords', '_PyArg_NoKeywords', - 'PyString_FromFormat', 'PyString_FromFormatV', - 'PyUnicode_FromFormat', 'PyUnicode_FromFormatV', + 'PyString_FromFormat', 'PyString_FromFormatV', + 'PyUnicode_FromFormat', 'PyUnicode_FromFormatV', 'PyUnicode_FromUnicode', + 'PyUnicode_FromWideChar', 'PyUnicode_AsUnicode', 'PyUnicode_GetSize', + 'PyUnicode_AsWideChar', 'PyModule_AddObject', 'PyModule_AddIntConstant', 'PyModule_AddStringConstant', 'Py_BuildValue', 'Py_VaBuildValue', 'PyTuple_Pack', '_PyArg_Parse_SizeT', '_PyArg_ParseTuple_SizeT', @@ -1185,7 +1187,7 @@ state.C.get_pyos_inputhook = rffi.llexternal( '_PyPy_get_PyOS_InputHook', [], FUNCPTR, compilation_info=eci, _nowrapper=True) - + def init_function(func): INIT_FUNCTIONS.append(func) diff --git a/pypy/module/cpyext/include/unicodeobject.h b/pypy/module/cpyext/include/unicodeobject.h --- a/pypy/module/cpyext/include/unicodeobject.h +++ b/pypy/module/cpyext/include/unicodeobject.h @@ -9,11 +9,30 @@ PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char *format, va_list vargs); PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char *format, ...); +PyAPI_FUNC(PyObject *) PyUnicode_FromUnicode(const wchar_t *u, Py_ssize_t size); +PyAPI_FUNC(PyObject *) PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size); +PyAPI_FUNC(wchar_t*) PyUnicode_AsUnicode(PyObject *unicode); +PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(PyObject *unicode); +PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(PyUnicodeObject *unicode, + wchar_t *w, Py_ssize_t size); +#define Py_UNICODE_COPY(target, source, length) \ + Py_MEMCPY((target), (source), (length)*sizeof(wchar_t)) #define PyUnicode_Check(op) \ PyType_FastSubclass((op)->ob_type, Py_TPFLAGS_UNICODE_SUBCLASS) #define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type) +/* Fast access macros */ +#define PyUnicode_GET_SIZE(op) \ + (((PyUnicodeObject *)(op))->length) +#define PyUnicode_GET_DATA_SIZE(op) \ + (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)) +#define PyUnicode_AS_UNICODE(op) \ + (((PyUnicodeObject *)(op))->str) +#define PyUnicode_AS_DATA(op) \ + ((const char *)((PyUnicodeObject *)(op))->str) + + #ifdef __cplusplus } #endif diff --git a/pypy/module/cpyext/src/unicodeobject.c b/pypy/module/cpyext/src/unicodeobject.c --- a/pypy/module/cpyext/src/unicodeobject.c +++ b/pypy/module/cpyext/src/unicodeobject.c @@ -420,4 +420,154 @@ return ret; } +/* The empty Unicode object is shared to improve performance. */ +static PyUnicodeObject *unicode_empty = NULL; +#define _Py_RETURN_UNICODE_EMPTY() \ + do { \ + if (unicode_empty != NULL) \ + Py_INCREF(unicode_empty); \ + else { \ + unicode_empty = _PyUnicode_New(0); \ + if (unicode_empty != NULL) \ + Py_INCREF(unicode_empty); \ + } \ + return (PyObject *)unicode_empty; \ + } while (0) + +/* We allocate one more byte to make sure the string is + Ux0000 terminated; some code relies on that. +*/ + +static +PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) +{ + register PyUnicodeObject *unicode; + size_t new_size; + + /* Optimization for empty strings */ + if (length == 0 && unicode_empty != NULL) { + Py_INCREF(unicode_empty); + return unicode_empty; + } + + /* Ensure we won't overflow the size. */ + if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { + return (PyUnicodeObject *)PyErr_NoMemory(); + } + + unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); + if (unicode == NULL) + return NULL; + new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); + unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); + + if (!unicode->str) { + PyErr_NoMemory(); + goto onError; + } + /* Initialize the first element to guard against cases where + * the caller fails before initializing str -- unicode_resize() + * reads str[0], and the Keep-Alive optimization can keep memory + * allocated for str alive across a call to unicode_dealloc(unicode). + * We don't want unicode_resize to read uninitialized memory in + * that case. + */ + unicode->str[0] = 0; + unicode->str[length] = 0; + unicode->length = length; + unicode->hash = -1; + unicode->defenc = NULL; + return unicode; + + onError: + /* XXX UNREF/NEWREF interface should be more symmetrical */ + PyObject_Del(unicode); + return NULL; +} + + +PyObject* +PyUnicode_FromUnicode(const wchar_t *u, Py_ssize_t size) +{ + /* Create a Unicode Object from the Py_UNICODE buffer u of the given size. u + * may be NULL which causes the contents to be undefined. It is the user's + * responsibility to fill in the needed data. The buffer is copied into the new + * object. If the buffer is not NULL, the return value might be a shared object. + * Therefore, modification of the resulting Unicode object is only allowed when u + * is NULL. + */ + PyUnicodeObject *unicode; + + /* Optimization for empty strings */ + if (size == 0) + _Py_RETURN_UNICODE_EMPTY(); + + + unicode = _PyUnicode_New(size); + if (!unicode) + return NULL; + + /* Copy the Unicode data into the new object */ + if (u != NULL) + Py_UNICODE_COPY(unicode->str, (wchar_t*)u, size); + + return (PyObject *)unicode; +} + +PyObject* +PyUnicode_FromWideChar(const wchar_t *char_p, Py_ssize_t length) +{ + /* + * Create a Unicode object from the wchar_t buffer w of the given size. + * Return NULL on failure. + * PyPy supposes Py_UNICODE == wchar_t + */ + return PyUnicode_FromUnicode(char_p, length); +} + +wchar_t *PyUnicode_AsUnicode(PyObject *unicode) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + goto onError; + } + return PyUnicode_AS_UNICODE(unicode); + + onError: + return NULL; +} + +Py_ssize_t PyUnicode_GetSize(PyObject *unicode) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + goto onError; + } + return PyUnicode_GET_SIZE(unicode); + + onError: + return -1; +} + +Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, + wchar_t *w, + Py_ssize_t size) +{ + if (unicode == NULL) { + PyErr_BadInternalCall(); + return -1; + } + + /* If possible, try to copy the 0-termination as well */ + if (size > PyUnicode_GET_SIZE(unicode)) + size = PyUnicode_GET_SIZE(unicode) + 1; + + memcpy(w, unicode->str, size * sizeof(wchar_t)); + if (size > PyUnicode_GET_SIZE(unicode)) + return PyUnicode_GET_SIZE(unicode); + else + return size; +} + + diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py --- a/pypy/module/cpyext/test/test_unicodeobject.py +++ b/pypy/module/cpyext/test/test_unicodeobject.py @@ -9,6 +9,7 @@ from rpython.rtyper.lltypesystem import rffi, lltype import sys, py from pypy.module.cpyext.unicodeobject import * +from pypy.module.cpyext.state import State class AppTestUnicodeObject(AppTestCpythonExtensionBase): def test_unicodeobject(self): @@ -116,16 +117,23 @@ """ PyObject* o = PyUnicode_FromString(""); PyUnicodeObject* u = (PyUnicodeObject*)o; + Py_ssize_t n; + wchar_t * c; - PyUnicode_GET_SIZE(u); - PyUnicode_GET_SIZE(o); + n = PyUnicode_GET_SIZE(u); + n += PyUnicode_GET_SIZE(o); - PyUnicode_GET_DATA_SIZE(u); - PyUnicode_GET_DATA_SIZE(o); + n += PyUnicode_GET_DATA_SIZE(u); + n += PyUnicode_GET_DATA_SIZE(o); - PyUnicode_AS_UNICODE(o); - PyUnicode_AS_UNICODE(u); - return o; + c = PyUnicode_AS_UNICODE(o); + c = PyUnicode_AS_UNICODE(u); + if (c == NULL) { + return o; + } + else { + return o; + } """)]) assert module.test_macro_invocations() == u'' @@ -159,11 +167,6 @@ class TestUnicode(BaseApiTest): def test_unicodeobject(self, space): - assert PyUnicode_GET_SIZE(space, space.wrap(u'sp�m')) == 4 - assert PyUnicode_GetSize(space, space.wrap(u'sp�m')) == 4 - unichar = rffi.sizeof(Py_UNICODE) - assert PyUnicode_GET_DATA_SIZE(space, space.wrap(u'sp�m')) == 4 * unichar - encoding = rffi.charp2str(PyUnicode_GetDefaultEncoding(space, )) w_default_encoding = space.call_function( space.sys.get('getdefaultencoding') @@ -184,46 +187,6 @@ rffi.free_charp(utf_8) rffi.free_charp(prev_encoding) - def test_AS(self, space): - word = space.wrap(u'spam') - array = rffi.cast(rffi.CWCHARP, PyUnicode_AS_DATA(space, word)) - array2 = PyUnicode_AS_UNICODE(space, word) - array3 = PyUnicode_AsUnicode(space, word) - for (i, char) in enumerate(space.unicode_w(word)): - assert array[i] == char - assert array2[i] == char - assert array3[i] == char - with raises_w(space, TypeError): - PyUnicode_AsUnicode(space, space.wrap('spam')) - - utf_8 = rffi.str2charp('utf-8') - encoded = PyUnicode_AsEncodedString(space, space.wrap(u'sp�m'), - utf_8, None) - assert space.unwrap(encoded) == 'sp\xef\xbf\xbdm' - encoded_obj = PyUnicode_AsEncodedObject(space, space.wrap(u'sp�m'), - utf_8, None) - assert space.eq_w(encoded, encoded_obj) - with raises_w(space, TypeError): - PyUnicode_AsEncodedString( - space, space.newtuple([1, 2, 3]), None, None) - with raises_w(space, TypeError): - PyUnicode_AsEncodedString(space, space.wrap(''), None, None) - ascii = rffi.str2charp('ascii') - replace = rffi.str2charp('replace') - encoded = PyUnicode_AsEncodedString(space, space.wrap(u'sp�m'), - ascii, replace) - assert space.unwrap(encoded) == 'sp?m' - rffi.free_charp(utf_8) - rffi.free_charp(replace) - rffi.free_charp(ascii) - - buf = rffi.unicode2wcharp(u"12345") - PyUnicode_AsWideChar(space, space.wrap(u'longword'), buf, 5) - assert rffi.wcharp2unicode(buf) == 'longw' - PyUnicode_AsWideChar(space, space.wrap(u'a'), buf, 5) - assert rffi.wcharp2unicode(buf) == 'a' - rffi.free_wcharp(buf) - def test_fromstring(self, space): s = rffi.str2charp(u'sp\x09m'.encode("utf-8")) w_res = PyUnicode_FromString(space, s) @@ -536,25 +499,6 @@ w_res = PyUnicode_Concat(space, space.wrap(u'a'), space.wrap(u'b')) assert space.unicode_w(w_res) == u'ab' - def test_copy(self, space): - w_x = space.wrap(u"abcd\u0660") - count1 = space.int_w(space.len(w_x)) - target_chunk = lltype.malloc(rffi.CWCHARP.TO, count1, flavor='raw') - - x_chunk = PyUnicode_AS_UNICODE(space, w_x) - Py_UNICODE_COPY(space, target_chunk, x_chunk, 4) - w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, 4)) - - assert space.eq_w(w_y, space.wrap(u"abcd")) - - size = PyUnicode_GET_SIZE(space, w_x) - Py_UNICODE_COPY(space, target_chunk, x_chunk, size) - w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, size)) - - assert space.eq_w(w_y, w_x) - - lltype.free(target_chunk, flavor='raw') - def test_ascii_codec(self, space): s = 'abcdefg' data = rffi.str2charp(s) diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py --- a/pypy/module/cpyext/unicodeobject.py +++ b/pypy/module/cpyext/unicodeobject.py @@ -64,7 +64,7 @@ py_unicode = rffi.cast(PyUnicodeObject, py_obj) s = space.unicode_w(w_obj) py_unicode.c_length = len(s) - py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO) + py_unicode.c_str = rffi.unicode2wcharp(s) py_unicode.c_hash = space.hash_w(space.newunicode(s)) py_unicode.c_defenc = lltype.nullptr(PyObject.TO) @@ -87,7 +87,7 @@ py_unicode = rffi.cast(PyUnicodeObject, py_obj) decref(space, py_unicode.c_defenc) if py_unicode.c_str: - lltype.free(py_unicode.c_str, flavor="raw") + rffi.free_wcharp(py_unicode.c_str) from pypy.module.cpyext.object import _dealloc _dealloc(space, py_obj) @@ -189,82 +189,6 @@ """Get the maximum ordinal for a Unicode character.""" return runicode.UNICHR(runicode.MAXUNICODE) -@cpython_api([rffi.VOIDP], rffi.CCHARP, error=CANNOT_FAIL) -def PyUnicode_AS_DATA(space, ref): - """Return a pointer to the internal buffer of the object. o has to be a - PyUnicodeObject (not checked).""" - return rffi.cast(rffi.CCHARP, PyUnicode_AS_UNICODE(space, ref)) - -@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL) -def PyUnicode_GET_DATA_SIZE(space, w_obj): - """Return the size of the object's internal buffer in bytes. o has to be a - PyUnicodeObject (not checked).""" - return rffi.sizeof(Py_UNICODE) * PyUnicode_GET_SIZE(space, w_obj) - -@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL) -def PyUnicode_GET_SIZE(space, w_obj): - """Return the size of the object. obj is a PyUnicodeObject (not - checked).""" - return space.len_w(w_obj) - -@cpython_api([rffi.VOIDP], rffi.CWCHARP, error=CANNOT_FAIL) -def PyUnicode_AS_UNICODE(space, ref): - """Return a pointer to the internal Py_UNICODE buffer of the object. ref - has to be a PyUnicodeObject (not checked).""" - ref_unicode = rffi.cast(PyUnicodeObject, ref) - if not ref_unicode.c_str: - # Copy unicode buffer - w_unicode = from_ref(space, rffi.cast(PyObject, ref)) - u = space.unicode_w(w_unicode) - ref_unicode.c_str = rffi.unicode2wcharp(u) - return ref_unicode.c_str - -@cpython_api([PyObject], rffi.CWCHARP) -def PyUnicode_AsUnicode(space, ref): - """Return a read-only pointer to the Unicode object's internal Py_UNICODE - buffer, NULL if unicode is not a Unicode object.""" - # Don't use PyUnicode_Check, it will realize the object :-( - w_type = from_ref(space, rffi.cast(PyObject, ref.c_ob_type)) - if not space.issubtype_w(w_type, space.w_unicode): - raise oefmt(space.w_TypeError, "expected unicode object") - return PyUnicode_AS_UNICODE(space, rffi.cast(rffi.VOIDP, ref)) - -@cpython_api([PyObject], Py_ssize_t, error=-1) -def PyUnicode_GetSize(space, ref): - if from_ref(space, rffi.cast(PyObject, ref.c_ob_type)) is space.w_unicode: - ref = rffi.cast(PyUnicodeObject, ref) - return ref.c_length - else: - w_obj = from_ref(space, ref) - return space.len_w(w_obj) - -@cpython_api([PyUnicodeObject, rffi.CWCHARP, Py_ssize_t], Py_ssize_t, error=-1) -def PyUnicode_AsWideChar(space, ref, buf, size): - """Copy the Unicode object contents into the wchar_t buffer w. At most - size wchar_t characters are copied (excluding a possibly trailing - 0-termination character). Return the number of wchar_t characters - copied or -1 in case of an error. Note that the resulting wchar_t - string may or may not be 0-terminated. It is the responsibility of the caller - to make sure that the wchar_t string is 0-terminated in case this is - required by the application.""" - c_str = PyUnicode_AS_UNICODE(space, rffi.cast(rffi.VOIDP, ref)) - c_length = ref.c_length - - # If possible, try to copy the 0-termination as well - if size > c_length: - size = c_length + 1 - - - i = 0 - while i < size: - buf[i] = c_str[i] - i += 1 - - if size > c_length: - return c_length - else: - return size - @cpython_api([], rffi.CCHARP, error=CANNOT_FAIL) def PyUnicode_GetDefaultEncoding(space): """Returns the currently active default encoding.""" @@ -327,27 +251,6 @@ return unicodeobject.encode_object(space, w_unicode, 'unicode-escape', 'strict') -@cpython_api([CONST_WSTRING, Py_ssize_t], PyObject, result_is_ll=True) -def PyUnicode_FromUnicode(space, wchar_p, length): - """Create a Unicode Object from the Py_UNICODE buffer u of the given size. u - may be NULL which causes the contents to be undefined. It is the user's - responsibility to fill in the needed data. The buffer is copied into the new - object. If the buffer is not NULL, the return value might be a shared object. - Therefore, modification of the resulting Unicode object is only allowed when u - is NULL.""" - if wchar_p: - s = rffi.wcharpsize2unicode(wchar_p, length) - return make_ref(space, space.newunicode(s)) - else: - return rffi.cast(PyObject, new_empty_unicode(space, length)) - -@cpython_api([CONST_WSTRING, Py_ssize_t], PyObject, result_is_ll=True) -def PyUnicode_FromWideChar(space, wchar_p, length): - """Create a Unicode object from the wchar_t buffer w of the given size. - Return NULL on failure.""" - # PyPy supposes Py_UNICODE == wchar_t - return PyUnicode_FromUnicode(space, wchar_p, length) - @cpython_api([PyObject, CONST_STRING], PyObject, result_is_ll=True) def _PyUnicode_AsDefaultEncodedString(space, ref, errors): # Returns a borrowed reference. @@ -679,13 +582,6 @@ """Concat two strings giving a new Unicode string.""" return space.add(w_left, w_right) -@cpython_api([rffi.CWCHARP, rffi.CWCHARP, Py_ssize_t], lltype.Void) -def Py_UNICODE_COPY(space, target, source, length): - """Roughly equivalent to memcpy() only the base size is Py_UNICODE - copies sizeof(Py_UNICODE) * length bytes from source to target""" - for i in range(0, length): - target[i] = source[i] - @cpython_api([PyObject, PyObject], PyObject) def PyUnicode_Format(space, w_format, w_args): """Return a new string object from format and args; this is analogous to _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit