Author: Armin Rigo <ar...@tunes.org> Branch: Changeset: r2985:5e554e5c0dfc Date: 2017-06-19 12:06 +0200 http://bitbucket.org/cffi/cffi/changeset/5e554e5c0dfc/
Log: Write a copy of wchar_helper.h that uses CPython 3.3's new unicode
     string API. It makes sense anyway for speed reasons, but it also
     avoids even more special cases for platforms with 16-bit wchar_t on
     CPython >= 3.3.

diff --git a/c/_cffi_backend.c b/c/_cffi_backend.c
--- a/c/_cffi_backend.c
+++ b/c/_cffi_backend.c
@@ -286,7 +286,11 @@
 # include "file_emulator.h"
 #endif
 
-#include "wchar_helper.h"
+#ifdef PyUnicode_KIND      /* Python >= 3.3 */
+# include "wchar_helper_3.h"
+#else
+# include "wchar_helper.h"
+#endif
 
 #include "../cffi/_cffi_errors.h"
 
diff --git a/c/wchar_helper_3.h b/c/wchar_helper_3.h
new file mode 100644
--- /dev/null
+++ b/c/wchar_helper_3.h
@@ -0,0 +1,185 @@
+/*
+ * wchar_t helpers, version CPython >= 3.3.
+ *
+ * CPython 3.3 added support for sys.maxunicode == 0x10FFFF on all
+ * platforms, even ones with wchar_t limited to 2 bytes. As such,
+ * this code here works from the outside like wchar_helper.h in the
+ * case Py_UNICODE_SIZE == 4, but the implementation is very different.
+ */
+
+typedef uint16_t cffi_char16_t;
+typedef uint32_t cffi_char32_t;
+
+
+/* Build a unicode object from 'size' 32-bit code units, one per character. */
+static PyObject *
+_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
+{
+    return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, w, size);
+}
+
+/* Build a unicode object from 'size' 16-bit code units.  Each high
+ * surrogate immediately followed by a low surrogate is merged into one
+ * character above 0xFFFF; unpaired surrogates are copied unchanged.
+ * Returns NULL when the underlying CPython call fails.
+ */
+static PyObject *
+_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
+{
+    /* are there any surrogate pairs, and if so, how many? */
+    Py_ssize_t i, count_surrogates = 0;
+    for (i = 0; i < size - 1; i++) {
+        if (0xD800 <= w[i] && w[i] <= 0xDBFF &&
+            0xDC00 <= w[i+1] && w[i+1] <= 0xDFFF)
+            count_surrogates++;
+    }
+    if (count_surrogates == 0) {
+        /* no, fast path */
+        return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, w, size);
+    }
+    else
+    {
+        /* each pair collapses into a single character */
+        PyObject *result = PyUnicode_New(size - count_surrogates, 0x10FFFF);
+        Py_UCS4 *data;
+        if (result == NULL)
+            return NULL;    /* PyUnicode_New() failed */
+        assert(PyUnicode_KIND(result) == PyUnicode_4BYTE_KIND);
+        data = PyUnicode_4BYTE_DATA(result);
+
+        for (i = 0; i < size; i++)
+        {
+            cffi_char32_t ch = w[i];
+            if (0xD800 <= ch && ch <= 0xDBFF && i < size - 1) {
+                cffi_char32_t ch2 = w[i + 1];
+                if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+                    ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+                    i++;
+                }
+            }
+            *data++ = ch;
+        }
+        return result;
+    }
+}
+
+/* Store the single character of 'unicode' into '*result'.  Returns 0 on
+ * success; returns -1 after writing a description into 'err_got' if the
+ * string is not of length 1 or the character is larger than 0xFFFF.
+ */
+static int
+_my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
+                             char *err_got)
+{
+    cffi_char32_t ch;
+    if (PyUnicode_GET_LENGTH(unicode) != 1) {
+        sprintf(err_got, "unicode string of length %zd",
+                PyUnicode_GET_LENGTH(unicode));
+        return -1;
+    }
+    ch = PyUnicode_READ_CHAR(unicode, 0);
+
+    if (ch > 0xFFFF)
+    {
+        sprintf(err_got, "larger-than-0xFFFF character");
+        return -1;
+    }
+    *result = (cffi_char16_t)ch;
+    return 0;
+}
+
+/* Store the single character of 'unicode' into '*result'.  Returns 0 on
+ * success; returns -1 after writing a description into 'err_got' if the
+ * string is not of length 1.
+ */
+static int
+_my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
+                             char *err_got)
+{
+    if (PyUnicode_GET_LENGTH(unicode) != 1) {
+        sprintf(err_got, "unicode string of length %zd",
+                PyUnicode_GET_LENGTH(unicode));
+        return -1;
+    }
+    *result = PyUnicode_READ_CHAR(unicode, 0);
+    return 0;
+}
+
+/* Number of 16-bit units needed for 'unicode': its length, plus one
+ * extra unit for every character larger than 0xFFFF.
+ */
+static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
+{
+    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
+    Py_ssize_t result = length;
+    unsigned int kind = PyUnicode_KIND(unicode);
+
+    /* only 4-byte strings can hold characters larger than 0xFFFF */
+    if (kind == PyUnicode_4BYTE_KIND)
+    {
+        Py_UCS4 *data = PyUnicode_4BYTE_DATA(unicode);
+        Py_ssize_t i;
+        for (i = 0; i < length; i++) {
+            if (data[i] > 0xFFFF)
+                result++;
+        }
+    }
+    return result;
+}
+
+/* Number of 32-bit units needed for 'unicode': exactly its length. */
+static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
+{
+    return PyUnicode_GET_LENGTH(unicode);
+}
+
+/* Write the characters of 'unicode' into 'result' as 16-bit units,
+ * splitting each character larger than 0xFFFF into a surrogate pair.
+ * 'resultlen' is not consulted here.  Returns 0, or -1 with a
+ * ValueError set if a character is larger than 0x10FFFF.
+ */
+static int _my_PyUnicode_AsChar16(PyObject *unicode,
+                                  cffi_char16_t *result,
+                                  Py_ssize_t resultlen)
+{
+    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+    unsigned int kind = PyUnicode_KIND(unicode);
+    void *data = PyUnicode_DATA(unicode);
+    Py_ssize_t i;
+
+    for (i = 0; i < len; i++) {
+        cffi_char32_t ordinal = PyUnicode_READ(kind, data, i);
+        if (ordinal > 0xFFFF) {
+            if (ordinal > 0x10FFFF) {
+                PyErr_Format(PyExc_ValueError,
+                             "unicode character out of range for "
+                             "conversion to char16_t: 0x%x", (int)ordinal);
+                return -1;
+            }
+            ordinal -= 0x10000;
+            *result++ = 0xD800 | (ordinal >> 10);
+            *result++ = 0xDC00 | (ordinal & 0x3FF);
+        }
+        else
+            *result++ = ordinal;
+    }
+    return 0;
+}
+
+/* Write the characters of 'unicode' into 'result', one 32-bit unit
+ * per character.  'resultlen' is not consulted here.  Always returns 0.
+ */
+static int _my_PyUnicode_AsChar32(PyObject *unicode,
+                                  cffi_char32_t *result,
+                                  Py_ssize_t resultlen)
+{
+    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+    unsigned int kind = PyUnicode_KIND(unicode);
+    void *data = PyUnicode_DATA(unicode);
+    Py_ssize_t i;
+
+    for (i = 0; i < len; i++)
+        result[i] = PyUnicode_READ(kind, data, i);
+
+    return 0;
+}
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit