https://github.com/python/cpython/commit/c497694f772763f9e8642603d9f6627675ba64c4
commit: c497694f772763f9e8642603d9f6627675ba64c4
branch: main
author: Victor Stinner <[email protected]>
committer: vstinner <[email protected]>
date: 2025-09-22T23:36:05+02:00
summary:
gh-139156: Use PyBytesWriter in UTF-16 encoder (#139233)
Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the
PyBytesWriter API.
files:
M Objects/unicodeobject.c
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 06caf1dc054019..f348c2f18f8fd3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6407,32 +6407,15 @@ _PyUnicode_EncodeUTF16(PyObject *str,
const char *errors,
int byteorder)
{
- int kind;
- const void *data;
- Py_ssize_t len;
- PyObject *v;
- unsigned short *out;
- Py_ssize_t pairs;
-#if PY_BIG_ENDIAN
- int native_ordering = byteorder >= 0;
-#else
- int native_ordering = byteorder <= 0;
-#endif
- const char *encoding;
- Py_ssize_t nsize, pos;
- PyObject *errorHandler = NULL;
- PyObject *exc = NULL;
- PyObject *rep = NULL;
-
if (!PyUnicode_Check(str)) {
PyErr_BadArgument();
return NULL;
}
- kind = PyUnicode_KIND(str);
- data = PyUnicode_DATA(str);
- len = PyUnicode_GET_LENGTH(str);
+ int kind = PyUnicode_KIND(str);
+ const void *data = PyUnicode_DATA(str);
+ Py_ssize_t len = PyUnicode_GET_LENGTH(str);
- pairs = 0;
+ Py_ssize_t pairs = 0;
if (kind == PyUnicode_4BYTE_KIND) {
const Py_UCS4 *in = (const Py_UCS4 *)data;
const Py_UCS4 *end = in + len;
@@ -6445,27 +6428,48 @@ _PyUnicode_EncodeUTF16(PyObject *str,
if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
return PyErr_NoMemory();
}
- nsize = len + pairs + (byteorder == 0);
- v = PyBytes_FromStringAndSize(NULL, nsize * 2);
- if (v == NULL) {
+ Py_ssize_t nsize = len + pairs + (byteorder == 0);
+
+#if PY_BIG_ENDIAN
+ int native_ordering = byteorder >= 0;
+#else
+ int native_ordering = byteorder <= 0;
+#endif
+
+ if (kind == PyUnicode_1BYTE_KIND) {
+ PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 2);
+ if (v == NULL) {
+ return NULL;
+ }
+
+ /* output buffer is 2-bytes aligned */
+ assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
+ unsigned short *out = (unsigned short *)PyBytes_AS_STRING(v);
+ if (byteorder == 0) {
+ *out++ = 0xFEFF;
+ }
+ if (len > 0) {
+ ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
+ }
+ return v;
+ }
+
+ PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
+ if (writer == NULL) {
return NULL;
}
/* output buffer is 2-bytes aligned */
- assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
- out = (unsigned short *)PyBytes_AS_STRING(v);
+ assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
+ unsigned short *out = PyBytesWriter_GetData(writer);
if (byteorder == 0) {
*out++ = 0xFEFF;
}
if (len == 0) {
- goto done;
- }
-
- if (kind == PyUnicode_1BYTE_KIND) {
- ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
- goto done;
+ return PyBytesWriter_Finish(writer);
}
+ const char *encoding;
if (byteorder < 0) {
encoding = "utf-16-le";
}
@@ -6476,10 +6480,11 @@ _PyUnicode_EncodeUTF16(PyObject *str,
encoding = "utf-16";
}
- pos = 0;
- while (pos < len) {
- Py_ssize_t newpos, repsize, moreunits;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ PyObject *rep = NULL;
+ for (Py_ssize_t pos = 0; pos < len; ) {
if (kind == PyUnicode_2BYTE_KIND) {
pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
&out, native_ordering);
@@ -6492,6 +6497,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
if (pos == len)
break;
+ Py_ssize_t newpos;
rep = unicode_encode_call_errorhandler(
errors, &errorHandler,
encoding, "surrogates not allowed",
@@ -6499,6 +6505,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
if (!rep)
goto error;
+ Py_ssize_t repsize, moreunits;
if (PyBytes_Check(rep)) {
repsize = PyBytes_GET_SIZE(rep);
if (repsize & 1) {
@@ -6524,21 +6531,17 @@ _PyUnicode_EncodeUTF16(PyObject *str,
/* two bytes are reserved for each surrogate */
if (moreunits > 0) {
- Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
- if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
- /* integer overflow */
- PyErr_NoMemory();
+ out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
+ if (out == NULL) {
goto error;
}
- if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
- goto error;
- out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
}
if (PyBytes_Check(rep)) {
memcpy(out, PyBytes_AS_STRING(rep), repsize);
out += repsize / 2;
- } else /* rep is unicode */ {
+ } else {
+ /* rep is unicode */
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
&out, native_ordering);
@@ -6547,23 +6550,20 @@ _PyUnicode_EncodeUTF16(PyObject *str,
Py_CLEAR(rep);
}
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+
/* Cut back to size actually needed. This is necessary for, for example,
encoding of a string containing isolated surrogates and the 'ignore' handler
is used. */
- nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
- if (nsize != PyBytes_GET_SIZE(v))
- _PyBytes_Resize(&v, nsize);
- Py_XDECREF(errorHandler);
- Py_XDECREF(exc);
- done:
- return v;
+ return PyBytesWriter_FinishWithPointer(writer, out);
+
error:
Py_XDECREF(rep);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
- Py_XDECREF(v);
+ PyBytesWriter_Discard(writer);
return NULL;
-#undef STORECHAR
}
PyObject *
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]