https://github.com/python/cpython/commit/f693f8422700f889f90ab548ceeb24704a2ae952 commit: f693f8422700f889f90ab548ceeb24704a2ae952 branch: main author: Bénédikt Tran <10796600+picn...@users.noreply.github.com> committer: picnixz <10796600+picn...@users.noreply.github.com> date: 2025-03-03T11:43:22Z summary:
gh-129173: simplify `PyCodec_XMLCharRefReplaceErrors` logic (#129894) Writing the decimal representation of a Unicode codepoint only requires knowing the number of digits. --------- Co-authored-by: Petr Viktorin <encu...@gmail.com> files: M Python/codecs.c diff --git a/Python/codecs.c b/Python/codecs.c index b876b816f688a0..d5d9a4a8bcabb7 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -730,6 +730,25 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) } +/* + * Determine the number of digits for a decimal representation of Unicode + * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits). + */ +static inline int +n_decimal_digits_for_codepoint(Py_UCS4 ch) +{ + if (ch < 10) return 1; + if (ch < 100) return 2; + if (ch < 1000) return 3; + if (ch < 10000) return 4; + if (ch < 100000) return 5; + if (ch < 1000000) return 6; + if (ch < 10000000) return 7; + // Unicode codepoints are limited to 1114111 (7 decimal digits) + Py_UNREACHABLE(); +} + + /* * Create a Unicode string containing 'count' copies of the official * Unicode REPLACEMENT CHARACTER (0xFFFD). 
@@ -867,9 +886,12 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) } } + +// --- handler: 'xmlcharrefreplace' ------------------------------------------- + PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) { - if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { + if (!_PyIsUnicodeEncodeError(exc)) { wrong_exception_type(exc); return NULL; } @@ -896,30 +918,11 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) Py_ssize_t ressize = 0; for (Py_ssize_t i = start; i < end; ++i) { - /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); - if (ch < 10) { - ressize += 2 + 1 + 1; - } - else if (ch < 100) { - ressize += 2 + 2 + 1; - } - else if (ch < 1000) { - ressize += 2 + 3 + 1; - } - else if (ch < 10000) { - ressize += 2 + 4 + 1; - } - else if (ch < 100000) { - ressize += 2 + 5 + 1; - } - else if (ch < 1000000) { - ressize += 2 + 6 + 1; - } - else { - assert(ch < 10000000); - ressize += 2 + 7 + 1; - } + int k = n_decimal_digits_for_codepoint(ch); + assert(k != 0); + assert(k <= 7); + ressize += 2 + k + 1; } /* allocate replacement */ @@ -931,45 +934,20 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); /* generate replacement */ for (Py_ssize_t i = start; i < end; ++i) { - int digits, base; Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); - if (ch < 10) { - digits = 1; - base = 1; - } - else if (ch < 100) { - digits = 2; - base = 10; - } - else if (ch < 1000) { - digits = 3; - base = 100; - } - else if (ch < 10000) { - digits = 4; - base = 1000; - } - else if (ch < 100000) { - digits = 5; - base = 10000; - } - else if (ch < 1000000) { - digits = 6; - base = 100000; - } - else { - assert(ch < 10000000); - digits = 7; - base = 1000000; - } + /* + * Write the decimal representation of 'ch' to the buffer pointed by 'p' + * using at most 7 characters prefixed by '&#' and suffixed by ';'. 
+ */ *outp++ = '&'; *outp++ = '#'; - while (digits-- > 0) { - assert(base >= 1); - *outp++ = '0' + ch / base; - ch %= base; - base /= 10; + Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); + for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) { + *p_digit = '0' + (ch % 10); + ch /= 10; } + assert(ch == 0); + outp = digit_end; *outp++ = ';'; } assert(_PyUnicode_CheckConsistency(res, 1)); @@ -1517,7 +1495,8 @@ replace_errors(PyObject *Py_UNUSED(self), PyObject *exc) } -static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) +static inline PyObject * +xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) { return PyCodec_XMLCharRefReplaceErrors(exc); } _______________________________________________ Python-checkins mailing list -- python-checkins@python.org To unsubscribe send an email to python-checkins-le...@python.org https://mail.python.org/mailman3/lists/python-checkins.python.org/ Member address: arch...@mail-archive.com