This is an automated email from the ASF dual-hosted git repository. swebb2066 pushed a commit to branch remove_charset_decoder_recursion in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
commit 24d99ec986bf1bc27b54821d1c49cecab0e03018 Author: Stephen Webb <[email protected]> AuthorDate: Fri Jun 5 13:02:21 2026 +1000 Prevent possible stack overflow when decoding UTF8 --- src/main/cpp/charsetdecoder.cpp | 109 +++++++++++++++++---- src/main/cpp/transcoder.cpp | 111 ++-------------------- src/main/include/log4cxx/helpers/charsetdecoder.h | 13 ++- 3 files changed, 105 insertions(+), 128 deletions(-) diff --git a/src/main/cpp/charsetdecoder.cpp b/src/main/cpp/charsetdecoder.cpp index 90977573..fc9ce181 100644 --- a/src/main/cpp/charsetdecoder.cpp +++ b/src/main/cpp/charsetdecoder.cpp @@ -300,7 +300,7 @@ class TrivialCharsetDecoder : public CharsetDecoder }; /** -* Converts from UTF-8 to std::wstring +* Converts from UTF-8 to LogString * */ class UTF8CharsetDecoder : public CharsetDecoder @@ -319,28 +319,15 @@ class UTF8CharsetDecoder : public CharsetDecoder LogString& out) { auto availableByteCount = in.remaining(); - std::string tmp(in.current(), availableByteCount); - std::string::const_iterator nextCodePoint = tmp.begin(); - - while (nextCodePoint != tmp.end()) + while (0 < availableByteCount) { - auto lastCodePoint = nextCodePoint; - auto sv = Transcoder::decode(tmp, nextCodePoint); - - if (sv == 0xFFFF || nextCodePoint == lastCodePoint) - { - size_t offset = nextCodePoint - tmp.begin(); - in.increment_position(offset); + auto sv = getUTF8CodePoint(in); + auto nextAvailableByteCount = in.remaining(); + if (sv == 0xFFFF || nextAvailableByteCount == availableByteCount) return APR_BADCH; - } - else - { - Transcoder::encode(sv, out); - } + Transcoder::encode(sv, out); + availableByteCount = nextAvailableByteCount; } - - in.increment_position(availableByteCount); - return APR_SUCCESS; } @@ -607,8 +594,90 @@ log4cxx_status_t CharsetDecoder::decode(const char* in, size_t maxByteCount, Log return decode(buf, out); } +unsigned int CharsetDecoder::getUTF8CodePoint(ByteBuffer& in) +{ + auto availableByteCount = in.remaining(); + if (0 == availableByteCount) + return 0xFFFF; + + auto pChar = in.current(); + auto ch1 = static_cast<unsigned char>(*pChar); + if (ch1 <= 0x7F) + { + in.increment_position(1); + return ch1; + } + + // + // should not have continuation character here + // + if ((ch1 & 0xC0) != 0x80 && 1 < availableByteCount) + { + auto ch2 = static_cast<unsigned char>(*(pChar + 1)); + if ((ch2 & 0xC0) != 0x80) // not a continuation? + return 0xFFFF; + if ((ch1 & 0xE0) == 0xC0) + { + unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F); + if (rv >= 0x80) + { + in.increment_position(2); + return rv; + } + return 0xFFFF; + } + if (2 < availableByteCount) + { + auto ch3 = static_cast<unsigned char>(*(pChar + 2)); + if ((ch3 & 0xC0) != 0x80) // not a continuation? + return 0xFFFF; + if ((ch1 & 0xF0) == 0xE0) + { + unsigned int rv = ((ch1 & 0x0F) << 12) + + ((ch2 & 0x3F) << 6) + + (ch3 & 0x3F); + + // RFC 3629 §3 prohibits UTF-8 encodings of the UTF-16 surrogate + // halves (U+D800..U+DFFF); accepting them lets malformed Unicode + // cross the decode boundary into LogString and downstream output. + if (rv < 0x800 || (0xD800 <= rv && rv <= 0xDFFF)) + return 0xFFFF; + + in.increment_position(3); + return rv; + } + if (3 < availableByteCount) + { + auto ch4 = static_cast<unsigned char>(*(pChar + 3)); + if ((ch4 & 0xC0) != 0x80) // not a continuation? + return 0xFFFF; + + unsigned int rv = ((ch1 & 0x07) << 18) + + ((ch2 & 0x3F) << 12) + + ((ch3 & 0x3F) << 6) + + (ch4 & 0x3F); + + // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead bytes F5..F7 (and + // F4 with an over-high trailer) produce rv > 0x10FFFF, which + // is not a Unicode code point. Without this bound, encodeUTF16 + // later silently aliases the bogus value to a valid in-range + // code point — a substitution-collision filter-bypass primitive. + // Lead bytes F8..FF are never valid UTF-8, but the & 0x07 mask + // discards their high bits, so without the (ch1 & 0xF8) == 0xF0 + // guard F8 BF BF BF would alias to U+3FFFF instead of being + // rejected. + if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv <= 0x10FFFF) + { + in.increment_position(4); + return rv; + } + } + } + } + return 0xFFFF; +} diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp index b5a9d3d0..5a6241c2 100644 --- a/src/main/cpp/transcoder.cpp +++ b/src/main/cpp/transcoder.cpp @@ -218,111 +218,12 @@ size_t Transcoder::encodeUTF16LE(unsigned int ch, char* dst) unsigned int Transcoder::decode(const std::string& src, std::string::const_iterator& iter) { - std::string::const_iterator start(iter); - unsigned char ch1 = *(iter++); - - if (ch1 <= 0x7F) - { - return ch1; - } - - // - // should not have continuation character here - // - if ((ch1 & 0xC0) != 0x80 && iter != src.end()) - { - unsigned char ch2 = *(iter++); - - // - // should be continuation - if ((ch2 & 0xC0) != 0x80) - { - iter = start; - return 0xFFFF; - } - - if ((ch1 & 0xE0) == 0xC0) - { - if ((ch2 & 0xC0) == 0x80) - { - unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F); - - if (rv >= 0x80) - { - return rv; - } - } - - iter = start; - return 0xFFFF; - } - - if (iter != src.end()) - { - unsigned char ch3 = *(iter++); - - // - // should be continuation - // - if ((ch3 & 0xC0) != 0x80) - { - iter = start; - return 0xFFFF; - } - - if ((ch1 & 0xF0) == 0xE0) - { - unsigned rv = ((ch1 & 0x0F) << 12) - + ((ch2 & 0x3F) << 6) - + (ch3 & 0x3F); - - // RFC 3629 §3 prohibits UTF-8 encodings of the UTF-16 surrogate - // halves (U+D800..U+DFFF); accepting them lets malformed Unicode - // cross the decode boundary into LogString and downstream output. - if (rv < 0x800 || (0xD800 <= rv && rv <= 0xDFFF)) - { - iter = start; - return 0xFFFF; - } - - return rv; - } - - if (iter != src.end()) - { - unsigned char ch4 = *(iter++); - - if ((ch4 & 0xC0) != 0x80) - { - iter = start; - return 0xFFFF; - } - - unsigned int rv = ((ch1 & 0x07) << 18) - + ((ch2 & 0x3F) << 12) - + ((ch3 & 0x3F) << 6) - + (ch4 & 0x3F); - - // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead bytes F5..F7 (and - // F4 with an over-high trailer) produce rv > 0x10FFFF, which - // is not a Unicode code point. Without this bound, encodeUTF16 - // later silently aliases the bogus value to a valid in-range - // code point — a substitution-collision filter-bypass primitive. - // Lead bytes F8..FF are never valid UTF-8, but the & 0x07 mask - // discards their high bits, so without the (ch1 & 0xF8) == 0xF0 - // guard F8 BF BF BF would alias to U+3FFFF instead of being - // rejected. - if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv <= 0x10FFFF) - { - return rv; - } - - } - } - } - - iter = start; - return 0xFFFF; + auto index = iter - src.begin(); + auto remaining = src.size() - index; + ByteBuffer buf(const_cast<char*>(&src[index]), remaining); + auto result = CharsetDecoder::getUTF8CodePoint(buf); + iter += remaining - buf.remaining(); + return result; } diff --git a/src/main/include/log4cxx/helpers/charsetdecoder.h b/src/main/include/log4cxx/helpers/charsetdecoder.h index d801e70b..1275d525 100644 --- a/src/main/include/log4cxx/helpers/charsetdecoder.h +++ b/src/main/include/log4cxx/helpers/charsetdecoder.h @@ -79,9 +79,9 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object /** * Decodes as many bytes as possible from \c in, * appending the result onto \c out. - * @param in a null terminated string. + * @param in the bytes to decode. * @param out the string onto which characters are appended. - * @return APR_SUCCESS if not encoding errors were found. + * @return APR_SUCCESS if no encoding errors were found. */ virtual log4cxx_status_t decode(ByteBuffer& in, LogString& out) = 0; @@ -92,7 +92,7 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object * @param in a null terminated string. * @param maxByteCount the limit on the size of \c in. * @param out the string onto which characters are appended. - * @return APR_SUCCESS if not encoding errors were found. + * @return APR_SUCCESS if no encoding errors were found. */ log4cxx_status_t decode(const char* in, size_t maxByteCount, LogString& out); @@ -104,6 +104,13 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object return (stat != 0); } + /** + * The first code point in \c in. + * @param in the bytes to decode. + * @return the code point value, if successful, otherwise 0xFFFF. The \c in cursor position is updated if successful. + */ + static unsigned int getUTF8CodePoint(ByteBuffer& in); + private: /** * Private copy constructor.
