(logging-log4cxx) 01/01: Prevent possible stack overflow when decoding UTF8

swebb2066 Thu, 04 Jun 2026 20:02:51 -0700

This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch remove_charset_decoder_recursion
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git


commit 24d99ec986bf1bc27b54821d1c49cecab0e03018
Author: Stephen Webb <[email protected]>
AuthorDate: Fri Jun 5 13:02:21 2026 +1000

    Prevent possible stack overflow when decoding UTF8
---
 src/main/cpp/charsetdecoder.cpp                   | 109 +++++++++++++++++----
 src/main/cpp/transcoder.cpp                       | 111 ++--------------------
 src/main/include/log4cxx/helpers/charsetdecoder.h |  13 ++-
 3 files changed, 105 insertions(+), 128 deletions(-)

diff --git a/src/main/cpp/charsetdecoder.cpp b/src/main/cpp/charsetdecoder.cpp
index 90977573..fc9ce181 100644
--- a/src/main/cpp/charsetdecoder.cpp
+++ b/src/main/cpp/charsetdecoder.cpp
@@ -300,7 +300,7 @@ class TrivialCharsetDecoder : public CharsetDecoder
 };
 
 /**
-*    Converts from UTF-8 to std::wstring
+*    Converts from UTF-8 to LogString
 *
 */
 class UTF8CharsetDecoder : public CharsetDecoder
@@ -319,28 +319,15 @@ class UTF8CharsetDecoder : public CharsetDecoder
                        LogString& out)
                {
                        auto availableByteCount = in.remaining();
-                       std::string tmp(in.current(), availableByteCount);
-                       std::string::const_iterator nextCodePoint = tmp.begin();
-
-                       while (nextCodePoint != tmp.end())
+                       while (0 < availableByteCount)
                        {
-                               auto lastCodePoint = nextCodePoint;
-                               auto sv = Transcoder::decode(tmp, 
nextCodePoint);
-
-                               if (sv == 0xFFFF || nextCodePoint == 
lastCodePoint)
-                               {
-                                       size_t offset = nextCodePoint - 
tmp.begin();
-                                       in.increment_position(offset);
+                               auto sv = getUTF8CodePoint(in);
+                               auto nextAvailableByteCount = in.remaining();
+                               if (sv == 0xFFFF || nextAvailableByteCount == 
availableByteCount)
                                        return APR_BADCH;
-                               }
-                               else
-                               {
-                                       Transcoder::encode(sv, out);
-                               }
+                               Transcoder::encode(sv, out);
+                               availableByteCount = nextAvailableByteCount;
                        }
-
-                       in.increment_position(availableByteCount);
-
                        return APR_SUCCESS;
                }
 
@@ -607,8 +594,90 @@ log4cxx_status_t CharsetDecoder::decode(const char* in, 
size_t maxByteCount, Log
        return decode(buf, out);
 }
 
+unsigned int CharsetDecoder::getUTF8CodePoint(ByteBuffer& in)
+{
+       auto availableByteCount = in.remaining();
+       if (0 == availableByteCount)
+               return 0xFFFF;
+
+       auto pChar = in.current();
+       auto ch1 = static_cast<unsigned char>(*pChar);
+       if (ch1 <= 0x7F)
+       {
+               in.increment_position(1);
+               return ch1;
+       }
+
+       //
+       //   should not have continuation character here
+       //
+       if ((ch1 & 0xC0) != 0x80 && 1 < availableByteCount)
+       {
+               auto ch2 = static_cast<unsigned char>(*(pChar + 1));
+               if ((ch2 & 0xC0) != 0x80) // not a continuation?
+                       return 0xFFFF;
 
+               if ((ch1 & 0xE0) == 0xC0)
+               {
+                       unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F);
+                       if (rv >= 0x80)
+                       {
+                               in.increment_position(2);
+                               return rv;
+                       }
+                       return 0xFFFF;
+               }
 
+               if (2 < availableByteCount)
+               {
+                       auto ch3 = static_cast<unsigned char>(*(pChar + 2));
+                       if ((ch3 & 0xC0) != 0x80) // not a continuation?
+                               return 0xFFFF;
 
+                       if ((ch1 & 0xF0) == 0xE0)
+                       {
+                               unsigned int rv = ((ch1 & 0x0F) << 12)
+                                       + ((ch2 & 0x3F) << 6)
+                                       + (ch3 & 0x3F);
+
+                               // RFC 3629 §3 prohibits UTF-8 encodings of the 
UTF-16 surrogate
+                               // halves (U+D800..U+DFFF); accepting them lets 
malformed Unicode
+                               // cross the decode boundary into LogString and 
downstream output.
+                               if (rv < 0x800 || (0xD800 <= rv && rv <= 
0xDFFF))
+                                       return 0xFFFF;
+
+                               in.increment_position(3);
+                               return rv;
+                       }
 
+                       if (3 < availableByteCount)
+                       {
+                               auto ch4 = static_cast<unsigned char>(*(pChar + 
3));
+                               if ((ch4 & 0xC0) != 0x80) // not a continuation?
+                                       return 0xFFFF;
+
+                               unsigned int rv = ((ch1 & 0x07) << 18)
+                                       + ((ch2 & 0x3F) << 12)
+                                       + ((ch3 & 0x3F) << 6)
+                                       + (ch4 & 0x3F);
+
+                               // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead 
bytes F5..F7 (and
+                               // F4 with an over-high trailer) produce rv > 
0x10FFFF, which
+                               // is not a Unicode code point. Without this 
bound, encodeUTF16
+                               // later silently aliases the bogus value to a 
valid in-range
+                               // code point — a substitution-collision 
filter-bypass primitive.
+                               // Lead bytes F8..FF are never valid UTF-8, but 
the & 0x07 mask
+                               // discards their high bits, so without the 
(ch1 & 0xF8) == 0xF0
+                               // guard F8 BF BF BF would alias to U+3FFFF 
instead of being
+                               // rejected.
+                               if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv 
<= 0x10FFFF)
+                               {
+                                       in.increment_position(4);
+                                       return rv;
+                               }
 
+                       }
+               }
+       }
+       return 0xFFFF;
+}
diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index b5a9d3d0..5a6241c2 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -218,111 +218,12 @@ size_t Transcoder::encodeUTF16LE(unsigned int ch, char* 
dst)
 unsigned int Transcoder::decode(const std::string& src,
        std::string::const_iterator& iter)
 {
-       std::string::const_iterator start(iter);
-       unsigned char ch1 = *(iter++);
-
-       if (ch1 <= 0x7F)
-       {
-               return ch1;
-       }
-
-       //
-       //   should not have continuation character here
-       //
-       if ((ch1 & 0xC0) != 0x80 && iter != src.end())
-       {
-               unsigned char ch2 = *(iter++);
-
-               //
-               //   should be continuation
-               if ((ch2 & 0xC0) != 0x80)
-               {
-                       iter = start;
-                       return 0xFFFF;
-               }
-
-               if ((ch1 & 0xE0) == 0xC0)
-               {
-                       if ((ch2 & 0xC0) == 0x80)
-                       {
-                               unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 
0x3F);
-
-                               if (rv >= 0x80)
-                               {
-                                       return rv;
-                               }
-                       }
-
-                       iter = start;
-                       return 0xFFFF;
-               }
-
-               if (iter != src.end())
-               {
-                       unsigned char ch3 = *(iter++);
-
-                       //
-                       //   should be continuation
-                       //
-                       if ((ch3 & 0xC0) != 0x80)
-                       {
-                               iter = start;
-                               return 0xFFFF;
-                       }
-
-                       if ((ch1 & 0xF0) == 0xE0)
-                       {
-                               unsigned rv = ((ch1 & 0x0F) << 12)
-                                       + ((ch2 & 0x3F) << 6)
-                                       + (ch3 & 0x3F);
-
-                               // RFC 3629 §3 prohibits UTF-8 encodings of the 
UTF-16 surrogate
-                               // halves (U+D800..U+DFFF); accepting them lets 
malformed Unicode
-                               // cross the decode boundary into LogString and 
downstream output.
-                               if (rv < 0x800 || (0xD800 <= rv && rv <= 
0xDFFF))
-                               {
-                                       iter = start;
-                                       return 0xFFFF;
-                               }
-
-                               return rv;
-                       }
-
-                       if (iter != src.end())
-                       {
-                               unsigned char ch4 = *(iter++);
-
-                               if ((ch4 & 0xC0) != 0x80)
-                               {
-                                       iter = start;
-                                       return 0xFFFF;
-                               }
-
-                               unsigned int rv = ((ch1 & 0x07) << 18)
-                                       + ((ch2 & 0x3F) << 12)
-                                       + ((ch3 & 0x3F) << 6)
-                                       + (ch4 & 0x3F);
-
-                               // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead 
bytes F5..F7 (and
-                               // F4 with an over-high trailer) produce rv > 
0x10FFFF, which
-                               // is not a Unicode code point. Without this 
bound, encodeUTF16
-                               // later silently aliases the bogus value to a 
valid in-range
-                               // code point — a substitution-collision 
filter-bypass primitive.
-                               // Lead bytes F8..FF are never valid UTF-8, but 
the & 0x07 mask
-                               // discards their high bits, so without the 
(ch1 & 0xF8) == 0xF0
-                               // guard F8 BF BF BF would alias to U+3FFFF 
instead of being
-                               // rejected.
-                               if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv 
<= 0x10FFFF)
-                               {
-                                       return rv;
-                               }
-
-                       }
-               }
-       }
-
-       iter = start;
-       return 0xFFFF;
+       auto index = iter - src.begin();
+       auto remaining = src.size() - index;
+       ByteBuffer buf(const_cast<char*>(&src[index]), remaining);
+       auto result = CharsetDecoder::getUTF8CodePoint(buf);
+       iter += remaining - buf.remaining();
+       return result;
 }
 
 
diff --git a/src/main/include/log4cxx/helpers/charsetdecoder.h 
b/src/main/include/log4cxx/helpers/charsetdecoder.h
index d801e70b..1275d525 100644
--- a/src/main/include/log4cxx/helpers/charsetdecoder.h
+++ b/src/main/include/log4cxx/helpers/charsetdecoder.h
@@ -79,9 +79,9 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object
                /**
                 *  Decodes as many bytes as possible from \c in,
                 *  appending the result onto \c out.
-                *  @param in a null terminated string.
+                *  @param in the bytes to decode.
                 *  @param out the string onto which characters are appended.
-                *  @return APR_SUCCESS if not encoding errors were found.
+                *  @return APR_SUCCESS if no encoding errors were found.
                 */
                virtual log4cxx_status_t decode(ByteBuffer& in, LogString& out) 
= 0;
 
@@ -92,7 +92,7 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object
                 *  @param in a null terminated string.
                 *  @param maxByteCount the limit on the size of \c in.
                 *  @param out the string onto which characters are appended.
-                *  @return APR_SUCCESS if not encoding errors were found.
+                *  @return APR_SUCCESS if no encoding errors were found.
                 */
                log4cxx_status_t decode(const char* in, size_t maxByteCount, 
LogString& out);
 
@@ -104,6 +104,13 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object
                        return (stat != 0);
                }
 
+               /**
+                *  The first code point in \c in.
+                *  @param in the bytes to decode.
+                *  @return the code point value, if successful, otherwise 
0xFFFF. The \c in cursor position is updated if successful.
+                */
+               static unsigned int getUTF8CodePoint(ByteBuffer& in);
+
        private:
                /**
                *  Private copy constructor.

(logging-log4cxx) 01/01: Prevent possible stack overflow when decoding UTF8

Reply via email to