This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git


The following commit(s) were added to refs/heads/master by this push:
     new d5be1824 Improve helpers::Transcoder reference documentation (#710)
d5be1824 is described below

commit d5be1824b4f1bed26b408cf2d45a40ae90be0d8d
Author: Stephen Webb <[email protected]>
AuthorDate: Sun Jun 7 10:48:33 2026 +1000

    Improve helpers::Transcoder reference documentation (#710)
    
    * Document Transcoder::decode preconditions explicitly
    
    * Fix the fault in the fuzzing test case code
---
 src/fuzzers/cpp/TranscoderFuzzer.cpp               |   5 +-
 src/main/cpp/charsetdecoder.cpp                    | 109 ++++++++++++++++----
 src/main/cpp/transcoder.cpp                        | 111 ++-------------------
 .../include/log4cxx/helpers/cacheddateformat.h     |   4 +-
 src/main/include/log4cxx/helpers/charsetdecoder.h  |  13 ++-
 src/main/include/log4cxx/helpers/transcoder.h      | 100 ++++++++++---------
 6 files changed, 165 insertions(+), 177 deletions(-)

diff --git a/src/fuzzers/cpp/TranscoderFuzzer.cpp 
b/src/fuzzers/cpp/TranscoderFuzzer.cpp
index f7980488..12850a37 100644
--- a/src/fuzzers/cpp/TranscoderFuzzer.cpp
+++ b/src/fuzzers/cpp/TranscoderFuzzer.cpp
@@ -254,10 +254,13 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* 
data, size_t size)
                std::string::const_iterator it = bytes.begin();
                while (it != bytes.end())
                {
+                       auto old_it = it;
                        unsigned int sv = Transcoder::decode(bytes, it);
                        if (sv == 0xFFFF)
                        {
-                               ++it; // mirror decodeUTF8's recovery advance 
on a bad sequence
+                               // mirror decodeUTF8's recovery advance on a 
bad sequence
+                               if (old_it == it)
+                                       ++it;
                                continue;
                        }
                        checkUTF16RoundTrip(sv);
diff --git a/src/main/cpp/charsetdecoder.cpp b/src/main/cpp/charsetdecoder.cpp
index 90977573..fc9ce181 100644
--- a/src/main/cpp/charsetdecoder.cpp
+++ b/src/main/cpp/charsetdecoder.cpp
@@ -300,7 +300,7 @@ class TrivialCharsetDecoder : public CharsetDecoder
 };
 
 /**
-*    Converts from UTF-8 to std::wstring
+*    Converts from UTF-8 to LogString
 *
 */
 class UTF8CharsetDecoder : public CharsetDecoder
@@ -319,28 +319,15 @@ class UTF8CharsetDecoder : public CharsetDecoder
                        LogString& out)
                {
                        auto availableByteCount = in.remaining();
-                       std::string tmp(in.current(), availableByteCount);
-                       std::string::const_iterator nextCodePoint = tmp.begin();
-
-                       while (nextCodePoint != tmp.end())
+                       while (0 < availableByteCount)
                        {
-                               auto lastCodePoint = nextCodePoint;
-                               auto sv = Transcoder::decode(tmp, 
nextCodePoint);
-
-                               if (sv == 0xFFFF || nextCodePoint == 
lastCodePoint)
-                               {
-                                       size_t offset = nextCodePoint - 
tmp.begin();
-                                       in.increment_position(offset);
+                               auto sv = getUTF8CodePoint(in);
+                               auto nextAvailableByteCount = in.remaining();
+                               if (sv == 0xFFFF || nextAvailableByteCount == 
availableByteCount)
                                        return APR_BADCH;
-                               }
-                               else
-                               {
-                                       Transcoder::encode(sv, out);
-                               }
+                               Transcoder::encode(sv, out);
+                               availableByteCount = nextAvailableByteCount;
                        }
-
-                       in.increment_position(availableByteCount);
-
                        return APR_SUCCESS;
                }
 
@@ -607,8 +594,90 @@ log4cxx_status_t CharsetDecoder::decode(const char* in, 
size_t maxByteCount, Log
        return decode(buf, out);
 }
 
+unsigned int CharsetDecoder::getUTF8CodePoint(ByteBuffer& in)
+{
+       auto availableByteCount = in.remaining();
+       if (0 == availableByteCount)
+               return 0xFFFF;
+
+       auto pChar = in.current();
+       auto ch1 = static_cast<unsigned char>(*pChar);
+       if (ch1 <= 0x7F)
+       {
+               in.increment_position(1);
+               return ch1;
+       }
+
+       //
+       //   should not have continuation character here
+       //
+       if ((ch1 & 0xC0) != 0x80 && 1 < availableByteCount)
+       {
+               auto ch2 = static_cast<unsigned char>(*(pChar + 1));
+               if ((ch2 & 0xC0) != 0x80) // not a continuation?
+                       return 0xFFFF;
 
+               if ((ch1 & 0xE0) == 0xC0)
+               {
+                       unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F);
+                       if (rv >= 0x80)
+                       {
+                               in.increment_position(2);
+                               return rv;
+                       }
+                       return 0xFFFF;
+               }
 
+               if (2 < availableByteCount)
+               {
+                       auto ch3 = static_cast<unsigned char>(*(pChar + 2));
+                       if ((ch3 & 0xC0) != 0x80) // not a continuation?
+                               return 0xFFFF;
 
+                       if ((ch1 & 0xF0) == 0xE0)
+                       {
+                               unsigned int rv = ((ch1 & 0x0F) << 12)
+                                       + ((ch2 & 0x3F) << 6)
+                                       + (ch3 & 0x3F);
+
+                               // RFC 3629 §3 prohibits UTF-8 encodings of the 
UTF-16 surrogate
+                               // halves (U+D800..U+DFFF); accepting them lets 
malformed Unicode
+                               // cross the decode boundary into LogString and 
downstream output.
+                               if (rv < 0x800 || (0xD800 <= rv && rv <= 
0xDFFF))
+                                       return 0xFFFF;
+
+                               in.increment_position(3);
+                               return rv;
+                       }
 
+                       if (3 < availableByteCount)
+                       {
+                               auto ch4 = static_cast<unsigned char>(*(pChar + 
3));
+                               if ((ch4 & 0xC0) != 0x80) // not a continuation?
+                                       return 0xFFFF;
+
+                               unsigned int rv = ((ch1 & 0x07) << 18)
+                                       + ((ch2 & 0x3F) << 12)
+                                       + ((ch3 & 0x3F) << 6)
+                                       + (ch4 & 0x3F);
+
+                               // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead 
bytes F5..F7 (and
+                               // F4 with an over-high trailer) produce rv > 
0x10FFFF, which
+                               // is not a Unicode code point. Without this 
bound, encodeUTF16
+                               // later silently aliases the bogus value to a 
valid in-range
+                               // code point — a substitution-collision 
filter-bypass primitive.
+                               // Lead bytes F8..FF are never valid UTF-8, but 
the & 0x07 mask
+                               // discards their high bits, so without the 
(ch1 & 0xF8) == 0xF0
+                               // guard F8 BF BF BF would alias to U+3FFFF 
instead of being
+                               // rejected.
+                               if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv 
<= 0x10FFFF)
+                               {
+                                       in.increment_position(4);
+                                       return rv;
+                               }
 
+                       }
+               }
+       }
+       return 0xFFFF;
+}
diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index ddc7e1c0..870ff436 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -218,111 +218,12 @@ size_t Transcoder::encodeUTF16LE(unsigned int ch, char* 
dst)
 unsigned int Transcoder::decode(const std::string& src,
        std::string::const_iterator& iter)
 {
-       std::string::const_iterator start(iter);
-       unsigned char ch1 = *(iter++);
-
-       if (ch1 <= 0x7F)
-       {
-               return ch1;
-       }
-
-       //
-       //   should not have continuation character here
-       //
-       if ((ch1 & 0xC0) != 0x80 && iter != src.end())
-       {
-               unsigned char ch2 = *(iter++);
-
-               //
-               //   should be continuation
-               if ((ch2 & 0xC0) != 0x80)
-               {
-                       iter = start;
-                       return 0xFFFF;
-               }
-
-               if ((ch1 & 0xE0) == 0xC0)
-               {
-                       if ((ch2 & 0xC0) == 0x80)
-                       {
-                               unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 
0x3F);
-
-                               if (rv >= 0x80)
-                               {
-                                       return rv;
-                               }
-                       }
-
-                       iter = start;
-                       return 0xFFFF;
-               }
-
-               if (iter != src.end())
-               {
-                       unsigned char ch3 = *(iter++);
-
-                       //
-                       //   should be continuation
-                       //
-                       if ((ch3 & 0xC0) != 0x80)
-                       {
-                               iter = start;
-                               return 0xFFFF;
-                       }
-
-                       if ((ch1 & 0xF0) == 0xE0)
-                       {
-                               unsigned rv = ((ch1 & 0x0F) << 12)
-                                       + ((ch2 & 0x3F) << 6)
-                                       + (ch3 & 0x3F);
-
-                               // RFC 3629 §3 prohibits UTF-8 encodings of the 
UTF-16 surrogate
-                               // halves (U+D800..U+DFFF); accepting them lets 
malformed Unicode
-                               // cross the decode boundary into LogString and 
downstream output.
-                               if (rv < 0x800 || (0xD800 <= rv && rv <= 
0xDFFF))
-                               {
-                                       iter = start;
-                                       return 0xFFFF;
-                               }
-
-                               return rv;
-                       }
-
-                       if (iter != src.end())
-                       {
-                               unsigned char ch4 = *(iter++);
-
-                               if ((ch4 & 0xC0) != 0x80)
-                               {
-                                       iter = start;
-                                       return 0xFFFF;
-                               }
-
-                               unsigned int rv = ((ch1 & 0x07) << 18)
-                                       + ((ch2 & 0x3F) << 12)
-                                       + ((ch3 & 0x3F) << 6)
-                                       + (ch4 & 0x3F);
-
-                               // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead 
bytes F5..F7 (and
-                               // F4 with an over-high trailer) produce rv > 
0x10FFFF, which
-                               // is not a Unicode code point. Without this 
bound, encodeUTF16
-                               // later silently aliases the bogus value to a 
valid in-range
-                               // code point — a substitution-collision 
filter-bypass primitive.
-                               // Lead bytes F8..FF are never valid UTF-8, but 
the & 0x07 mask
-                               // discards their high bits, so without the 
(ch1 & 0xF8) == 0xF0
-                               // guard F8 BF BF BF would alias to U+3FFFF 
instead of being
-                               // rejected.
-                               if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv 
<= 0x10FFFF)
-                               {
-                                       return rv;
-                               }
-
-                       }
-               }
-       }
-
-       iter = start;
-       return 0xFFFF;
+       auto offset = iter - src.begin();
+       auto remaining = src.size() - offset;
+       ByteBuffer buf(const_cast<char*>(src.data() + offset), remaining);
+       auto result = CharsetDecoder::getUTF8CodePoint(buf);
+       iter += remaining - buf.remaining();
+       return result;
 }
 
 
diff --git a/src/main/include/log4cxx/helpers/cacheddateformat.h 
b/src/main/include/log4cxx/helpers/cacheddateformat.h
index b2b8a32d..9a71665e 100644
--- a/src/main/include/log4cxx/helpers/cacheddateformat.h
+++ b/src/main/include/log4cxx/helpers/cacheddateformat.h
@@ -119,8 +119,8 @@ class LOG4CXX_EXPORT CachedDateFormat : public 
helpers::DateFormat
                /**
                 * Formats a Date into a date/time string.
                 *
-                *  @param date the date to format.
-                *  @param sbuf the string buffer to write to.
+                *  @param tm the date/time to format.
+                *  @param toAppendTo the string buffer to write to.
                 */
                void format( LOG4CXX_FORMAT_TIME_FORMAL_PARAMETERS ) const 
override;
 
diff --git a/src/main/include/log4cxx/helpers/charsetdecoder.h 
b/src/main/include/log4cxx/helpers/charsetdecoder.h
index d801e70b..71938e84 100644
--- a/src/main/include/log4cxx/helpers/charsetdecoder.h
+++ b/src/main/include/log4cxx/helpers/charsetdecoder.h
@@ -79,9 +79,9 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object
                /**
                 *  Decodes as many bytes as possible from \c in,
                 *  appending the result onto \c out.
-                *  @param in a null terminated string.
+                *  @param in the bytes to decode.
                 *  @param out the string onto which characters are appended.
-                *  @return APR_SUCCESS if not encoding errors were found.
+                *  @return APR_SUCCESS if no encoding errors were found.
                 */
                virtual log4cxx_status_t decode(ByteBuffer& in, LogString& out) 
= 0;
 
@@ -92,7 +92,7 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object
                 *  @param in a null terminated string.
                 *  @param maxByteCount the limit on the size of \c in.
                 *  @param out the string onto which characters are appended.
-                *  @return APR_SUCCESS if not encoding errors were found.
+                *  @return APR_SUCCESS if no encoding errors were found.
                 */
                log4cxx_status_t decode(const char* in, size_t maxByteCount, 
LogString& out);
 
@@ -104,6 +104,13 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object
                        return (stat != 0);
                }
 
+               /**
+                *  Increment the \c buf cursor position past the next UTF-8 
code point in \c buf.
+                *  @param buf the bytes to decode.
+                *  @return the code point value, if successful, otherwise 
0xFFFF. The \c buf cursor position is only incremented if successful.
+                */
+               static unsigned int getUTF8CodePoint(ByteBuffer& buf);
+
        private:
                /**
                *  Private copy constructor.
diff --git a/src/main/include/log4cxx/helpers/transcoder.h 
b/src/main/include/log4cxx/helpers/transcoder.h
index 9907ede0..e91a09d6 100644
--- a/src/main/include/log4cxx/helpers/transcoder.h
+++ b/src/main/include/log4cxx/helpers/transcoder.h
@@ -28,10 +28,7 @@ namespace helpers
 class ByteBuffer;
 class Pool;
 /**
-*    Simple transcoder for converting between
-*      external char and wchar_t strings and
-*      internal strings.
-*
+*    Methods for converting between external and internal strings.
 */
 class LOG4CXX_EXPORT Transcoder
 {
@@ -39,11 +36,11 @@ class LOG4CXX_EXPORT Transcoder
 
 
                /**
-                *   Appends this specified string of UTF-8 characters to 
LogString.
+                *   Append the UTF-8 characters in \c src onto \c dst.
                 */
                static void decodeUTF8(const std::string& src, LogString& dst);
                /**
-                *    Converts the LogString to a UTF-8 string.
+                *    Append \c src onto \c dst as a UTF-8 string.
                 */
                static void encodeUTF8(const LogString& src, std::string& dst);
 #if LOG4CXX_ABI_VERSION <= 15
@@ -55,45 +52,43 @@ class LOG4CXX_EXPORT Transcoder
                static char* encodeUTF8(const LogString& src, 
LOG4CXX_NS::helpers::Pool& p);
 #endif
                /**
-                *    Append UCS-4 code point to a byte buffer as UTF-8.
+                *    Append the code point \c sv to \c dst as UTF-8.
                 */
                static void encodeUTF8(unsigned int sv, ByteBuffer& dst);
                /**
-                *    Append UCS-4 code point to a byte buffer as UTF-16LE.
+                *    Append the code point \c sv to \c dst as UTF-16LE.
                 */
                static void encodeUTF16LE(unsigned int sv, ByteBuffer& dst);
                /**
-                *    Append UCS-4 code point to a byte buffer as UTF-16BE.
+                *    Append the code point \c sv to \c dst as UTF-16BE.
                 */
                static void encodeUTF16BE(unsigned int sv, ByteBuffer& dst);
 
 
                /**
-                *   Decodes next character from a UTF-8 string.
-                *   @param in string from which the character is extracted.
-                *   @param iter iterator addressing start of character, will be
-                *   advanced to next character if successful.
-                *   @return scalar value (UCS-4) or 0xFFFF if invalid sequence.
+                *   Increment \c iter past the next code point in \c str.
+                *   @pre \c iter is a valid, dereferenceable iterator.
+                *   @pre \c iter and the end of \c str are in the same 
sequence.
+                *   @param str contains the code point to which \c iter refers.
+                *   @param iter the start of the current code point.
+                *   @return the code point value or 0xFFFF if not a valid 
sequence.
                 */
-               static unsigned int decode(const std::string& in,
+               static unsigned int decode(const std::string& str,
                        std::string::const_iterator& iter);
 
                /**
-                 *   Appends UCS-4 value to a UTF-8 string.
-                 *   @param ch UCS-4 value.
-                 *   @param dst destination.
+                 *   Append the UTF-8 equivalent of \c ch onto \c dst.
                  */
                static void encode(unsigned int ch, std::string& dst);
 
                /**
-                *    Appends string in the current code-page
-                *       to a LogString.
+                *    Append the LogString equivalent of \c src onto \c dst.
                 */
                static void decode(const std::string& src, LogString& dst);
 
                /**
-                *     Appends a LogString to a string in the current
-                *        code-page.  Unrepresentable characters may be
+                *     Append the UTF-8 equivalent of \c src onto \c dst.
+                *        Unrepresentable characters may be
                 *        replaced with loss characters.
                */
                static void encode(const LogString& src, std::string& dst);
@@ -113,24 +108,30 @@ class LOG4CXX_EXPORT Transcoder
 
 
 #if LOG4CXX_WCHAR_T_API || LOG4CXX_LOGCHAR_IS_WCHAR || defined(WIN32) || 
defined(_WIN32)
+               /**
+                *    Append the LogString equivalent of \c src onto \c dst.
+                */
                static void decode(const std::wstring& src, LogString& dst);
+               /**
+                *    Append the equivalent of \c src onto \c dst.
+                */
                static void encode(const LogString& src, std::wstring& dst);
+               /// A null-terminated equivalent of \c src.
                static wchar_t* wencode(const LogString& src, 
LOG4CXX_NS::helpers::Pool& p);
 
                /**
-                *   Decodes next character from a wstring.
-                *   @param in string from which the character is extracted.
-                *   @param iter iterator addressing start of character, will be
-                *   advanced to next character if successful.
-                *   @return scalar value (UCS-4) or 0xFFFF if invalid sequence.
+                *   Increment \c iter past the next code point in \c str.
+                *   @pre \c iter is a valid, dereferenceable iterator.
+                *   @pre \c iter and the end of \c str are in the same 
sequence.
+                *   @param str contains the code point to which \c iter refers.
+                *   @param iter the start of the current code point.
+                *   @return the code point value or 0xFFFF if not a valid 
sequence.
                 */
-               static unsigned int decode(const std::wstring& in,
+               static unsigned int decode(const std::wstring& str,
                        std::wstring::const_iterator& iter);
 
                /**
-                 *   Appends UCS-4 value to a UTF-8 string.
-                 *   @param ch UCS-4 value.
-                 *   @param dst destination.
+                 *   Append the wchar_t equivalent to \c ch onto \c dst.
                  */
                static void encode(unsigned int ch, std::wstring& dst);
 
@@ -138,45 +139,52 @@ class LOG4CXX_EXPORT Transcoder
 
 
 #if LOG4CXX_UNICHAR_API || LOG4CXX_LOGCHAR_IS_UNICHAR
+               /**
+                *    Append the LogString equivalent of \c src onto \c dst.
+                */
                static void decode(const std::basic_string<UniChar>& src, 
LogString& dst);
+               /**
+                *    Append the equivalent of \c src onto \c dst.
+                */
                static void encode(const LogString& src, 
std::basic_string<UniChar>& dst);
 
                /**
-                *   Decodes next character from a UniChar string.
-                *   @param in string from which the character is extracted.
-                *   @param iter iterator addressing start of character, will be
-                *   advanced to next character if successful.
-                *   @return scalar value (UCS-4) or 0xFFFF if invalid sequence.
+                *   Increment \c iter past the next code point in \c str.
+                *   @pre \c iter is a valid, dereferenceable iterator.
+                *   @pre \c iter and the end of \c str are in the same 
sequence.
+                *   @param str contains the code point to which \c iter refers.
+                *   @param iter the start of the current code point.
+                *   @return the code point value or 0xFFFF if not a valid 
sequence.
                 */
-               static unsigned int decode(const std::basic_string<UniChar>& in,
+               static unsigned int decode(const std::basic_string<UniChar>& 
str,
                        std::basic_string<UniChar>::const_iterator& iter);
 
                /**
-                 *   Appends UCS-4 value to a UTF-8 string.
-                 *   @param ch UCS-4 value.
-                 *   @param dst destination.
+                 *   Append the UniChar equivalent to \c ch onto \c dst.
                  */
                static void encode(unsigned int ch, std::basic_string<UniChar>& 
dst);
 
 #endif
 
 #if LOG4CXX_CFSTRING_API
+               /**
+                *    Append the LogString equivalent of \c src onto \c dst.
+                */
                static void decode(const CFStringRef& src, LogString& dst);
+               /// A CFStringRef equivalent of \c src.
                static CFStringRef encode(const LogString& src);
 #endif
 
                enum { LOSSCHAR = 0x3F };
 
                /**
-                *   Returns a logchar value given a character literal in the 
ASCII charset.
-                *   Used to implement the LOG4CXX_STR macro for EBCDIC and 
UNICHAR.
+                *   The logchar equivalent to \c ch.
                 */
-               static logchar decode(char v);
+               static logchar decode(char ch);
                /**
-                *   Returns a LogString given a string literal in the ASCII 
charset.
-                *   Used to implement the LOG4CXX_STR macro for EBCDIC and 
UNICHAR.
+                *   The LogString equivalent to the characters in the null 
terminated bytes at \c str.
                 */
-               static LogString decode(const char* v);
+               static LogString decode(const char* str);
 
                /**
                 *   Encodes a charset name in the default encoding

Reply via email to