This is an automated email from the ASF dual-hosted git repository.
swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
The following commit(s) were added to refs/heads/master by this push:
new d5be1824 Improve helpers::Transcoder reference documentation (#710)
d5be1824 is described below
commit d5be1824b4f1bed26b408cf2d45a40ae90be0d8d
Author: Stephen Webb <[email protected]>
AuthorDate: Sun Jun 7 10:48:33 2026 +1000
Improve helpers::Transcoder reference documentation (#710)
* Document Transcoder::decode preconditions explicitly
* Fix the fault in the fuzzing test case code
---
src/fuzzers/cpp/TranscoderFuzzer.cpp | 5 +-
src/main/cpp/charsetdecoder.cpp | 109 ++++++++++++++++----
src/main/cpp/transcoder.cpp | 111 ++-------------------
.../include/log4cxx/helpers/cacheddateformat.h | 4 +-
src/main/include/log4cxx/helpers/charsetdecoder.h | 13 ++-
src/main/include/log4cxx/helpers/transcoder.h | 100 ++++++++++---------
6 files changed, 165 insertions(+), 177 deletions(-)
diff --git a/src/fuzzers/cpp/TranscoderFuzzer.cpp
b/src/fuzzers/cpp/TranscoderFuzzer.cpp
index f7980488..12850a37 100644
--- a/src/fuzzers/cpp/TranscoderFuzzer.cpp
+++ b/src/fuzzers/cpp/TranscoderFuzzer.cpp
@@ -254,10 +254,13 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t*
data, size_t size)
std::string::const_iterator it = bytes.begin();
while (it != bytes.end())
{
+ auto old_it = it;
unsigned int sv = Transcoder::decode(bytes, it);
if (sv == 0xFFFF)
{
- ++it; // mirror decodeUTF8's recovery advance
on a bad sequence
+ // mirror decodeUTF8's recovery advance on a
bad sequence
+ if (old_it == it)
+ ++it;
continue;
}
checkUTF16RoundTrip(sv);
diff --git a/src/main/cpp/charsetdecoder.cpp b/src/main/cpp/charsetdecoder.cpp
index 90977573..fc9ce181 100644
--- a/src/main/cpp/charsetdecoder.cpp
+++ b/src/main/cpp/charsetdecoder.cpp
@@ -300,7 +300,7 @@ class TrivialCharsetDecoder : public CharsetDecoder
};
/**
-* Converts from UTF-8 to std::wstring
+* Converts from UTF-8 to LogString
*
*/
class UTF8CharsetDecoder : public CharsetDecoder
@@ -319,28 +319,15 @@ class UTF8CharsetDecoder : public CharsetDecoder
LogString& out)
{
auto availableByteCount = in.remaining();
- std::string tmp(in.current(), availableByteCount);
- std::string::const_iterator nextCodePoint = tmp.begin();
-
- while (nextCodePoint != tmp.end())
+ while (0 < availableByteCount)
{
- auto lastCodePoint = nextCodePoint;
- auto sv = Transcoder::decode(tmp,
nextCodePoint);
-
- if (sv == 0xFFFF || nextCodePoint ==
lastCodePoint)
- {
- size_t offset = nextCodePoint -
tmp.begin();
- in.increment_position(offset);
+ auto sv = getUTF8CodePoint(in);
+ auto nextAvailableByteCount = in.remaining();
+ if (sv == 0xFFFF || nextAvailableByteCount ==
availableByteCount)
return APR_BADCH;
- }
- else
- {
- Transcoder::encode(sv, out);
- }
+ Transcoder::encode(sv, out);
+ availableByteCount = nextAvailableByteCount;
}
-
- in.increment_position(availableByteCount);
-
return APR_SUCCESS;
}
@@ -607,8 +594,90 @@ log4cxx_status_t CharsetDecoder::decode(const char* in,
size_t maxByteCount, Log
return decode(buf, out);
}
+unsigned int CharsetDecoder::getUTF8CodePoint(ByteBuffer& in)
+{
+ auto availableByteCount = in.remaining();
+ if (0 == availableByteCount)
+ return 0xFFFF;
+
+ auto pChar = in.current();
+ auto ch1 = static_cast<unsigned char>(*pChar);
+ if (ch1 <= 0x7F)
+ {
+ in.increment_position(1);
+ return ch1;
+ }
+
+ //
+ // should not have continuation character here
+ //
+ if ((ch1 & 0xC0) != 0x80 && 1 < availableByteCount)
+ {
+ auto ch2 = static_cast<unsigned char>(*(pChar + 1));
+ if ((ch2 & 0xC0) != 0x80) // not a continuation?
+ return 0xFFFF;
+ if ((ch1 & 0xE0) == 0xC0)
+ {
+ unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F);
+ if (rv >= 0x80)
+ {
+ in.increment_position(2);
+ return rv;
+ }
+ return 0xFFFF;
+ }
+ if (2 < availableByteCount)
+ {
+ auto ch3 = static_cast<unsigned char>(*(pChar + 2));
+ if ((ch3 & 0xC0) != 0x80) // not a continuation?
+ return 0xFFFF;
+ if ((ch1 & 0xF0) == 0xE0)
+ {
+ unsigned int rv = ((ch1 & 0x0F) << 12)
+ + ((ch2 & 0x3F) << 6)
+ + (ch3 & 0x3F);
+
+ // RFC 3629 §3 prohibits UTF-8 encodings of the
UTF-16 surrogate
+ // halves (U+D800..U+DFFF); accepting them lets
malformed Unicode
+ // cross the decode boundary into LogString and
downstream output.
+ if (rv < 0x800 || (0xD800 <= rv && rv <=
0xDFFF))
+ return 0xFFFF;
+
+ in.increment_position(3);
+ return rv;
+ }
+ if (3 < availableByteCount)
+ {
+ auto ch4 = static_cast<unsigned char>(*(pChar +
3));
+ if ((ch4 & 0xC0) != 0x80) // not a continuation?
+ return 0xFFFF;
+
+ unsigned int rv = ((ch1 & 0x07) << 18)
+ + ((ch2 & 0x3F) << 12)
+ + ((ch3 & 0x3F) << 6)
+ + (ch4 & 0x3F);
+
+ // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead
bytes F5..F7 (and
+ // F4 with an over-high trailer) produce rv >
0x10FFFF, which
+ // is not a Unicode code point. Without this
bound, encodeUTF16
+ // later silently aliases the bogus value to a
valid in-range
+ // code point — a substitution-collision
filter-bypass primitive.
+ // Lead bytes F8..FF are never valid UTF-8, but
the & 0x07 mask
+ // discards their high bits, so without the
(ch1 & 0xF8) == 0xF0
+ // guard F8 BF BF BF would alias to U+3FFFF
instead of being
+ // rejected.
+ if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv
<= 0x10FFFF)
+ {
+ in.increment_position(4);
+ return rv;
+ }
+ }
+ }
+ }
+ return 0xFFFF;
+}
diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index ddc7e1c0..870ff436 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -218,111 +218,12 @@ size_t Transcoder::encodeUTF16LE(unsigned int ch, char*
dst)
unsigned int Transcoder::decode(const std::string& src,
std::string::const_iterator& iter)
{
- std::string::const_iterator start(iter);
- unsigned char ch1 = *(iter++);
-
- if (ch1 <= 0x7F)
- {
- return ch1;
- }
-
- //
- // should not have continuation character here
- //
- if ((ch1 & 0xC0) != 0x80 && iter != src.end())
- {
- unsigned char ch2 = *(iter++);
-
- //
- // should be continuation
- if ((ch2 & 0xC0) != 0x80)
- {
- iter = start;
- return 0xFFFF;
- }
-
- if ((ch1 & 0xE0) == 0xC0)
- {
- if ((ch2 & 0xC0) == 0x80)
- {
- unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 &
0x3F);
-
- if (rv >= 0x80)
- {
- return rv;
- }
- }
-
- iter = start;
- return 0xFFFF;
- }
-
- if (iter != src.end())
- {
- unsigned char ch3 = *(iter++);
-
- //
- // should be continuation
- //
- if ((ch3 & 0xC0) != 0x80)
- {
- iter = start;
- return 0xFFFF;
- }
-
- if ((ch1 & 0xF0) == 0xE0)
- {
- unsigned rv = ((ch1 & 0x0F) << 12)
- + ((ch2 & 0x3F) << 6)
- + (ch3 & 0x3F);
-
- // RFC 3629 §3 prohibits UTF-8 encodings of the
UTF-16 surrogate
- // halves (U+D800..U+DFFF); accepting them lets
malformed Unicode
- // cross the decode boundary into LogString and
downstream output.
- if (rv < 0x800 || (0xD800 <= rv && rv <=
0xDFFF))
- {
- iter = start;
- return 0xFFFF;
- }
-
- return rv;
- }
-
- if (iter != src.end())
- {
- unsigned char ch4 = *(iter++);
-
- if ((ch4 & 0xC0) != 0x80)
- {
- iter = start;
- return 0xFFFF;
- }
-
- unsigned int rv = ((ch1 & 0x07) << 18)
- + ((ch2 & 0x3F) << 12)
- + ((ch3 & 0x3F) << 6)
- + (ch4 & 0x3F);
-
- // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead
bytes F5..F7 (and
- // F4 with an over-high trailer) produce rv >
0x10FFFF, which
- // is not a Unicode code point. Without this
bound, encodeUTF16
- // later silently aliases the bogus value to a
valid in-range
- // code point — a substitution-collision
filter-bypass primitive.
- // Lead bytes F8..FF are never valid UTF-8, but
the & 0x07 mask
- // discards their high bits, so without the
(ch1 & 0xF8) == 0xF0
- // guard F8 BF BF BF would alias to U+3FFFF
instead of being
- // rejected.
- if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv
<= 0x10FFFF)
- {
- return rv;
- }
-
- }
- }
- }
-
- iter = start;
- return 0xFFFF;
+ auto offset = iter - src.begin();
+ auto remaining = src.size() - offset;
+ ByteBuffer buf(const_cast<char*>(src.data() + offset), remaining);
+ auto result = CharsetDecoder::getUTF8CodePoint(buf);
+ iter += remaining - buf.remaining();
+ return result;
}
diff --git a/src/main/include/log4cxx/helpers/cacheddateformat.h
b/src/main/include/log4cxx/helpers/cacheddateformat.h
index b2b8a32d..9a71665e 100644
--- a/src/main/include/log4cxx/helpers/cacheddateformat.h
+++ b/src/main/include/log4cxx/helpers/cacheddateformat.h
@@ -119,8 +119,8 @@ class LOG4CXX_EXPORT CachedDateFormat : public
helpers::DateFormat
/**
* Formats a Date into a date/time string.
*
- * @param date the date to format.
- * @param sbuf the string buffer to write to.
+ * @param tm the date/time to format.
+ * @param toAppendTo the string buffer to write to.
*/
void format( LOG4CXX_FORMAT_TIME_FORMAL_PARAMETERS ) const
override;
diff --git a/src/main/include/log4cxx/helpers/charsetdecoder.h
b/src/main/include/log4cxx/helpers/charsetdecoder.h
index d801e70b..71938e84 100644
--- a/src/main/include/log4cxx/helpers/charsetdecoder.h
+++ b/src/main/include/log4cxx/helpers/charsetdecoder.h
@@ -79,9 +79,9 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object
/**
* Decodes as many bytes as possible from \c in,
* appending the result onto \c out.
- * @param in a null terminated string.
+ * @param in the bytes to decode.
* @param out the string onto which characters are appended.
- * @return APR_SUCCESS if not encoding errors were found.
+ * @return APR_SUCCESS if no encoding errors were found.
*/
virtual log4cxx_status_t decode(ByteBuffer& in, LogString& out)
= 0;
@@ -92,7 +92,7 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object
* @param in a null terminated string.
* @param maxByteCount the limit on the size of \c in.
* @param out the string onto which characters are appended.
- * @return APR_SUCCESS if not encoding errors were found.
+ * @return APR_SUCCESS if no encoding errors were found.
*/
log4cxx_status_t decode(const char* in, size_t maxByteCount,
LogString& out);
@@ -104,6 +104,13 @@ class LOG4CXX_EXPORT CharsetDecoder : public Object
return (stat != 0);
}
+ /**
+ * Increment the \c buf cursor position past the next UTF8
code point in \c buf.
+ * @param buf the bytes to decode.
+ * @return the code point value, if successful, otherwise
0xFFFF. The \c buf cursor position is only incremented if successful.
+ */
+ static unsigned int getUTF8CodePoint(ByteBuffer& buf);
+
private:
/**
* Private copy constructor.
diff --git a/src/main/include/log4cxx/helpers/transcoder.h
b/src/main/include/log4cxx/helpers/transcoder.h
index 9907ede0..e91a09d6 100644
--- a/src/main/include/log4cxx/helpers/transcoder.h
+++ b/src/main/include/log4cxx/helpers/transcoder.h
@@ -28,10 +28,7 @@ namespace helpers
class ByteBuffer;
class Pool;
/**
-* Simple transcoder for converting between
-* external char and wchar_t strings and
-* internal strings.
-*
+* Methods for converting between external and internal strings.
*/
class LOG4CXX_EXPORT Transcoder
{
@@ -39,11 +36,11 @@ class LOG4CXX_EXPORT Transcoder
/**
- * Appends this specified string of UTF-8 characters to
LogString.
+ * Append the UTF-8 characters in \c src onto \c dst.
*/
static void decodeUTF8(const std::string& src, LogString& dst);
/**
- * Converts the LogString to a UTF-8 string.
+ * Append \c src onto \c dst as a UTF-8 string.
*/
static void encodeUTF8(const LogString& src, std::string& dst);
#if LOG4CXX_ABI_VERSION <= 15
@@ -55,45 +52,43 @@ class LOG4CXX_EXPORT Transcoder
static char* encodeUTF8(const LogString& src,
LOG4CXX_NS::helpers::Pool& p);
#endif
/**
- * Append UCS-4 code point to a byte buffer as UTF-8.
+ * Append the code point \c sv to \c dst as UTF-8.
*/
static void encodeUTF8(unsigned int sv, ByteBuffer& dst);
/**
- * Append UCS-4 code point to a byte buffer as UTF-16LE.
+ * Append the code point \c sv to \c dst as UTF-16LE.
*/
static void encodeUTF16LE(unsigned int sv, ByteBuffer& dst);
/**
- * Append UCS-4 code point to a byte buffer as UTF-16BE.
+ * Append the code point \c sv to \c dst as UTF-16BE.
*/
static void encodeUTF16BE(unsigned int sv, ByteBuffer& dst);
/**
- * Decodes next character from a UTF-8 string.
- * @param in string from which the character is extracted.
- * @param iter iterator addressing start of character, will be
- * advanced to next character if successful.
- * @return scalar value (UCS-4) or 0xFFFF if invalid sequence.
+ * Increment \c iter past the next code point in \c str.
+ * @pre \c iter is a valid, dereferenceable iterator.
+ * @pre \c iter and the end of \c str are in the same
sequence.
+ * @param str contains the code point to which \c iter refers.
+ * @param iter the start of the current code point.
+ * @return the code point value or 0xFFFF if not a valid
sequence.
*/
- static unsigned int decode(const std::string& in,
+ static unsigned int decode(const std::string& str,
std::string::const_iterator& iter);
/**
- * Appends UCS-4 value to a UTF-8 string.
- * @param ch UCS-4 value.
- * @param dst destination.
+ * Append the UTF8 equivalent to \c ch onto \c dst.
*/
static void encode(unsigned int ch, std::string& dst);
/**
- * Appends string in the current code-page
- * to a LogString.
+ * Append the LogString equivalent of \c src onto \c dst.
*/
static void decode(const std::string& src, LogString& dst);
/**
- * Appends a LogString to a string in the current
- * code-page. Unrepresentable characters may be
+ * Append the UTF8 equivalent of \c src onto \c dst.
+ * Unrepresentable characters may be
* replaced with loss characters.
*/
static void encode(const LogString& src, std::string& dst);
@@ -113,24 +108,30 @@ class LOG4CXX_EXPORT Transcoder
#if LOG4CXX_WCHAR_T_API || LOG4CXX_LOGCHAR_IS_WCHAR || defined(WIN32) ||
defined(_WIN32)
+ /**
+ * Append the LogString equivalent of \c src onto \c dst.
+ */
static void decode(const std::wstring& src, LogString& dst);
+ /**
+ * Append the equivalent of \c src onto \c dst.
+ */
static void encode(const LogString& src, std::wstring& dst);
+ /// A null-terminated equivalent of \c src.
static wchar_t* wencode(const LogString& src,
LOG4CXX_NS::helpers::Pool& p);
/**
- * Decodes next character from a wstring.
- * @param in string from which the character is extracted.
- * @param iter iterator addressing start of character, will be
- * advanced to next character if successful.
- * @return scalar value (UCS-4) or 0xFFFF if invalid sequence.
+ * Increment \c iter past the next code point in \c str.
+ * @pre \c iter is a valid, dereferenceable iterator.
+ * @pre \c iter and the end of \c str are in the same
sequence.
+ * @param str contains the code point to which \c iter refers.
+ * @param iter the start of the current code point.
+ * @return the code point value or 0xFFFF if not a valid
sequence.
*/
- static unsigned int decode(const std::wstring& in,
+ static unsigned int decode(const std::wstring& str,
std::wstring::const_iterator& iter);
/**
- * Appends UCS-4 value to a UTF-8 string.
- * @param ch UCS-4 value.
- * @param dst destination.
+ * Append the wchar_t equivalent to \c ch onto \c dst.
*/
static void encode(unsigned int ch, std::wstring& dst);
@@ -138,45 +139,52 @@ class LOG4CXX_EXPORT Transcoder
#if LOG4CXX_UNICHAR_API || LOG4CXX_LOGCHAR_IS_UNICHAR
+ /**
+ * Append the LogString equivalent of \c src onto \c dst.
+ */
static void decode(const std::basic_string<UniChar>& src,
LogString& dst);
+ /**
+ * Append the equivalent of \c src onto \c dst.
+ */
static void encode(const LogString& src,
std::basic_string<UniChar>& dst);
/**
- * Decodes next character from a UniChar string.
- * @param in string from which the character is extracted.
- * @param iter iterator addressing start of character, will be
- * advanced to next character if successful.
- * @return scalar value (UCS-4) or 0xFFFF if invalid sequence.
+ * Increment \c iter past the next code point in \c str.
+ * @pre \c iter is a valid, dereferenceable iterator.
+ * @pre \c iter and the end of \c str are in the same
sequence.
+ * @param str contains the code point to which \c iter refers.
+ * @param iter the start of the current code point.
+ * @return the code point value or 0xFFFF if not a valid
sequence.
*/
- static unsigned int decode(const std::basic_string<UniChar>& in,
+ static unsigned int decode(const std::basic_string<UniChar>&
str,
std::basic_string<UniChar>::const_iterator& iter);
/**
- * Appends UCS-4 value to a UTF-8 string.
- * @param ch UCS-4 value.
- * @param dst destination.
+ * Append the UniChar equivalent to \c ch onto \c dst.
*/
static void encode(unsigned int ch, std::basic_string<UniChar>&
dst);
#endif
#if LOG4CXX_CFSTRING_API
+ /**
+ * Append the LogString equivalent of \c src onto \c dst.
+ */
static void decode(const CFStringRef& src, LogString& dst);
+ /// A CFStringRef equivalent of \c src.
static CFStringRef encode(const LogString& src);
#endif
enum { LOSSCHAR = 0x3F };
/**
- * Returns a logchar value given a character literal in the
ASCII charset.
- * Used to implement the LOG4CXX_STR macro for EBCDIC and
UNICHAR.
+ * The logchar equivalent to \c ch.
*/
- static logchar decode(char v);
+ static logchar decode(char ch);
/**
- * Returns a LogString given a string literal in the ASCII
charset.
- * Used to implement the LOG4CXX_STR macro for EBCDIC and
UNICHAR.
+ * The LogString equivalent to the characters in the null
terminated bytes at \c str.
*/
- static LogString decode(const char* v);
+ static LogString decode(const char* str);
/**
* Encodes a charset name in the default encoding