(logging-log4cxx) branch master updated: fuzz coverage for transcoder encode/decode paths (#704)

swebb2066 Tue, 02 Jun 2026 18:32:00 -0700

This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git



The following commit(s) were added to refs/heads/master by this push:
     new bf662ba4 fuzz coverage for transcoder encode/decode paths (#704)
bf662ba4 is described below

commit bf662ba4e7c9565d15ae9d52c42715afaa912988
Author: jmestwa-coder <[email protected]>
AuthorDate: Wed Jun 3 07:01:47 2026 +0530

    fuzz coverage for transcoder encode/decode paths (#704)
---
 src/fuzzers/cpp/CMakeLists.txt       |   1 +
 src/fuzzers/cpp/TranscoderFuzzer.cpp | 280 +++++++++++++++++++++++++++++++++++
 2 files changed, 281 insertions(+)

diff --git a/src/fuzzers/cpp/CMakeLists.txt b/src/fuzzers/cpp/CMakeLists.txt
index c1068960..d17d20c6 100644
--- a/src/fuzzers/cpp/CMakeLists.txt
+++ b/src/fuzzers/cpp/CMakeLists.txt
@@ -24,6 +24,7 @@ set(ALL_LOG4CXX_FUZZERS
     TimeBasedRollingPolicyFuzzer
     PatternConverterFuzzer
     DOMConfiguratorFuzzer
+    TranscoderFuzzer
 )
 
 # Get the most recent Git commit ID
diff --git a/src/fuzzers/cpp/TranscoderFuzzer.cpp b/src/fuzzers/cpp/TranscoderFuzzer.cpp
new file mode 100644
index 00000000..f7980488
--- /dev/null
+++ b/src/fuzzers/cpp/TranscoderFuzzer.cpp
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// Fuzzer for the charset transcoding layer
+// (Transcoder, CharsetDecoder, CharsetEncoder).
+//
+// This is the code that turns untrusted external bytes into the internal
+// LogString and back out again for every appender.  It has historically been
+// the source of several memory-safety and correctness defects in the decode
+// boundary, for example:
+//
+//   * reject invalid UTF-8 lead bytes F8..FF in Transcoder::decode (#699)
+//   * UTF-8 recovery loop end-of-input handling                    (#695)
+//   * reject UTF-16 surrogate-half encodings in UTF-8             (#669)
+//   * nullptr pointer arithmetic in charset decoder              (#670)
+//   * UTF-8 decoder rejecting valid U+0800 three-byte sequence   (#664)
+//   * ISO Latin-1 decoder sign extension                         (#660)
+//   * UTF-16 supplementary character encoding                    (#659)
+//   * infinite loop in MbstowcsCharsetDecoder                    (#589)
+//
+// The harness drives that layer with arbitrary bytes under ASan and the
+// integer-overflow sanitizer, and additionally checks two round-trip
+// invariants that a substitution-collision / aliasing defect (the class of
+// bug behind #699 and #669) would break.
+//
+
+#include <fuzzer/FuzzedDataProvider.h>
+#include <log4cxx/logstring.h>
+#include <log4cxx/helpers/transcoder.h>
+#include <log4cxx/helpers/charsetdecoder.h>
+#include <log4cxx/helpers/charsetencoder.h>
+#include <log4cxx/helpers/bytebuffer.h>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <exception>
+
+using namespace LOG4CXX_NS;
+using namespace LOG4CXX_NS::helpers;
+
+namespace
+{
+       const size_t MaximumByteCount = 1 << 16;
+
+       // Report a broken invariant on stderr and abort, so that libFuzzer/ASan
+       // reports the violation with a saved reproducer.
+       //
+       //   ok   -- outcome of the check; when true the call does nothing
+       //   what -- description printed after the fixed message prefix
+       void requireInvariant(bool ok, const char* what)
+       {
+               if (ok)
+               {
+                       return;
+               }
+
+               fprintf(stderr, "transcoder invariant violated: %s\n", what);
+               abort();
+       }
+
+       // Independent UTF-16 reference decoder used only by the oracle below.
+       // Kept deliberately tiny and self-contained so that it cannot share a
+       // defect with the Transcoder::encodeUTF16BE/LE functions it validates.
+       //
+       //   raw       -- encoded bytes; the first `n` of them are read
+       //   n         -- number of encoded bytes: 2 (one code unit) or 4 (a pair)
+       //   bigEndian -- true when the first byte of a code unit is its high byte
+       //
+       // Returns the decoded code point, or 0xFFFFFFFF (not a Unicode scalar)
+       // when `n` is neither 2 nor 4, or when a 4-byte input is not a high
+       // surrogate followed by a low surrogate.
+       unsigned int referenceDecodeUTF16(const char* raw, size_t n, bool bigEndian)
+       {
+               const unsigned int Malformed = 0xFFFFFFFFu;
+
+               // Reassemble the 16-bit code unit whose first byte is raw[i].
+               auto unit = [&](size_t i) -> unsigned int
+               {
+                       const unsigned char hi = static_cast<unsigned char>(raw[bigEndian ? i : i + 1]);
+                       const unsigned char lo = static_cast<unsigned char>(raw[bigEndian ? i + 1 : i]);
+                       return (static_cast<unsigned int>(hi) << 8) | lo;
+               };
+
+               if (n == 2)
+               {
+                       return unit(0);
+               }
+
+               if (n == 4)
+               {
+                       const unsigned int hs = unit(0);
+                       const unsigned int ls = unit(2);
+
+                       // Without these range checks the formula below accepts any two
+                       // units, so compensating errors (for example a lead unit that is
+                       // one too low paired with a trail unit that is 0x400 too high)
+                       // would still produce the expected scalar and slip past the oracle.
+                       if (hs < 0xD800 || 0xDBFF < hs || ls < 0xDC00 || 0xDFFF < ls)
+                       {
+                               return Malformed;
+                       }
+
+                       return (hs - 0xD800) * 0x400 + (ls - 0xDC00) + 0x10000;
+               }
+
+               return Malformed; // unexpected length -- not this oracle's concern
+       }
+
+       // Oracle: a Unicode scalar encoded to UTF-16BE and to UTF-16LE, then
+       // decoded through the independent reference decoder, must come back
+       // unchanged.  This both exercises and checks Transcoder::encodeUTF16BE/LE
+       // (the surrogate-pair byte encoders fixed in #659): the named-codec path
+       // discards the bytes those encoders produce, so without this check a
+       // mis-derived surrogate byte that stays in bounds would go unnoticed.
+       //
+       //   sv -- the scalar value to round-trip
+       //
+       // Aborts through requireInvariant() when either byte order fails.
+       void checkUTF16RoundTrip(unsigned int sv)
+       {
+               // Big-endian leg.
+               {
+                       char encoded[4] = { 0, 0, 0, 0 };
+                       ByteBuffer target(encoded, sizeof encoded);
+                       Transcoder::encodeUTF16BE(sv, target);
+                       const unsigned int decoded = referenceDecodeUTF16(encoded, target.position(), true);
+                       requireInvariant(decoded == sv,
+                               "UTF-16BE encode/decode round trip corrupted the code point");
+               }
+
+               // Little-endian leg.
+               {
+                       char encoded[4] = { 0, 0, 0, 0 };
+                       ByteBuffer target(encoded, sizeof encoded);
+                       Transcoder::encodeUTF16LE(sv, target);
+                       const unsigned int decoded = referenceDecodeUTF16(encoded, target.position(), false);
+                       requireInvariant(decoded == sv,
+                               "UTF-16LE encode/decode round trip corrupted the code point");
+               }
+       }
+
+       // Feed all of `bytes` to `decoder`, following the error-recovery shape of
+       // Transcoder::decode: a reported error appends LOSSCHAR and steps over one
+       // byte, so a single invalid byte cannot stall the cursor.  If the decoder
+       // reports success without consuming anything the loop stops, which keeps
+       // the harness from hanging (that condition is worth surfacing, but as a
+       // finding rather than as a fuzzer timeout).
+       //
+       //   decoder -- decoder under test; a null pointer makes this a no-op
+       //   bytes   -- raw input; an empty string makes this a no-op
+       void exerciseDecoder(const CharsetDecoderPtr& decoder, const std::string& bytes)
+       {
+               const bool nothingToDo = !decoder || bytes.empty();
+               if (nothingToDo)
+               {
+                       return;
+               }
+
+               LogString decoded;
+               ByteBuffer input(const_cast<char*>(bytes.data()), bytes.size());
+
+               for (bool stalled = false; !stalled && 0 < input.remaining(); )
+               {
+                       const size_t start = input.position();
+                       const log4cxx_status_t status = decoder->decode(input, decoded);
+
+                       if (CharsetDecoder::isError(status))
+                       {
+                               // Substitute the bad byte and move on to the next one.
+                               decoded.append(1, (logchar) Transcoder::LOSSCHAR);
+                               input.increment_position(1);
+                       }
+                       else
+                       {
+                               stalled = (input.position() == start);
+                       }
+               }
+
+               decoder->decode(input, decoded); // flush any pending state
+       }
+
+       // Push every character of `in` through `encoder`, following the
+       // flip / consume / clear cycle of Transcoder::encode.  A character the
+       // encoder reports as an error is skipped; a successful call that makes no
+       // progress ends the loop so the harness cannot spin.
+       //
+       //   encoder -- encoder under test; a null pointer makes this a no-op
+       //   in      -- the text to encode
+       void exerciseEncoder(const CharsetEncoderPtr& encoder, const LogString& in)
+       {
+               if (!encoder)
+               {
+                       return;
+               }
+
+               char chunk[128];
+               std::string collected;
+               ByteBuffer encoded(chunk, sizeof chunk);
+
+               for (LogString::const_iterator cursor = in.begin(); cursor != in.end(); )
+               {
+                       const LogString::const_iterator start = cursor;
+                       const log4cxx_status_t status = encoder->encode(in, cursor, encoded);
+
+                       // Move whatever was produced into the sink and recycle the buffer.
+                       encoded.flip();
+                       collected.append(encoded.data(), encoded.limit());
+                       encoded.clear();
+
+                       const bool failed = CharsetEncoder::isError(status);
+                       if (!failed && cursor == start)
+                       {
+                               break; // defensive: success without progress
+                       }
+                       if (failed && cursor != in.end())
+                       {
+                               ++cursor; // skip the unrepresentable character
+                       }
+               }
+
+               encoder->flush(encoded);
+       }
+}
+
+// libFuzzer entry point for the charset transcoding layer.
+//
+//   data, size -- the fuzz input, handed to FuzzedDataProvider: a charset
+//                 selector is consumed first, then up to MaximumByteCount
+//                 bytes become the untrusted byte stream.
+//
+// Always returns 0.  A broken invariant ends the process through
+// requireInvariant() rather than through the return value.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
+{
+       // Named codecs exercised by Path 4.  The selector range below is derived
+       // from this table, so adding a name here makes it reachable without any
+       // other change.
+       static const LogString charsetNames[] =
+       {
+               LOG4CXX_STR("US-ASCII"),
+               LOG4CXX_STR("ISO-8859-1"),
+               LOG4CXX_STR("UTF-8"),
+               LOG4CXX_STR("UTF-16BE"),
+               LOG4CXX_STR("UTF-16LE"),
+       };
+       const int charsetCount = static_cast<int>(sizeof charsetNames / sizeof charsetNames[0]);
+
+       FuzzedDataProvider fdp(data, size);
+
+       // Reserve a one-byte selector for the named charset exercised below, then
+       // treat the remainder as the untrusted byte stream.
+       const int charsetSel = fdp.ConsumeIntegralInRange<int>(0, charsetCount - 1);
+       std::string bytes = fdp.ConsumeRandomLengthString(MaximumByteCount);
+
+       // Path 1: decode in the configured default code page
+       // (this is what every std::string -> LogString conversion uses).
+       LogString viaDefault;
+       Transcoder::decode(bytes, viaDefault);
+
+       // Path 2: explicit UTF-8 decode through the hardened scalar decoder.
+       // decodeUTF8 replaces every malformed sequence with LOSSCHAR, so the
+       // result contains only well-formed Unicode scalar values.
+       LogString sanitized;
+       Transcoder::decodeUTF8(bytes, sanitized);
+
+       // Invariant A: because `sanitized` is already well-formed, encoding it to
+       // UTF-8 and decoding it again must reproduce it exactly.  A decode/encode
+       // asymmetry -- e.g. a malformed input aliasing onto an in-range code point
+       // inconsistently -- breaks this fixed point.
+       std::string utf8;
+       Transcoder::encodeUTF8(sanitized, utf8);
+       LogString reSanitized;
+       Transcoder::decodeUTF8(utf8, reSanitized);
+       requireInvariant(sanitized == reSanitized,
+               "decodeUTF8/encodeUTF8 round trip is not idempotent");
+
+       // Path 3: default-charset encode of the sanitized string.
+       std::string reencoded;
+       Transcoder::encode(sanitized, reencoded);
+
+       // Path 4: the selected named codec, driven over the raw fuzz bytes
+       // (decode) and over the sanitized text (encode -- exercises the
+       // unrepresentable character / error-recovery branches of US-ASCII and
+       // ISO-8859-1).
+       const LogString& charset = charsetNames[charsetSel];
+
+       try
+       {
+               exerciseDecoder(CharsetDecoder::getDecoder(charset), bytes);
+               exerciseEncoder(CharsetEncoder::getEncoder(charset), sanitized);
+       }
+       catch (const std::exception&)
+       {
+               // getDecoder/getEncoder throw IllegalArgumentException for an
+               // unrecognised name; all names above are valid, but stay defensive.
+       }
+
+       // Path 5: UTF-16BE/LE byte-encoder round trip over every scalar decoded
+       // from the input.  Unlike Path 4 (which discards the encoder's bytes), this
+       // drives Transcoder::encodeUTF16BE/LE directly -- the exact #659 site -- with
+       // real code points, including supplementary-plane scalars that form surrogate
+       // pairs, and verifies each survives a decode.  Portable across LOG4CXX_CHAR
+       // configurations because it reads scalars through the UTF-8 scalar decoder
+       // rather than the platform wchar_t path.
+       {
+               std::string::const_iterator it = bytes.begin();
+               while (it != bytes.end())
+               {
+                       const std::string::const_iterator start = it;
+                       const unsigned int sv = Transcoder::decode(bytes, it);
+                       if (sv == 0xFFFF)
+                       {
+                               // Resume one byte past where this attempt started instead of
+                               // incrementing `it`.  Whether decode() leaves the iterator
+                               // untouched on failure is not visible from this file, and
+                               // 0xFFFF is also what a decoded U+FFFF would look like; an
+                               // increment from an already advanced position could step
+                               // past end().  `start` is never end(), so this stays in range.
+                               it = start + 1;
+                               continue;
+                       }
+                       checkUTF16RoundTrip(sv);
+               }
+       }
+
+#if LOG4CXX_WCHAR_T_API || LOG4CXX_LOGCHAR_IS_WCHAR || defined(WIN32) || defined(_WIN32)
+       // Path 6: wide round trip, covering the UTF-16 surrogate-pair handling
+       // that produced #659.  `sanitized` holds no surrogate-range scalars, so
+       // encoding to wchar_t and decoding back must be a fixed point.
+       std::wstring wide;
+       Transcoder::encode(sanitized, wide);
+       LogString fromWide;
+       Transcoder::decode(wide, fromWide);
+       requireInvariant(sanitized == fromWide,
+               "wchar_t encode/decode round trip is not idempotent");
+#endif
+
+       return 0;
+}

(logging-log4cxx) branch master updated: fuzz coverage for transcoder encode/decode paths (#704)

Reply via email to