(logging-log4cxx) branch master updated: Reject UTF-16 surrogate-half encodings in UTF-8 (#669)

swebb2066 Wed, 20 May 2026 19:20:47 -0700

This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git



The following commit(s) were added to refs/heads/master by this push:
     new 43360b5a Reject UTF-16 surrogate-half encodings in UTF-8 (#669)
43360b5a is described below

commit 43360b5a199b21bc08a6e8d155145df3d395a74c
Author: metsw24-max <[email protected]>
AuthorDate: Thu May 21 07:50:28 2026 +0530

    Reject UTF-16 surrogate-half encodings in UTF-8 (#669)
    
    * Invalid Unicode characters become the Unicode replacement character in XML
---
 src/main/cpp/transcoder.cpp                 | 12 +++-
 src/main/cpp/transform.cpp                  |  5 ++
 src/site/markdown/change-report-gh.md       |  1 +
 src/test/cpp/helpers/transcodertestcase.cpp | 98 +++++++++++++++++++++++++++++
 src/test/cpp/xml/xmllayouttest.cpp          |  5 +-
 5 files changed, 115 insertions(+), 6 deletions(-)

diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index e5002fbf..de0deadf 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -265,7 +265,10 @@ unsigned int Transcoder::decode(const std::string& src,
                                        + ((ch2 & 0x3F) << 6)
                                        + (ch3 & 0x3F);
 
-                               if (rv < 0x800)
+                               // RFC 3629 §3 prohibits UTF-8 encodings of the UTF-16 surrogate
+                               // halves (U+D800..U+DFFF); accepting them lets malformed Unicode
+                               // cross the decode boundary into LogString and downstream output.
+                               if (rv < 0x800 || (0xD800 <= rv && rv <= 0xDFFF))
                                {
                                        iter = start;
                                        return 0xFFFF;
@@ -289,7 +292,12 @@ unsigned int Transcoder::decode(const std::string& src,
                                        + ((ch3 & 0x3F) << 6)
                                        + (ch4 & 0x3F);
 
-                               if (rv > 0xFFFF)
+                               // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead bytes F5..F7 (and
+                               // F4 with an over-high trailer) produce rv > 0x10FFFF, which
+                               // is not a Unicode code point. Without this bound, encodeUTF16
+                               // later silently aliases the bogus value to a valid in-range
+                               // code point — a substitution-collision filter-bypass primitive.
+                               if (rv > 0xFFFF && rv <= 0x10FFFF)
                                {
                                        return rv;
                                }
diff --git a/src/main/cpp/transform.cpp b/src/main/cpp/transform.cpp
index ec7c68c4..41206c83 100644
--- a/src/main/cpp/transform.cpp
+++ b/src/main/cpp/transform.cpp
@@ -46,6 +46,11 @@ void appendValidCharacters(LogString& buf, const LogString& input, CharProcessor
                auto ch = Transcoder::decode(input, nextCodePoint);
                if (nextCodePoint == lastCodePoint) // failed to decode input?
                        nextCodePoint = input.end();
+               else if (0xD800 <= ch && ch <= 0xDFFF)
+               {
+                       // RFC 3629 §3 explicitly forbids surrogate-half values in UTF-8
+                       ch = 0xFFFF;
+               }
                else if (((0x20 <= ch && ch <= 0xD7FF) &&
                                specials[0] != ch &&
                                specials[1] != ch &&
diff --git a/src/site/markdown/change-report-gh.md b/src/site/markdown/change-report-gh.md
index 2f4e1e6e..51b5cef4 100644
--- a/src/site/markdown/change-report-gh.md
+++ b/src/site/markdown/change-report-gh.md
@@ -71,6 +71,7 @@ The following issues have been addressed:
    , [#659](https://github.com/apache/logging-log4cxx/pull/659)
    , [#660](https://github.com/apache/logging-log4cxx/pull/660)
    , [#664](https://github.com/apache/logging-log4cxx/pull/664)
+   , [#669](https://github.com/apache/logging-log4cxx/pull/669)
    , [#670](https://github.com/apache/logging-log4cxx/pull/670)
 
 * A lack of robustness dealing with values near numeric limits  
diff --git a/src/test/cpp/helpers/transcodertestcase.cpp b/src/test/cpp/helpers/transcodertestcase.cpp
index a94dab5d..1bc1431e 100644
--- a/src/test/cpp/helpers/transcodertestcase.cpp
+++ b/src/test/cpp/helpers/transcodertestcase.cpp
@@ -64,7 +64,11 @@ LOGUNIT_CLASS(TranscoderTestCase)
        LOGUNIT_TEST(testDecodeUTF8_2);
        LOGUNIT_TEST(testDecodeUTF8_3);
        LOGUNIT_TEST(testDecodeUTF8_4);
+       LOGUNIT_TEST(testDecodeUTF8_RejectSurrogate);
+       LOGUNIT_TEST(testDecodeUTF8_SurrogateBoundaries);
        LOGUNIT_TEST(testDecodeUTF8_U0800);
+       LOGUNIT_TEST(testDecodeUTF8_RejectAboveMax);
+       LOGUNIT_TEST(testDecodeUTF8_MaxBoundary);
        LOGUNIT_TEST(testEncodeUTF16BE_BMP);
        LOGUNIT_TEST(testEncodeUTF16BE_Supplementary);
        LOGUNIT_TEST(testEncodeUTF16LE_Supplementary);
@@ -317,6 +321,28 @@ public:
                LOGUNIT_ASSERT_EQUAL(true, iter == out.end());
        }
 
+       /**
+        * RFC 3629 §3 prohibits UTF-8 encodings of the UTF-16 surrogate halves
+        * (U+D800..U+DFFF). The three-byte sequences ED A0 80 .. ED BF BF must
+        * not decode to the corresponding surrogate code points: doing so lets
+        * lone surrogates enter LogString and be re-emitted by JSON/XML layouts,
+        * propagating malformed Unicode past the parsing boundary. Each byte of
+        * the invalid sequence is replaced with Transcoder::LOSSCHAR.
+        */
+       void testDecodeUTF8_RejectSurrogate()
+       {
+               // ED A0 80 would encode U+D800 (the smallest high-surrogate).
+               std::string src("\xED\xA0\x80");
+               LogString out;
+               Transcoder::decodeUTF8(src, out);
+
+               LogString expected;
+               expected.append(1, Transcoder::LOSSCHAR);
+               expected.append(1, Transcoder::LOSSCHAR);
+               expected.append(1, Transcoder::LOSSCHAR);
+               LOGUNIT_ASSERT_EQUAL(expected, out);
+       }
+
        /**
         * U+0800 (SAMARITAN LETTER ALAF) is the smallest code point that
         * legitimately requires a three-byte UTF-8 sequence (E0 A0 80).
@@ -338,6 +364,78 @@ public:
                LOGUNIT_ASSERT(out.find(Transcoder::LOSSCHAR) == LogString::npos);
        }
 
+       /**
+        * Confirm the surrogate-rejection range is exactly U+D800..U+DFFF:
+        * U+D7FF (ED 9F BF) and U+E000 (EE 80 80) bracket the range and must
+        * still decode cleanly. The four interior values are each rejected.
+        */
+       void testDecodeUTF8_SurrogateBoundaries()
+       {
+               struct { const char* bytes; size_t len; bool reject; } cases[] =
+               {
+                       { "\xED\x9F\xBF", 3, false }, // U+D7FF — last valid before surrogates
+                       { "\xED\xA0\x80", 3, true  }, // U+D800 — high-surrogate min (reject)
+                       { "\xED\xAF\xBF", 3, true  }, // U+DBFF — high-surrogate max (reject)
+                       { "\xED\xB0\x80", 3, true  }, // U+DC00 — low-surrogate min  (reject)
+                       { "\xED\xBF\xBF", 3, true  }, // U+DFFF — low-surrogate max  (reject)
+                       { "\xEE\x80\x80", 3, false }, // U+E000 — first valid after surrogates
+               };
+               for (auto& c : cases)
+               {
+                       std::string src(c.bytes, c.len);
+                       LogString out;
+                       Transcoder::decodeUTF8(src, out);
+                       bool hasLoss = out.find(Transcoder::LOSSCHAR) != LogString::npos;
+                       LOGUNIT_ASSERT_EQUAL(c.reject, hasLoss);
+               }
+       }
+
+       /**
+        * RFC 3629 §3 caps UTF-8 at U+10FFFF. Four-byte sequences with lead F5,
+        * F6, F7 (and F4 with an over-high trailer) decode to values above the
+        * Unicode maximum. Without bounds-rejection here, Transcoder::encodeUTF16
+        * later silently aliases the bogus value to a valid in-range code point
+        * (e.g. U+110000 collides with U+10000) — a substitution-collision
+        * filter-bypass primitive in wchar builds.
+        */
+       void testDecodeUTF8_RejectAboveMax()
+       {
+               // F4 90 80 80 would encode U+110000 (one past the maximum).
+               std::string src("\xF4\x90\x80\x80");
+               LogString out;
+               Transcoder::decodeUTF8(src, out);
+
+               LogString expected;
+               for (int i = 0; i < 4; ++i)
+                       expected.append(1, Transcoder::LOSSCHAR);
+               LOGUNIT_ASSERT_EQUAL(expected, out);
+       }
+
+       /**
+        * Boundary check around U+10FFFF: the canonical encoding of the
+        * maximum legal code point (F4 8F BF BF) must decode cleanly; one past
+        * (F4 90 80 80) and the F5/F6/F7 lead bytes must all be rejected.
+        */
+       void testDecodeUTF8_MaxBoundary()
+       {
+               struct { const char* bytes; size_t len; bool reject; } cases[] =
+               {
+                       { "\xF4\x8F\xBF\xBF", 4, false }, // U+10FFFF — maximum legal code point
+                       { "\xF4\x90\x80\x80", 4, true  }, // U+110000 — one past max (reject)
+                       { "\xF5\x80\x80\x80", 4, true  }, // F5 lead: rv = 0x140000 (reject)
+                       { "\xF6\x80\x80\x80", 4, true  }, // F6 lead: rv = 0x180000 (reject)
+                       { "\xF7\xBF\xBF\xBF", 4, true  }, // F7 lead: rv = 0x1FFFFF (reject)
+               };
+               for (auto& c : cases)
+               {
+                       std::string src(c.bytes, c.len);
+                       LogString out;
+                       Transcoder::decodeUTF8(src, out);
+                       bool hasLoss = out.find(Transcoder::LOSSCHAR) != LogString::npos;
+                       LOGUNIT_ASSERT_EQUAL(c.reject, hasLoss);
+               }
+       }
+
        void testEncodeUTF16BE_BMP()
        {
                char raw[4] = { 0, 0, 0, 0 };
diff --git a/src/test/cpp/xml/xmllayouttest.cpp b/src/test/cpp/xml/xmllayouttest.cpp
index 2ffbb2f0..0399b94b 100644
--- a/src/test/cpp/xml/xmllayouttest.cpp
+++ b/src/test/cpp/xml/xmllayouttest.cpp
@@ -377,11 +377,8 @@ public:
                Transcoder::encode(0xD822, problemNameLS); // Add an invalid character that should be stripped from attribute values
                auto keyLS = problemNameLS;
                auto expectedKeyValue = problemName;
-#if LOG4CXX_LOGCHAR_IS_WCHAR && !defined(__STDC_ISO_10646__)
-               // encodeUTF16 adds 0xD822, but decodeUTF16 cannot convert 0xD822
-               // Expat translates the Unicode replacement character to the following
+               // UTF-8 encodes the Unicode replacement character (0xFFFD) as the following:
                expectedKeyValue += "\xEF\xBF\xBD";
-#endif
                std::string problemMessage = "'\001\"<Hello >\"\004'";
                std::string expectedCdataValue = "'&#x1;\"<Hello >\"&#x4;'";
                std::string expectedAttributeValue = "'\"<Hello >\"'"; // Invalid characters stripped

(logging-log4cxx) branch master updated: Reject UTF-16 surrogate-half encodings in UTF-8 (#669)

Reply via email to