This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git


The following commit(s) were added to refs/heads/master by this push:
     new b0671d81 Decode input after an undecodable byte in XML/JSON layouts (#701)
b0671d81 is described below

commit b0671d81d195fae2633574dfc70bb3615dfe02ed
Author: metsw24-max <[email protected]>
AuthorDate: Wed Jun 3 06:44:52 2026 +0530

    Decode input after an undecodable byte in XML/JSON layouts (#701)
---
 src/main/cpp/jsonlayout.cpp     | 10 +++++++++-
 src/main/cpp/transform.cpp      | 22 ++++++++++++++++++++--
 src/test/cpp/jsonlayouttest.cpp | 19 +++++++++++++++++++
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/src/main/cpp/jsonlayout.cpp b/src/main/cpp/jsonlayout.cpp
index 0833d031..6f5e1b86 100644
--- a/src/main/cpp/jsonlayout.cpp
+++ b/src/main/cpp/jsonlayout.cpp
@@ -226,7 +226,15 @@ void JSONLayout::appendItem(const LogString& input, LogString& buf)
                auto ch = Transcoder::decode(input, nextCodePoint);
                if (nextCodePoint == lastCodePoint) // failed to decode input?
                {
-                       nextCodePoint = input.end();
+                       // Skip the undecodable run and keep escaping the remaining input
+                       // instead of discarding it; the run collapses to one replacement.
+                       for (++nextCodePoint; nextCodePoint != input.end(); ++nextCodePoint)
+                       {
+                               auto probe = nextCodePoint;
+                               Transcoder::decode(input, probe);
+                               if (probe != nextCodePoint) // next unit starts a decodable sequence
+                                       break;
+                       }
                        ch = 0xFFFD; // The Unicode replacement character
                }
                else if ((0xD800 <= ch && ch <= 0xDFFF) || 0x10FFFF < ch)
diff --git a/src/main/cpp/transform.cpp b/src/main/cpp/transform.cpp
index 41206c83..e5cec926 100644
--- a/src/main/cpp/transform.cpp
+++ b/src/main/cpp/transform.cpp
@@ -45,7 +45,17 @@ void appendValidCharacters(LogString& buf, const LogString& input, CharProcessor
                auto lastCodePoint = nextCodePoint;
                auto ch = Transcoder::decode(input, nextCodePoint);
                if (nextCodePoint == lastCodePoint) // failed to decode input?
-                       nextCodePoint = input.end();
+               {
+                       // Skip the undecodable run and keep escaping the remaining input
+                       // instead of discarding it; the run collapses to one replacement.
+                       for (++nextCodePoint; nextCodePoint != input.end(); ++nextCodePoint)
+                       {
+                               auto probe = nextCodePoint;
+                               Transcoder::decode(input, probe);
+                               if (probe != nextCodePoint) // next unit starts a decodable sequence
+                                       break;
+                       }
+               }
                else if (0xD800 <= ch && ch <= 0xDFFF)
                {
                        // RFC 3629 §3 explicitly forbids surrogate-half values in UTF-8
@@ -140,7 +150,15 @@ void Transform::appendEscapingCDATA(
                auto ch = Transcoder::decode(input, nextCodePoint);
                if (nextCodePoint == lastCodePoint) // failed to decode input?
                {
-                       nextCodePoint = input.end();
+                       // Skip the undecodable run and keep escaping the remaining input
+                       // instead of discarding it; the run collapses to one replacement.
+                       for (++nextCodePoint; nextCodePoint != input.end(); ++nextCodePoint)
+                       {
+                               auto probe = nextCodePoint;
+                               Transcoder::decode(input, probe);
+                               if (probe != nextCodePoint) // next unit starts a decodable sequence
+                                       break;
+                       }
                        ch = 0xFFFD; // The Unicode replacement character
                }
                else if (CDATA_END[0] == ch && input.end() != nextCodePoint)
diff --git a/src/test/cpp/jsonlayouttest.cpp b/src/test/cpp/jsonlayouttest.cpp
index 870514cc..60b93936 100644
--- a/src/test/cpp/jsonlayouttest.cpp
+++ b/src/test/cpp/jsonlayouttest.cpp
@@ -53,6 +53,9 @@ LOGUNIT_CLASS(JSONLayoutTest), public JSONLayout
        LOGUNIT_TEST(testIgnoresThrowable);
        LOGUNIT_TEST(testAppendQuotedEscapedStringWithPrintableChars);
        LOGUNIT_TEST(testAppendQuotedEscapedStringWithControlChars);
+#if LOG4CXX_LOGCHAR_IS_UTF8
+       LOGUNIT_TEST(testAppendQuotedEscapedStringWithInvalidByte);
+#endif
        LOGUNIT_TEST(testAppendSerializedMDC);
        LOGUNIT_TEST(testAppendSerializedMDCWithPrettyPrint);
        LOGUNIT_TEST(testAppendSerializedNDC);
@@ -186,6 +189,22 @@ public:
                LOGUNIT_ASSERT_EQUAL(esc_expected, esc_escaped);
        }
 
+#if LOG4CXX_LOGCHAR_IS_UTF8
+       /**
+        * An undecodable byte must be replaced and the text following it kept,
+        * not dropped along with the rest of the input.
+        */
+       void testAppendQuotedEscapedStringWithInvalidByte()
+       {
+               logchar in[] = {'A', (logchar) 0xFF, 'B', 0};
+               LogString expected(LOG4CXX_STR("\"A\\ufffdB\""));
+               LogString actual;
+
+               appendQuotedEscapedString(actual, in);
+               LOGUNIT_ASSERT_EQUAL(expected, actual);
+       }
+#endif
+
        /**
         * Tests appendSerializedMDC.
         */

Reply via email to