This is an automated email from the ASF dual-hosted git repository.
swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
The following commit(s) were added to refs/heads/master by this push:
new b0671d81 Decode input after an undecodable byte in XML/JSON layouts
(#701)
b0671d81 is described below
commit b0671d81d195fae2633574dfc70bb3615dfe02ed
Author: metsw24-max <[email protected]>
AuthorDate: Wed Jun 3 06:44:52 2026 +0530
Decode input after an undecodable byte in XML/JSON layouts (#701)
---
src/main/cpp/jsonlayout.cpp | 10 +++++++++-
src/main/cpp/transform.cpp | 22 ++++++++++++++++++++--
src/test/cpp/jsonlayouttest.cpp | 19 +++++++++++++++++++
3 files changed, 48 insertions(+), 3 deletions(-)
diff --git a/src/main/cpp/jsonlayout.cpp b/src/main/cpp/jsonlayout.cpp
index 0833d031..6f5e1b86 100644
--- a/src/main/cpp/jsonlayout.cpp
+++ b/src/main/cpp/jsonlayout.cpp
@@ -226,7 +226,15 @@ void JSONLayout::appendItem(const LogString& input,
LogString& buf)
auto ch = Transcoder::decode(input, nextCodePoint);
if (nextCodePoint == lastCodePoint) // failed to decode input?
{
- nextCodePoint = input.end();
+ // Skip the undecodable run and keep escaping the
remaining input
+ // instead of discarding it; the run collapses to one
replacement.
+ for (++nextCodePoint; nextCodePoint != input.end();
++nextCodePoint)
+ {
+ auto probe = nextCodePoint;
+ Transcoder::decode(input, probe);
+ if (probe != nextCodePoint) // next unit starts
a decodable sequence
+ break;
+ }
ch = 0xFFFD; // The Unicode replacement character
}
else if ((0xD800 <= ch && ch <= 0xDFFF) || 0x10FFFF < ch)
diff --git a/src/main/cpp/transform.cpp b/src/main/cpp/transform.cpp
index 41206c83..e5cec926 100644
--- a/src/main/cpp/transform.cpp
+++ b/src/main/cpp/transform.cpp
@@ -45,7 +45,17 @@ void appendValidCharacters(LogString& buf, const LogString&
input, CharProcessor
auto lastCodePoint = nextCodePoint;
auto ch = Transcoder::decode(input, nextCodePoint);
if (nextCodePoint == lastCodePoint) // failed to decode input?
- nextCodePoint = input.end();
+ {
+ // Skip the undecodable run and keep escaping the
remaining input
+ // instead of discarding it; the run collapses to one
replacement.
+ for (++nextCodePoint; nextCodePoint != input.end();
++nextCodePoint)
+ {
+ auto probe = nextCodePoint;
+ Transcoder::decode(input, probe);
+ if (probe != nextCodePoint) // next unit starts
a decodable sequence
+ break;
+ }
+ }
else if (0xD800 <= ch && ch <= 0xDFFF)
{
// RFC 3629 §3 explicitly forbids surrogate-half values
in UTF-8
@@ -140,7 +150,15 @@ void Transform::appendEscapingCDATA(
auto ch = Transcoder::decode(input, nextCodePoint);
if (nextCodePoint == lastCodePoint) // failed to decode input?
{
- nextCodePoint = input.end();
+ // Skip the undecodable run and keep escaping the
remaining input
+ // instead of discarding it; the run collapses to one
replacement.
+ for (++nextCodePoint; nextCodePoint != input.end();
++nextCodePoint)
+ {
+ auto probe = nextCodePoint;
+ Transcoder::decode(input, probe);
+ if (probe != nextCodePoint) // next unit starts
a decodable sequence
+ break;
+ }
ch = 0xFFFD; // The Unicode replacement character
}
else if (CDATA_END[0] == ch && input.end() != nextCodePoint)
diff --git a/src/test/cpp/jsonlayouttest.cpp b/src/test/cpp/jsonlayouttest.cpp
index 870514cc..60b93936 100644
--- a/src/test/cpp/jsonlayouttest.cpp
+++ b/src/test/cpp/jsonlayouttest.cpp
@@ -53,6 +53,9 @@ LOGUNIT_CLASS(JSONLayoutTest), public JSONLayout
LOGUNIT_TEST(testIgnoresThrowable);
LOGUNIT_TEST(testAppendQuotedEscapedStringWithPrintableChars);
LOGUNIT_TEST(testAppendQuotedEscapedStringWithControlChars);
+#if LOG4CXX_LOGCHAR_IS_UTF8
+ LOGUNIT_TEST(testAppendQuotedEscapedStringWithInvalidByte);
+#endif
LOGUNIT_TEST(testAppendSerializedMDC);
LOGUNIT_TEST(testAppendSerializedMDCWithPrettyPrint);
LOGUNIT_TEST(testAppendSerializedNDC);
@@ -186,6 +189,22 @@ public:
LOGUNIT_ASSERT_EQUAL(esc_expected, esc_escaped);
}
+#if LOG4CXX_LOGCHAR_IS_UTF8
+ /**
+ * An undecodable byte must be replaced and the text following it kept,
+ * not dropped along with the rest of the input.
+ */
+ void testAppendQuotedEscapedStringWithInvalidByte()
+ {
+ logchar in[] = {'A', (logchar) 0xFF, 'B', 0};
+ LogString expected(LOG4CXX_STR("\"A\\ufffdB\""));
+ LogString actual;
+
+ appendQuotedEscapedString(actual, in);
+ LOGUNIT_ASSERT_EQUAL(expected, actual);
+ }
+#endif
+
/**
* Tests appendSerializedMDC.
*/