This is an automated email from the ASF dual-hosted git repository.
swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
The following commit(s) were added to refs/heads/master by this push:
new 5d328e99 Handle U+FFFF correctly in decodeUTF8 recovery path (#705)
5d328e99 is described below
commit 5d328e9991c917814a8358745328c9c733facda9
Author: jmestwa-coder <[email protected]>
AuthorDate: Wed Jun 3 10:31:48 2026 +0530
Handle U+FFFF correctly in decodeUTF8 recovery path (#705)
---
src/main/cpp/transcoder.cpp | 12 +++++++++++-
src/test/cpp/helpers/transcodertestcase.cpp | 17 +++++++++++++++++
2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index 19404745..b5a9d3d0 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -46,6 +46,7 @@ void Transcoder::decodeUTF8(const std::string& src, LogString& dst)
while (iter != src.end())
{
+ std::string::const_iterator start = iter;
unsigned int sv = decode(src, iter);
if (sv != 0xFFFF)
@@ -55,7 +56,16 @@ void Transcoder::decodeUTF8(const std::string& src, LogString& dst)
else
{
dst.append(1, LOSSCHAR);
- iter++;
+
+ // decode() returns 0xFFFF both for a decode error (iter left at
+ // start) and for a successfully decoded U+FFFF (iter already
+ // advanced past EF BF BF). Only advance here in the former case,
+ // otherwise the byte following U+FFFF is skipped and, at end of
+ // input, iter is pushed past src.end().
+ if (iter == start)
+ {
+ iter++;
+ }
}
}
}
diff --git a/src/test/cpp/helpers/transcodertestcase.cpp b/src/test/cpp/helpers/transcodertestcase.cpp
index 7887b82a..f405bb06 100644
--- a/src/test/cpp/helpers/transcodertestcase.cpp
+++ b/src/test/cpp/helpers/transcodertestcase.cpp
@@ -70,6 +70,7 @@ LOGUNIT_CLASS(TranscoderTestCase)
LOGUNIT_TEST(testDecodeUTF8_RejectAboveMax);
LOGUNIT_TEST(testDecodeUTF8_MaxBoundary);
LOGUNIT_TEST(testDecodeUTF8_RejectInvalidLeadByte);
+ LOGUNIT_TEST(testDecodeUTF8_FFFF_KeepsFollowingByte);
LOGUNIT_TEST(testEncodeUTF16BE_BMP);
LOGUNIT_TEST(testEncodeUTF16BE_Supplementary);
LOGUNIT_TEST(testEncodeUTF16LE_Supplementary);
@@ -465,6 +466,22 @@ public:
}
}
+ /**
+ * U+FFFF (EF BF BF) is a legal three-byte sequence whose scalar value
+ * collides with the 0xFFFF error sentinel returned by Transcoder::decode.
+ * decode() consumes all three bytes and returns 0xFFFF, but decodeUTF8
+ * mistakes that for a decode failure (which leaves the iterator parked)
+ * and advances the iterator a second time. The byte following U+FFFF is
+ * therefore silently dropped. Here the trailing 'A' must survive.
+ */
+ void testDecodeUTF8_FFFF_KeepsFollowingByte()
+ {
+ std::string src("\xEF\xBF\xBF\x41"); // U+FFFF then 'A'
+ LogString out;
+ Transcoder::decodeUTF8(src, out);
+ LOGUNIT_ASSERT(out.find(LOG4CXX_STR("A")) != LogString::npos);
+ }
+
void testEncodeUTF16BE_BMP()
{
char raw[4] = { 0, 0, 0, 0 };