This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git


The following commit(s) were added to refs/heads/master by this push:
     new 5d328e99 Handle U+FFFF correctly in decodeUTF8 recovery path (#705)
5d328e99 is described below

commit 5d328e9991c917814a8358745328c9c733facda9
Author: jmestwa-coder <[email protected]>
AuthorDate: Wed Jun 3 10:31:48 2026 +0530

    Handle U+FFFF correctly in decodeUTF8 recovery path (#705)
---
 src/main/cpp/transcoder.cpp                 | 12 +++++++++++-
 src/test/cpp/helpers/transcodertestcase.cpp | 17 +++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index 19404745..b5a9d3d0 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -46,6 +46,7 @@ void Transcoder::decodeUTF8(const std::string& src, LogString& dst)
 
        while (iter != src.end())
        {
+               std::string::const_iterator start = iter;
                unsigned int sv = decode(src, iter);
 
                if (sv != 0xFFFF)
@@ -55,7 +56,16 @@ void Transcoder::decodeUTF8(const std::string& src, LogString& dst)
                else
                {
                        dst.append(1, LOSSCHAR);
-                       iter++;
+
+                       // decode() returns 0xFFFF both for a decode error (iter left at
+                       // start) and for a successfully decoded U+FFFF (iter already
+                       // advanced past EF BF BF).  Only advance here in the former case,
+                       // otherwise the byte following U+FFFF is skipped and, at end of
+                       // input, iter is pushed past src.end().
+                       if (iter == start)
+                       {
+                               iter++;
+                       }
                }
        }
 }
diff --git a/src/test/cpp/helpers/transcodertestcase.cpp b/src/test/cpp/helpers/transcodertestcase.cpp
index 7887b82a..f405bb06 100644
--- a/src/test/cpp/helpers/transcodertestcase.cpp
+++ b/src/test/cpp/helpers/transcodertestcase.cpp
@@ -70,6 +70,7 @@ LOGUNIT_CLASS(TranscoderTestCase)
        LOGUNIT_TEST(testDecodeUTF8_RejectAboveMax);
        LOGUNIT_TEST(testDecodeUTF8_MaxBoundary);
        LOGUNIT_TEST(testDecodeUTF8_RejectInvalidLeadByte);
+       LOGUNIT_TEST(testDecodeUTF8_FFFF_KeepsFollowingByte);
        LOGUNIT_TEST(testEncodeUTF16BE_BMP);
        LOGUNIT_TEST(testEncodeUTF16BE_Supplementary);
        LOGUNIT_TEST(testEncodeUTF16LE_Supplementary);
@@ -465,6 +466,22 @@ public:
                }
        }
 
+       /**
+        * U+FFFF (EF BF BF) is a legal three-byte sequence whose scalar value
+        * collides with the 0xFFFF error sentinel returned by Transcoder::decode.
+        * decode() consumes all three bytes and returns 0xFFFF, but decodeUTF8
+        * mistakes that for a decode failure (which leaves the iterator parked)
+        * and advances the iterator a second time. The byte following U+FFFF is
+        * therefore silently dropped. Here the trailing 'A' must survive.
+        */
+       void testDecodeUTF8_FFFF_KeepsFollowingByte()
+       {
+               std::string src("\xEF\xBF\xBF\x41"); // U+FFFF then 'A'
+               LogString out;
+               Transcoder::decodeUTF8(src, out);
+               LOGUNIT_ASSERT(out.find(LOG4CXX_STR("A")) != LogString::npos);
+       }
+
        void testEncodeUTF16BE_BMP()
        {
                char raw[4] = { 0, 0, 0, 0 };

Reply via email to