This is an automated email from the ASF dual-hosted git repository.
swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
The following commit(s) were added to refs/heads/master by this push:
new 46037d01 Fix UTF-16 supplementary character encoding (#659)
46037d01 is described below
commit 46037d0143682257de2d8095cd79dfdd8a16b19a
Author: metsw24-max <[email protected]>
AuthorDate: Tue May 12 10:22:45 2026 +0530
Fix UTF-16 supplementary character encoding (#659)
---
src/main/cpp/transcoder.cpp | 4 +--
src/test/cpp/helpers/transcodertestcase.cpp | 42 +++++++++++++++++++++++++++++
2 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/src/main/cpp/transcoder.cpp b/src/main/cpp/transcoder.cpp
index 82f20c19..02eb520b 100644
--- a/src/main/cpp/transcoder.cpp
+++ b/src/main/cpp/transcoder.cpp
@@ -165,7 +165,7 @@ size_t Transcoder::encodeUTF16BE(unsigned int ch, char* dst)
unsigned char w = (unsigned char) ((ch >> 16) - 1);
dst[0] = (char) (0xD8 + (w >> 2));
dst[1] = (char) (((w & 0x03) << 6) + ((ch >> 10) & 0x3F));
- dst[2] = (char) (0xDC + ((ch & 0x30) >> 4));
+ dst[2] = (char) (0xDC + ((ch >> 8) & 0x03));
dst[3] = (char) (ch & 0xFF);
return 4;
}
@@ -194,7 +194,7 @@ size_t Transcoder::encodeUTF16LE(unsigned int ch, char* dst)
unsigned char w = (unsigned char) ((ch >> 16) - 1);
dst[1] = (char) (0xD8 + (w >> 2));
dst[0] = (char) (((w & 0x03) << 6) + ((ch >> 10) & 0x3F));
- dst[3] = (char) (0xDC + ((ch & 0x30) >> 4));
+ dst[3] = (char) (0xDC + ((ch >> 8) & 0x03));
dst[2] = (char) (ch & 0xFF);
return 4;
}
diff --git a/src/test/cpp/helpers/transcodertestcase.cpp b/src/test/cpp/helpers/transcodertestcase.cpp
index 23a5caa5..47904456 100644
--- a/src/test/cpp/helpers/transcodertestcase.cpp
+++ b/src/test/cpp/helpers/transcodertestcase.cpp
@@ -16,6 +16,7 @@
*/
#include <log4cxx/helpers/transcoder.h>
+#include <log4cxx/helpers/bytebuffer.h>
#include "../insertwide.h"
#include "../logunit.h"
@@ -63,6 +64,9 @@ LOGUNIT_CLASS(TranscoderTestCase)
LOGUNIT_TEST(testDecodeUTF8_2);
LOGUNIT_TEST(testDecodeUTF8_3);
LOGUNIT_TEST(testDecodeUTF8_4);
+ LOGUNIT_TEST(testEncodeUTF16BE_BMP);
+ LOGUNIT_TEST(testEncodeUTF16BE_Supplementary);
+ LOGUNIT_TEST(testEncodeUTF16LE_Supplementary);
#if LOG4CXX_UNICHAR_API
LOGUNIT_TEST(udecode2);
LOGUNIT_TEST(udecode4);
@@ -312,6 +316,44 @@ public:
LOGUNIT_ASSERT_EQUAL(true, iter == out.end());
}
+ void testEncodeUTF16BE_BMP()
+ {
+ char raw[4] = { 0, 0, 0, 0 };
+ ByteBuffer buf(raw, sizeof(raw));
+ Transcoder::encodeUTF16BE(0x4E03, buf); // CJK 七
+ LOGUNIT_ASSERT_EQUAL((size_t) 2, buf.position());
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0x4E, (unsigned char) raw[0]);
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0x03, (unsigned char) raw[1]);
+ }
+
+ // U+1F600 (GRINNING FACE) encodes to UTF-16BE as D8 3D DE 00.
+ // Before the fix the low surrogate's high byte was derived from bits 4-5
+ // of the code point, yielding 0xDC here instead of 0xDE — corrupting the
+ // pair into two unpaired surrogates.
+ void testEncodeUTF16BE_Supplementary()
+ {
+ char raw[4] = { 0, 0, 0, 0 };
+ ByteBuffer buf(raw, sizeof(raw));
+ Transcoder::encodeUTF16BE(0x1F600, buf);
+ LOGUNIT_ASSERT_EQUAL((size_t) 4, buf.position());
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0xD8, (unsigned char) raw[0]);
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0x3D, (unsigned char) raw[1]);
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0xDE, (unsigned char) raw[2]);
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0x00, (unsigned char) raw[3]);
+ }
+
+ void testEncodeUTF16LE_Supplementary()
+ {
+ char raw[4] = { 0, 0, 0, 0 };
+ ByteBuffer buf(raw, sizeof(raw));
+ Transcoder::encodeUTF16LE(0x1F600, buf);
+ LOGUNIT_ASSERT_EQUAL((size_t) 4, buf.position());
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0x3D, (unsigned char) raw[0]);
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0xD8, (unsigned char) raw[1]);
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0x00, (unsigned char) raw[2]);
+ LOGUNIT_ASSERT_EQUAL((unsigned char) 0xDE, (unsigned char) raw[3]);
+ }
+
#if LOG4CXX_UNICHAR_API
void udecode2()