This is an automated email from the ASF dual-hosted git repository. swebb2066 pushed a commit to branch fix_locale in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
commit c17e8ea48ffff383f7048dc49ca9cbc0691d3317 Author: Stephen Webb <[email protected]> AuthorDate: Sat Aug 12 15:56:39 2023 +1000 Implement a functional 'locale' character encoding --- src/main/cpp/charsetdecoder.cpp | 63 +++++++++++++------------ src/main/cpp/charsetencoder.cpp | 54 ++++++++++----------- src/test/cpp/decodingtest.cpp | 4 ++ src/test/cpp/helpers/charsetdecodertestcase.cpp | 38 +++++++++++++-- src/test/cpp/helpers/charsetencodertestcase.cpp | 46 ++++++++++++++++++ 5 files changed, 142 insertions(+), 63 deletions(-) diff --git a/src/main/cpp/charsetdecoder.cpp b/src/main/cpp/charsetdecoder.cpp index 26a099df..80fa78bf 100644 --- a/src/main/cpp/charsetdecoder.cpp +++ b/src/main/cpp/charsetdecoder.cpp @@ -433,11 +433,13 @@ class LocaleCharsetDecoder : public CharsetDecoder virtual log4cxx_status_t decode(ByteBuffer& in, LogString& out) { + log4cxx_status_t result = APR_SUCCESS; const char* p = in.current(); size_t i = in.position(); + size_t remain = in.limit() - i; #if !LOG4CXX_CHARSET_EBCDIC - for (; i < in.limit() && ((unsigned int) *p) < 0x80; i++, p++) + for (; 0 < remain && ((unsigned int) *p) < 0x80; --remain, ++i, p++) { out.append(1, *p); } @@ -445,40 +447,36 @@ class LocaleCharsetDecoder : public CharsetDecoder in.position(i); #endif - if (i < in.limit()) + if (0 < remain) { - Pool subpool; - const char* enc = apr_os_locale_encoding(subpool.getAPRPool()); + std::mbstate_t state = {}; + while (0 < remain) { - std::unique_lock<std::mutex> lock(mutex); - - if (enc == 0) + wchar_t ch; + size_t n = std::mbrtowc(&ch, p, remain, &state); + if (0 == n) + { + ++i; + break; + } + if (static_cast<std::size_t>(-1) == n) { - if (decoder == 0) - { - encoding = "C"; - decoder.reset( new USASCIICharsetDecoder() ); - } + result = APR_BADARG; + break; } - else if (encoding != enc) + if (static_cast<std::size_t>(-2) == n) { - encoding = enc; - - try - { - LOG4CXX_DECODE_CHAR(e, encoding); - decoder = getDecoder(e); - } - catch (IllegalArgumentException&) - { - decoder.reset( new USASCIICharsetDecoder() ); - } + break; } + Transcoder::encode(static_cast<unsigned int>(ch), out); + remain -= n; + i += n; + p += n; } - return decoder->decode(in, out); + in.position(i); } - return APR_SUCCESS; + return result; } private: Pool pool; @@ -561,7 +559,8 @@ CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder() CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset) { if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8")) || - StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8"))) + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")) || + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001"))) { return std::make_shared<UTF8CharsetDecoder>(); } @@ -569,15 +568,21 @@ CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset) charset == LOG4CXX_STR("646") || StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("US-ASCII"), LOG4CXX_STR("us-ascii")) || StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO646-US"), LOG4CXX_STR("iso646-US")) || - StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968"))) + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")) || + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP20127"), LOG4CXX_STR("cp20127"))) { return std::make_shared<USASCIICharsetDecoder>(); } else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-8859-1"), LOG4CXX_STR("iso-8859-1")) || - StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1"))) + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")) || + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1252"), LOG4CXX_STR("cp1252"))) { return std::make_shared<ISOLatinCharsetDecoder>(); } + else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("LOCALE"), LOG4CXX_STR("locale"))) + { + return std::make_shared<LocaleCharsetDecoder>(); + } #if APR_HAS_XLATE return std::make_shared<APRCharsetDecoder>(charset); diff --git a/src/main/cpp/charsetencoder.cpp b/src/main/cpp/charsetencoder.cpp index 010a6ec1..5132448c 100644 --- a/src/main/cpp/charsetencoder.cpp +++ b/src/main/cpp/charsetencoder.cpp @@ -462,6 +462,7 @@ class LocaleCharsetEncoder : public CharsetEncoder LogString::const_iterator& iter, ByteBuffer& out) { + log4cxx_status_t result = APR_SUCCESS; #if !LOG4CXX_CHARSET_EBCDIC char* current = out.current(); size_t remain = out.remaining(); @@ -478,38 +479,23 @@ class LocaleCharsetEncoder : public CharsetEncoder if (iter != in.end() && out.remaining() > 0) { - Pool subpool; - const char* enc = apr_os_locale_encoding(subpool.getAPRPool()); + std::mbstate_t state = {}; + while (iter != in.end() && MB_CUR_MAX <= remain) { - std::unique_lock<std::mutex> lock(mutex); - - if (enc == 0) - { - if (encoder == 0) - { - encoding = "C"; - encoder.reset( new USASCIICharsetEncoder() ); - } - } - else if (encoding != enc) + auto ch = Transcoder::decode(in, iter); + auto n = std::wcrtomb(current, ch, &state); + if (static_cast<std::size_t>(-1) == n) { - encoding = enc; - LOG4CXX_DECODE_CHAR(ename, encoding); - - try - { - encoder = CharsetEncoder::getEncoder(ename); - } - catch (IllegalArgumentException&) - { - encoder.reset( new USASCIICharsetEncoder() ); - } + result = APR_BADARG; + break; } + remain -= n; + current += n; } - return encoder->encode(in, iter, out); + out.position(current - out.data()); } - return APR_SUCCESS; + return result; } private: @@ -578,7 +564,8 @@ CharsetEncoderPtr CharsetEncoder::getUTF8Encoder() CharsetEncoderPtr CharsetEncoder::getEncoder(const LogString& charset) { - if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8"))) + if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8")) + || StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001"))) { return std::make_shared<UTF8CharsetEncoder>(); } @@ -586,17 +573,20 @@ CharsetEncoderPtr CharsetEncoder::getEncoder(const LogString& charset) charset == LOG4CXX_STR("646") || StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("US-ASCII"), LOG4CXX_STR("us-ascii")) || StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO646-US"), LOG4CXX_STR("iso646-US")) || - StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968"))) + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")) || + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP20127"), LOG4CXX_STR("cp20127"))) { return std::make_shared<USASCIICharsetEncoder>(); } else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-8859-1"), LOG4CXX_STR("iso-8859-1")) || - StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1"))) + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")) || + StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1252"), LOG4CXX_STR("cp1252"))) { return std::make_shared<ISOLatinCharsetEncoder>(); } else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-16BE"), LOG4CXX_STR("utf-16be")) - || StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-16"), LOG4CXX_STR("utf-16"))) + || StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-16"), LOG4CXX_STR("utf-16")) + || StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1200"), LOG4CXX_STR("cp1200"))) { return std::make_shared<UTF16BECharsetEncoder>(); } @@ -604,6 +594,10 @@ CharsetEncoderPtr CharsetEncoder::getEncoder(const LogString& charset) { return std::make_shared<UTF16LECharsetEncoder>(); } + else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("LOCALE"), LOG4CXX_STR("locale"))) + { + return std::make_shared<LocaleCharsetEncoder>(); + } #if APR_HAS_XLATE return std::make_shared<APRCharsetEncoder>(charset); diff --git a/src/test/cpp/decodingtest.cpp b/src/test/cpp/decodingtest.cpp index 37c00a45..05760ed2 100644 --- a/src/test/cpp/decodingtest.cpp +++ b/src/test/cpp/decodingtest.cpp @@ -27,6 +27,10 @@ #include <log4cxx/logstring.h> #include "logunit.h" + +#define LOG4CXX_TEST 1 +#include <log4cxx/private/log4cxx_private.h> + // // If there is no support for wchar_t logging then // there is not a consistent way to get the test characters compared. diff --git a/src/test/cpp/helpers/charsetdecodertestcase.cpp b/src/test/cpp/helpers/charsetdecodertestcase.cpp index 9fc58a14..ba7539db 100644 --- a/src/test/cpp/helpers/charsetdecodertestcase.cpp +++ b/src/test/cpp/helpers/charsetdecodertestcase.cpp @@ -34,7 +34,8 @@ LOGUNIT_CLASS(CharsetDecoderTestCase) LOGUNIT_TEST_SUITE(CharsetDecoderTestCase); LOGUNIT_TEST(decode1); LOGUNIT_TEST(decode2); - LOGUNIT_TEST(decode8); + LOGUNIT_TEST(decode3); + LOGUNIT_TEST(decode4); LOGUNIT_TEST_SUITE_END(); enum { BUFSIZE = 256 }; @@ -83,9 +84,7 @@ public: LOGUNIT_ASSERT_EQUAL(LogString(LOG4CXX_STR("Hello")), greeting.substr(BUFSIZE - 3)); } - - - void decode8() + void decode3() { char buf[] = { 'H', 'e', 'l', 'l', 'o', ',', 0, 'W', 'o', 'r', 'l', 'd'}; ByteBuffer src(buf, 12); @@ -103,6 +102,37 @@ public: LOGUNIT_ASSERT_EQUAL(LogString(expected, 12), greeting); } + void decode4() + { + char utf8_greet[] = { 'A', + (char) 0xD8, (char) 0x85, + (char) 0xD4, (char) 0xB0, + (char) 0xE0, (char) 0xA6, (char) 0x86, + (char) 0xE4, (char) 0xB8, (char) 0x83, + (char) 0xD0, (char) 0x80, + 0 + }; +#if LOG4CXX_LOGCHAR_IS_WCHAR || LOG4CXX_LOGCHAR_IS_UNICHAR + // arbitrary, hopefully meaningless, characters from + // Latin, Arabic, Armenian, Bengali, CJK and Cyrillic + const logchar greet[] = { L'A', 0x0605, 0x0530, 0x986, 0x4E03, 0x400, 0 }; +#endif + +#if LOG4CXX_LOGCHAR_IS_UTF8 + const logchar* greet = utf8_greet; +#endif + + std::locale::global(std::locale("en_US.UTF-8")); + auto dec = CharsetDecoder::getDecoder(LOG4CXX_STR("locale")); + + ByteBuffer in(utf8_greet, sizeof (utf8_greet)); + LogString out; + log4cxx_status_t stat = dec->decode(in, out); + LOGUNIT_ASSERT_EQUAL(false, CharsetDecoder::isError(stat)); + stat = dec->decode(in, out); + LOGUNIT_ASSERT_EQUAL(false, CharsetDecoder::isError(stat)); + LOGUNIT_ASSERT(out == greet); + } }; diff --git a/src/test/cpp/helpers/charsetencodertestcase.cpp b/src/test/cpp/helpers/charsetencodertestcase.cpp index 8cc01d93..b44a6e4c 100644 --- a/src/test/cpp/helpers/charsetencodertestcase.cpp +++ b/src/test/cpp/helpers/charsetencodertestcase.cpp @@ -35,6 +35,7 @@ LOGUNIT_CLASS(CharsetEncoderTestCase) LOGUNIT_TEST(encode2); LOGUNIT_TEST(encode3); LOGUNIT_TEST(encode4); + LOGUNIT_TEST(encode5); #if APR_HAS_THREADS LOGUNIT_TEST(thread1); #endif @@ -178,6 +179,51 @@ public: LOGUNIT_ASSERT(iter == greeting.end()); } + void encode5() + { + const char utf8_greet[] = { 'A', + (char) 0xD8, (char) 0x85, + (char) 0xD4, (char) 0xB0, + (char) 0xE0, (char) 0xA6, (char) 0x86, + (char) 0xE4, (char) 0xB8, (char) 0x83, + (char) 0xD0, (char) 0x80, + 0 + }; +#if LOG4CXX_LOGCHAR_IS_WCHAR || LOG4CXX_LOGCHAR_IS_UNICHAR + // arbitrary, hopefully meaningless, characters from + // Latin, Arabic, Armenian, Bengali, CJK and Cyrillic + const logchar greet[] = { L'A', 0x0605, 0x0530, 0x986, 0x4E03, 0x400, 0 }; +#endif + +#if LOG4CXX_LOGCHAR_IS_UTF8 + const logchar* greet = utf8_greet; +#endif + LogString greeting(greet); + + std::locale::global(std::locale("en_US.UTF-8")); + auto enc = CharsetEncoder::getEncoder(LOG4CXX_STR("locale")); + + char buf[BUFSIZE]; + ByteBuffer out(buf, BUFSIZE); + LogString::const_iterator iter = greeting.begin(); + log4cxx_status_t stat = enc->encode(greeting, iter, out); + LOGUNIT_ASSERT_EQUAL(false, CharsetEncoder::isError(stat)); + stat = enc->encode(greeting, iter, out); + LOGUNIT_ASSERT_EQUAL(false, CharsetEncoder::isError(stat)); + + out.flip(); + LOGUNIT_ASSERT_EQUAL((size_t) 13, out.limit()); + + for (size_t i = 0; i < out.limit(); i++) + { + unsigned expected = (unsigned)utf8_greet[i]; + unsigned actual = (unsigned)out.data()[i]; + LOGUNIT_ASSERT_EQUAL(expected, actual); + } + + LOGUNIT_ASSERT(iter == greeting.end()); + } + #if APR_HAS_THREADS class ThreadPackage {
