This is an automated email from the ASF dual-hosted git repository.

thiru pushed a commit to branch AVRO-3860 in repository https://gitbox.apache.org/repos/asf/avro.git
commit 4a3d94ff9b781f1aa4eca8048fecc4b3b80de298 Author: Thiruvalluvan M G <[email protected]> AuthorDate: Tue Apr 2 13:34:14 2024 +0530 Fix for wrong encoding of Unicode values above 0xffff --- lang/c++/impl/json/JsonIO.cc | 80 ++++++++++++++++++++++++++++++++++---------- lang/c++/impl/json/JsonIO.hh | 13 ++++++- lang/c++/test/JsonTests.cc | 1 + 3 files changed, 76 insertions(+), 18 deletions(-) diff --git a/lang/c++/impl/json/JsonIO.cc b/lang/c++/impl/json/JsonIO.cc index 62549484a..da2d85f72 100644 --- a/lang/c++/impl/json/JsonIO.cc +++ b/lang/c++/impl/json/JsonIO.cc @@ -314,11 +314,37 @@ JsonParser::Token JsonParser::tryString() { } } +static string::const_iterator unicodeParse(string::const_iterator b, string::const_iterator e, uint32_t &n) { + string::const_iterator start = b; + for (int i = 0; i < 4; i++) { + ++b; + if (b == e) { + throw Exception(boost::format( + "Invalid unicode escape: %1%") % string(start, b)); + } + n *= 16; + char c = *b; + if (isdigit(c)) { + n += c - '0'; + } else if (c >= 'a' && c <= 'f') { + n += c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + n += c - 'A' + 10; + } else { + throw Exception(boost::format( "Invalid hex character: %1%") % c); + } + } + return b; +} + +// Decode the given string and return contents as UTF8-encoded bytes. +// The input does not have the enclosing double-quotes. 
string JsonParser::decodeString(const string &s, bool binary) { string result; for (string::const_iterator it = s.begin(); it != s.end(); ++it) { char ch = *it; if (ch == '\\') { + string::const_iterator startSeq = it; ch = *++it; switch (ch) { case '"': @@ -344,29 +370,49 @@ string JsonParser::decodeString(const string &s, bool binary) { case 'u': case 'U': { uint32_t n = 0; - char e[4]; - for (char &i : e) { - n *= 16; - char c = *++it; - i = c; - if (isdigit(c)) { - n += c - '0'; - } else if (c >= 'a' && c <= 'f') { - n += c - 'a' + 10; - } else if (c >= 'A' && c <= 'F') { - n += c - 'A' + 10; - } - } + it = unicodeParse(it, s.end(), n); if (binary) { if (n > 0xff) { throw Exception(boost::format( "Invalid byte for binary: %1%%2%") - % ch % string(e, 4)); + % ch % string(startSeq, ++it)); } else { result.push_back(n); continue; } } + if (n >= 0xd800) { + ++it; + if (n > 0xdbff || it == s.end()) { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, it)); + } + if (*it != '\\') { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, ++it)); + } + ++it; + if (it == s.end()) { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, it)); + } + if (*it != 'u' && *it != 'U') { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, ++it)); + } + uint32_t m = 0; + it = unicodeParse(it, s.end(), m); + if (m < 0xdc00 || m > 0xdfff) { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, ++it)); + } + n = 0x10000 + (((n - 0xd800) << 10) | (m - 0xdc00)); + } if (n < 0x80) { result.push_back(n); } else if (n < 0x800) { @@ -376,15 +422,15 @@ string JsonParser::decodeString(const string &s, bool binary) { result.push_back((n >> 12) | 0xe0); result.push_back(((n >> 6) & 0x3f) | 0x80); result.push_back((n & 0x3f) | 0x80); - } else if (n < 110000) { + } else if (n < 0x110000) { result.push_back((n >> 18) | 
0xf0); result.push_back(((n >> 12) & 0x3f) | 0x80); result.push_back(((n >> 6) & 0x3f) | 0x80); result.push_back((n & 0x3f) | 0x80); } else { throw Exception(boost::format( - "Invalid unicode value: %1%i%2%") - % ch % string(e, 4)); + "Invalid unicode value: %1%%2%") + % n % string(startSeq, ++it)); } } continue; diff --git a/lang/c++/impl/json/JsonIO.hh b/lang/c++/impl/json/JsonIO.hh index 94889e5d0..447c0b0df 100644 --- a/lang/c++/impl/json/JsonIO.hh +++ b/lang/c++/impl/json/JsonIO.hh @@ -263,12 +263,23 @@ class AVRO_DECL JsonGenerator { out_.write(toHex((static_cast<unsigned char>(c)) % 16)); } - void escapeUnicode(uint32_t c) { + void escapeUnicode16(uint32_t c) { out_.write('\\'); out_.write('u'); writeHex((c >> 8) & 0xff); writeHex(c & 0xff); } + void escapeUnicode(uint32_t c) { + if (c < 0x10000) { + escapeUnicode16(c); + } else if (c < 0x110000) { + c -= 0x10000; + escapeUnicode16(((c >> 10) & 0x3ff) | 0xd800); + escapeUnicode16((c & 0x3ff) | 0xdc00); + } else { + throw Exception(boost::format("Invalid code-point: %1%") % c); + } + } void doEncodeString(const char *b, size_t len, bool binary) { const char *e = b + len; out_.write('"'); diff --git a/lang/c++/test/JsonTests.cc b/lang/c++/test/JsonTests.cc index da9722f30..125b6d6e9 100644 --- a/lang/c++/test/JsonTests.cc +++ b/lang/c++/test/JsonTests.cc @@ -68,6 +68,7 @@ TestData<const char *> stringData[] = { {R"("\/")", EntityType::String, "/", R"("\/")"}, {R"("\u20ac")", EntityType::String, "\xe2\x82\xac", R"("\u20ac")"}, {R"("\u03c0")", EntityType::String, "\xcf\x80", R"("\u03c0")"}, + {R"("\Ud8ab\udccd")", EntityType::String, "\xf0\xba\xb3\x8d", R"("\ud8ab\udccd")"}, }; void testBool(const TestData<bool> &d) {
