This is an automated email from the ASF dual-hosted git repository.
thiru pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/avro.git
The following commit(s) were added to refs/heads/main by this push:
new 00afbaeda AVRO-3860: Fix for wrong encoding of Unicode values above
0xffff (#2831)
00afbaeda is described below
commit 00afbaeda36db48cf48ec1c5faad1736add9e8f2
Author: Thiruvalluvan M G <[email protected]>
AuthorDate: Sat Apr 6 08:40:49 2024 +0530
AVRO-3860: Fix for wrong encoding of Unicode values above 0xffff (#2831)
* Fix for wrong encoding of Unicode values above 0xffff
* More appropriate error message with wrong Unicode escapes
* Fixed a subtle bug in detecting surrogate codes of UTF-16 encoding
* Fixed a bug that allowed prohibited unicode values
---------
Co-authored-by: Thiruvalluvan M G <[email protected]>
---
lang/c++/impl/json/JsonIO.cc | 82 +++++++++++++++++++++++++++++++-------------
lang/c++/impl/json/JsonIO.hh | 13 ++++++-
lang/c++/test/JsonTests.cc | 1 +
3 files changed, 71 insertions(+), 25 deletions(-)
diff --git a/lang/c++/impl/json/JsonIO.cc b/lang/c++/impl/json/JsonIO.cc
index 7f002ed99..d30d1fded 100644
--- a/lang/c++/impl/json/JsonIO.cc
+++ b/lang/c++/impl/json/JsonIO.cc
@@ -314,19 +314,41 @@ JsonParser::Token JsonParser::tryString() {
}
}
+
+// Decode the given string and return contents as UTF8-encoded bytes.
+// The input does not have the enclosing double-quotes.
string JsonParser::decodeString(const string &s, bool binary) {
string result;
- const auto readNextByte = [](string::const_iterator &it, const
string::const_iterator &end) -> char {
- if (it == end)
+ auto it = s.cbegin();
+ const auto end = s.cend();
+ const auto readNextByte = [&]() -> char {
+ if (it == end) {
throw Exception("Unexpected EOF");
+ }
return *it++;
};
- auto it = s.cbegin();
- const auto end = s.cend();
+ const auto unicodeParse = [&]() {
+ uint32_t n = 0;
+ for (int i = 0; i < 4; i++) {
+ auto c = readNextByte();
+ n *= 16;
+ if (isdigit(c)) {
+ n += c - '0';
+ } else if (c >= 'a' && c <= 'f') {
+ n += c - 'a' + 10;
+ } else if (c >= 'A' && c <= 'F') {
+ n += c - 'A' + 10;
+ } else {
+ throw Exception(boost::format( "Invalid hex character: %1%") %
c);
+ }
+ }
+ return n;
+ };
while (it != end) {
- char ch = *it++;
+ string::const_iterator startSeq = it;
+ char ch = readNextByte();
if (ch == '\\') {
- ch = readNextByte(it, end);
+ ch = readNextByte();
switch (ch) {
case '"':
case '\\':
@@ -350,30 +372,42 @@ string JsonParser::decodeString(const string &s, bool
binary) {
continue;
case 'u':
case 'U': {
- uint32_t n = 0;
- char e[4];
- for (char &i : e) {
- n *= 16;
- char c = readNextByte(it, end);
- i = c;
- if (isdigit(c)) {
- n += c - '0';
- } else if (c >= 'a' && c <= 'f') {
- n += c - 'a' + 10;
- } else if (c >= 'A' && c <= 'F') {
- n += c - 'A' + 10;
- }
- }
+ uint32_t n = unicodeParse();
if (binary) {
if (n > 0xff) {
throw Exception(boost::format(
"Invalid byte for binary:
%1%%2%")
- % ch % string(e, 4));
+ % ch % string(startSeq, ++it));
} else {
result.push_back(n);
continue;
}
}
+ if (n >= 0xd800 && n < 0xdc00) {
+ ch = readNextByte();
+ if (ch != '\\') {
+ throw Exception(boost::format(
+ "Invalid unicode sequence:
%1%")
+ % string(startSeq, it));
+ }
+ ch = readNextByte();
+ if (ch != 'u' && ch != 'U') {
+ throw Exception(boost::format(
+ "Invalid unicode sequence:
%1%")
+ % string(startSeq, it));
+ }
+ uint32_t m = unicodeParse();
+ if (m < 0xdc00 || m > 0xdfff) {
+ throw Exception(boost::format(
+ "Invalid unicode sequence:
%1%")
+ % string(startSeq, it));
+ }
+ n = 0x10000 + (((n - 0xd800) << 10) | (m - 0xdc00));
+ } else if (n >= 0xdc00 && n < 0xdfff) {
+ throw Exception(boost::format(
+ "Invalid unicode sequence: %1%")
+ % string(startSeq, it));
+ }
if (n < 0x80) {
result.push_back(n);
} else if (n < 0x800) {
@@ -383,15 +417,15 @@ string JsonParser::decodeString(const string &s, bool
binary) {
result.push_back((n >> 12) | 0xe0);
result.push_back(((n >> 6) & 0x3f) | 0x80);
result.push_back((n & 0x3f) | 0x80);
- } else if (n < 110000) {
+ } else if (n < 0x110000) {
result.push_back((n >> 18) | 0xf0);
result.push_back(((n >> 12) & 0x3f) | 0x80);
result.push_back(((n >> 6) & 0x3f) | 0x80);
result.push_back((n & 0x3f) | 0x80);
} else {
throw Exception(boost::format(
- "Invalid unicode value: %1%i%2%")
- % ch % string(e, 4));
+ "Invalid unicode value: %1%%2%")
+ % n % string(startSeq, ++it));
}
}
continue;
diff --git a/lang/c++/impl/json/JsonIO.hh b/lang/c++/impl/json/JsonIO.hh
index 94889e5d0..447c0b0df 100644
--- a/lang/c++/impl/json/JsonIO.hh
+++ b/lang/c++/impl/json/JsonIO.hh
@@ -263,12 +263,23 @@ class AVRO_DECL JsonGenerator {
out_.write(toHex((static_cast<unsigned char>(c)) % 16));
}
- void escapeUnicode(uint32_t c) {
+ void escapeUnicode16(uint32_t c) {
out_.write('\\');
out_.write('u');
writeHex((c >> 8) & 0xff);
writeHex(c & 0xff);
}
+ void escapeUnicode(uint32_t c) {
+ if (c < 0x10000) {
+ escapeUnicode16(c);
+ } else if (c < 0x110000) {
+ c -= 0x10000;
+ escapeUnicode16(((c >> 10) & 0x3ff) | 0xd800);
+ escapeUnicode16((c & 0x3ff) | 0xdc00);
+ } else {
+ throw Exception(boost::format("Invalid code-point: %1%") % c);
+ }
+ }
void doEncodeString(const char *b, size_t len, bool binary) {
const char *e = b + len;
out_.write('"');
diff --git a/lang/c++/test/JsonTests.cc b/lang/c++/test/JsonTests.cc
index 10a100540..f65839058 100644
--- a/lang/c++/test/JsonTests.cc
+++ b/lang/c++/test/JsonTests.cc
@@ -69,6 +69,7 @@ TestData<const char *> stringData[] = {
{R"("\u20ac")", EntityType::String, "\xe2\x82\xac", R"("\u20ac")"},
{R"("\u03c0")", EntityType::String, "\xcf\x80", R"("\u03c0")"},
{R"("hello\n")", EntityType::String, "hello\n", R"("hello\n")"},
+ {R"("\Ud8ab\udccd")", EntityType::String, "\xf0\xba\xb3\x8d",
R"("\ud8ab\udccd")"},
};
void testBool(const TestData<bool> &d) {