This is an automated email from the ASF dual-hosted git repository.
junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm-ffi.git
The following commit(s) were added to refs/heads/main by this push:
new d3b5532 fix: handle UTF-8 bytes correctly in JSON parser control
character check (#442)
d3b5532 is described below
commit d3b5532fe68ad0d76dd8e8636a000629a7ec4716
Author: Ruihang Lai <[email protected]>
AuthorDate: Wed Feb 11 21:07:49 2026 -0500
fix: handle UTF-8 bytes correctly in JSON parser control character check
(#442)
Cast `*cur_` to `unsigned char` before comparing against space (0x20).
On platforms where `char` is signed, bytes >= 0x80 (valid UTF-8
continuation/lead bytes) were misinterpreted as negative values, causing
all non-ASCII strings to be rejected as "Invalid control character".
---
src/ffi/extra/json_parser.cc | 8 ++++++--
tests/cpp/extra/test_json_parser.cc | 6 ++++++
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/src/ffi/extra/json_parser.cc b/src/ffi/extra/json_parser.cc
index a1c9ae2..117ef54 100644
--- a/src/ffi/extra/json_parser.cc
+++ b/src/ffi/extra/json_parser.cc
@@ -131,7 +131,9 @@ class JSONParserContext {
++cur_;
return true;
}
- if (*cur_ < ' ' || *cur_ == '\\') {
+ // Use uint8_t: on platforms where char is signed, raw UTF-8 bytes (>= 0x80)
+ // would be negative and incorrectly treated as control characters.
+ if (*reinterpret_cast<const uint8_t*>(cur_) < ' ' || *cur_ == '\\') {
// fallback to full string handling
return this->NextStringWithFullHandling(out, start_pos);
}
@@ -341,7 +343,9 @@ class JSONParserContext {
// copy over the prefix that was already parsed
std::string out_str(start_pos + 1, cur_ - start_pos - 1);
while (cur_ != end_) {
- if (*cur_ < ' ') {
+ // Use uint8_t: on platforms where char is signed, raw UTF-8 bytes (>= 0x80)
+ // would be negative and incorrectly treated as control characters.
+ if (*reinterpret_cast<const uint8_t*>(cur_) < ' ') {
this->SetErrorInvalidControlCharacter();
return false;
}
diff --git a/tests/cpp/extra/test_json_parser.cc b/tests/cpp/extra/test_json_parser.cc
index 4e134ff..49d2c34 100644
--- a/tests/cpp/extra/test_json_parser.cc
+++ b/tests/cpp/extra/test_json_parser.cc
@@ -290,6 +290,12 @@ TEST(JSONParser, UnicodeEdgeCases) {
u8"\U0001F600\U0001F601");
}
+TEST(JSONParser, RawUTF8Bytes) {
+ // Regression test: raw UTF-8 bytes (>= 0x80) must not be rejected as control characters.
+ // This failed when the parser used signed char comparison: *cur_ < ' '
+ EXPECT_EQ(json::Parse("\"\xE4\xB8\xAD\xE6\x96\x87\"").cast<String>(), "\xE4\xB8\xAD\xE6\x96\x87");
+}
+
TEST(JSONParser, LargeInputs) {
// Test large array
std::string large_array = "[";