This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm-ffi.git


The following commit(s) were added to refs/heads/main by this push:
     new d3b5532  fix: handle UTF-8 bytes correctly in JSON parser control character check (#442)
d3b5532 is described below

commit d3b5532fe68ad0d76dd8e8636a000629a7ec4716
Author: Ruihang Lai <[email protected]>
AuthorDate: Wed Feb 11 21:07:49 2026 -0500

    fix: handle UTF-8 bytes correctly in JSON parser control character check (#442)
    
    Cast `*cur_` to `unsigned char` before comparing against space (0x20).
    On platforms where `char` is signed, bytes >= 0x80 (valid UTF-8
    continuation/lead bytes) were misinterpreted as negative values, causing
    all non-ASCII strings to be rejected as "Invalid control character".
---
 src/ffi/extra/json_parser.cc        | 8 ++++++--
 tests/cpp/extra/test_json_parser.cc | 6 ++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/ffi/extra/json_parser.cc b/src/ffi/extra/json_parser.cc
index a1c9ae2..117ef54 100644
--- a/src/ffi/extra/json_parser.cc
+++ b/src/ffi/extra/json_parser.cc
@@ -131,7 +131,9 @@ class JSONParserContext {
         ++cur_;
         return true;
       }
-      if (*cur_ < ' ' || *cur_ == '\\') {
+      // Use uint8_t: on platforms where char is signed, raw UTF-8 bytes (>= 0x80)
+      // would be negative and incorrectly treated as control characters.
+      if (*reinterpret_cast<const uint8_t*>(cur_) < ' ' || *cur_ == '\\') {
         // fallback to full string handling
         return this->NextStringWithFullHandling(out, start_pos);
       }
@@ -341,7 +343,9 @@ class JSONParserContext {
     // copy over the prefix that was already parsed
     std::string out_str(start_pos + 1, cur_ - start_pos - 1);
     while (cur_ != end_) {
-      if (*cur_ < ' ') {
+      // Use uint8_t: on platforms where char is signed, raw UTF-8 bytes (>= 0x80)
+      // would be negative and incorrectly treated as control characters.
+      if (*reinterpret_cast<const uint8_t*>(cur_) < ' ') {
         this->SetErrorInvalidControlCharacter();
         return false;
       }
diff --git a/tests/cpp/extra/test_json_parser.cc b/tests/cpp/extra/test_json_parser.cc
index 4e134ff..49d2c34 100644
--- a/tests/cpp/extra/test_json_parser.cc
+++ b/tests/cpp/extra/test_json_parser.cc
@@ -290,6 +290,12 @@ TEST(JSONParser, UnicodeEdgeCases) {
             u8"\U0001F600\U0001F601");
 }
 
+TEST(JSONParser, RawUTF8Bytes) {
+  // Regression test: raw UTF-8 bytes (>= 0x80) must not be rejected as control characters.
+  // This failed when the parser used signed char comparison: *cur_ < ' '
+  EXPECT_EQ(json::Parse("\"\xE4\xB8\xAD\xE6\x96\x87\"").cast<String>(), 
"\xE4\xB8\xAD\xE6\x96\x87");
+}
+
 TEST(JSONParser, LargeInputs) {
   // Test large array
   std::string large_array = "[";

Reply via email to