(tvm) 01/02: [FFI] Lightweight json parser

tqchen Fri, 01 Aug 2025 16:40:58 -0700

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch json-small-str
in repository https://gitbox.apache.org/repos/asf/tvm.git


commit 9aad88dd4be78219e84b1ab29f362d76ca72a28b
Author: tqchen <[email protected]>
AuthorDate: Wed Jul 30 16:51:04 2025 -0400

    [FFI] Lightweight json parser
    
    This PR adds a lightweight JSON parser to the extra component.
---
 ffi/CMakeLists.txt                      |   1 +
 ffi/include/tvm/ffi/extra/json.h        |  82 ++++
 ffi/src/ffi/extra/json_parser.cc        | 692 ++++++++++++++++++++++++++++++++
 ffi/tests/cpp/extra/test_json_parser.cc | 363 +++++++++++++++++
 4 files changed, 1138 insertions(+)

diff --git a/ffi/CMakeLists.txt b/ffi/CMakeLists.txt
index 76b2901c7a..9eb6ad7663 100644
--- a/ffi/CMakeLists.txt
+++ b/ffi/CMakeLists.txt
@@ -66,6 +66,7 @@ if (TVM_FFI_USE_EXTRA_CXX_API)
   list(APPEND tvm_ffi_objs_sources
     "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/structural_equal.cc"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/structural_hash.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/json_parser.cc"
   )
 endif()
 
diff --git a/ffi/include/tvm/ffi/extra/json.h b/ffi/include/tvm/ffi/extra/json.h
new file mode 100644
index 0000000000..5522715d3f
--- /dev/null
+++ b/ffi/include/tvm/ffi/extra/json.h
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * \file tvm/ffi/extra/json.h
+ * \brief Minimal lightweight JSON parsing and serialization utilities
+ */
+#ifndef TVM_FFI_EXTRA_JSON_H_
+#define TVM_FFI_EXTRA_JSON_H_
+
+#include <tvm/ffi/any.h>
+#include <tvm/ffi/container/array.h>
+#include <tvm/ffi/container/map.h>
+#include <tvm/ffi/extra/base.h>
+
+namespace tvm {
+namespace ffi {
+namespace json {
+
+/*!
+ * \brief Alias Any as the JSON value type.
+ *
+ * To keep things lightweight, we simply reuse the ffi::Any system
+ * instead of introducing a dedicated JSON value class.
+ */
+using Value = Any;
+
+/*!
+ * \brief Alias Map<Any, Any> as the JSON object type.
+ * \note We use Map<Any, Any> instead of Map<String, Any> to avoid
+ *       the overhead of checking every key when converting from Any;
+ *       the key type is instead checked at runtime when each key is read.
+ */
+using Object = ffi::Map<Any, Any>;
+
+/*! \brief Alias Array<Any> as the JSON array type. */
+using Array = ffi::Array<Any>;
+
+/*!
+ * \brief Parse a JSON string into an Any value.
+ *
+ * Besides the standard JSON syntax, this function also supports:
+ * - Infinity/-Infinity/NaN in javascript syntax
+ * - int64 integer values (integers that fit int64 are returned as int64
+ *   rather than double)
+ *
+ * Error reporting depends on error_msg:
+ * - If error_msg is nullptr, a ValueError is thrown when parsing fails.
+ * - If error_msg is not nullptr, no exception is thrown for syntax errors;
+ *   on failure the message is written to error_msg and a null value is
+ *   returned, on success error_msg is set to the empty string.
+ *
+ * \param json_str The JSON string to parse.
+ * \param error_msg The output error message, can be nullptr.
+ *
+ * \return The parsed Any value.
+ * \note This function is optional and will be removed in the future.
+ */
+TVM_FFI_EXTRA_CXX_API json::Value Parse(const String& json_str, String* 
error_msg = nullptr);
+
+/*!
+ * \brief Serialize an Any value into a JSON string.
+ *
+ * \param value The Any value to serialize.
+ * \param indent Indentation setting, defaults to -1.
+ *        NOTE(review): the implementation is not part of this patch;
+ *        presumably the number of spaces per nesting level, with -1
+ *        selecting compact single-line output -- confirm.
+ * \return The output JSON string.
+ */
+TVM_FFI_EXTRA_CXX_API String Stringify(const json::Value& value, int indent = 
-1);
+
+}  // namespace json
+}  // namespace ffi
+}  // namespace tvm
+#endif  // TVM_FFI_EXTRA_JSON_H_
diff --git a/ffi/src/ffi/extra/json_parser.cc b/ffi/src/ffi/extra/json_parser.cc
new file mode 100644
index 0000000000..bf503b235f
--- /dev/null
+++ b/ffi/src/ffi/extra/json_parser.cc
@@ -0,0 +1,692 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * \file src/ffi/extra/json_parser.cc
+ *
+ * \brief A minimalistic JSON parser implementation.
+ */
+#include <tvm/ffi/any.h>
+#include <tvm/ffi/container/array.h>
+#include <tvm/ffi/container/map.h>
+#include <tvm/ffi/error.h>
+#include <tvm/ffi/extra/json.h>
+#include <tvm/ffi/reflection/registry.h>
+#include <tvm/ffi/string.h>
+
+#include <cerrno>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace tvm {
+namespace ffi {
+namespace json {
+
+/*!
+ * \brief Helper class to parse a JSON string.
+ *
+ * The context is a cursor over the character range [begin, end). It tracks
+ * the current position, line information used in error messages, and the
+ * most recent error message. Leaf-level string/number parsing is also kept
+ * in the context.
+ *
+ * \note The context only stores raw pointers; the caller must keep the
+ *       underlying buffer alive while the context is in use.
+ */
+class JSONParserContext {
+ public:
+  JSONParserContext(const char* begin, const char* end)
+      : begin_(begin), cur_(begin), end_(end), last_line_begin_(begin) {}
+
+  /*!
+   * \brief Peek the current character.
+   * \return The current character as an unsigned byte value,
+   *         or -1 if the end of the string is reached.
+   */
+  int Peek() const {
+    return (cur_ != end_ ? static_cast<int>(*reinterpret_cast<const uint8_t*>(cur_)) : -1);
+  }
+
+  /*!
+   * \brief Skip the next char that we know is not a space
+   *
+   * \note Caller must explicitly call SkipSpaces first or use
+   *       Peek already that confirms char is not any space char
+   *       (and that the end of the input is not reached).
+   */
+  void SkipNextAssumeNoSpace() { ++cur_; }
+
+  /*!
+   * \brief Get the current position.
+   * \return The current position.
+   */
+  const char* GetCurrentPos() const { return cur_; }
+
+  /*!
+   * \brief Set the current position for better error message
+   * \param pos The new position.
+   * \note implementation can do it as no-op if needed
+   */
+  void SetCurrentPosForBetterErrorMsg(const char* pos) { cur_ = pos; }
+
+  /*!
+   * \brief Skip the space characters (' ', '\t', '\n', '\r').
+   *
+   * Stops at the first non-space character or at the end of the input,
+   * and updates the line bookkeeping for every newline that is skipped.
+   *
+   * \note Reaching the end of the input is not reported here;
+   *       callers detect it through Peek() returning -1.
+   */
+  void SkipSpaces() {
+    while (cur_ != end_) {
+      const char c = *cur_;
+      if (c == '\n') {
+        ++line_counter_;
+        last_line_begin_ = cur_ + 1;
+      } else if (c != ' ' && c != '\t' && c != '\r') {
+        break;
+      }
+      ++cur_;
+    }
+  }
+
+  /*!
+   * \brief Check if the next characters match the given pattern.
+   * \param pattern The pattern to match.
+   * \param len The length of the pattern.
+   * \return True if the next characters match the given pattern, false otherwise.
+   * \note The cursor is advanced past every character that matched, also
+   *       when the overall match fails; callers that need the original
+   *       position must restore it themselves.
+   */
+  bool MatchLiteral(const char* pattern, int len) {
+    const char* pend = pattern + len;
+    const char* ptr = pattern;
+    for (; ptr != pend && cur_ != end_; ++ptr, ++cur_) {
+      if (*ptr != *cur_) {
+        return false;
+      }
+    }
+    // we get to the end of the pattern and match is successful
+    return ptr == pend;
+  }
+
+  /*!
+   * \brief Parse the next string starting with a double quote.
+   * \param out The output string.
+   * \return Whether the next string parsing is successful.
+   * \note The caller must have confirmed (e.g. via Peek) that the current
+   *       character is a double quote.
+   */
+  bool NextString(json::Value* out) {
+    // NOTE: we keep string parsing logic here so that simple strings
+    // without escape sequences can be copied out directly from the input.
+    const char* start_pos = cur_;
+    TVM_FFI_ICHECK(*cur_ == '\"');
+    // skip first double quote
+    ++cur_;
+    // the loop focuses on simple string without escape characters
+    for (; cur_ != end_; ++cur_) {
+      if (*cur_ == '\"') {
+        *out = String(start_pos + 1, cur_ - start_pos - 1);
+        ++cur_;
+        return true;
+      }
+      // The control character test must look at the unsigned byte value:
+      // bytes >= 0x80 of UTF-8 sequences are negative when char is signed
+      // and must stay on the fast path instead of being rejected.
+      if (IsControlChar(*cur_) || *cur_ == '\\') {
+        // fallback to full string handling
+        return this->NextStringWithFullHandling(out, start_pos);
+      }
+    }
+    this->SetCurrentPosForBetterErrorMsg(start_pos);
+    this->SetErrorUnterminatedString();
+    return false;
+  }
+
+  /*!
+   * \brief Parse the next number.
+   *
+   * Besides regular numbers this also accepts Infinity, -Infinity and NaN.
+   * Numbers without '.', 'e' or 'E' that fit int64 are returned as int64;
+   * everything else is returned as double.
+   *
+   * \param out The output number.
+   * \return Whether the next number parsing is successful.
+   * \note The characters [0-9eE+-.] are collected greedily and validated by
+   *       strtoimax/strtod, so the accepted syntax is what those functions
+   *       accept over that character set rather than the strict JSON grammar.
+   */
+  bool NextNumber(json::Value* out) {
+    const char* start_pos = cur_;
+    if (cur_ == end_) {
+      this->SetErrorExpectingValue();
+      return false;
+    }
+    // JSON number grammar:
+    //
+    // number = [ minus ] int [ frac ] [ exp ]
+    // decimal-point = %x2E       ; .
+    // digit1-9 = %x31-39         ; 1-9
+    // e = %x65 / %x45            ; e E
+    // exp = e [ minus / plus ] 1*DIGIT
+    // frac = decimal-point 1*DIGIT
+    std::string temp_buffer;
+    bool maybe_int = true;
+    // match one of the special literals, on mismatch report the error
+    // at the position where the number started
+    auto match_special = [&](const char* literal, int len, double value) {
+      if (this->MatchLiteral(literal, len)) {
+        *out = value;
+        return true;
+      }
+      this->SetCurrentPosForBetterErrorMsg(start_pos);
+      this->SetErrorExpectingValue();
+      return false;
+    };
+    // parse [minus], cross check for Infinity/NaN/-Infinity
+    if (*cur_ == '-') {
+      temp_buffer.push_back('-');
+      ++cur_;
+      if (cur_ != end_ && *cur_ == 'I') {
+        return match_special("Infinity", 8, -std::numeric_limits<double>::infinity());
+      }
+    } else if (*cur_ == 'I') {
+      return match_special("Infinity", 8, std::numeric_limits<double>::infinity());
+    } else if (*cur_ == 'N') {
+      return match_special("NaN", 3, std::numeric_limits<double>::quiet_NaN());
+    }
+    // read in all parts that are possibly part of a number
+    while (cur_ != end_) {
+      char next_char = *cur_;
+      if ((next_char >= '0' && next_char <= '9') || next_char == 'e' || next_char == 'E' ||
+          next_char == '+' || next_char == '-' || next_char == '.') {
+        temp_buffer.push_back(next_char);
+        if (next_char == '.' || next_char == 'e' || next_char == 'E') {
+          maybe_int = false;
+        }
+        ++cur_;
+      } else {
+        break;
+      }
+    }
+    if (temp_buffer.empty()) {
+      this->SetErrorExpectingValue();
+      return false;
+    }
+    // parse from temp_buffer
+    if (maybe_int) {
+      // now try to parse the number as int64
+      char* end_ptr;
+      errno = 0;
+      intmax_t int_val = strtoimax(temp_buffer.data(), &end_ptr, 10);
+      if (errno == 0 && int_val >= std::numeric_limits<int64_t>::min() &&
+          int_val <= std::numeric_limits<int64_t>::max() &&
+          end_ptr == temp_buffer.data() + temp_buffer.size()) {
+        *out = static_cast<int64_t>(int_val);
+        return true;
+      }
+      // otherwise (e.g. out of int64 range) fall through and try double
+    }
+    {
+      // now try to parse number as double
+      char* end_ptr;
+      errno = 0;
+      double double_val = strtod(temp_buffer.data(), &end_ptr);
+      if (errno == 0 && end_ptr == temp_buffer.data() + temp_buffer.size()) {
+        *out = double_val;
+        return true;
+      } else {
+        this->SetCurrentPosForBetterErrorMsg(start_pos);
+        this->SetErrorExpectingValue();
+        return false;
+      }
+    }
+  }
+
+  /*!
+   * \brief Build an error message that carries the current position.
+   * \param err_prefix The error description, "Syntax error" is used if empty.
+   * \return The message in the form of
+   *         "<prefix>: line <line> column <column> (char <offset>)".
+   */
+  String GetSyntaxErrorContext(std::string err_prefix) const {
+    int64_t column = static_cast<int64_t>(cur_ - last_line_begin_) + 1;
+    int64_t char_pos = static_cast<int64_t>(cur_ - begin_);
+    if (err_prefix.empty()) {
+      err_prefix = "Syntax error";
+    }
+    err_prefix += ": line " + std::to_string(line_counter_) + " column " + std::to_string(column) +
+                  " (char " + std::to_string(char_pos) + ")";
+    return String(err_prefix);
+  }
+
+  /*!
+   * \brief Get the final error message.
+   * \return The recorded error message, or a default syntax error message
+   *         at the current position if no specific error was recorded.
+   */
+  std::string FinalizeErrorMsg() {
+    if (error_msg_.empty()) {
+      SetErrorDefault();
+    }
+    return std::string(error_msg_);
+  }
+
+  // The setters below record an error message at the current position.
+  // Callers reposition the cursor first when a different location
+  // should be reported.
+  void SetErrorDefault() { error_msg_ = GetSyntaxErrorContext("Syntax error near"); }
+
+  void SetErrorExpectingValue() { error_msg_ = GetSyntaxErrorContext("Expecting value"); }
+
+  void SetErrorInvalidControlCharacter() {
+    error_msg_ = GetSyntaxErrorContext("Invalid control character at");
+  }
+
+  void SetErrorUnterminatedString() {
+    error_msg_ = GetSyntaxErrorContext("Unterminated string starting at");
+  }
+
+  void SetErrorInvalidUnicodeEscape() {
+    error_msg_ = GetSyntaxErrorContext("Invalid \\uXXXX escape");
+  }
+
+  void SetErrorInvalidSurrogatePair() {
+    error_msg_ = GetSyntaxErrorContext("Invalid surrogate pair of \\uXXXX escapes");
+  }
+
+  void SetErrorInvalidEscape() { error_msg_ = GetSyntaxErrorContext("Invalid \\escape"); }
+
+  void SetErrorExtraData() { error_msg_ = GetSyntaxErrorContext("Extra data"); }
+
+  void SetErrorExpectingPropertyName() {
+    error_msg_ = GetSyntaxErrorContext("Expecting property name enclosed in double quotes");
+  }
+
+  void SetErrorExpectingColon() { error_msg_ = GetSyntaxErrorContext("Expecting \':\' delimiter"); }
+
+  void SetErrorExpectingComma() { error_msg_ = GetSyntaxErrorContext("Expecting \',\' delimiter"); }
+
+ private:
+  /*!
+   * \brief Check whether a byte is a control character (0x00 - 0x1F).
+   * \note Compares the unsigned byte value so that bytes >= 0x80
+   *       are not misclassified when char is a signed type.
+   */
+  static bool IsControlChar(char c) { return static_cast<uint8_t>(c) < 0x20; }
+
+  /*!
+   * \brief Full string parsing with escape and unicode handling.
+   * \param out The output string.
+   * \param start_pos The position of the opening double quote.
+   * \return Whether the string parsing is successful.
+   * \note Called with cur_ at the first character the fast path in
+   *       NextString could not handle.
+   */
+  bool NextStringWithFullHandling(Any* out, const char* start_pos) {
+    // copy over the prefix that was already parsed
+    std::string out_str(start_pos + 1, cur_ - start_pos - 1);
+    while (cur_ != end_) {
+      if (IsControlChar(*cur_)) {
+        this->SetErrorInvalidControlCharacter();
+        return false;
+      }
+      if (*cur_ == '\"') {
+        *out = String(std::move(out_str));
+        ++cur_;
+        return true;
+      }
+      if (*cur_ == '\\') {
+        ++cur_;
+        if (cur_ == end_) {
+          // the input ends right after the backslash: report it without
+          // reading past the end of the buffer
+          this->SetErrorInvalidEscape();
+          return false;
+        }
+        switch (*cur_) {
+          // handle escape characters per JSON spec(RFC 8259)
+#define HANDLE_ESCAPE_CHAR(pattern, val) \
+  case pattern:                          \
+    ++cur_;                              \
+    out_str.push_back(val);              \
+    break
+          HANDLE_ESCAPE_CHAR('\"', '\"');
+          HANDLE_ESCAPE_CHAR('\\', '\\');
+          HANDLE_ESCAPE_CHAR('/', '/');
+          HANDLE_ESCAPE_CHAR('b', '\b');
+          HANDLE_ESCAPE_CHAR('f', '\f');
+          HANDLE_ESCAPE_CHAR('n', '\n');
+          HANDLE_ESCAPE_CHAR('r', '\r');
+          HANDLE_ESCAPE_CHAR('t', '\t');
+#undef HANDLE_ESCAPE_CHAR
+          case 'u': {
+            const char* escape_pos = cur_;
+            // handle unicode code point
+            ++cur_;
+            int32_t first_i16 = 0;
+            int32_t code_point = 0;
+            if (!Parse4Hex(&first_i16)) {
+              this->SetCurrentPosForBetterErrorMsg(escape_pos);
+              this->SetErrorInvalidUnicodeEscape();
+              return false;
+            }
+            // Check if the first i16 is a UTF-16 surrogate pair
+            //
+            // Surrogate pair encoding rule:
+            // U' = yyyyyyyyyyxxxxxxxxxx  // U - 0x10000
+            // W1 = 110110yyyyyyyyyy      // 0xD800 + yyyyyyyyyy
+            // W2 = 110111xxxxxxxxxx      // 0xDC00 + xxxxxxxxxx
+            //
+            // Range of W1 and W2:
+            // 0xD800-0xDBFF for W1
+            // 0xDC00-0xDFFF for W2
+            // both W1 and W2 fit into 0xD800-0xDFFF
+            // Detect if the first i16 fit into range of W1/W2
+            if (first_i16 >= 0xD800 && first_i16 <= 0xDFFF) {
+              // we are in the surrogate pair range
+              if (first_i16 >= 0xDC00) {
+                // this range is reserved for W2, it cannot start a pair
+                this->SetCurrentPosForBetterErrorMsg(escape_pos);
+                this->SetErrorInvalidSurrogatePair();
+                return false;
+              }
+              if (!this->MatchLiteral("\\u", 2)) {
+                this->SetCurrentPosForBetterErrorMsg(escape_pos);
+                this->SetErrorInvalidSurrogatePair();
+                return false;
+              }
+              escape_pos = cur_;
+              // get the value of the W2 (second i16)
+              int32_t second_i16 = 0;
+              if (!Parse4Hex(&second_i16)) {
+                this->SetCurrentPosForBetterErrorMsg(escape_pos);
+                this->SetErrorInvalidUnicodeEscape();
+                return false;
+              }
+              if (!(second_i16 >= 0xDC00 && second_i16 <= 0xDFFF)) {
+                this->SetCurrentPosForBetterErrorMsg(escape_pos);
+                this->SetErrorInvalidSurrogatePair();
+                return false;
+              }
+              // recover the code point
+              code_point = ((first_i16 - 0xD800) << 10) + (second_i16 - 0xDC00) + 0x10000;
+            } else {
+              // not a surrogate case, just assign as code point
+              code_point = first_i16;
+            }
+            // now need to push back the string based on UTF-8 encoding
+            // UTF-8 encoding rule: four cases
+            // ------------------------------------------------------------
+            // Pattern                                | code point range
+            // ------------------------------------------------------------
+            // 0xxxxxxx                               | 0x0 - 0x7F
+            // 110xxxxx 10xxxxxx                      | 0x80 - 0x7FF
+            // 1110xxxx 10xxxxxx 10xxxxxx             | 0x800 - 0xFFFF
+            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx    | 0x10000 - end
+            // ------------------------------------------------------------
+            if (code_point < 0x80) {
+              out_str.push_back(static_cast<char>(code_point));
+            } else if (code_point < 0x800) {
+              // first byte: 110xxxxx (5 effective bits)
+              // second byte: 10xxxxxx (6 effective bits)
+              // shift by 6 bits to get the first byte
+              out_str.push_back(static_cast<char>(0xC0 | (code_point >> 6)));
+              // mask by 6 effective bits
+              out_str.push_back(static_cast<char>(0x80 | (code_point & 0x3F)));
+            } else if (code_point < 0x10000) {
+              // first byte: 1110xxxx (4 effective bits)
+              // second byte: 10xxxxxx (6 effective bits)
+              // third byte: 10xxxxxx (6 effective bits)
+              // shift by 12 bits to get the first byte
+              out_str.push_back(static_cast<char>(0xE0 | (code_point >> 12)));
+              // shift by 6 bits to get the second byte, mask by 6 effective bits
+              out_str.push_back(static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)));
+              // mask by 6 effective bits
+              out_str.push_back(static_cast<char>(0x80 | (code_point & 0x3F)));
+            } else {
+              // first byte: 11110xxx (3 effective bits)
+              // second byte: 10xxxxxx (6 effective bits)
+              // third byte: 10xxxxxx (6 effective bits)
+              // fourth byte: 10xxxxxx (6 effective bits)
+              // shift by 18 bits to get the first byte
+              out_str.push_back(static_cast<char>(0xF0 | (code_point >> 18)));
+              // shift by 12 bits to get the second byte, mask by 6 effective bits
+              out_str.push_back(static_cast<char>(0x80 | ((code_point >> 12) & 0x3F)));
+              // shift by 6 bits to get the third byte, mask by 6 effective bits
+              out_str.push_back(static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)));
+              // mask by 6 effective bits
+              out_str.push_back(static_cast<char>(0x80 | (code_point & 0x3F)));
+            }
+            break;
+          }
+          default: {
+            this->SetErrorInvalidEscape();
+            return false;
+          }
+        }
+      } else {
+        out_str.push_back(*cur_);
+        ++cur_;
+      }
+    }
+    this->SetCurrentPosForBetterErrorMsg(start_pos);
+    this->SetErrorUnterminatedString();
+    return false;
+  }
+
+  /*!
+   * \brief Parse the four hex digits of a unicode code point per json spec.
+   * \param out_i16 The output i16 number
+   * \return True if four hex digits are parsed successfully, false otherwise
+   *         (including when the input ends before four digits are read).
+   * \note On failure the cursor is left at the offending position;
+   *       callers reposition it before reporting the error.
+   */
+  bool Parse4Hex(int32_t* out_i16) {
+    int32_t result = 0;
+    for (int i = 0; i < 4; ++i, ++cur_) {
+      // never read past the end of the input, a truncated escape is invalid
+      if (cur_ == end_) {
+        return false;
+      }
+      int hex_val = *reinterpret_cast<const uint8_t*>(cur_);
+      if (hex_val >= '0' && hex_val <= '9') {
+        hex_val -= '0';
+      } else if (hex_val >= 'a' && hex_val <= 'f') {
+        hex_val -= 'a' - 0xa;
+      } else if (hex_val >= 'A' && hex_val <= 'F') {
+        hex_val -= 'A' - 0xa;
+      } else {
+        return false;
+      }
+      result = result * 16 + hex_val;
+    }
+    *out_i16 = result;
+    return true;
+  }
+
+  /*! \brief The beginning of the string */
+  const char* begin_;
+  /*! \brief The current pointer */
+  const char* cur_;
+  /*! \brief End of the string */
+  const char* end_;
+  /*! \brief The beginning of the last line */
+  const char* last_line_begin_;
+  /*! \brief The error message */
+  std::string error_msg_;
+  /*! \brief The line counter */
+  int64_t line_counter_{1};
+};
+
+/*!
+ * \brief Recursive descent JSON parser built on top of JSONParserContext.
+ *
+ * Nested arrays/objects are parsed through recursive calls to ParseValue.
+ */
+class JSONParser {
+ public:
+  /*!
+   * \brief Parse a JSON string into a json::Value.
+   *
+   * \param json_str The JSON string to parse.
+   * \param error_msg Optional output error message. When it is not nullptr,
+   *        it is set to the empty string on success and to the error
+   *        description on failure (and a null value is returned);
+   *        when it is nullptr a ValueError is thrown on failure.
+   * \return The parsed value.
+   */
+  static json::Value Parse(const String& json_str, String* error_msg) {
+    JSONParser parser(json_str);
+    json::Value result;
+    if (parser.ParseValue(&result) && parser.ParseTail()) {
+      if (error_msg != nullptr) {
+        *error_msg = String("");
+      }
+      return result;
+    }
+    if (error_msg != nullptr) {
+      *error_msg = parser.ctx_.FinalizeErrorMsg();
+      TVM_FFI_ICHECK(!error_msg->empty());
+    } else {
+      TVM_FFI_THROW(ValueError) << parser.ctx_.FinalizeErrorMsg();
+    }
+    // note that when we don't throw, error msg is set to indicate
+    // an error happens
+    return nullptr;
+  }
+
+ private:
+  // Take the string by reference: ctx_ keeps raw pointers into the string
+  // data, so they must refer to the caller's string (which outlives the
+  // parser in Parse) rather than to a by-value copy that is destroyed when
+  // the constructor returns. A copy would leave the pointers dangling
+  // whenever String stores its bytes inside the String object itself.
+  explicit JSONParser(const String& json_str)
+      : ctx_(json_str.data(), json_str.data() + json_str.size()) {}
+
+  /*!
+   * \brief Check that only spaces remain after the top-level value.
+   * \return False (with "Extra data" recorded) if anything else follows.
+   */
+  bool ParseTail() {
+    ctx_.SkipSpaces();
+    // there are extra data in the tail
+    if (ctx_.Peek() != -1) {
+      ctx_.SetErrorExtraData();
+      return false;
+    }
+    return true;
+  }
+
+  /*!
+   * \brief Parse the next value of any kind, skipping leading spaces.
+   * \param out The output value.
+   * \return Whether the parsing is successful.
+   */
+  bool ParseValue(json::Value* out) {
+    ctx_.SkipSpaces();
+    // record start pos for cases where we might need to reset
+    // current position for better error message
+    auto start_pos = ctx_.GetCurrentPos();
+    // dispatch on the first character, -1 means the end of the input
+    switch (ctx_.Peek()) {
+      case -1: {
+        ctx_.SetErrorExpectingValue();
+        return false;
+      }
+      case '{': {
+        return ParseObject(out);
+      }
+      case '[': {
+        return ParseArray(out);
+      }
+      case '\"': {
+        return ctx_.NextString(out);
+      }
+      case 't': {
+        ctx_.SkipNextAssumeNoSpace();
+        if (ctx_.MatchLiteral("rue", 3)) {
+          *out = true;
+          return true;
+        } else {
+          ctx_.SetCurrentPosForBetterErrorMsg(start_pos);
+          ctx_.SetErrorExpectingValue();
+          return false;
+        }
+      }
+      case 'f': {
+        ctx_.SkipNextAssumeNoSpace();
+        if (ctx_.MatchLiteral("alse", 4)) {
+          *out = false;
+          return true;
+        } else {
+          ctx_.SetCurrentPosForBetterErrorMsg(start_pos);
+          ctx_.SetErrorExpectingValue();
+          return false;
+        }
+      }
+      case 'n': {
+        ctx_.SkipNextAssumeNoSpace();
+        if (ctx_.MatchLiteral("ull", 3)) {
+          *out = nullptr;
+          return true;
+        } else {
+          ctx_.SetCurrentPosForBetterErrorMsg(start_pos);
+          ctx_.SetErrorExpectingValue();
+          return false;
+        }
+      }
+      default: {
+        // everything else must be a number (or Infinity/NaN)
+        return ctx_.NextNumber(out);
+      }
+    }
+    return false;
+  }
+
+  /*!
+   * \brief Parse an object, the current character must be '{'.
+   * \param out The output object.
+   * \return Whether the parsing is successful.
+   * \note On failure the temp stack is not restored; this is fine because
+   *       parsing stops at the first error.
+   */
+  bool ParseObject(json::Value* out) {
+    size_t stack_top = object_temp_stack_.size();
+    ctx_.SkipNextAssumeNoSpace();
+    ctx_.SkipSpaces();
+    int next_char = ctx_.Peek();
+    if (next_char == -1) {
+      ctx_.SetErrorExpectingPropertyName();
+      return false;
+    }
+    // empty object
+    if (next_char == '}') {
+      ctx_.SkipNextAssumeNoSpace();
+      *out = json::Object();
+      return true;
+    }
+    // non-empty object
+    while ((next_char = ctx_.Peek()) != -1) {
+      if (next_char != '\"') {
+        ctx_.SetErrorExpectingPropertyName();
+        return false;
+      }
+      json::Value key;
+      if (!ctx_.NextString(&key)) return false;
+      ctx_.SkipSpaces();
+      if (ctx_.Peek() != ':') {
+        ctx_.SetErrorExpectingColon();
+        return false;
+      }
+      ctx_.SkipNextAssumeNoSpace();
+      json::Value value;
+      if (!ParseValue(&value)) return false;
+      // key/value are not used afterwards, move them onto the stack
+      object_temp_stack_.emplace_back(std::move(key), std::move(value));
+      ctx_.SkipSpaces();
+      if (ctx_.Peek() == '}') {
+        ctx_.SkipNextAssumeNoSpace();
+        *out = json::Object(object_temp_stack_.begin() + stack_top, object_temp_stack_.end());
+        // recover the stack to original state
+        object_temp_stack_.resize(stack_top);
+        return true;
+      } else if (ctx_.Peek() == ',') {
+        ctx_.SkipNextAssumeNoSpace();
+        // must skip space so next iteration do not have to do so
+        ctx_.SkipSpaces();
+      } else {
+        ctx_.SetErrorExpectingComma();
+        return false;
+      }
+    }
+    // the input ended after a comma; no specific error is recorded here,
+    // so FinalizeErrorMsg falls back to the default syntax error message
+    return false;
+  }
+
+  /*!
+   * \brief Parse an array, the current character must be '['.
+   * \param out The output array.
+   * \return Whether the parsing is successful.
+   * \note On failure the temp stack is not restored; this is fine because
+   *       parsing stops at the first error.
+   */
+  bool ParseArray(json::Value* out) {
+    size_t stack_top = array_temp_stack_.size();
+    ctx_.SkipNextAssumeNoSpace();
+    ctx_.SkipSpaces();
+    int next_char = ctx_.Peek();
+    if (next_char == -1) {
+      ctx_.SetErrorExpectingValue();
+      return false;
+    }
+    // empty array
+    if (next_char == ']') {
+      ctx_.SkipNextAssumeNoSpace();
+      *out = json::Array();
+      return true;
+    }
+    // non-empty array
+    while ((next_char = ctx_.Peek()) != -1) {
+      json::Value value;
+      // no need to skip space here because we already skipped space
+      // at the beginning or in previous iteration
+      if (!ParseValue(&value)) return false;
+      array_temp_stack_.emplace_back(std::move(value));
+      ctx_.SkipSpaces();
+      next_char = ctx_.Peek();
+      if (next_char == ',') {
+        ctx_.SkipNextAssumeNoSpace();
+        // must skip space so next iteration do not have to do so
+        ctx_.SkipSpaces();
+      } else if (next_char == ']') {
+        ctx_.SkipNextAssumeNoSpace();
+        *out = json::Array(array_temp_stack_.begin() + stack_top, array_temp_stack_.end());
+        // recover the stack
+        array_temp_stack_.resize(stack_top);
+        return true;
+      } else {
+        ctx_.SetErrorExpectingComma();
+        return false;
+      }
+    }
+    // the input ended after a comma; no specific error is recorded here,
+    // so FinalizeErrorMsg falls back to the default syntax error message
+    return false;
+  }
+
+  JSONParserContext ctx_;
+  // Temp stacks for intermediate values, shared by all nesting levels:
+  // each ParseArray/ParseObject call appends its elements above the
+  // stack size it saw at entry, then creates the final array/object
+  // with the precise size and shrinks the stack back.
+  std::vector<Any> array_temp_stack_;
+  std::vector<std::pair<Any, Any>> object_temp_stack_;
+};
+
+/*!
+ * \brief Public parsing entry point, delegates to JSONParser::Parse.
+ * \param json_str The JSON string to parse.
+ * \param error_msg The output error message, can be nullptr.
+ * \return The parsed value.
+ */
+json::Value Parse(const String& json_str, String* error_msg) {
+  json::Value parsed = JSONParser::Parse(json_str, error_msg);
+  return parsed;
+}
+
+// Register the parser as the global function "ffi.json.Parse".
+// The registered lambda calls json::Parse without an error_msg argument,
+// so a parse failure is reported by throwing ValueError rather than
+// through an output message.
+TVM_FFI_STATIC_INIT_BLOCK({
+  namespace refl = tvm::ffi::reflection;
+  refl::GlobalDef().def("ffi.json.Parse",
+                        [](const String& json_str) { return 
json::Parse(json_str); });
+});
+
+}  // namespace json
+}  // namespace ffi
+}  // namespace tvm
diff --git a/ffi/tests/cpp/extra/test_json_parser.cc 
b/ffi/tests/cpp/extra/test_json_parser.cc
new file mode 100644
index 0000000000..c0332e6f8f
--- /dev/null
+++ b/ffi/tests/cpp/extra/test_json_parser.cc
@@ -0,0 +1,363 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <gtest/gtest.h>
+#include <tvm/ffi/container/array.h>
+#include <tvm/ffi/container/map.h>
+#include <tvm/ffi/extra/json.h>
+#include <tvm/ffi/extra/structural_equal.h>
+
+#include <cmath>
+
+namespace {
+
+using namespace tvm::ffi;
+
+// The three JSON literals: `true` / `false` are read back through cast<bool>() and `null`
+// must compare equal to nullptr.
+TEST(JSONParser, BoolNull) {
+  const bool parsed_true = json::Parse("true").cast<bool>();
+  EXPECT_TRUE(parsed_true);
+
+  const bool parsed_false = json::Parse("false").cast<bool>();
+  EXPECT_FALSE(parsed_false);
+
+  // null literal
+  EXPECT_EQ(json::Parse("null"), nullptr);
+}
+
+// Truncated or corrupted literals must fail to parse. Each case returns null and reports
+// "Expecting value" at the position where the bad token starts.
+TEST(JSONParser, WrongBoolNull) {
+  struct Case {
+    const char* text;     // malformed JSON input
+    const char* message;  // exact error text expected in error_msg
+  };
+  const Case cases[] = {
+      {"nul", "Expecting value: line 1 column 1 (char 0)"},
+      {"fals", "Expecting value: line 1 column 1 (char 0)"},
+      // leading newlines move the reported line number and char offset
+      {"\n\nfx", "Expecting value: line 3 column 1 (char 2)"},
+      {"fx", "Expecting value: line 1 column 1 (char 0)"},
+      {"n1", "Expecting value: line 1 column 1 (char 0)"},
+      {"t1", "Expecting value: line 1 column 1 (char 0)"},
+      {"f1", "Expecting value: line 1 column 1 (char 0)"},
+  };
+
+  // one message buffer is shared by all cases, as each parse overwrites it
+  String error_msg;
+  for (const Case& item : cases) {
+    SCOPED_TRACE(item.text);
+    EXPECT_EQ(json::Parse(item.text, &error_msg), nullptr);
+    EXPECT_EQ(error_msg, item.message);
+  }
+}
+
+// Numeric literals read back through cast<int64_t>() / cast<double>(): integers, decimals,
+// exponents, the NaN / Infinity extensions, zero variants and the int64 limits.
+TEST(JSONParser, Number) {
+  const auto as_int = [](const char* text) { return json::Parse(text).cast<int64_t>(); };
+  const auto as_double = [](const char* text) { return json::Parse(text).cast<double>(); };
+  using Int64Limits = std::numeric_limits<int64_t>;
+
+  // plain integers and decimals
+  EXPECT_EQ(as_int("123"), 123);
+  EXPECT_EQ(as_int("-124"), -124);
+  EXPECT_EQ(as_double("123.456"), 123.456);
+  // scientific notation
+  EXPECT_EQ(as_double("1.456e12"), 1.456e12);
+  // non-finite values: NaN, Infinity, -Infinity
+  EXPECT_TRUE(std::isnan(as_double("NaN")));
+  EXPECT_TRUE(std::isinf(as_double("Infinity")));
+  EXPECT_TRUE(std::isinf(-as_double("-Infinity")));
+
+  // zero variants
+  EXPECT_EQ(as_int("0"), 0);
+  EXPECT_EQ(as_double("-0"), -0.0);
+  EXPECT_EQ(as_double("0.0"), 0.0);
+
+  // both ends of the int64_t range
+  EXPECT_EQ(as_int("9223372036854775807"), Int64Limits::max());
+  EXPECT_EQ(as_int("-9223372036854775808"), Int64Limits::min());
+
+  // very small magnitudes
+  EXPECT_EQ(as_double("1e-10"), 1e-10);
+  EXPECT_EQ(as_double("-1e-10"), -1e-10);
+
+  // exponent spelling: upper/lower case marker, explicit sign
+  EXPECT_EQ(as_double("1E+10"), 1E+10);
+  EXPECT_EQ(as_double("1e+10"), 1e+10);
+  EXPECT_EQ(as_double("1E-10"), 1E-10);
+  EXPECT_EQ(as_double("123.456E+10"), 123.456E+10);
+}
+
+TEST(JSONParser, WrongNumber) {
+  String error_msg;
+  EXPECT_EQ(json::Parse("123.456.789", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)");
+
+  // Test invalid number formats
+  EXPECT_EQ(json::Parse("123e", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)");
+  EXPECT_EQ(json::Parse("123e+", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)");
+  EXPECT_EQ(json::Parse("123E-", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)");
+}
+
+// Valid string literals: plain text, surrounding whitespace, the empty string, backslash
+// escapes, and unicode escapes including a surrogate pair.
+TEST(JSONParser, String) {
+  EXPECT_EQ(json::Parse("\"hello\"").cast<String>(), "hello");
+  // whitespace around the top-level value is ignored
+  EXPECT_EQ(json::Parse("\n\t \"hello\"\n\r").cast<String>(), "hello");
+  EXPECT_EQ(json::Parse("\"hello\\nworld\"").cast<String>(), "hello\nworld");
+  EXPECT_EQ(json::Parse("\"\"").cast<String>(), "");
+  // test escape characters
+  EXPECT_EQ(json::Parse("\"\\ta\\n\\/\\f\\\"\\\\\"").cast<String>(), "\ta\n/\f\"\\");
+  // test unicode code point
+  EXPECT_EQ(json::Parse("\"\\u0041\"").cast<String>(), "A");
+  // test unicode surrogate pair
+  EXPECT_EQ(json::Parse("\"\\uD83D\\uDE04hello\"").cast<String>(), u8"\U0001F604hello");
+}
+
+TEST(JSONParser, WrongString) {
+  String error_msg;
+  EXPECT_EQ(json::Parse("\"hello", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Unterminated string starting at: line 1 column 1 (char 
0)");
+
+  EXPECT_EQ(json::Parse("\"hello\x01\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid control character at: line 1 column 7 (char 
6)");
+
+  EXPECT_EQ(json::Parse("\"hello\\uxx\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid \\uXXXX escape: line 1 column 8 (char 7)");
+
+  EXPECT_EQ(json::Parse("\"hello\\uDC00\\uDE04\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid surrogate pair of \\uXXXX escapes: line 1 
column 8 (char 7)");
+
+  EXPECT_EQ(json::Parse("\"hello\\uD800\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid surrogate pair of \\uXXXX escapes: line 1 
column 8 (char 7)");
+
+  EXPECT_EQ(json::Parse("\"hello\\uD800\\uxx\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid \\uXXXX escape: line 1 column 15 (char 14)");
+
+  EXPECT_EQ(json::Parse("\"hello\\a\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid \\escape: line 1 column 8 (char 7)");
+}
+
+// Empty and non-empty arrays, compared structurally against json::Array values.
+TEST(JSONParser, Array) {
+  EXPECT_TRUE(StructuralEqual()(json::Parse("[]"), json::Array{}));
+
+  // mixed element types with whitespace between elements
+  EXPECT_TRUE(StructuralEqual()(json::Parse("[1, 2,\n\t\"a\"]"), json::Array{1, 2, "a"}));
+}
+
+// Malformed arrays: stray closer, trailing comma, missing closer, missing delimiter, and
+// trailing data after a complete array.
+TEST(JSONParser, WrongArray) {
+  struct Case {
+    const char* text;     // malformed JSON input
+    const char* message;  // exact error text expected in error_msg
+  };
+  const Case cases[] = {
+      {"]", "Expecting value: line 1 column 1 (char 0)"},
+      {"[1,]", "Expecting value: line 1 column 4 (char 3)"},
+      {"[", "Expecting value: line 1 column 2 (char 1)"},
+      {"[1a", "Expecting ',' delimiter: line 1 column 3 (char 2)"},
+      {"[1,2,3", "Expecting ',' delimiter: line 1 column 7 (char 6)"},
+      {"[1]  a", "Extra data: line 1 column 6 (char 5)"},
+  };
+
+  String error_msg;
+  for (const Case& item : cases) {
+    SCOPED_TRACE(item.text);
+    EXPECT_EQ(json::Parse(item.text, &error_msg), nullptr);
+    EXPECT_EQ(error_msg, item.message);
+  }
+}
+
+// Empty and non-empty objects, compared structurally against json::Object values.
+TEST(JSONParser, Object) {
+  EXPECT_TRUE(StructuralEqual()(json::Parse("{}"), json::Object{}));
+
+  // irregular whitespace between tokens and after the closing brace
+  EXPECT_TRUE(StructuralEqual()(json::Parse("{\"a\":  1, \n\"b\": \t\"c\"}   "),
+                                json::Object{{"a", 1}, {"b", "c"}}));
+}
+
+// Iterating a parsed object must yield its keys in the order they appear in the text.
+TEST(JSONParser, ObjectOrderPreserving) {
+  auto parsed = json::Parse("{\"c\": 1, \"a\": 2, \"b\": 3}   ");
+  json::Object members = parsed.cast<json::Object>();
+
+  // collect the keys in iteration order
+  json::Array seen_keys;
+  for (auto& [name, member_value] : members) {
+    seen_keys.push_back(name);
+  }
+
+  const json::Array expected_keys{"c", "a", "b"};
+  EXPECT_TRUE(StructuralEqual()(seen_keys, expected_keys));
+}
+
+TEST(JSONParser, WrongObject) {
+  String error_msg;
+  EXPECT_EQ(json::Parse("{\"a\":", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 6 (char 5)");
+
+  EXPECT_EQ(json::Parse("{", &error_msg), nullptr);
+  EXPECT_EQ(error_msg,
+            "Expecting property name enclosed in double quotes: line 1 column 
2 (char 1)");
+
+  // Test incomplete structures
+  EXPECT_EQ(json::Parse("{\"incomplete\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting ':' delimiter: line 1 column 14 (char 13)");
+}
+
+// Objects and arrays nested inside each other, compared structurally with expected values.
+TEST(JSONParser, NestedObject) {
+  // object holding an object and an array
+  EXPECT_TRUE(
+      StructuralEqual()(json::Parse("{\"a\": \t{\"b\": 1}, \n\"c\": [1, 2, 3]}"),
+                        json::Object{{"a", json::Object{{"b", 1}}}, {"c", json::Array{1, 2, 3}}}));
+
+  // nested array mixing an integer, null and Infinity
+  EXPECT_TRUE(StructuralEqual()(
+      json::Parse("{\"a\": \t{\"b\": 1}, \n\"c\": [1, null, Infinity]}"),
+      json::Object{{"a", json::Object{{"b", 1}}},
+                   {"c", json::Array{1, nullptr, std::numeric_limits<double>::infinity()}}}));
+
+  // array of objects, one of them empty
+  EXPECT_TRUE(StructuralEqual()(
+      json::Parse("[{}, {\"a\": [1.1, 1000000]}]"),
+      json::Array{json::Object{}, json::Object{{"a", json::Array{1.1, 1000000}}}}));
+}
+
+// Errors inside nested containers must report the position within the whole document,
+// including the line number after embedded newlines.
+TEST(JSONParser, WrongNestedObject) {
+  String error_msg;
+  const auto expect_error = [&error_msg](const char* text, const char* message) {
+    SCOPED_TRACE(text);
+    EXPECT_EQ(json::Parse(text, &error_msg), nullptr);
+    EXPECT_EQ(error_msg, message);
+  };
+
+  // inner array is complete but the enclosing object is never closed
+  expect_error("{\"a\":\n\n[1]", "Expecting ',' delimiter: line 3 column 4 (char 10)");
+  // invalid token as the first array element
+  expect_error("{\"a\":\n\n[abc]}", "Expecting value: line 3 column 2 (char 8)");
+}
+
+// edge cases
+// Whitespace (space, tab, newline, carriage return) around and between tokens is ignored.
+TEST(JSONParser, WhitespaceHandling) {
+  // Test various whitespace characters
+  EXPECT_EQ(json::Parse(" \t\n\r true \t\n\r ").cast<bool>(), true);
+  EXPECT_EQ(json::Parse("\n\n\n123\n\n\n").cast<int64_t>(), 123);
+  // whitespace inside the string literal itself is preserved
+  EXPECT_EQ(json::Parse("   \"hello world\"   ").cast<String>(), "hello world");
+
+  // Test whitespace in arrays and objects
+  EXPECT_TRUE(StructuralEqual()(json::Parse("  [  1  ,  2  ,  3  ]  "), json::Array{1, 2, 3}));
+
+  EXPECT_TRUE(StructuralEqual()(json::Parse("  {  \"a\"  :  1  ,  \"b\"  :  2  }  "),
+                                json::Object{{"a", 1}, {"b", 2}}));
+}
+
+// Inputs that contain no value at all must fail with "Expecting value" reported at the end
+// of the input.
+TEST(JSONParser, WrongEmptyAndMinimalInputs) {
+  String error_msg;
+  const auto expect_error = [&error_msg](const char* text, const char* message) {
+    SCOPED_TRACE(text);
+    EXPECT_EQ(json::Parse(text, &error_msg), nullptr);
+    EXPECT_EQ(error_msg, message);
+  };
+
+  // empty string
+  expect_error("", "Expecting value: line 1 column 1 (char 0)");
+  // whitespace only, spanning two lines
+  expect_error("   \t\n    ", "Expecting value: line 2 column 5 (char 9)");
+}
+
+// Unicode escapes at the edges: NUL, a two-byte code point, CJK characters, and several
+// surrogate pairs in a row.
+TEST(JSONParser, UnicodeEdgeCases) {
+  // Test various unicode characters
+  // the expected value needs an explicit length because it contains an embedded NUL
+  EXPECT_EQ(json::Parse("\"\\u0000\"").cast<String>(), std::string("\0", 1));
+  // expected values use universal character names to avoid source-encoding issues
+  EXPECT_EQ(json::Parse("\"\\u00FF\"").cast<String>(), u8"\U000000FF");
+  EXPECT_EQ(json::Parse("\"\\u4E2D\\u6587\"").cast<String>(), u8"\U00004E2D\U00006587");
+
+  // Test multiple surrogate pairs
+  EXPECT_EQ(json::Parse("\"\\uD83D\\uDE00\\uD83D\\uDE01\"").cast<String>(),
+            u8"\U0001F600\U0001F601");
+}
+
+// Larger documents: a 1000-element array and a 500-member object must parse and keep
+// every element.
+TEST(JSONParser, LargeInputs) {
+  // Renders render(0), render(1), ... render(count - 1) separated by commas.
+  const auto comma_join = [](int count, auto&& render) {
+    std::string joined;
+    for (int index = 0; index < count; ++index) {
+      if (index != 0) joined += ",";
+      joined += render(index);
+    }
+    return joined;
+  };
+
+  // array of the integers 0..999
+  std::string array_text = "[";
+  array_text += comma_join(1000, [](int index) { return std::to_string(index); });
+  array_text += "]";
+
+  auto parsed = json::Parse(array_text);
+  EXPECT_TRUE(parsed != nullptr);
+  EXPECT_EQ(parsed.cast<json::Array>().size(), 1000);
+
+  // object mapping "key<i>" to i for i in 0..499
+  std::string object_text = "{";
+  object_text += comma_join(500, [](int index) {
+    return "\"key" + std::to_string(index) + "\":" + std::to_string(index);
+  });
+  object_text += "}";
+
+  parsed = json::Parse(object_text);
+  EXPECT_TRUE(parsed != nullptr);
+  EXPECT_EQ(parsed.cast<json::Object>().size(), 500);
+}
+
+// One document exercising every value kind at once (null, booleans, integers, floats,
+// strings with unicode escapes, empty and nested containers), compared structurally
+// against a hand-built expected object.
+TEST(JSONParser, MixedDataTypes) {
+  // Test complex nested structure with all data types
+  std::string complex_json = R"({
+    "null_value": null,
+    "boolean_true": true,
+    "boolean_false": false,
+    "integer": 42,
+    "negative_integer": -42,
+    "float": 3.14159,
+    "scientific": 1.23e-4,
+    "string": "hello world",
+    "unicode_string": "Hello \u4e16\u754c \ud83c\udf0d",
+    "empty_string": "",
+    "empty_array": [],
+    "empty_object": {},
+    "number_array": [1, 2, 3, 4, 5],
+    "mixed_array": [1, "two", true, null, 3.14],
+    "nested_object": {
+      "level1": {
+        "level2": {
+          "data": [1, 2, {"nested_array": [true, false]}]
+        }
+      }
+    }
+  })";
+
+  auto result = json::Parse(complex_json);
+
+  // Create expected structure for comparison
+  json::Object expected{
+      {"null_value", nullptr},
+      {"boolean_true", true},
+      {"boolean_false", false},
+      {"integer", 42},
+      {"negative_integer", -42},
+      {"float", 3.14159},
+      {"scientific", 1.23e-4},
+      {"string", "hello world"},
+      {"unicode_string", u8"Hello \U00004E16\U0000754C \U0001F30D"},
+      {"empty_string", ""},
+      {"empty_array", json::Array{}},
+      {"empty_object", json::Object{}},
+      {"number_array", json::Array{1, 2, 3, 4, 5}},
+      {"mixed_array", json::Array{1, "two", true, nullptr, 3.14}},
+      {"nested_object",
+       json::Object{
+           {"level1",
+            json::Object{
+                {"level2",
+                 json::Object{
+                     {"data",
+                      json::Array{1, 2,
+                                  json::Object{{"nested_array", json::Array{true, false}}}}}}}}}}}};
+
+  EXPECT_TRUE(StructuralEqual()(result, expected));
+}
+
+TEST(JSONParser, WrongExtraData) {
+  String error_msg;
+
+  EXPECT_EQ(json::Parse("truee", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 5 (char 4)");
+
+  EXPECT_EQ(json::Parse("true false", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 6 (char 5)");
+
+  EXPECT_EQ(json::Parse("123 456", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 5 (char 4)");
+
+  EXPECT_EQ(json::Parse("\"hello\" \"world\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 9 (char 8)");
+
+  EXPECT_EQ(json::Parse("{} []", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 4 (char 3)");
+}
+}  // namespace

(tvm) 01/02: [FFI] Lightweight json parser

Reply via email to