This is an automated email from the ASF dual-hosted git repository. tqchen pushed a commit to branch json-small-str in repository https://gitbox.apache.org/repos/asf/tvm.git
commit 9aad88dd4be78219e84b1ab29f362d76ca72a28b Author: tqchen <[email protected]> AuthorDate: Wed Jul 30 16:51:04 2025 -0400 [FFI] Lightweight json parser This PR adds a lightweight json parser to extra component --- ffi/CMakeLists.txt | 1 + ffi/include/tvm/ffi/extra/json.h | 82 ++++ ffi/src/ffi/extra/json_parser.cc | 692 ++++++++++++++++++++++++++++++++ ffi/tests/cpp/extra/test_json_parser.cc | 363 +++++++++++++++++ 4 files changed, 1138 insertions(+) diff --git a/ffi/CMakeLists.txt b/ffi/CMakeLists.txt index 76b2901c7a..9eb6ad7663 100644 --- a/ffi/CMakeLists.txt +++ b/ffi/CMakeLists.txt @@ -66,6 +66,7 @@ if (TVM_FFI_USE_EXTRA_CXX_API) list(APPEND tvm_ffi_objs_sources "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/structural_equal.cc" "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/structural_hash.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/extra/json_parser.cc" ) endif() diff --git a/ffi/include/tvm/ffi/extra/json.h b/ffi/include/tvm/ffi/extra/json.h new file mode 100644 index 0000000000..5522715d3f --- /dev/null +++ b/ffi/include/tvm/ffi/extra/json.h @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! 
+ * \file tvm/ffi/extra/json.h + * \brief Minimal lightweight JSON parsing and serialization utilities + */ +#ifndef TVM_FFI_EXTRA_JSON_H_ +#define TVM_FFI_EXTRA_JSON_H_ + +#include <tvm/ffi/any.h> +#include <tvm/ffi/container/array.h> +#include <tvm/ffi/container/map.h> +#include <tvm/ffi/extra/base.h> + +namespace tvm { +namespace ffi { +namespace json { + +/*! + * \brief alias Any as json Value. + * + * To keep things lightweight, we simply reuse the ffi::Any system. + */ +using Value = Any; + +/*! + * \brief alias Map<Any, Any> as json Object. + * \note We use Map<Any, Any> instead of Map<String, Any> to avoid + * the overhead of key checking when doing as conversion, + * the check will be performed at runtime when we read each key + */ +using Object = ffi::Map<Any, Any>; + +/*! \brief alias Array<Any> as json Array. */ +using Array = ffi::Array<Any>; + +/*! + * \brief Parse a JSON string into an Any value. + * + * Besides the standard JSON syntax, this function also supports: + * - Infinity/NaN as javascript syntax + * - int64 integer value + * + * If error_msg is not nullptr, the error message will be written to it + * and no exception will be thrown when parsing fails. + * + * \param json_str The JSON string to parse. + * \param error_msg The output error message, can be nullptr. + * + * \return The parsed Any value. + * \note This function is optional and will be removed in the future. + */ +TVM_FFI_EXTRA_CXX_API json::Value Parse(const String& json_str, String* error_msg = nullptr); + +/*! + * \brief Serialize an Any value into a JSON string. + * + * \param value The Any value to serialize. + * \return The output JSON string. 
+ */ +TVM_FFI_EXTRA_CXX_API String Stringify(const json::Value& value, int indent = -1); + +} // namespace json +} // namespace ffi +} // namespace tvm +#endif // TVM_FFI_EXTRA_JSON_H_ diff --git a/ffi/src/ffi/extra/json_parser.cc b/ffi/src/ffi/extra/json_parser.cc new file mode 100644 index 0000000000..bf503b235f --- /dev/null +++ b/ffi/src/ffi/extra/json_parser.cc @@ -0,0 +1,692 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * \file src/ffi/extra/json_parser.cc + * + * \brief A minimalistic JSON parser implementation. + */ +#include <tvm/ffi/any.h> +#include <tvm/ffi/container/array.h> +#include <tvm/ffi/container/map.h> +#include <tvm/ffi/error.h> +#include <tvm/ffi/extra/json.h> +#include <tvm/ffi/reflection/registry.h> +#include <tvm/ffi/string.h> + +#include <cinttypes> +#include <limits> + +namespace tvm { +namespace ffi { +namespace json { + +/*! + * \brief Helper class to parse a JSON string. + * + * Leaf-level string/number parsing is also kept in this context. + */ +class JSONParserContext { + public: + JSONParserContext(const char* begin, const char* end) : begin_(begin), cur_(begin), end_(end) { + last_line_begin_ = cur_; + } + + /*! + * \brief Peek the current character. 
+ * \return The current character, or -1 if the end of the string is reached. + */ + int Peek() const { + return (cur_ != end_ ? static_cast<int>(*reinterpret_cast<const uint8_t*>(cur_)) : -1); + } + + /*! + * \brief Skip the next char that we know is not a space + * + * \note Caller must explicitly call SkipSpaces first or have + * already used Peek to confirm that the char is not a space char. + */ + void SkipNextAssumeNoSpace() { ++cur_; } + + /*! + * \brief Get the current position. + * \return The current position. + */ + const char* GetCurrentPos() const { return cur_; } + + /*! + * \brief Set the current position for better error message + * \param pos The new position. + * \note implementation can do it as no-op if needed + */ + void SetCurrentPosForBetterErrorMsg(const char* pos) { cur_ = pos; } + + /*! + * \brief Skip the space characters. + * \note Stops at the end of the string; reaching the end is not reported as an error. + */ + void SkipSpaces() { + while (cur_ != end_) { + if (!(*cur_ == ' ' || *cur_ == '\t' || *cur_ == '\n' || *cur_ == '\r')) { + break; + } + if (*cur_ == '\n') { + ++line_counter_; + last_line_begin_ = cur_ + 1; + } + ++cur_; + } + } + + /*! + * \brief Check if the next characters match the given string. + * \param pattern The string to match. + * \param len The length of the string. + * \return True if the next characters match the given string, false otherwise. + */ + bool MatchLiteral(const char* pattern, int len) { + const char* pend = pattern + len; + const char* ptr = pattern; + for (; ptr != pend && cur_ != end_; ++ptr, ++cur_) { + if (*ptr != *cur_) { + return false; + } + } + // we get to the end of the pattern and match is successful + return ptr == pend; + } + + /* + * \brief Parse the next string starting with a double quote. + * \param out The output string. + * \return Whether the next string parsing is successful. 
+ */ + bool NextString(json::Value* out) { + // NOTE: we keep string parsing logic here to allow some special + // optimizations for simple strings that do not contain escape characters + const char* start_pos = cur_; + TVM_FFI_ICHECK(*cur_ == '\"'); + // skip first double quote + ++cur_; + // the loop focuses on simple string without escape characters + for (; cur_ != end_; ++cur_) { + if (*cur_ == '\"') { + *out = String(start_pos + 1, cur_ - start_pos - 1); + ++cur_; + return true; + } + if (*cur_ < ' ' || *cur_ == '\\') { + // fallback to full string handling + return this->NextStringWithFullHandling(out, start_pos); + } + } + this->SetCurrentPosForBetterErrorMsg(start_pos); + this->SetErrorUnterminatedString(); + return false; + } + + /*! + * \brief Parse the next number. + * \param out The output number. + * \return Whether the next number parsing is successful. + */ + bool NextNumber(json::Value* out) { + const char* start_pos = cur_; + if (cur_ == end_) { + this->SetErrorExpectingValue(); + return false; + } + // JSON number grammar: + // + // number = [ minus ] int [ frac ] [ exp ] + // decimal-point = %x2E ; . 
+ // digit1-9 = %x31-39 ; 1-9 + // e = %x65 / %x45 ; e E + // exp = e [ minus / plus ] 1*DIGIT + // frac = decimal-point 1*DIGIT + std::string temp_buffer; + bool maybe_int = true; + // parse [minus], cross check for Infinity/NaN/-Infinity + if (*cur_ == '-') { + temp_buffer.push_back('-'); + ++cur_; + if (cur_ != end_ && *cur_ == 'I') { + if (this->MatchLiteral("Infinity", 8)) { + *out = -std::numeric_limits<double>::infinity(); + return true; + } else { + this->SetCurrentPosForBetterErrorMsg(start_pos); + this->SetErrorExpectingValue(); + return false; + } + } + } else if (*cur_ == 'I') { + if (this->MatchLiteral("Infinity", 8)) { + *out = std::numeric_limits<double>::infinity(); + return true; + } else { + this->SetCurrentPosForBetterErrorMsg(start_pos); + this->SetErrorExpectingValue(); + return false; + } + } else if (*cur_ == 'N') { + if (this->MatchLiteral("NaN", 3)) { + *out = std::numeric_limits<double>::quiet_NaN(); + return true; + } else { + this->SetCurrentPosForBetterErrorMsg(start_pos); + this->SetErrorExpectingValue(); + return false; + } + } + // read in all parts that are possibly part of a number + while (cur_ != end_) { + char next_char = *cur_; + if ((next_char >= '0' && next_char <= '9') || next_char == 'e' || next_char == 'E' || + next_char == '+' || next_char == '-' || next_char == '.') { + temp_buffer.push_back(next_char); + if (next_char == '.' 
|| next_char == 'e' || next_char == 'E') { + maybe_int = false; + } + ++cur_; + } else { + break; + } + } + if (temp_buffer.empty()) { + this->SetErrorExpectingValue(); + return false; + } + // parse from temp_buffer_ + if (maybe_int) { + // now try to parse the number as int64 + char* end_ptr; + errno = 0; + intmax_t int_val = strtoimax(temp_buffer.data(), &end_ptr, 10); + if (errno == 0 && int_val >= std::numeric_limits<int64_t>::min() && + int_val <= std::numeric_limits<int64_t>::max() && + end_ptr == temp_buffer.data() + temp_buffer.size()) { + *out = static_cast<int64_t>(int_val); + return true; + } + } + { + // now try to parse number as double + char* end_ptr; + errno = 0; + double double_val = strtod(temp_buffer.data(), &end_ptr); + if (errno == 0 && end_ptr == temp_buffer.data() + temp_buffer.size()) { + *out = double_val; + return true; + } else { + this->SetCurrentPosForBetterErrorMsg(start_pos); + this->SetErrorExpectingValue(); + return false; + } + } + } + + /*! + * \brief Get the current line context. + * \return The current line context. 
+ */ + String GetSyntaxErrorContext(std::string err_prefix) const { + int64_t column = static_cast<int64_t>(cur_ - last_line_begin_) + 1; + int64_t char_pos = static_cast<int64_t>(cur_ - begin_); + if (err_prefix.empty()) { + err_prefix = "Syntax error"; + } + err_prefix += ": line " + std::to_string(line_counter_) + " column " + std::to_string(column) + + " (char " + std::to_string(char_pos) + ")"; + return String(err_prefix); + } + + std::string FinalizeErrorMsg() { + if (error_msg_.empty()) { + SetErrorDefault(); + } + return std::string(error_msg_); + } + + void SetErrorDefault() { error_msg_ = GetSyntaxErrorContext("Syntax error near"); } + + void SetErrorExpectingValue() { error_msg_ = GetSyntaxErrorContext("Expecting value"); } + + void SetErrorInvalidControlCharacter() { + error_msg_ = GetSyntaxErrorContext("Invalid control character at"); + } + + void SetErrorUnterminatedString() { + error_msg_ = GetSyntaxErrorContext("Unterminated string starting at"); + } + + void SetErrorInvalidUnicodeEscape() { + error_msg_ = GetSyntaxErrorContext("Invalid \\uXXXX escape"); + } + + void SetErrorInvalidSurrogatePair() { + error_msg_ = GetSyntaxErrorContext("Invalid surrogate pair of \\uXXXX escapes"); + } + + void SetErrorInvalidEscape() { error_msg_ = GetSyntaxErrorContext("Invalid \\escape"); } + + void SetErrorExtraData() { error_msg_ = GetSyntaxErrorContext("Extra data"); } + + void SetErrorExpectingPropertyName() { + error_msg_ = GetSyntaxErrorContext("Expecting property name enclosed in double quotes"); + } + + void SetErrorExpectingColon() { error_msg_ = GetSyntaxErrorContext("Expecting \':\' delimiter"); } + + void SetErrorExpectingComma() { error_msg_ = GetSyntaxErrorContext("Expecting \',\' delimiter"); } + + private: + // Full string parsing with escape and unicode handling + bool NextStringWithFullHandling(Any* out, const char* start_pos) { + // copy over the prefix that was already parsed + std::string out_str(start_pos + 1, cur_ - start_pos - 1); + while 
(cur_ != end_) { + if (*cur_ < ' ') { + this->SetErrorInvalidControlCharacter(); + return false; + } + if (*cur_ == '\"') { + *out = String(std::move(out_str)); + ++cur_; + return true; + } + if (*cur_ == '\\') { + ++cur_; + switch (*cur_) { + // handle escape characters per JSON spec(RFC 8259) +#define HANDLE_ESCAPE_CHAR(pattern, val) \ + case pattern: \ + ++cur_; \ + out_str.push_back(val); \ + break + HANDLE_ESCAPE_CHAR('\"', '\"'); + HANDLE_ESCAPE_CHAR('\\', '\\'); + HANDLE_ESCAPE_CHAR('/', '/'); + HANDLE_ESCAPE_CHAR('b', '\b'); + HANDLE_ESCAPE_CHAR('f', '\f'); + HANDLE_ESCAPE_CHAR('n', '\n'); + HANDLE_ESCAPE_CHAR('r', '\r'); + HANDLE_ESCAPE_CHAR('t', '\t'); +#undef HANDLE_ESCAPE_CHAR + case 'u': { + const char* escape_pos = cur_; + // handle unicode code point + ++cur_; + int32_t first_i16, code_point = 0; + if (!Parse4Hex(&first_i16)) { + this->SetCurrentPosForBetterErrorMsg(escape_pos); + this->SetErrorInvalidUnicodeEscape(); + return false; + } + // Check if the first i16 is a UTF-16 surrogate pair + // + // Surrogate pair encoding rule: + // U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000 + // W1 = 110110yyyyyyyyyy // 0xD800 + yyyyyyyyyy + // W2 = 110111xxxxxxxxxx // 0xDC00 + xxxxxxxxxx + // + // Range of W1 and W2: + // 0xD800–0xDBFF for W1 + // 0xDC00–0xDFFF for W2 + // both W1 and W2 fit into 0xD800–0xDFFF + // Detect if the first i16 fit into range of W1/W2 + if (first_i16 >= 0xD800 && first_i16 <= 0xDFFF) { + // we are in the surrogate pair range + if (first_i16 >= 0xDC00) { + this->SetCurrentPosForBetterErrorMsg(escape_pos); + this->SetErrorInvalidSurrogatePair(); + // we need to return false instead because this range is for W2 + return false; + } + if (!this->MatchLiteral("\\u", 2)) { + this->SetCurrentPosForBetterErrorMsg(escape_pos); + this->SetErrorInvalidSurrogatePair(); + return false; + } + escape_pos = cur_; + // get the value of the W2 (second i16) + int32_t second_i16; + if (!Parse4Hex(&second_i16)) { + 
this->SetCurrentPosForBetterErrorMsg(escape_pos); + this->SetErrorInvalidUnicodeEscape(); + return false; + } + if (!(second_i16 >= 0xDC00 && second_i16 <= 0xDFFF)) { + this->SetCurrentPosForBetterErrorMsg(escape_pos); + this->SetErrorInvalidSurrogatePair(); + return false; + } + // recover the code point + code_point = ((first_i16 - 0xD800) << 10) + (second_i16 - 0xDC00) + 0x10000; + } else { + // not a surrogate case, just assign as code point + code_point = first_i16; + } + // now need to push back the string based on UTF-8 encoding + // UTF-8 encoding rule: four cases + // ------------------------------------------------------------ + // Pattern | code point range + // ------------------------------------------------------------ + // 0xxxxxxx | 0x0 - 0x7F + // 110xxxxx 10xxxxxx | 0x80 - 0x7FF + // 1110xxxx 10xxxxxx 10xxxxxx | 0x800 - 0xFFFF + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 0x10000 - end + // ------------------------------------------------------------ + if (code_point < 0x80) { + out_str.push_back(code_point); + } else if (code_point < 0x800) { + // first byte: 110xxxxx (5 effective bits) + // second byte: 10xxxxxx (6 effective bits) + // shift by 6 bits to get the first bytes + out_str.push_back(0xC0 | (code_point >> 6)); + // mask by 6 effective bits + out_str.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point < 0x10000) { + // first byte: 1110xxxx (4 effective bits) + // second byte: 10xxxxxx (6 effective bits) + // third byte: 10xxxxxx (6 effective bits) + // shift by 12 bits to get the first bytes + out_str.push_back(0xE0 | (code_point >> 12)); + // shift by 6 bits to get the second bytes, mask by 6 effective bits + out_str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + // mask by 6 effective bits + out_str.push_back(0x80 | (code_point & 0x3F)); + } else { + // first byte: 11110xxx (3 effective bits) + // second byte: 10xxxxxx (6 effective bits) + // third byte: 10xxxxxx (6 effective bits) + // fourth byte: 10xxxxxx (6 effective bits) + // 
shift by 18 bits to get the first bytes + out_str.push_back(0xF0 | (code_point >> 18)); + // shift by 12 bits to get the second bytes, mask by 6 effective bits + out_str.push_back(0x80 | ((code_point >> 12) & 0x3F)); + // shift by 6 bits to get the third bytes, mask by 6 effective bits + out_str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + // mask by 6 effective bits + out_str.push_back(0x80 | (code_point & 0x3F)); + } + break; + } + default: { + this->SetErrorInvalidEscape(); + return false; + } + } + } else { + out_str.push_back(*cur_); + ++cur_; + } + } + this->SetCurrentPosForBetterErrorMsg(start_pos); + this->SetErrorUnterminatedString(); + return false; + } + /*! + * \brief Parse the four hex digits of a unicode code point per json spec. + * \param out_i16 The output i16 number + * \return True if four hex digits are parsed successfully, false otherwise. + */ + bool Parse4Hex(int32_t* out_i16) { + int32_t result = 0; + for (int i = 0; i < 4; ++i, ++cur_) { + int hex_val = *reinterpret_cast<const uint8_t*>(cur_); + if (hex_val >= '0' && hex_val <= '9') { + hex_val -= '0'; + } else if (hex_val >= 'a' && hex_val <= 'f') { + hex_val -= 'a' - 0xa; + } else if (hex_val >= 'A' && hex_val <= 'F') { + hex_val -= 'A' - 0xa; + } else { + return false; + } + result = result * 16 + hex_val; + } + *out_i16 = result; + return true; + } + + /*! \brief The beginning of the string */ + const char* begin_; + /*! \brief The current pointer */ + const char* cur_; + /*! \brief End of the string */ + const char* end_; + /*! \brief The beginning of the last line */ + const char* last_line_begin_; + /*! \brief The error message */ + std::string error_msg_; + /*! 
\brief The line counter */ + int64_t line_counter_{1}; +}; + +class JSONParser { + public: + static json::Value Parse(const String& json_str, String* error_msg) { + JSONParser parser(json_str); + json::Value result; + if (parser.ParseValue(&result) && parser.ParseTail()) { + if (error_msg != nullptr) { + *error_msg = String(""); + } + return result; + } + if (error_msg != nullptr) { + *error_msg = parser.ctx_.FinalizeErrorMsg(); + TVM_FFI_ICHECK(!error_msg->empty()); + } else { + TVM_FFI_THROW(ValueError) << parser.ctx_.FinalizeErrorMsg(); + } + // note that when we don't throw, error msg is set to indicate + // an error happens + return nullptr; + } + + private: + explicit JSONParser(String json_str) : ctx_(json_str.data(), json_str.data() + json_str.size()) {} + + bool ParseTail() { + ctx_.SkipSpaces(); + // there are extra data in the tail + if (ctx_.Peek() != -1) { + ctx_.SetErrorExtraData(); + return false; + } + return true; + } + + bool ParseValue(json::Value* out) { + ctx_.SkipSpaces(); + // record start pos for cases where we might need to reset + // current position for better error message + auto start_pos = ctx_.GetCurrentPos(); + // check if the end of the string is reached + switch (ctx_.Peek()) { + case -1: { + ctx_.SetErrorExpectingValue(); + return false; + } + case '{': { + return ParseObject(out); + } + case '[': { + return ParseArray(out); + } + case '\"': { + return ctx_.NextString(out); + } + case 't': { + ctx_.SkipNextAssumeNoSpace(); + if (ctx_.MatchLiteral("rue", 3)) { + *out = true; + return true; + } else { + ctx_.SetCurrentPosForBetterErrorMsg(start_pos); + ctx_.SetErrorExpectingValue(); + return false; + } + } + case 'f': { + ctx_.SkipNextAssumeNoSpace(); + if (ctx_.MatchLiteral("alse", 4)) { + *out = false; + return true; + } else { + ctx_.SetCurrentPosForBetterErrorMsg(start_pos); + ctx_.SetErrorExpectingValue(); + return false; + } + } + case 'n': { + ctx_.SkipNextAssumeNoSpace(); + if (ctx_.MatchLiteral("ull", 3)) { + *out = 
nullptr; + return true; + } else { + ctx_.SetCurrentPosForBetterErrorMsg(start_pos); + ctx_.SetErrorExpectingValue(); + return false; + } + } + default: { + return ctx_.NextNumber(out); + } + } + return false; + } + + bool ParseObject(json::Value* out) { + size_t stack_top = object_temp_stack_.size(); + json::Object result; + ctx_.SkipNextAssumeNoSpace(); + ctx_.SkipSpaces(); + int next_char = ctx_.Peek(); + if (next_char == -1) { + ctx_.SetErrorExpectingPropertyName(); + return false; + } + // empty object + if (next_char == '}') { + ctx_.SkipNextAssumeNoSpace(); + *out = json::Object(); + return true; + } + // non-empty object + while ((next_char = ctx_.Peek()) != -1) { + if (next_char != '\"') { + ctx_.SetErrorExpectingPropertyName(); + return false; + } + json::Value key; + if (!ctx_.NextString(&key)) return false; + ctx_.SkipSpaces(); + if (ctx_.Peek() != ':') { + ctx_.SetErrorExpectingColon(); + return false; + } + ctx_.SkipNextAssumeNoSpace(); + json::Value value; + if (!ParseValue(&value)) return false; + object_temp_stack_.emplace_back(key, value); + // result.Set(key, value); + ctx_.SkipSpaces(); + if (ctx_.Peek() == '}') { + ctx_.SkipNextAssumeNoSpace(); + *out = json::Object(object_temp_stack_.begin() + stack_top, object_temp_stack_.end()); + // recover the stack to original state + object_temp_stack_.resize(stack_top); + return true; + } else if (ctx_.Peek() == ',') { + ctx_.SkipNextAssumeNoSpace(); + // must skip space so next iteration do not have to do so + ctx_.SkipSpaces(); + } else { + ctx_.SetErrorExpectingComma(); + return false; + } + } + return false; + } + + bool ParseArray(json::Value* out) { + size_t stack_top = array_temp_stack_.size(); + ctx_.SkipNextAssumeNoSpace(); + ctx_.SkipSpaces(); + int next_char = ctx_.Peek(); + if (next_char == -1) { + ctx_.SetErrorExpectingValue(); + return false; + } + // empty array + if (next_char == ']') { + ctx_.SkipNextAssumeNoSpace(); + *out = json::Array(); + return true; + } + // non-empty array + 
while ((next_char = ctx_.Peek()) != -1) { + json::Value value; + // no need to skip space here because we already skipped space + // at the beginning or in previous iteration + if (!ParseValue(&value)) return false; + array_temp_stack_.emplace_back(std::move(value)); + ctx_.SkipSpaces(); + next_char = ctx_.Peek(); + if (next_char == ',') { + ctx_.SkipNextAssumeNoSpace(); + // must skip space so next iteration do not have to do so + ctx_.SkipSpaces(); + } else if (next_char == ']') { + ctx_.SkipNextAssumeNoSpace(); + *out = json::Array(array_temp_stack_.begin() + stack_top, array_temp_stack_.end()); + // recover the stack + array_temp_stack_.resize(stack_top); + return true; + } else { + ctx_.SetErrorExpectingComma(); + return false; + } + } + return false; + } + + JSONParserContext ctx_; + // Temp stack for intermediate values + // we first create a persistent stack to store the parsed values + // then create the final array/object object with the precise size + std::vector<Any> array_temp_stack_; + std::vector<std::pair<Any, Any>> object_temp_stack_; +}; + +json::Value Parse(const String& json_str, String* error_msg) { + return JSONParser::Parse(json_str, error_msg); +} + +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("ffi.json.Parse", + [](const String& json_str) { return json::Parse(json_str); }); +}); + +} // namespace json +} // namespace ffi +} // namespace tvm diff --git a/ffi/tests/cpp/extra/test_json_parser.cc b/ffi/tests/cpp/extra/test_json_parser.cc new file mode 100644 index 0000000000..c0332e6f8f --- /dev/null +++ b/ffi/tests/cpp/extra/test_json_parser.cc @@ -0,0 +1,363 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include <gtest/gtest.h> +#include <tvm/ffi/container/array.h> +#include <tvm/ffi/container/map.h> +#include <tvm/ffi/extra/json.h> +#include <tvm/ffi/extra/structural_equal.h> + +#include <cmath> + +namespace { + +using namespace tvm::ffi; + +TEST(JSONParser, BoolNull) { + // boolean value + EXPECT_EQ(json::Parse("true").cast<bool>(), true); + EXPECT_EQ(json::Parse("false").cast<bool>(), false); + EXPECT_EQ(json::Parse("null"), nullptr); +} + +TEST(JSONParser, WrongBoolNull) { + String error_msg; + EXPECT_EQ(json::Parse("nul", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); + EXPECT_EQ(json::Parse("fals", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); + EXPECT_EQ(json::Parse("\n\nfx", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 3 column 1 (char 2)"); + EXPECT_EQ(json::Parse("fx", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); + EXPECT_EQ(json::Parse("n1", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); + EXPECT_EQ(json::Parse("t1", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); + EXPECT_EQ(json::Parse("f1", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); +} + +TEST(JSONParser, Number) { + // 
number + EXPECT_EQ(json::Parse("123").cast<int64_t>(), 123); + EXPECT_EQ(json::Parse("-124").cast<int64_t>(), -124); + EXPECT_EQ(json::Parse("123.456").cast<double>(), 123.456); + // parsing scientific notation + EXPECT_EQ(json::Parse("1.456e12").cast<double>(), 1.456e12); + // NaN + EXPECT_EQ(std::isnan(json::Parse("NaN").cast<double>()), true); + // Infinity + EXPECT_EQ(std::isinf(json::Parse("Infinity").cast<double>()), true); + // -Infinity + EXPECT_EQ(std::isinf(-json::Parse("-Infinity").cast<double>()), true); + + // Test zero variants + EXPECT_EQ(json::Parse("0").cast<int64_t>(), 0); + EXPECT_EQ(json::Parse("-0").cast<double>(), -0.0); + EXPECT_EQ(json::Parse("0.0").cast<double>(), 0.0); + + // Test very large numbers + EXPECT_EQ(json::Parse("9223372036854775807").cast<int64_t>(), + std::numeric_limits<int64_t>::max()); + EXPECT_EQ(json::Parse("-9223372036854775808").cast<int64_t>(), + std::numeric_limits<int64_t>::min()); + + // Test very small decimals + EXPECT_EQ(json::Parse("1e-10").cast<double>(), 1e-10); + EXPECT_EQ(json::Parse("-1e-10").cast<double>(), -1e-10); + + // Test scientific notation edge cases + EXPECT_EQ(json::Parse("1E+10").cast<double>(), 1E+10); + EXPECT_EQ(json::Parse("1e+10").cast<double>(), 1e+10); + EXPECT_EQ(json::Parse("1E-10").cast<double>(), 1E-10); + EXPECT_EQ(json::Parse("123.456E+10").cast<double>(), 123.456E+10); +} + +TEST(JSONParser, WrongNumber) { + String error_msg; + EXPECT_EQ(json::Parse("123.456.789", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); + + // Test invalid number formats + EXPECT_EQ(json::Parse("123e", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); + EXPECT_EQ(json::Parse("123e+", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); + EXPECT_EQ(json::Parse("123E-", &error_msg), nullptr); + EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)"); +} + +TEST(JSONParser, 
String) {
+  EXPECT_EQ(json::Parse("\"hello\"").cast<String>(), "hello");
+  EXPECT_EQ(json::Parse("\n\t \"hello\"\n\r").cast<String>(), "hello");
+  EXPECT_EQ(json::Parse("\"hello\\nworld\"").cast<String>(), "hello\nworld");
+  EXPECT_EQ(json::Parse("\"\"").cast<String>(), "");
+  // test escape characters
+  EXPECT_EQ(json::Parse("\"\\ta\\n\\/\\f\\\"\\\\\"").cast<String>(), "\ta\n/\f\"\\");
+  // test unicode code point
+  EXPECT_EQ(json::Parse("\"\\u0041\"").cast<String>(), "A");
+  // test unicode surrogate pair
+  EXPECT_EQ(json::Parse("\"\\uD83D\\uDE04hello\"").cast<String>(), u8"\U0001F604hello");
+}
+
+// Malformed strings: Parse must return nullptr and fill error_msg with the
+// message and the "line L column C (char N)" position asserted below.
+TEST(JSONParser, WrongString) {
+  String error_msg;
+  EXPECT_EQ(json::Parse("\"hello", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Unterminated string starting at: line 1 column 1 (char 0)");
+
+  EXPECT_EQ(json::Parse("\"hello\x01\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid control character at: line 1 column 7 (char 6)");
+
+  EXPECT_EQ(json::Parse("\"hello\\uxx\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid \\uXXXX escape: line 1 column 8 (char 7)");
+
+  EXPECT_EQ(json::Parse("\"hello\\uDC00\\uDE04\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid surrogate pair of \\uXXXX escapes: line 1 column 8 (char 7)");
+
+  EXPECT_EQ(json::Parse("\"hello\\uD800\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid surrogate pair of \\uXXXX escapes: line 1 column 8 (char 7)");
+
+  EXPECT_EQ(json::Parse("\"hello\\uD800\\uxx\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid \\uXXXX escape: line 1 column 15 (char 14)");
+
+  EXPECT_EQ(json::Parse("\"hello\\a\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Invalid \\escape: line 1 column 8 (char 7)");
+}
+
+// Well-formed arrays: empty, and mixed element types separated by whitespace.
+TEST(JSONParser, Array) {
+  EXPECT_TRUE(StructuralEqual()(json::Parse("[]"), json::Array{}));
+
+  EXPECT_TRUE(StructuralEqual()(json::Parse("[1, 2,\n\t\"a\"]"), json::Array{1, 2, "a"}));
+}
+
+// Malformed arrays: stray bracket, trailing comma, unterminated input, and
+// trailing garbage after a complete array.
+TEST(JSONParser, WrongArray) {
+  String error_msg;
+
+  EXPECT_EQ(json::Parse("]", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)");
+
+  EXPECT_EQ(json::Parse("[1,]", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 4 (char 3)");
+
+  EXPECT_EQ(json::Parse("[", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 2 (char 1)");
+
+  EXPECT_EQ(json::Parse("[1a", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting ',' delimiter: line 1 column 3 (char 2)");
+
+  EXPECT_EQ(json::Parse("[1,2,3", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting ',' delimiter: line 1 column 7 (char 6)");
+
+  // Two spaces before `a`, so the stray token sits at char 5 as asserted.
+  EXPECT_EQ(json::Parse("[1]  a", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 6 (char 5)");
+}
+
+// Well-formed objects: empty, and string keys with mixed whitespace between tokens.
+TEST(JSONParser, Object) {
+  EXPECT_TRUE(StructuralEqual()(json::Parse("{}"), json::Object{}));
+
+  EXPECT_TRUE(StructuralEqual()(json::Parse("{\"a\": 1, \n\"b\": \t\"c\"} "),
+                                json::Object{{"a", 1}, {"b", "c"}}));
+}
+
+// Iterating a parsed object must yield keys in the order they appear in the input.
+TEST(JSONParser, ObjectOrderPreserving) {
+  auto obj = json::Parse("{\"c\": 1, \"a\": 2, \"b\": 3} ");
+  json::Array keys;
+  for (auto& [key, value] : obj.cast<json::Object>()) {
+    keys.push_back(key);
+  }
+  EXPECT_TRUE(StructuralEqual()(keys, json::Array{"c", "a", "b"}));
+}
+
+// Malformed objects: missing value, missing key, and missing ':' delimiter.
+TEST(JSONParser, WrongObject) {
+  String error_msg;
+  EXPECT_EQ(json::Parse("{\"a\":", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 6 (char 5)");
+
+  EXPECT_EQ(json::Parse("{", &error_msg), nullptr);
+  EXPECT_EQ(error_msg,
+            "Expecting property name enclosed in double quotes: line 1 column 2 (char 1)");
+
+  // Test incomplete structures
+  EXPECT_EQ(json::Parse("{\"incomplete\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting ':' delimiter: line 1 column 14 (char 13)");
+}
+
+// Objects and arrays nested in each other, including `null` and the `Infinity` literal.
+TEST(JSONParser, NestedObject) {
+  EXPECT_TRUE(
+      StructuralEqual()(json::Parse("{\"a\": \t{\"b\": 1}, \n\"c\": [1, 2, 3]}"),
+                        json::Object{{"a", json::Object{{"b", 1}}}, {"c", json::Array{1, 2, 3}}}));
+
+  EXPECT_TRUE(StructuralEqual()(
+      json::Parse("{\"a\": \t{\"b\": 1}, \n\"c\": [1, null, Infinity]}"),
+      json::Object{{"a", json::Object{{"b", 1}}},
+                   {"c", json::Array{1, nullptr, std::numeric_limits<double>::infinity()}}}));
+
+  EXPECT_TRUE(StructuralEqual()(
+      json::Parse("[{}, {\"a\": [1.1, 1000000]}]"),
+      json::Array{json::Object{}, json::Object{{"a", json::Array{1.1, 1000000}}}}));
+}
+
+// Errors inside nested containers must report line/column across embedded newlines.
+TEST(JSONParser, WrongNestedObject) {
+  String error_msg;
+  EXPECT_EQ(json::Parse("{\"a\":\n\n[1]", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting ',' delimiter: line 3 column 4 (char 10)");
+
+  EXPECT_EQ(json::Parse("{\"a\":\n\n[abc]}", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 3 column 2 (char 8)");
+}
+
+// edge cases
+// Leading, trailing and inter-token whitespace must not affect the parsed value.
+TEST(JSONParser, WhitespaceHandling) {
+  // Test various whitespace characters
+  EXPECT_EQ(json::Parse(" \t\n\r true \t\n\r ").cast<bool>(), true);
+  EXPECT_EQ(json::Parse("\n\n\n123\n\n\n").cast<int64_t>(), 123);
+  EXPECT_EQ(json::Parse(" \"hello world\" ").cast<String>(), "hello world");
+
+  // Test whitespace in arrays and objects
+  EXPECT_TRUE(StructuralEqual()(json::Parse(" [ 1 , 2 , 3 ] "), json::Array{1, 2, 3}));
+
+  EXPECT_TRUE(StructuralEqual()(json::Parse(" { \"a\" : 1 , \"b\" : 2 } "),
+                                json::Object{{"a", 1}, {"b", 2}}));
+}
+
+// Inputs that contain no value at all must fail with "Expecting value" at end of input.
+TEST(JSONParser, WrongEmptyAndMinimalInputs) {
+  String error_msg;
+  // Test empty string
+  EXPECT_EQ(json::Parse("", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 1 column 1 (char 0)");
+
+  // Test only whitespace
+  // 3 spaces + tab + newline + 4 spaces: end of input is char 9, i.e. line 2 column 5.
+  EXPECT_EQ(json::Parse("   \t\n    ", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Expecting value: line 2 column 5 (char 9)");
+}
+
+// \uXXXX escapes: NUL, Latin-1, CJK, and consecutive surrogate pairs.
+TEST(JSONParser, UnicodeEdgeCases) {
+  // Test various unicode characters
+  EXPECT_EQ(json::Parse("\"\\u0000\"").cast<String>(), std::string("\0", 1));
+  // replace using \U to avoid encoding issues
+  EXPECT_EQ(json::Parse("\"\\u00FF\"").cast<String>(), u8"\U000000FF");
+  EXPECT_EQ(json::Parse("\"\\u4E2D\\u6587\"").cast<String>(), u8"\U00004E2D\U00006587");
+
+  // Test multiple surrogate pairs
+  EXPECT_EQ(json::Parse("\"\\uD83D\\uDE00\\uD83D\\uDE01\"").cast<String>(),
+            u8"\U0001F600\U0001F601");
+}
+
+// A 1000-element array and a 500-key object must parse with the right sizes.
+TEST(JSONParser, LargeInputs) {
+  // Test large array
+  std::string large_array = "[";
+  for (int i = 0; i < 1000; ++i) {
+    if (i > 0) large_array += ",";
+    large_array += std::to_string(i);
+  }
+  large_array += "]";
+
+  auto result = json::Parse(large_array);
+  EXPECT_TRUE(result != nullptr);
+  EXPECT_EQ(result.cast<json::Array>().size(), 1000);
+
+  // Test large object
+  std::string large_object = "{";
+  for (int i = 0; i < 500; ++i) {
+    if (i > 0) large_object += ",";
+    large_object += "\"key" + std::to_string(i) + "\":" + std::to_string(i);
+  }
+  large_object += "}";
+
+  result = json::Parse(large_object);
+  EXPECT_TRUE(result != nullptr);
+  EXPECT_EQ(result.cast<json::Object>().size(), 500);
+}
+
+// One document exercising every value kind, compared structurally against a hand-built tree.
+TEST(JSONParser, MixedDataTypes) {
+  // Test complex nested structure with all data types
+  std::string complex_json = R"({
+    "null_value": null,
+    "boolean_true": true,
+    "boolean_false": false,
+    "integer": 42,
+    "negative_integer": -42,
+    "float": 3.14159,
+    "scientific": 1.23e-4,
+    "string": "hello world",
+    "unicode_string": "Hello \u4e16\u754c \ud83c\udf0d",
+    "empty_string": "",
+    "empty_array": [],
+    "empty_object": {},
+    "number_array": [1, 2, 3, 4, 5],
+    "mixed_array": [1, "two", true, null, 3.14],
+    "nested_object": {
+      "level1": {
+        "level2": {
+          "data": [1, 2, {"nested_array": [true, false]}]
+        }
+      }
+    }
+  })";
+
+  auto result = json::Parse(complex_json);
+
+  // Create expected structure for comparison
+  json::Object expected{
+      {"null_value", nullptr},
+      {"boolean_true", true},
+      {"boolean_false", false},
+      {"integer", 42},
+      {"negative_integer", -42},
+      {"float", 3.14159},
+      {"scientific", 1.23e-4},
+      {"string", "hello world"},
+      {"unicode_string", u8"Hello \U00004E16\U0000754C \U0001F30D"},
+      {"empty_string", ""},
+      {"empty_array", json::Array{}},
+      {"empty_object", json::Object{}},
+      {"number_array", json::Array{1, 2, 3, 4, 5}},
+      {"mixed_array", json::Array{1, "two", true, nullptr, 3.14}},
+      {"nested_object",
+       json::Object{
+           {"level1",
+            json::Object{
+                {"level2",
+                 json::Object{
+                     {"data",
+                      json::Array{1, 2,
+                                  json::Object{{"nested_array", json::Array{true, false}}}}}}}}}}}};
+
+  EXPECT_TRUE(StructuralEqual()(result, expected));
+}
+
+// Anything after one complete top-level value is rejected as "Extra data",
+// reported at the first offending non-whitespace character.
+TEST(JSONParser, WrongExtraData) {
+  String error_msg;
+
+  EXPECT_EQ(json::Parse("truee", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 5 (char 4)");
+
+  EXPECT_EQ(json::Parse("true false", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 6 (char 5)");
+
+  EXPECT_EQ(json::Parse("123 456", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 5 (char 4)");
+
+  EXPECT_EQ(json::Parse("\"hello\" \"world\"", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 9 (char 8)");
+
+  EXPECT_EQ(json::Parse("{} []", &error_msg), nullptr);
+  EXPECT_EQ(error_msg, "Extra data: line 1 column 4 (char 3)");
+}
+}  // namespace
