github-actions[bot] commented on code in PR #63322:
URL: https://github.com/apache/doris/pull/63322#discussion_r3255002040
##########
be/src/util/json/simd_json_parser.h:
##########
@@ -38,143 +72,400 @@ class SimdJSONParser {
class Element {
public:
ALWAYS_INLINE Element() {} /// NOLINT
- ALWAYS_INLINE Element(const simdjson::dom::element& element_)
- : element(element_) {} /// NOLINT
+ ALWAYS_INLINE explicit Element(const simdjson::dom::element& element_)
+ : dom_element(element_) {} ///
NOLINT
+ ALWAYS_INLINE explicit Element(const Node* node_) : node(node_) {} ///
NOLINT
ALWAYS_INLINE bool isInt64() const {
- return element.type() == simdjson::dom::element_type::INT64;
+ return node ? node->type == Node::Type::INT64
+ : dom_element.type() ==
simdjson::dom::element_type::INT64;
}
ALWAYS_INLINE bool isUInt64() const {
- return element.type() == simdjson::dom::element_type::UINT64;
+ return node ? node->type == Node::Type::UINT64
+ : dom_element.type() ==
simdjson::dom::element_type::UINT64;
}
+ ALWAYS_INLINE bool isInt128() const { return node && node->type ==
Node::Type::INT128; }
ALWAYS_INLINE bool isDouble() const {
- return element.type() == simdjson::dom::element_type::DOUBLE;
+ return node ? node->type == Node::Type::DOUBLE
+ : dom_element.type() ==
simdjson::dom::element_type::DOUBLE;
}
ALWAYS_INLINE bool isString() const {
- return element.type() == simdjson::dom::element_type::STRING;
+ return node ? node->type == Node::Type::STRING
+ : dom_element.type() ==
simdjson::dom::element_type::STRING;
}
ALWAYS_INLINE bool isArray() const {
- return element.type() == simdjson::dom::element_type::ARRAY;
+ return node ? node->type == Node::Type::ARRAY
+ : dom_element.type() ==
simdjson::dom::element_type::ARRAY;
}
ALWAYS_INLINE bool isObject() const {
- return element.type() == simdjson::dom::element_type::OBJECT;
+ return node ? node->type == Node::Type::OBJECT
+ : dom_element.type() ==
simdjson::dom::element_type::OBJECT;
}
ALWAYS_INLINE bool isBool() const {
- return element.type() == simdjson::dom::element_type::BOOLEAN;
+ return node ? node->type == Node::Type::BOOL
+ : dom_element.type() ==
simdjson::dom::element_type::BOOLEAN;
}
ALWAYS_INLINE bool isNull() const {
- return element.type() == simdjson::dom::element_type::NULL_VALUE;
+ return node ? node->type == Node::Type::NULL_VALUE
+ : dom_element.type() ==
simdjson::dom::element_type::NULL_VALUE;
+ }
+ ALWAYS_INLINE Int64 getInt64() const {
+ return node ? node->int64_value :
dom_element.get_int64().value_unsafe();
+ }
+ ALWAYS_INLINE double getDouble() const {
+ return node ? node->double_value :
dom_element.get_double().value_unsafe();
+ }
+ ALWAYS_INLINE bool getBool() const {
+ return node ? node->bool_value :
dom_element.get_bool().value_unsafe();
}
- ALWAYS_INLINE Int64 getInt64() const { return
element.get_int64().value_unsafe(); }
- ALWAYS_INLINE double getDouble() const { return
element.get_double().value_unsafe(); }
- ALWAYS_INLINE bool getBool() const { return
element.get_bool().value_unsafe(); }
ALWAYS_INLINE std::string_view getString() const {
- return element.get_string().value_unsafe();
+ return node ? std::string_view(node->string_value)
+ : dom_element.get_string().value_unsafe();
+ }
+ ALWAYS_INLINE UInt64 getUInt64() const {
+ return node ? node->uint64_value :
dom_element.get_uint64().value_unsafe();
+ }
+ ALWAYS_INLINE Int128 getInt128() const {
+ assert(node != nullptr);
+ return node->int128_value;
+ }
+ ALWAYS_INLINE std::string_view getRawNumber() const {
+ assert(node != nullptr);
+ return node->raw_number;
}
- ALWAYS_INLINE UInt64 getUInt64() const { return
element.get_uint64().value_unsafe(); }
ALWAYS_INLINE Array getArray() const;
ALWAYS_INLINE Object getObject() const;
private:
- simdjson::dom::element element;
+ simdjson::dom::element dom_element;
+ const Node* node = nullptr;
};
/// References an array in a JSON document.
class Array {
public:
class Iterator {
public:
- ALWAYS_INLINE Iterator(const simdjson::dom::array::iterator& it_)
- : it(it_) {} /// NOLINT
- ALWAYS_INLINE Element operator*() const { return *it; }
+ using DomIterator = simdjson::dom::array::iterator;
+ using NodeIterator = std::vector<Node>::const_iterator;
+ ALWAYS_INLINE explicit Iterator(const DomIterator& it_) : it(it_)
{} /// NOLINT
+ ALWAYS_INLINE explicit Iterator(NodeIterator it_) : it(it_) {}
/// NOLINT
+ ALWAYS_INLINE Element operator*() const {
+ if (const auto* node_it = std::get_if<NodeIterator>(&it)) {
+ return Element(&**node_it);
+ }
+ return Element(*std::get<DomIterator>(it));
+ }
ALWAYS_INLINE Iterator& operator++() {
- ++it;
+ if (auto* node_it = std::get_if<NodeIterator>(&it)) {
+ ++(*node_it);
+ } else {
+ ++std::get<DomIterator>(it);
+ }
return *this;
}
ALWAYS_INLINE friend bool operator!=(const Iterator& left, const
Iterator& right) {
- return left.it != right.it;
+ if (const auto* left_node_it =
std::get_if<NodeIterator>(&left.it)) {
+ return *left_node_it != std::get<NodeIterator>(right.it);
+ }
+ return std::get<DomIterator>(left.it) !=
std::get<DomIterator>(right.it);
}
private:
- simdjson::dom::array::iterator it;
+ std::variant<DomIterator, NodeIterator> it;
};
- ALWAYS_INLINE Array(const simdjson::dom::array& array_) :
array(array_) {} /// NOLINT
- ALWAYS_INLINE Iterator begin() const { return array.begin(); }
- ALWAYS_INLINE Iterator end() const { return array.end(); }
- ALWAYS_INLINE size_t size() const { return array.size(); }
+ ALWAYS_INLINE explicit Array(const simdjson::dom::array& array_)
+ : dom_array(array_) {}
/// NOLINT
+ ALWAYS_INLINE explicit Array(const std::vector<Node>* array_) :
array(array_) {} /// NOLINT
+ ALWAYS_INLINE Iterator begin() const {
+ return array ? Iterator(array->begin()) :
Iterator(dom_array.begin());
+ }
+ ALWAYS_INLINE Iterator end() const {
+ return array ? Iterator(array->end()) : Iterator(dom_array.end());
+ }
+ ALWAYS_INLINE size_t size() const { return array ? array->size() :
dom_array.size(); }
ALWAYS_INLINE Element operator[](size_t index) const {
assert(index < size());
- return array.at(index).value_unsafe();
+ return array ? Element(&(*array)[index]) :
Element(dom_array.at(index).value_unsafe());
}
private:
- simdjson::dom::array array;
+ simdjson::dom::array dom_array;
+ const std::vector<Node>* array = nullptr;
};
using KeyValuePair = std::pair<std::string_view, Element>;
/// References an object in a JSON document.
class Object {
public:
class Iterator {
public:
- ALWAYS_INLINE Iterator(const simdjson::dom::object::iterator& it_)
- : it(it_) {} /// NOLINT
+ using DomIterator = simdjson::dom::object::iterator;
+ ALWAYS_INLINE explicit Iterator(const std::vector<std::string>*
keys_,
+ const std::vector<Node>* values_,
size_t index_)
+ : it(index_), keys(keys_), values(values_) {}
/// NOLINT
+ ALWAYS_INLINE explicit Iterator(const DomIterator& it_) : it(it_)
{} /// NOLINT
ALWAYS_INLINE KeyValuePair operator*() const {
- const auto& res = *it;
- return {res.key, res.value};
+ if (const auto* index = std::get_if<size_t>(&it)) {
+ return {(*keys)[*index], Element(&(*values)[*index])};
+ }
+ const auto& res = *std::get<DomIterator>(it);
+ return {res.key, Element(res.value)};
}
ALWAYS_INLINE Iterator& operator++() {
- ++it;
+ if (auto* index = std::get_if<size_t>(&it)) {
+ ++(*index);
+ } else {
+ ++std::get<DomIterator>(it);
+ }
return *this;
}
ALWAYS_INLINE Iterator operator++(int) {
auto res = *this;
- ++it;
+ ++(*this);
return res;
} /// NOLINT
ALWAYS_INLINE friend bool operator!=(const Iterator& left, const
Iterator& right) {
- return left.it != right.it;
+ if (const auto* left_index = std::get_if<size_t>(&left.it)) {
+ return *left_index != std::get<size_t>(right.it);
+ }
+ return std::get<DomIterator>(left.it) !=
std::get<DomIterator>(right.it);
}
ALWAYS_INLINE friend bool operator==(const Iterator& left, const
Iterator& right) {
return !(left != right);
}
private:
- simdjson::dom::object::iterator it;
+ std::variant<DomIterator, size_t> it;
+ const std::vector<std::string>* keys = nullptr;
+ const std::vector<Node>* values = nullptr;
};
- ALWAYS_INLINE Object(const simdjson::dom::object& object_) :
object(object_) {} /// NOLINT
- ALWAYS_INLINE Iterator begin() const { return object.begin(); }
- ALWAYS_INLINE Iterator end() const { return object.end(); }
- ALWAYS_INLINE size_t size() const { return object.size(); }
+ ALWAYS_INLINE explicit Object(const simdjson::dom::object& object_)
+ : dom_object(object_) {} /// NOLINT
+ ALWAYS_INLINE explicit Object(const std::vector<std::string>* keys_,
+ const std::vector<Node>* values_)
+ : keys(keys_), values(values_) {} /// NOLINT
+ ALWAYS_INLINE Iterator begin() const {
+ return values ? Iterator(keys, values, 0) :
Iterator(dom_object.begin());
+ }
+ ALWAYS_INLINE Iterator end() const {
+ return values ? Iterator(keys, values, size()) :
Iterator(dom_object.end());
+ }
+ ALWAYS_INLINE size_t size() const { return values ? values->size() :
dom_object.size(); }
/// Optional: Provides access to an object's element by index.
KeyValuePair operator[](size_t index) const {
assert(index < size());
- auto it = object.begin();
+ if (values) {
+ return {(*keys)[index], Element(&(*values)[index])};
+ }
+ auto it = dom_object.begin();
while (index--) {
++it;
}
const auto& res = *it;
- return {res.key, res.value};
+ return {res.key, Element(res.value)};
}
private:
- simdjson::dom::object object;
+ simdjson::dom::object dom_object;
+ const std::vector<std::string>* keys = nullptr;
+ const std::vector<Node>* values = nullptr;
};
/// Parses a JSON document, returns the reference to its root element if
succeeded.
- bool parse(const char* data, size_t size, Element& result) {
- auto document = parser.parse(data, size);
- if (document.error()) {
+ bool parse(const char* data, size_t size, Element& result, bool
preserve_raw_numbers = false) {
+ if (!preserve_raw_numbers) {
+ auto document = dom_parser.parse(data, size);
+ if (document.error()) {
+ return false;
+ }
+ result = Element(document.value_unsafe());
+ return true;
+ }
+
+ return parse_ondemand(data, size, result);
+ }
+
+private:
+ bool parse_ondemand(const char* data, size_t size, Element& result) {
+ simdjson::padded_string padded_json(data, size);
+ simdjson::ondemand::document document;
+ auto error = ondemand_parser.iterate(padded_json).get(document);
+ if (error) {
+ return false;
+ }
+ root = Node();
+ if (!build_node(document, &root)) {
return false;
}
- result = document.value_unsafe();
+ result = Element(&root);
return true;
}
-private:
- simdjson::dom::parser parser;
+ static std::string_view trim_raw_number(std::string_view raw_number) {
+ auto is_space = [](char ch) { return ch == ' ' || ch == '\t' || ch ==
'\n' || ch == '\r'; };
+ while (!raw_number.empty() && is_space(raw_number.front())) {
+ raw_number.remove_prefix(1);
+ }
+ while (!raw_number.empty() && is_space(raw_number.back())) {
+ raw_number.remove_suffix(1);
+ }
+ return raw_number;
+ }
+
+ template <typename RawNumber>
+ static bool assign_raw_number(RawNumber&& raw_number, std::string* out) {
+ if constexpr (std::is_same_v<std::decay_t<RawNumber>,
std::string_view>) {
+ *out = std::string(trim_raw_number(raw_number));
+ return true;
+ } else {
+ std::string_view raw_number_view;
+ auto error = std::move(raw_number).get(raw_number_view);
+ if (error) {
+ return false;
+ }
+ *out = std::string(trim_raw_number(raw_number_view));
+ return true;
+ }
+ }
+
+ template <typename Value>
+ bool build_array_node(Value& value, Node* out) {
+ simdjson::ondemand::array array;
+ auto error = value.get_array().get(array);
+ if (error) {
+ return false;
+ }
+ out->type = Node::Type::ARRAY;
+ for (auto element_result : array) {
+ simdjson::ondemand::value element;
+ error = std::move(element_result).get(element);
+ if (error) {
+ return false;
+ }
+ Node element_node;
+ if (!build_node(element, &element_node)) {
+ return false;
+ }
+ out->array_values.push_back(std::move(element_node));
+ }
+ return true;
+ }
+
+ template <typename Value>
+ bool build_object_node(Value& value, Node* out) {
+ simdjson::ondemand::object object;
+ auto error = value.get_object().get(object);
+ if (error) {
+ return false;
+ }
+ out->type = Node::Type::OBJECT;
+ for (auto field_result : object) {
+ simdjson::ondemand::field field;
+ error = std::move(field_result).get(field);
+ if (error) {
+ return false;
+ }
+ std::string_view key;
+ error = field.unescaped_key().get(key);
+ if (error) {
+ return false;
+ }
+ std::string key_copy(key);
+ simdjson::ondemand::value field_value = field.value();
+ Node field_node;
+ if (!build_node(field_value, &field_node)) {
+ return false;
+ }
+ out->object_keys.push_back(std::move(key_copy));
+ out->object_values.push_back(std::move(field_node));
+ }
+ return true;
+ }
+
+ template <typename Value>
+ bool build_number_node(Value& value, Node* out) {
+ simdjson::ondemand::number_type number_type;
+ auto error = value.get_number_type().get(number_type);
+ if (error) {
+ return false;
+ }
+ switch (number_type) {
+ case simdjson::ondemand::number_type::signed_integer:
+ out->type = Node::Type::INT64;
+ error = value.get_int64().get(out->int64_value);
+ return !error;
+ case simdjson::ondemand::number_type::unsigned_integer:
+ out->type = Node::Type::UINT64;
+ error = value.get_uint64().get(out->uint64_value);
+ return !error;
+ case simdjson::ondemand::number_type::floating_point_number:
+ if (!assign_raw_number(value.raw_json_token(), &out->raw_number)) {
+ return false;
+ }
+ out->type = Node::Type::DOUBLE;
+ error = value.get_double().get(out->double_value);
+ return !error;
+ case simdjson::ondemand::number_type::big_integer: {
+ if (!assign_raw_number(value.raw_json_token(), &out->raw_number)) {
+ return false;
+ }
+ out->type = Node::Type::INT128;
+ StringParser::ParseResult parse_result;
+ out->int128_value = StringParser::string_to_int<Int128>(
+ out->raw_number.data(), out->raw_number.size(),
&parse_result);
+ return parse_result == StringParser::PARSE_SUCCESS;
+ }
Review Comment:
This `big_integer` branch still requires every raw-preservation parse to fit
the token into `Int128` before `JSONDataParser` can check whether the current
path is a typed decimal path. A Variant path declared as
`decimalv3(76,0)`/Decimal256 can validly contain an integer outside the
`Int128` range, for example `340282366920938463463374607431768211456` (2^128).
On a schema-typed path this value should be preserved as raw text and cast to
Decimal256, but `string_to_int<Int128>()` fails here, `build_node()` returns
false, and the whole JSON document is treated as a parse failure/fallback
instead. Please either keep the raw token for `big_integer` nodes without
requiring an Int128 conversion on paths that may be decimal-preserved, or
defer the Int128 conversion until the consumer actually needs a LargeInt.
Please also add a Decimal256 integer regression case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]