github-actions[bot] commented on code in PR #63322:
URL: https://github.com/apache/doris/pull/63322#discussion_r3254568675


##########
be/src/util/json/simd_json_parser.h:
##########
@@ -94,87 +106,234 @@ class SimdJSONParser {
             }
 
         private:
-            simdjson::dom::array::iterator it;
+            std::vector<Node>::const_iterator it;
         };
-        ALWAYS_INLINE Array(const simdjson::dom::array& array_) : 
array(array_) {} /// NOLINT
-        ALWAYS_INLINE Iterator begin() const { return array.begin(); }
-        ALWAYS_INLINE Iterator end() const { return array.end(); }
-        ALWAYS_INLINE size_t size() const { return array.size(); }
+        ALWAYS_INLINE explicit Array(const std::vector<Node>* array_) : 
array(array_) {} /// NOLINT
+        ALWAYS_INLINE Iterator begin() const { return 
Iterator(array->begin()); }
+        ALWAYS_INLINE Iterator end() const { return Iterator(array->end()); }
+        ALWAYS_INLINE size_t size() const { return array->size(); }
         ALWAYS_INLINE Element operator[](size_t index) const {
             assert(index < size());
-            return array.at(index).value_unsafe();
+            return Element(&(*array)[index]);
         }
 
     private:
-        simdjson::dom::array array;
+        const std::vector<Node>* array = nullptr;
     };
     using KeyValuePair = std::pair<std::string_view, Element>;
     /// References an object in a JSON document.
     class Object {
     public:
         class Iterator {
         public:
-            ALWAYS_INLINE Iterator(const simdjson::dom::object::iterator& it_)
-                    : it(it_) {} /// NOLINT
+            ALWAYS_INLINE explicit Iterator(const std::vector<std::string>* 
keys_,
+                                            const std::vector<Node>* values_, 
size_t index_)
+                    : keys(keys_), values(values_), index(index_) {} /// NOLINT
             ALWAYS_INLINE KeyValuePair operator*() const {
-                const auto& res = *it;
-                return {res.key, res.value};
+                return {(*keys)[index], Element(&(*values)[index])};
             }
             ALWAYS_INLINE Iterator& operator++() {
-                ++it;
+                ++index;
                 return *this;
             }
             ALWAYS_INLINE Iterator operator++(int) {
                 auto res = *this;
-                ++it;
+                ++index;
                 return res;
             } /// NOLINT
             ALWAYS_INLINE friend bool operator!=(const Iterator& left, const 
Iterator& right) {
-                return left.it != right.it;
+                return left.index != right.index;
             }
             ALWAYS_INLINE friend bool operator==(const Iterator& left, const 
Iterator& right) {
                 return !(left != right);
             }
 
         private:
-            simdjson::dom::object::iterator it;
+            const std::vector<std::string>* keys = nullptr;
+            const std::vector<Node>* values = nullptr;
+            size_t index = 0;
         };
-        ALWAYS_INLINE Object(const simdjson::dom::object& object_) : 
object(object_) {} /// NOLINT
-        ALWAYS_INLINE Iterator begin() const { return object.begin(); }
-        ALWAYS_INLINE Iterator end() const { return object.end(); }
-        ALWAYS_INLINE size_t size() const { return object.size(); }
+        ALWAYS_INLINE explicit Object(const std::vector<std::string>* keys_,
+                                      const std::vector<Node>* values_)
+                : keys(keys_), values(values_) {} /// NOLINT
+        ALWAYS_INLINE Iterator begin() const { return Iterator(keys, values, 
0); }
+        ALWAYS_INLINE Iterator end() const { return Iterator(keys, values, 
size()); }
+        ALWAYS_INLINE size_t size() const { return values->size(); }
         /// Optional: Provides access to an object's element by index.
         KeyValuePair operator[](size_t index) const {
             assert(index < size());
-            auto it = object.begin();
-            while (index--) {
-                ++it;
-            }
-            const auto& res = *it;
-            return {res.key, res.value};
+            return {(*keys)[index], Element(&(*values)[index])};
         }
 
     private:
-        simdjson::dom::object object;
+        const std::vector<std::string>* keys = nullptr;
+        const std::vector<Node>* values = nullptr;
     };
     /// Parses a JSON document, returns the reference to its root element if 
succeeded.
     bool parse(const char* data, size_t size, Element& result) {
-        auto document = parser.parse(data, size);
-        if (document.error()) {
+        simdjson::padded_string padded_json(data, size);
+        simdjson::ondemand::document document;
+        auto error = parser.iterate(padded_json).get(document);
+        if (error) {
             return false;
         }
-        result = document.value_unsafe();
+        root = Node();
+        if (!build_node(document, &root)) {
+            return false;
+        }
+        result = Element(&root);
         return true;
     }
 
 private:
-    simdjson::dom::parser parser;
+    template <typename RawNumber>
+    static bool assign_raw_number(RawNumber&& raw_number, std::string* out) {
+        if constexpr (std::is_same_v<std::decay_t<RawNumber>, 
std::string_view>) {
+            *out = std::string(raw_number);
+            return true;
+        } else {
+            std::string_view raw_number_view;
+            auto error = std::move(raw_number).get(raw_number_view);
+            if (error) {
+                return false;
+            }
+            *out = std::string(raw_number_view);
+            return true;
+        }

Review Comment:
   This now eagerly materializes a second recursive JSON tree for every parse, 
copying every object key, string value, raw numeric token, and allocating 
vectors for every object/array before `JSONDataParser` traverses it. That 
happens even when `ParseConfig` has no decimal typed paths, so ordinary Variant 
ingestion pays a large CPU/memory cost and the allocations are in 
`std::string`/`std::vector` outside the existing column memory accounting. The 
raw-number preservation should stay on the hot path only when needed, or 
preserve raw numeric text without copying the whole document for 
untyped/non-decimal parses.



##########
be/src/util/json/simd_json_parser.h:
##########
@@ -94,87 +106,234 @@ class SimdJSONParser {
             }
 
         private:
-            simdjson::dom::array::iterator it;
+            std::vector<Node>::const_iterator it;
         };
-        ALWAYS_INLINE Array(const simdjson::dom::array& array_) : 
array(array_) {} /// NOLINT
-        ALWAYS_INLINE Iterator begin() const { return array.begin(); }
-        ALWAYS_INLINE Iterator end() const { return array.end(); }
-        ALWAYS_INLINE size_t size() const { return array.size(); }
+        ALWAYS_INLINE explicit Array(const std::vector<Node>* array_) : 
array(array_) {} /// NOLINT
+        ALWAYS_INLINE Iterator begin() const { return 
Iterator(array->begin()); }
+        ALWAYS_INLINE Iterator end() const { return Iterator(array->end()); }
+        ALWAYS_INLINE size_t size() const { return array->size(); }
         ALWAYS_INLINE Element operator[](size_t index) const {
             assert(index < size());
-            return array.at(index).value_unsafe();
+            return Element(&(*array)[index]);
         }
 
     private:
-        simdjson::dom::array array;
+        const std::vector<Node>* array = nullptr;
     };
     using KeyValuePair = std::pair<std::string_view, Element>;
     /// References an object in a JSON document.
     class Object {
     public:
         class Iterator {
         public:
-            ALWAYS_INLINE Iterator(const simdjson::dom::object::iterator& it_)
-                    : it(it_) {} /// NOLINT
+            ALWAYS_INLINE explicit Iterator(const std::vector<std::string>* 
keys_,
+                                            const std::vector<Node>* values_, 
size_t index_)
+                    : keys(keys_), values(values_), index(index_) {} /// NOLINT
             ALWAYS_INLINE KeyValuePair operator*() const {
-                const auto& res = *it;
-                return {res.key, res.value};
+                return {(*keys)[index], Element(&(*values)[index])};
             }
             ALWAYS_INLINE Iterator& operator++() {
-                ++it;
+                ++index;
                 return *this;
             }
             ALWAYS_INLINE Iterator operator++(int) {
                 auto res = *this;
-                ++it;
+                ++index;
                 return res;
             } /// NOLINT
             ALWAYS_INLINE friend bool operator!=(const Iterator& left, const 
Iterator& right) {
-                return left.it != right.it;
+                return left.index != right.index;
             }
             ALWAYS_INLINE friend bool operator==(const Iterator& left, const 
Iterator& right) {
                 return !(left != right);
             }
 
         private:
-            simdjson::dom::object::iterator it;
+            const std::vector<std::string>* keys = nullptr;
+            const std::vector<Node>* values = nullptr;
+            size_t index = 0;
         };
-        ALWAYS_INLINE Object(const simdjson::dom::object& object_) : 
object(object_) {} /// NOLINT
-        ALWAYS_INLINE Iterator begin() const { return object.begin(); }
-        ALWAYS_INLINE Iterator end() const { return object.end(); }
-        ALWAYS_INLINE size_t size() const { return object.size(); }
+        ALWAYS_INLINE explicit Object(const std::vector<std::string>* keys_,
+                                      const std::vector<Node>* values_)
+                : keys(keys_), values(values_) {} /// NOLINT
+        ALWAYS_INLINE Iterator begin() const { return Iterator(keys, values, 
0); }
+        ALWAYS_INLINE Iterator end() const { return Iterator(keys, values, 
size()); }
+        ALWAYS_INLINE size_t size() const { return values->size(); }
         /// Optional: Provides access to an object's element by index.
         KeyValuePair operator[](size_t index) const {
             assert(index < size());
-            auto it = object.begin();
-            while (index--) {
-                ++it;
-            }
-            const auto& res = *it;
-            return {res.key, res.value};
+            return {(*keys)[index], Element(&(*values)[index])};
         }
 
     private:
-        simdjson::dom::object object;
+        const std::vector<std::string>* keys = nullptr;
+        const std::vector<Node>* values = nullptr;
     };
     /// Parses a JSON document, returns the reference to its root element if 
succeeded.
     bool parse(const char* data, size_t size, Element& result) {
-        auto document = parser.parse(data, size);
-        if (document.error()) {
+        simdjson::padded_string padded_json(data, size);
+        simdjson::ondemand::document document;
+        auto error = parser.iterate(padded_json).get(document);
+        if (error) {
             return false;
         }
-        result = document.value_unsafe();
+        root = Node();
+        if (!build_node(document, &root)) {
+            return false;
+        }
+        result = Element(&root);
         return true;
     }
 
 private:
-    simdjson::dom::parser parser;
+    template <typename RawNumber>
+    static bool assign_raw_number(RawNumber&& raw_number, std::string* out) {
+        if constexpr (std::is_same_v<std::decay_t<RawNumber>, 
std::string_view>) {
+            *out = std::string(raw_number);
+            return true;
+        } else {
+            std::string_view raw_number_view;
+            auto error = std::move(raw_number).get(raw_number_view);
+            if (error) {
+                return false;
+            }
+            *out = std::string(raw_number_view);
+            return true;
+        }
+    }
+
+    template <typename Value>
+    bool build_array_node(Value& value, Node* out) {
+        simdjson::ondemand::array array;
+        auto error = value.get_array().get(array);
+        if (error) {
+            return false;
+        }
+        out->type = Node::Type::ARRAY;
+        for (auto element_result : array) {
+            simdjson::ondemand::value element;
+            error = std::move(element_result).get(element);
+            if (error) {
+                return false;
+            }
+            Node element_node;
+            if (!build_node(element, &element_node)) {
+                return false;
+            }
+            out->array_values.push_back(std::move(element_node));
+        }
+        return true;
+    }
+
+    template <typename Value>
+    bool build_object_node(Value& value, Node* out) {
+        simdjson::ondemand::object object;
+        auto error = value.get_object().get(object);
+        if (error) {
+            return false;
+        }
+        out->type = Node::Type::OBJECT;
+        for (auto field_result : object) {
+            simdjson::ondemand::field field;
+            error = std::move(field_result).get(field);
+            if (error) {
+                return false;
+            }
+            std::string_view key;
+            error = field.unescaped_key().get(key);
+            if (error) {
+                return false;
+            }
+            simdjson::ondemand::value field_value = field.value();
+            Node field_node;
+            if (!build_node(field_value, &field_node)) {
+                return false;
+            }
+            out->object_keys.emplace_back(key);
+            out->object_values.push_back(std::move(field_node));
+        }
+        return true;
+    }
+
+    template <typename Value>
+    bool build_number_node(Value& value, Node* out) {
+        if (!assign_raw_number(value.raw_json_token(), &out->raw_number)) {
+            return false;
+        }
+        simdjson::ondemand::number_type number_type;
+        auto error = value.get_number_type().get(number_type);
+        if (error) {
+            return false;
+        }
+        switch (number_type) {
+        case simdjson::ondemand::number_type::signed_integer:
+            out->type = Node::Type::INT64;
+            error = value.get_int64().get(out->int64_value);
+            return !error;
+        case simdjson::ondemand::number_type::unsigned_integer:
+            out->type = Node::Type::UINT64;

Review Comment:
   `key` comes from `field.unescaped_key()`, whose returned view can point at 
simdjson's string buffer for escaped keys. This code does not copy it until 
after `build_node(field_value, ...)`, and building the value can parse another 
escaped string/key and reuse that buffer. For input such as `{"a\\nb": {"x": 
"escaped\\tvalue"}}`, the key view can be overwritten before 
`emplace_back(key)`, so the Variant path stored for the object field is 
corrupted or mismatched. Please copy `key` into `object_keys` before descending 
into the value, then append the value node, and add an escaped-key case.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to