This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch feat-nested
in repository https://gitbox.apache.org/repos/asf/doris.git

commit e3e6fdb62077789908134b88b3027fce4b7633a9
Author: eldenmoon <[email protected]>
AuthorDate: Fri Jan 9 18:54:29 2026 +0800

    Add NestedGroupBuilder for expanding JSONB arrays.
---
 .../segment_v2/variant/nested_group_builder.cpp    | 357 +++++++++++++++++++++
 .../segment_v2/variant/nested_group_builder.h      | 124 +++++++
 2 files changed, 481 insertions(+)

diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp 
b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp
new file mode 100644
index 00000000000..c20d6524f1c
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp
@@ -0,0 +1,357 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant/nested_group_builder.h"
+
+#include <algorithm>
+#include <string>
+
+#include "common/exception.h"
+#include "util/jsonb_document.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_string.h"
+#include "vec/common/assert_cast.h"
+
+namespace doris::segment_v2 {
+
+// Lazily creates the offsets column; does nothing when it already exists.
+void NestedGroup::ensure_offsets() {
+    if (offsets) {
+        return;
+    }
+    offsets = vectorized::ColumnOffset64::create();
+}
+
+namespace {
+
+// Appends empty-array entries to `group`'s offsets column until it holds at least
+// `target_size` entries. Each appended entry repeats the current flat size, i.e. it
+// describes an array with zero elements. A group that already has enough entries is
+// left untouched.
+void pad_offsets_with_empty(NestedGroup& group, size_t target_size) {
+    group.ensure_offsets();
+    auto* off = vectorized::assert_cast<vectorized::ColumnOffset64*>(group.offsets.get());
+    while (off->size() < target_size) {
+        off->get_data().push_back(static_cast<uint64_t>(group.current_flat_size));
+    }
+}
+
+} // namespace
+
+// Scans the first min(num_rows, column size) rows of a JSONB column and expands every
+// array<object> found under a named field into `nested_groups`.
+//
+// jsonb_column:  ColumnString, optionally wrapped in ColumnNullable. A null pointer is
+//                accepted and results in no work.
+// base_path:     path of this JSONB column in ColumnVariant (empty for root JSONB).
+// nested_groups: output map keyed by full array path; groups are created on demand.
+//
+// Returns InvalidArgument when the (unwrapped) column is not a ColumnString, and forwards
+// any error raised while processing a row. NULL rows, empty values and values for which
+// JsonbDocument::createValue returns null are skipped.
+//
+// After the scan every group in `nested_groups` holds at least one offsets entry per
+// scanned row: rows that did not contain the array are recorded as empty arrays, so the
+// offsets index equals the row index.
+Status NestedGroupBuilder::build_from_jsonb(const vectorized::ColumnPtr& jsonb_column,
+                                           const vectorized::PathInData& base_path,
+                                           NestedGroupsMap& nested_groups, size_t num_rows) {
+    if (!jsonb_column) {
+        return Status::OK();
+    }
+
+    const auto* col_nullable =
+            vectorized::check_and_get_column<vectorized::ColumnNullable>(jsonb_column.get());
+    const vectorized::IColumn* data_col = jsonb_column.get();
+    const vectorized::ColumnUInt8* null_map = nullptr;
+    if (col_nullable) {
+        data_col = &col_nullable->get_nested_column();
+        null_map = &col_nullable->get_null_map_column();
+    }
+
+    const auto* str_col = vectorized::check_and_get_column<vectorized::ColumnString>(data_col);
+    if (!str_col) {
+        return Status::InvalidArgument("NestedGroupBuilder expects JSONB as ColumnString, got {}",
+                                       data_col->get_name());
+    }
+
+    const size_t rows = std::min(num_rows, str_col->size());
+    for (size_t r = 0; r < rows; ++r) {
+        if (null_map && (*null_map).get_data()[r]) {
+            continue;
+        }
+        const auto val = str_col->get_data_at(r);
+        if (val.size == 0) {
+            continue;
+        }
+
+        const doris::JsonbValue* root = doris::JsonbDocument::createValue(val.data, val.size);
+        if (!root) {
+            continue;
+        }
+
+        // base_path is the JSON path of this JSONB column in ColumnVariant.
+        // For root JSONB, base_path is empty and we only traverse into objects to discover
+        // nested arrays under named fields.
+        RETURN_IF_ERROR(_process_jsonb_value(root, base_path, nested_groups, r, 0));
+    }
+
+    // Rows after the last occurrence of an array never reach _process_jsonb_value, so close
+    // the tail of every group with empty arrays to keep one offsets entry per scanned row.
+    for (auto& [group_path, group] : nested_groups) {
+        if (group) {
+            pad_offsets_with_empty(*group, rows);
+        }
+    }
+
+    return Status::OK();
+}
+
+// Recursively walks one JSONB value of row `row_idx`.
+//  - Objects: each field is visited with the dotted path extended by its key.
+//  - Arrays: an array whose non-null elements are all objects, found under a non-empty
+//    path, is appended to the NestedGroup keyed by that path (created on demand).
+//    Other arrays are ignored.
+//  - Anything else is ignored.
+// A null `value`, or a `depth` beyond a non-zero _max_depth, ends the walk with OK.
+Status NestedGroupBuilder::_process_jsonb_value(const doris::JsonbValue* value,
+                                               const vectorized::PathInData& current_path,
+                                               NestedGroupsMap& nested_groups, size_t row_idx,
+                                               size_t depth) {
+    if (!value) {
+        return Status::OK();
+    }
+    if (_max_depth > 0 && depth > _max_depth) {
+        return Status::OK();
+    }
+
+    if (value->isObject()) {
+        const auto* obj = value->unpack<doris::ObjectVal>();
+        for (auto it = obj->begin(); it != obj->end(); ++it) {
+            std::string key(it->getKeyStr(), it->klen());
+            vectorized::PathInData next =
+                    current_path.empty()
+                            ? vectorized::PathInData(key)
+                            : vectorized::PathInData(current_path.get_path() + "." + key);
+            RETURN_IF_ERROR(
+                    _process_jsonb_value(it->value(), next, nested_groups, row_idx, depth + 1));
+        }
+        return Status::OK();
+    }
+
+    if (value->isArray()) {
+        // Ignore top-level arrays when base path is empty, since they are kept
+        // as root JSONB and do not require NestedGroup to preserve associations.
+        if (current_path.empty()) {
+            return Status::OK();
+        }
+
+        if (!_is_array_of_objects(value)) {
+            return Status::OK();
+        }
+
+        // Get or create top-level group keyed by full array path.
+        std::shared_ptr<NestedGroup>& gptr = nested_groups[current_path];
+        if (!gptr) {
+            gptr = std::make_shared<NestedGroup>();
+            gptr->path = current_path;
+        }
+
+        if (_handle_conflict(*gptr, /*is_array_object=*/true)) {
+            return Status::OK();
+        }
+
+        // Earlier rows that lacked this array (or were skipped entirely) get an empty-array
+        // entry first, so the entry appended below lands at index row_idx.
+        pad_offsets_with_empty(*gptr, row_idx);
+
+        return _process_array_of_objects(value, *gptr, row_idx, depth + 1);
+    }
+
+    return Status::OK();
+}
+
+// Returns true when `arr_value` is a JSONB array in which every element is either an
+// object or null/absent. An array without elements therefore also qualifies.
+// Returns false for a null pointer or a non-array value.
+bool NestedGroupBuilder::_is_array_of_objects(const doris::JsonbValue* arr_value) const {
+    if (arr_value == nullptr || !arr_value->isArray()) {
+        return false;
+    }
+    const auto* elements = arr_value->unpack<doris::ArrayVal>();
+    const int count = elements->numElem();
+    for (int pos = 0; pos < count; ++pos) {
+        const auto* item = elements->get(pos);
+        const bool is_null_like = (item == nullptr) || item->isNull();
+        if (!is_null_like && !item->isObject()) {
+            // A single scalar or nested array disqualifies the whole array.
+            return false;
+        }
+    }
+    return true;
+}
+
+// Appends one array to `group`: pushes a single cumulative offsets entry covering all of
+// the array's elements, then processes the elements one by one. After each element, every
+// child subcolumn the element did not write receives a default value, and every nested
+// group it did not write receives an empty-array offsets entry, so all of them advance by
+// exactly one slot per element.
+// The parent row index is accepted for interface symmetry but is not used here.
+Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* arr_value,
+                                                    NestedGroup& group, size_t /*parent_row_idx*/,
+                                                    size_t depth) {
+    DCHECK(arr_value && arr_value->isArray());
+
+    const auto* elements = arr_value->unpack<doris::ArrayVal>();
+    const int elem_count = elements->numElem();
+
+    group.ensure_offsets();
+    auto& offsets_data =
+            vectorized::assert_cast<vectorized::ColumnOffset64*>(group.offsets.get())->get_data();
+
+    // Elements of this array occupy flat indexes [first_flat_idx, current_flat_size).
+    const size_t first_flat_idx = group.current_flat_size;
+    group.current_flat_size = first_flat_idx + static_cast<size_t>(std::max(0, elem_count));
+    offsets_data.push_back(static_cast<uint64_t>(group.current_flat_size));
+
+    for (int i = 0; i < elem_count; ++i) {
+        const size_t flat_idx = first_flat_idx + static_cast<size_t>(i);
+        const auto* elem = elements->get(i);
+
+        std::unordered_set<std::string_view> touched_children;
+        std::unordered_set<std::string_view> touched_nested;
+
+        // Null or absent elements contribute nothing; non-object elements were already
+        // ruled out by the array<object> validation and are skipped defensively.
+        if (elem != nullptr && elem->isObject()) {
+            RETURN_IF_ERROR(_process_object_as_paths(elem, vectorized::PathInData {}, group,
+                                                    flat_idx, touched_children, touched_nested,
+                                                    depth + 1));
+        }
+
+        // Scalar children this element did not provide get a default value.
+        for (auto& child_entry : group.children) {
+            if (touched_children.find(child_entry.first.get_path()) == touched_children.end()) {
+                child_entry.second.insert_default();
+            }
+        }
+
+        // Nested groups this element did not provide get an empty array.
+        for (auto& nested_entry : group.nested_groups) {
+            if (touched_nested.count(nested_entry.first.get_path()) != 0) {
+                continue;
+            }
+            auto& nested = nested_entry.second;
+            nested->ensure_offsets();
+            auto* nested_offsets =
+                    vectorized::assert_cast<vectorized::ColumnOffset64*>(nested->offsets.get());
+            nested_offsets->get_data().push_back(static_cast<uint64_t>(nested->current_flat_size));
+        }
+    }
+
+    return Status::OK();
+}
+
+// Writes the fields of one array element (an object) into `group`.
+//  - Object fields are flattened recursively into dotted paths relative to the element.
+//  - Fields holding array<object> become nested groups in `group.nested_groups`.
+//  - Every other field is converted with _jsonb_to_field and appended to the matching
+//    subcolumn in `group.children`.
+// Subcolumns and nested groups that are shorter than `element_flat_idx` are first padded
+// with defaults / empty arrays so the new value lands in the slot of this element.
+//
+// Paths written for this element are recorded in `seen_child_paths` / `seen_nested_paths`.
+// Both sets hold string_views and are read by the caller after this function returns, so
+// every view stored here must reference storage owned by `group` (a map key or the nested
+// group's own path), never a local of this function.
+//
+// Returns OK without writing anything when `depth` exceeds a non-zero _max_depth.
+// Propagates conversion errors and reports a failed subcolumn insert as InternalError.
+Status NestedGroupBuilder::_process_object_as_paths(
+        const doris::JsonbValue* obj_value, const vectorized::PathInData& current_prefix,
+        NestedGroup& group, size_t element_flat_idx,
+        std::unordered_set<std::string_view>& seen_child_paths,
+        std::unordered_set<std::string_view>& seen_nested_paths, size_t depth) {
+    DCHECK(obj_value && obj_value->isObject());
+    if (_max_depth > 0 && depth > _max_depth) {
+        return Status::OK();
+    }
+
+    const auto* obj = obj_value->unpack<doris::ObjectVal>();
+    for (auto it = obj->begin(); it != obj->end(); ++it) {
+        std::string key(it->getKeyStr(), it->klen());
+        vectorized::PathInData next_prefix =
+                current_prefix.empty()
+                        ? vectorized::PathInData(key)
+                        : vectorized::PathInData(current_prefix.get_path() + "." + key);
+        const auto* v = it->value();
+        if (!v) {
+            continue;
+        }
+
+        if (v->isObject()) {
+            // Flatten object fields into dotted paths.
+            RETURN_IF_ERROR(_process_object_as_paths(v, next_prefix, group, element_flat_idx,
+                                                    seen_child_paths, seen_nested_paths,
+                                                    depth + 1));
+            continue;
+        }
+
+        if (v->isArray() && _is_array_of_objects(v)) {
+            // Nested array<object> inside this group.
+            std::shared_ptr<NestedGroup>& ng = group.nested_groups[next_prefix];
+            if (!ng) {
+                ng = std::make_shared<NestedGroup>();
+                ng->path = next_prefix;
+            }
+
+            if (_handle_conflict(*ng, /*is_array_object=*/true)) {
+                continue;
+            }
+
+            // Ensure offsets size up to current parent element.
+            ng->ensure_offsets();
+            auto* off = vectorized::assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get());
+            if (off->size() < element_flat_idx) {
+                // Fill missing parent elements with empty arrays.
+                const size_t gap = element_flat_idx - off->size();
+                for (size_t i = 0; i < gap; ++i) {
+                    off->get_data().push_back(static_cast<uint64_t>(ng->current_flat_size));
+                }
+            }
+
+            // Process nested group for this parent element (one offsets entry appended inside).
+            RETURN_IF_ERROR(_process_array_of_objects(v, *ng, element_flat_idx, depth + 1));
+            // ng->path is owned by the nested group, so the stored view stays valid.
+            seen_nested_paths.insert(ng->path.get_path());
+            continue;
+        }
+
+        // Scalar / non-array value becomes a child subcolumn.
+        // NOTE(review): an array that is not array<object> (e.g. an array of scalars) also
+        // reaches this point; _jsonb_to_field has no branch for arrays, so such a field
+        // aborts the whole build with InvalidArgument. Confirm this is intended.
+        vectorized::Field f;
+        RETURN_IF_ERROR(_jsonb_to_field(v, f));
+
+        // Find or create the subcolumn and keep the iterator: the key stored in the map
+        // outlives this call, whereas next_prefix is destroyed at the end of the iteration.
+        const auto child_it = group.children.try_emplace(next_prefix).first;
+        auto& sub = child_it->second;
+        if (sub.size() < element_flat_idx) {
+            sub.insert_many_defaults(element_flat_idx - sub.size());
+        }
+        try {
+            sub.insert(f);
+        } catch (const doris::Exception& e) {
+            return Status::InternalError("NestedGroupBuilder insert failed at {}: {}",
+                                         next_prefix.get_path(), e.to_string());
+        }
+        // Record a view of the map-owned key. Storing a view of next_prefix here would
+        // leave a dangling string_view for the caller to read.
+        seen_child_paths.insert(child_it->first.get_path());
+    }
+
+    return Status::OK();
+}
+
+// Converts a scalar JSONB value into a Field written to `out`.
+//   null pointer / JSON null -> default-constructed Field
+//   true / false             -> TYPE_BOOLEAN
+//   integer                  -> TYPE_BIGINT (value cast to int64_t)
+//   double / float           -> TYPE_DOUBLE
+//   string                   -> TYPE_STRING
+//   binary                   -> TYPE_JSONB
+// Any other value yields InvalidArgument and leaves `out` untouched.
+Status NestedGroupBuilder::_jsonb_to_field(const doris::JsonbValue* value,
+                                          vectorized::Field& out) const {
+    if (value == nullptr || value->isNull()) {
+        out = vectorized::Field();
+    } else if (value->isTrue()) {
+        out = vectorized::Field::create_field<TYPE_BOOLEAN>(true);
+    } else if (value->isFalse()) {
+        out = vectorized::Field::create_field<TYPE_BOOLEAN>(false);
+    } else if (value->isInt()) {
+        const auto as_int64 = static_cast<int64_t>(value->int_val());
+        out = vectorized::Field::create_field<TYPE_BIGINT>(as_int64);
+    } else if (value->isDouble()) {
+        const auto* double_val = value->unpack<doris::JsonbDoubleVal>();
+        out = vectorized::Field::create_field<TYPE_DOUBLE>(double_val->val());
+    } else if (value->isFloat()) {
+        const auto* float_val = value->unpack<doris::JsonbFloatVal>();
+        out = vectorized::Field::create_field<TYPE_DOUBLE>(static_cast<double>(float_val->val()));
+    } else if (value->isString()) {
+        const auto* str_val = value->unpack<doris::JsonbStringVal>();
+        out = vectorized::Field::create_field<TYPE_STRING>(
+                vectorized::String(str_val->getBlob(), str_val->getBlobLen()));
+    } else if (value->isBinary()) {
+        // Keep binary as JSONB blob to avoid data loss.
+        const auto* bin_val = value->unpack<doris::JsonbBinaryVal>();
+        out = vectorized::Field::create_field<TYPE_JSONB>(
+                vectorized::JsonbField(bin_val->getBlob(), bin_val->getBlobLen()));
+    } else {
+        return Status::InvalidArgument("NestedGroupBuilder cannot convert container type {} to field",
+                                       value->typeName());
+    }
+    return Status::OK();
+}
+
+// Decides whether the value currently being added to `group` must be dropped.
+// Returns true to discard the value, false to keep it.
+// The conflict policy will be refined in a later phase. For now:
+//  - a disabled group rejects everything;
+//  - the first value fixes the group's expected structure (ARRAY or SCALAR);
+//  - on a mismatch array<object> wins: a scalar arriving at an array group is dropped,
+//    while an array arriving at a non-array group clears the existing children and
+//    switches the group to ARRAY.
+bool NestedGroupBuilder::_handle_conflict(NestedGroup& group, bool is_array_object) const {
+    using StructureType = NestedGroup::StructureType;
+
+    if (group.is_disabled) {
+        return true;
+    }
+
+    if (group.expected_type == StructureType::UNKNOWN) {
+        if (is_array_object) {
+            group.expected_type = StructureType::ARRAY;
+        } else {
+            group.expected_type = StructureType::SCALAR;
+        }
+        return false;
+    }
+
+    const bool group_is_array = (group.expected_type == StructureType::ARRAY);
+    if (group_is_array == is_array_object) {
+        // Same structure as before: nothing to resolve.
+        return false;
+    }
+
+    if (is_array_object) {
+        // Prefer array<object> (keep nested) by default: discard scalars collected so far.
+        group.children.clear();
+        group.expected_type = StructureType::ARRAY;
+        return false;
+    }
+    return true;
+}
+
+} // namespace doris::segment_v2
+
diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h 
b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h
new file mode 100644
index 00000000000..cc903bab675
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "common/status.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_variant.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+struct JsonbValue;
+} // namespace doris
+
+namespace doris::segment_v2 {
+
+/**
+ * NestedGroup is a storage-layer structure used to persist array<object>
+ * with shared offsets to preserve per-element field associations.
+ *
+ * This is intentionally independent from ColumnVariant's in-memory nested structures.
+ *
+ * Layout as populated by NestedGroupBuilder:
+ *  - `offsets` receives one cumulative element count per processed array.
+ *  - `children` receive one value (or a default) per flattened array element and are
+ *    keyed by the dotted field path inside the element.
+ *  - `nested_groups` hold array<object> fields found inside the elements; their offsets
+ *    receive one entry per element of this group.
+ */
+struct NestedGroup {
+    // Full array path for top-level group (e.g. "voltage.list"),
+    // and relative path for nested groups within another NestedGroup (e.g. "cells").
+    vectorized::PathInData path;
+
+    // Offsets per parent row (or per parent element for nested groups).
+    vectorized::MutableColumnPtr offsets;
+
+    // Scalar (or flattened object) children under this array path.
+    std::unordered_map<vectorized::PathInData, vectorized::ColumnVariant::Subcolumn,
+                       vectorized::PathInData::Hash>
+            children;
+
+    // Nested array<object> groups under this array path.
+    std::unordered_map<vectorized::PathInData, std::shared_ptr<NestedGroup>,
+                       vectorized::PathInData::Hash>
+            nested_groups;
+
+    size_t current_flat_size = 0; // total elements appended so far; the last value pushed to offsets
+    bool is_disabled = false; // when true, NestedGroupBuilder discards every value for this group
+
+    enum struct StructureType { UNKNOWN, SCALAR, ARRAY, OBJECT };
+    StructureType expected_type = StructureType::UNKNOWN; // fixed by the first value seen for the group
+
+    void ensure_offsets(); // creates `offsets` as a ColumnOffset64 if it is still null
+};
+
+using NestedGroupsMap =
+        std::unordered_map<vectorized::PathInData, std::shared_ptr<NestedGroup>,
+                           vectorized::PathInData::Hash>; // top-level groups keyed by full array path
+
+/**
+ * Build NestedGroup(s) from JSONB columns at storage finalize stage.
+ * The builder scans JSONB values and only expands array<object>.
+ *
+ * Private traversal helpers:
+ *  - _process_jsonb_value: recursive walk over one row's value. Descends into objects
+ *    while extending the dotted path, and hands every array<object> found under a
+ *    non-empty path to _process_array_of_objects. Other values are ignored.
+ *  - _process_array_of_objects: appends one offsets entry for the array, processes each
+ *    element, and fills defaults / empty arrays for children and nested groups an
+ *    element did not write. The parent_row_idx argument is currently unused by the
+ *    definition.
+ *  - _process_object_as_paths: writes one element object into a group: flattens object
+ *    fields into dotted paths, converts scalars into child subcolumns and routes
+ *    array<object> fields into nested groups. The two sets record the paths written
+ *    for the element.
+ */
+class NestedGroupBuilder {
+public:
+    NestedGroupBuilder() = default;
+
+    // Build NestedGroups from a JSONB column. base_path is the path of this JSONB column
+    // in ColumnVariant (empty for root JSONB). At most num_rows rows are scanned.
+    Status build_from_jsonb(const vectorized::ColumnPtr& jsonb_column,
+                            const vectorized::PathInData& base_path, NestedGroupsMap& nested_groups,
+                            size_t num_rows);
+
+    // Convenience overload for root JSONB (forwards with an empty base_path).
+    Status build_from_jsonb(const vectorized::ColumnPtr& jsonb_column, NestedGroupsMap& nested_groups,
+                            size_t num_rows) {
+        return build_from_jsonb(jsonb_column, vectorized::PathInData {}, nested_groups, num_rows);
+    }
+
+    void set_max_depth(size_t max_depth) { _max_depth = max_depth; } // 0 removes the depth limit
+
+private:
+    Status _process_jsonb_value(const doris::JsonbValue* value,
+                               const vectorized::PathInData& current_path,
+                               NestedGroupsMap& nested_groups, size_t row_idx, size_t depth);
+
+    Status _process_object_as_paths(const doris::JsonbValue* obj_value,
+                                   const vectorized::PathInData& current_prefix,
+                                   NestedGroup& group, size_t element_flat_idx,
+                                   std::unordered_set<std::string_view>& seen_child_paths,
+                                   std::unordered_set<std::string_view>& seen_nested_paths,
+                                   size_t depth);
+
+    Status _process_array_of_objects(const doris::JsonbValue* arr_value, NestedGroup& group,
+                                    size_t parent_row_idx, size_t depth);
+
+    // Return true if this array can be treated as array<object> (nulls allowed).
+    bool _is_array_of_objects(const doris::JsonbValue* arr_value) const;
+
+    // Convert a JsonbValue to a scalar Field (or NULL Field). Container types are not supported.
+    Status _jsonb_to_field(const doris::JsonbValue* value, vectorized::Field& out) const;
+
+    // Conflict policy placeholder. Returns true if the current value should be discarded.
+    bool _handle_conflict(NestedGroup& group, bool is_array_object) const;
+
+private:
+    size_t _max_depth = 0; // 0 = unlimited; otherwise traversal stops once depth exceeds it
+};
+
+} // namespace doris::segment_v2
+


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to