This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch feat-nested in repository https://gitbox.apache.org/repos/asf/doris.git
commit e3e6fdb62077789908134b88b3027fce4b7633a9 Author: eldenmoon <[email protected]> AuthorDate: Fri Jan 9 18:54:29 2026 +0800 Add NestedGroupBuilder for expanding JSONB arrays. --- .../segment_v2/variant/nested_group_builder.cpp | 357 +++++++++++++++++++++ .../segment_v2/variant/nested_group_builder.h | 124 +++++++ 2 files changed, 481 insertions(+) diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp new file mode 100644 index 00000000000..c20d6524f1c --- /dev/null +++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp @@ -0,0 +1,357 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/variant/nested_group_builder.h" + +#include <algorithm> +#include <string> + +#include "common/exception.h" +#include "util/jsonb_document.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_string.h" +#include "vec/common/assert_cast.h" + +namespace doris::segment_v2 { + +void NestedGroup::ensure_offsets() { + if (!offsets) { + offsets = vectorized::ColumnOffset64::create(); + } +} + +Status NestedGroupBuilder::build_from_jsonb(const vectorized::ColumnPtr& jsonb_column, + const vectorized::PathInData& base_path, + NestedGroupsMap& nested_groups, size_t num_rows) { + if (!jsonb_column) { + return Status::OK(); + } + + const auto* col_nullable = + vectorized::check_and_get_column<vectorized::ColumnNullable>(jsonb_column.get()); + const vectorized::IColumn* data_col = jsonb_column.get(); + const vectorized::ColumnUInt8* null_map = nullptr; + if (col_nullable) { + data_col = &col_nullable->get_nested_column(); + null_map = &col_nullable->get_null_map_column(); + } + + const auto* str_col = vectorized::check_and_get_column<vectorized::ColumnString>(data_col); + if (!str_col) { + return Status::InvalidArgument("NestedGroupBuilder expects JSONB as ColumnString, got {}", + data_col->get_name()); + } + + const size_t rows = std::min(num_rows, str_col->size()); + for (size_t r = 0; r < rows; ++r) { + if (null_map && (*null_map).get_data()[r]) { + continue; + } + const auto val = str_col->get_data_at(r); + if (val.size == 0) { + continue; + } + + const doris::JsonbValue* root = doris::JsonbDocument::createValue(val.data, val.size); + if (!root) { + continue; + } + + // English comment: base_path is the JSON path of this JSONB column in ColumnVariant. + // For root JSONB, base_path is empty and we only traverse into objects to discover + // nested arrays under named fields. + RETURN_IF_ERROR(_process_jsonb_value(root, base_path, nested_groups, r, 0)); + } + + return Status::OK(); +} + +Status NestedGroupBuilder::_process_jsonb_value(const doris::JsonbValue* value, + const vectorized::PathInData& current_path, + NestedGroupsMap& nested_groups, size_t row_idx, + size_t depth) { + if (!value) { + return Status::OK(); + } + if (_max_depth > 0 && depth > _max_depth) { + return Status::OK(); + } + + if (value->isObject()) { + const auto* obj = value->unpack<doris::ObjectVal>(); + for (auto it = obj->begin(); it != obj->end(); ++it) { + std::string key(it->getKeyStr(), it->klen()); + vectorized::PathInData next = + current_path.empty() ? vectorized::PathInData(key) + : vectorized::PathInData(current_path.get_path() + "." + + key); + RETURN_IF_ERROR(_process_jsonb_value(it->value(), next, nested_groups, row_idx, + depth + 1)); + } + return Status::OK(); + } + + if (value->isArray()) { + // English comment: ignore top-level arrays when base path is empty, since they are kept + // as root JSONB and do not require NestedGroup to preserve associations. + if (current_path.empty()) { + return Status::OK(); + } + + if (!_is_array_of_objects(value)) { + return Status::OK(); + } + + // Get or create top-level group keyed by full array path. + std::shared_ptr<NestedGroup>& gptr = nested_groups[current_path]; + if (!gptr) { + gptr = std::make_shared<NestedGroup>(); + gptr->path = current_path; + } + + if (_handle_conflict(*gptr, /*is_array_object=*/true)) { + return Status::OK(); + } + + return _process_array_of_objects(value, *gptr, row_idx, depth + 1); + } + + return Status::OK(); +} + +bool NestedGroupBuilder::_is_array_of_objects(const doris::JsonbValue* arr_value) const { + if (!arr_value || !arr_value->isArray()) { + return false; + } + const auto* arr = arr_value->unpack<doris::ArrayVal>(); + const int n = arr->numElem(); + for (int i = 0; i < n; ++i) { + const auto* elem = arr->get(i); + if (!elem || elem->isNull()) { + continue; + } + if (!elem->isObject()) { + return false; + } + } + return true; +} + +Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* arr_value, + NestedGroup& group, size_t /*parent_row_idx*/, + size_t depth) { + DCHECK(arr_value && arr_value->isArray()); + group.ensure_offsets(); + auto* offsets_col = + vectorized::assert_cast<vectorized::ColumnOffset64*>(group.offsets.get()); + + const auto* arr = arr_value->unpack<doris::ArrayVal>(); + const int n = arr->numElem(); + + const size_t prev_total = group.current_flat_size; + const size_t new_total = prev_total + static_cast<size_t>(std::max(0, n)); + offsets_col->get_data().push_back(static_cast<uint64_t>(new_total)); + group.current_flat_size = new_total; + + // Process each element (flat index in [prev_total, new_total)). + size_t flat_idx = prev_total; + for (int i = 0; i < n; ++i, ++flat_idx) { + const auto* elem = arr->get(i); + + std::unordered_set<std::string_view> seen_child; + std::unordered_set<std::string_view> seen_nested; + + if (elem && !elem->isNull()) { + if (!elem->isObject()) { + // English comment: array<object> validation already checked, skip defensively. + } else { + RETURN_IF_ERROR(_process_object_as_paths(elem, vectorized::PathInData {}, group, + flat_idx, seen_child, seen_nested, + depth + 1)); + } + } + + // Fill defaults for missing scalar children. + for (auto& [p, sub] : group.children) { + if (!seen_child.contains(p.get_path())) { + sub.insert_default(); + } + } + // Fill empty offsets for missing nested groups. + for (auto& [p, ng] : group.nested_groups) { + if (!seen_nested.contains(p.get_path())) { + ng->ensure_offsets(); + auto* off = + vectorized::assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get()); + off->get_data().push_back(static_cast<uint64_t>(ng->current_flat_size)); + } + } + } + + return Status::OK(); +} + +Status NestedGroupBuilder::_process_object_as_paths( + const doris::JsonbValue* obj_value, const vectorized::PathInData& current_prefix, + NestedGroup& group, size_t element_flat_idx, + std::unordered_set<std::string_view>& seen_child_paths, + std::unordered_set<std::string_view>& seen_nested_paths, size_t depth) { + DCHECK(obj_value && obj_value->isObject()); + if (_max_depth > 0 && depth > _max_depth) { + return Status::OK(); + } + + const auto* obj = obj_value->unpack<doris::ObjectVal>(); + for (auto it = obj->begin(); it != obj->end(); ++it) { + std::string key(it->getKeyStr(), it->klen()); + vectorized::PathInData next_prefix = + current_prefix.empty() ? vectorized::PathInData(key) + : vectorized::PathInData(current_prefix.get_path() + "." + + key); + const auto* v = it->value(); + if (!v) { + continue; + } + + if (v->isObject()) { + // English comment: flatten object fields into dotted paths. + RETURN_IF_ERROR(_process_object_as_paths(v, next_prefix, group, element_flat_idx, + seen_child_paths, seen_nested_paths, + depth + 1)); + continue; + } + + if (v->isArray() && _is_array_of_objects(v)) { + // Nested array<object> inside this group. + std::shared_ptr<NestedGroup>& ng = group.nested_groups[next_prefix]; + if (!ng) { + ng = std::make_shared<NestedGroup>(); + ng->path = next_prefix; + } + + if (_handle_conflict(*ng, /*is_array_object=*/true)) { + continue; + } + + // Ensure offsets size up to current parent element. + ng->ensure_offsets(); + auto* off = vectorized::assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get()); + if (off->size() < element_flat_idx) { + // English comment: fill missing parent elements with empty arrays. + const size_t gap = element_flat_idx - off->size(); + for (size_t i = 0; i < gap; ++i) { + off->get_data().push_back(static_cast<uint64_t>(ng->current_flat_size)); + } + } + + // Process nested group for this parent element (one offsets entry appended inside). + RETURN_IF_ERROR(_process_array_of_objects(v, *ng, element_flat_idx, depth + 1)); + seen_nested_paths.insert(ng->path.get_path()); + continue; + } + + // Scalar / non-array value becomes a child subcolumn. + vectorized::Field f; + RETURN_IF_ERROR(_jsonb_to_field(v, f)); + + auto& sub = group.children[next_prefix]; + if (sub.size() < element_flat_idx) { + sub.insert_many_defaults(element_flat_idx - sub.size()); + } + try { + sub.insert(f); + } catch (const doris::Exception& e) { + return Status::InternalError("NestedGroupBuilder insert failed at {}: {}", + next_prefix.get_path(), e.to_string()); + } + seen_child_paths.insert(next_prefix.get_path()); + } + + return Status::OK(); +} + +Status NestedGroupBuilder::_jsonb_to_field(const doris::JsonbValue* value, + vectorized::Field& out) const { + if (!value || value->isNull()) { + out = vectorized::Field(); + return Status::OK(); + } + if (value->isTrue()) { + out = vectorized::Field::create_field<TYPE_BOOLEAN>(true); + return Status::OK(); + } + if (value->isFalse()) { + out = vectorized::Field::create_field<TYPE_BOOLEAN>(false); + return Status::OK(); + } + if (value->isInt()) { + out = vectorized::Field::create_field<TYPE_BIGINT>(static_cast<int64_t>(value->int_val())); + return Status::OK(); + } + if (value->isDouble()) { + out = vectorized::Field::create_field<TYPE_DOUBLE>(value->unpack<doris::JsonbDoubleVal>()->val()); + return Status::OK(); + } + if (value->isFloat()) { + out = vectorized::Field::create_field<TYPE_DOUBLE>(static_cast<double>( + value->unpack<doris::JsonbFloatVal>()->val())); + return Status::OK(); + } + if (value->isString()) { + const auto* s = value->unpack<doris::JsonbStringVal>(); + out = vectorized::Field::create_field<TYPE_STRING>( + vectorized::String(s->getBlob(), s->getBlobLen())); + return Status::OK(); + } + if (value->isBinary()) { + // English comment: keep binary as JSONB blob to avoid data loss. + const auto* b = value->unpack<doris::JsonbBinaryVal>(); + out = vectorized::Field::create_field<TYPE_JSONB>( + vectorized::JsonbField(b->getBlob(), b->getBlobLen())); + return Status::OK(); + } + + return Status::InvalidArgument("NestedGroupBuilder cannot convert container type {} to field", + value->typeName()); +} + +bool NestedGroupBuilder::_handle_conflict(NestedGroup& group, bool is_array_object) const { + // English comment: conflict policy will be refined in a later phase. + // For now, if a group is already disabled, skip it. + if (group.is_disabled) { + return true; + } + if (group.expected_type == NestedGroup::StructureType::UNKNOWN) { + group.expected_type = + is_array_object ? NestedGroup::StructureType::ARRAY : NestedGroup::StructureType::SCALAR; + return false; + } + const bool expected_array = (group.expected_type == NestedGroup::StructureType::ARRAY); + if (expected_array != is_array_object) { + // Prefer array<object> (keep nested) by default: discard scalars. + if (!is_array_object) { + return true; + } + group.children.clear(); + group.expected_type = NestedGroup::StructureType::ARRAY; + return false; + } + return false; +} + +} // namespace doris::segment_v2 + diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h new file mode 100644 index 00000000000..cc903bab675 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstddef> +#include <memory> +#include <unordered_map> +#include <unordered_set> + +#include "common/status.h" +#include "vec/columns/column.h" +#include "vec/columns/column_variant.h" +#include "vec/json/path_in_data.h" + +namespace doris { +struct JsonbValue; +} // namespace doris + +namespace doris::segment_v2 { + +/** + * English comment: NestedGroup is a storage-layer structure used to persist array<object> + * with shared offsets to preserve per-element field associations. + * + * This is intentionally independent from ColumnVariant's in-memory nested structures. + */ +struct NestedGroup { + // Full array path for top-level group (e.g. "voltage.list"), + // and relative path for nested groups within another NestedGroup (e.g. "cells"). + vectorized::PathInData path; + + // Offsets per parent row (or per parent element for nested groups). + vectorized::MutableColumnPtr offsets; + + // Scalar (or flattened object) children under this array path. + std::unordered_map<vectorized::PathInData, vectorized::ColumnVariant::Subcolumn, + vectorized::PathInData::Hash> + children; + + // Nested array<object> groups under this array path. + std::unordered_map<vectorized::PathInData, std::shared_ptr<NestedGroup>, + vectorized::PathInData::Hash> + nested_groups; + + size_t current_flat_size = 0; + bool is_disabled = false; + + enum struct StructureType { UNKNOWN, SCALAR, ARRAY, OBJECT }; + StructureType expected_type = StructureType::UNKNOWN; + + void ensure_offsets(); +}; + +using NestedGroupsMap = + std::unordered_map<vectorized::PathInData, std::shared_ptr<NestedGroup>, + vectorized::PathInData::Hash>; + +/** + * English comment: Build NestedGroup(s) from JSONB columns at storage finalize stage. + * The builder scans JSONB values and only expands array<object>. + */ +class NestedGroupBuilder { +public: + NestedGroupBuilder() = default; + + // Build NestedGroups from a JSONB column. base_path is the path of this JSONB column + // in ColumnVariant (empty for root JSONB). + Status build_from_jsonb(const vectorized::ColumnPtr& jsonb_column, + const vectorized::PathInData& base_path, NestedGroupsMap& nested_groups, + size_t num_rows); + + // Convenience overload for root JSONB. + Status build_from_jsonb(const vectorized::ColumnPtr& jsonb_column, NestedGroupsMap& nested_groups, + size_t num_rows) { + return build_from_jsonb(jsonb_column, vectorized::PathInData {}, nested_groups, num_rows); + } + + void set_max_depth(size_t max_depth) { _max_depth = max_depth; } + +private: + Status _process_jsonb_value(const doris::JsonbValue* value, + const vectorized::PathInData& current_path, + NestedGroupsMap& nested_groups, size_t row_idx, size_t depth); + + Status _process_object_as_paths(const doris::JsonbValue* obj_value, + const vectorized::PathInData& current_prefix, + NestedGroup& group, size_t element_flat_idx, + std::unordered_set<std::string_view>& seen_child_paths, + std::unordered_set<std::string_view>& seen_nested_paths, + size_t depth); + + Status _process_array_of_objects(const doris::JsonbValue* arr_value, NestedGroup& group, + size_t parent_row_idx, size_t depth); + + // Return true if this array can be treated as array<object> (nulls allowed). + bool _is_array_of_objects(const doris::JsonbValue* arr_value) const; + + // Convert a JsonbValue to a scalar Field (or NULL Field). Container types are not supported. + Status _jsonb_to_field(const doris::JsonbValue* value, vectorized::Field& out) const; + + // Conflict policy placeholder. Returns true if the current value should be discarded. + bool _handle_conflict(NestedGroup& group, bool is_array_object) const; + +private: + size_t _max_depth = 0; // 0 = unlimited +}; + +} // namespace doris::segment_v2 + --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
