This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch feat-nested in repository https://gitbox.apache.org/repos/asf/doris.git
commit e24d8c0de671b95fd39bf5654b57c3d8be33bb65 Author: eldenmoon <[email protected]> AuthorDate: Fri Jan 9 19:16:38 2026 +0800 Write NestedGroup from JSONB during variant finalize. --- be/src/olap/rowset/segment_v2/column_writer.h | 13 + .../segment_v2/variant/nested_group_builder.cpp | 12 +- .../segment_v2/variant/nested_group_builder.h | 5 +- .../variant/variant_column_writer_impl.cpp | 394 ++++++++++++++++++++- .../variant/variant_column_writer_impl.h | 14 +- 5 files changed, 418 insertions(+), 20 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 9e39ef45bb4..a43cae436cc 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -25,6 +25,7 @@ #include <memory> // for unique_ptr #include <ostream> #include <string> +#include <unordered_map> #include <utility> #include <vector> @@ -34,6 +35,7 @@ #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/inverted_index_writer.h" +#include "olap/rowset/segment_v2/variant/variant_statistics.h" #include "util/bitmap.h" // for BitmapChange #include "util/slice.h" // for OwnedSlice @@ -99,6 +101,14 @@ class BloomFilterIndexWriter; class ZoneMapIndexWriter; class VariantColumnWriterImpl; +// English comment: NestedGroup writers for array<object> paths (offsets + child writers). +struct NestedGroupWriter { + std::unique_ptr<ColumnWriter> offsets_writer; + std::unordered_map<std::string, std::unique_ptr<ColumnWriter>> child_writers; + ColumnWriterOptions offsets_opts; + std::unordered_map<std::string, ColumnWriterOptions> child_opts; +}; + class ColumnWriter { public: static Status create(const ColumnWriterOptions& opts, const TabletColumn* column, @@ -636,6 +646,9 @@ private: ColumnWriterOptions _opts; std::unique_ptr<ColumnWriter> _writer; TabletIndexes _indexes; + + std::unordered_map<std::string, NestedGroupWriter> _nested_group_writers; + VariantStatistics _statistics; }; class VariantColumnWriter : public ColumnWriter { diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp index c20d6524f1c..469f360bd8d 100644 --- a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp +++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp @@ -172,8 +172,8 @@ Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* ar for (int i = 0; i < n; ++i, ++flat_idx) { const auto* elem = arr->get(i); - std::unordered_set<std::string_view> seen_child; - std::unordered_set<std::string_view> seen_nested; + std::unordered_set<std::string> seen_child; + std::unordered_set<std::string> seen_nested; if (elem && !elem->isNull()) { if (!elem->isObject()) { @@ -187,13 +187,13 @@ Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* ar // Fill defaults for missing scalar children. for (auto& [p, sub] : group.children) { - if (!seen_child.contains(p.get_path())) { + if (seen_child.find(p.get_path()) == seen_child.end()) { sub.insert_default(); } } // Fill empty offsets for missing nested groups. for (auto& [p, ng] : group.nested_groups) { - if (!seen_nested.contains(p.get_path())) { + if (seen_nested.find(p.get_path()) == seen_nested.end()) { ng->ensure_offsets(); auto* off = vectorized::assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get()); @@ -208,8 +208,8 @@ Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* ar Status NestedGroupBuilder::_process_object_as_paths( const doris::JsonbValue* obj_value, const vectorized::PathInData& current_prefix, NestedGroup& group, size_t element_flat_idx, - std::unordered_set<std::string_view>& seen_child_paths, - std::unordered_set<std::string_view>& seen_nested_paths, size_t depth) { + std::unordered_set<std::string>& seen_child_paths, + std::unordered_set<std::string>& seen_nested_paths, size_t depth) { DCHECK(obj_value && obj_value->isObject()); if (_max_depth > 0 && depth > _max_depth) { return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h index cc903bab675..157eabe7ca1 100644 --- a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h +++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h @@ -19,6 +19,7 @@ #include <cstddef> #include <memory> +#include <string> #include <unordered_map> #include <unordered_set> @@ -100,8 +101,8 @@ private: Status _process_object_as_paths(const doris::JsonbValue* obj_value, const vectorized::PathInData& current_prefix, NestedGroup& group, size_t element_flat_idx, - std::unordered_set<std::string_view>& seen_child_paths, - std::unordered_set<std::string_view>& seen_nested_paths, + std::unordered_set<std::string>& seen_child_paths, + std::unordered_set<std::string>& seen_nested_paths, size_t depth); Status _process_array_of_objects(const doris::JsonbValue* arr_value, NestedGroup& group, diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp index fe93152bd53..311d4ffb5e0 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp @@ -19,6 +19,7 @@ #include <fmt/core.h> #include <gen_cpp/segment_v2.pb.h> +#include <algorithm> #include <memory> #include <set> @@ -32,6 +33,7 @@ #include "olap/rowset/rowset_writer_context.h" #include "olap/rowset/segment_v2/column_writer.h" #include "olap/rowset/segment_v2/indexed_column_writer.h" +#include "olap/rowset/segment_v2/variant/nested_group_builder.h" #include "olap/segment_loader.h" #include "olap/tablet_schema.h" #include "olap/types.h" @@ -50,6 +52,14 @@ namespace doris::segment_v2 { #include "common/compile_check_begin.h" +// English comment: forward declaration for NestedGroup write helper used by both +// VariantColumnWriterImpl and VariantSubcolumnWriter. +static Status write_nested_groups_to_segment_from_jsonb( + const doris::segment_v2::NestedGroupsMap& nested_groups, const TabletColumn* tablet_column, + const ColumnWriterOptions& opts, vectorized::OlapBlockDataConvertor* converter, size_t num_rows, + int& column_id, std::unordered_map<std::string, NestedGroupWriter>& writers, + VariantStatistics& statistics); + void _init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column, CompressionTypePB compression_type) { meta->Clear(); @@ -475,6 +485,9 @@ Status VariantColumnWriterImpl::_process_subcolumns(vectorized::ColumnVariant* p // already handled continue; } + if (_skip_subcolumn_paths.find(entry->path.get_path()) != _skip_subcolumn_paths.end()) { + continue; + } CHECK(entry->data.is_finalized()); // create subcolumn writer if under limit; otherwise externalize ColumnMetaPB via IndexedColumn @@ -581,6 +594,34 @@ Status VariantColumnWriterImpl::finalize() { ptr->check_consistency(); #endif + // Build NestedGroups from JSONB columns before writing subcolumns, so we can skip + // JSONB subcolumns that are expanded into NestedGroup. + doris::segment_v2::NestedGroupsMap nested_groups; + doris::segment_v2::NestedGroupBuilder ng_builder; + ng_builder.set_max_depth(static_cast<size_t>(config::variant_nested_group_max_depth)); + _skip_subcolumn_paths.clear(); + + if (ptr->get_root_type() && + vectorized::remove_nullable(ptr->get_root_type())->get_type_id() == TypeIndex::JSONB && + ptr->get_root() != nullptr) { + RETURN_IF_ERROR(ng_builder.build_from_jsonb(ptr->get_root()->get_ptr(), nested_groups, + ptr->rows())); + } + for (const auto& entry : vectorized::schema_util::get_sorted_subcolumns(ptr->get_subcolumns())) { + if (entry->path.empty()) { + continue; + } + const auto& t = entry->data.get_least_common_type(); + if (!t || vectorized::remove_nullable(t)->get_type_id() != TypeIndex::JSONB) { + continue; + } + RETURN_IF_ERROR(ng_builder.build_from_jsonb(entry->data.get_finalized_column_ptr()->get_ptr(), + entry->path, nested_groups, entry->data.size())); + if (nested_groups.find(entry->path) != nested_groups.end()) { + _skip_subcolumn_paths.insert(entry->path.get_path()); + } + } + size_t num_rows = _column->size(); int column_id = 0; @@ -600,9 +641,11 @@ Status VariantColumnWriterImpl::finalize() { RETURN_IF_ERROR( _process_sparse_column(ptr, olap_data_convertor.get(), num_rows, column_id)); - // NestedGroup processing: expansion from JSONB is handled by NestedGroupBuilder - // This will be implemented in a subsequent phase - // TODO: Call _build_nested_groups_from_jsonb() and _write_nested_groups() here + // Write NestedGroups to segment and persist stats to root meta. + RETURN_IF_ERROR(write_nested_groups_to_segment_from_jsonb( + nested_groups, _tablet_column, _opts, olap_data_convertor.get(), num_rows, column_id, + _nested_group_writers, _statistics)); + _statistics.to_pb(_opts.meta->mutable_variant_statistics()); } _is_finalized = true; @@ -636,6 +679,16 @@ uint64_t VariantColumnWriterImpl::estimate_buffer_size() { size += column_writer->estimate_buffer_size(); } size += _sparse_writer.estimate_buffer_size(); + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer) { + size += ngw.offsets_writer->estimate_buffer_size(); + } + for (auto& [__, cw] : ngw.child_writers) { + if (cw) { + size += cw->estimate_buffer_size(); + } + } + } return size; } @@ -647,6 +700,16 @@ Status VariantColumnWriterImpl::finish() { for (auto& column_writer : _subcolumn_writers) { RETURN_IF_ERROR(column_writer->finish()); } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer) { + RETURN_IF_ERROR(ngw.offsets_writer->finish()); + } + for (auto& [__, cw] : ngw.child_writers) { + if (cw) { + RETURN_IF_ERROR(cw->finish()); + } + } + } RETURN_IF_ERROR(_sparse_writer.finish()); return Status::OK(); } @@ -658,6 +721,16 @@ Status VariantColumnWriterImpl::write_data() { for (auto& column_writer : _subcolumn_writers) { RETURN_IF_ERROR(column_writer->write_data()); } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer) { + RETURN_IF_ERROR(ngw.offsets_writer->write_data()); + } + for (auto& [__, cw] : ngw.child_writers) { + if (cw) { + RETURN_IF_ERROR(cw->write_data()); + } + } + } RETURN_IF_ERROR(_sparse_writer.write_data()); return Status::OK(); } @@ -668,6 +741,16 @@ Status VariantColumnWriterImpl::write_ordinal_index() { for (auto& column_writer : _subcolumn_writers) { RETURN_IF_ERROR(column_writer->write_ordinal_index()); } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer) { + RETURN_IF_ERROR(ngw.offsets_writer->write_ordinal_index()); + } + for (auto& [__, cw] : ngw.child_writers) { + if (cw) { + RETURN_IF_ERROR(cw->write_ordinal_index()); + } + } + } RETURN_IF_ERROR(_sparse_writer.write_ordinal_index()); return Status::OK(); } @@ -679,6 +762,20 @@ Status VariantColumnWriterImpl::write_zone_map() { RETURN_IF_ERROR(_subcolumn_writers[i]->write_zone_map()); } } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer && ngw.offsets_opts.need_zone_map) { + RETURN_IF_ERROR(ngw.offsets_writer->write_zone_map()); + } + for (auto& [child_path, cw] : ngw.child_writers) { + if (!cw) { + continue; + } + auto it = ngw.child_opts.find(child_path); + if (it != ngw.child_opts.end() && it->second.need_zone_map) { + RETURN_IF_ERROR(cw->write_zone_map()); + } + } + } return Status::OK(); } @@ -689,6 +786,20 @@ Status VariantColumnWriterImpl::write_bitmap_index() { RETURN_IF_ERROR(_subcolumn_writers[i]->write_bitmap_index()); } } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer && ngw.offsets_opts.need_bitmap_index) { + RETURN_IF_ERROR(ngw.offsets_writer->write_bitmap_index()); + } + for (auto& [child_path, cw] : ngw.child_writers) { + if (!cw) { + continue; + } + auto it = ngw.child_opts.find(child_path); + if (it != ngw.child_opts.end() && it->second.need_bitmap_index) { + RETURN_IF_ERROR(cw->write_bitmap_index()); + } + } + } return Status::OK(); } Status VariantColumnWriterImpl::write_inverted_index() { @@ -698,6 +809,20 @@ Status VariantColumnWriterImpl::write_inverted_index() { RETURN_IF_ERROR(_subcolumn_writers[i]->write_inverted_index()); } } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer && ngw.offsets_opts.need_inverted_index) { + RETURN_IF_ERROR(ngw.offsets_writer->write_inverted_index()); + } + for (auto& [child_path, cw] : ngw.child_writers) { + if (!cw) { + continue; + } + auto it = ngw.child_opts.find(child_path); + if (it != ngw.child_opts.end() && it->second.need_inverted_index) { + RETURN_IF_ERROR(cw->write_inverted_index()); + } + } + } return Status::OK(); } Status VariantColumnWriterImpl::write_bloom_filter_index() { @@ -707,6 +832,20 @@ Status VariantColumnWriterImpl::write_bloom_filter_index() { RETURN_IF_ERROR(_subcolumn_writers[i]->write_bloom_filter_index()); } } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer && ngw.offsets_opts.need_bloom_filter) { + RETURN_IF_ERROR(ngw.offsets_writer->write_bloom_filter_index()); + } + for (auto& [child_path, cw] : ngw.child_writers) { + if (!cw) { + continue; + } + auto it = ngw.child_opts.find(child_path); + if (it != ngw.child_opts.end() && it->second.need_bloom_filter) { + RETURN_IF_ERROR(cw->write_bloom_filter_index()); + } + } + } return Status::OK(); } @@ -743,7 +882,24 @@ Status VariantSubcolumnWriter::append_data(const uint8_t** ptr, size_t num_rows) } uint64_t VariantSubcolumnWriter::estimate_buffer_size() { - return _column->byte_size(); + if (!is_finalized()) { + return _column->byte_size(); + } + uint64_t size = 0; + if (_writer) { + size += _writer->estimate_buffer_size(); + } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer) { + size += ngw.offsets_writer->estimate_buffer_size(); + } + for (auto& [__, cw] : ngw.child_writers) { + if (cw) { + size += cw->estimate_buffer_size(); + } + } + } + return size; } bool VariantSubcolumnWriter::is_finalized() const { @@ -794,6 +950,24 @@ Status VariantSubcolumnWriter::finalize() { RETURN_IF_ERROR(convert_and_write_column(olap_data_convertor.get(), flush_column, ptr->get_root_type(), _writer.get(), ptr->get_root()->get_ptr(), ptr->rows(), column_id)); + _opts.meta->set_num_rows(ptr->rows()); + ++column_id; + + // English comment: also expand array<object> JSONB into NestedGroup for compaction sub-variant writer. + doris::segment_v2::NestedGroupsMap nested_groups; + doris::segment_v2::NestedGroupBuilder ng_builder; + ng_builder.set_max_depth(static_cast<size_t>(config::variant_nested_group_max_depth)); + if (ptr->get_root_type() && + vectorized::remove_nullable(ptr->get_root_type())->get_type_id() == TypeIndex::JSONB && + ptr->get_root() != nullptr) { + RETURN_IF_ERROR(ng_builder.build_from_jsonb(ptr->get_root()->get_ptr(), nested_groups, + ptr->rows())); + } + RETURN_IF_ERROR(write_nested_groups_to_segment_from_jsonb( + nested_groups, &flush_column, _opts, olap_data_convertor.get(), ptr->rows(), column_id, + /*writers=*/_nested_group_writers, /*statistics=*/_statistics)); + _statistics.to_pb(_opts.meta->mutable_variant_statistics()); + _is_finalized = true; return Status::OK(); } @@ -803,6 +977,16 @@ Status VariantSubcolumnWriter::finish() { RETURN_IF_ERROR(finalize()); } RETURN_IF_ERROR(_writer->finish()); + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer) { + RETURN_IF_ERROR(ngw.offsets_writer->finish()); + } + for (auto& [__, cw] : ngw.child_writers) { + if (cw) { + RETURN_IF_ERROR(cw->finish()); + } + } + } return Status::OK(); } Status VariantSubcolumnWriter::write_data() { @@ -810,11 +994,31 @@ Status VariantSubcolumnWriter::write_data() { RETURN_IF_ERROR(finalize()); } RETURN_IF_ERROR(_writer->write_data()); + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer) { + RETURN_IF_ERROR(ngw.offsets_writer->write_data()); + } + for (auto& [__, cw] : ngw.child_writers) { + if (cw) { + RETURN_IF_ERROR(cw->write_data()); + } + } + } return Status::OK(); } Status VariantSubcolumnWriter::write_ordinal_index() { assert(is_finalized()); RETURN_IF_ERROR(_writer->write_ordinal_index()); + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer) { + RETURN_IF_ERROR(ngw.offsets_writer->write_ordinal_index()); + } + for (auto& [__, cw] : ngw.child_writers) { + if (cw) { + RETURN_IF_ERROR(cw->write_ordinal_index()); + } + } + } return Status::OK(); } @@ -823,6 +1027,20 @@ Status VariantSubcolumnWriter::write_zone_map() { if (_opts.need_zone_map) { RETURN_IF_ERROR(_writer->write_zone_map()); } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer && ngw.offsets_opts.need_zone_map) { + RETURN_IF_ERROR(ngw.offsets_writer->write_zone_map()); + } + for (auto& [child_path, cw] : ngw.child_writers) { + if (!cw) { + continue; + } + auto it = ngw.child_opts.find(child_path); + if (it != ngw.child_opts.end() && it->second.need_zone_map) { + RETURN_IF_ERROR(cw->write_zone_map()); + } + } + } return Status::OK(); } @@ -834,6 +1052,20 @@ Status VariantSubcolumnWriter::write_inverted_index() { if (_opts.need_inverted_index) { RETURN_IF_ERROR(_writer->write_inverted_index()); } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer && ngw.offsets_opts.need_inverted_index) { + RETURN_IF_ERROR(ngw.offsets_writer->write_inverted_index()); + } + for (auto& [child_path, cw] : ngw.child_writers) { + if (!cw) { + continue; + } + auto it = ngw.child_opts.find(child_path); + if (it != ngw.child_opts.end() && it->second.need_inverted_index) { + RETURN_IF_ERROR(cw->write_inverted_index()); + } + } + } return Status::OK(); } Status VariantSubcolumnWriter::write_bloom_filter_index() { @@ -841,6 +1073,20 @@ Status VariantSubcolumnWriter::write_bloom_filter_index() { if (_opts.need_bloom_filter) { RETURN_IF_ERROR(_writer->write_bloom_filter_index()); } + for (auto& [_, ngw] : _nested_group_writers) { + if (ngw.offsets_writer && ngw.offsets_opts.need_bloom_filter) { + RETURN_IF_ERROR(ngw.offsets_writer->write_bloom_filter_index()); + } + for (auto& [child_path, cw] : ngw.child_writers) { + if (!cw) { + continue; + } + auto it = ngw.child_opts.find(child_path); + if (it != ngw.child_opts.end() && it->second.need_bloom_filter) { + RETURN_IF_ERROR(cw->write_bloom_filter_index()); + } + } + } return Status::OK(); } @@ -851,6 +1097,146 @@ Status VariantSubcolumnWriter::append_nullable(const uint8_t* null_map, const ui return Status::OK(); } +// English comment: recursively write NestedGroup built from JSONB at finalize stage. +static Status write_nested_group_recursive_from_jsonb( + const doris::segment_v2::NestedGroup* group, const std::string& path_prefix, + const TabletColumn* tablet_column, const ColumnWriterOptions& base_opts, + vectorized::OlapBlockDataConvertor* converter, size_t parent_num_rows, int& column_id, + std::unordered_map<std::string, NestedGroupWriter>& writers, VariantStatistics& statistics, + size_t depth) { + if (!group || group->is_disabled) { + return Status::OK(); + } + + std::string full_path = path_prefix.empty() ? group->path.get_path() + : path_prefix + ".__ng." + group->path.get_path(); + auto& group_writer = writers[full_path]; + + // 1. Create and write offsets column + std::string offsets_col_name = + tablet_column->name_lower_case() + ".__ng." + full_path + ".__offsets"; + + TabletColumn offsets_column; + offsets_column.set_name(offsets_col_name); + offsets_column.set_type(FieldType::OLAP_FIELD_TYPE_BIGINT); + offsets_column.set_is_nullable(false); + offsets_column.set_length(sizeof(uint64_t)); + offsets_column.set_index_length(sizeof(uint64_t)); + + group_writer.offsets_opts = base_opts; + group_writer.offsets_opts.meta = base_opts.footer->add_columns(); + _init_column_meta(group_writer.offsets_opts.meta, column_id, offsets_column, + base_opts.compression_type); + + auto* path_info = group_writer.offsets_opts.meta->mutable_column_path_info(); + path_info->set_is_nested_group_offsets(true); + path_info->set_nested_group_parent_path(full_path); + path_info->set_path(offsets_col_name); + path_info->set_nested_group_depth(static_cast<uint32_t>(depth)); + + RETURN_IF_ERROR(ColumnWriter::create(group_writer.offsets_opts, &offsets_column, + base_opts.file_writer, &group_writer.offsets_writer)); + RETURN_IF_ERROR(group_writer.offsets_writer->init()); + + vectorized::ColumnPtr offsets_col = + static_cast<const vectorized::IColumn&>(*group->offsets).get_ptr(); + size_t offsets_num_rows = offsets_col->size(); + converter->add_column_data_convertor(offsets_column); + RETURN_IF_ERROR(converter->set_source_content_with_specifid_column( + {offsets_col, nullptr, ""}, 0, offsets_num_rows, column_id)); + auto [status, converted] = converter->convert_column_data(column_id); + RETURN_IF_ERROR(status); + RETURN_IF_ERROR(group_writer.offsets_writer->append(converted->get_nullmap(), + converted->get_data(), offsets_num_rows)); + converter->clear_source_content(column_id); + group_writer.offsets_opts.meta->set_num_rows(offsets_num_rows); + ++column_id; + + // 2. Write child columns + for (const auto& [relative_path, subcolumn] : group->children) { + std::string child_col_name = tablet_column->name_lower_case() + ".__ng." + full_path + + "." + relative_path.get_path(); + + const auto& child_type = subcolumn.get_least_common_type(); + TabletColumn child_column = vectorized::schema_util::get_column_by_type( + child_type, child_col_name, + vectorized::schema_util::ExtraInfo {.unique_id = -1, + .parent_unique_id = tablet_column->unique_id(), + .path_info = vectorized::PathInData(child_col_name)}); + + ColumnWriterOptions child_opts = base_opts; + child_opts.meta = base_opts.footer->add_columns(); + _init_column_meta(child_opts.meta, column_id, child_column, base_opts.compression_type); + + auto* child_path_info = child_opts.meta->mutable_column_path_info(); + child_path_info->set_nested_group_parent_path(full_path); + child_path_info->set_path(child_col_name); + child_path_info->set_nested_group_depth(static_cast<uint32_t>(depth)); + + std::unique_ptr<ColumnWriter> child_writer; + RETURN_IF_ERROR(ColumnWriter::create(child_opts, &child_column, base_opts.file_writer, + &child_writer)); + RETURN_IF_ERROR(child_writer->init()); + + auto child_col = subcolumn.get_finalized_column_ptr(); + size_t child_num_rows = child_col->size(); + + converter->add_column_data_convertor(child_column); + RETURN_IF_ERROR(converter->set_source_content_with_specifid_column( + {child_col, nullptr, ""}, 0, child_num_rows, column_id)); + auto [child_status, child_converted] = converter->convert_column_data(column_id); + RETURN_IF_ERROR(child_status); + RETURN_IF_ERROR(child_writer->append(child_converted->get_nullmap(), + child_converted->get_data(), child_num_rows)); + converter->clear_source_content(column_id); + child_opts.meta->set_num_rows(child_num_rows); + + group_writer.child_writers[relative_path.get_path()] = std::move(child_writer); + group_writer.child_opts[relative_path.get_path()] = child_opts; + ++column_id; + } + + // 3. Recursively write nested groups within this group + for (const auto& [nested_path, nested_group] : group->nested_groups) { + RETURN_IF_ERROR(write_nested_group_recursive_from_jsonb( + nested_group.get(), full_path, tablet_column, base_opts, converter, + group->current_flat_size, column_id, writers, statistics, depth + 1)); + } + + NestedGroupInfoPB info; + info.set_element_count(static_cast<uint32_t>(group->current_flat_size)); + info.set_child_count(static_cast<uint32_t>(group->children.size() + group->nested_groups.size())); + info.set_has_conflict(group->is_disabled); + statistics.nested_group_info[full_path] = info; + + return Status::OK(); +} + +static Status write_nested_groups_to_segment_from_jsonb( + const doris::segment_v2::NestedGroupsMap& nested_groups, const TabletColumn* tablet_column, + const ColumnWriterOptions& opts, vectorized::OlapBlockDataConvertor* converter, size_t num_rows, + int& column_id, std::unordered_map<std::string, NestedGroupWriter>& writers, + VariantStatistics& statistics) { + if (nested_groups.empty()) { + return Status::OK(); + } + + std::vector<std::shared_ptr<doris::segment_v2::NestedGroup>> groups; + groups.reserve(nested_groups.size()); + for (const auto& [_, g] : nested_groups) { + if (g) { + groups.push_back(g); + } + } + std::sort(groups.begin(), groups.end(), + [](const auto& a, const auto& b) { return a->path.get_path() < b->path.get_path(); }); + for (const auto& g : groups) { + RETURN_IF_ERROR(write_nested_group_recursive_from_jsonb( + g.get(), "", tablet_column, opts, converter, num_rows, column_id, writers, statistics, 1)); + } + return Status::OK(); +} + // Helper function to recursively write a NestedGroup static Status write_nested_group_recursive( const vectorized::ColumnVariant::NestedGroup* group, diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.h b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.h index e13f5003446..90e302ee0b7 100644 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.h +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.h @@ -19,6 +19,8 @@ #include <gen_cpp/segment_v2.pb.h> +#include <unordered_set> + #include "common/status.h" #include "olap/rowset/segment_v2/column_writer.h" #include "olap/rowset/segment_v2/indexed_column_writer.h" @@ -108,14 +110,6 @@ private: int _first_column_id = -1; }; -// NestedGroup writers for array<object> paths (offsets + child writers). -struct NestedGroupWriter { - std::unique_ptr<ColumnWriter> offsets_writer; - std::unordered_map<std::string, std::unique_ptr<ColumnWriter>> child_writers; - ColumnWriterOptions offsets_opts; - std::unordered_map<std::string, ColumnWriterOptions> child_opts; -}; - class VariantColumnWriterImpl { public: VariantColumnWriterImpl(const ColumnWriterOptions& opts, const TabletColumn* column); @@ -172,6 +166,10 @@ private: // hold the references of subcolumns info std::unordered_map<std::string, TabletSchema::SubColumnInfo> _subcolumns_info; std::unordered_map<std::string, NestedGroupWriter> _nested_group_writers; + + // English comment: JSONB subcolumns that are expanded into NestedGroup should not be written + // as physical subcolumns in the segment. + std::unordered_set<std::string> _skip_subcolumn_paths; }; void _init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column, --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
