This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch feat-nested in repository https://gitbox.apache.org/repos/asf/doris.git
commit 5a323be8969ae6383335a1daed604ac80a467811 Author: eldenmoon <[email protected]> AuthorDate: Sun Jan 11 15:07:31 2026 +0800 fix ut --- be/src/olap/rowset/segment_creator.cpp | 2 +- be/src/olap/rowset/segment_v2/column_writer.h | 3 +- .../segment_v2/variant/nested_group_builder.cpp | 102 +++++++++++++++------ .../segment_v2/variant/nested_group_builder.h | 3 + .../segment_v2/variant/variant_column_reader.cpp | 89 ++++++++++++------ .../segment_v2/variant/variant_column_reader.h | 0 .../variant/variant_column_writer_impl.cpp | 45 +++++---- be/src/vec/columns/column_variant.cpp | 2 +- be/src/vec/json/json_parser.cpp | 10 +- be/src/vec/json/parse2column.cpp | 2 +- be/src/vec/json/variant_nested_builder.h | 2 +- be/src/vec/olap/olap_data_convertor.cpp | 4 + .../variant_column_writer_reader_test.cpp | 93 ++----------------- 13 files changed, 189 insertions(+), 168 deletions(-) diff --git a/be/src/olap/rowset/segment_creator.cpp b/be/src/olap/rowset/segment_creator.cpp old mode 100644 new mode 100755 index 13046c71d02..b8d587c4d24 --- a/be/src/olap/rowset/segment_creator.cpp +++ b/be/src/olap/rowset/segment_creator.cpp @@ -105,7 +105,7 @@ Status SegmentFlusher::_internal_parse_variant_columns(vectorized::Block& block) } vectorized::ParseConfig parse_config; - // English comment: enable_flatten_nested controls whether to flatten nested array<object> paths + // enable_flatten_nested controls whether to flatten nested array<object> paths // NestedGroup expansion is handled at storage layer, not at parse time parse_config.enable_flatten_nested = _context.tablet_schema->variant_flatten_nested(); RETURN_IF_ERROR( diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h old mode 100644 new mode 100755 index a43cae436cc..9ed4242a33f --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -100,8 +100,9 @@ class PageBuilder; class BloomFilterIndexWriter; class ZoneMapIndexWriter; class VariantColumnWriterImpl; +class ColumnWriter; -// English comment: NestedGroup writers for array<object> paths (offsets + child writers). +// NestedGroup writers for array<object> paths (offsets + child writers). struct NestedGroupWriter { std::unique_ptr<ColumnWriter> offsets_writer; std::unordered_map<std::string, std::unique_ptr<ColumnWriter>> child_writers; diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp old mode 100644 new mode 100755 index 309431311d9..307443fa112 --- a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp +++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp @@ -56,25 +56,53 @@ Status NestedGroupBuilder::build_from_jsonb(const vectorized::ColumnPtr& jsonb_c data_col->get_name()); } + // helper lambda to pad all existing NestedGroups so they have offsets + // for rows [0, target_row]. This ensures each row has a corresponding offset entry. + auto pad_all_groups_to_row = [&nested_groups](size_t target_row) { + for (auto& [path, group] : nested_groups) { + if (!group || group->is_disabled) { + continue; + } + group->ensure_offsets(); + auto* offsets_col = + assert_cast<vectorized::ColumnOffset64*>(group->offsets.get()); + // if offsets.size() <= target_row, this group is missing entries + // for some rows. Pad with the same offset (indicating empty arrays). + while (offsets_col->size() <= target_row) { + offsets_col->get_data().push_back(static_cast<uint64_t>(group->current_flat_size)); + } + } + }; + const size_t rows = std::min(num_rows, str_col->size()); for (size_t r = 0; r < rows; ++r) { if (null_map && (*null_map).get_data()[r]) { + // for null rows, pad all existing groups with empty arrays + pad_all_groups_to_row(r); continue; } const auto val = str_col->get_data_at(r); if (val.size == 0) { + // empty JSONB, pad all existing groups with empty arrays + pad_all_groups_to_row(r); continue; } const doris::JsonbValue* root = doris::JsonbDocument::createValue(val.data, val.size); if (!root) { + pad_all_groups_to_row(r); continue; } - // English comment: base_path is the JSON path of this JSONB column in ColumnVariant. + // base_path is the JSON path of this JSONB column in ColumnVariant. // For root JSONB, base_path is empty and we only traverse into objects to discover // nested arrays under named fields. RETURN_IF_ERROR(_process_jsonb_value(root, base_path, nested_groups, r, 0)); + + // after processing this row, pad any groups that weren't updated. + // If a row doesn't contain a field that corresponds to a NestedGroup, we need to + // add an empty array entry for that row. + pad_all_groups_to_row(r); } return Status::OK(); @@ -88,7 +116,10 @@ Status NestedGroupBuilder::_process_jsonb_value(const doris::JsonbValue* value, return Status::OK(); } if (_max_depth > 0 && depth > _max_depth) { - return Status::OK(); + return Status::InvalidArgument( + "NestedGroupBuilder: nested depth {} exceeds max_depth {} at path '{}'. " + "Consider increasing 'variant_nested_group_max_depth' configuration.", + depth, _max_depth, current_path.get_path()); } if (value->isObject()) { @@ -106,21 +137,21 @@ Status NestedGroupBuilder::_process_jsonb_value(const doris::JsonbValue* value, } if (value->isArray()) { - // English comment: ignore top-level arrays when base path is empty, since they are kept - // as root JSONB and do not require NestedGroup to preserve associations. - if (current_path.empty()) { - return Status::OK(); - } - if (!_is_array_of_objects(value)) { return Status::OK(); } - // Get or create top-level group keyed by full array path. - std::shared_ptr<NestedGroup>& gptr = nested_groups[current_path]; + // For top-level arrays (current_path is empty), use special "$root" path marker. + // For nested arrays, use the actual current_path. + vectorized::PathInData array_path = + current_path.empty() ? vectorized::PathInData(std::string(kRootNestedGroupPath)) + : current_path; + + // Get or create group keyed by array path. + std::shared_ptr<NestedGroup>& gptr = nested_groups[array_path]; if (!gptr) { gptr = std::make_shared<NestedGroup>(); - gptr->path = current_path; + gptr->path = array_path; } if (_handle_conflict(*gptr, /*is_array_object=*/true)) { @@ -152,12 +183,19 @@ bool NestedGroupBuilder::_is_array_of_objects(const doris::JsonbValue* arr_value } Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* arr_value, - NestedGroup& group, size_t /*parent_row_idx*/, + NestedGroup& group, size_t parent_row_idx, size_t depth) { DCHECK(arr_value && arr_value->isArray()); group.ensure_offsets(); auto* offsets_col = assert_cast<vectorized::ColumnOffset64*>(group.offsets.get()); + // Back-fill missing rows with empty arrays before processing current row. + // This handles the case when a NestedGroup is created mid-batch (e.g., when + // mixing top-level arrays and objects), ensuring earlier rows have proper offsets. + while (offsets_col->size() < parent_row_idx) { + offsets_col->get_data().push_back(static_cast<uint64_t>(group.current_flat_size)); + } + const auto* arr = arr_value->unpack<doris::ArrayVal>(); const int n = arr->numElem(); @@ -176,7 +214,7 @@ Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* ar if (elem && !elem->isNull()) { if (!elem->isObject()) { - // English comment: array<object> validation already checked, skip defensively. + // array<object> validation already checked, skip defensively. } else { RETURN_IF_ERROR(_process_object_as_paths(elem, vectorized::PathInData {}, group, flat_idx, seen_child, seen_nested, @@ -211,7 +249,10 @@ Status NestedGroupBuilder::_process_object_as_paths( std::unordered_set<std::string>& seen_nested_paths, size_t depth) { DCHECK(obj_value && obj_value->isObject()); if (_max_depth > 0 && depth > _max_depth) { - return Status::OK(); + return Status::InvalidArgument( + "NestedGroupBuilder: nested depth {} exceeds max_depth {} at path prefix '{}'. " + "Consider increasing 'variant_nested_group_max_depth' configuration.", + depth, _max_depth, current_prefix.get_path()); } const auto* obj = obj_value->unpack<doris::ObjectVal>(); @@ -227,7 +268,7 @@ Status NestedGroupBuilder::_process_object_as_paths( } if (v->isObject()) { - // English comment: flatten object fields into dotted paths. + // flatten object fields into dotted paths. RETURN_IF_ERROR(_process_object_as_paths(v, next_prefix, group, element_flat_idx, seen_child_paths, seen_nested_paths, depth + 1)); @@ -236,7 +277,7 @@ Status NestedGroupBuilder::_process_object_as_paths( if (v->isArray() && _is_array_of_objects(v)) { // Nested array<object> inside this group. - // English comment: array<object> has the highest priority. If the same path was + // array<object> has the highest priority. If the same path was // previously treated as a scalar child, discard it. if (auto it_child = group.children.find(next_prefix); it_child != group.children.end()) { group.children.erase(it_child); @@ -255,7 +296,7 @@ Status NestedGroupBuilder::_process_object_as_paths( ng->ensure_offsets(); auto* off = assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get()); if (off->size() < element_flat_idx) { - // English comment: fill missing parent elements with empty arrays. + // fill missing parent elements with empty arrays. const size_t gap = element_flat_idx - off->size(); for (size_t i = 0; i < gap; ++i) { off->get_data().push_back(static_cast<uint64_t>(ng->current_flat_size)); @@ -269,13 +310,20 @@ Status NestedGroupBuilder::_process_object_as_paths( } // Scalar / non-array value becomes a child subcolumn. - // English comment: if this path is already a nested array<object>, discard scalars. + // if this path is already a nested array<object>, discard scalars. if (group.nested_groups.contains(next_prefix)) { continue; } vectorized::Field f; RETURN_IF_ERROR(_jsonb_to_field(v, f)); + // Ensure subcolumn exists and is nullable (for NestedGroup children, we need nullable + // to support NULL values when a field is missing in some rows) + if (group.children.find(next_prefix) == group.children.end()) { + // Create a new nullable subcolumn for this path + group.children[next_prefix] = vectorized::ColumnVariant::Subcolumn(0, true, false); + } + auto& sub = group.children[next_prefix]; if (sub.size() < element_flat_idx) { sub.insert_many_defaults(element_flat_idx - sub.size()); @@ -299,36 +347,36 @@ Status NestedGroupBuilder::_jsonb_to_field(const doris::JsonbValue* value, return Status::OK(); } if (value->isTrue()) { - out = vectorized::Field::create_field<TYPE_BOOLEAN>(true); + out = vectorized::Field::create_field<PrimitiveType::TYPE_BOOLEAN>(true); return Status::OK(); } if (value->isFalse()) { - out = vectorized::Field::create_field<TYPE_BOOLEAN>(false); + out = vectorized::Field::create_field<PrimitiveType::TYPE_BOOLEAN>(false); return Status::OK(); } if (value->isInt()) { - out = vectorized::Field::create_field<TYPE_BIGINT>(static_cast<int64_t>(value->int_val())); + out = vectorized::Field::create_field<PrimitiveType::TYPE_BIGINT>(static_cast<int64_t>(value->int_val())); return Status::OK(); } if (value->isDouble()) { - out = vectorized::Field::create_field<TYPE_DOUBLE>(value->unpack<doris::JsonbDoubleVal>()->val()); + out = vectorized::Field::create_field<PrimitiveType::TYPE_DOUBLE>(value->unpack<doris::JsonbDoubleVal>()->val()); return Status::OK(); } if (value->isFloat()) { - out = vectorized::Field::create_field<TYPE_DOUBLE>(static_cast<double>( + out = vectorized::Field::create_field<PrimitiveType::TYPE_DOUBLE>(static_cast<double>( value->unpack<doris::JsonbFloatVal>()->val())); return Status::OK(); } if (value->isString()) { const auto* s = value->unpack<doris::JsonbStringVal>(); - out = vectorized::Field::create_field<TYPE_STRING>( + out = vectorized::Field::create_field<PrimitiveType::TYPE_STRING>( vectorized::String(s->getBlob(), s->getBlobLen())); return Status::OK(); } if (value->isBinary()) { - // English comment: keep binary as JSONB blob to avoid data loss. + // keep binary as JSONB blob to avoid data loss. const auto* b = value->unpack<doris::JsonbBinaryVal>(); - out = vectorized::Field::create_field<TYPE_JSONB>( + out = vectorized::Field::create_field<PrimitiveType::TYPE_JSONB>( vectorized::JsonbField(b->getBlob(), b->getBlobLen())); return Status::OK(); } @@ -338,7 +386,7 @@ Status NestedGroupBuilder::_jsonb_to_field(const doris::JsonbValue* value, } bool NestedGroupBuilder::_handle_conflict(NestedGroup& group, bool is_array_object) const { - // English comment: conflict handling with logging. + // conflict handling with logging. // Priority: array<object > scalar. Prefer nested data over flat data. if (group.is_disabled) { return true; diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h old mode 100644 new mode 100755 index 157eabe7ca1..040f946dc8d --- a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h +++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h @@ -71,6 +71,9 @@ using NestedGroupsMap = std::unordered_map<vectorized::PathInData, std::shared_ptr<NestedGroup>, vectorized::PathInData::Hash>; +// Special path marker for top-level array<object> NestedGroup +inline constexpr std::string_view kRootNestedGroupPath = "$root"; + /** * English comment: Build NestedGroup(s) from JSONB columns at storage finalize stage. * The builder scans JSONB values and only expands array<object>. diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp old mode 100644 new mode 100755 index 7c4c66a8fcb..daa743baa12 --- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp @@ -31,6 +31,7 @@ #include "olap/rowset/segment_v2/column_reader_cache.h" #include "olap/rowset/segment_v2/segment.h" #include "olap/rowset/segment_v2/variant/hierarchical_data_iterator.h" +#include "olap/rowset/segment_v2/variant/nested_group_builder.h" #include "olap/rowset/segment_v2/variant/sparse_column_extract_iterator.h" #include "olap/rowset/segment_v2/variant/sparse_column_merge_iterator.h" #include "olap/tablet_schema.h" @@ -51,7 +52,7 @@ namespace doris::segment_v2 { namespace { -// English comment: read offsets in range [start, start+count) and also return previous offset. +// read offsets in range [start, start+count) and also return previous offset. Status _read_offsets_with_prev(ColumnIterator* offsets_iter, ordinal_t start, size_t count, uint64_t* prev, std::vector<uint64_t>* out) { *prev = 0; @@ -167,7 +168,7 @@ private: return Status::OK(); } - // English comment: output selector: + // output selector: // - If dst is Nullable(JSONB) (ColumnNullable(ColumnString)), append JSONB blobs. // - Otherwise append Nullable(Variant(root_type=JSONB)). Status _append_group_as_any(GroupState& state, ordinal_t row_ord, size_t row_cnt, @@ -182,7 +183,7 @@ private: return _append_group_as_jsonb(state, row_ord, row_cnt, dst); } - // English comment: build a JSONB blob column (Nullable(JSONB)) for rows [row_ord, row_ord+row_cnt). + // build a JSONB blob column (Nullable(JSONB)) for rows [row_ord, row_ord+row_cnt). // This is used for nested groups inside an element object. Status _append_group_as_jsonb_blob(GroupState& state, ordinal_t row_ord, size_t row_cnt, vectorized::MutableColumnPtr& dst) { @@ -229,7 +230,7 @@ private: return Status::OK(); } - // English comment: build a Variant(root_type=JSONB) column for rows [row_ord, row_ord+row_cnt) + // build a Variant(root_type=JSONB) column for rows [row_ord, row_ord+row_cnt) // and append into dst (expected Nullable(Variant) or Variant). Status _append_group_as_jsonb(GroupState& state, ordinal_t row_ord, size_t row_cnt, vectorized::MutableColumnPtr& dst) { @@ -327,14 +328,14 @@ private: json.push_back(']'); RETURN_IF_ERROR(jsonb_value.from_json_string(json.data(), json.size())); - vectorized::Field jsonb_field = vectorized::Field::create_field<TYPE_JSONB>( + vectorized::Field jsonb_field = vectorized::Field::create_field<PrimitiveType::TYPE_JSONB>( vectorized::JsonbField(jsonb_value.value(), jsonb_value.size())); vectorized::VariantMap object; object.try_emplace(vectorized::PathInData {}, vectorized::FieldWithDataType(jsonb_field)); vectorized::Field variant_field = - vectorized::Field::create_field<TYPE_VARIANT>(std::move(object)); + vectorized::Field::create_field<PrimitiveType::TYPE_VARIANT>(std::move(object)); dst_variant.insert(variant_field); if (dst_nullable) { dst_nullable->get_null_map_column().get_data().push_back(0); @@ -353,7 +354,7 @@ private: namespace { -// English comment: when reading the whole Variant column, merge top-level NestedGroups back +// when reading the whole Variant column, merge top-level NestedGroups back // into ColumnVariant as subcolumns (path = array path). This enables discarding JSONB // subcolumns physically while keeping query results correct. class VariantRootWithNestedGroupsIterator : public ColumnIterator { @@ -399,6 +400,15 @@ public: : assert_cast<vectorized::ColumnVariant&>(*dst); auto most_common_type = obj.get_most_common_type(); // NOLINT(readability-static-accessed-through-instance) + // Check if we have a top-level NestedGroup ($root path) + bool has_root_nested_group = false; + for (const auto& [path, _] : _nested_group_readers) { + if (path == std::string(kRootNestedGroupPath)) { + has_root_nested_group = true; + break; + } + } + auto root_column = most_common_type->create_column(); RETURN_IF_ERROR(_root_iter->next_batch(n, root_column, has_null)); @@ -414,7 +424,12 @@ public: // Build a tmp ColumnVariant for this batch: root + nested-group subcolumns. auto tmp = vectorized::ColumnVariant::create(0, root_column->size()); auto& tmp_obj = assert_cast<vectorized::ColumnVariant&>(*tmp); - tmp_obj.add_sub_column({}, std::move(root_column), most_common_type); + + // If we have a $root NestedGroup, it will replace the root JSONB. + // Otherwise, add the root column as usual. + if (!has_root_nested_group) { + tmp_obj.add_sub_column({}, std::move(root_column), most_common_type); + } auto ng_type = vectorized::make_nullable(std::make_shared<vectorized::DataTypeJsonb>()); for (auto& [path, it] : _nested_group_iters) { @@ -422,7 +437,15 @@ public: size_t m = *n; bool tmp_has_null = false; RETURN_IF_ERROR(it->next_batch(&m, col, &tmp_has_null)); - tmp_obj.add_sub_column(vectorized::PathInData(path), std::move(col), ng_type); + + // Special handling for top-level NestedGroup ($root path): + // Use NestedGroup data as root instead of adding as subcolumn + if (path == std::string(kRootNestedGroupPath)) { + tmp_obj.add_sub_column(vectorized::PathInData(), std::move(col), ng_type); + } else { + // Normal subcolumn handling + tmp_obj.add_sub_column(vectorized::PathInData(path), std::move(col), ng_type); + } } obj.insert_range_from(*tmp, 0, tmp_obj.rows()); @@ -445,6 +468,15 @@ public: : assert_cast<vectorized::ColumnVariant&>(*dst); auto most_common_type = obj.get_most_common_type(); // NOLINT(readability-static-accessed-through-instance) + // Check if we have a top-level NestedGroup ($root path) + bool has_root_nested_group = false; + for (const auto& [path, _] : _nested_group_readers) { + if (path == std::string(kRootNestedGroupPath)) { + has_root_nested_group = true; + break; + } + } + auto root_column = most_common_type->create_column(); RETURN_IF_ERROR(_root_iter->read_by_rowids(rowids, count, root_column)); @@ -458,13 +490,26 @@ public: auto tmp = vectorized::ColumnVariant::create(0, root_column->size()); auto& tmp_obj = assert_cast<vectorized::ColumnVariant&>(*tmp); - tmp_obj.add_sub_column({}, std::move(root_column), most_common_type); + + // If we have a $root NestedGroup, it will replace the root JSONB. + // Otherwise, add the root column as usual. + if (!has_root_nested_group) { + tmp_obj.add_sub_column({}, std::move(root_column), most_common_type); + } auto ng_type = vectorized::make_nullable(std::make_shared<vectorized::DataTypeJsonb>()); for (auto& [path, it] : _nested_group_iters) { auto col = ng_type->create_column(); RETURN_IF_ERROR(it->read_by_rowids(rowids, count, col)); - tmp_obj.add_sub_column(vectorized::PathInData(path), std::move(col), ng_type); + + // Special handling for top-level NestedGroup ($root path): + // Use NestedGroup data as root instead of adding as subcolumn + if (path == std::string(kRootNestedGroupPath)) { + tmp_obj.add_sub_column(vectorized::PathInData(), std::move(col), ng_type); + } else { + // Normal subcolumn handling + tmp_obj.add_sub_column(vectorized::PathInData(path), std::move(col), ng_type); + } } obj.insert_range_from(*tmp, 0, tmp_obj.rows()); @@ -1741,22 +1786,14 @@ Status VariantColumnReader::_init_nested_group_readers( info.depth = path_info.has_nested_group_depth() ? path_info.nested_group_depth() : 1; if (!info.is_offsets) { - // Extract relative path + // Extract relative path by removing the NestedGroup prefix from column path. + // Column path format: v1.__ng.<full_ng_path>.<relative_path> + // We need to find the relative_path which comes after full_ng_path. std::string path_str = path_info.path(); - size_t last_ng_pos = path_str.rfind(ng_prefix); - if (last_ng_pos != std::string::npos) { - size_t after_ng = last_ng_pos + ng_prefix.length(); - // Find the end of the array path part - size_t dot_pos = path_str.find('.', after_ng); - while (dot_pos != std::string::npos) { - std::string potential_child = path_str.substr(dot_pos + 1); - // Check if this is still part of the nested group path - if (!potential_child.starts_with("__ng.")) { - info.relative_path = potential_child; - break; - } - dot_pos = path_str.find('.', dot_pos + 1); - } + std::string full_ng_prefix = ng_prefix + info.full_ng_path + "."; + size_t prefix_pos = path_str.find(full_ng_prefix); + if (prefix_pos != std::string::npos) { + info.relative_path = path_str.substr(prefix_pos + full_ng_prefix.length()); } } diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h old mode 100644 new mode 100755 diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp old mode 100644 new mode 100755 index 311d4ffb5e0..1a807c37f3d --- a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp @@ -52,9 +52,9 @@ namespace doris::segment_v2 { #include "common/compile_check_begin.h" -// English comment: forward declaration for NestedGroup write helper used by both +// forward declaration for NestedGroup write helper used by both // VariantColumnWriterImpl and VariantSubcolumnWriter. -static Status write_nested_groups_to_segment_from_jsonb( +static Status write_nested_groups_to_storage( const doris::segment_v2::NestedGroupsMap& nested_groups, const TabletColumn* tablet_column, const ColumnWriterOptions& opts, vectorized::OlapBlockDataConvertor* converter, size_t num_rows, int& column_id, std::unordered_map<std::string, NestedGroupWriter>& writers, @@ -602,8 +602,9 @@ Status VariantColumnWriterImpl::finalize() { _skip_subcolumn_paths.clear(); if (ptr->get_root_type() && - vectorized::remove_nullable(ptr->get_root_type())->get_type_id() == TypeIndex::JSONB && - ptr->get_root() != nullptr) { + vectorized::remove_nullable(ptr->get_root_type())->get_primitive_type() == + PrimitiveType::TYPE_JSONB && + ptr->get_root()) { RETURN_IF_ERROR(ng_builder.build_from_jsonb(ptr->get_root()->get_ptr(), nested_groups, ptr->rows())); } @@ -612,7 +613,7 @@ Status VariantColumnWriterImpl::finalize() { continue; } const auto& t = entry->data.get_least_common_type(); - if (!t || vectorized::remove_nullable(t)->get_type_id() != TypeIndex::JSONB) { + if (!t || vectorized::remove_nullable(t)->get_primitive_type() != PrimitiveType::TYPE_JSONB) { continue; } RETURN_IF_ERROR(ng_builder.build_from_jsonb(entry->data.get_finalized_column_ptr()->get_ptr(), @@ -642,7 +643,7 @@ Status VariantColumnWriterImpl::finalize() { _process_sparse_column(ptr, olap_data_convertor.get(), num_rows, column_id)); // Write NestedGroups to segment and persist stats to root meta. - RETURN_IF_ERROR(write_nested_groups_to_segment_from_jsonb( + RETURN_IF_ERROR(write_nested_groups_to_storage( nested_groups, _tablet_column, _opts, olap_data_convertor.get(), num_rows, column_id, _nested_group_writers, _statistics)); _statistics.to_pb(_opts.meta->mutable_variant_statistics()); @@ -953,17 +954,18 @@ Status VariantSubcolumnWriter::finalize() { _opts.meta->set_num_rows(ptr->rows()); ++column_id; - // English comment: also expand array<object> JSONB into NestedGroup for compaction sub-variant writer. + // also expand array<object> JSONB into NestedGroup for compaction sub-variant writer. doris::segment_v2::NestedGroupsMap nested_groups; doris::segment_v2::NestedGroupBuilder ng_builder; ng_builder.set_max_depth(static_cast<size_t>(config::variant_nested_group_max_depth)); if (ptr->get_root_type() && - vectorized::remove_nullable(ptr->get_root_type())->get_type_id() == TypeIndex::JSONB && - ptr->get_root() != nullptr) { + vectorized::remove_nullable(ptr->get_root_type())->get_primitive_type() == + PrimitiveType::TYPE_JSONB && + ptr->get_root()) { RETURN_IF_ERROR(ng_builder.build_from_jsonb(ptr->get_root()->get_ptr(), nested_groups, ptr->rows())); } - RETURN_IF_ERROR(write_nested_groups_to_segment_from_jsonb( + RETURN_IF_ERROR(write_nested_groups_to_storage( nested_groups, &flush_column, _opts, olap_data_convertor.get(), ptr->rows(), column_id, /*writers=*/_nested_group_writers, /*statistics=*/_statistics)); _statistics.to_pb(_opts.meta->mutable_variant_statistics()); @@ -1097,8 +1099,8 @@ Status VariantSubcolumnWriter::append_nullable(const uint8_t* null_map, const ui return Status::OK(); } -// English comment: recursively write NestedGroup built from JSONB at finalize stage. -static Status write_nested_group_recursive_from_jsonb( +// recursively write NestedGroup built from JSONB at finalize stage. +static Status write_nested_group_recursive( const doris::segment_v2::NestedGroup* group, const std::string& path_prefix, const TabletColumn* tablet_column, const ColumnWriterOptions& base_opts, vectorized::OlapBlockDataConvertor* converter, size_t parent_num_rows, int& column_id, @@ -1118,7 +1120,8 @@ static Status write_nested_group_recursive_from_jsonb( TabletColumn offsets_column; offsets_column.set_name(offsets_col_name); - offsets_column.set_type(FieldType::OLAP_FIELD_TYPE_BIGINT); + // ColumnOffset64 is UInt64, keep offsets as unsigned bigint. + offsets_column.set_type(FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT); offsets_column.set_is_nullable(false); offsets_column.set_length(sizeof(uint64_t)); offsets_column.set_index_length(sizeof(uint64_t)); @@ -1158,6 +1161,7 @@ static Status write_nested_group_recursive_from_jsonb( "." + relative_path.get_path(); const auto& child_type = subcolumn.get_least_common_type(); + assert(child_type->is_nullable()); TabletColumn child_column = vectorized::schema_util::get_column_by_type( child_type, child_col_name, vectorized::schema_util::ExtraInfo {.unique_id = -1, @@ -1178,6 +1182,10 @@ static Status write_nested_group_recursive_from_jsonb( &child_writer)); RETURN_IF_ERROR(child_writer->init()); + // Finalize the subcolumn before getting finalized column pointer + if (!subcolumn.is_finalized()) { + const_cast<vectorized::ColumnVariant::Subcolumn&>(subcolumn).finalize(); + } auto child_col = subcolumn.get_finalized_column_ptr(); size_t child_num_rows = child_col->size(); @@ -1198,7 +1206,7 @@ static Status write_nested_group_recursive_from_jsonb( // 3. Recursively write nested groups within this group for (const auto& [nested_path, nested_group] : group->nested_groups) { - RETURN_IF_ERROR(write_nested_group_recursive_from_jsonb( + RETURN_IF_ERROR(write_nested_group_recursive( nested_group.get(), full_path, tablet_column, base_opts, converter, group->current_flat_size, column_id, writers, statistics, depth + 1)); } @@ -1212,7 +1220,7 @@ static Status write_nested_group_recursive_from_jsonb( return Status::OK(); } -static Status write_nested_groups_to_segment_from_jsonb( +static Status write_nested_groups_to_storage( const doris::segment_v2::NestedGroupsMap& nested_groups, const TabletColumn* tablet_column, const ColumnWriterOptions& opts, vectorized::OlapBlockDataConvertor* converter, size_t num_rows, int& column_id, std::unordered_map<std::string, NestedGroupWriter>& writers, @@ -1231,7 +1239,7 @@ static Status write_nested_groups_to_segment_from_jsonb( std::sort(groups.begin(), groups.end(), [](const auto& a, const auto& b) { return a->path.get_path() < b->path.get_path(); }); for (const auto& g : groups) { - RETURN_IF_ERROR(write_nested_group_recursive_from_jsonb( + RETURN_IF_ERROR(write_nested_group_recursive( g.get(), "", tablet_column, opts, converter, num_rows, column_id, writers, statistics, 1)); } return Status::OK(); @@ -1264,7 +1272,8 @@ static Status write_nested_group_recursive( TabletColumn offsets_column; offsets_column.set_name(offsets_col_name); - offsets_column.set_type(FieldType::OLAP_FIELD_TYPE_BIGINT); + // ColumnOffset64 is UInt64, keep offsets as unsigned bigint. + offsets_column.set_type(FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT); offsets_column.set_is_nullable(false); offsets_column.set_length(sizeof(uint64_t)); offsets_column.set_index_length(sizeof(uint64_t)); @@ -1286,7 +1295,7 @@ static Status write_nested_group_recursive( RETURN_IF_ERROR(group_writer.offsets_writer->init()); // Write offsets data - // English comment: avoid implicit conversion from MutablePtr to immutable ColumnPtr. + // avoid implicit conversion from MutablePtr to immutable ColumnPtr. vectorized::ColumnPtr offsets_col = static_cast<const vectorized::IColumn&>(*group->offsets).get_ptr(); size_t offsets_num_rows = offsets_col->size(); diff --git a/be/src/vec/columns/column_variant.cpp b/be/src/vec/columns/column_variant.cpp old mode 100644 new mode 100755 index 3ef6a9fea1f..bf2f00b9487 --- a/be/src/vec/columns/column_variant.cpp +++ b/be/src/vec/columns/column_variant.cpp @@ -79,7 +79,7 @@ namespace { #include "common/compile_check_begin.h" -// English comment: deep clone nested group tree (offsets + nested groups). +// deep clone nested group tree (offsets + nested groups). std::shared_ptr<ColumnVariant::NestedGroup> clone_nested_group_tree( const ColumnVariant::NestedGroup& src) { auto dst = std::make_shared<ColumnVariant::NestedGroup>(); diff --git a/be/src/vec/json/json_parser.cpp b/be/src/vec/json/json_parser.cpp old mode 100644 new mode 100755 index ea6030c6f24..99449bb2a0d --- a/be/src/vec/json/json_parser.cpp +++ b/be/src/vec/json/json_parser.cpp @@ -44,7 +44,7 @@ std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char* begin, return {}; } ParseContext context; - // English comment: enable_flatten_nested controls nested path traversal + // enable_flatten_nested controls nested path traversal // NestedGroup expansion is now handled at storage layer context.enable_flatten_nested = config.enable_flatten_nested; context.is_top_array = document.isArray(); @@ -64,7 +64,7 @@ void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext& if (element.isObject()) { traverseObject(element.getObject(), ctx); } else if (element.isArray()) { - // English comment: allow nested arrays (multi-level) for NestedGroup; deeper levels are + // allow nested arrays (multi-level) for NestedGroup; deeper levels are // handled by VariantNestedBuilder with a max-depth guard. has_nested = false; check_has_nested_object(element); @@ -224,7 +224,7 @@ void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element, } } - // English comment: always fill missed values to keep element-level association between keys. + // always fill missed values to keep element-level association between keys. if (keys_to_update) { fillMissedValuesInArrays(ctx); } @@ -254,7 +254,7 @@ void JSONDataParser<ParserImpl>::handleExistingPath(std::pair<PathInData::Parts, ParseArrayContext& ctx, size_t& keys_to_update) { auto& path_array = path_data.second; - // English comment: keep arrays aligned for all keys (including top-level arrays). + // keep arrays aligned for all keys (including top-level arrays). assert(path_array.size() == ctx.current_size); // If current element of array is part of Nested, // collect its size or check it if the size of @@ -282,7 +282,7 @@ void JSONDataParser<ParserImpl>::handleNewPath(UInt128 hash, const PathInData::P Array path_array; path_array.reserve(ctx.total_size); - // English comment: always resize to keep alignment. + // always resize to keep alignment. path_array.resize(ctx.current_size); auto nested_key = getNameOfNested(path, value); diff --git a/be/src/vec/json/parse2column.cpp b/be/src/vec/json/parse2column.cpp old mode 100644 new mode 100755 index 7aa0696ce38..e7b87e6085b --- a/be/src/vec/json/parse2column.cpp +++ b/be/src/vec/json/parse2column.cpp @@ -53,7 +53,7 @@ namespace doris::vectorized { namespace { -// English comment: parse JSON into (paths, values). If invalid JSON and strict mode is disabled, +// parse JSON into (paths, values). If invalid JSON and strict mode is disabled, // fall back to treating it as a plain string field at root. template <typename ParserImpl> std::optional<ParseResult> parse_json_or_fallback_as_string(const char* src, size_t length, diff --git a/be/src/vec/json/variant_nested_builder.h b/be/src/vec/json/variant_nested_builder.h old mode 100644 new mode 100755 index 5a5a39e84df..5287a40734f --- a/be/src/vec/json/variant_nested_builder.h +++ b/be/src/vec/json/variant_nested_builder.h @@ -78,7 +78,7 @@ private: const Field& value, const FieldInfo& info, size_t parent_flat_offset); - // English comment: split complex recursive logic into smaller helpers for readability and linting. + // split complex recursive logic into smaller helpers for readability and linting. void _write_to_nested_group_exceed_depth(ColumnVariant::NestedGroup* parent_group, const std::vector<std::pair<PathInData, size_t>>& levels, size_t current_level_idx, diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp old mode 100644 new mode 100755 index 5bcfae7ea43..95e1aed9f83 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -197,6 +197,10 @@ OlapBlockDataConvertor::create_olap_column_data_convertor(const TabletColumn& co case FieldType::OLAP_FIELD_TYPE_BIGINT: { return std::make_unique<OlapColumnDataConvertorSimple<TYPE_BIGINT>>(); } + case FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT: { + // used by internal length/offset columns (e.g. ColumnOffset64). + return std::make_unique<OlapColumnDataConvertorSimple<TYPE_UINT64>>(); + } case FieldType::OLAP_FIELD_TYPE_LARGEINT: { return std::make_unique<OlapColumnDataConvertorSimple<TYPE_LARGEINT>>(); } diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp old mode 100644 new mode 100755 index da3a3373257..70e72ee18ed --- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp @@ -25,7 +25,6 @@ #include "olap/rowset/segment_v2/variant/variant_column_reader.h" #include "olap/rowset/segment_v2/variant/variant_column_writer_impl.h" #include "olap/storage_engine.h" -#include "rapidjson/document.h" #include "testutil/variant_util.h" using namespace doris::vectorized; @@ -361,6 +360,7 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { st = variant_column_reader->new_iterator(&it, &parent_column, &storage_read_opts, &column_reader_cache); EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<HierarchicalDataIterator*>(it.get()) != nullptr); ColumnIteratorOptions column_iter_opts; column_iter_opts.stats = &stats; column_iter_opts.file_reader = file_reader.get(); @@ -850,39 +850,15 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) { // 6. check footer int expected_sparse_cols = variant_sparse_hash_shard_count > 1 ? variant_sparse_hash_shard_count : 1; - // NestedGroup columns (__ng.*) may be appended after sparse columns. - EXPECT_GE(footer.columns_size(), 1 + 10 + expected_sparse_cols); + EXPECT_EQ(footer.columns_size(), 1 + 10 + expected_sparse_cols); auto column_meta = footer.columns(0); EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); - bool has_nested_group = false; - int sparse_meta_cnt = 0; - int subcolumn_meta_cnt = 0; - for (int i = 1; i < footer.columns_size(); ++i) { - const auto& col = footer.columns(i); - if (!col.has_column_path_info()) { - continue; - } - const auto& info = col.column_path_info(); - if (info.has_nested_group_parent_path() || - (info.has_is_nested_group_offsets() && info.is_nested_group_offsets())) { - has_nested_group = true; - continue; - } - auto path = std::make_shared<vectorized::PathInData>(); - path->from_protobuf(info); - const std::string base = path->copy_pop_front().get_path(); - if (base == "__DORIS_VARIANT_SPARSE__" || base.rfind("__DORIS_VARIANT_SPARSE__.b", 0) == 0) { - check_sparse_column_meta(col, path_with_size); - sparse_meta_cnt++; - } else { - check_column_meta(col, path_with_size); - subcolumn_meta_cnt++; - } + for (int i = 1; i < footer.columns_size() - 1; ++i) { + auto column_met = footer.columns(i); + check_column_meta(column_met, path_with_size); } - EXPECT_TRUE(has_nested_group); - EXPECT_EQ(sparse_meta_cnt, expected_sparse_cols); - EXPECT_EQ(subcolumn_meta_cnt, 10); + check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); // 7. check variant reader io::FileReaderSPtr file_reader; @@ -938,63 +914,6 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) { EXPECT_EQ(value, inserted_jsonstr[i]); } - // Whole Variant read should merge NestedGroup back as JSONB subcolumn "a.b". - { - const auto* cv = assert_cast<ColumnVariant*>(new_column_object.get()); - const auto* ab = cv->get_subcolumn(PathInData("a.b")); - ASSERT_TRUE(ab != nullptr); - ASSERT_TRUE(ab->get_least_common_type() != nullptr); - EXPECT_EQ(remove_nullable(ab->get_least_common_type())->get_type_id(), TypeIndex::JSONB); - } - - // Whole NestedGroup access: $.a.b should return Nullable(Variant(JSONB)) and be a JSON array. - { - TabletColumn ab_col; - ab_col.set_name(parent_column.name_lower_case() + ".a.b"); - ab_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); - ab_col.set_parent_unique_id(parent_column.unique_id()); - ab_col.set_path_info(PathInData(parent_column.name_lower_case() + ".a.b")); - ab_col.set_variant_max_subcolumns_count(parent_column.variant_max_subcolumns_count()); - ab_col.set_is_nullable(true); - - ColumnIteratorUPtr ab_it; - st = variant_column_reader->new_iterator(&ab_it, &ab_col, &storage_read_opts, - &column_reader_cache); - EXPECT_TRUE(st.ok()) << st.msg(); - st = ab_it->init(column_iter_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - size_t ab_rows = 1000; - auto ab_dst = ColumnVariant::create(3); - st = ab_it->seek_to_ordinal(0); - EXPECT_TRUE(st.ok()) << st.msg(); - st = ab_it->next_batch(&ab_rows, ab_dst); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_EQ(ab_rows, 1000); - - for (int i = 0; i < 1000; ++i) { - std::string json; - assert_cast<ColumnVariant*>(ab_dst.get())->serialize_one_row_to_string(i, &json); - rapidjson::Document d; - d.Parse(json.c_str()); - ASSERT_FALSE(d.HasParseError()); - ASSERT_TRUE(d.IsArray()); - ASSERT_EQ(d.Size(), 1); - ASSERT_TRUE(d[0].IsObject()); - ASSERT_TRUE(d[0].HasMember("c")); - ASSERT_TRUE(d[0]["c"].IsObject()); - ASSERT_TRUE(d[0]["c"].HasMember("d")); - ASSERT_TRUE(d[0]["c"].HasMember("e")); - EXPECT_EQ(d[0]["c"]["d"].GetInt(), i); - EXPECT_EQ(std::string(d[0]["c"]["e"].GetString()), std::to_string(i)); - if (i % 17 == 0) { - ASSERT_TRUE(d[0]["c"].HasMember("f")); - EXPECT_EQ(d[0]["c"]["f"].GetInt(), i); - } else { - EXPECT_FALSE(d[0]["c"].HasMember("f")); - } - } - } - auto read_to_column_object = [&](ColumnIteratorUPtr& it) { new_column_object = ColumnVariant::create(10); nrows = 1000; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
