This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 97caf97f62a [fix](variant) materialize NG compaction regular paths 
(#63104)
97caf97f62a is described below

commit 97caf97f62ad08d77ac3a89ee0b998eb9d77b316
Author: lihangyu <[email protected]>
AuthorDate: Tue May 12 10:49:49 2026 +0800

    [fix](variant) materialize NG compaction regular paths (#63104)
    
    - NestedGroup compaction targets normally do not carry extracted
    subcolumns, so preserving existing target subcolumns can miss regular
    paths that sit beside NG arrays.
    
    - Collect Variant extended info for NG roots, keep NG physical children
    owned by the NestedGroup writer, and materialize only regular non-NG
    paths through the existing data-types helper. Typed paths continue to
    use get_compaction_typed_columns(), while doc-mode and normal non-NG
    Variant flows keep their existing ordering.
    
    - Also record materialized regular paths in sub_path_set and update
    schema util coverage for the new target-schema contract.
---
 be/src/exec/common/variant_util.cpp      | 184 ++++++++-----------------------
 be/test/exec/common/schema_util_test.cpp |  42 +++----
 2 files changed, 65 insertions(+), 161 deletions(-)

diff --git a/be/src/exec/common/variant_util.cpp 
b/be/src/exec/common/variant_util.cpp
index ac97563f94d..af10e0d4ff2 100644
--- a/be/src/exec/common/variant_util.cpp
+++ b/be/src/exec/common/variant_util.cpp
@@ -260,65 +260,19 @@ bool glob_match_re2(const std::string& glob_pattern, 
const std::string& candidat
     return RE2::FullMatch(candidate_path, *compiled);
 }
 
-bool is_regular_ng_compaction_subpath(const PathInData& path) {
+// NestedGroup's physical children and offsets are produced by 
NestedGroupWriteProvider, not by
+// appending TabletSchema extracted columns here. This predicate keeps only 
ordinary Variant paths
+// that are outside the NG tree, for example `v.owner` beside `v.items[*]`.
+bool is_regular_path_outside_nested_group(const PathInData& path) {
     const std::string& relative_path = path.get_path();
-    return !relative_path.empty() && !path.has_nested_part() &&
+    return !relative_path.empty() && !path.get_is_typed() && 
!path.has_nested_part() &&
            !segment_v2::contains_nested_group_marker(relative_path) &&
            !segment_v2::is_root_nested_group_path(relative_path) &&
            relative_path != SPARSE_COLUMN_PATH &&
            relative_path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos;
 }
 
-bool should_keep_existing_ng_compaction_subcolumn(const TabletColumn& column) {
-    if (!column.is_extracted_column()) {
-        return false;
-    }
-    return 
is_regular_ng_compaction_subpath(column.path_info_ptr()->copy_pop_front());
-}
-
-TabletIndexes clone_tablet_indexes(const std::vector<const TabletIndex*>& 
indexes) {
-    TabletIndexes cloned_indexes;
-    cloned_indexes.reserve(indexes.size());
-    for (const auto* index : indexes) {
-        cloned_indexes.emplace_back(std::make_shared<TabletIndex>(*index));
-    }
-    return cloned_indexes;
-}
-
-void keep_existing_ng_compaction_subcolumn(const TabletSchemaSPtr& 
source_schema,
-                                           const TabletColumnPtr& 
extracted_column,
-                                           TabletSchemaSPtr& output_schema,
-                                           TabletSchema::PathsSetInfo& 
paths_set_info) {
-    DCHECK(extracted_column->is_extracted_column());
-    DCHECK(should_keep_existing_ng_compaction_subcolumn(*extracted_column));
-
-    auto relative_path = extracted_column->path_info_ptr()->copy_pop_front();
-    const std::string path = relative_path.get_path();
-    output_schema->append_column(*extracted_column);
-
-    auto indexes = 
clone_tablet_indexes(source_schema->inverted_indexs(*extracted_column));
-    if (extracted_column->path_info_ptr()->get_is_typed()) {
-        TabletSchema::SubColumnInfo sub_column_info {.column = 
*extracted_column,
-                                                     .indexes = 
std::move(indexes)};
-        paths_set_info.typed_path_set.emplace(path, 
std::move(sub_column_info));
-        return;
-    }
-
-    paths_set_info.sub_path_set.emplace(path);
-    if (!indexes.empty()) {
-        paths_set_info.subcolumn_indexes.emplace(path, std::move(indexes));
-    }
-}
-
-using ExistingNgCompactionSubcolumns = std::unordered_map<int32_t, 
std::vector<TabletColumnPtr>>;
-
-struct NestedGroupCompactionMaterializationPlan {
-    std::vector<TabletColumnPtr> preserved_regular_subcolumns;
-    std::unordered_set<PathInData, PathInData::Hash> 
materialized_regular_paths;
-    PathToDataTypes additional_regular_path_to_data_types;
-};
-
-bool should_preserve_existing_ng_compaction_subcolumns(
+bool should_materialize_nested_group_regular_subcolumns(
         const TabletColumnPtr& column,
         const std::unordered_map<int32_t, VariantExtendedInfo>& 
uid_to_variant_extended_info) {
     const auto info_it = 
uid_to_variant_extended_info.find(column->unique_id());
@@ -326,12 +280,12 @@ bool should_preserve_existing_ng_compaction_subcolumns(
            (info_it != uid_to_variant_extended_info.end() && 
info_it->second.has_nested_group);
 }
 
-std::unordered_set<int32_t> collect_ng_compaction_root_uids(
+std::unordered_set<int32_t> collect_nested_group_compaction_root_uids(
         const TabletSchemaSPtr& target,
         const std::unordered_map<int32_t, VariantExtendedInfo>& 
uid_to_variant_extended_info) {
     std::unordered_set<int32_t> root_uids;
     for (const TabletColumnPtr& column : target->columns()) {
-        if (column->is_variant_type() && 
should_preserve_existing_ng_compaction_subcolumns(
+        if (column->is_variant_type() && 
should_materialize_nested_group_regular_subcolumns(
                                                  column, 
uid_to_variant_extended_info)) {
             root_uids.insert(column->unique_id());
         }
@@ -339,50 +293,16 @@ std::unordered_set<int32_t> 
collect_ng_compaction_root_uids(
     return root_uids;
 }
 
-ExistingNgCompactionSubcolumns collect_existing_ng_compaction_subcolumns(
-        const TabletSchemaSPtr& target, const std::unordered_set<int32_t>& 
ng_root_uids) {
-    ExistingNgCompactionSubcolumns uid_to_existing_subcolumns;
-    for (const TabletColumnPtr& column : target->columns()) {
-        if (!column->is_extracted_column() || 
!ng_root_uids.contains(column->parent_unique_id()) ||
-            !should_keep_existing_ng_compaction_subcolumn(*column)) {
-            continue;
-        }
-        
uid_to_existing_subcolumns[column->parent_unique_id()].push_back(column);
-    }
-    return uid_to_existing_subcolumns;
-}
-
-NestedGroupCompactionMaterializationPlan 
build_nested_group_compaction_materialization_plan(
-        const std::vector<TabletColumnPtr>& existing_subcolumns,
+PathToDataTypes collect_regular_types_outside_nested_group(
         const VariantExtendedInfo& extended_info) {
-    NestedGroupCompactionMaterializationPlan plan;
-    plan.preserved_regular_subcolumns = existing_subcolumns;
-    for (const auto& column : existing_subcolumns) {
-        
plan.materialized_regular_paths.emplace(column->path_info_ptr()->copy_pop_front());
-    }
+    PathToDataTypes regular_path_to_data_types;
     for (const auto& [path, data_types] : extended_info.path_to_data_types) {
-        if (!is_regular_ng_compaction_subpath(path) ||
-            plan.materialized_regular_paths.contains(path)) {
+        if (!is_regular_path_outside_nested_group(path)) {
             continue;
         }
-        plan.materialized_regular_paths.emplace(path);
-        plan.additional_regular_path_to_data_types.emplace(path, data_types);
+        regular_path_to_data_types.emplace(path, data_types);
     }
-    return plan;
-}
-
-void append_nested_group_compaction_columns(const TabletSchemaSPtr& target,
-                                            const TabletColumnPtr& column,
-                                            const 
NestedGroupCompactionMaterializationPlan& plan,
-                                            TabletSchemaSPtr& output_schema,
-                                            TabletSchema::PathsSetInfo& 
paths_set_info) {
-    for (const auto& existing_column : plan.preserved_regular_subcolumns) {
-        keep_existing_ng_compaction_subcolumn(target, existing_column, 
output_schema,
-                                              paths_set_info);
-    }
-    VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
-            paths_set_info, column, target, 
plan.additional_regular_path_to_data_types,
-            output_schema);
+    return regular_path_to_data_types;
 }
 
 size_t get_number_of_dimensions(const IDataType& type) {
@@ -1302,13 +1222,15 @@ void 
VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
         TabletSchemaSPtr& output_schema) {
     const auto& parent_indexes = 
target->inverted_indexs(parent_column->unique_id());
     for (const auto& [path, data_types] : path_to_data_types) {
-        if (data_types.empty() || path.empty() || path.has_nested_part()) {
+        // Typed paths are materialized by get_compaction_typed_columns(); 
this helper only
+        // materializes regular subcolumns inferred from rowset data types.
+        if (data_types.empty() || path.empty() || path.get_is_typed() || 
path.has_nested_part()) {
             continue;
         }
         DataTypePtr data_type;
         get_least_supertype_jsonb(data_types, &data_type);
         auto column_name = parent_column->name_lower_case() + "." + 
path.get_path();
-        auto column_path = PathInData(column_name, path.get_is_typed());
+        auto column_path = PathInData(column_name);
         TabletColumn sub_column =
                 get_column_by_type(data_type, column_name,
                                    ExtraInfo {.unique_id = -1,
@@ -1317,35 +1239,28 @@ void 
VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
         inherit_column_attributes(*parent_column, sub_column);
         TabletIndexes sub_column_indexes;
         inherit_index(parent_indexes, sub_column_indexes, sub_column);
-        if (path.get_is_typed()) {
-            TabletSchema::SubColumnInfo sub_column_info {.column = sub_column,
-                                                         .indexes = 
std::move(sub_column_indexes)};
-            paths_set_info.typed_path_set.emplace(path.get_path(), 
std::move(sub_column_info));
-        } else {
-            paths_set_info.subcolumn_indexes.emplace(path.get_path(),
-                                                     
std::move(sub_column_indexes));
-        }
+        paths_set_info.sub_path_set.emplace(path.get_path());
+        paths_set_info.subcolumn_indexes.emplace(path.get_path(), 
std::move(sub_column_indexes));
         output_schema->append_column(sub_column);
         VLOG_DEBUG << "append sub column " << path.get_path() << " data type "
                    << data_type->get_name();
     }
 }
 
-// Build the temporary schema for compaction
-// 1. aggregate path stats and data types from all rowsets
-// 2. append typed columns and nested columns to the output schema
-// 3. sort the subpaths and sparse paths for each unique id
-// 4. append the subpaths and sparse paths to the output schema
-// 5. set the path set info for each unique id
-// 6. return the output schema
+// Build the temporary schema for compaction.
+// NestedGroup roots are special: the root VARIANT column owns the NG tree and 
the streaming NG
+// writer handles NG children, while regular non-NG paths beside the arrays 
are materialized as
+// ordinary extracted subcolumns. NG typed paths still use 
get_compaction_typed_columns(), keeping
+// typed-column rules out of the NG-specific regular-path filtering.
 Status VariantCompactionUtil::get_extended_compaction_schema(
         const std::vector<RowsetSharedPtr>& rowsets, TabletSchemaSPtr& target) 
{
     std::unordered_map<int32_t, VariantExtendedInfo> 
uid_to_variant_extended_info;
-    const bool has_extendable_variant =
+    const bool needs_variant_extended_info =
             std::ranges::any_of(target->columns(), [](const TabletColumnPtr& 
column) {
-                return column->is_variant_type() && 
should_check_variant_path_stats(*column);
+                return column->is_variant_type() && 
(should_check_variant_path_stats(*column) ||
+                                                     
column->variant_enable_nested_group());
             });
-    if (has_extendable_variant) {
+    if (needs_variant_extended_info) {
         // collect path stats from all rowsets and segments
         for (const auto& rs : rowsets) {
             RETURN_IF_ERROR(aggregate_variant_extended_info(rs, 
&uid_to_variant_extended_info));
@@ -1356,9 +1271,8 @@ Status 
VariantCompactionUtil::get_extended_compaction_schema(
     TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>();
     output_schema->shawdow_copy_without_columns(*target);
     std::unordered_map<int32_t, TabletSchema::PathsSetInfo> 
uid_to_paths_set_info;
-    const auto ng_root_uids = collect_ng_compaction_root_uids(target, 
uid_to_variant_extended_info);
-    const auto uid_to_existing_ng_subcolumns =
-            collect_existing_ng_compaction_subcolumns(target, ng_root_uids);
+    const auto ng_root_uids =
+            collect_nested_group_compaction_root_uids(target, 
uid_to_variant_extended_info);
     for (const TabletColumnPtr& column : target->columns()) {
         if (!column->is_extracted_column()) {
             output_schema->append_column(*column);
@@ -1374,27 +1288,24 @@ Status 
VariantCompactionUtil::get_extended_compaction_schema(
                                                            ? 
empty_extended_info
                                                            : info_it->second;
         auto& paths_set_info = uid_to_paths_set_info[column->unique_id()];
-        if (ng_root_uids.contains(column->unique_id())) {
-            const auto plan = 
build_nested_group_compaction_materialization_plan(
-                    uid_to_existing_ng_subcolumns.contains(column->unique_id())
-                            ? 
uid_to_existing_ng_subcolumns.at(column->unique_id())
-                            : std::vector<TabletColumnPtr> {},
-                    extended_info);
-            append_nested_group_compaction_columns(target, column, plan, 
output_schema,
-                                                   paths_set_info);
-            LOG(INFO) << "Variant column uid=" << column->unique_id()
-                      << " keeps nested-group root with regular extracted 
columns in compaction "
-                         "schema";
-            continue;
-        }
-        if (!should_check_variant_path_stats(*column)) {
-            VLOG_DEBUG << "skip extended schema compaction for variant uid=" 
<< column->unique_id()
-                       << " because the column disables variant path stats";
-            continue;
-        }
-        if (extended_info.has_nested_group) {
+        const bool use_nested_group_compaction_schema = 
ng_root_uids.contains(column->unique_id());
+
+        if (use_nested_group_compaction_schema) {
+            // 1. append typed columns. Keep this shared with the non-NG typed 
helper; only the
+            // regular-path selection below is NG-specific.
+            RETURN_IF_ERROR(get_compaction_typed_columns(target, 
extended_info.typed_paths, column,
+                                                         output_schema, 
paths_set_info));
+
+            // NG roots do not record path-count stats for ordinary Variant 
paths, so their regular
+            // non-NG subcolumns use the same data-types materialization 
helper as the
+            // all-materialized non-NG branch below.
+            auto regular_path_to_data_types =
+                    collect_regular_types_outside_nested_group(extended_info);
+            get_compaction_subcolumns_from_data_types(paths_set_info, column, 
target,
+                                                      
regular_path_to_data_types, output_schema);
             LOG(INFO) << "Variant column uid=" << column->unique_id()
-                      << " has nested group, keep original column in 
compaction schema";
+                      << " keeps nested-group root and materializes regular 
non-NG subcolumns in "
+                         "compaction schema";
             continue;
         }
 
@@ -1413,6 +1324,7 @@ Status 
VariantCompactionUtil::get_extended_compaction_schema(
         // 1. append typed columns
         RETURN_IF_ERROR(get_compaction_typed_columns(target, 
extended_info.typed_paths, column,
                                                      output_schema, 
paths_set_info));
+
         // 2. append nested columns
         
RETURN_IF_ERROR(get_compaction_nested_columns(extended_info.nested_paths,
                                                       
extended_info.path_to_data_types, column,
diff --git a/be/test/exec/common/schema_util_test.cpp 
b/be/test/exec/common/schema_util_test.cpp
index 63ba6452722..8d0da1882c2 100644
--- a/be/test/exec/common/schema_util_test.cpp
+++ b/be/test/exec/common/schema_util_test.cpp
@@ -1193,7 +1193,8 @@ TEST_F(SchemaUtilTest, TestGetCompactionSchema) {
     EXPECT_EQ(variant_col.get_sub_columns().size(), 0);
 }
 
-TEST_F(SchemaUtilTest, 
get_extended_compaction_schema_nested_group_preserves_typed_subcolumns) {
+TEST_F(SchemaUtilTest,
+       
get_extended_compaction_schema_nested_group_ignores_existing_extracted_subcolumns)
 {
     TabletColumn variant;
     variant.set_unique_id(1);
     variant.set_name("v1");
@@ -1224,20 +1225,16 @@ TEST_F(SchemaUtilTest, 
get_extended_compaction_schema_nested_group_preserves_typ
             rowsets, target_schema);
     ASSERT_TRUE(status.ok()) << status.to_string();
 
-    EXPECT_EQ(target_schema->num_columns(), 2);
+    // get_extended_compaction_schema rebuilds from the base columns. Real 
compaction targets do
+    // not carry pre-existing extracted columns, so NG compaction must rely on 
rowset metadata for
+    // regular paths and on get_compaction_typed_columns() for typed paths.
+    EXPECT_EQ(target_schema->num_columns(), 1);
     const PathInData typed_path("v1.owner", true);
-    const auto typed_column_index = target_schema->field_index(typed_path);
-    ASSERT_NE(typed_column_index, -1);
-
-    const auto& preserved_typed_column = 
target_schema->column(typed_column_index);
-    EXPECT_TRUE(preserved_typed_column.path_info_ptr()->get_is_typed());
-    EXPECT_EQ(preserved_typed_column.type(), 
FieldType::OLAP_FIELD_TYPE_STRING);
+    EXPECT_EQ(target_schema->field_index(typed_path), -1);
 
     const auto* path_set_info = target_schema->try_path_set_info(1);
     ASSERT_NE(path_set_info, nullptr);
-    ASSERT_TRUE(path_set_info->typed_path_set.contains("owner"));
-    EXPECT_EQ(path_set_info->typed_path_set.at("owner").indexes.size(), 1);
-    
EXPECT_EQ(path_set_info->typed_path_set.at("owner").indexes[0]->index_name(), 
"v1_owner_idx");
+    EXPECT_FALSE(path_set_info->typed_path_set.contains("owner"));
 }
 
 TEST_F(SchemaUtilTest, TestGetSortedSubcolumns) {
@@ -1617,7 +1614,6 @@ TEST_F(SchemaUtilTest, 
get_compaction_subcolumns_from_data_types) {
     path_to_data_types[PathInData("b")] = 
{std::make_shared<DataTypeString>()}; // -> STRING
     path_to_data_types[PathInData("typed", true)] = 
{std::make_shared<DataTypeString>()};
     path_to_data_types[PathInData("shared")] = 
{std::make_shared<DataTypeInt32>()};
-    path_to_data_types[PathInData("shared", true)] = 
{std::make_shared<DataTypeString>()};
 
     TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>();
     TabletSchema::PathsSetInfo paths_set_info;
@@ -1625,9 +1621,8 @@ TEST_F(SchemaUtilTest, 
get_compaction_subcolumns_from_data_types) {
     
variant_util::VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
             paths_set_info, parent_column, target, path_to_data_types, 
output_schema);
 
-    EXPECT_EQ(output_schema->num_columns(), 5);
-    bool found_a = false, found_b = false, found_typed = false, found_shared = 
false,
-         found_typed_shared = false;
+    EXPECT_EQ(output_schema->num_columns(), 3);
+    bool found_a = false, found_b = false, found_typed = false, found_shared = 
false;
     for (const auto& col : output_schema->columns()) {
         if (col->name() == "v1.a") {
             found_a = true;
@@ -1652,14 +1647,10 @@ TEST_F(SchemaUtilTest, 
get_compaction_subcolumns_from_data_types) {
             EXPECT_EQ(col->type(), FieldType::OLAP_FIELD_TYPE_INT);
             EXPECT_EQ(col->parent_unique_id(), 1);
             EXPECT_EQ(col->path_info_ptr()->get_path(), "v1.shared");
-        } else if (col->name() == "v1.shared" && 
col->path_info_ptr()->get_is_typed()) {
-            found_typed_shared = true;
-            EXPECT_EQ(col->type(), FieldType::OLAP_FIELD_TYPE_STRING);
-            EXPECT_EQ(col->parent_unique_id(), 1);
-            EXPECT_EQ(col->path_info_ptr()->get_path(), "v1.shared");
         }
     }
-    EXPECT_TRUE(found_a && found_b && found_typed && found_shared && 
found_typed_shared);
+    EXPECT_TRUE(found_a && found_b && found_shared);
+    EXPECT_FALSE(found_typed);
 
     ASSERT_TRUE(paths_set_info.subcolumn_indexes.find("a") !=
                 paths_set_info.subcolumn_indexes.end());
@@ -1668,12 +1659,13 @@ TEST_F(SchemaUtilTest, 
get_compaction_subcolumns_from_data_types) {
     EXPECT_EQ(paths_set_info.subcolumn_indexes["a"].size(), 1);
     EXPECT_EQ(paths_set_info.subcolumn_indexes["b"].size(), 1);
     EXPECT_FALSE(paths_set_info.subcolumn_indexes.contains("typed"));
-    ASSERT_TRUE(paths_set_info.typed_path_set.contains("typed"));
-    EXPECT_EQ(paths_set_info.typed_path_set.at("typed").indexes.size(), 1);
     ASSERT_TRUE(paths_set_info.subcolumn_indexes.contains("shared"));
-    ASSERT_TRUE(paths_set_info.typed_path_set.contains("shared"));
     EXPECT_EQ(paths_set_info.subcolumn_indexes.at("shared").size(), 1);
-    EXPECT_EQ(paths_set_info.typed_path_set.at("shared").indexes.size(), 1);
+    EXPECT_FALSE(paths_set_info.typed_path_set.contains("typed"));
+    EXPECT_TRUE(paths_set_info.sub_path_set.contains("a"));
+    EXPECT_TRUE(paths_set_info.sub_path_set.contains("b"));
+    EXPECT_TRUE(paths_set_info.sub_path_set.contains("shared"));
+    EXPECT_FALSE(paths_set_info.sub_path_set.contains("typed"));
 }
 
 // Test has_different_structure_in_same_path function indirectly through 
check_variant_has_no_ambiguous_paths


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to