This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 9952dfefe61 [feat](Variant) support variant sparse feature and schema 
template with multi indexes (part 5) (#54328)
9952dfefe61 is described below

commit 9952dfefe6194fcee95f14ee9d4b1c0de94a5a41
Author: lihangyu <[email protected]>
AuthorDate: Tue Aug 5 22:23:39 2025 +0800

    [feat](Variant) support variant sparse feature and schema template with 
multi indexes (part 5) (#54328)
    
    Add VariantStatsCaculator to calculate variant stats info
---
 be/src/olap/rowset/segment_v2/segment_writer.cpp   |  11 +-
 be/src/olap/rowset/segment_v2/segment_writer.h     |   5 +
 .../rowset/segment_v2/variant_stats_calculator.cpp | 107 +++++
 .../rowset/segment_v2/variant_stats_calculator.h   |  55 +++
 be/src/vec/common/schema_util.cpp                  |  84 ++--
 be/src/vec/common/schema_util.h                    |  11 +-
 .../segment_v2/variant_stats_calculator_test.cpp   | 448 +++++++++++++++++++++
 7 files changed, 671 insertions(+), 50 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index 39562d60ab2..fed429af04c 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -50,7 +50,7 @@
 #include "olap/rowset/segment_v2/inverted_index_writer.h"
 #include "olap/rowset/segment_v2/page_io.h"
 #include "olap/rowset/segment_v2/page_pointer.h"
-// #include "olap/rowset/segment_v2/variant/variant_stats_calculator.h"
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
 #include "olap/segment_loader.h"
 #include "olap/short_key_index.h"
 #include "olap/storage_engine.h"
@@ -324,6 +324,10 @@ Status SegmentWriter::init(const std::vector<uint32_t>& 
col_ids, bool has_key) {
 
     RETURN_IF_ERROR(_create_writers(_tablet_schema, col_ids));
 
+    // Initialize variant statistics calculator
+    _variant_stats_calculator =
+            std::make_unique<VariantStatsCaculator>(&_footer, _tablet_schema, 
col_ids);
+
     // we don't need the short key index for unique key merge on write table.
     if (_has_key) {
         if (_is_mow()) {
@@ -731,7 +735,10 @@ Status SegmentWriter::append_block(const 
vectorized::Block* block, size_t row_po
         
RETURN_IF_ERROR(_column_writers[id]->append(converted_result.second->get_nullmap(),
                                                     
converted_result.second->get_data(), num_rows));
     }
-
+    if (_opts.write_type == DataWriteType::TYPE_COMPACTION) {
+        RETURN_IF_ERROR(
+                _variant_stats_calculator->calculate_variant_stats(block, 
row_pos, num_rows));
+    }
     if (_has_key) {
         if (_is_mow_with_cluster_key()) {
             // for now we don't need to query short key index for CLUSTER BY 
feature,
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h 
b/be/src/olap/rowset/segment_v2/segment_writer.h
index 76ba9b2ab21..c58ee417864 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.h
+++ b/be/src/olap/rowset/segment_v2/segment_writer.h
@@ -65,6 +65,8 @@ namespace segment_v2 {
 extern const char* k_segment_magic;
 extern const uint32_t k_segment_magic_length;
 
+class VariantStatsCaculator;
+
 struct SegmentWriterOptions {
     uint32_t num_rows_per_block = 1024;
     uint32_t max_rows_per_segment = UINT32_MAX;
@@ -106,6 +108,7 @@ public:
     Status partial_update_preconditions_check(size_t row_pos);
     Status append_block_with_partial_content(const vectorized::Block* block, 
size_t row_pos,
                                              size_t num_rows);
+
     int64_t max_row_to_add(size_t row_avg_size_in_bytes);
 
     uint64_t estimate_segment_size();
@@ -261,6 +264,8 @@ private:
     TabletSchemaSPtr _flush_schema = nullptr;
     std::vector<std::string> _primary_keys;
     uint64_t _primary_keys_size = 0;
+    // variant statistics calculator for efficient stats collection
+    std::unique_ptr<VariantStatsCaculator> _variant_stats_calculator;
 };
 
 } // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp 
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
new file mode 100644
index 00000000000..f1de6260304
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
+
+#include "common/logging.h"
+#include "util/simd/bits.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/common/schema_util.h"
+
+namespace doris::segment_v2 {
+
+VariantStatsCaculator::VariantStatsCaculator(SegmentFooterPB* footer,
+                                             TabletSchemaSPtr tablet_schema,
+                                             const std::vector<uint32_t>& column_ids)
+        : _footer(footer), _tablet_schema(tablet_schema), _column_ids(column_ids) {
+    // Index every footer column that hangs off a parent column by
+    // (parent unique id, path), so later lookups do not have to scan the footer.
+    size_t next_pos = 0;
+    for (const auto& footer_column : _footer->columns()) {
+        const size_t footer_pos = next_pos++;
+        if (!footer_column.has_column_path_info()) {
+            continue;
+        }
+        const auto& path_info = footer_column.column_path_info();
+        const auto parent_uid = path_info.parrent_column_unique_id();
+        if (parent_uid <= 0) {
+            // Columns without a positive parent id do not record stats.
+            continue;
+        }
+        _path_to_footer_index[parent_uid][path_info.path()] = footer_pos;
+    }
+}
+
+// Updates the footer statistics of every variant sub column and sparse column in `block`.
+//
+// Column i of `block` is described by `_tablet_schema->column(_column_ids[i])`. A column is
+// skipped unless it has path info, that path info asks for stats, and its parent unique id
+// is positive. For the remaining columns, rows [row_pos, row_pos + num_rows) are folded into
+// the footer entry registered for (parent unique id, path).
+//
+// Returns Status::NotFound when the footer index has no entry for the parent unique id or no
+// entry for the path under that parent; Status::OK otherwise.
+Status VariantStatsCaculator::calculate_variant_stats(const vectorized::Block* block,
+                                                      size_t row_pos, size_t num_rows) {
+    for (size_t i = 0; i < block->columns(); ++i) {
+        const TabletColumn& tablet_column = _tablet_schema->column(_column_ids[i]);
+        // Only sub columns and sparse columns of a variant carry stats.
+        if (!tablet_column.has_path_info() ||
+            !tablet_column.path_info_ptr()->need_record_stats() ||
+            tablet_column.parent_unique_id() <= 0) {
+            continue;
+        }
+        const std::string& column_path = tablet_column.path_info_ptr()->get_path();
+
+        // First level of the index: the parent column's unique id.
+        const auto parent_it = _path_to_footer_index.find(tablet_column.parent_unique_id());
+        if (parent_it == _path_to_footer_index.end()) {
+            return Status::NotFound("Column path not found in footer: {}", column_path);
+        }
+        // Second level: the path under that parent. This must be find(), not operator[]:
+        // operator[] inserts a missing path with value 0, which would silently credit the
+        // stats to footer column 0 instead of reporting the mismatch.
+        const auto path_it = parent_it->second.find(column_path);
+        if (path_it == parent_it->second.end()) {
+            return Status::NotFound("Column path not found in footer: {}", column_path);
+        }
+        const size_t footer_index = path_it->second;
+        ColumnMetaPB* column_meta = _footer->mutable_columns(footer_index);
+
+        // Get the column from the block
+        const auto& column = block->get_by_position(i).column;
+
+        // The sparse column is recognised by its path suffix; anything else is a sub column.
+        if (column_path.ends_with("__DORIS_VARIANT_SPARSE__")) {
+            _calculate_sparse_column_stats(*column, column_meta, row_pos, num_rows);
+        } else {
+            _calculate_sub_column_stats(*column, column_meta, row_pos, num_rows);
+        }
+    }
+    return Status::OK();
+}
+
+// Folds rows [row_pos, row_pos + num_rows) of a sparse column into the variant statistics
+// message attached to `column_meta`.
+void VariantStatsCaculator::_calculate_sparse_column_stats(const vectorized::IColumn& column,
+                                                           ColumnMetaPB* column_meta,
+                                                           size_t row_pos, size_t num_rows) {
+    // The statistics message lives inside column_meta; fetch the mutable handle to it.
+    VariantStatisticsPB* variant_stats = column_meta->mutable_variant_statistics();
+
+    // The per-path counting itself is implemented in schema_util.
+    vectorized::schema_util::calculate_variant_stats(column, variant_stats, row_pos, num_rows);
+
+    VLOG_DEBUG << "Sparse column stats updated, non-null size count: "
+               << variant_stats->sparse_column_non_null_size_size();
+}
+
+// Adds the number of non-null rows in [row_pos, row_pos + num_rows) of a sub column to the
+// none_null_size counter stored on `column_meta`.
+void VariantStatsCaculator::_calculate_sub_column_stats(const vectorized::IColumn& column,
+                                                        ColumnMetaPB* column_meta, size_t row_pos,
+                                                        size_t num_rows) {
+    // The column is cast to ColumnNullable; a zero byte in its null map marks a non-null row.
+    const auto& null_map =
+            assert_cast<const vectorized::ColumnNullable&>(column).get_null_map_data();
+    const auto* window_begin = reinterpret_cast<const int8_t*>(null_map.data()) + row_pos;
+
+    // Non-null rows contributed by this block only.
+    const size_t added_non_null = simd::count_zero_num(window_begin, num_rows);
+
+    // Accumulate on top of whatever earlier blocks already recorded.
+    column_meta->set_none_null_size(added_non_null + column_meta->none_null_size());
+
+    VLOG_DEBUG << "Sub column non-null count updated: " << column_meta->none_null_size()
+               << " (added " << added_non_null << " from current block)";
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.h 
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
new file mode 100644
index 00000000000..6ffd74036cb
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/tablet_schema.h"
+#include "vec/core/block.h"
+
+namespace doris::segment_v2 {
+
+class VariantStatsCaculator {
+public:
+    explicit VariantStatsCaculator(SegmentFooterPB* footer, TabletSchemaSPtr 
tablet_schema,
+                                   const std::vector<uint32_t>& column_ids);
+
+    // Update footer stats from rows [row_pos, row_pos + num_rows) of each variant column in block
+    Status calculate_variant_stats(const vectorized::Block* block, size_t 
row_pos, size_t num_rows);
+
+private:
+    // parent column unique id -> (column path -> index of that column in the footer)
+    std::unordered_map<int32_t, std::unordered_map<std::string, size_t>> 
_path_to_footer_index;
+
+    // Non-owning pointer to the footer where we store the statistics
+    SegmentFooterPB* _footer;
+    TabletSchemaSPtr _tablet_schema;
+    std::vector<uint32_t> _column_ids;
+
+    // Sparse column: accumulate per-path counts into column_meta's variant_statistics
+    void _calculate_sparse_column_stats(const vectorized::IColumn& column,
+                                        ColumnMetaPB* column_meta, size_t 
row_pos, size_t num_rows);
+
+    // Sub column: add the count of non-null rows to column_meta's none_null_size
+    void _calculate_sub_column_stats(const vectorized::IColumn& column, 
ColumnMetaPB* column_meta,
+                                     size_t row_pos, size_t num_rows);
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/vec/common/schema_util.cpp 
b/be/src/vec/common/schema_util.cpp
index 79a21f638a5..edc016d5138 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -1031,48 +1031,48 @@ Status aggregate_variant_extended_info(
 //     return Status::OK();
 // }
 //
-// // Calculate statistics about variant data paths from the encoded sparse 
column
-// void calculate_variant_stats(const IColumn& encoded_sparse_column,
-//                              segment_v2::VariantStatisticsPB* stats, size_t 
row_pos,
-//                              size_t num_rows) {
-//     // Cast input column to ColumnMap type since sparse column is stored as 
a map
-//     const auto& map_column = assert_cast<const 
ColumnMap&>(encoded_sparse_column);
-//
-//     // Get the keys column which contains the paths as strings
-//     const auto& sparse_data_paths =
-//             assert_cast<const 
ColumnString*>(map_column.get_keys_ptr().get());
-//     const auto& serialized_sparse_column_offsets =
-//             assert_cast<const 
ColumnArray::Offsets64&>(map_column.get_offsets());
-//     auto& count_map = *stats->mutable_sparse_column_non_null_size();
-//     // Iterate through all paths in the sparse column
-//     for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
-//         size_t offset = serialized_sparse_column_offsets[i - 1];
-//         size_t end = serialized_sparse_column_offsets[i];
-//         for (size_t j = offset; j != end; ++j) {
-//             auto path = sparse_data_paths->get_data_at(j);
-//
-//             const auto& sparse_path = path.to_string();
-//             // If path already exists in statistics, increment its count
-//             if (auto it = count_map.find(sparse_path); it != 
count_map.end()) {
-//                 ++it->second;
-//             }
-//             // If path doesn't exist and we haven't hit the max statistics 
size limit,
-//             // add it with count 1
-//             else if (count_map.size() < 
config::variant_max_sparse_column_statistics_size) {
-//                 count_map.emplace(sparse_path, 1);
-//             }
-//         }
-//     }
-//
-//     if (stats->sparse_column_non_null_size().size() >
-//         config::variant_max_sparse_column_statistics_size) {
-//         throw doris::Exception(
-//                 ErrorCode::INTERNAL_ERROR,
-//                 "Sparse column non null size: {} is greater than max 
statistics size: {}",
-//                 stats->sparse_column_non_null_size().size(),
-//                 config::variant_max_sparse_column_statistics_size);
-//     }
-// }
+// Counts, per path, how often each path occurs in rows [row_pos, row_pos + num_rows) of the
+// encoded sparse column and adds those counts to stats->sparse_column_non_null_size.
+// New paths are only admitted while the map holds fewer than
+// config::variant_max_sparse_column_statistics_size entries; a map that ends up larger than
+// that limit raises doris::Exception(INTERNAL_ERROR).
+void calculate_variant_stats(const IColumn& encoded_sparse_column,
+                             segment_v2::VariantStatisticsPB* stats, size_t row_pos,
+                             size_t num_rows) {
+    // The sparse column is stored as a map: keys are the paths, one map entry per path.
+    const auto& sparse_map = assert_cast<const ColumnMap&>(encoded_sparse_column);
+    const auto& path_keys = assert_cast<const ColumnString*>(sparse_map.get_keys_ptr().get());
+    const auto& row_offsets =
+            assert_cast<const ColumnArray::Offsets64&>(sparse_map.get_offsets());
+
+    auto& path_counts = *stats->mutable_sparse_column_non_null_size();
+
+    const size_t row_end = row_pos + num_rows;
+    for (size_t row = row_pos; row != row_end; ++row) {
+        // Entries of `row` live in [offsets[row - 1], offsets[row]).
+        // NOTE(review): for row == 0 this reads offsets[-1]; presumably the offsets
+        // container tolerates that index - confirm against its definition.
+        const size_t entry_begin = row_offsets[row - 1];
+        const size_t entry_end = row_offsets[row];
+        for (size_t entry = entry_begin; entry != entry_end; ++entry) {
+            const auto sparse_path = path_keys->get_data_at(entry).to_string();
+
+            // Known path: bump its counter and move on.
+            auto known = path_counts.find(sparse_path);
+            if (known != path_counts.end()) {
+                ++known->second;
+                continue;
+            }
+            // Unknown path: admit it with count 1 only while below the size limit.
+            if (path_counts.size() < config::variant_max_sparse_column_statistics_size) {
+                path_counts.emplace(sparse_path, 1);
+            }
+        }
+    }
+
+    // Final sanity check on the size of the statistics map.
+    if (stats->sparse_column_non_null_size().size() >
+        config::variant_max_sparse_column_statistics_size) {
+        throw doris::Exception(
+                ErrorCode::INTERNAL_ERROR,
+                "Sparse column non null size: {} is greater than max statistics size: {}",
+                stats->sparse_column_non_null_size().size(),
+                config::variant_max_sparse_column_statistics_size);
+    }
+}
 
 /// Calculates number of dimensions in array field.
 /// Returns 0 for scalar fields.
diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h
index 863f25be8fd..840b8bc4307 100644
--- a/be/src/vec/common/schema_util.h
+++ b/be/src/vec/common/schema_util.h
@@ -179,12 +179,11 @@ bool inherit_index(const std::vector<const TabletIndex*>& 
parent_indexes,
 // Status check_path_stats(const std::vector<RowsetSharedPtr>& intputs, 
RowsetSharedPtr output,
 //                        BaseTabletSPtr tablet);
 //
-// // Calculate statistics about variant data paths from the encoded sparse 
column
-// void calculate_variant_stats(const IColumn& encoded_sparse_column,
-//                              segment_v2::VariantStatisticsPB* stats, size_t 
row_pos,
-//                              size_t num_rows);
-//
-//
+// Accumulate per-path counts for sparse column rows [row_pos, row_pos + num_rows) into stats
+void calculate_variant_stats(const IColumn& encoded_sparse_column,
+                             segment_v2::VariantStatisticsPB* stats, size_t 
row_pos,
+                             size_t num_rows);
+
 // bool generate_sub_column_info(const TabletSchema& schema, int32_t 
col_unique_id,
 //                               const std::string& path,
 //                               TabletSchema::SubColumnInfo* sub_column_info);
diff --git a/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp 
b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
new file mode 100644
index 00000000000..6591c799945
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
@@ -0,0 +1,448 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
+
+#include <gtest/gtest.h>
+
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/tablet_schema.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_map.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_vector.h"
+#include "vec/core/block.h"
+#include "vec/core/column_with_type_and_name.h"
+#include "vec/data_types/data_type_map.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris::segment_v2 {
+
+// Fixture shared by the VariantStatsCaculator tests: owns a fresh tablet schema and segment
+// footer per test and offers builders for schema columns, footer entries and block columns.
+class VariantStatsCalculatorTest : public ::testing::Test {
+protected:
+    // Every test starts from an empty footer and an empty tablet schema.
+    void SetUp() override {
+        _footer = std::make_unique<SegmentFooterPB>();
+        _tablet_schema = std::make_shared<TabletSchema>();
+    }
+
+    void TearDown() override {
+        _footer.reset();
+        _tablet_schema.reset();
+    }
+
+    // Builds a VARIANT-typed tablet column. Path info and the parent unique id are only
+    // attached when a positive parent id and a non-empty path are both supplied.
+    TabletColumn create_variant_column(int32_t unique_id, const std::string& name,
+                                       int32_t parent_unique_id = -1,
+                                       const std::string& path = "") {
+        TabletColumn result;
+        result.set_unique_id(unique_id);
+        result.set_name(name);
+        result.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+        const bool is_child = parent_unique_id > 0 && !path.empty();
+        if (is_child) {
+            vectorized::PathInData child_path(path);
+            result.set_path_info(child_path);
+            result.set_parent_unique_id(parent_unique_id);
+        }
+        result.set_variant_max_subcolumns_count(1);
+
+        return result;
+    }
+
+    // Appends a footer column carrying the given parent id and path. Its unique id is
+    // 100 plus the footer's column count after the append.
+    void add_footer_column_with_path(int32_t parent_unique_id, const std::string& path) {
+        auto* meta = _footer->add_columns();
+        meta->set_unique_id(100 + _footer->columns_size());
+
+        auto* info = meta->mutable_column_path_info();
+        info->set_parrent_column_unique_id(parent_unique_id);
+        info->set_path(path);
+    }
+
+    // Builds a nullable string column with one row per entry of `values`; row i is null
+    // (default nested value) when null_map[i] is true.
+    vectorized::ColumnPtr create_nullable_column(const std::vector<bool>& null_map,
+                                                 const std::vector<std::string>& values) {
+        auto nested = vectorized::ColumnString::create();
+        auto null_flags = vectorized::ColumnUInt8::create();
+
+        size_t row = 0;
+        for (const auto& value : values) {
+            const bool is_null = null_map[row++];
+            if (is_null) {
+                nested->insert_default();
+            } else {
+                nested->insert_data(value.data(), value.length());
+            }
+            null_flags->insert_value(is_null ? 1 : 0);
+        }
+
+        return vectorized::ColumnNullable::create(std::move(nested), std::move(null_flags));
+    }
+
+    // Builds a string->string map column (the sparse column layout) with two rows:
+    // offsets {0, 2} make the first row empty and give the second row key1 and key2.
+    vectorized::ColumnPtr create_map_column() {
+        auto map_keys = vectorized::ColumnString::create();
+        auto map_values = vectorized::ColumnString::create();
+        auto map_offsets = vectorized::ColumnArray::ColumnOffsets::create();
+
+        map_keys->insert_data("key1", 4);
+        map_keys->insert_data("key2", 4);
+        map_values->insert_data("value1", 6);
+        map_values->insert_data("value2", 6);
+
+        map_offsets->insert_value(0);
+        map_offsets->insert_value(2);
+
+        return vectorized::ColumnMap::create(std::move(map_keys), std::move(map_values),
+                                             std::move(map_offsets));
+    }
+
+    TabletSchemaSPtr _tablet_schema;
+    std::unique_ptr<SegmentFooterPB> _footer;
+};
+
+TEST_F(VariantStatsCalculatorTest, ConstructorWithEmptyFooter) {
+    // Nothing is registered in the footer or the schema; construction and a call on a
+    // column-less block must still succeed.
+    const std::vector<uint32_t> cids = {0, 1, 2};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema, cids);
+
+    vectorized::Block empty_block;
+    const auto st = stats_calculator.calculate_variant_stats(&empty_block, 0, 0);
+    EXPECT_TRUE(st.ok());
+}
+
+TEST_F(VariantStatsCalculatorTest, ConstructorWithValidFooter) {
+    // Three footer entries spread over two parents, one of them a sparse column path.
+    add_footer_column_with_path(1, "sub_column_1");
+    add_footer_column_with_path(1, "sub_column_2.__DORIS_VARIANT_SPARSE__");
+    add_footer_column_with_path(2, "another_sub_column");
+
+    const std::vector<uint32_t> cids = {0, 1, 2};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema, cids);
+
+    // The constructor indexed the footer; a column-less block is still a no-op.
+    vectorized::Block empty_block;
+    const auto st = stats_calculator.calculate_variant_stats(&empty_block, 0, 0);
+    EXPECT_TRUE(st.ok());
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithNoVariantColumns) {
+    // A schema holding a single plain INT column: nothing qualifies for stats.
+    TabletColumn int_schema_column;
+    int_schema_column.set_unique_id(1);
+    int_schema_column.set_name("regular_col");
+    int_schema_column.set_type(FieldType::OLAP_FIELD_TYPE_INT);
+    _tablet_schema->append_column(int_schema_column);
+
+    const std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema, cids);
+
+    // One-row block matching that column.
+    vectorized::Block input;
+    auto int_values = vectorized::ColumnVector<PrimitiveType::TYPE_INT>::create();
+    int_values->insert_value(123);
+    input.insert(
+            {std::move(int_values), std::make_shared<vectorized::DataTypeInt32>(), "regular_col"});
+
+    const auto st = stats_calculator.calculate_variant_stats(&input, 0, 1);
+    EXPECT_TRUE(st.ok());
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSubColumn) {
+    // Footer entry and schema column agree on (parent id 1, path "sub_column_1").
+    add_footer_column_with_path(1, "sub_column_1");
+    TabletColumn schema_sub_column =
+            create_variant_column(2, "variant_col.sub_column_1", 1, "sub_column_1");
+    _tablet_schema->append_column(schema_sub_column);
+
+    const std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema, cids);
+
+    // Three rows, the middle one null.
+    vectorized::Block input;
+    auto values = create_nullable_column({false, true, false}, {"val1", "", "val3"});
+    input.insert({std::move(values),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sub_column_1"});
+
+    const auto st = stats_calculator.calculate_variant_stats(&input, 0, 3);
+    EXPECT_TRUE(st.ok());
+
+    // Two of the three rows are non-null.
+    const auto& meta = _footer->columns(0);
+    EXPECT_EQ(meta.none_null_size(), 2);
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSparseColumn) {
+    // Footer entry and schema column share the same sparse path under parent id 1.
+    add_footer_column_with_path(1, "sparse_col.__DORIS_VARIANT_SPARSE__");
+    TabletColumn schema_sparse_column = create_variant_column(
+            2, "variant_col.__DORIS_VARIANT_SPARSE__", 1, "sparse_col.__DORIS_VARIANT_SPARSE__");
+    _tablet_schema->append_column(schema_sparse_column);
+
+    const std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema, cids);
+
+    // The sparse column is fed as a string->string map column.
+    vectorized::Block input;
+    auto sparse_values = create_map_column();
+    input.insert({std::move(sparse_values),
+                  std::make_shared<vectorized::DataTypeMap>(
+                          std::make_shared<vectorized::DataTypeString>(),
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sparse_column"});
+
+    const auto st = stats_calculator.calculate_variant_stats(&input, 0, 1);
+    EXPECT_TRUE(st.ok());
+
+    // The sparse path must have produced a variant statistics message on the footer column.
+    const auto& meta = _footer->columns(0);
+    EXPECT_TRUE(meta.has_variant_statistics());
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMissingFooterEntry) {
+    // The schema knows a sub column, but the footer has no entry at all.
+    TabletColumn schema_sub_column =
+            create_variant_column(2, "variant_col.missing_sub", 1, "missing_sub");
+    _tablet_schema->append_column(schema_sub_column);
+
+    const std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema, cids);
+
+    // Two rows, the second one null.
+    vectorized::Block input;
+    auto values = create_nullable_column({false, true}, {"val1", ""});
+    input.insert({std::move(values),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "missing_sub"});
+
+    // The lookup must fail with NOT_FOUND instead of touching the footer.
+    const auto st = stats_calculator.calculate_variant_stats(&input, 0, 2);
+    EXPECT_FALSE(st.ok());
+    EXPECT_TRUE(st.is<ErrorCode::NOT_FOUND>());
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMissingPathInFooter) {
+    // The footer registers (parent 1, "different_path") while the schema column below uses
+    // parent id 1111 and path "sub_column", so nothing in the footer matches it.
+    add_footer_column_with_path(1, "different_path");
+    TabletColumn schema_sub_column =
+            create_variant_column(2, "variant_col.sub_column", 1111, "sub_column");
+    _tablet_schema->append_column(schema_sub_column);
+
+    const std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema, cids);
+
+    // A single non-null row.
+    vectorized::Block input;
+    auto values = create_nullable_column({false}, {"val1"});
+    input.insert({std::move(values),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sub_column"});
+
+    const auto st = stats_calculator.calculate_variant_stats(&input, 0, 1);
+    EXPECT_FALSE(st.ok()) << st.to_string();
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMultipleColumns) {
+    // Footer layout: [0] sub column and [1] sparse column of parent 1, [2] sub column of parent 2.
+    add_footer_column_with_path(1, "sub1");
+    add_footer_column_with_path(1, "sub2.__DORIS_VARIANT_SPARSE__");
+    add_footer_column_with_path(2, "another_sub");
+
+    // Matching schema columns, appended in the same order.
+    TabletColumn first_sub = create_variant_column(2, "variant.sub1", 1, "sub1");
+    TabletColumn sparse_part = create_variant_column(3, "variant.__DORIS_VARIANT_SPARSE__", 1,
+                                                     "sub2.__DORIS_VARIANT_SPARSE__");
+    TabletColumn second_sub = create_variant_column(4, "variant2.another_sub", 2, "another_sub");
+    _tablet_schema->append_column(first_sub);
+    _tablet_schema->append_column(sparse_part);
+    _tablet_schema->append_column(second_sub);
+
+    const std::vector<uint32_t> cids = {0, 1, 2};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema, cids);
+
+    vectorized::Block input;
+
+    // Column 0: rows 0 and 2 are non-null.
+    auto first_values = create_nullable_column({false, true, false}, {"a", "", "c"});
+    input.insert({std::move(first_values),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sub1"});
+
+    // Column 1: the sample map column extended by three default rows.
+    auto sparse_values = create_map_column();
+    sparse_values->assume_mutable()->insert_many_defaults(3);
+    input.insert({std::move(sparse_values),
+                  std::make_shared<vectorized::DataTypeMap>(
+                          std::make_shared<vectorized::DataTypeString>(),
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sparse"});
+
+    // Column 2: only row 1 is non-null.
+    auto second_values = create_nullable_column({true, false, true}, {"", "x", ""});
+    input.insert({std::move(second_values),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "another_sub"});
+
+    const auto st = stats_calculator.calculate_variant_stats(&input, 0, 3);
+    EXPECT_TRUE(st.ok());
+
+    // Each footer column received the stats of its own block column.
+    EXPECT_EQ(_footer->columns(0).none_null_size(), 2);        // sub1: 2 non-null
+    EXPECT_TRUE(_footer->columns(1).has_variant_statistics()); // sparse column
+    EXPECT_EQ(_footer->columns(2).none_null_size(), 1);        // another_sub: 1 non-null
+}
+
+// Verifies that calculate_variant_stats() succeeds on a block whose only column has no rows
+// and that the footer's non-null count stays at 0.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithEmptyBlock) {
+    add_footer_column_with_path(1, "sub_column");
+
+    TabletColumn sub_column = create_variant_column(2, "variant.sub_column", 1, "sub_column");
+    _tablet_schema->append_column(sub_column);
+
+    std::vector<uint32_t> column_ids = {0};
+    VariantStatsCaculator calculator(_footer.get(), _tablet_schema, column_ids);
+
+    // Create empty block
+    vectorized::Block block;
+    auto empty_column = create_nullable_column({}, {});
+    block.insert({std::move(empty_column),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sub_column"});
+
+    // Last two arguments are 0 and 0 here; the other tests pass the row count as the last one.
+    auto status = calculator.calculate_variant_stats(&block, 0, 0);
+    EXPECT_TRUE(status.ok()) << status.to_string();
+
+    // No change in statistics for empty block
+    EXPECT_EQ(_footer->columns(0).none_null_size(), 0);
+}
+
+// Verifies that a sub column whose three rows are all flagged null in the null map
+// contributes nothing to the footer's non-null count.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithAllNullValues) {
+    add_footer_column_with_path(1, "sub_column");
+
+    TabletColumn sub_column = create_variant_column(2, "variant.sub_column", 1, "sub_column");
+    _tablet_schema->append_column(sub_column);
+
+    std::vector<uint32_t> column_ids = {0};
+    VariantStatsCaculator calculator(_footer.get(), _tablet_schema, column_ids);
+
+    // Create block with all null values
+    vectorized::Block block;
+    auto nullable_column = create_nullable_column({true, true, true}, {"", "", ""});
+    block.insert({std::move(nullable_column),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sub_column"});
+
+    auto status = calculator.calculate_variant_stats(&block, 0, 3);
+    EXPECT_TRUE(status.ok()) << status.to_string();
+
+    // All null values should result in 0 non-null count
+    EXPECT_EQ(_footer->columns(0).none_null_size(), 0);
+}
+
+// Verifies that calculate_variant_stats() returns OK for a plain string column that carries
+// no path info. No footer column is registered in this test; only the status is checked.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithNoPathInfo) {
+    // Create regular column without path info
+    TabletColumn regular_column;
+    regular_column.set_unique_id(1);
+    regular_column.set_name("regular");
+    regular_column.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
+    // No path info set
+
+    _tablet_schema->append_column(regular_column);
+
+    std::vector<uint32_t> column_ids = {0};
+    VariantStatsCaculator calculator(_footer.get(), _tablet_schema, column_ids);
+
+    // Single-row, non-nullable string column matching the schema column above.
+    vectorized::Block block;
+    auto string_column = vectorized::ColumnString::create();
+    string_column->insert_data("test", 4);
+    block.insert(
+            {std::move(string_column), std::make_shared<vectorized::DataTypeString>(), "regular"});
+
+    auto status = calculator.calculate_variant_stats(&block, 0, 1);
+    // Should skip columns without path info
+    EXPECT_TRUE(status.ok()) << status.to_string();
+}
+
+// Verifies that calculate_variant_stats() adds to a non-null count already present in the
+// footer instead of overwriting it (5 pre-set + 2 non-null rows in the block == 7).
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsAccumulatesNonNullCount) {
+    add_footer_column_with_path(1, "sub_column");
+
+    // Set initial non-null count in footer
+    _footer->mutable_columns(0)->set_none_null_size(5);
+
+    TabletColumn sub_column = create_variant_column(2, "variant.sub_column", 1, "sub_column");
+    _tablet_schema->append_column(sub_column);
+
+    std::vector<uint32_t> column_ids = {0};
+    VariantStatsCaculator calculator(_footer.get(), _tablet_schema, column_ids);
+
+    // Null map {false, true, false}: two of the three rows are expected to count as non-null.
+    vectorized::Block block;
+    auto nullable_column = create_nullable_column({false, true, false}, {"a", "", "c"});
+    block.insert({std::move(nullable_column),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sub_column"});
+
+    auto status = calculator.calculate_variant_stats(&block, 0, 3);
+    EXPECT_TRUE(status.ok()) << status.to_string();
+
+    // Should accumulate: initial 5 + new 2 = 7
+    EXPECT_EQ(_footer->columns(0).none_null_size(), 7);
+}
+
+// Verifies that calculate_variant_stats() returns OK when the schema column is a VARIANT
+// column configured with variant_max_subcolumns_count == 0 while the block holds a nullable
+// string column. Only the returned status is checked; footer contents are not asserted.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithExtendedSchema) {
+    add_footer_column_with_path(1, "sub_column");
+    TabletColumn column;
+    column.set_unique_id(1);
+    column.set_name("variant");
+    column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+    column.set_variant_max_subcolumns_count(0);
+    _tablet_schema->append_column(column);
+
+    std::vector<uint32_t> column_ids = {0};
+    VariantStatsCaculator calculator(_footer.get(), _tablet_schema, column_ids);
+
+    vectorized::Block block;
+    auto nullable_column = create_nullable_column({false, true, false}, {"a", "", "c"});
+    block.insert({std::move(nullable_column),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sub_column"});
+
+    auto status = calculator.calculate_variant_stats(&block, 0, 3);
+    EXPECT_TRUE(status.ok()) << status.to_string();
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to