This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 00f8f74055b [Improvement](variant) improve performance of variant's 
caculate stats (#53077)
00f8f74055b is described below

commit 00f8f74055b259fe5e8777714c8c47114a03c110
Author: lihangyu <[email protected]>
AuthorDate: Fri Jul 11 10:26:50 2025 +0800

    [Improvement](variant) improve performance of variant's caculate stats 
(#53077)
---
 be/src/olap/rowset/segment_v2/segment_writer.cpp   |  64 +--
 be/src/olap/rowset/segment_v2/segment_writer.h     |   6 +-
 .../rowset/segment_v2/variant_stats_calculator.cpp | 111 +++++
 .../rowset/segment_v2/variant_stats_calculator.h   |  55 +++
 .../segment_v2/variant_stats_calculator_test.cpp   | 448 +++++++++++++++++++++
 5 files changed, 627 insertions(+), 57 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index 651c6f3ca92..5bae77a08e8 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -50,6 +50,7 @@
 #include "olap/rowset/segment_v2/inverted_index_writer.h"
 #include "olap/rowset/segment_v2/page_io.h"
 #include "olap/rowset/segment_v2/page_pointer.h"
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
 #include "olap/segment_loader.h"
 #include "olap/short_key_index.h"
 #include "olap/storage_engine.h"
@@ -319,6 +320,10 @@ Status SegmentWriter::init(const std::vector<uint32_t>& 
col_ids, bool has_key) {
 
     RETURN_IF_ERROR(_create_writers(_tablet_schema, col_ids));
 
+    // Initialize variant statistics calculator
+    _variant_stats_calculator =
+            std::make_unique<VariantStatsCaculator>(&_footer, _tablet_schema, 
col_ids);
+
     // we don't need the short key index for unique key merge on write table.
     if (_has_key) {
         if (_is_mow()) {
@@ -818,10 +823,10 @@ Status SegmentWriter::append_block(const 
vectorized::Block* block, size_t row_po
         }
         
RETURN_IF_ERROR(_column_writers[id]->append(converted_result.second->get_nullmap(),
                                                     
converted_result.second->get_data(), num_rows));
-
-        // caculate stats for variant type
-        // TODO it's tricky here, maybe come up with a better idea
-        _maybe_calculate_variant_stats(block, id, cid, row_pos, num_rows);
+    }
+    if (_opts.write_type == DataWriteType::TYPE_COMPACTION) {
+        RETURN_IF_ERROR(
+                _variant_stats_calculator->calculate_variant_stats(block, 
row_pos, num_rows));
     }
     if (_has_key) {
         if (_is_mow_with_cluster_key()) {
@@ -1337,56 +1342,5 @@ inline bool SegmentWriter::_is_mow_with_cluster_key() {
     return _is_mow() && !_tablet_schema->cluster_key_idxes().empty();
 }
 
-// Compaction will extend sparse column and is visible during read and write, 
in order to
-// persit variant stats info, we should do extra caculation during flushing 
segment, otherwise
-// the info is lost
-void SegmentWriter::_maybe_calculate_variant_stats(
-        const vectorized::Block* block,
-        size_t id,  // id is the offset of the column in the block
-        size_t cid, // cid is the column id in TabletSchema
-        size_t row_pos, size_t num_rows) {
-    const auto& tablet_column = _tablet_schema->columns()[cid];
-    // Only process sub columns and sparse columns during compaction
-    if (_tablet_schema->need_record_variant_extended_schema() || 
!tablet_column->has_path_info() ||
-        !tablet_column->path_info_ptr()->need_record_stats() ||
-        _opts.write_type != DataWriteType::TYPE_COMPACTION) {
-        return;
-    }
-
-    // Get parent column's unique ID for matching
-    int64_t parent_unique_id = tablet_column->parent_unique_id();
-
-    // Find matching column in footer
-    for (auto& column : *_footer.mutable_columns()) {
-        // Check if this is the target sparse column
-        if (!column.has_column_path_info() ||
-            column.column_path_info().parrent_column_unique_id() != 
parent_unique_id) {
-            continue;
-        }
-
-        // sprse column from variant column
-        if (column.column_path_info().path().ends_with(SPARSE_COLUMN_PATH)) {
-            // Found matching column, calculate statistics
-            auto* stats = column.mutable_variant_statistics();
-            
vectorized::schema_util::calculate_variant_stats(*block->get_by_position(id).column,
-                                                             stats, row_pos, 
num_rows);
-            VLOG_DEBUG << "sparse stats columns " << 
stats->sparse_column_non_null_size_size();
-            break;
-        }
-        // sub column from variant column
-        else if (column.column_path_info().path() == 
tablet_column->path_info_ptr()->get_path()) {
-            const auto& null_data = assert_cast<const 
vectorized::ColumnNullable&>(
-                                            *block->get_by_position(id).column)
-                                            .get_null_map_data();
-            const int8_t* start = (int8_t*)null_data.data() + row_pos;
-            // none null size in block + current none null size
-            size_t res = simd::count_zero_num(start, num_rows) + 
column.none_null_size();
-            column.set_none_null_size(res);
-            VLOG_DEBUG << "none null size " << res << " path: " << 
column.column_path_info().path();
-            break;
-        }
-    }
-}
-
 } // namespace segment_v2
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h 
b/be/src/olap/rowset/segment_v2/segment_writer.h
index 374af5134a1..b091b48c942 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.h
+++ b/be/src/olap/rowset/segment_v2/segment_writer.h
@@ -66,6 +66,8 @@ namespace segment_v2 {
 extern const char* k_segment_magic;
 extern const uint32_t k_segment_magic_length;
 
+class VariantStatsCaculator;
+
 struct SegmentWriterOptions {
     uint32_t num_rows_per_block = 1024;
     uint32_t max_rows_per_segment = UINT32_MAX;
@@ -173,8 +175,6 @@ private:
     Status _write_footer();
     Status _write_raw_data(const std::vector<Slice>& slices);
     void _maybe_invalid_row_cache(const std::string& key);
-    void _maybe_calculate_variant_stats(const vectorized::Block* block, size_t 
id, size_t cid,
-                                        size_t row_pos, size_t num_rows);
     std::string _encode_keys(const 
std::vector<vectorized::IOlapColumnDataAccessor*>& key_columns,
                              size_t pos);
     // used for unique-key with merge on write and segment min_max key
@@ -263,6 +263,8 @@ private:
     std::map<RowsetId, RowsetSharedPtr> _rsid_to_rowset;
     // contains auto generated columns, should be nullptr if no variants's 
subcolumns
     TabletSchemaSPtr _flush_schema = nullptr;
+    // variant statistics calculator for efficient stats collection
+    std::unique_ptr<VariantStatsCaculator> _variant_stats_calculator;
 };
 
 } // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp 
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
new file mode 100644
index 00000000000..aef71372666
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
+
+#include "common/logging.h"
+#include "util/simd/bits.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/common/schema_util.h"
+
+namespace doris::segment_v2 {
+
+// Stores the footer pointer, the tablet schema and a copy of `column_ids`,
+// then indexes the footer: every footer column that has path info and a
+// positive parent unique id is registered in `_path_to_footer_index` as
+// parent unique id -> path -> position of that column in the footer.
+VariantStatsCaculator::VariantStatsCaculator(
+        SegmentFooterPB* footer, TabletSchemaSPtr tablet_schema,
+        const std::vector<uint32_t>& column_ids)
+        : _footer(footer),
+          _tablet_schema(tablet_schema),
+          _column_ids(column_ids) {
+    const int footer_column_count = _footer->columns_size();
+    for (int idx = 0; idx < footer_column_count; ++idx) {
+        const auto& footer_column = _footer->columns(idx);
+        if (!footer_column.has_column_path_info()) {
+            continue;
+        }
+        const auto& path_info = footer_column.column_path_info();
+        const auto parent_uid = path_info.parrent_column_unique_id();
+        // Only columns attached to a parent need their stats recorded.
+        if (parent_uid > 0) {
+            _path_to_footer_index[parent_uid][path_info.path()] = idx;
+        }
+    }
+}
+
+// Accumulates variant statistics for every eligible column of `block` into
+// the segment footer.
+//
+// Returns OK immediately when the tablet schema reports
+// need_record_variant_extended_schema(). Otherwise block column `i` is
+// resolved in the tablet schema through `_column_ids[i]`; only columns that
+// have path info, whose path info asks for stats recording and whose parent
+// unique id is positive are processed. A path ending in
+// "__DORIS_VARIANT_SPARSE__" is handled as a sparse column, any other path
+// as a nullable sub column. `row_pos` and `num_rows` are forwarded unchanged
+// to the per-column helpers.
+//
+// Returns NotFound when the footer has no entry for the column's parent
+// unique id, or no entry for the column's path under that parent.
+//
+// NOTE(review): assumes `_column_ids` holds at least block->columns()
+// entries and that block column i corresponds to _column_ids[i]; confirm
+// at the call site.
+Status VariantStatsCaculator::calculate_variant_stats(
+        const vectorized::Block* block, size_t row_pos, size_t num_rows) {
+    if (_tablet_schema->need_record_variant_extended_schema()) {
+        return Status::OK();
+    }
+    for (size_t i = 0; i < block->columns(); ++i) {
+        const TabletColumn& tablet_column =
+                _tablet_schema->column(_column_ids[i]);
+        // Only sub columns and sparse columns of a variant carry stats.
+        if (!tablet_column.has_path_info() ||
+            !tablet_column.path_info_ptr()->need_record_stats() ||
+            tablet_column.parent_unique_id() <= 0) {
+            continue;
+        }
+        const std::string& column_path =
+                tablet_column.path_info_ptr()->get_path();
+
+        // Locate the parent variant column in the footer index.
+        auto parent_it =
+                _path_to_footer_index.find(tablet_column.parent_unique_id());
+        if (parent_it == _path_to_footer_index.end()) {
+            return Status::NotFound("Column path not found in footer: {}",
+                                    column_path);
+        }
+
+        // Look the path up with find() rather than operator[]: operator[]
+        // would default-insert index 0 for an unknown path, silently adding
+        // the stats to footer column 0 and polluting the index.
+        auto path_it = parent_it->second.find(column_path);
+        if (path_it == parent_it->second.end()) {
+            return Status::NotFound("Column path not found in footer: {}",
+                                    column_path);
+        }
+        const size_t footer_index = path_it->second;
+        ColumnMetaPB* column_meta = _footer->mutable_columns(footer_index);
+
+        const auto& column = block->get_by_position(i).column;
+        if (column_path.ends_with("__DORIS_VARIANT_SPARSE__")) {
+            // Sparse column of a variant column.
+            _calculate_sparse_column_stats(*column, column_meta, row_pos,
+                                           num_rows);
+        } else {
+            // Materialized sub column of a variant column.
+            _calculate_sub_column_stats(*column, column_meta, row_pos,
+                                        num_rows);
+        }
+    }
+    return Status::OK();
+}
+
+// Updates the variant statistics message of `column_meta` from `column`.
+// The computation is delegated to schema_util::calculate_variant_stats,
+// which receives `row_pos` and `num_rows` unchanged.
+void VariantStatsCaculator::_calculate_sparse_column_stats(
+        const vectorized::IColumn& column, ColumnMetaPB* column_meta,
+        size_t row_pos, size_t num_rows) {
+    // Statistics message stored inside this footer column.
+    auto* variant_stats = column_meta->mutable_variant_statistics();
+
+    vectorized::schema_util::calculate_variant_stats(column, variant_stats,
+                                                     row_pos, num_rows);
+
+    VLOG_DEBUG << "Sparse column stats updated, non-null size count: "
+               << variant_stats->sparse_column_non_null_size_size();
+}
+
+// Adds the number of non-null rows found in `num_rows` null-map entries of
+// `column`, starting at `row_pos`, to the none_null_size stored in
+// `column_meta`. `column` is cast to ColumnNullable via assert_cast.
+void VariantStatsCaculator::_calculate_sub_column_stats(
+        const vectorized::IColumn& column, ColumnMetaPB* column_meta,
+        size_t row_pos, size_t num_rows) {
+    const auto& null_map =
+            assert_cast<const vectorized::ColumnNullable&>(column)
+                    .get_null_map_data();
+    const int8_t* window =
+            reinterpret_cast<const int8_t*>(null_map.data()) + row_pos;
+
+    // A zero entry in the null map is counted as a non-null row.
+    const size_t added = simd::count_zero_num(window, num_rows);
+
+    // Accumulate on top of what earlier calls already recorded.
+    column_meta->set_none_null_size(added + column_meta->none_null_size());
+
+    VLOG_DEBUG << "Sub column non-null count updated: "
+               << column_meta->none_null_size() << " (added " << added
+               << " from current block)";
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.h 
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
new file mode 100644
index 00000000000..6ffd74036cb
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/tablet_schema.h"
+#include "vec/core/block.h"
+
+namespace doris::segment_v2 {
+
+// Accumulates statistics of variant sub columns and sparse columns into the
+// column metas of a segment footer.
+class VariantStatsCaculator {
+public:
+    // `footer` receives the statistics; its columns that have path info and
+    // a positive parent unique id are indexed at construction time.
+    // `column_ids` is copied; entry i is used as the tablet schema column id
+    // of block column i in calculate_variant_stats().
+    explicit VariantStatsCaculator(SegmentFooterPB* footer,
+                                   TabletSchemaSPtr tablet_schema,
+                                   const std::vector<uint32_t>& column_ids);
+
+    // Updates the footer statistics from the columns of `block`, passing
+    // `row_pos` and `num_rows` on to the per-column helpers. Returns
+    // NotFound when a column's parent is missing from the footer index.
+    Status calculate_variant_stats(const vectorized::Block* block,
+                                   size_t row_pos, size_t num_rows);
+
+private:
+    // Updates the variant statistics of `column_meta` for a sparse column.
+    void _calculate_sparse_column_stats(const vectorized::IColumn& column,
+                                        ColumnMetaPB* column_meta,
+                                        size_t row_pos, size_t num_rows);
+
+    // Adds the non-null row count of a nullable sub column to `column_meta`.
+    void _calculate_sub_column_stats(const vectorized::IColumn& column,
+                                     ColumnMetaPB* column_meta,
+                                     size_t row_pos, size_t num_rows);
+
+    // parent column unique id -> (column path -> index of the column in
+    // the footer), built once by the constructor for fast lookup.
+    std::unordered_map<int32_t, std::unordered_map<std::string, size_t>>
+            _path_to_footer_index;
+
+    // Footer whose column metas store the statistics; the class only keeps
+    // this raw pointer.
+    SegmentFooterPB* _footer;
+    TabletSchemaSPtr _tablet_schema;
+    std::vector<uint32_t> _column_ids;
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp 
b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
new file mode 100644
index 00000000000..edbda054825
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
@@ -0,0 +1,448 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
+
+#include <gtest/gtest.h>
+
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/tablet_schema.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_map.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_vector.h"
+#include "vec/core/block.h"
+#include "vec/core/column_with_type_and_name.h"
+#include "vec/data_types/data_type_map.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris::segment_v2 {
+
+// Fixture that gives every test a fresh, empty tablet schema and segment
+// footer, plus helpers to build schema columns, footer columns and block
+// columns.
+class VariantStatsCalculatorTest : public ::testing::Test {
+protected:
+    void SetUp() override {
+        _tablet_schema = std::make_shared<TabletSchema>();
+        _footer = std::make_unique<SegmentFooterPB>();
+    }
+
+    void TearDown() override {
+        _footer.reset();
+        _tablet_schema.reset();
+    }
+
+    // Builds a VARIANT tablet column named `name` with the given unique id
+    // and variant_max_subcolumns_count 1. Path info and the parent unique
+    // id are only set when `parent_unique_id` is positive and `path` is
+    // not empty.
+    TabletColumn create_variant_column(int32_t unique_id,
+                                       const std::string& name,
+                                       int32_t parent_unique_id = -1,
+                                       const std::string& path = "") {
+        TabletColumn result;
+        result.set_unique_id(unique_id);
+        result.set_name(name);
+        result.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+        const bool has_parent = parent_unique_id > 0 && !path.empty();
+        if (has_parent) {
+            vectorized::PathInData variant_path(path);
+            result.set_path_info(variant_path);
+            result.set_parent_unique_id(parent_unique_id);
+        }
+        result.set_variant_max_subcolumns_count(1);
+
+        return result;
+    }
+
+    // Appends a column meta to the footer whose path info holds `path` and
+    // `parent_unique_id`. Its unique id is 100 plus the footer column count
+    // after the append.
+    void add_footer_column_with_path(int32_t parent_unique_id,
+                                     const std::string& path) {
+        auto* meta = _footer->add_columns();
+        meta->set_unique_id(100 + _footer->columns_size());
+
+        auto* meta_path = meta->mutable_column_path_info();
+        meta_path->set_path(path);
+        meta_path->set_parrent_column_unique_id(parent_unique_id);
+    }
+
+    // Builds a nullable string column with one row per entry of `values`;
+    // row i is null when null_map[i] is true, otherwise it holds values[i].
+    vectorized::ColumnPtr create_nullable_column(
+            const std::vector<bool>& null_map,
+            const std::vector<std::string>& values) {
+        auto nested = vectorized::ColumnString::create();
+        auto nulls = vectorized::ColumnUInt8::create();
+
+        for (size_t row = 0; row < values.size(); ++row) {
+            const bool is_null = null_map[row];
+            if (!is_null) {
+                nested->insert_data(values[row].data(), values[row].length());
+                nulls->insert_value(0);
+            } else {
+                nested->insert_default();
+                nulls->insert_value(1);
+            }
+        }
+
+        return vectorized::ColumnNullable::create(std::move(nested),
+                                                  std::move(nulls));
+    }
+
+    // Builds the string -> string map column used as a sparse column: two
+    // key/value pairs and the offsets 0 and 2.
+    vectorized::ColumnPtr create_map_column() {
+        auto map_keys = vectorized::ColumnString::create();
+        auto map_values = vectorized::ColumnString::create();
+        auto map_offsets = vectorized::ColumnArray::ColumnOffsets::create();
+
+        map_keys->insert_data("key1", 4);
+        map_keys->insert_data("key2", 4);
+        map_values->insert_data("value1", 6);
+        map_values->insert_data("value2", 6);
+
+        map_offsets->insert_value(0);
+        map_offsets->insert_value(2);
+
+        return vectorized::ColumnMap::create(std::move(map_keys),
+                                             std::move(map_values),
+                                             std::move(map_offsets));
+    }
+
+    TabletSchemaSPtr _tablet_schema;
+    std::unique_ptr<SegmentFooterPB> _footer;
+};
+
+// A calculator built on a footer without columns must accept an empty
+// block.
+TEST_F(VariantStatsCalculatorTest, ConstructorWithEmptyFooter) {
+    std::vector<uint32_t> cids = {0, 1, 2};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    // The block has no columns, so there is nothing to iterate over.
+    vectorized::Block empty_block;
+    auto result = stats_calculator.calculate_variant_stats(&empty_block, 0, 0);
+    EXPECT_TRUE(result.ok());
+}
+
+// A calculator built on a footer that already has variant paths must
+// still accept an empty block.
+TEST_F(VariantStatsCalculatorTest, ConstructorWithValidFooter) {
+    // Three footer columns spread over two parent ids.
+    add_footer_column_with_path(1, "sub_column_1");
+    add_footer_column_with_path(1, "sub_column_2.__DORIS_VARIANT_SPARSE__");
+    add_footer_column_with_path(2, "another_sub_column");
+
+    std::vector<uint32_t> cids = {0, 1, 2};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    // The footer was indexed by the constructor; an empty block is a no-op.
+    vectorized::Block empty_block;
+    auto result = stats_calculator.calculate_variant_stats(&empty_block, 0, 0);
+    EXPECT_TRUE(result.ok());
+}
+
+// A schema with only a plain INT column yields OK.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithNoVariantColumns) {
+    // Plain INT column: no path info is ever set on it.
+    TabletColumn int_tablet_column;
+    int_tablet_column.set_unique_id(1);
+    int_tablet_column.set_name("regular_col");
+    int_tablet_column.set_type(FieldType::OLAP_FIELD_TYPE_INT);
+    _tablet_schema->append_column(int_tablet_column);
+
+    // One-row block holding the value 123.
+    auto values = vectorized::ColumnVector<int32_t>::create();
+    values->insert_value(123);
+    vectorized::Block block;
+    block.insert({std::move(values),
+                  std::make_shared<vectorized::DataTypeInt32>(),
+                  "regular_col"});
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 1);
+    EXPECT_TRUE(result.ok());
+}
+
+// A nullable sub column adds its non-null row count to the footer entry.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSubColumn) {
+    // Footer and schema agree on parent id 1 and path "sub_column_1".
+    add_footer_column_with_path(1, "sub_column_1");
+    _tablet_schema->append_column(create_variant_column(
+            2, "variant_col.sub_column_1", 1, "sub_column_1"));
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    // Three rows, the middle one is null.
+    auto rows = create_nullable_column({false, true, false},
+                                       {"val1", "", "val3"});
+    const auto nullable_string_type =
+            std::make_shared<vectorized::DataTypeNullable>(
+                    std::make_shared<vectorized::DataTypeString>());
+    vectorized::Block block;
+    block.insert({std::move(rows), nullable_string_type, "sub_column_1"});
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 3);
+    EXPECT_TRUE(result.ok());
+
+    // Two of the three rows are non-null.
+    EXPECT_EQ(_footer->columns(0).none_null_size(), 2);
+}
+
+// A sparse column path creates variant statistics on its footer column.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSparseColumn) {
+    // Footer and schema share the path ending in the sparse column suffix.
+    add_footer_column_with_path(1, "sparse_col.__DORIS_VARIANT_SPARSE__");
+    _tablet_schema->append_column(create_variant_column(
+            2, "variant_col.__DORIS_VARIANT_SPARSE__", 1,
+            "sparse_col.__DORIS_VARIANT_SPARSE__"));
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    // The sparse column is represented by a string -> string map column.
+    auto sparse_data = create_map_column();
+    const auto string_map_type = std::make_shared<vectorized::DataTypeMap>(
+            std::make_shared<vectorized::DataTypeString>(),
+            std::make_shared<vectorized::DataTypeString>());
+    vectorized::Block block;
+    block.insert({std::move(sparse_data), string_map_type, "sparse_column"});
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 1);
+    EXPECT_TRUE(result.ok());
+
+    // The footer column must now carry a variant statistics message.
+    EXPECT_TRUE(_footer->columns(0).has_variant_statistics());
+}
+
+// With an empty footer the sub column's parent is unknown: NOT_FOUND.
+TEST_F(VariantStatsCalculatorTest,
+       CalculateVariantStatsWithMissingFooterEntry) {
+    // The schema has a sub column, but the footer is left without columns.
+    _tablet_schema->append_column(create_variant_column(
+            2, "variant_col.missing_sub", 1, "missing_sub"));
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    auto rows = create_nullable_column({false, true}, {"val1", ""});
+    const auto nullable_string_type =
+            std::make_shared<vectorized::DataTypeNullable>(
+                    std::make_shared<vectorized::DataTypeString>());
+    vectorized::Block block;
+    block.insert({std::move(rows), nullable_string_type, "missing_sub"});
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 2);
+    EXPECT_FALSE(result.ok());
+    EXPECT_TRUE(result.is<ErrorCode::NOT_FOUND>());
+}
+
+// The footer only knows parent id 1; the schema column uses parent id
+// 1111 and another path, so the footer lookup must fail.
+TEST_F(VariantStatsCalculatorTest,
+       CalculateVariantStatsWithMissingPathInFooter) {
+    // Footer entry: parent id 1, path "different_path".
+    add_footer_column_with_path(1, "different_path");
+
+    // Schema column: parent id 1111, path "sub_column". Neither the parent
+    // id nor the path matches the footer entry above.
+    _tablet_schema->append_column(create_variant_column(
+            2, "variant_col.sub_column", 1111, "sub_column"));
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    auto rows = create_nullable_column({false}, {"val1"});
+    const auto nullable_string_type =
+            std::make_shared<vectorized::DataTypeNullable>(
+                    std::make_shared<vectorized::DataTypeString>());
+    vectorized::Block block;
+    block.insert({std::move(rows), nullable_string_type, "sub_column"});
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 1);
+    EXPECT_FALSE(result.ok()) << result.to_string();
+}
+
+// Processes a block with two sub columns and one sparse column in a single
+// call and checks that each footer column receives its own statistics.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMultipleColumns) {
+    // Footer columns 0..2, in the same order as the schema columns below.
+    add_footer_column_with_path(1, "sub1");
+    add_footer_column_with_path(1, "sub2.__DORIS_VARIANT_SPARSE__");
+    add_footer_column_with_path(2, "another_sub");
+
+    // Matching schema columns: two belong to parent 1, one to parent 2.
+    TabletColumn sub1 = create_variant_column(2, "variant.sub1", 1, "sub1");
+    TabletColumn sparse =
+            create_variant_column(3, "variant.__DORIS_VARIANT_SPARSE__", 1,
+                                  "sub2.__DORIS_VARIANT_SPARSE__");
+    TabletColumn sub2 =
+            create_variant_column(4, "variant2.another_sub", 2, "another_sub");
+
+    _tablet_schema->append_column(sub1);
+    _tablet_schema->append_column(sparse);
+    _tablet_schema->append_column(sub2);
+
+    std::vector<uint32_t> column_ids = {0, 1, 2};
+    VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
+                                     column_ids);
+
+    // Block columns are inserted in the same order as `column_ids`.
+    vectorized::Block block;
+
+    // sub1: rows 0 and 2 are non-null.
+    auto nullable_col1 =
+            create_nullable_column({false, true, false}, {"a", "", "c"});
+    block.insert({std::move(nullable_col1),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sub1"});
+
+    // Sparse map column, extended with three default rows.
+    auto map_col = create_map_column();
+    map_col->assume_mutable()->insert_many_defaults(3);
+    block.insert({std::move(map_col),
+                  std::make_shared<vectorized::DataTypeMap>(
+                          std::make_shared<vectorized::DataTypeString>(),
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "sparse"});
+
+    // another_sub: only row 1 is non-null.
+    auto nullable_col2 =
+            create_nullable_column({true, false, true}, {"", "x", ""});
+    block.insert({std::move(nullable_col2),
+                  std::make_shared<vectorized::DataTypeNullable>(
+                          std::make_shared<vectorized::DataTypeString>()),
+                  "another_sub"});
+
+    auto status = calculator.calculate_variant_stats(&block, 0, 3);
+    EXPECT_TRUE(status.ok());
+
+    // sub1: 2 non-null rows
+    EXPECT_EQ(_footer->columns(0).none_null_size(), 2);
+    // sparse column: statistics message present
+    EXPECT_TRUE(_footer->columns(1).has_variant_statistics());
+    // another_sub: 1 non-null row
+    EXPECT_EQ(_footer->columns(2).none_null_size(), 1);
+}
+
+// A zero-row sub column leaves the footer's non-null count at 0.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithEmptyBlock) {
+    add_footer_column_with_path(1, "sub_column");
+    _tablet_schema->append_column(
+            create_variant_column(2, "variant.sub_column", 1, "sub_column"));
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    // The block has one column, but that column has no rows.
+    auto no_rows = create_nullable_column({}, {});
+    const auto nullable_string_type =
+            std::make_shared<vectorized::DataTypeNullable>(
+                    std::make_shared<vectorized::DataTypeString>());
+    vectorized::Block block;
+    block.insert({std::move(no_rows), nullable_string_type, "sub_column"});
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 0);
+    EXPECT_TRUE(result.ok());
+
+    // Nothing was counted, so the stored size is still 0.
+    EXPECT_EQ(_footer->columns(0).none_null_size(), 0);
+}
+
+// A sub column whose rows are all null contributes 0 non-null rows.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithAllNullValues) {
+    add_footer_column_with_path(1, "sub_column");
+    _tablet_schema->append_column(
+            create_variant_column(2, "variant.sub_column", 1, "sub_column"));
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    // Three rows, every one of them null.
+    auto all_null = create_nullable_column({true, true, true}, {"", "", ""});
+    const auto nullable_string_type =
+            std::make_shared<vectorized::DataTypeNullable>(
+                    std::make_shared<vectorized::DataTypeString>());
+    vectorized::Block block;
+    block.insert({std::move(all_null), nullable_string_type, "sub_column"});
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 3);
+    EXPECT_TRUE(result.ok());
+
+    EXPECT_EQ(_footer->columns(0).none_null_size(), 0);
+}
+
+// A STRING column on which no path info was set yields OK.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithNoPathInfo) {
+    // set_path_info() is deliberately never called on this column.
+    TabletColumn string_tablet_column;
+    string_tablet_column.set_unique_id(1);
+    string_tablet_column.set_name("regular");
+    string_tablet_column.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
+    _tablet_schema->append_column(string_tablet_column);
+
+    // One-row block holding "test".
+    auto text = vectorized::ColumnString::create();
+    text->insert_data("test", 4);
+    vectorized::Block block;
+    block.insert({std::move(text),
+                  std::make_shared<vectorized::DataTypeString>(), "regular"});
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 1);
+    EXPECT_TRUE(result.ok()); // columns without path info are skipped
+}
+
+// The non-null count is added to the value already stored in the footer.
+TEST_F(VariantStatsCalculatorTest,
+       CalculateVariantStatsAccumulatesNonNullCount) {
+    add_footer_column_with_path(1, "sub_column");
+    // Pretend that five non-null rows were recorded earlier.
+    _footer->mutable_columns(0)->set_none_null_size(5);
+
+    _tablet_schema->append_column(
+            create_variant_column(2, "variant.sub_column", 1, "sub_column"));
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    // Three rows, two of them non-null.
+    auto rows = create_nullable_column({false, true, false}, {"a", "", "c"});
+    const auto nullable_string_type =
+            std::make_shared<vectorized::DataTypeNullable>(
+                    std::make_shared<vectorized::DataTypeString>());
+    vectorized::Block block;
+    block.insert({std::move(rows), nullable_string_type, "sub_column"});
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 3);
+    EXPECT_TRUE(result.ok());
+
+    // 5 recorded before + 2 from this block = 7.
+    EXPECT_EQ(_footer->columns(0).none_null_size(), 7);
+}
+
+// A VARIANT column with variant_max_subcolumns_count 0 and no path info
+// yields OK.
+// NOTE(review): presumably a count of 0 makes
+// need_record_variant_extended_schema() return true; verify in TabletSchema.
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithExtendedSchema) {
+    add_footer_column_with_path(1, "sub_column");
+
+    TabletColumn variant_root;
+    variant_root.set_unique_id(1);
+    variant_root.set_name("variant");
+    variant_root.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+    variant_root.set_variant_max_subcolumns_count(0);
+    _tablet_schema->append_column(variant_root);
+
+    std::vector<uint32_t> cids = {0};
+    VariantStatsCaculator stats_calculator(_footer.get(), _tablet_schema,
+                                           cids);
+
+    auto rows = create_nullable_column({false, true, false}, {"a", "", "c"});
+    const auto nullable_string_type =
+            std::make_shared<vectorized::DataTypeNullable>(
+                    std::make_shared<vectorized::DataTypeString>());
+    vectorized::Block block;
+    block.insert({std::move(rows), nullable_string_type, "sub_column"});
+
+    auto result = stats_calculator.calculate_variant_stats(&block, 0, 3);
+    EXPECT_TRUE(result.ok());
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to