This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 00f8f74055b [Improvement](variant) improve performance of variant's
caculate stats (#53077)
00f8f74055b is described below
commit 00f8f74055b259fe5e8777714c8c47114a03c110
Author: lihangyu <[email protected]>
AuthorDate: Fri Jul 11 10:26:50 2025 +0800
[Improvement](variant) improve performance of variant's caculate stats
(#53077)
---
be/src/olap/rowset/segment_v2/segment_writer.cpp | 64 +--
be/src/olap/rowset/segment_v2/segment_writer.h | 6 +-
.../rowset/segment_v2/variant_stats_calculator.cpp | 111 +++++
.../rowset/segment_v2/variant_stats_calculator.h | 55 +++
.../segment_v2/variant_stats_calculator_test.cpp | 448 +++++++++++++++++++++
5 files changed, 627 insertions(+), 57 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index 651c6f3ca92..5bae77a08e8 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -50,6 +50,7 @@
#include "olap/rowset/segment_v2/inverted_index_writer.h"
#include "olap/rowset/segment_v2/page_io.h"
#include "olap/rowset/segment_v2/page_pointer.h"
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
#include "olap/segment_loader.h"
#include "olap/short_key_index.h"
#include "olap/storage_engine.h"
@@ -319,6 +320,10 @@ Status SegmentWriter::init(const std::vector<uint32_t>&
col_ids, bool has_key) {
RETURN_IF_ERROR(_create_writers(_tablet_schema, col_ids));
+ // Initialize variant statistics calculator
+ _variant_stats_calculator =
+ std::make_unique<VariantStatsCaculator>(&_footer, _tablet_schema,
col_ids);
+
// we don't need the short key index for unique key merge on write table.
if (_has_key) {
if (_is_mow()) {
@@ -818,10 +823,10 @@ Status SegmentWriter::append_block(const
vectorized::Block* block, size_t row_po
}
RETURN_IF_ERROR(_column_writers[id]->append(converted_result.second->get_nullmap(),
converted_result.second->get_data(), num_rows));
-
- // caculate stats for variant type
- // TODO it's tricky here, maybe come up with a better idea
- _maybe_calculate_variant_stats(block, id, cid, row_pos, num_rows);
+ }
+ if (_opts.write_type == DataWriteType::TYPE_COMPACTION) {
+ RETURN_IF_ERROR(
+ _variant_stats_calculator->calculate_variant_stats(block,
row_pos, num_rows));
}
if (_has_key) {
if (_is_mow_with_cluster_key()) {
@@ -1337,56 +1342,5 @@ inline bool SegmentWriter::_is_mow_with_cluster_key() {
return _is_mow() && !_tablet_schema->cluster_key_idxes().empty();
}
-// Compaction will extend sparse column and is visible during read and write,
in order to
-// persit variant stats info, we should do extra caculation during flushing
segment, otherwise
-// the info is lost
-void SegmentWriter::_maybe_calculate_variant_stats(
- const vectorized::Block* block,
- size_t id, // id is the offset of the column in the block
- size_t cid, // cid is the column id in TabletSchema
- size_t row_pos, size_t num_rows) {
- const auto& tablet_column = _tablet_schema->columns()[cid];
- // Only process sub columns and sparse columns during compaction
- if (_tablet_schema->need_record_variant_extended_schema() ||
!tablet_column->has_path_info() ||
- !tablet_column->path_info_ptr()->need_record_stats() ||
- _opts.write_type != DataWriteType::TYPE_COMPACTION) {
- return;
- }
-
- // Get parent column's unique ID for matching
- int64_t parent_unique_id = tablet_column->parent_unique_id();
-
- // Find matching column in footer
- for (auto& column : *_footer.mutable_columns()) {
- // Check if this is the target sparse column
- if (!column.has_column_path_info() ||
- column.column_path_info().parrent_column_unique_id() !=
parent_unique_id) {
- continue;
- }
-
- // sprse column from variant column
- if (column.column_path_info().path().ends_with(SPARSE_COLUMN_PATH)) {
- // Found matching column, calculate statistics
- auto* stats = column.mutable_variant_statistics();
-
vectorized::schema_util::calculate_variant_stats(*block->get_by_position(id).column,
- stats, row_pos,
num_rows);
- VLOG_DEBUG << "sparse stats columns " <<
stats->sparse_column_non_null_size_size();
- break;
- }
- // sub column from variant column
- else if (column.column_path_info().path() ==
tablet_column->path_info_ptr()->get_path()) {
- const auto& null_data = assert_cast<const
vectorized::ColumnNullable&>(
- *block->get_by_position(id).column)
- .get_null_map_data();
- const int8_t* start = (int8_t*)null_data.data() + row_pos;
- // none null size in block + current none null size
- size_t res = simd::count_zero_num(start, num_rows) +
column.none_null_size();
- column.set_none_null_size(res);
- VLOG_DEBUG << "none null size " << res << " path: " <<
column.column_path_info().path();
- break;
- }
- }
-}
-
} // namespace segment_v2
} // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h
b/be/src/olap/rowset/segment_v2/segment_writer.h
index 374af5134a1..b091b48c942 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.h
+++ b/be/src/olap/rowset/segment_v2/segment_writer.h
@@ -66,6 +66,8 @@ namespace segment_v2 {
extern const char* k_segment_magic;
extern const uint32_t k_segment_magic_length;
+class VariantStatsCaculator;
+
struct SegmentWriterOptions {
uint32_t num_rows_per_block = 1024;
uint32_t max_rows_per_segment = UINT32_MAX;
@@ -173,8 +175,6 @@ private:
Status _write_footer();
Status _write_raw_data(const std::vector<Slice>& slices);
void _maybe_invalid_row_cache(const std::string& key);
- void _maybe_calculate_variant_stats(const vectorized::Block* block, size_t
id, size_t cid,
- size_t row_pos, size_t num_rows);
std::string _encode_keys(const
std::vector<vectorized::IOlapColumnDataAccessor*>& key_columns,
size_t pos);
// used for unique-key with merge on write and segment min_max key
@@ -263,6 +263,8 @@ private:
std::map<RowsetId, RowsetSharedPtr> _rsid_to_rowset;
// contains auto generated columns, should be nullptr if no variants's
subcolumns
TabletSchemaSPtr _flush_schema = nullptr;
+ // variant statistics calculator for efficient stats collection
+ std::unique_ptr<VariantStatsCaculator> _variant_stats_calculator;
};
} // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
new file mode 100644
index 00000000000..aef71372666
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
+
+#include "common/logging.h"
+#include "util/simd/bits.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/common/schema_util.h"
+
+namespace doris::segment_v2 {
+
+// Builds the lookup table used by calculate_variant_stats(): for every footer column
+// that has path info and a positive parent unique id, remember at which footer index
+// the (parent unique id, path) pair lives.
+VariantStatsCaculator::VariantStatsCaculator(SegmentFooterPB* footer,
+                                             TabletSchemaSPtr tablet_schema,
+                                             const std::vector<uint32_t>& column_ids)
+        : _footer(footer), _tablet_schema(tablet_schema), _column_ids(column_ids) {
+    const int footer_column_count = _footer->columns_size();
+    for (int idx = 0; idx < footer_column_count; ++idx) {
+        const auto& meta = _footer->columns(idx);
+        if (!meta.has_column_path_info()) {
+            // plain column, no path to index
+            continue;
+        }
+        const auto& path_info = meta.column_path_info();
+        const auto parent_uid = path_info.parrent_column_unique_id();
+        if (parent_uid <= 0) {
+            // no parent recorded for this path, so no stats are tracked for it
+            continue;
+        }
+        _path_to_footer_index[parent_uid][path_info.path()] = static_cast<size_t>(idx);
+    }
+}
+
+// Accumulates variant statistics for `num_rows` rows of `block`, starting at `row_pos`,
+// into the segment footer.
+//
+// Block column i is described by tablet column `_column_ids[i]`. A column is processed
+// only when it has path info, its path info asks for stats (need_record_stats()) and its
+// parent unique id is positive; every other column is skipped. Nothing is recorded when
+// the tablet schema reports need_record_variant_extended_schema().
+//
+// Returns NotFound when a processed column has no footer entry, i.e. when either its
+// parent unique id or its path was not indexed by the constructor.
+Status VariantStatsCaculator::calculate_variant_stats(const vectorized::Block* block,
+                                                      size_t row_pos, size_t num_rows) {
+    // Stats are only recorded for variant columns without an extended schema.
+    if (_tablet_schema->need_record_variant_extended_schema()) {
+        return Status::OK();
+    }
+    // Never index past _column_ids, even if the block carries more columns than ids.
+    const size_t block_columns = block->columns();
+    const size_t num_columns =
+            block_columns < _column_ids.size() ? block_columns : _column_ids.size();
+    for (size_t i = 0; i < num_columns; ++i) {
+        const TabletColumn& tablet_column = _tablet_schema->column(_column_ids[i]);
+        // Only sub columns and sparse columns of a variant are of interest.
+        if (!tablet_column.has_path_info() ||
+            !tablet_column.path_info_ptr()->need_record_stats() ||
+            tablet_column.parent_unique_id() <= 0) {
+            continue;
+        }
+        const std::string& column_path = tablet_column.path_info_ptr()->get_path();
+
+        // First level: parent unique id -> (path -> footer index).
+        auto parent_it = _path_to_footer_index.find(tablet_column.parent_unique_id());
+        if (parent_it == _path_to_footer_index.end()) {
+            return Status::NotFound("Column path not found in footer: {}", column_path);
+        }
+        // Second level: use find() instead of operator[]. operator[] would default-insert
+        // index 0 for an unknown path and the stats would silently land on footer column 0.
+        auto path_it = parent_it->second.find(column_path);
+        if (path_it == parent_it->second.end()) {
+            return Status::NotFound("Column path not found in footer: {}", column_path);
+        }
+        const size_t footer_index = path_it->second;
+        ColumnMetaPB* column_meta = _footer->mutable_columns(footer_index);
+
+        // Get the column from the block
+        const auto& column = block->get_by_position(i).column;
+
+        // The path suffix tells sparse columns apart from regular sub columns.
+        if (column_path.ends_with("__DORIS_VARIANT_SPARSE__")) {
+            _calculate_sparse_column_stats(*column, column_meta, row_pos, num_rows);
+        } else {
+            _calculate_sub_column_stats(*column, column_meta, row_pos, num_rows);
+        }
+    }
+    return Status::OK();
+}
+
+// Updates the VariantStatisticsPB stored in `column_meta` from a variant sparse column.
+// The row window (row_pos, num_rows) is passed through unchanged to the schema_util helper.
+void VariantStatsCaculator::_calculate_sparse_column_stats(const vectorized::IColumn& column,
+                                                           ColumnMetaPB* column_meta,
+                                                           size_t row_pos, size_t num_rows) {
+    // statistics message of this footer column, obtained through the mutable accessor
+    auto* variant_stats = column_meta->mutable_variant_statistics();
+
+    // the accumulation itself is delegated to the shared helper
+    vectorized::schema_util::calculate_variant_stats(column, variant_stats, row_pos, num_rows);
+
+    VLOG_DEBUG << "Sparse column stats updated, non-null size count: "
+               << variant_stats->sparse_column_non_null_size_size();
+}
+
+// Adds the number of non-null rows found in `num_rows` rows of a nullable variant sub
+// column, starting at `row_pos`, to the running none_null_size kept in `column_meta`.
+void VariantStatsCaculator::_calculate_sub_column_stats(const vectorized::IColumn& column,
+                                                        ColumnMetaPB* column_meta, size_t row_pos,
+                                                        size_t num_rows) {
+    // The column is cast to ColumnNullable; zero entries of its null map are counted below.
+    const auto& null_map =
+            assert_cast<const vectorized::ColumnNullable&>(column).get_null_map_data();
+    const int8_t* window_begin = reinterpret_cast<const int8_t*>(null_map.data()) + row_pos;
+
+    // zero entries in the window == non-null rows contributed by this block
+    const size_t added = simd::count_zero_num(window_begin, num_rows);
+
+    // accumulate on top of what earlier blocks already contributed
+    column_meta->set_none_null_size(added + column_meta->none_null_size());
+
+    VLOG_DEBUG << "Sub column non-null count updated: " << column_meta->none_null_size()
+               << " (added " << added << " from current block)";
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
new file mode 100644
index 00000000000..6ffd74036cb
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/tablet_schema.h"
+#include "vec/core/block.h"
+
+namespace doris::segment_v2 {
+
+class VariantStatsCaculator {
+public:
+ explicit VariantStatsCaculator(SegmentFooterPB* footer, TabletSchemaSPtr
tablet_schema,
+ const std::vector<uint32_t>& column_ids);
+
+ // Walk the columns of `block` and accumulate variant stats for num_rows rows starting at row_pos into the footer
+ Status calculate_variant_stats(const vectorized::Block* block, size_t
row_pos, size_t num_rows);
+
+private:
+ // parent column unique id -> (column path -> index of that column in the footer), built once in the constructor
+ std::unordered_map<int32_t, std::unordered_map<std::string, size_t>>
_path_to_footer_index;
+
+ // Raw pointer to the footer that receives the statistics; this class never deletes it
+ SegmentFooterPB* _footer;
+ TabletSchemaSPtr _tablet_schema;
+ std::vector<uint32_t> _column_ids;
+
+ // Helper method to update the variant statistics of a sparse column's footer entry
+ void _calculate_sparse_column_stats(const vectorized::IColumn& column,
+ ColumnMetaPB* column_meta, size_t
row_pos, size_t num_rows);
+
+ // Helper method to add a sub column's non-null row count to its footer entry
+ void _calculate_sub_column_stats(const vectorized::IColumn& column,
ColumnMetaPB* column_meta,
+ size_t row_pos, size_t num_rows);
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
new file mode 100644
index 00000000000..edbda054825
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
@@ -0,0 +1,448 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
+
+#include <gtest/gtest.h>
+
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/tablet_schema.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_map.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_vector.h"
+#include "vec/core/block.h"
+#include "vec/core/column_with_type_and_name.h"
+#include "vec/data_types/data_type_map.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris::segment_v2 {
+
+class VariantStatsCalculatorTest : public ::testing::Test {
+protected:
+ void SetUp() override {
+ // Create a mock tablet schema
+ _tablet_schema = std::make_shared<TabletSchema>();
+
+ // Create a segment footer
+ _footer = std::make_unique<SegmentFooterPB>();
+ }
+
+ void TearDown() override {
+ _footer.reset();
+ _tablet_schema.reset();
+ }
+
+ // Helper method to create a mock column with path info
+ TabletColumn create_variant_column(int32_t unique_id, const std::string&
name,
+ int32_t parent_unique_id = -1,
+ const std::string& path = "") {
+ TabletColumn column;
+ column.set_unique_id(unique_id);
+ column.set_name(name);
+ column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+ if (parent_unique_id > 0 && !path.empty()) {
+ vectorized::PathInData path_info(path);
+ column.set_path_info(path_info);
+ column.set_parent_unique_id(parent_unique_id);
+ }
+ column.set_variant_max_subcolumns_count(1);
+
+ return column;
+ }
+
+ // Helper method to create a footer column with path info
+ void add_footer_column_with_path(int32_t parent_unique_id, const
std::string& path) {
+ auto* column_meta = _footer->add_columns();
+ column_meta->set_unique_id(100 + _footer->columns_size());
+
+ auto* path_info = column_meta->mutable_column_path_info();
+ path_info->set_path(path);
+ path_info->set_parrent_column_unique_id(parent_unique_id);
+ }
+
+ // Helper method to create a nullable column for testing
+ vectorized::ColumnPtr create_nullable_column(const std::vector<bool>&
null_map,
+ const
std::vector<std::string>& values) {
+ auto string_column = vectorized::ColumnString::create();
+ auto null_column = vectorized::ColumnUInt8::create();
+
+ for (size_t i = 0; i < values.size(); ++i) {
+ if (null_map[i]) {
+ string_column->insert_default();
+ null_column->insert_value(1);
+ } else {
+ string_column->insert_data(values[i].data(),
values[i].length());
+ null_column->insert_value(0);
+ }
+ }
+
+ return vectorized::ColumnNullable::create(std::move(string_column),
std::move(null_column));
+ }
+
+ // Helper method to create a map column (sparse column)
+ vectorized::ColumnPtr create_map_column() {
+ auto keys = vectorized::ColumnString::create();
+ auto values = vectorized::ColumnString::create();
+ auto offsets = vectorized::ColumnArray::ColumnOffsets::create();
+
+ // Add some sample data
+ keys->insert_data("key1", 4);
+ values->insert_data("value1", 6);
+ keys->insert_data("key2", 4);
+ values->insert_data("value2", 6);
+
+ offsets->insert_value(0);
+ offsets->insert_value(2);
+
+ return vectorized::ColumnMap::create(std::move(keys),
std::move(values),
+ std::move(offsets));
+ }
+
+ TabletSchemaSPtr _tablet_schema;
+ std::unique_ptr<SegmentFooterPB> _footer;
+};
+
+TEST_F(VariantStatsCalculatorTest, ConstructorWithEmptyFooter) {
+ std::vector<uint32_t> column_ids = {0, 1, 2};
+
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Test with empty footer - should not crash
+ vectorized::Block block;
+ auto status = calculator.calculate_variant_stats(&block, 0, 0);
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(VariantStatsCalculatorTest, ConstructorWithValidFooter) {
+ // Add some columns with path info to footer
+ add_footer_column_with_path(1, "sub_column_1");
+ add_footer_column_with_path(1, "sub_column_2.__DORIS_VARIANT_SPARSE__");
+ add_footer_column_with_path(2, "another_sub_column");
+
+ std::vector<uint32_t> column_ids = {0, 1, 2};
+
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Constructor should have built the path mapping
+ vectorized::Block block;
+ auto status = calculator.calculate_variant_stats(&block, 0, 0);
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithNoVariantColumns) {
+ // Create tablet schema with regular columns (no variant columns)
+ TabletColumn regular_column;
+ regular_column.set_unique_id(1);
+ regular_column.set_name("regular_col");
+ regular_column.set_type(FieldType::OLAP_FIELD_TYPE_INT);
+
+ _tablet_schema->append_column(regular_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create a simple block
+ vectorized::Block block;
+ auto int_column = vectorized::ColumnVector<int32_t>::create();
+ int_column->insert_value(123);
+ block.insert(
+ {std::move(int_column),
std::make_shared<vectorized::DataTypeInt32>(), "regular_col"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 1);
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSubColumn) {
+ // Setup footer with sub column
+ add_footer_column_with_path(1, "sub_column_1");
+
+ // Create variant sub column
+ TabletColumn sub_column =
+ create_variant_column(2, "variant_col.sub_column_1", 1,
"sub_column_1");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with nullable column
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false, true, false},
{"val1", "", "val3"});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column_1"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+
+ // Check that non-null size was updated
+ auto& column_meta = _footer->columns(0);
+ EXPECT_EQ(column_meta.none_null_size(), 2); // 2 non-null values
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSparseColumn) {
+ // Setup footer with sparse column
+ add_footer_column_with_path(1, "sparse_col.__DORIS_VARIANT_SPARSE__");
+
+ // Create variant sparse column
+ TabletColumn sparse_column = create_variant_column(2,
"variant_col.__DORIS_VARIANT_SPARSE__", 1,
+
"sparse_col.__DORIS_VARIANT_SPARSE__");
+ _tablet_schema->append_column(sparse_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with map column (sparse column)
+ vectorized::Block block;
+ auto map_column = create_map_column();
+ block.insert({std::move(map_column),
+ std::make_shared<vectorized::DataTypeMap>(
+ std::make_shared<vectorized::DataTypeString>(),
+ std::make_shared<vectorized::DataTypeString>()),
+ "sparse_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 1);
+ EXPECT_TRUE(status.ok());
+
+ // Check that variant statistics were updated
+ auto& column_meta = _footer->columns(0);
+ EXPECT_TRUE(column_meta.has_variant_statistics());
+}
+
+TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithMissingFooterEntry) {
+ // Create variant sub column but don't add corresponding footer entry
+ TabletColumn sub_column = create_variant_column(2,
"variant_col.missing_sub", 1, "missing_sub");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with nullable column
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false, true}, {"val1", ""});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "missing_sub"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 2);
+ EXPECT_FALSE(status.ok());
+ EXPECT_TRUE(status.is<ErrorCode::NOT_FOUND>());
+}
+
+TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithMissingPathInFooter) {
+ // Footer only contains parent unique id 1 with path "different_path"
+ add_footer_column_with_path(1, "different_path");
+
+ // Sub column uses parent unique id 1111, which is absent from the footer, so the lookup already fails on the parent id
+ TabletColumn sub_column =
+ create_variant_column(2, "variant_col.sub_column", 1111,
"sub_column");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with nullable column
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false}, {"val1"});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 1);
+ EXPECT_FALSE(status.ok()) << status.to_string();
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMultipleColumns) {
+ // Setup footer with multiple columns
+ add_footer_column_with_path(1, "sub1");
+ add_footer_column_with_path(1, "sub2.__DORIS_VARIANT_SPARSE__");
+ add_footer_column_with_path(2, "another_sub");
+
+ // Create multiple variant columns
+ TabletColumn sub1 = create_variant_column(2, "variant.sub1", 1, "sub1");
+ TabletColumn sparse = create_variant_column(3,
"variant.__DORIS_VARIANT_SPARSE__", 1,
+
"sub2.__DORIS_VARIANT_SPARSE__");
+ TabletColumn sub2 = create_variant_column(4, "variant2.another_sub", 2,
"another_sub");
+
+ _tablet_schema->append_column(sub1);
+ _tablet_schema->append_column(sparse);
+ _tablet_schema->append_column(sub2);
+
+ std::vector<uint32_t> column_ids = {0, 1, 2};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with multiple columns
+ vectorized::Block block;
+
+ auto nullable_col1 = create_nullable_column({false, true, false}, {"a",
"", "c"});
+ block.insert({std::move(nullable_col1),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub1"});
+
+ auto map_col = create_map_column();
+ map_col->assume_mutable()->insert_many_defaults(3);
+ block.insert({std::move(map_col),
+ std::make_shared<vectorized::DataTypeMap>(
+ std::make_shared<vectorized::DataTypeString>(),
+ std::make_shared<vectorized::DataTypeString>()),
+ "sparse"});
+
+ auto nullable_col2 = create_nullable_column({true, false, true}, {"", "x",
""});
+ block.insert({std::move(nullable_col2),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "another_sub"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+
+ // Check that statistics were updated for sub columns
+ EXPECT_EQ(_footer->columns(0).none_null_size(), 2); // sub1: 2
non-null
+ EXPECT_TRUE(_footer->columns(1).has_variant_statistics()); // sparse column
+ EXPECT_EQ(_footer->columns(2).none_null_size(), 1); // another_sub:
1 non-null
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithEmptyBlock) {
+ add_footer_column_with_path(1, "sub_column");
+
+ TabletColumn sub_column = create_variant_column(2, "variant.sub_column",
1, "sub_column");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create empty block
+ vectorized::Block block;
+ auto empty_column = create_nullable_column({}, {});
+ block.insert({std::move(empty_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 0);
+ EXPECT_TRUE(status.ok());
+
+ // No change in statistics for empty block
+ EXPECT_EQ(_footer->columns(0).none_null_size(), 0);
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithAllNullValues) {
+ add_footer_column_with_path(1, "sub_column");
+
+ TabletColumn sub_column = create_variant_column(2, "variant.sub_column",
1, "sub_column");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with all null values
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({true, true, true}, {"", "",
""});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+
+ // All null values should result in 0 non-null count
+ EXPECT_EQ(_footer->columns(0).none_null_size(), 0);
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithNoPathInfo) {
+ // Create regular column without path info
+ TabletColumn regular_column;
+ regular_column.set_unique_id(1);
+ regular_column.set_name("regular");
+ regular_column.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
+ // No path info set
+
+ _tablet_schema->append_column(regular_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ vectorized::Block block;
+ auto string_column = vectorized::ColumnString::create();
+ string_column->insert_data("test", 4);
+ block.insert(
+ {std::move(string_column),
std::make_shared<vectorized::DataTypeString>(), "regular"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 1);
+ EXPECT_TRUE(status.ok()); // Should skip columns without path info
+}
+
+TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsAccumulatesNonNullCount) {
+ add_footer_column_with_path(1, "sub_column");
+
+ // Set initial non-null count in footer
+ _footer->mutable_columns(0)->set_none_null_size(5);
+
+ TabletColumn sub_column = create_variant_column(2, "variant.sub_column",
1, "sub_column");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false, true, false}, {"a",
"", "c"});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+
+ // Should accumulate: initial 5 + new 2 = 7
+ EXPECT_EQ(_footer->columns(0).none_null_size(), 7);
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithExtendedSchema) {
+ add_footer_column_with_path(1, "sub_column");
+ TabletColumn column;
+ column.set_unique_id(1);
+ column.set_name("variant");
+ column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+ column.set_variant_max_subcolumns_count(0);
+ _tablet_schema->append_column(column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false, true, false}, {"a",
"", "c"});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]