This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new d159a61a9d9 [branch-4.1](variant) Skip full footer scan when
constructing VariantStatsCaculator (#62819) (#63072)
d159a61a9d9 is described below
commit d159a61a9d9e924006a2afb9fd2146a62ebbe4df
Author: Chenyang Sun <[email protected]>
AuthorDate: Sat May 9 15:30:27 2026 +0800
[branch-4.1](variant) Skip full footer scan when constructing
VariantStatsCaculator (#62819) (#63072)
pick from master #62819
---
be/src/storage/segment/segment_writer.cpp | 7 +++--
.../storage/segment/variant_stats_calculator.cpp | 10 ++++---
be/src/storage/segment/variant_stats_calculator.h | 8 +++--
.../segment/variant_stats_calculator_test.cpp | 35 ++++++++++++++++++++++
4 files changed, 52 insertions(+), 8 deletions(-)
diff --git a/be/src/storage/segment/segment_writer.cpp
b/be/src/storage/segment/segment_writer.cpp
index 976ab0c2e2e..521c3ca2e84 100644
--- a/be/src/storage/segment/segment_writer.cpp
+++ b/be/src/storage/segment/segment_writer.cpp
@@ -330,11 +330,14 @@ Status SegmentWriter::init(const std::vector<uint32_t>&
col_ids, bool has_key) {
_opts.compression_type = _tablet_schema->compression_type();
}
+ // Vertical compaction calls init() multiple times against the same
writer; the footer accumulates entries
+ // across calls, so this init()'s slice of footer columns starts at the
current size.
+ const int variant_stats_footer_offset = _footer.columns_size();
RETURN_IF_ERROR(_create_writers(_tablet_schema, col_ids));
// Initialize variant statistics calculator
- _variant_stats_calculator =
- std::make_unique<VariantStatsCaculator>(&_footer, _tablet_schema,
col_ids);
+ _variant_stats_calculator = std::make_unique<VariantStatsCaculator>(
+ &_footer, _tablet_schema, col_ids, variant_stats_footer_offset);
// we don't need the short key index for unique key merge on write table.
if (_has_key) {
diff --git a/be/src/storage/segment/variant_stats_calculator.cpp
b/be/src/storage/segment/variant_stats_calculator.cpp
index b1bffb7673d..bc160282c18 100644
--- a/be/src/storage/segment/variant_stats_calculator.cpp
+++ b/be/src/storage/segment/variant_stats_calculator.cpp
@@ -30,10 +30,12 @@ namespace doris::segment_v2 {
VariantStatsCaculator::VariantStatsCaculator(SegmentFooterPB* footer,
TabletSchemaSPtr tablet_schema,
- const std::vector<uint32_t>&
column_ids)
+ const std::vector<uint32_t>&
column_ids,
+ int footer_column_offset)
: _footer(footer), _tablet_schema(tablet_schema),
_column_ids(column_ids) {
- // Build the path to footer index mapping during initialization
- for (int i = 0; i < _footer->columns_size(); ++i) {
+ // Only walk this init()'s slice of footer entries; earlier init() calls
(vertical compaction's previous
+ // column groups) are not addressable via `column_ids` and would only
inflate this scan.
+ for (int i = footer_column_offset; i < _footer->columns_size(); ++i) {
const auto& column = _footer->columns(i);
// path that need to record stats
if (column.has_column_path_info() &&
@@ -119,4 +121,4 @@ void
VariantStatsCaculator::_calculate_sub_column_stats(const IColumn& column,
#include "common/compile_check_end.h"
-} // namespace doris::segment_v2
\ No newline at end of file
+} // namespace doris::segment_v2
diff --git a/be/src/storage/segment/variant_stats_calculator.h
b/be/src/storage/segment/variant_stats_calculator.h
index e096697131a..4caf07540a2 100644
--- a/be/src/storage/segment/variant_stats_calculator.h
+++ b/be/src/storage/segment/variant_stats_calculator.h
@@ -29,8 +29,12 @@ namespace doris::segment_v2 {
class VariantStatsCaculator {
public:
+ // `footer_column_offset` is the index of the first footer entry that
belongs to this init()'s `column_ids`.
+ // Required because SegmentWriter::init() can be invoked multiple times
(vertical compaction) against
+ // an ever-growing footer; without the offset every additional init()
would re-scan the whole footer.
explicit VariantStatsCaculator(SegmentFooterPB* footer, TabletSchemaSPtr
tablet_schema,
- const std::vector<uint32_t>& column_ids);
+ const std::vector<uint32_t>& column_ids,
+ int footer_column_offset = 0);
// Calculate variant statistics for the given column and block
Status calculate_variant_stats(const Block* block, size_t row_pos, size_t
num_rows);
@@ -54,4 +58,4 @@ private:
size_t row_pos, size_t num_rows);
};
-} // namespace doris::segment_v2
\ No newline at end of file
+} // namespace doris::segment_v2
diff --git a/be/test/storage/segment/variant_stats_calculator_test.cpp
b/be/test/storage/segment/variant_stats_calculator_test.cpp
index ffdfc230901..66c3050370c 100644
--- a/be/test/storage/segment/variant_stats_calculator_test.cpp
+++ b/be/test/storage/segment/variant_stats_calculator_test.cpp
@@ -447,4 +447,39 @@ TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithExtendedSchema) {
EXPECT_TRUE(status.ok());
}
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithFooterOffset) {
+ // Vertical compaction calls SegmentWriter::init() multiple times against
+ // the same writer (key columns first, then each value-column group). The
+ // footer accumulates entries across calls, so the calculator built for the
+ // second init() must only index its own slice — not the leftover entries
+ // from the first init(). The offset tells the constructor where its slice
+ // starts and also where stats results should land.
+ add_footer_column_with_path(1, "stale_from_prev_init"); // pre-existing
+ add_footer_column_with_path(1, "another_stale_entry"); // pre-existing
+ const int footer_offset = _footer->columns_size();
+ add_footer_column_with_path(1, "sub_column"); // belongs to this init()
+
+ TabletColumn sub_column = create_variant_column(2, "variant.sub_column",
1, "sub_column");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids, footer_offset);
+
+ Block block;
+ auto nullable_column = create_nullable_column({false, true, false}, {"a",
"", "c"});
+ block.insert({std::move(nullable_column),
+
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+
+ // Stats land on this init()'s slice, not the pre-existing entries — proves
+ // we ignored the stale entries even though they share parent_unique_id=1
+ // and the same path keys would otherwise collide in the index map.
+ EXPECT_EQ(_footer->columns(0).none_null_size(), 0);
+ EXPECT_EQ(_footer->columns(1).none_null_size(), 0);
+ EXPECT_EQ(_footer->columns(footer_offset).none_null_size(), 2);
+}
+
} // namespace doris::segment_v2
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]