This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new fed4d1bc0f7 [Refactor](scan) Remove the useless config and change some
profile detail in segment (#54041)
fed4d1bc0f7 is described below
commit fed4d1bc0f7f07fff7ca69e9ebd0c7773ac86497
Author: HappenLee <[email protected]>
AuthorDate: Wed Jul 30 11:32:33 2025 +0800
[Refactor](scan) Remove the useless config and change some profile detail in
segment (#54041)
1. Remove the useless BE config: ignore_always_true_predicate_for_segment
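2. Change the dict filter segment profile logic: the dict filter now runs before the bloom filter and counts filtered segments (SegmentDictFiltered) instead of filtered rows (RowsDictFiltered)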
---
be/src/common/config.cpp | 2 -
be/src/common/config.h | 3 --
be/src/olap/olap_common.h | 2 +-
be/src/olap/rowset/segment_v2/segment.cpp | 10 ++--
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 59 ++++++++++++----------
be/src/pipeline/exec/olap_scan_operator.cpp | 2 +-
be/src/pipeline/exec/scan_operator.cpp | 1 -
be/src/vec/exec/scan/olap_scanner.cpp | 2 +-
8 files changed, 39 insertions(+), 42 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index f31fe300feb..1f681cecee6 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1326,8 +1326,6 @@ DEFINE_mInt32(be_proc_monitor_interval_ms, "10000");
DEFINE_Int32(workload_group_metrics_interval_ms, "5000");
-DEFINE_Bool(ignore_always_true_predicate_for_segment, "true");
-
// Ingest binlog work pool size, -1 is disable, 0 is hardware concurrency
DEFINE_Int32(ingest_binlog_work_pool_size, "-1");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 271dee612e0..cb52c768c9f 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1388,9 +1388,6 @@ DECLARE_Int32(workload_group_metrics_interval_ms);
// This config controls whether the s3 file writer would flush cache
asynchronously
DECLARE_Bool(enable_flush_file_cache_async);
-// Remove predicate that is always true for a segment.
-DECLARE_Bool(ignore_always_true_predicate_for_segment);
-
// Ingest binlog work pool size
DECLARE_Int32(ingest_binlog_work_pool_size);
diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h
index 051b81e0586..1d809ad1c13 100644
--- a/be/src/olap/olap_common.h
+++ b/be/src/olap/olap_common.h
@@ -340,7 +340,7 @@ struct OlapReaderStatistics {
int64_t rows_stats_filtered = 0;
int64_t rows_stats_rp_filtered = 0;
int64_t rows_bf_filtered = 0;
- int64_t rows_dict_filtered = 0;
+ int64_t segment_dict_filtered = 0;
// Including the number of rows filtered out according to the Delete
information in the Tablet,
// and the number of rows filtered for marked deleted rows under the
unique key model.
// This metric is mainly used to record the number of rows filtered by the
delete condition in Segment V1,
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp
b/be/src/olap/rowset/segment_v2/segment.cpp
index 3c0e16ea938..a80a810c28b 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -154,7 +154,7 @@ Segment::Segment(uint32_t segment_id, RowsetId rowset_id,
TabletSchemaSPtr table
_meta_mem_usage(0),
_rowset_id(rowset_id),
_tablet_schema(std::move(tablet_schema)),
- _idx_file_info(idx_file_info) {}
+ _idx_file_info(std::move(idx_file_info)) {}
Segment::~Segment() {
g_segment_estimate_mem_bytes << -_tracked_meta_mem_usage;
@@ -297,17 +297,15 @@ Status Segment::new_iterator(SchemaSPtr schema, const
StorageReadOptions& read_o
*iter = std::make_unique<SegmentIterator>(this->shared_from_this(),
schema);
}
- if (config::ignore_always_true_predicate_for_segment &&
- read_options.io_ctx.reader_type == ReaderType::READER_QUERY &&
+ // TODO: Valid the opt not only in ReaderType::READER_QUERY
+ if (read_options.io_ctx.reader_type == ReaderType::READER_QUERY &&
!read_options.column_predicates.empty()) {
auto pruned_predicates = read_options.column_predicates;
auto pruned = false;
for (auto& it : _column_readers) {
const auto uid = it.first;
const auto column_id =
read_options.tablet_schema->field_index(uid);
- if (it.second->prune_predicates_by_zone_map(pruned_predicates,
column_id)) {
- pruned = true;
- }
+ pruned |=
it.second->prune_predicates_by_zone_map(pruned_predicates, column_id);
}
if (pruned) {
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index b0d4e8b3e5e..511fca80bc9 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -605,8 +605,39 @@ Status
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
cids.insert(entry.first);
}
- size_t pre_size = 0;
+ {
+ SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_dict_ns);
+ /// Low cardinality optimization is currently not very stable, so to
prevent data corruption,
+ /// we are temporarily disabling its use in data compaction.
+ // TODO: enable it in not only ReaderTyper::READER_QUERY but also
other reader types.
+ if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) {
+ RowRanges dict_row_ranges = RowRanges::create_single(num_rows());
+ for (auto cid : cids) {
+ if (!_segment->can_apply_predicate_safely(cid,
+
_opts.col_id_to_predicates.at(cid).get(),
+ *_schema,
_opts.io_ctx.reader_type)) {
+ continue;
+ }
+ DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
+ RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict(
+ _opts.col_id_to_predicates.at(cid).get(),
&dict_row_ranges));
+ if (dict_row_ranges.is_empty()) {
+ break;
+ }
+ }
+
+ if (dict_row_ranges.is_empty()) {
+ RowRanges::ranges_intersection(*condition_row_ranges,
dict_row_ranges,
+ condition_row_ranges);
+ _opts.stats->segment_dict_filtered++;
+ _opts.stats->filtered_segment_number++;
+ return Status::OK();
+ }
+ }
+ }
+
+ size_t pre_size = 0;
{
SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_bf_ns);
// first filter data by bloom filter index
@@ -701,32 +732,6 @@ Status
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
_opts.stats->rows_stats_filtered += (pre_size -
condition_row_ranges->count());
}
- {
- SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_dict_ns);
- /// Low cardinality optimization is currently not very stable, so to
prevent data corruption,
- /// we are temporarily disabling its use in data compaction.
- if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) {
- RowRanges dict_row_ranges = RowRanges::create_single(num_rows());
- for (auto cid : cids) {
- if (!_segment->can_apply_predicate_safely(cid,
-
_opts.col_id_to_predicates.at(cid).get(),
- *_schema,
_opts.io_ctx.reader_type)) {
- continue;
- }
- RowRanges tmp_row_ranges =
RowRanges::create_single(num_rows());
- DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
- RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict(
- _opts.col_id_to_predicates.at(cid).get(),
&tmp_row_ranges));
- RowRanges::ranges_intersection(dict_row_ranges,
tmp_row_ranges, &dict_row_ranges);
- }
-
- pre_size = condition_row_ranges->count();
- RowRanges::ranges_intersection(*condition_row_ranges,
dict_row_ranges,
- condition_row_ranges);
- _opts.stats->rows_dict_filtered += (pre_size -
condition_row_ranges->count());
- }
- }
-
return Status::OK();
}
diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp
b/be/src/pipeline/exec/olap_scan_operator.cpp
index 5856ace8c81..a8ac354963d 100644
--- a/be/src/pipeline/exec/olap_scan_operator.cpp
+++ b/be/src/pipeline/exec/olap_scan_operator.cpp
@@ -151,7 +151,7 @@ Status OlapScanLocalState::_init_profile() {
_stats_rp_filtered_counter =
ADD_COUNTER(_segment_profile,
"RowsZoneMapRuntimePredicateFiltered", TUnit::UNIT);
_bf_filtered_counter = ADD_COUNTER(_segment_profile,
"RowsBloomFilterFiltered", TUnit::UNIT);
- _dict_filtered_counter = ADD_COUNTER(_segment_profile, "RowsDictFiltered",
TUnit::UNIT);
+ _dict_filtered_counter = ADD_COUNTER(_segment_profile,
"SegmentDictFiltered", TUnit::UNIT);
_del_filtered_counter = ADD_COUNTER(_scanner_profile, "RowsDelFiltered",
TUnit::UNIT);
_conditions_filtered_counter =
ADD_COUNTER(_segment_profile, "RowsConditionsFiltered",
TUnit::UNIT);
diff --git a/be/src/pipeline/exec/scan_operator.cpp
b/be/src/pipeline/exec/scan_operator.cpp
index 6f4fb6de9f0..3ddc88d6dd7 100644
--- a/be/src/pipeline/exec/scan_operator.cpp
+++ b/be/src/pipeline/exec/scan_operator.cpp
@@ -1221,7 +1221,6 @@ Status ScanOperatorX<LocalStateType>::init(const
TPlanNode& tnode, RuntimeState*
_push_down_agg_type = tnode.push_down_agg_type_opt;
} else if (tnode.olap_scan_node.__isset.push_down_agg_type_opt) {
_push_down_agg_type = tnode.olap_scan_node.push_down_agg_type_opt;
-
} else {
_push_down_agg_type = TPushAggOp::type::NONE;
}
diff --git a/be/src/vec/exec/scan/olap_scanner.cpp
b/be/src/vec/exec/scan/olap_scanner.cpp
index f885e64262b..e77959059f8 100644
--- a/be/src/vec/exec/scan/olap_scanner.cpp
+++ b/be/src/vec/exec/scan/olap_scanner.cpp
@@ -671,7 +671,7 @@ void OlapScanner::_collect_profile_before_close() {
COUNTER_UPDATE(local_state->_rows_expr_cond_input_counter,
stats.expr_cond_input_rows);
COUNTER_UPDATE(local_state->_stats_filtered_counter,
stats.rows_stats_filtered);
COUNTER_UPDATE(local_state->_stats_rp_filtered_counter,
stats.rows_stats_rp_filtered);
- COUNTER_UPDATE(local_state->_dict_filtered_counter,
stats.rows_dict_filtered);
+ COUNTER_UPDATE(local_state->_dict_filtered_counter,
stats.segment_dict_filtered);
COUNTER_UPDATE(local_state->_bf_filtered_counter, stats.rows_bf_filtered);
COUNTER_UPDATE(local_state->_del_filtered_counter,
stats.rows_del_filtered);
COUNTER_UPDATE(local_state->_del_filtered_counter,
stats.rows_del_by_bitmap);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]