This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new fed4d1bc0f7 [Refactor](scan) Remove the useless config and change some 
profile detail in segment (#54041)
fed4d1bc0f7 is described below

commit fed4d1bc0f7f07fff7ca69e9ebd0c7773ac86497
Author: HappenLee <[email protected]>
AuthorDate: Wed Jul 30 11:32:33 2025 +0800

    [Refactor](scan) Remove the useless config and change some profile detail in 
segment (#54041)
    
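    1. Remove the useless BE config: ignore_always_true_predicate_for_segment
    2. Change the dict filter segment profile logic: the counter is renamed from
       RowsDictFiltered to SegmentDictFiltered and is incremented once per segment
       when the dict filter leaves no rows to read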
---
 be/src/common/config.cpp                           |  2 -
 be/src/common/config.h                             |  3 --
 be/src/olap/olap_common.h                          |  2 +-
 be/src/olap/rowset/segment_v2/segment.cpp          | 10 ++--
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 59 ++++++++++++----------
 be/src/pipeline/exec/olap_scan_operator.cpp        |  2 +-
 be/src/pipeline/exec/scan_operator.cpp             |  1 -
 be/src/vec/exec/scan/olap_scanner.cpp              |  2 +-
 8 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index f31fe300feb..1f681cecee6 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1326,8 +1326,6 @@ DEFINE_mInt32(be_proc_monitor_interval_ms, "10000");
 
 DEFINE_Int32(workload_group_metrics_interval_ms, "5000");
 
-DEFINE_Bool(ignore_always_true_predicate_for_segment, "true");
-
 // Ingest binlog work pool size, -1 is disable, 0 is hardware concurrency
 DEFINE_Int32(ingest_binlog_work_pool_size, "-1");
 
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 271dee612e0..cb52c768c9f 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1388,9 +1388,6 @@ DECLARE_Int32(workload_group_metrics_interval_ms);
 // This config controls whether the s3 file writer would flush cache 
asynchronously
 DECLARE_Bool(enable_flush_file_cache_async);
 
-// Remove predicate that is always true for a segment.
-DECLARE_Bool(ignore_always_true_predicate_for_segment);
-
 // Ingest binlog work pool size
 DECLARE_Int32(ingest_binlog_work_pool_size);
 
diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h
index 051b81e0586..1d809ad1c13 100644
--- a/be/src/olap/olap_common.h
+++ b/be/src/olap/olap_common.h
@@ -340,7 +340,7 @@ struct OlapReaderStatistics {
     int64_t rows_stats_filtered = 0;
     int64_t rows_stats_rp_filtered = 0;
     int64_t rows_bf_filtered = 0;
-    int64_t rows_dict_filtered = 0;
+    int64_t segment_dict_filtered = 0;
     // Including the number of rows filtered out according to the Delete 
information in the Tablet,
     // and the number of rows filtered for marked deleted rows under the 
unique key model.
     // This metric is mainly used to record the number of rows filtered by the 
delete condition in Segment V1,
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp 
b/be/src/olap/rowset/segment_v2/segment.cpp
index 3c0e16ea938..a80a810c28b 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -154,7 +154,7 @@ Segment::Segment(uint32_t segment_id, RowsetId rowset_id, 
TabletSchemaSPtr table
           _meta_mem_usage(0),
           _rowset_id(rowset_id),
           _tablet_schema(std::move(tablet_schema)),
-          _idx_file_info(idx_file_info) {}
+          _idx_file_info(std::move(idx_file_info)) {}
 
 Segment::~Segment() {
     g_segment_estimate_mem_bytes << -_tracked_meta_mem_usage;
@@ -297,17 +297,15 @@ Status Segment::new_iterator(SchemaSPtr schema, const 
StorageReadOptions& read_o
         *iter = std::make_unique<SegmentIterator>(this->shared_from_this(), 
schema);
     }
 
-    if (config::ignore_always_true_predicate_for_segment &&
-        read_options.io_ctx.reader_type == ReaderType::READER_QUERY &&
+    // TODO: Valid the opt not only in ReaderType::READER_QUERY
+    if (read_options.io_ctx.reader_type == ReaderType::READER_QUERY &&
         !read_options.column_predicates.empty()) {
         auto pruned_predicates = read_options.column_predicates;
         auto pruned = false;
         for (auto& it : _column_readers) {
             const auto uid = it.first;
             const auto column_id = 
read_options.tablet_schema->field_index(uid);
-            if (it.second->prune_predicates_by_zone_map(pruned_predicates, 
column_id)) {
-                pruned = true;
-            }
+            pruned |= 
it.second->prune_predicates_by_zone_map(pruned_predicates, column_id);
         }
 
         if (pruned) {
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index b0d4e8b3e5e..511fca80bc9 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -605,8 +605,39 @@ Status 
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
         cids.insert(entry.first);
     }
 
-    size_t pre_size = 0;
+    {
+        SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_dict_ns);
+        /// Low cardinality optimization is currently not very stable, so to 
prevent data corruption,
+        /// we are temporarily disabling its use in data compaction.
+        // TODO: enable it in not only ReaderTyper::READER_QUERY but also 
other reader types.
+        if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) {
+            RowRanges dict_row_ranges = RowRanges::create_single(num_rows());
+            for (auto cid : cids) {
+                if (!_segment->can_apply_predicate_safely(cid,
+                                                          
_opts.col_id_to_predicates.at(cid).get(),
+                                                          *_schema, 
_opts.io_ctx.reader_type)) {
+                    continue;
+                }
+                DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
+                RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict(
+                        _opts.col_id_to_predicates.at(cid).get(), 
&dict_row_ranges));
 
+                if (dict_row_ranges.is_empty()) {
+                    break;
+                }
+            }
+
+            if (dict_row_ranges.is_empty()) {
+                RowRanges::ranges_intersection(*condition_row_ranges, 
dict_row_ranges,
+                                               condition_row_ranges);
+                _opts.stats->segment_dict_filtered++;
+                _opts.stats->filtered_segment_number++;
+                return Status::OK();
+            }
+        }
+    }
+
+    size_t pre_size = 0;
     {
         SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_bf_ns);
         // first filter data by bloom filter index
@@ -701,32 +732,6 @@ Status 
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
         _opts.stats->rows_stats_filtered += (pre_size - 
condition_row_ranges->count());
     }
 
-    {
-        SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_dict_ns);
-        /// Low cardinality optimization is currently not very stable, so to 
prevent data corruption,
-        /// we are temporarily disabling its use in data compaction.
-        if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) {
-            RowRanges dict_row_ranges = RowRanges::create_single(num_rows());
-            for (auto cid : cids) {
-                if (!_segment->can_apply_predicate_safely(cid,
-                                                          
_opts.col_id_to_predicates.at(cid).get(),
-                                                          *_schema, 
_opts.io_ctx.reader_type)) {
-                    continue;
-                }
-                RowRanges tmp_row_ranges = 
RowRanges::create_single(num_rows());
-                DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
-                RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict(
-                        _opts.col_id_to_predicates.at(cid).get(), 
&tmp_row_ranges));
-                RowRanges::ranges_intersection(dict_row_ranges, 
tmp_row_ranges, &dict_row_ranges);
-            }
-
-            pre_size = condition_row_ranges->count();
-            RowRanges::ranges_intersection(*condition_row_ranges, 
dict_row_ranges,
-                                           condition_row_ranges);
-            _opts.stats->rows_dict_filtered += (pre_size - 
condition_row_ranges->count());
-        }
-    }
-
     return Status::OK();
 }
 
diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp 
b/be/src/pipeline/exec/olap_scan_operator.cpp
index 5856ace8c81..a8ac354963d 100644
--- a/be/src/pipeline/exec/olap_scan_operator.cpp
+++ b/be/src/pipeline/exec/olap_scan_operator.cpp
@@ -151,7 +151,7 @@ Status OlapScanLocalState::_init_profile() {
     _stats_rp_filtered_counter =
             ADD_COUNTER(_segment_profile, 
"RowsZoneMapRuntimePredicateFiltered", TUnit::UNIT);
     _bf_filtered_counter = ADD_COUNTER(_segment_profile, 
"RowsBloomFilterFiltered", TUnit::UNIT);
-    _dict_filtered_counter = ADD_COUNTER(_segment_profile, "RowsDictFiltered", 
TUnit::UNIT);
+    _dict_filtered_counter = ADD_COUNTER(_segment_profile, 
"SegmentDictFiltered", TUnit::UNIT);
     _del_filtered_counter = ADD_COUNTER(_scanner_profile, "RowsDelFiltered", 
TUnit::UNIT);
     _conditions_filtered_counter =
             ADD_COUNTER(_segment_profile, "RowsConditionsFiltered", 
TUnit::UNIT);
diff --git a/be/src/pipeline/exec/scan_operator.cpp 
b/be/src/pipeline/exec/scan_operator.cpp
index 6f4fb6de9f0..3ddc88d6dd7 100644
--- a/be/src/pipeline/exec/scan_operator.cpp
+++ b/be/src/pipeline/exec/scan_operator.cpp
@@ -1221,7 +1221,6 @@ Status ScanOperatorX<LocalStateType>::init(const 
TPlanNode& tnode, RuntimeState*
         _push_down_agg_type = tnode.push_down_agg_type_opt;
     } else if (tnode.olap_scan_node.__isset.push_down_agg_type_opt) {
         _push_down_agg_type = tnode.olap_scan_node.push_down_agg_type_opt;
-
     } else {
         _push_down_agg_type = TPushAggOp::type::NONE;
     }
diff --git a/be/src/vec/exec/scan/olap_scanner.cpp 
b/be/src/vec/exec/scan/olap_scanner.cpp
index f885e64262b..e77959059f8 100644
--- a/be/src/vec/exec/scan/olap_scanner.cpp
+++ b/be/src/vec/exec/scan/olap_scanner.cpp
@@ -671,7 +671,7 @@ void OlapScanner::_collect_profile_before_close() {
     COUNTER_UPDATE(local_state->_rows_expr_cond_input_counter, 
stats.expr_cond_input_rows);
     COUNTER_UPDATE(local_state->_stats_filtered_counter, 
stats.rows_stats_filtered);
     COUNTER_UPDATE(local_state->_stats_rp_filtered_counter, 
stats.rows_stats_rp_filtered);
-    COUNTER_UPDATE(local_state->_dict_filtered_counter, 
stats.rows_dict_filtered);
+    COUNTER_UPDATE(local_state->_dict_filtered_counter, 
stats.segment_dict_filtered);
     COUNTER_UPDATE(local_state->_bf_filtered_counter, stats.rows_bf_filtered);
     COUNTER_UPDATE(local_state->_del_filtered_counter, 
stats.rows_del_filtered);
     COUNTER_UPDATE(local_state->_del_filtered_counter, 
stats.rows_del_by_bitmap);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to