This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 52ee055f3b5 [improvement](scanner) Remove the predicate that is always true for the segment (#25366) (#25427)
52ee055f3b5 is described below
commit 52ee055f3b5026053f2d5ecac66a5b10384b143f
Author: Jerry Hu <[email protected]>
AuthorDate: Fri Oct 13 11:44:18 2023 -0500
[improvement](scanner) Remove the predicate that is always true for the segment (#25366) (#25427)
By using the segment's zone map index, we can determine whether a
predicate is always true for that segment. For example, if the segment's
maximum value is 100 and the predicate is `col < 101`, then the predicate is
always true for this segment and can be removed.
---
be/src/common/config.cpp | 2 +
be/src/common/config.h | 3 +
be/src/olap/column_predicate.h | 4 ++
be/src/olap/comparison_predicate.h | 25 ++++++++
be/src/olap/rowset/segment_v2/column_reader.cpp | 26 +++++++++
be/src/olap/rowset/segment_v2/column_reader.h | 3 +
be/src/olap/rowset/segment_v2/segment.cpp | 20 ++++++-
.../query_p0/test_select_with_predicate_prune.out | 25 ++++++++
.../test_select_with_predicate_prune.groovy | 67 ++++++++++++++++++++++
9 files changed, 174 insertions(+), 1 deletion(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index f1ecd0e7825..4a81ba56430 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1078,6 +1078,8 @@ DEFINE_mInt32(tablet_schema_cache_recycle_interval, "86400");
DEFINE_Bool(exit_on_exception, "false")
+DEFINE_Bool(ignore_always_true_predicate_for_segment, "true");
+
// clang-format off
#ifdef BE_TEST
// test s3
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 311aad13643..dd29d21af3a 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1135,6 +1135,9 @@ DECLARE_mInt32(tablet_schema_cache_recycle_interval);
// Use `LOG(FATAL)` to replace `throw` when true
DECLARE_mBool(exit_on_exception);
+// Remove predicate that is always true for a segment.
+DECLARE_Bool(ignore_always_true_predicate_for_segment);
+
#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
index b98156f5fb8..05e84999a83 100644
--- a/be/src/olap/column_predicate.h
+++ b/be/src/olap/column_predicate.h
@@ -173,6 +173,10 @@ public:
return true;
}
+ virtual bool is_always_true(const std::pair<WrapperField*, WrapperField*>& statistic) const {
+ return false;
+ }
+
virtual bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const {
return false;
}
diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h
index 04dfd5dc5c3..fddc554f51d 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -158,6 +158,8 @@ public:
return _operator(*reinterpret_cast<const T*>(statistic.ELE->cell_ptr()), _value); \
}
+ using WarpperFieldType = std::conditional_t<Type == TYPE_DATE, uint24_t, T>;
+
bool evaluate_and(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
if (statistic.first->is_null()) {
return true;
@@ -202,6 +204,29 @@ public:
}
}
+ bool is_always_true(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
+ if (statistic.first->is_null() || statistic.second->is_null()) {
+ return false;
+ }
+
+ T tmp_min_value {};
+ T tmp_max_value {};
+ memcpy((char*)(&tmp_min_value), statistic.first->cell_ptr(), sizeof(WarpperFieldType));
+ memcpy((char*)(&tmp_max_value), statistic.second->cell_ptr(), sizeof(WarpperFieldType));
+
+ if constexpr (PT == PredicateType::LT) {
+ return _value > tmp_max_value;
+ } else if constexpr (PT == PredicateType::LE) {
+ return _value >= tmp_max_value;
+ } else if constexpr (PT == PredicateType::GT) {
+ return _value < tmp_min_value;
+ } else if constexpr (PT == PredicateType::GE) {
+ return _value <= tmp_min_value;
+ }
+
+ return false;
+ }
+
bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
if (statistic.first->is_null() || statistic.second->is_null()) {
return false;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index b1b817f545a..d9a074e2904 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -30,6 +30,7 @@
#include "io/fs/file_reader.h"
#include "olap/block_column_predicate.h"
#include "olap/column_predicate.h"
+#include "olap/comparison_predicate.h"
#include "olap/decimal12.h"
#include "olap/inverted_index_parser.h"
#include "olap/iterators.h"
@@ -339,6 +340,31 @@ bool ColumnReader::match_condition(const AndBlockColumnPredicate* col_predicates
col_predicates);
}
+bool ColumnReader::prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& predicates,
+ const int column_id) const {
+ if (_zone_map_index == nullptr) {
+ return false;
+ }
+
+ FieldType type = _type_info->type();
+ std::unique_ptr<WrapperField> min_value(WrapperField::create_by_type(type, _meta_length));
+ std::unique_ptr<WrapperField> max_value(WrapperField::create_by_type(type, _meta_length));
+ _parse_zone_map(*_segment_zone_map, min_value.get(), max_value.get());
+
+ auto pruned = false;
+ for (auto it = predicates.begin(); it != predicates.end();) {
+ auto predicate = *it;
+ if (predicate->column_id() == column_id &&
+ predicate->is_always_true({min_value.get(), max_value.get()})) {
+ pruned = true;
+ it = predicates.erase(it);
+ } else {
+ ++it;
+ }
+ }
+ return pruned;
+}
+
void ColumnReader::_parse_zone_map(const ZoneMapPB& zone_map, WrapperField* min_value_container,
WrapperField* max_value_container) const {
// min value and max value are valid if has_not_null is true
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index 174aabdefa8..7964555adeb 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -162,6 +162,9 @@ public:
bool is_empty() const { return _num_rows == 0; }
+ bool prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& predicates,
+ const int column_id) const;
+
CompressionTypePB get_compression() const { return _meta_compression; }
uint64_t num_rows() const { return _num_rows; }
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp
index 153ed925176..ddce80bcc33 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -147,7 +147,6 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o
return Status::OK();
}
}
-
if (read_options.use_topn_opt) {
auto query_ctx = read_options.runtime_state->get_query_ctx();
auto runtime_predicate = query_ctx->get_runtime_predicate().get_predictate();
@@ -175,6 +174,25 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o
iter->reset(new SegmentIterator(this->shared_from_this(), schema));
}
+ if (config::ignore_always_true_predicate_for_segment &&
+ read_options.io_ctx.reader_type == ReaderType::READER_QUERY &&
+ !read_options.column_predicates.empty()) {
+ auto pruned_predicates = read_options.column_predicates;
+ auto pruned = false;
+ for (auto& it : _column_readers) {
+ if (it.second->prune_predicates_by_zone_map(pruned_predicates, it.first)) {
+ pruned = true;
+ }
+ }
+
+ if (pruned) {
+ auto options_with_pruned_predicates = read_options;
+ options_with_pruned_predicates.column_predicates = pruned_predicates;
+ LOG(INFO) << "column_predicates pruned from " << read_options.column_predicates.size()
+ << " to " << pruned_predicates.size();
+ return iter->get()->init(options_with_pruned_predicates);
+ }
+ }
return iter->get()->init(read_options);
}
diff --git a/regression-test/data/query_p0/test_select_with_predicate_prune.out b/regression-test/data/query_p0/test_select_with_predicate_prune.out
new file mode 100644
index 00000000000..2e1fad87499
--- /dev/null
+++ b/regression-test/data/query_p0/test_select_with_predicate_prune.out
@@ -0,0 +1,25 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select1 --
+1 jerry 2020-10-01
+2 tom 2020-10-02
+3 jack 2020-10-01
+4 tony 2020-10-02
+
+-- !select2 --
+1 jerry 2020-10-01
+3 jack 2020-10-01
+
+-- !select3 --
+
+-- !select4 --
+1 jerry 2020-10-01
+2 tom 2020-10-02
+3 jack 2020-10-01
+4 tony 2020-10-02
+
+-- !select5 --
+2 tom 2020-10-02
+4 tony 2020-10-02
+
+-- !select6 --
+
diff --git a/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy b/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy
new file mode 100644
index 00000000000..768e04b4c32
--- /dev/null
+++ b/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+suite("test_select_with_predicate_prune") {
+ sql """
+ drop table if exists `test_select_with_predicate_prune`;
+ """
+ sql """
+ CREATE TABLE IF NOT EXISTS `test_select_with_predicate_prune` (
+ id int,
+ name string,
+ birthday date not null
+ )
+ duplicate key(`id`)
+ AUTO PARTITION BY LIST (`birthday`)()
+ DISTRIBUTED BY HASH(`id`) buckets 1
+ PROPERTIES
+ (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ sql """
+ insert into test_select_with_predicate_prune values (1, 'jerry', '2020-10-01'), (2, 'tom', '2020-10-02');
+ """
+ sql """
+ insert into test_select_with_predicate_prune values (3, 'jack', '2020-10-01'), (4, 'tony', '2020-10-02');
+ """
+
+ qt_select1 """
+ select * from test_select_with_predicate_prune where birthday < '2020-10-03' order by id;
+ """
+
+ qt_select2 """
+ select * from test_select_with_predicate_prune where birthday < '2020-10-02' order by id;
+ """
+
+ qt_select3 """
+ select * from test_select_with_predicate_prune where birthday < '2020-10-01' order by id;
+ """
+
+
+ qt_select4 """
+ select * from test_select_with_predicate_prune where birthday > '2020-09-30' order by id;
+ """
+
+ qt_select5 """
+ select * from test_select_with_predicate_prune where birthday > '2020-10-01' order by id;
+ """
+
+ qt_select6 """
+ select * from test_select_with_predicate_prune where birthday > '2020-10-02' order by id;
+ """
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]