This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 3593a82c6f4 [opt](inverted index) unified optimization judgment to
prevent omissions (#38027)
3593a82c6f4 is described below
commit 3593a82c6f4d42eae703469b867ef22ff93f6827
Author: zzzxl <[email protected]>
AuthorDate: Thu Jul 18 11:47:03 2024 +0800
[opt](inverted index) unified optimization judgment to prevent omissions
(#38027)
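1. Optimize the inverted index read path: unify the per-column "all predicates passed the inverted index" check and use it for both the key-column read skip and the topn read optimization, so no case is missed.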
---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 62 +++++++++-------------
be/src/olap/rowset/segment_v2/segment_iterator.h | 5 +-
be/src/vec/exprs/vexpr.cpp | 17 +++++-
.../test_all_index_hit_fault_injection.out | 24 +++++++++
.../test_topn_fault_injection.out | 30 +++++++++++
.../test_all_index_hit_fault_injection.groovy | 11 +++-
.../test_topn_fault_injection.groovy | 32 ++++++++---
7 files changed, 132 insertions(+), 49 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index db1d2e9e676..61be47cced7 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1345,19 +1345,15 @@ Status SegmentIterator::_apply_inverted_index() {
return Status::OK();
}
-bool
SegmentIterator::_check_all_predicates_passed_inverted_index_for_column(ColumnId
cid) {
+bool
SegmentIterator::_check_all_predicates_passed_inverted_index_for_column(ColumnId
cid,
+
bool default_return) {
auto it = _column_predicate_inverted_index_status.find(cid);
if (it != _column_predicate_inverted_index_status.end()) {
const auto& pred_map = it->second;
-
- bool all_true = std::all_of(pred_map.begin(), pred_map.end(),
- [](const auto& pred_entry) { return
pred_entry.second; });
-
- if (all_true) {
- return true;
- }
+ return std::all_of(pred_map.begin(), pred_map.end(),
+ [](const auto& pred_entry) { return
pred_entry.second; });
}
- return false;
+ return default_return;
}
Status SegmentIterator::_init_return_column_iterators() {
@@ -2404,9 +2400,9 @@ Status
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
nrows_read_limit = std::min(static_cast<uint32_t>(_opts.topn_limit),
nrows_read_limit);
}
- DBUG_EXECUTE_IF("segment_iterator.topn_opt", {
+ DBUG_EXECUTE_IF("segment_iterator.topn_opt_1", {
if (nrows_read_limit != 1) {
- return Status::Error<ErrorCode::INTERNAL_ERROR>("topn opt execute
failed: {}",
+ return Status::Error<ErrorCode::INTERNAL_ERROR>("topn opt 1
execute failed: {}",
nrows_read_limit);
}
})
@@ -2887,19 +2883,7 @@ bool SegmentIterator::_no_need_read_key_data(ColumnId
cid, vectorized::MutableCo
return false;
}
- std::set<uint32_t> cids;
- for (auto* pred : _col_predicates) {
- cids.insert(pred->column_id());
- }
- for (auto* pred : _col_preds_except_leafnode_of_andnode) {
- cids.insert(pred->column_id());
- }
-
- // If the key is present in expr, data needs to be read.
- if (cids.contains(cid)) {
- return false;
- }
- if
(_column_pred_in_remaining_vconjunct.contains(_opts.tablet_schema->column(cid).name()))
{
+ if (!_check_all_predicates_passed_inverted_index_for_column(cid, true)) {
return false;
}
@@ -2920,7 +2904,7 @@ bool SegmentIterator::_has_delete_predicate(ColumnId cid)
{
return delete_columns_set.contains(cid);
}
-bool SegmentIterator::_can_opt_topn_reads() const {
+bool SegmentIterator::_can_opt_topn_reads() {
if (_opts.topn_limit <= 0) {
return false;
}
@@ -2929,20 +2913,24 @@ bool SegmentIterator::_can_opt_topn_reads() const {
return false;
}
- std::set<uint32_t> cids;
- for (auto* pred : _col_predicates) {
- cids.insert(pred->column_id());
- }
- for (auto* pred : _col_preds_except_leafnode_of_andnode) {
- cids.insert(pred->column_id());
- }
-
- uint32_t delete_sign_idx = _opts.tablet_schema->delete_sign_idx();
- bool result = std::ranges::all_of(cids.begin(), cids.end(),
[delete_sign_idx](auto cid) {
- return cid == delete_sign_idx;
+ bool all_true = std::ranges::all_of(_schema->column_ids(), [this](auto
cid) {
+ if (cid == _opts.tablet_schema->delete_sign_idx() ||
+ _opts.tablet_schema->column(cid).is_key()) {
+ return true;
+ }
+ if (_check_all_predicates_passed_inverted_index_for_column(cid, true))
{
+ return true;
+ }
+ return false;
});
- return result;
+ DBUG_EXECUTE_IF("segment_iterator.topn_opt_2", {
+ if (all_true) {
+ return Status::Error<ErrorCode::INTERNAL_ERROR>("topn opt 2
execute failed");
+ }
+ })
+
+ return all_true;
}
} // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index cb904f21c6a..c9284e592e4 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -387,10 +387,11 @@ private:
bool _has_delete_predicate(ColumnId cid);
- bool _can_opt_topn_reads() const;
+ bool _can_opt_topn_reads();
void _initialize_predicate_results();
- bool _check_all_predicates_passed_inverted_index_for_column(ColumnId cid);
+ bool _check_all_predicates_passed_inverted_index_for_column(ColumnId cid,
+ bool
default_return = false);
class BitmapRangeIterator;
class BackwardBitmapRangeIterator;
diff --git a/be/src/vec/exprs/vexpr.cpp b/be/src/vec/exprs/vexpr.cpp
index bb6e48f6084..31a8e04cad1 100644
--- a/be/src/vec/exprs/vexpr.cpp
+++ b/be/src/vec/exprs/vexpr.cpp
@@ -23,6 +23,7 @@
#include <thrift/protocol/TDebugProtocol.h>
#include <algorithm>
+#include <boost/algorithm/string/split.hpp>
#include <boost/iterator/iterator_facade.hpp>
#include <memory>
#include <stack>
@@ -604,8 +605,20 @@ bool VExpr::fast_execute(Block& block, const
ColumnNumbers& arguments, size_t re
size_t input_rows_count, const std::string&
function_name) {
std::string result_column_name = gen_predicate_result_sign(block,
arguments, function_name);
if (!block.has(result_column_name)) {
- DBUG_EXECUTE_IF("segment_iterator.fast_execute",
- { return
Status::Error<ErrorCode::INTERNAL_ERROR>("fast_execute failed"); })
+ DBUG_EXECUTE_IF("segment_iterator.fast_execute", {
+ auto debug_col_name =
DebugPoints::instance()->get_debug_param_or_default<std::string>(
+ "segment_iterator._read_columns_by_index", "column_name",
"");
+
+ std::vector<std::string> column_names;
+ boost::split(column_names, debug_col_name,
boost::algorithm::is_any_of(","));
+
+ std::string column_name = block.get_by_position(arguments[0]).name;
+ auto it = std::find(column_names.begin(), column_names.end(),
column_name);
+ if (it == column_names.end()) {
+ return Status::Error<ErrorCode::INTERNAL_ERROR>("fast_execute
failed: {}",
+
result_column_name);
+ }
+ })
return false;
}
diff --git
a/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out
b/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out
index ea2b79e0c9b..205effb8dbf 100644
---
a/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out
+++
b/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out
@@ -14,6 +14,18 @@
-- !sql --
14
+-- !sql --
+999
+
+-- !sql --
+209
+
+-- !sql --
+209
+
+-- !sql --
+334
+
-- !sql --
120
@@ -29,3 +41,15 @@
-- !sql --
11
+-- !sql --
+279
+
+-- !sql --
+119
+
+-- !sql --
+119
+
+-- !sql --
+154
+
diff --git
a/regression-test/data/fault_injection_p0/test_topn_fault_injection.out
b/regression-test/data/fault_injection_p0/test_topn_fault_injection.out
index 9cc3f4146b5..fe5034df477 100644
--- a/regression-test/data/fault_injection_p0/test_topn_fault_injection.out
+++ b/regression-test/data/fault_injection_p0/test_topn_fault_injection.out
@@ -17,9 +17,39 @@
-- !sql --
893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+-- !sql --
+893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
+-- !sql --
+893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
+-- !sql --
+893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
+-- !sql --
+893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
+-- !sql --
+893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
-- !sql --
893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+-- !sql --
+893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
+-- !sql --
+893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
+-- !sql --
+893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
+-- !sql --
+893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
+-- !sql --
+893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
+
-- !sql --
893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736
diff --git
a/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy
b/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy
index 3bd884a5d87..d1a8e7c7642 100644
---
a/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy
+++
b/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy
@@ -99,19 +99,28 @@ suite("test_all_index_hit_fault_injection",
"nonConcurrent") {
try {
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index",
[column_name: "clientip,request"])
-
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.fast_execute")
+
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.fast_execute",
[column_name: "status,size"])
+
qt_sql """ select count() from ${indexTbName1} where (request
match_phrase 'hm'); """
qt_sql """ select count() from ${indexTbName1} where (request
match_phrase 'hm' and clientip = '126.1.0.0'); """
qt_sql """ select count() from ${indexTbName1} where (request
match_phrase 'hm' and clientip = '126.1.0.0') or (request match_phrase 'bg' and
clientip = '201.0.0.0'); """
qt_sql """ select count() from ${indexTbName1} where (request
match_phrase 'hm' and clientip = '126.1.0.0' or clientip = '247.37.0.0') or
(request match_phrase 'bg' and clientip = '201.0.0.0' or clientip =
'232.0.0.0'); """
qt_sql """ select count() from ${indexTbName1} where (request
match_phrase 'hm' and clientip in ('126.1.0.0', '247.37.0.0')) or (request
match_phrase 'bg' and clientip in ('201.0.0.0', '232.0.0.0')); """
+ qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455); """
+ qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm'); """
+ qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or
request match_phrase 'ag'); """
+ qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or
request match_phrase 'ag' or status = 304); """
qt_sql """ select count() from ${indexTbName2} where (request
match_phrase 'hm'); """
qt_sql """ select count() from ${indexTbName2} where (request
match_phrase 'hm' and clientip = '126.1.0.0'); """
qt_sql """ select count() from ${indexTbName2} where (request
match_phrase 'hm' and clientip = '126.1.0.0') or (request match_phrase 'bg' and
clientip = '201.0.0.0'); """
qt_sql """ select count() from ${indexTbName2} where (request
match_phrase 'hm' and clientip = '126.1.0.0' or clientip = '247.37.0.0') or
(request match_phrase 'bg' and clientip = '201.0.0.0' or clientip =
'232.0.0.0'); """
qt_sql """ select count() from ${indexTbName2} where (request
match_phrase 'hm' and clientip in ('126.1.0.0', '247.37.0.0')) or (request
match_phrase 'bg' and clientip in ('201.0.0.0', '232.0.0.0')); """
+ qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455); """
+ qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm'); """
+ qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or
request match_phrase 'ag'); """
+ qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or
request match_phrase 'ag' or status = 304); """
} finally {
GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
diff --git
a/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy
b/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy
index 08a1ef0164d..37315a49525 100644
--- a/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy
+++ b/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy
@@ -31,12 +31,11 @@ suite("test_topn_fault_injection", "nonConcurrent") {
INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT '',
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
) ENGINE=OLAP
- UNIQUE KEY(`@timestamp`)
+ DUPLICATE KEY(`@timestamp`)
COMMENT "OLAP"
- DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1
+ DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
- "enable_unique_key_merge_on_write" = "true",
"disable_auto_compaction" = "true"
);
"""
@@ -52,11 +51,12 @@ suite("test_topn_fault_injection", "nonConcurrent") {
INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT '',
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
) ENGINE=OLAP
- DUPLICATE KEY(`@timestamp`)
+ UNIQUE KEY(`@timestamp`)
COMMENT "OLAP"
- DISTRIBUTED BY RANDOM BUCKETS 1
+ DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
+ "enable_unique_key_merge_on_write" = "true",
"disable_auto_compaction" = "true"
);
"""
@@ -98,19 +98,37 @@ suite("test_topn_fault_injection", "nonConcurrent") {
sql "sync"
try {
- GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt")
+
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt_1")
qt_sql """ select * from ${indexTbName1} where (request match_phrase
'hm') order by `@timestamp` limit 1; """
qt_sql """ select * from ${indexTbName1} where (request match_phrase
'hm' and clientip match_phrase '1') order by `@timestamp` limit 1; """
qt_sql """ select * from ${indexTbName1} where (request match_phrase
'hm' and clientip match_phrase '1') or (request match_phrase 'bg' and clientip
match_phrase '2') order by `@timestamp` limit 1; """
qt_sql """ select * from ${indexTbName1} where (request match_phrase
'hm' and clientip match_phrase '1' or clientip match_phrase '3') or (request
match_phrase 'bg' and clientip match_phrase '2' or clientip match_phrase '4')
order by `@timestamp` limit 1; """
+ qt_sql """ select * from ${indexTbName1} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm') order
by `@timestamp` limit 1; """
+ qt_sql """ select * from ${indexTbName1} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or
clientip match_phrase '3') order by `@timestamp` limit 1; """
qt_sql """ select * from ${indexTbName2} where (request match_phrase
'hm') order by `@timestamp` limit 1; """
qt_sql """ select * from ${indexTbName2} where (request match_phrase
'hm' and clientip match_phrase '1') order by `@timestamp` limit 1; """
qt_sql """ select * from ${indexTbName2} where (request match_phrase
'hm' and clientip match_phrase '1') or (request match_phrase 'bg' and clientip
match_phrase '2') order by `@timestamp` limit 1; """
qt_sql """ select * from ${indexTbName2} where (request match_phrase
'hm' and clientip match_phrase '1' or clientip match_phrase '3') or (request
match_phrase 'bg' and clientip match_phrase '2' or clientip match_phrase '4')
order by `@timestamp` limit 1; """
+ qt_sql """ select * from ${indexTbName2} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm') order
by `@timestamp` limit 1; """
+ qt_sql """ select * from ${indexTbName2} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or
clientip match_phrase '3') order by `@timestamp` limit 1; """
+ } finally {
+
GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt_1")
+ }
+
+ try {
+
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt_2")
+
+ qt_sql """ select * from ${indexTbName1} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and
request like '%ag%') order by `@timestamp` limit 1; """
+ qt_sql """ select * from ${indexTbName1} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and
clientip like '%1%') order by `@timestamp` limit 1; """
+ qt_sql """ select * from ${indexTbName1} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or
clientip match_phrase '3' and request like '%ag%') order by `@timestamp` limit
1; """
+
+ qt_sql """ select * from ${indexTbName2} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and
request like '%ag%') order by `@timestamp` limit 1; """
+ qt_sql """ select * from ${indexTbName2} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and
clientip like '%1%') order by `@timestamp` limit 1; """
+ qt_sql """ select * from ${indexTbName2} where (`@timestamp` >=
893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or
clientip match_phrase '3' and request like '%ag%') order by `@timestamp` limit
1; """
} finally {
- GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt")
+
GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt_2")
}
} finally {
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]