This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new abc21f5d77 [bugfix](ngram bf index) process differently for normal
bloom filter index and ngram bf index (#21310)
abc21f5d77 is described below
commit abc21f5d77f82ce7ca4a95baf780cffba4108acf
Author: Kang <[email protected]>
AuthorDate: Thu Jul 13 17:31:45 2023 +0800
[bugfix](ngram bf index) process differently for normal bloom filter index
and ngram bf index (#21310)
* process differently for normal bloom filter index and ngram bf index
* fix review comments for readability
* add test case
* add testcase for delete condition
---
be/src/olap/accept_null_predicate.h | 4 +-
be/src/olap/block_column_predicate.h | 10 ++--
be/src/olap/column_predicate.h | 2 +-
be/src/olap/comparison_predicate.h | 6 ++-
be/src/olap/in_list_predicate.h | 6 ++-
be/src/olap/like_column_predicate.h | 4 +-
be/src/olap/null_predicate.h | 4 +-
be/src/olap/rowset/segment_v2/column_reader.cpp | 3 +-
be/src/olap/rowset/segment_v2/column_reader.h | 10 +++-
.../data/index_p0/test_ngram_bloomfilter_index.out | 33 ++++++++++++
.../index_p0/test_ngram_bloomfilter_index.groovy | 62 ++++++++++++++++++++++
11 files changed, 132 insertions(+), 12 deletions(-)
diff --git a/be/src/olap/accept_null_predicate.h
b/be/src/olap/accept_null_predicate.h
index bfff2910ca..1a5f586ed5 100644
--- a/be/src/olap/accept_null_predicate.h
+++ b/be/src/olap/accept_null_predicate.h
@@ -148,7 +148,9 @@ public:
bool evaluate_and(const BloomFilter* bf) const override { return
_nested->evaluate_and(bf); }
- bool can_do_bloom_filter() const override { return
_nested->can_do_bloom_filter(); }
+ bool can_do_bloom_filter(bool ngram) const override {
+ return _nested->can_do_bloom_filter(ngram);
+ }
void evaluate_vec(const vectorized::IColumn& column, uint16_t size,
bool* flags) const override {
diff --git a/be/src/olap/block_column_predicate.h
b/be/src/olap/block_column_predicate.h
index 0069a62d29..c91dc0c367 100644
--- a/be/src/olap/block_column_predicate.h
+++ b/be/src/olap/block_column_predicate.h
@@ -87,7 +87,7 @@ public:
return true;
}
- virtual bool can_do_bloom_filter() const { return false; }
+ virtual bool can_do_bloom_filter(bool ngram) const { return false; }
//evaluate predicate on inverted
virtual Status evaluate(const std::string& column_name,
InvertedIndexIterator* iterator,
@@ -121,7 +121,9 @@ public:
void evaluate_vec(vectorized::MutableColumns& block, uint16_t size, bool*
flags) const override;
- bool can_do_bloom_filter() const override { return
_predicate->can_do_bloom_filter(); }
+ bool can_do_bloom_filter(bool ngram) const override {
+ return _predicate->can_do_bloom_filter(ngram);
+ }
private:
const ColumnPredicate* _predicate;
@@ -188,9 +190,9 @@ public:
bool evaluate_and(const StringRef* dict_words, const size_t dict_num)
const override;
- bool can_do_bloom_filter() const override {
+ bool can_do_bloom_filter(bool ngram) const override {
for (auto& pred : _block_column_predicate_vec) {
- if (!pred->can_do_bloom_filter()) {
+ if (!pred->can_do_bloom_filter(ngram)) {
return false;
}
}
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
index 88f40c92c1..cad253ac1a 100644
--- a/be/src/olap/column_predicate.h
+++ b/be/src/olap/column_predicate.h
@@ -183,7 +183,7 @@ public:
return true;
}
- virtual bool can_do_bloom_filter() const { return false; }
+ virtual bool can_do_bloom_filter(bool ngram) const { return false; }
// used to evaluate pre read column in lazy materialization
// now only support integer/float
diff --git a/be/src/olap/comparison_predicate.h
b/be/src/olap/comparison_predicate.h
index 6524fdfc7d..04dfd5dc5c 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -244,6 +244,8 @@ public:
bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
if constexpr (PT == PredicateType::EQ) {
+ // EQ predicate can not use ngram bf, just return true to accept
+ if (bf->is_ngram_bf()) return true;
if constexpr (std::is_same_v<T, StringRef>) {
return bf->test_bytes(_value.data, _value.size);
} else if constexpr (Type == TYPE_DATE) {
@@ -272,7 +274,9 @@ public:
return true;
}
- bool can_do_bloom_filter() const override { return PT ==
PredicateType::EQ; }
+ bool can_do_bloom_filter(bool ngram) const override {
+ return PT == PredicateType::EQ && !ngram;
+ }
void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel,
uint16_t size,
bool* flags) const override {
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index 5f0f99f7eb..f4e432cf28 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -381,6 +381,8 @@ public:
bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
if constexpr (PT == PredicateType::IN_LIST) {
+ // IN predicate can not use ngram bf, just return true to accept
+ if (bf->is_ngram_bf()) return true;
HybridSetBase::IteratorBase* iter = _values->begin();
while (iter->has_next()) {
if constexpr (std::is_same_v<T, StringRef>) {
@@ -408,7 +410,9 @@ public:
}
}
- bool can_do_bloom_filter() const override { return PT ==
PredicateType::IN_LIST; }
+ bool can_do_bloom_filter(bool ngram) const override {
+ return PT == PredicateType::IN_LIST && !ngram;
+ }
private:
template <typename LeftT, typename RightT>
diff --git a/be/src/olap/like_column_predicate.h
b/be/src/olap/like_column_predicate.h
index ddbe892303..f97ff46453 100644
--- a/be/src/olap/like_column_predicate.h
+++ b/be/src/olap/like_column_predicate.h
@@ -76,12 +76,14 @@ public:
_page_ng_bf = std::move(src);
}
bool evaluate_and(const BloomFilter* bf) const override {
+ // like predicate can not use normal bf, just return true to accept
+ if (!bf->is_ngram_bf()) return true;
if (_page_ng_bf) {
return bf->contains(*_page_ng_bf);
}
return true;
}
- bool can_do_bloom_filter() const override { return true; }
+ bool can_do_bloom_filter(bool ngram) const override { return ngram; }
private:
template <bool is_and>
diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h
index ed81cc6f1b..4313adea11 100644
--- a/be/src/olap/null_predicate.h
+++ b/be/src/olap/null_predicate.h
@@ -84,6 +84,8 @@ public:
}
bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
+ // null predicate can not use ngram bf, just return true to accept
+ if (bf->is_ngram_bf()) return true;
if (_is_null) {
return bf->test_bytes(nullptr, 0);
} else {
@@ -92,7 +94,7 @@ public:
}
}
- bool can_do_bloom_filter() const override { return _is_null; }
+ bool can_do_bloom_filter(bool ngram) const override { return _is_null &&
!ngram; }
void evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool*
flags) const override;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index a27d999169..2e4db26c50 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -1213,7 +1213,8 @@ Status FileColumnIterator::get_row_ranges_by_zone_map(
Status FileColumnIterator::get_row_ranges_by_bloom_filter(
const AndBlockColumnPredicate* col_predicates, RowRanges* row_ranges) {
- if (col_predicates->can_do_bloom_filter() &&
_reader->has_bloom_filter_index()) {
+ if ((col_predicates->can_do_bloom_filter(false) &&
_reader->has_bloom_filter_index(false)) ||
+ (col_predicates->can_do_bloom_filter(true) &&
_reader->has_bloom_filter_index(true))) {
RETURN_IF_ERROR(_reader->get_row_ranges_by_bloom_filter(col_predicates,
row_ranges));
}
return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h
b/be/src/olap/rowset/segment_v2/column_reader.h
index a6d23ac950..fb212ef33d 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -137,7 +137,15 @@ public:
bool has_zone_map() const { return _zone_map_index_meta != nullptr; }
bool has_bitmap_index() const { return _bitmap_index_meta != nullptr; }
- bool has_bloom_filter_index() const { return _bf_index_meta != nullptr; }
+ bool has_bloom_filter_index(bool ngram) const {
+ if (_bf_index_meta == nullptr) return false;
+
+ if (ngram) {
+ return _bf_index_meta->algorithm() ==
BloomFilterAlgorithmPB::NGRAM_BLOOM_FILTER;
+ } else {
+ return _bf_index_meta->algorithm() !=
BloomFilterAlgorithmPB::NGRAM_BLOOM_FILTER;
+ }
+ }
// Check if this column could match `cond' using segment zone map.
// Since segment zone map is stored in metadata, this function is fast
without I/O.
diff --git a/regression-test/data/index_p0/test_ngram_bloomfilter_index.out
b/regression-test/data/index_p0/test_ngram_bloomfilter_index.out
new file mode 100644
index 0000000000..7849739f42
--- /dev/null
+++ b/regression-test/data/index_p0/test_ngram_bloomfilter_index.out
@@ -0,0 +1,33 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !select_all_1 --
+1 dt_bjn001 p9-webcast-sign.douyinpic.com test
/%/7212503657802320699% /test 100 false
+1 dt_bjn001 p9-webcast-sign.douyinpic.com test
/%/7212503657802320699%xxx /test 100 false
+
+-- !select_eq_1 --
+1 dt_bjn001 p9-webcast-sign.douyinpic.com test
/%/7212503657802320699% /test 100 false
+
+-- !select_in_1 --
+1 dt_bjn001 p9-webcast-sign.douyinpic.com test
/%/7212503657802320699% /test 100 false
+
+-- !select_like_1 --
+1 dt_bjn001 p9-webcast-sign.douyinpic.com test
/%/7212503657802320699% /test 100 false
+1 dt_bjn001 p9-webcast-sign.douyinpic.com test
/%/7212503657802320699%xxx /test 100 false
+
+-- !select_all_2 --
+1 dt_bjn001 p9-webcast-sign.douyinpic.com test
/%/7212503657802320699%xxx /test 100 false
+
+-- !select_eq_2 --
+
+-- !select_in_2 --
+
+-- !select_like_2 --
+1 dt_bjn001 p9-webcast-sign.douyinpic.com test
/%/7212503657802320699%xxx /test 100 false
+
+-- !select_all_3 --
+
+-- !select_eq_3 --
+
+-- !select_in_3 --
+
+-- !select_like_3 --
+
diff --git
a/regression-test/suites/index_p0/test_ngram_bloomfilter_index.groovy
b/regression-test/suites/index_p0/test_ngram_bloomfilter_index.groovy
new file mode 100644
index 0000000000..7619adedc3
--- /dev/null
+++ b/regression-test/suites/index_p0/test_ngram_bloomfilter_index.groovy
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+suite("test_ngram_bloomfilter_index") {
+ // todo: test bitmap index, such as create, drop, alter table index
+ def tableName = 'test_ngram_bloomfilter_index'
+ sql "DROP TABLE IF EXISTS ${tableName}"
+ sql """
+ CREATE TABLE IF NOT EXISTS ${tableName} (
+ `key_id` bigint(20) NULL COMMENT '',
+ `category` varchar(200) NULL COMMENT '',
+ `https_url` varchar(300) NULL COMMENT '',
+ `hostname` varchar(300) NULL,
+ `http_url` text NULL COMMENT '',
+ `url_path` varchar(2000) NULL COMMENT '',
+ `cnt` bigint(20) NULL COMMENT '',
+ `host_flag` boolean NULL COMMENT '',
+ INDEX idx_ngrambf (`http_url`) USING NGRAM_BF PROPERTIES("gram_size" =
"2", "bf_size" = "512")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`key_id`, `category`)
+ COMMENT 'OLAP'
+ DISTRIBUTED BY HASH(`key_id`) BUCKETS 3
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql "INSERT INTO ${tableName} values (1, 'dt_bjn001',
'p9-webcast-sign.douyinpic.com', 'test', '/%/7212503657802320699%', '/test',
100, false);"
+ sql "INSERT INTO ${tableName} values (1, 'dt_bjn001',
'p9-webcast-sign.douyinpic.com', 'test', '/%/7212503657802320699%xxx', '/test',
100, false);"
+
+
+ sql "SET enable_function_pushdown = true"
+
+ qt_select_all_1 "SELECT * FROM ${tableName}"
+ qt_select_eq_1 "SELECT * FROM ${tableName} WHERE http_url =
'/%/7212503657802320699%'"
+ qt_select_in_1 "SELECT * FROM ${tableName} WHERE http_url IN
('/%/7212503657802320699%')"
+ qt_select_like_1 "SELECT * FROM ${tableName} WHERE http_url like
'/%/7212503657802320699%'"
+
+ // delete and then select
+ sql "DELETE FROM ${tableName} WHERE http_url IN
('/%/7212503657802320699%')"
+ qt_select_all_2 "SELECT * FROM ${tableName}"
+ qt_select_eq_2 "SELECT * FROM ${tableName} WHERE http_url =
'/%/7212503657802320699%'"
+ qt_select_in_2 "SELECT * FROM ${tableName} WHERE http_url IN
('/%/7212503657802320699%')"
+ qt_select_like_2 "SELECT * FROM ${tableName} WHERE http_url like
'/%/7212503657802320699%'"
+
+ sql "DELETE FROM ${tableName} WHERE http_url =
'/%/7212503657802320699%xxx'"
+ qt_select_all_3 "SELECT * FROM ${tableName}"
+ qt_select_eq_3 "SELECT * FROM ${tableName} WHERE http_url =
'/%/7212503657802320699%'"
+ qt_select_in_3 "SELECT * FROM ${tableName} WHERE http_url IN
('/%/7212503657802320699%')"
+ qt_select_like_3 "SELECT * FROM ${tableName} WHERE http_url like
'/%/7212503657802320699%'"
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]