This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 104a822a2f4 [Refactor](RuntimeFilter) refactor rf code to improve
performance (#28094)
104a822a2f4 is described below
commit 104a822a2f4f77bb3777f813d24974a3936d6c81
Author: HappenLee <[email protected]>
AuthorDate: Thu Dec 7 20:32:30 2023 +0800
[Refactor](RuntimeFilter) refactor rf code to improve performance (#28094)
---
be/src/exprs/bloom_filter_func.h | 156 +++++++++++++++++----------------------
be/src/exprs/runtime_filter.h | 1 +
2 files changed, 67 insertions(+), 90 deletions(-)
diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h
index 6ea805ee2ee..3c60ccc89c7 100644
--- a/be/src/exprs/bloom_filter_func.h
+++ b/be/src/exprs/bloom_filter_func.h
@@ -55,8 +55,6 @@ public:
return _bloom_filter->find(data);
}
- void add_bytes(const char* data, size_t len) {
_bloom_filter->insert(StringRef(data, len)); }
-
// test_element/find_element only used on vectorized engine
template <typename T>
bool test_element(T element) const {
@@ -217,70 +215,76 @@ protected:
bool _build_bf_exactly = false;
};
-struct BaseOp {
- virtual ~BaseOp() = default;
-
- virtual bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
- const void* data) const = 0;
-
- uint16_t find_batch_olap_engine_with_element_size(const
BloomFilterAdaptor& bloom_filter,
- const char* data, const
uint8* nullmap,
- uint16_t* offsets, int
number,
- const bool
is_parse_column,
- size_t element_size)
const {
- uint16_t new_size = 0;
- if (is_parse_column) {
- if (nullmap == nullptr) {
- for (int i = 0; i < number; i++) {
- uint16_t idx = offsets[i];
- if (!find_olap_engine(bloom_filter, data + element_size *
idx)) {
- continue;
- }
- offsets[new_size++] = idx;
+template <typename T, bool need_trim = false>
+uint16_t find_batch_olap(const BloomFilterAdaptor& bloom_filter, const char*
data,
+ const uint8* nullmap, uint16_t* offsets, int number,
+ const bool is_parse_column) {
+ auto get_element = [](const char* input_data, int idx) {
+ if constexpr (std::is_same_v<T, StringRef> && need_trim) {
+ const auto value = ((const StringRef*)(input_data))[idx];
+ int64_t size = value.size;
+ const char* data = value.data;
+ // CHAR type may pad the tail with \0, need to trim
+ while (size > 0 && data[size - 1] == '\0') {
+ size--;
+ }
+ return StringRef(value.data, size);
+ } else {
+ return ((const T*)(input_data))[idx];
+ }
+ };
+
+ uint16_t new_size = 0;
+ if (is_parse_column) {
+ if (nullmap == nullptr) {
+ for (int i = 0; i < number; i++) {
+ uint16_t idx = offsets[i];
+ if (!bloom_filter.test_element(get_element(data, idx))) {
+ continue;
}
- } else {
- for (int i = 0; i < number; i++) {
- uint16_t idx = offsets[i];
- if (nullmap[idx]) {
- continue;
- }
- if (!find_olap_engine(bloom_filter, data + element_size *
idx)) {
- continue;
- }
- offsets[new_size++] = idx;
+ offsets[new_size++] = idx;
+ }
+ } else {
+ for (int i = 0; i < number; i++) {
+ uint16_t idx = offsets[i];
+ if (nullmap[idx]) {
+ continue;
+ }
+ if (!bloom_filter.test_element(get_element(data, idx))) {
+ continue;
+ }
+ offsets[new_size++] = idx;
+ }
+ }
+ } else {
+ if (nullmap == nullptr) {
+ for (int i = 0; i < number; i++) {
+ if (!bloom_filter.test_element(get_element(data, i))) {
+ continue;
}
+ offsets[new_size++] = i;
}
} else {
- if (nullmap == nullptr) {
- for (int i = 0; i < number; i++) {
- if (!find_olap_engine(bloom_filter, data + element_size *
i)) {
- continue;
- }
- offsets[new_size++] = i;
+ for (int i = 0; i < number; i++) {
+ if (nullmap[i]) {
+ continue;
}
- } else {
- for (int i = 0; i < number; i++) {
- if (nullmap[i]) {
- continue;
- }
- if (!find_olap_engine(bloom_filter, data + element_size *
i)) {
- continue;
- }
- offsets[new_size++] = i;
+ if (!bloom_filter.test_element(get_element(data, i))) {
+ continue;
}
+ offsets[new_size++] = i;
}
}
- return new_size;
}
-};
+ return new_size;
+}
template <class T>
-struct CommonFindOp : BaseOp {
+struct CommonFindOp {
uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter,
const char* data,
const uint8* nullmap, uint16_t* offsets,
int number,
const bool is_parse_column) {
- return find_batch_olap_engine_with_element_size(bloom_filter, data,
nullmap, offsets,
- number,
is_parse_column, sizeof(T));
+ return find_batch_olap<T>(bloom_filter, data, nullmap, offsets,
number, is_parse_column);
}
void insert_batch(BloomFilterAdaptor& bloom_filter, const
vectorized::ColumnPtr& column,
@@ -333,22 +337,11 @@ struct CommonFindOp : BaseOp {
void insert(BloomFilterAdaptor& bloom_filter, const void* data) const {
bloom_filter.add_element(*(T*)data);
}
-
- bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void*
data) const override {
- return bloom_filter.test_element(*(T*)data);
- }
};
-struct StringFindOp : public BaseOp {
- uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter,
const char* data,
- const uint8* nullmap, uint16_t* offsets,
int number,
- const bool is_parse_column) {
- return find_batch_olap_engine_with_element_size(bloom_filter, data,
nullmap, offsets,
- number,
is_parse_column, sizeof(StringRef));
- }
-
- static void insert_batch(BloomFilterAdaptor& bloom_filter, const
vectorized::ColumnPtr& column,
- size_t start) {
+struct StringFindOp : CommonFindOp<StringRef> {
+ void insert_batch(BloomFilterAdaptor& bloom_filter, const
vectorized::ColumnPtr& column,
+ size_t start) {
if (column->is_nullable()) {
const auto* nullable = assert_cast<const
vectorized::ColumnNullable*>(column.get());
const auto& col =
@@ -370,8 +363,8 @@ struct StringFindOp : public BaseOp {
}
}
- static void find_batch(const BloomFilterAdaptor& bloom_filter,
- const vectorized::ColumnPtr& column, uint8_t*
results) {
+ void find_batch(const BloomFilterAdaptor& bloom_filter, const
vectorized::ColumnPtr& column,
+ uint8_t* results) {
if (column->is_nullable()) {
const auto* nullable = assert_cast<const
vectorized::ColumnNullable*>(column.get());
const auto& col =
@@ -394,33 +387,16 @@ struct StringFindOp : public BaseOp {
}
}
}
-
- static void insert(BloomFilterAdaptor& bloom_filter, const void* data) {
- const auto* value = reinterpret_cast<const StringRef*>(data);
- if (value) {
- bloom_filter.add_bytes(value->data, value->size);
- }
- }
-
- bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void*
data) const override {
- const auto* value = reinterpret_cast<const StringRef*>(data);
- return bloom_filter.test(*value);
- }
};
// We do not need to check whether data is empty, because null will not appear
// when the filter is used by the storage engine
struct FixedStringFindOp : public StringFindOp {
- bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
- const void* input_data) const override {
- const auto* value = reinterpret_cast<const StringRef*>(input_data);
- int64_t size = value->size;
- const char* data = value->data;
- // CHAR type may pad the tail with \0, need to trim
- while (size > 0 && data[size - 1] == '\0') {
- size--;
- }
- return bloom_filter.test(StringRef(value->data, size));
+ uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter,
const char* data,
+ const uint8* nullmap, uint16_t* offsets,
int number,
+ const bool is_parse_column) {
+ return find_batch_olap<StringRef, true>(bloom_filter, data, nullmap,
offsets, number,
+ is_parse_column);
}
};
diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h
index 187d0d757e9..97078c11757 100644
--- a/be/src/exprs/runtime_filter.h
+++ b/be/src/exprs/runtime_filter.h
@@ -244,6 +244,7 @@ public:
void copy_from_other(IRuntimeFilter* other);
+ // insert data to build filter
void insert_batch(vectorized::ColumnPtr column, size_t start);
// publish filter
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]