This is an automated email from the ASF dual-hosted git repository.
wangbo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 48222f1fb0 [fix](storage)bloom filter support ColumnDict (#9167)
48222f1fb0 is described below
commit 48222f1fb0381dc3e7182f3ea6a97a4ff80a02a4
Author: wangbo <[email protected]>
AuthorDate: Thu Apr 28 20:03:26 2022 +0800
[fix](storage)bloom filter support ColumnDict (#9167)
bloom filter support ColumnDict(#9167)
---
be/src/exprs/bloomfilter_predicate.h | 28 ++++++++++++++++-------
be/src/olap/bloom_filter_predicate.h | 28 +++++++++++++++++++----
be/src/vec/columns/column_dictionary.h | 42 ++++++++++++++++++++++++++++++++--
3 files changed, 84 insertions(+), 14 deletions(-)
diff --git a/be/src/exprs/bloomfilter_predicate.h
b/be/src/exprs/bloomfilter_predicate.h
index d9ce266c91..ae1a8945e8 100644
--- a/be/src/exprs/bloomfilter_predicate.h
+++ b/be/src/exprs/bloomfilter_predicate.h
@@ -61,8 +61,9 @@ public:
size_t size() { return _bloom_filter->directory().size; }
- bool test_bytes(const char* data, size_t len) const {
- return _bloom_filter->find(Slice(data, len));
+ template <typename T>
+ bool test(T data) const {
+ return _bloom_filter->find(data);
}
void add_bytes(const char* data, size_t len) {
_bloom_filter->insert(Slice(data, len)); }
@@ -83,6 +84,7 @@ public:
virtual void insert(const void* data) = 0;
virtual bool find(const void* data) const = 0;
virtual bool find_olap_engine(const void* data) const = 0;
+ virtual bool find_uint32_t(uint32_t data) const = 0;
virtual Status merge(IBloomFilterFuncBase* bloomfilter_func) = 0;
virtual Status assign(const char* data, int len) = 0;
@@ -171,12 +173,15 @@ struct CommonFindOp {
bloom_filter.add_bytes((char*)data, sizeof(T));
}
ALWAYS_INLINE bool find(const BloomFilterAdaptor& bloom_filter, const
void* data) const {
- return bloom_filter.test_bytes((char*)data, sizeof(T));
+ return bloom_filter.test(Slice((char*)data, sizeof(T)));
}
ALWAYS_INLINE bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
const void* data) const {
return this->find(bloom_filter, data);
}
+ ALWAYS_INLINE bool find(const BloomFilterAdaptor& bloom_filter, uint32_t
data) const {
+ return bloom_filter.test(data);
+ }
};
template <class BloomFilterAdaptor>
@@ -192,12 +197,15 @@ struct StringFindOp {
if (value == nullptr) {
return false;
}
- return bloom_filter.test_bytes(value->ptr, value->len);
+ return bloom_filter.test(Slice(value->ptr, value->len));
}
ALWAYS_INLINE bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
const void* data) const {
return StringFindOp::find(bloom_filter, data);
}
+ ALWAYS_INLINE bool find(const BloomFilterAdaptor& bloom_filter, uint32_t
data) const {
+ return bloom_filter.test(data);
+ }
};
// We do not need to judge whether data is empty, because null will not appear
@@ -210,7 +218,7 @@ struct FixedStringFindOp : public
StringFindOp<BloomFilterAdaptor> {
int64_t size = value->len;
char* data = value->ptr;
while (size > 0 && data[size - 1] == '\0') size--;
- return bloom_filter.test_bytes(value->ptr, size);
+ return bloom_filter.test(Slice(value->ptr, size));
}
};
@@ -219,7 +227,7 @@ struct DateTimeFindOp : public CommonFindOp<DateTimeValue,
BloomFilterAdaptor> {
bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void*
data) const {
DateTimeValue value;
value.from_olap_datetime(*reinterpret_cast<const uint64_t*>(data));
- return bloom_filter.test_bytes((char*)&value, sizeof(DateTimeValue));
+ return bloom_filter.test(Slice((char*)&value, sizeof(DateTimeValue)));
}
};
@@ -238,7 +246,7 @@ struct DateFindOp : public CommonFindOp<DateTimeValue,
BloomFilterAdaptor> {
char data_bytes[sizeof(date_value)];
memcpy(&data_bytes, &date_value, sizeof(date_value));
- return bloom_filter.test_bytes(data_bytes, sizeof(DateTimeValue));
+ return bloom_filter.test(Slice(data_bytes, sizeof(DateTimeValue)));
}
};
@@ -254,7 +262,7 @@ struct DecimalV2FindOp : public
CommonFindOp<DecimalV2Value, BloomFilterAdaptor>
constexpr int decimal_value_sz = sizeof(DecimalV2Value);
char data_bytes[decimal_value_sz];
memcpy(&data_bytes, &value, decimal_value_sz);
- return bloom_filter.test_bytes(data_bytes, decimal_value_sz);
+ return bloom_filter.test(Slice(data_bytes, decimal_value_sz));
}
};
@@ -314,6 +322,10 @@ public:
bool find_olap_engine(const void* data) const override {
return dummy.find_olap_engine(*this->_bloom_filter, data);
}
+
+ bool find_uint32_t(uint32_t data) const override {
+ return dummy.find(*this->_bloom_filter, data);
+ }
private:
typename BloomFilterTypeTraits<type, BloomFilterAdaptor>::FindOp dummy;
diff --git a/be/src/olap/bloom_filter_predicate.h
b/be/src/olap/bloom_filter_predicate.h
index 8094b7b9b0..1605248229 100644
--- a/be/src/olap/bloom_filter_predicate.h
+++ b/be/src/olap/bloom_filter_predicate.h
@@ -30,6 +30,7 @@
#include "vec/columns/column_vector.h"
#include "vec/columns/predicate_column.h"
#include "vec/utils/util.hpp"
+#include "vec/columns/column_dictionary.h"
namespace doris {
@@ -115,14 +116,33 @@ void
BloomFilterColumnPredicate<T>::evaluate(vectorized::IColumn& column, uint16
if (column.is_nullable()) {
auto* nullable_col =
vectorized::check_and_get_column<vectorized::ColumnNullable>(column);
auto& null_map_data = nullable_col->get_null_map_column().get_data();
- auto* pred_col =
vectorized::check_and_get_column<vectorized::PredicateColumnType<FT>>(
+ // deal ColumnDict
+ if (nullable_col->get_nested_column().is_column_dictionary()) {
+ auto* dict_col =
vectorized::check_and_get_column<vectorized::ColumnDictI32>(nullable_col->get_nested_column());
+
const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values();
+ for (uint16_t i = 0; i < *size; i++) {
+ uint16_t idx = sel[i];
+ sel[new_size] = idx;
+ new_size += (!null_map_data[idx]) &&
_specific_filter->find_uint32_t(dict_col->get_hash_value(idx));
+ }
+ } else {
+ auto* pred_col =
vectorized::check_and_get_column<vectorized::PredicateColumnType<FT>>(
nullable_col->get_nested_column());
- auto& pred_col_data = pred_col->get_data();
+ auto& pred_col_data = pred_col->get_data();
+ for (uint16_t i = 0; i < *size; i++) {
+ uint16_t idx = sel[i];
+ sel[new_size] = idx;
+ const auto* cell_value = reinterpret_cast<const
void*>(&(pred_col_data[idx]));
+ new_size += (!null_map_data[idx]) &&
_specific_filter->find_olap_engine(cell_value);
+ }
+ }
+ } else if (column.is_column_dictionary()) {
+ auto* dict_col =
vectorized::check_and_get_column<vectorized::ColumnDictI32>(column);
+
const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values();
for (uint16_t i = 0; i < *size; i++) {
uint16_t idx = sel[i];
sel[new_size] = idx;
- const auto* cell_value = reinterpret_cast<const
void*>(&(pred_col_data[idx]));
- new_size += (!null_map_data[idx]) &&
_specific_filter->find_olap_engine(cell_value);
+ new_size +=
_specific_filter->find_uint32_t(dict_col->get_hash_value(idx));
}
} else {
auto* pred_col =
diff --git a/be/src/vec/columns/column_dictionary.h
b/be/src/vec/columns/column_dictionary.h
index dab8d81ee3..50f764a447 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -67,6 +67,7 @@ public:
using value_type = T;
using Container = PaddedPODArray<value_type>;
using DictContainer = PaddedPODArray<StringValue>;
+ using HashValueContainer = PaddedPODArray<uint32_t>; // used for bloom
filter
bool is_column_dictionary() const override { return true; }
@@ -106,6 +107,7 @@ public:
void clear() override {
_codes.clear();
_dict_code_converted = false;
+ _dict.clear_hash_values();
}
// TODO: Make dict memory usage more precise
@@ -251,6 +253,14 @@ public:
return _dict.find_code_by_bound(value, greater, eq);
}
+ void generate_hash_values() {
+ _dict.generate_hash_values();
+ }
+
+ uint32_t get_hash_value(uint32_t idx) const {
+ return _dict.get_hash_value(_codes[idx]);
+ }
+
phmap::flat_hash_set<int32_t> find_codes(
const phmap::flat_hash_set<StringValue>& values) const {
return _dict.find_codes(values);
@@ -297,6 +307,23 @@ public:
return -1;
}
+ inline StringValue& get_value(T code) { return _dict_data[code]; }
+
+ inline void generate_hash_values() {
+ if (_hash_values.size() == 0) {
+ _hash_values.resize(_dict_data.size());
+ for (size_t i = 0; i < _dict_data.size(); i++) {
+ auto& sv = _dict_data[i];
+ uint32_t hash_val = HashUtil::murmur_hash3_32(sv.ptr,
sv.len, 0);
+ _hash_values[i] = hash_val;
+ }
+ }
+ }
+
+ inline uint32_t get_hash_value(T code) const {
+ return _hash_values[code];
+ }
+
// For > , code takes upper_bound - 1; For >= , code takes upper_bound
// For < , code takes upper_bound; For <=, code takes upper_bound - 1
// For example a sorted dict: <'b',0> <'c',1> <'d',2>
@@ -336,12 +363,15 @@ public:
return code_set;
}
- StringValue& get_value(T code) { return _dict_data[code]; }
-
void clear() {
_dict_data.clear();
_inverted_index.clear();
_code_convert_map.clear();
+ _hash_values.clear();
+ }
+
+ void clear_hash_values() {
+ _hash_values.clear();
}
void sort() {
@@ -365,6 +395,12 @@ public:
phmap::flat_hash_map<StringValue, T, StringValue::HashOfStringValue>
_inverted_index;
// data page code -> sorted dict code, only used for range comparison
predicate
phmap::flat_hash_map<T, T> _code_convert_map;
+ // hash value of origin string , used for bloom filter
+ // It's a trade-off of space for performance
+ // But in TPC-DS 1GB q60,we see no significant improvement.
+ // This may because the magnitude of the data is not large enough(in
q60, only about 80k rows data is filtered for largest table)
+ // So we may need more test here.
+ HashValueContainer _hash_values;
};
private:
@@ -381,4 +417,6 @@ template class ColumnDictionary<uint16_t>;
template class ColumnDictionary<uint32_t>;
template class ColumnDictionary<int32_t>;
+using ColumnDictI32 = vectorized::ColumnDictionary<doris::vectorized::Int32>;
+
} // namespace doris::vectorized
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]