This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 6b6682cd96 [Enhancement](Expr) Opt In Set by small size fixed
container to improve performance. (#17976)
6b6682cd96 is described below
commit 6b6682cd961c0a6fbffb2ec6988c40ee042aa8a4
Author: Qi Chen <[email protected]>
AuthorDate: Tue Mar 28 23:10:39 2023 +0800
[Enhancement](Expr) Opt In Set by small size fixed container to improve
performance. (#17976)
---
be/src/exprs/create_predicate_function.h | 115 ++++-
be/src/exprs/hybrid_set.h | 492 ++++++++++++++++++---
be/src/olap/in_list_predicate.h | 271 +++++++++---
be/src/olap/predicate_creator.h | 13 +-
be/src/runtime/primitive_type.h | 3 +
be/src/vec/columns/column_dictionary.h | 9 +-
.../exec/format/parquet/vparquet_group_reader.cpp | 4 +-
be/src/vec/exprs/vdirect_in_predicate.h | 30 +-
be/src/vec/functions/in.h | 46 +-
9 files changed, 778 insertions(+), 205 deletions(-)
diff --git a/be/src/exprs/create_predicate_function.h
b/be/src/exprs/create_predicate_function.h
index cbb8a45a3b..5a6d66d1f6 100644
--- a/be/src/exprs/create_predicate_function.h
+++ b/be/src/exprs/create_predicate_function.h
@@ -30,7 +30,7 @@ namespace doris {
class MinmaxFunctionTraits {
public:
using BasePtr = MinMaxFuncBase*;
- template <PrimitiveType type>
+ template <PrimitiveType type, size_t N>
static BasePtr get_function() {
return new MinMaxNumFunc<typename
PrimitiveTypeTraits<type>::CppType>();
}
@@ -39,19 +39,29 @@ public:
class HybridSetTraits {
public:
using BasePtr = HybridSetBase*;
- template <PrimitiveType type>
+ template <PrimitiveType type, size_t N>
static BasePtr get_function() {
using CppType = typename PrimitiveTypeTraits<type>::CppType;
- using Set =
- std::conditional_t<std::is_same_v<CppType, StringRef>,
StringSet, HybridSet<type>>;
- return new Set();
+ if constexpr (N >= 1 && N <= 12) {
+ using Set = std::conditional_t<
+ std::is_same_v<CppType, StringRef>, StringSet<>,
+ HybridSet<type,
+ FixedContainer<typename
VecPrimitiveTypeTraits<type>::CppType, N>>>;
+ return new Set();
+ } else {
+ using Set = std::conditional_t<
+ std::is_same_v<CppType, StringRef>, StringSet<>,
+ HybridSet<type,
+ DynamicContainer<typename
VecPrimitiveTypeTraits<type>::CppType>>>;
+ return new Set();
+ }
}
};
class BloomFilterTraits {
public:
using BasePtr = BloomFilterFuncBase*;
- template <PrimitiveType type>
+ template <PrimitiveType type, size_t N>
static BasePtr get_function() {
return new BloomFilterFunc<type>();
}
@@ -60,7 +70,7 @@ public:
class BitmapFilterTraits {
public:
using BasePtr = BitmapFilterFuncBase*;
- template <PrimitiveType type>
+ template <PrimitiveType type, size_t N>
static BasePtr get_function() {
return new BitmapFilterFunc<type>();
}
@@ -69,9 +79,9 @@ public:
template <class Traits>
class PredicateFunctionCreator {
public:
- template <PrimitiveType type>
+ template <PrimitiveType type, size_t N = 0>
static typename Traits::BasePtr create() {
- return Traits::template get_function<type>();
+ return Traits::template get_function<type, N>();
}
};
@@ -94,20 +104,20 @@ public:
M(TYPE_DECIMAL64) \
M(TYPE_DECIMAL128I)
-template <class Traits>
+template <class Traits, size_t N = 0>
typename Traits::BasePtr create_predicate_function(PrimitiveType type) {
using Creator = PredicateFunctionCreator<Traits>;
switch (type) {
case TYPE_BOOLEAN: {
- return Creator::template create<TYPE_BOOLEAN>();
+ return Creator::template create<TYPE_BOOLEAN, N>();
}
case TYPE_DECIMALV2: {
- return Creator::template create<TYPE_DECIMALV2>();
+ return Creator::template create<TYPE_DECIMALV2, N>();
}
-#define M(NAME) \
- case NAME: { \
- return Creator::template create<NAME>(); \
+#define M(NAME) \
+ case NAME: { \
+ return Creator::template create<NAME, N>(); \
}
APPLY_FOR_PRIMTYPE(M)
#undef M
@@ -142,8 +152,78 @@ inline auto create_minmax_filter(PrimitiveType type) {
return create_predicate_function<MinmaxFunctionTraits>(type);
}
+template <size_t N = 0>
inline auto create_set(PrimitiveType type) {
- return create_predicate_function<HybridSetTraits>(type);
+ return create_predicate_function<HybridSetTraits, N>(type);
+}
+
+inline auto create_set(PrimitiveType type, size_t size) {
+ if (size == 1) {
+ return create_set<1>(type);
+ } else if (size == 2) {
+ return create_set<2>(type);
+ } else if (size == 3) {
+ return create_set<3>(type);
+ } else if (size == 4) {
+ return create_set<4>(type);
+ } else if (size == 5) {
+ return create_set<5>(type);
+ } else if (size == 6) {
+ return create_set<6>(type);
+ } else if (size == 7) {
+ return create_set<7>(type);
+ } else if (size == 8) {
+ return create_set<8>(type);
+ } else if (size == 9) {
+ return create_set<9>(type);
+ } else if (size == 10) {
+ return create_set<10>(type);
+ } else if (size == 11) {
+ return create_set<11>(type);
+ } else if (size == 12) {
+ return create_set<12>(type);
+ } else {
+ return create_set(type);
+ }
+}
+
+template <size_t N = 0>
+inline HybridSetBase* create_string_value_set() {
+ if constexpr (N >= 1 && N <= 12) {
+ return new StringValueSet<FixedContainer<StringRef, N>>();
+ } else {
+ return new StringValueSet();
+ }
+}
+
+inline HybridSetBase* create_string_value_set(size_t size) {
+ if (size == 1) {
+ return create_string_value_set<1>();
+ } else if (size == 2) {
+ return create_string_value_set<2>();
+ } else if (size == 3) {
+ return create_string_value_set<3>();
+ } else if (size == 4) {
+ return create_string_value_set<4>();
+ } else if (size == 5) {
+ return create_string_value_set<5>();
+ } else if (size == 6) {
+ return create_string_value_set<6>();
+ } else if (size == 7) {
+ return create_string_value_set<7>();
+ } else if (size == 8) {
+ return create_string_value_set<8>();
+ } else if (size == 9) {
+ return create_string_value_set<9>();
+ } else if (size == 10) {
+ return create_string_value_set<10>();
+ } else if (size == 11) {
+ return create_string_value_set<11>();
+ } else if (size == 12) {
+ return create_string_value_set<12>();
+ } else {
+ return create_string_value_set();
+ }
}
inline auto create_bloom_filter(PrimitiveType type) {
@@ -183,7 +263,8 @@ template <PrimitiveType PT>
ColumnPredicate* create_olap_column_predicate(uint32_t column_id,
const
std::shared_ptr<HybridSetBase>& filter, int,
const TabletColumn* column =
nullptr) {
- return new InListPredicateBase<PT, PredicateType::IN_LIST>(column_id,
filter, column->length());
+ return create_in_list_predicate<PT, PredicateType::IN_LIST>(column_id,
filter,
+
column->length());
}
template <typename T>
diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h
index 25606b226e..7b7f041a81 100644
--- a/be/src/exprs/hybrid_set.h
+++ b/be/src/exprs/hybrid_set.h
@@ -23,10 +23,183 @@
#include "runtime/decimalv2_value.h"
#include "runtime/define_primitive_type.h"
#include "runtime/primitive_type.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
#include "vec/common/string_ref.h"
namespace doris {
+/**
+ * Fix Container can use simd to improve performance. 1 <= N <= 12 can be
improved performance by test.
+ * @tparam T Element Type
+ * @tparam N Fixed Number
+ */
+template <typename T, size_t N>
+class FixedContainer {
+public:
+ using Self = FixedContainer;
+ using ElementType = T;
+
+ class Iterator;
+
+ FixedContainer() : _size(0) { static_assert(N >= 1 && N <= 12); }
+
+ ~FixedContainer() = default;
+
+ void insert(const T& value) {
+ DCHECK(_size < N);
+ _data[_size++] = value;
+ }
+
+ void insert(Iterator begin, Iterator end) {
+ for (auto iter = begin; iter != end; ++iter) {
+ DCHECK(_size < N);
+ _data[_size++] = (*iter);
+ }
+ }
+
+ // Use '|' instead of '||' has better performance by test.
+ bool find(const T& value) const {
+ if constexpr (N == 1) {
+ return (value == _data[0]);
+ }
+ if constexpr (N == 2) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1]);
+ }
+ if constexpr (N == 3) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]);
+ }
+ if constexpr (N == 4) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3]);
+ }
+ if constexpr (N == 5) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3])
|
+ (uint8_t)(value == _data[4]);
+ }
+ if constexpr (N == 6) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3])
|
+ (uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5]);
+ }
+ if constexpr (N == 7) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3])
|
+ (uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5])
|
+ (uint8_t)(value == _data[6]);
+ }
+ if constexpr (N == 8) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3])
|
+ (uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5])
|
+ (uint8_t)(value == _data[6]) | (uint8_t)(value == _data[7]);
+ }
+ if constexpr (N == 9) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3])
|
+ (uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5])
|
+ (uint8_t)(value == _data[6]) | (uint8_t)(value == _data[7])
|
+ (uint8_t)(value == _data[8]);
+ }
+ if constexpr (N == 10) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3])
|
+ (uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5])
|
+ (uint8_t)(value == _data[6]) | (uint8_t)(value == _data[7])
|
+ (uint8_t)(value == _data[8]) | (uint8_t)(value == _data[9]);
+ }
+ if constexpr (N == 11) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3])
|
+ (uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5])
|
+ (uint8_t)(value == _data[6]) | (uint8_t)(value == _data[7])
|
+ (uint8_t)(value == _data[8]) | (uint8_t)(value == _data[9])
|
+ (uint8_t)(value == _data[10]);
+ }
+ if constexpr (N == 12) {
+ return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1])
|
+ (uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3])
|
+ (uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5])
|
+ (uint8_t)(value == _data[6]) | (uint8_t)(value == _data[7])
|
+ (uint8_t)(value == _data[8]) | (uint8_t)(value == _data[9])
|
+ (uint8_t)(value == _data[10]) | (uint8_t)(value ==
_data[11]);
+ }
+ CHECK(false) << "unreachable path";
+ return false;
+ }
+
+ size_t size() const { return _size; }
+
+ class Iterator {
+ public:
+ explicit Iterator(std::array<T, N>& data, size_t index) : _data(data),
_index(index) {}
+ Iterator& operator++() {
+ ++_index;
+ return *this;
+ }
+ Iterator operator++(int) {
+ Iterator ret_val = *this;
+ ++(*this);
+ return ret_val;
+ }
+ bool operator==(Iterator other) const { return _index == other._index;
}
+ bool operator!=(Iterator other) const { return !(*this == other); }
+ T& operator*() const { return _data[_index]; }
+
+ T* operator->() const { return &operator*(); }
+
+ // iterator traits
+ using iterator_category = std::forward_iterator_tag;
+ using difference_type = std::ptrdiff_t;
+ using value_type = T;
+ using pointer = T*;
+ using reference = T&;
+
+ private:
+ std::array<T, N>& _data;
+ size_t _index;
+ };
+ Iterator begin() { return Iterator(_data, 0); }
+ Iterator end() { return Iterator(_data, _size); }
+
+private:
+ std::array<T, N> _data;
+ size_t _size;
+};
+
+/**
+ * Dynamic Container uses phmap::flat_hash_set.
+ * @tparam T Element Type
+ */
+template <typename T>
+class DynamicContainer {
+public:
+ using Self = DynamicContainer;
+ using Iterator = typename phmap::flat_hash_set<T>::iterator;
+ using ElementType = T;
+
+ DynamicContainer() = default;
+ ~DynamicContainer() = default;
+
+ void insert(const T& value) { _set.insert(value); }
+
+ void insert(Iterator begin, Iterator end) { _set.insert(begin, end); }
+
+ bool find(const T& value) const { return _set.contains(value); }
+
+ Iterator begin() { return _set.begin(); }
+
+ Iterator end() { return _set.end(); }
+
+ size_t size() const { return _set.size(); }
+
+private:
+ phmap::flat_hash_set<T> _set;
+};
+
+// TODO Maybe change void* parameter to template parameter better.
class HybridSetBase {
public:
HybridSetBase() = default;
@@ -37,18 +210,47 @@ public:
virtual void insert_fixed_len(const char* data, const int* offsets, int
number) = 0;
- virtual void insert(HybridSetBase* set) = 0;
+ virtual void insert(HybridSetBase* set) {
+ HybridSetBase::IteratorBase* iter = set->begin();
+ while (iter->has_next()) {
+ const void* value = iter->get_value();
+ insert(value);
+ iter->next();
+ }
+ }
virtual int size() = 0;
- virtual bool find(const void* data) = 0;
+ virtual bool find(const void* data) const = 0;
// use in vectorize execute engine
- virtual bool find(const void* data, size_t) = 0;
+ virtual bool find(const void* data, size_t) const = 0;
- virtual void find_fixed_len(const char* data, const uint8* nullmap, int
number,
- uint8* results) {
+ virtual void find_fixed_len(const char* __restrict data, const uint8*
__restrict null_map,
+ int number, uint8* __restrict results) {
LOG(FATAL) << "HybridSetBase not support find_fixed_len";
}
+ virtual void find_batch(const doris::vectorized::IColumn& column, size_t
rows,
+ doris::vectorized::ColumnUInt8::Container&
results) {
+ LOG(FATAL) << "HybridSetBase not support find_batch";
+ }
+
+ virtual void find_batch_negative(const doris::vectorized::IColumn& column,
size_t rows,
+
doris::vectorized::ColumnUInt8::Container& results) {
+ LOG(FATAL) << "HybridSetBase not support find_batch_negative";
+ }
+
+ virtual void find_batch_nullable(const doris::vectorized::IColumn& column,
size_t rows,
+ const doris::vectorized::NullMap&
null_map,
+
doris::vectorized::ColumnUInt8::Container& results) {
+ LOG(FATAL) << "HybridSetBase not support find_batch_nullable";
+ }
+
+ virtual void find_batch_nullable_negative(const
doris::vectorized::IColumn& column, size_t rows,
+ const
doris::vectorized::NullMap& null_map,
+
doris::vectorized::ColumnUInt8::Container& results) {
+ LOG(FATAL) << "HybridSetBase not support find_batch_nullable_negative";
+ }
+
class IteratorBase {
public:
IteratorBase() = default;
@@ -61,10 +263,34 @@ public:
virtual IteratorBase* begin() = 0;
};
-template <PrimitiveType T>
+template <typename Type>
+const Type* check_and_get_hybrid_set(const HybridSetBase& column) {
+ return typeid_cast<const Type*>(&column);
+}
+
+template <typename Type>
+const Type* check_and_get_hybrid_set(const HybridSetBase* column) {
+ return typeid_cast<const Type*>(column);
+}
+
+template <typename Type>
+bool check_hybrid_set(const HybridSetBase& column) {
+ return check_and_get_hybrid_set<Type>(&column);
+}
+
+template <typename Type>
+bool check_hybrid_set(const HybridSetBase* column) {
+ return check_and_get_hybrid_set<Type>(column);
+}
+
+template <PrimitiveType T,
+ typename _ContainerType = DynamicContainer<typename
VecPrimitiveTypeTraits<T>::CppType>,
+ typename _ColumnType = typename
VecPrimitiveTypeTraits<T>::ColumnType>
class HybridSet : public HybridSetBase {
public:
- using CppType = typename VecPrimitiveTypeTraits<T>::CppType;
+ using ContainerType = _ContainerType;
+ using ElementType = typename ContainerType::ElementType;
+ using ColumnType = _ColumnType;
HybridSet() = default;
@@ -75,57 +301,98 @@ public:
return;
}
- if constexpr (sizeof(CppType) >= 16) {
+ if constexpr (sizeof(ElementType) >= 16) {
// for large int, it will core dump with no memcpy
- CppType value;
- memcpy(&value, data, sizeof(CppType));
+ ElementType value;
+ memcpy(&value, data, sizeof(ElementType));
_set.insert(value);
} else {
- _set.insert(*reinterpret_cast<const CppType*>(data));
+ _set.insert(*reinterpret_cast<const ElementType*>(data));
}
}
void insert(void* data, size_t) override { insert(data); }
void insert_fixed_len(const char* data, const int* offsets, int number)
override {
for (int i = 0; i < number; i++) {
- insert((void*)((CppType*)data + offsets[i]));
+ insert((void*)((ElementType*)data + offsets[i]));
}
}
- void insert(HybridSetBase* set) override {
- HybridSet<T>* hybrid_set = reinterpret_cast<HybridSet<T>*>(set);
- _set.insert(hybrid_set->_set.begin(), hybrid_set->_set.end());
- }
-
int size() override { return _set.size(); }
- bool find(const void* data) override {
+ bool find(const void* data) const override {
if (data == nullptr) {
return false;
}
- auto it = _set.find(*reinterpret_cast<const CppType*>(data));
- return !(it == _set.end());
+ return _set.find(*reinterpret_cast<const ElementType*>(data));
}
- bool find(const void* data, size_t) override { return find(data); }
+ bool find(const void* data, size_t) const override { return find(data); }
- void find_fixed_len(const char* data, const uint8* nullmap, int number,
- uint8* results) override {
- for (int i = 0; i < number; i++) {
- if (nullmap != nullptr && nullmap[i]) {
- results[i] = false;
- } else {
- results[i] = _set.count(*((CppType*)data + i));
+ void find_fixed_len(const char* __restrict data, const uint8* __restrict
null_map, int number,
+ uint8* __restrict results) override {
+ ElementType* value = (ElementType*)data;
+ if (null_map == nullptr) {
+ for (int i = 0; i < number; i++) {
+ results[i] = _set.find(value[i]);
+ }
+ } else {
+ for (int i = 0; i < number; i++) {
+ results[i] = _set.find(value[i]) & !null_map[i];
+ }
+ }
+ }
+
+ void find_batch(const doris::vectorized::IColumn& column, size_t rows,
+ doris::vectorized::ColumnUInt8::Container& results)
override {
+ _find_batch<false, false>(column, rows, nullptr, results);
+ }
+
+ void find_batch_negative(const doris::vectorized::IColumn& column, size_t
rows,
+ doris::vectorized::ColumnUInt8::Container&
results) override {
+ _find_batch<false, true>(column, rows, nullptr, results);
+ }
+
+ void find_batch_nullable(const doris::vectorized::IColumn& column, size_t
rows,
+ const doris::vectorized::NullMap& null_map,
+ doris::vectorized::ColumnUInt8::Container&
results) override {
+ _find_batch<true, false>(column, rows, &null_map, results);
+ }
+
+ void find_batch_nullable_negative(const doris::vectorized::IColumn&
column, size_t rows,
+ const doris::vectorized::NullMap&
null_map,
+
doris::vectorized::ColumnUInt8::Container& results) override {
+ _find_batch<true, true>(column, rows, &null_map, results);
+ }
+
+ template <bool is_nullable, bool is_negative>
+ void _find_batch(const doris::vectorized::IColumn& column, size_t rows,
+ const doris::vectorized::NullMap* null_map,
+ doris::vectorized::ColumnUInt8::Container& results) {
+ auto& col = assert_cast<const ColumnType&>(column);
+ const auto* __restrict data = (ElementType*)col.get_data().data();
+ const uint8_t* __restrict null_map_data;
+ if constexpr (is_nullable) {
+ null_map_data = null_map->data();
+ }
+ auto* __restrict result_data = results.data();
+ for (size_t i = 0; i < rows; ++i) {
+ if constexpr (!is_nullable && !is_negative) {
+ result_data[i] = _set.find(data[i]);
+ } else if constexpr (!is_nullable && is_negative) {
+ result_data[i] = !_set.find(data[i]);
+ } else if constexpr (is_nullable && !is_negative) {
+ result_data[i] = _set.find(data[i]) & (!null_map_data[i]);
+ } else { // (is_nullable && is_negative)
+ result_data[i] = !(_set.find(data[i]) & (!null_map_data[i]));
}
}
}
- template <class _iT>
class Iterator : public IteratorBase {
public:
- Iterator(typename phmap::flat_hash_set<_iT>::iterator begin,
- typename phmap::flat_hash_set<_iT>::iterator end)
+ Iterator(typename ContainerType::Iterator begin, typename
ContainerType::Iterator end)
: _begin(begin), _end(end) {}
~Iterator() override = default;
bool has_next() const override { return !(_begin == _end); }
@@ -133,23 +400,26 @@ public:
void next() override { ++_begin; }
private:
- typename phmap::flat_hash_set<_iT>::iterator _begin;
- typename phmap::flat_hash_set<_iT>::iterator _end;
+ typename ContainerType::Iterator _begin;
+ typename ContainerType::Iterator _end;
};
IteratorBase* begin() override {
- return _pool.add(new (std::nothrow) Iterator<CppType>(_set.begin(),
_set.end()));
+ return _pool.add(new (std::nothrow) Iterator(_set.begin(),
_set.end()));
}
- phmap::flat_hash_set<CppType>* get_inner_set() { return &_set; }
+ ContainerType* get_inner_set() { return &_set; }
private:
- phmap::flat_hash_set<CppType> _set;
+ ContainerType _set;
ObjectPool _pool;
};
+template <typename _ContainerType = DynamicContainer<std::string>>
class StringSet : public HybridSetBase {
public:
+ using ContainerType = _ContainerType;
+
StringSet() = default;
~StringSet() override = default;
@@ -173,35 +443,72 @@ public:
LOG(FATAL) << "string set not support insert_fixed_len";
}
- void insert(HybridSetBase* set) override {
- StringSet* string_set = reinterpret_cast<StringSet*>(set);
- _set.insert(string_set->_set.begin(), string_set->_set.end());
- }
-
int size() override { return _set.size(); }
- bool find(const void* data) override {
+ bool find(const void* data) const override {
if (data == nullptr) {
return false;
}
auto* value = reinterpret_cast<const StringRef*>(data);
- std::string_view str_value(const_cast<const char*>(value->data),
value->size);
- auto it = _set.find(str_value);
-
- return !(it == _set.end());
+ std::string str_value(const_cast<const char*>(value->data),
value->size);
+ return _set.find(str_value);
}
- bool find(const void* data, size_t size) override {
+ bool find(const void* data, size_t size) const override {
std::string str_value(reinterpret_cast<const char*>(data), size);
- auto it = _set.find(str_value);
- return !(it == _set.end());
+ return _set.find(str_value);
+ }
+
+ void find_batch(const doris::vectorized::IColumn& column, size_t rows,
+ doris::vectorized::ColumnUInt8::Container& results)
override {
+ _find_batch<false, false>(column, rows, nullptr, results);
+ }
+
+ void find_batch_negative(const doris::vectorized::IColumn& column, size_t
rows,
+ doris::vectorized::ColumnUInt8::Container&
results) override {
+ _find_batch<false, true>(column, rows, nullptr, results);
+ }
+
+ void find_batch_nullable(const doris::vectorized::IColumn& column, size_t
rows,
+ const doris::vectorized::NullMap& null_map,
+ doris::vectorized::ColumnUInt8::Container&
results) override {
+ _find_batch<true, false>(column, rows, &null_map, results);
+ }
+
+ void find_batch_nullable_negative(const doris::vectorized::IColumn&
column, size_t rows,
+ const doris::vectorized::NullMap&
null_map,
+
doris::vectorized::ColumnUInt8::Container& results) override {
+ _find_batch<true, true>(column, rows, &null_map, results);
+ }
+
+ template <bool is_nullable, bool is_negative>
+ void _find_batch(const doris::vectorized::IColumn& column, size_t rows,
+ const doris::vectorized::NullMap* null_map,
+ doris::vectorized::ColumnUInt8::Container& results) {
+ auto& col = assert_cast<const
doris::vectorized::ColumnString&>(column);
+ const uint8_t* __restrict null_map_data;
+ if constexpr (is_nullable) {
+ null_map_data = null_map->data();
+ }
+ auto* __restrict result_data = results.data();
+ for (size_t i = 0; i < rows; ++i) {
+ const auto& string_data = col.get_data_at(i).to_string();
+ if constexpr (!is_nullable && !is_negative) {
+ result_data[i] = _set.find(string_data);
+ } else if constexpr (!is_nullable && is_negative) {
+ result_data[i] = !_set.find(string_data);
+ } else if constexpr (is_nullable && !is_negative) {
+ result_data[i] = _set.find(string_data) & (!null_map_data[i]);
+ } else { // (is_nullable && is_negative)
+ result_data[i] = !(_set.find(string_data) &
(!null_map_data[i]));
+ }
+ }
}
class Iterator : public IteratorBase {
public:
- Iterator(phmap::flat_hash_set<std::string>::iterator begin,
- phmap::flat_hash_set<std::string>::iterator end)
+ Iterator(typename ContainerType::Iterator begin, typename
ContainerType::Iterator end)
: _begin(begin), _end(end) {}
~Iterator() override = default;
bool has_next() const override { return !(_begin == _end); }
@@ -213,8 +520,8 @@ public:
void next() override { ++_begin; }
private:
- typename phmap::flat_hash_set<std::string>::iterator _begin;
- typename phmap::flat_hash_set<std::string>::iterator _end;
+ typename ContainerType::Iterator _begin;
+ typename ContainerType::Iterator _end;
StringRef _value;
};
@@ -222,18 +529,21 @@ public:
return _pool.add(new (std::nothrow) Iterator(_set.begin(),
_set.end()));
}
- phmap::flat_hash_set<std::string>* get_inner_set() { return &_set; }
+ ContainerType* get_inner_set() { return &_set; }
private:
- phmap::flat_hash_set<std::string> _set;
+ ContainerType _set;
ObjectPool _pool;
};
// note: Two difference from StringSet
// 1 StringRef has better comparison performance than std::string
// 2 std::string keeps its own memory, bug StringRef just keeps ptr and len,
so you the caller should manage memory of StringRef
+template <typename _ContainerType = DynamicContainer<StringRef>>
class StringValueSet : public HybridSetBase {
public:
+ using ContainerType = _ContainerType;
+
StringValueSet() = default;
~StringValueSet() override = default;
@@ -257,38 +567,74 @@ public:
LOG(FATAL) << "string set not support insert_fixed_len";
}
- void insert(HybridSetBase* set) override {
- StringValueSet* string_set = reinterpret_cast<StringValueSet*>(set);
- _set.insert(string_set->_set.begin(), string_set->_set.end());
- }
-
int size() override { return _set.size(); }
- bool find(const void* data) override {
+ bool find(const void* data) const override {
if (data == nullptr) {
return false;
}
auto* value = reinterpret_cast<const StringRef*>(data);
- auto it = _set.find(*value);
-
- return !(it == _set.end());
+ return _set.find(*value);
}
- bool find(const void* data, size_t size) override {
+ bool find(const void* data, size_t size) const override {
if (data == nullptr) {
return false;
}
StringRef sv(reinterpret_cast<const char*>(data), size);
- auto it = _set.find(sv);
- return !(it == _set.end());
+ return _set.find(sv);
+ }
+
+ void find_batch(const doris::vectorized::IColumn& column, size_t rows,
+ doris::vectorized::ColumnUInt8::Container& results)
override {
+ _find_batch<false, false>(column, rows, nullptr, results);
+ }
+
+ void find_batch_negative(const doris::vectorized::IColumn& column, size_t
rows,
+ doris::vectorized::ColumnUInt8::Container&
results) override {
+ _find_batch<false, true>(column, rows, nullptr, results);
+ }
+
+ void find_batch_nullable(const doris::vectorized::IColumn& column, size_t
rows,
+ const doris::vectorized::NullMap& null_map,
+ doris::vectorized::ColumnUInt8::Container&
results) override {
+ _find_batch<true, false>(column, rows, &null_map, results);
+ }
+
+ void find_batch_nullable_negative(const doris::vectorized::IColumn&
column, size_t rows,
+ const doris::vectorized::NullMap&
null_map,
+
doris::vectorized::ColumnUInt8::Container& results) override {
+ _find_batch<true, true>(column, rows, &null_map, results);
+ }
+
+ template <bool is_nullable, bool is_negative>
+ void _find_batch(const doris::vectorized::IColumn& column, size_t rows,
+ const doris::vectorized::NullMap* null_map,
+ doris::vectorized::ColumnUInt8::Container& results) {
+ auto& col = assert_cast<const
doris::vectorized::ColumnString&>(column);
+ const uint8_t* __restrict null_map_data;
+ if constexpr (is_nullable) {
+ null_map_data = null_map->data();
+ }
+ auto* __restrict result_data = results.data();
+ for (size_t i = 0; i < rows; ++i) {
+ if constexpr (!is_nullable && !is_negative) {
+ result_data[i] = _set.find(col.get_data_at(i));
+ } else if constexpr (!is_nullable && is_negative) {
+ result_data[i] = !_set.find(col.get_data_at(i));
+ } else if constexpr (is_nullable && !is_negative) {
+ result_data[i] = _set.find(col.get_data_at(i)) &
(!null_map_data[i]);
+ } else { // (is_nullable && is_negative)
+ result_data[i] = !(_set.find(col.get_data_at(i)) &
(!null_map_data[i]));
+ }
+ }
}
class Iterator : public IteratorBase {
public:
- Iterator(phmap::flat_hash_set<StringRef>::iterator begin,
- phmap::flat_hash_set<StringRef>::iterator end)
+ Iterator(typename ContainerType::Iterator begin, typename
ContainerType::Iterator end)
: _begin(begin), _end(end) {}
~Iterator() override = default;
bool has_next() const override { return !(_begin == _end); }
@@ -300,8 +646,8 @@ public:
void next() override { ++_begin; }
private:
- typename phmap::flat_hash_set<StringRef>::iterator _begin;
- typename phmap::flat_hash_set<StringRef>::iterator _end;
+ typename ContainerType::Iterator _begin;
+ typename ContainerType::Iterator _end;
StringRef _value;
};
@@ -309,10 +655,10 @@ public:
return _pool.add(new (std::nothrow) Iterator(_set.begin(),
_set.end()));
}
- phmap::flat_hash_set<StringRef>* get_inner_set() { return &_set; }
+ ContainerType* get_inner_set() { return &_set; }
private:
- phmap::flat_hash_set<StringRef> _set;
+ ContainerType _set;
ObjectPool _pool;
};
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index 182abc25a6..bdb4553b62 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -75,7 +75,13 @@ struct std::equal_to<doris::uint24_t> {
namespace doris {
-template <PrimitiveType Type, PredicateType PT>
+/**
+ * Use HybridSetType can avoid virtual function call in the loop.
+ * @tparam Type
+ * @tparam PT
+ * @tparam HybridSetType
+ */
+template <PrimitiveType Type, PredicateType PT, typename HybridSetType>
class InListPredicateBase : public ColumnPredicate {
public:
using T = typename PredicatePrimitiveTypeTraits<Type>::PredicateFieldType;
@@ -84,7 +90,7 @@ public:
const ConvertFunc& convert, bool is_opposite = false,
const TabletColumn* col = nullptr, MemPool* pool =
nullptr)
: ColumnPredicate(column_id, is_opposite),
- _values(new phmap::flat_hash_set<T>()),
+ _values(new HybridSetType()),
_min_value(type_limit<T>::max()),
_max_value(type_limit<T>::min()) {
for (const auto& condition : conditions) {
@@ -97,7 +103,7 @@ public:
} else {
tmp = convert(condition);
}
- _values->insert(tmp);
+ _values->insert(&tmp);
_update_min_max(tmp);
}
}
@@ -107,50 +113,69 @@ public:
: ColumnPredicate(column_id, false),
_min_value(type_limit<T>::max()),
_max_value(type_limit<T>::min()) {
- using HybridSetType = std::conditional_t<is_string_type(Type),
StringSet, HybridSet<Type>>;
-
CHECK(hybrid_set != nullptr);
if constexpr (is_string_type(Type) || Type == TYPE_DECIMALV2 ||
is_date_type(Type)) {
- _values = new phmap::flat_hash_set<T>();
- auto values = ((HybridSetType*)hybrid_set.get())->get_inner_set();
+ _values = new HybridSetType();
if constexpr (is_string_type(Type)) {
- // values' type is "phmap::flat_hash_set<std::string>"
- for (const std::string& value : *values) {
- StringRef sv = value;
+ HybridSetBase::IteratorBase* iter = hybrid_set->begin();
+ while (iter->has_next()) {
+ const StringRef* value = (const
StringRef*)(iter->get_value());
if constexpr (Type == TYPE_CHAR) {
_temp_datas.push_back("");
- _temp_datas.back().resize(std::max(char_length,
value.size()));
- memcpy(_temp_datas.back().data(), value.data(),
value.size());
- sv = _temp_datas.back();
+ _temp_datas.back().resize(std::max(char_length,
value->size));
+ memcpy(_temp_datas.back().data(), value->data,
value->size);
+ const string& str = _temp_datas.back();
+ _values->insert((void*)str.data(), str.length());
+ } else {
+ _values->insert((void*)value->data, value->size);
}
- _values->insert(sv);
+ iter->next();
}
} else if constexpr (Type == TYPE_DECIMALV2) {
- for (auto& value : *values) {
- _values->insert({value.int_value(), value.frac_value()});
+ HybridSetBase::IteratorBase* iter = hybrid_set->begin();
+ while (iter->has_next()) {
+ const DecimalV2Value* value = (const
DecimalV2Value*)(iter->get_value());
+ decimal12_t decimal12 = {value->int_value(),
value->frac_value()};
+ _values->insert(&decimal12);
+ iter->next();
}
} else if constexpr (Type == TYPE_DATE) {
- for (auto& value : *values) {
- _values->insert(value.to_olap_date());
+ HybridSetBase::IteratorBase* iter = hybrid_set->begin();
+ while (iter->has_next()) {
+ const vectorized::VecDateTimeValue* value =
+ (const
vectorized::VecDateTimeValue*)(iter->get_value());
+ uint64_t date = value->to_olap_date();
+ _values->insert(&date);
+ iter->next();
}
} else if constexpr (Type == TYPE_DATETIME) {
- for (auto& value : *values) {
- _values->insert(value.to_olap_datetime());
+ HybridSetBase::IteratorBase* iter = hybrid_set->begin();
+ while (iter->has_next()) {
+ const vectorized::VecDateTimeValue* value =
+ (const
vectorized::VecDateTimeValue*)(iter->get_value());
+ uint64_t date_time = value->to_olap_datetime();
+ _values->insert(&date_time);
+ iter->next();
}
} else {
- CHECK(Type == TYPE_DATETIMEV2 || Type == TYPE_DATEV2);
- for (auto& value : *values) {
- _values->insert(T(value));
+ HybridSetBase::IteratorBase* iter = hybrid_set->begin();
+ while (iter->has_next()) {
+ const void* value = iter->get_value();
+ _values->insert(value);
+ iter->next();
}
+ CHECK(Type == TYPE_DATETIMEV2 || Type == TYPE_DATEV2);
}
} else {
- _values = ((HybridSetType*)hybrid_set.get())->get_inner_set();
+ _values = reinterpret_cast<HybridSetType*>(hybrid_set.get());
}
-
- for (auto& value : *_values) {
- _update_min_max(value);
+ HybridSetBase::IteratorBase* iter = _values->begin();
+ while (iter->has_next()) {
+ const T* value = (const T*)(iter->get_value());
+ _update_min_max(*value);
+ iter->next();
}
}
@@ -173,9 +198,11 @@ public:
*result -= null_bitmap;
}
roaring::Roaring indices;
- for (auto value : *_values) {
+ HybridSetBase::IteratorBase* iter = _values->begin();
+ while (iter->has_next()) {
+ const void* value = iter->get_value();
bool exact_match;
- Status s = iterator->seek_dictionary(&value, &exact_match);
+ Status s = iterator->seek_dictionary(value, &exact_match);
rowid_t seeked_ordinal = iterator->current_ordinal();
if (!s.is<ErrorCode::NOT_FOUND>()) {
if (!s.ok()) {
@@ -187,6 +214,7 @@ public:
indices |= index;
}
}
+ iter->next();
}
if constexpr (PT == PredicateType::IN_LIST) {
@@ -206,12 +234,15 @@ public:
auto column_desc = schema.column(_column_id);
std::string column_name = column_desc->name();
roaring::Roaring indices;
- for (auto value : *_values) {
+ HybridSetBase::IteratorBase* iter = _values->begin();
+ while (iter->has_next()) {
+ const void* value = iter->get_value();
InvertedIndexQueryType query_type =
InvertedIndexQueryType::EQUAL_QUERY;
roaring::Roaring index;
- RETURN_IF_ERROR(iterator->read_from_inverted_index(column_name,
&value, query_type,
+ RETURN_IF_ERROR(iterator->read_from_inverted_index(column_name,
value, query_type,
num_rows,
&index));
indices |= index;
+ iter->next();
}
if constexpr (PT == PredicateType::IN_LIST) {
*result &= indices;
@@ -226,15 +257,15 @@ public:
if (column.is_nullable()) {
auto* nullable_col =
vectorized::check_and_get_column<vectorized::ColumnNullable>(column);
- auto& null_bitmap = reinterpret_cast<const
vectorized::ColumnUInt8&>(
- nullable_col->get_null_map_column())
- .get_data();
+ auto& null_map = reinterpret_cast<const vectorized::ColumnUInt8&>(
+ nullable_col->get_null_map_column())
+ .get_data();
auto& nested_col = nullable_col->get_nested_column();
if (_opposite) {
- return _base_evaluate<true, true>(&nested_col, &null_bitmap,
sel, size);
+ return _base_evaluate<true, true>(&nested_col, &null_map, sel,
size);
} else {
- return _base_evaluate<true, false>(&nested_col, &null_bitmap,
sel, size);
+ return _base_evaluate<true, false>(&nested_col, &null_map,
sel, size);
}
} else {
if (_opposite) {
@@ -328,20 +359,25 @@ public:
bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
if constexpr (PT == PredicateType::IN_LIST) {
- for (auto value : *_values) {
+ HybridSetBase::IteratorBase* iter = _values->begin();
+ while (iter->has_next()) {
if constexpr (std::is_same_v<T, StringRef>) {
- if (bf->test_bytes(value.data, value.size)) {
+ const StringRef* value = (const
StringRef*)iter->get_value();
+ if (bf->test_bytes(value->data, value->size)) {
return true;
}
} else if constexpr (Type == TYPE_DATE) {
- if (bf->test_bytes(reinterpret_cast<char*>(&value),
sizeof(uint24_t))) {
+ const void* value = iter->get_value();
+ if (bf->test_bytes(reinterpret_cast<const char*>(value),
sizeof(uint24_t))) {
return true;
}
} else {
- if (bf->test_bytes(reinterpret_cast<char*>(&value),
sizeof(value))) {
+ const T* value = (const T*)(iter->get_value());
+ if (bf->test_bytes(reinterpret_cast<const char*>(value),
sizeof(*value))) {
return true;
}
}
+ iter->next();
}
return false;
} else {
@@ -355,13 +391,11 @@ public:
private:
template <typename LeftT, typename RightT>
bool _operator(const LeftT& lhs, const RightT& rhs) const {
- if constexpr (Type == TYPE_BOOLEAN) {
- DCHECK(_values->size() == 2);
- return PT == PredicateType::IN_LIST;
- } else if constexpr (PT == PredicateType::IN_LIST) {
+ if constexpr (PT == PredicateType::IN_LIST) {
return lhs != rhs;
+ } else {
+ return lhs == rhs;
}
- return lhs == rhs;
}
template <bool is_nullable, bool is_opposite>
@@ -379,7 +413,7 @@ private:
DCHECK((segid.first.hi | segid.first.mi | segid.first.lo) !=
0);
auto& value_in_dict_flags =
_segment_id_to_value_in_dict_flags[segid];
if (value_in_dict_flags.empty()) {
- nested_col_ptr->find_codes(*_values, value_in_dict_flags);
+ nested_col_ptr->find_codes(_values, value_in_dict_flags);
}
CHECK(value_in_dict_flags.size() ==
nested_col_ptr->dict_size())
@@ -429,19 +463,18 @@ private:
}
if constexpr (!is_opposite) {
- if (_operator(_values->find(reinterpret_cast<const
T&>(data_array[idx])),
- _values->end())) {
+ if (_operator(_values->find(reinterpret_cast<const
T*>(&data_array[idx])),
+ false)) {
sel[new_size++] = idx;
}
} else {
- if (!_operator(_values->find(reinterpret_cast<const
T&>(data_array[idx])),
- _values->end())) {
+ if (!_operator(_values->find(reinterpret_cast<const
T*>(&data_array[idx])),
+ false)) {
sel[new_size++] = idx;
}
}
}
}
-
return new_size;
}
@@ -457,7 +490,7 @@ private:
auto& value_in_dict_flags =
_segment_id_to_value_in_dict_flags[column->get_rowset_segment_id()];
if (value_in_dict_flags.empty()) {
- nested_col_ptr->find_codes(*_values, value_in_dict_flags);
+ nested_col_ptr->find_codes(_values, value_in_dict_flags);
}
for (uint16_t i = 0; i < size; i++) {
@@ -509,14 +542,14 @@ private:
if constexpr (!is_opposite) {
if (is_and ^
- _operator(_values->find(reinterpret_cast<const
T&>(data_array[idx])),
- _values->end())) {
+ _operator(_values->find(reinterpret_cast<const
T*>(&data_array[idx])),
+ false)) {
flags[i] = !is_and;
}
} else {
if (is_and ^
- !_operator(_values->find(reinterpret_cast<const
T&>(data_array[idx])),
- _values->end())) {
+ !_operator(_values->find(reinterpret_cast<const
T*>(&data_array[idx])),
+ false)) {
flags[i] = !is_and;
}
}
@@ -539,7 +572,7 @@ private:
}
}
- phmap::flat_hash_set<T>* _values;
+ HybridSetType* _values;
mutable std::map<std::pair<RowsetId, uint32_t>,
std::vector<vectorized::UInt8>>
_segment_id_to_value_in_dict_flags;
T _min_value;
@@ -549,4 +582,128 @@ private:
std::list<std::string> _temp_datas;
};
+template <PrimitiveType Type, PredicateType PT, typename ConditionType,
typename ConvertFunc,
+ size_t N = 0>
+ColumnPredicate* _create_in_list_predicate(uint32_t column_id, const
ConditionType& conditions,
+ const ConvertFunc& convert, bool
is_opposite = false,
+ const TabletColumn* col = nullptr,
+ MemPool* pool = nullptr) {
+ using T = typename PredicatePrimitiveTypeTraits<Type>::PredicateFieldType;
+ if constexpr (N >= 1 && N <= 12) {
+ using Set = std::conditional_t<
+ std::is_same_v<T, StringRef>,
StringSet<FixedContainer<std::string, N>>,
+ HybridSet<Type, FixedContainer<T, N>,
+
vectorized::PredicateColumnType<PredicateEvaluateType<Type>>>>;
+ return new InListPredicateBase<Type, PT, Set>(column_id, conditions,
convert, is_opposite,
+ col, pool);
+ } else {
+ using Set = std::conditional_t<
+ std::is_same_v<T, StringRef>,
StringSet<DynamicContainer<std::string>>,
+ HybridSet<Type, DynamicContainer<T>,
+
vectorized::PredicateColumnType<PredicateEvaluateType<Type>>>>;
+ return new InListPredicateBase<Type, PT, Set>(column_id, conditions,
convert, is_opposite,
+ col, pool);
+ }
+}
+
+template <PrimitiveType Type, PredicateType PT, typename ConditionType,
typename ConvertFunc>
+ColumnPredicate* create_in_list_predicate(uint32_t column_id, const
ConditionType& conditions,
+ const ConvertFunc& convert, bool
is_opposite = false,
+ const TabletColumn* col = nullptr,
+ MemPool* pool = nullptr) {
+ if (conditions.size() == 1) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
1>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 2) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
2>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 3) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
3>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 4) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
4>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 5) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
5>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 6) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
6>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 7) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
7>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 8) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
8>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 9) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
9>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 10) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
10>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 11) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
11>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else if (conditions.size() == 12) {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
12>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ } else {
+ return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc>(
+ column_id, conditions, convert, is_opposite, col, pool);
+ }
+}
+
+template <PrimitiveType Type, PredicateType PT, size_t N = 0>
+ColumnPredicate* _create_in_list_predicate(uint32_t column_id,
+ const
std::shared_ptr<HybridSetBase>& hybrid_set,
+ size_t char_length = 0) {
+ using T = typename PredicatePrimitiveTypeTraits<Type>::PredicateFieldType;
+ if constexpr (N >= 1 && N <= 12) {
+ using Set = std::conditional_t<
+ std::is_same_v<T, StringRef>,
StringSet<FixedContainer<std::string, N>>,
+ HybridSet<Type, FixedContainer<T, N>,
+
vectorized::PredicateColumnType<PredicateEvaluateType<Type>>>>;
+ return new InListPredicateBase<Type, PT, Set>(column_id, hybrid_set,
char_length);
+ } else {
+ using Set = std::conditional_t<
+ std::is_same_v<T, StringRef>,
StringSet<DynamicContainer<std::string>>,
+ HybridSet<Type, DynamicContainer<T>,
+
vectorized::PredicateColumnType<PredicateEvaluateType<Type>>>>;
+ return new InListPredicateBase<Type, PT, Set>(column_id, hybrid_set,
char_length);
+ }
+}
+
+template <PrimitiveType Type, PredicateType PT>
+ColumnPredicate* create_in_list_predicate(uint32_t column_id,
+ const
std::shared_ptr<HybridSetBase>& hybrid_set,
+ size_t char_length = 0) {
+ if (hybrid_set->size() == 1) {
+ return _create_in_list_predicate<Type, PT, 1>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 2) {
+ return _create_in_list_predicate<Type, PT, 2>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 3) {
+ return _create_in_list_predicate<Type, PT, 3>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 4) {
+ return _create_in_list_predicate<Type, PT, 4>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 5) {
+ return _create_in_list_predicate<Type, PT, 5>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 6) {
+ return _create_in_list_predicate<Type, PT, 6>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 7) {
+ return _create_in_list_predicate<Type, PT, 7>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 8) {
+ return _create_in_list_predicate<Type, PT, 8>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 9) {
+ return _create_in_list_predicate<Type, PT, 9>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 10) {
+ return _create_in_list_predicate<Type, PT, 10>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 11) {
+ return _create_in_list_predicate<Type, PT, 11>(column_id, hybrid_set,
char_length);
+ } else if (hybrid_set->size() == 12) {
+ return _create_in_list_predicate<Type, PT, 12>(column_id, hybrid_set,
char_length);
+ } else {
+ return _create_in_list_predicate<Type, PT>(column_id, hybrid_set,
char_length);
+ }
+}
+
} //namespace doris
diff --git a/be/src/olap/predicate_creator.h b/be/src/olap/predicate_creator.h
index 754a1ad503..251dc2680a 100644
--- a/be/src/olap/predicate_creator.h
+++ b/be/src/olap/predicate_creator.h
@@ -53,7 +53,8 @@ public:
ColumnPredicate* create(const TabletColumn& column, int index, const
ConditionType& conditions,
bool opposite, MemPool* pool) override {
if constexpr (PredicateTypeTraits::is_list(PT)) {
- return new InListPredicateBase<Type, PT>(index, conditions,
convert, opposite);
+ return create_in_list_predicate<Type, PT, ConditionType,
decltype(convert)>(
+ index, conditions, convert, opposite);
} else {
static_assert(PredicateTypeTraits::is_comparison(PT));
return new ComparisonPredicateBase<Type, PT>(index,
convert(conditions), opposite);
@@ -82,7 +83,8 @@ public:
ColumnPredicate* create(const TabletColumn& column, int index, const
ConditionType& conditions,
bool opposite, MemPool* pool) override {
if constexpr (PredicateTypeTraits::is_list(PT)) {
- return new InListPredicateBase<Type, PT>(index, conditions,
convert, opposite, &column);
+ return create_in_list_predicate<Type, PT, ConditionType,
decltype(convert)>(
+ index, conditions, convert, opposite, &column);
} else {
static_assert(PredicateTypeTraits::is_comparison(PT));
return new ComparisonPredicateBase<Type, PT>(index,
convert(column, conditions),
@@ -105,8 +107,8 @@ public:
ColumnPredicate* create(const TabletColumn& column, int index, const
ConditionType& conditions,
bool opposite, MemPool* pool) override {
if constexpr (PredicateTypeTraits::is_list(PT)) {
- return new InListPredicateBase<Type, PT>(index, conditions,
convert, opposite, &column,
- pool);
+ return create_in_list_predicate<Type, PT, ConditionType,
decltype(convert)>(
+ index, conditions, convert, opposite, &column, pool);
} else {
static_assert(PredicateTypeTraits::is_comparison(PT));
return new ComparisonPredicateBase<Type, PT>(index,
convert(column, conditions, pool),
@@ -140,7 +142,8 @@ public:
ColumnPredicate* create(const TabletColumn& column, int index, const
ConditionType& conditions,
bool opposite, MemPool* pool) override {
if constexpr (PredicateTypeTraits::is_list(PT)) {
- return new InListPredicateBase<Type, PT>(index, conditions,
_convert, opposite);
+ return create_in_list_predicate<Type, PT, ConditionType,
decltype(_convert)>(
+ index, conditions, _convert, opposite);
} else {
static_assert(PredicateTypeTraits::is_comparison(PT));
return new ComparisonPredicateBase<Type, PT>(index,
_convert(conditions), opposite);
diff --git a/be/src/runtime/primitive_type.h b/be/src/runtime/primitive_type.h
index aacb87a8f9..e125a4b4a2 100644
--- a/be/src/runtime/primitive_type.h
+++ b/be/src/runtime/primitive_type.h
@@ -268,16 +268,19 @@ struct PredicatePrimitiveTypeTraits<TYPE_DATETIMEV2> {
template <PrimitiveType type>
struct VecPrimitiveTypeTraits {
using CppType = typename PrimitiveTypeTraits<type>::CppType;
+ using ColumnType = typename PrimitiveTypeTraits<type>::ColumnType;
};
template <>
struct VecPrimitiveTypeTraits<TYPE_DATE> {
using CppType = vectorized::VecDateTimeValue;
+ using ColumnType = vectorized::ColumnVector<vectorized::DateTime>;
};
template <>
struct VecPrimitiveTypeTraits<TYPE_DATETIME> {
using CppType = vectorized::VecDateTimeValue;
+ using ColumnType = vectorized::ColumnVector<vectorized::DateTime>;
};
} // namespace doris
diff --git a/be/src/vec/columns/column_dictionary.h
b/be/src/vec/columns/column_dictionary.h
index 9e1c25e69f..29fd33f5b1 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -289,8 +289,8 @@ public:
uint32_t get_hash_value(uint32_t idx) const { return
_dict.get_hash_value(_codes[idx], _type); }
- void find_codes(const phmap::flat_hash_set<StringRef>& values,
- std::vector<vectorized::UInt8>& selected) const {
+ template <typename HybridSetType>
+ void find_codes(const HybridSetType* values,
std::vector<vectorized::UInt8>& selected) const {
return _dict.find_codes(values, selected);
}
@@ -423,13 +423,14 @@ public:
return greater ? bound - greater + eq : bound - eq;
}
- void find_codes(const phmap::flat_hash_set<StringRef>& values,
+ template <typename HybridSetType>
+ void find_codes(const HybridSetType* values,
std::vector<vectorized::UInt8>& selected) const {
size_t dict_word_num = _dict_data->size();
selected.resize(dict_word_num);
selected.assign(dict_word_num, false);
for (size_t i = 0; i < _dict_data->size(); i++) {
- if (values.find((*_dict_data)[i]) != values.end()) {
+ if (values->find(&((*_dict_data)[i]))) {
selected[i] = true;
}
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index 1b1140cc5e..a25cce7171 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -157,7 +157,6 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id,
}
}
}
-
return true;
}
// This function is copied from
@@ -844,7 +843,8 @@ Status
RowGroupReader::_rewrite_dict_conjuncts(std::vector<int32_t>& dict_codes,
node.__set_is_nullable(false);
root = _obj_pool->add(new vectorized::VDirectInPredicate(node));
- std::shared_ptr<HybridSetBase>
hybrid_set(create_set(PrimitiveType::TYPE_INT));
+ std::shared_ptr<HybridSetBase> hybrid_set(
+ create_set(PrimitiveType::TYPE_INT, dict_codes.size()));
for (int j = 0; j < dict_codes.size(); ++j) {
hybrid_set->insert(&dict_codes[j]);
}
diff --git a/be/src/vec/exprs/vdirect_in_predicate.h
b/be/src/vec/exprs/vdirect_in_predicate.h
index b717f00d18..b9321a5c2e 100644
--- a/be/src/vec/exprs/vdirect_in_predicate.h
+++ b/be/src/vec/exprs/vdirect_in_predicate.h
@@ -43,30 +43,14 @@ public:
size_t sz = argument_column->size();
res_data_column->resize(sz);
- auto ptr =
((ColumnVector<UInt8>*)res_data_column.get())->get_data().data();
- auto type =
WhichDataType(remove_nullable(block->get_by_position(arguments[0]).type));
- if (type.is_string_or_fixed_string()) {
- for (size_t i = 0; i < sz; i++) {
- auto ele = argument_column->get_data_at(i);
- StringRef v(ele.data, ele.size);
- ptr[i] = _filter->find(reinterpret_cast<const void*>(&v));
- }
- } else if (type.is_int_or_uint() || type.is_float()) {
- if (argument_column->is_nullable()) {
- auto column_nested = reinterpret_cast<const
ColumnNullable*>(argument_column.get())
- ->get_nested_column_ptr();
- auto column_nullmap = reinterpret_cast<const
ColumnNullable*>(argument_column.get())
- ->get_null_map_column_ptr();
- _filter->find_fixed_len(column_nested->get_raw_data().data,
-
(uint8*)column_nullmap->get_raw_data().data, sz, ptr);
- } else {
- _filter->find_fixed_len(argument_column->get_raw_data().data,
nullptr, sz, ptr);
- }
+ if (argument_column->is_nullable()) {
+ auto column_nested = static_cast<const
ColumnNullable*>(argument_column.get())
+ ->get_nested_column_ptr();
+ auto& null_map =
+ static_cast<const
ColumnNullable*>(argument_column.get())->get_null_map_data();
+ _filter->find_batch_nullable(*column_nested, sz, null_map,
res_data_column->get_data());
} else {
- for (size_t i = 0; i < sz; i++) {
- ptr[i] = _filter->find(
- reinterpret_cast<const
void*>(argument_column->get_data_at(i).data));
- }
+ _filter->find_batch(*argument_column, sz,
res_data_column->get_data());
}
DCHECK(!_data_type->is_nullable());
diff --git a/be/src/vec/functions/in.h b/be/src/vec/functions/in.h
index 02b2a7cb62..af7ff2b8a5 100644
--- a/be/src/vec/functions/in.h
+++ b/be/src/vec/functions/in.h
@@ -68,16 +68,18 @@ public:
}
std::shared_ptr<InState> state = std::make_shared<InState>();
context->set_function_state(scope, state);
+ DCHECK(context->get_num_args() >= 1);
if (context->get_arg_type(0)->type == doris::PrimitiveType::TYPE_CHAR
||
context->get_arg_type(0)->type ==
doris::PrimitiveType::TYPE_VARCHAR ||
context->get_arg_type(0)->type ==
doris::PrimitiveType::TYPE_STRING) {
// the StringValue's memory is held by FunctionContext, so we can
use StringValueSet here directly
- state->hybrid_set.reset(new StringValueSet());
+
state->hybrid_set.reset(create_string_value_set((size_t)(context->get_num_args()
- 1)));
+
} else {
-
state->hybrid_set.reset(create_set(context->get_arg_type(0)->type));
+ state->hybrid_set.reset(create_set(context->get_arg_type(0)->type,
+
(size_t)(context->get_num_args() - 1)));
}
- DCHECK(context->get_num_args() >= 1);
for (int i = 1; i < context->get_num_args(); ++i) {
const auto& const_column_ptr = context->get_constant_col(i);
if (const_column_ptr != nullptr) {
@@ -119,19 +121,17 @@ public:
if (materialized_column->is_nullable()) {
auto* null_col_ptr =
vectorized::check_and_get_column<vectorized::ColumnNullable>(
materialized_column);
- auto& null_bitmap = reinterpret_cast<const
vectorized::ColumnUInt8&>(
-
null_col_ptr->get_null_map_column())
- .get_data();
+ auto& null_map = reinterpret_cast<const
vectorized::ColumnUInt8&>(
+ null_col_ptr->get_null_map_column())
+ .get_data();
auto* nested_col_ptr =
null_col_ptr->get_nested_column_ptr().get();
auto search_hash_set = [&](auto* col_ptr) {
- for (size_t i = 0; i < input_rows_count; ++i) {
- const auto& ref_data = col_ptr->get_data_at(i);
- vec_res[i] =
- !null_bitmap[i] &&
-
in_state->hybrid_set->find((void*)ref_data.data, ref_data.size);
- if constexpr (negative) {
- vec_res[i] = !vec_res[i];
- }
+ if constexpr (!negative) {
+ in_state->hybrid_set->find_batch_nullable(*col_ptr,
input_rows_count,
+ null_map,
vec_res);
+ } else {
+ in_state->hybrid_set->find_batch_nullable_negative(
+ *col_ptr, input_rows_count, null_map, vec_res);
}
};
@@ -146,24 +146,22 @@ public:
if (!in_state->null_in_set) {
for (size_t i = 0; i < input_rows_count; ++i) {
- vec_null_map_to[i] = null_bitmap[i];
+ vec_null_map_to[i] = null_map[i];
}
} else {
for (size_t i = 0; i < input_rows_count; ++i) {
- vec_null_map_to[i] = null_bitmap[i] || negative ==
vec_res[i];
+ vec_null_map_to[i] = null_map[i] || negative ==
vec_res[i];
}
}
} else { // non-nullable
auto search_hash_set = [&](auto* col_ptr) {
- for (size_t i = 0; i < input_rows_count; ++i) {
- const auto& ref_data = col_ptr->get_data_at(i);
- vec_res[i] =
-
in_state->hybrid_set->find((void*)ref_data.data, ref_data.size);
- if constexpr (negative) {
- vec_res[i] = !vec_res[i];
- }
+ if constexpr (!negative) {
+ in_state->hybrid_set->find_batch(*col_ptr,
input_rows_count, vec_res);
+ } else {
+ in_state->hybrid_set->find_batch_negative(*col_ptr,
input_rows_count,
+ vec_res);
}
};
@@ -196,7 +194,7 @@ public:
}
std::unique_ptr<HybridSetBase> hybrid_set(
- create_set(context->get_arg_type(0)->type));
+ create_set(context->get_arg_type(0)->type,
set_columns.size()));
bool null_in_set = false;
for (const auto& set_column : set_columns) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]