This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 899acb6564 [improvement][agg]import sub hashmap (#10937)
899acb6564 is described below
commit 899acb6564f58ed0bac4732fbbb3575b5d9c215b
Author: Jerry Hu <[email protected]>
AuthorDate: Mon Jul 18 18:36:45 2022 +0800
[improvement][agg]import sub hashmap (#10937)
---
be/src/vec/common/columns_hashing.h | 1 +
be/src/vec/common/hash_table/hash_table.h | 63 +++
be/src/vec/common/hash_table/string_hash_map.h | 211 ++++++++
be/src/vec/common/hash_table/string_hash_table.h | 612 +++++++++++++++++++++++
be/src/vec/exec/vaggregation_node.cpp | 6 +
be/src/vec/exec/vaggregation_node.h | 54 +-
6 files changed, 946 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/common/columns_hashing.h
b/be/src/vec/common/columns_hashing.h
index 9e4ed78f4e..580588c480 100644
--- a/be/src/vec/common/columns_hashing.h
+++ b/be/src/vec/common/columns_hashing.h
@@ -29,6 +29,7 @@
#include "vec/common/hash_table/hash_table.h"
#include "vec/common/hash_table/hash_table_key_holder.h"
#include "vec/common/hash_table/ph_hash_map.h"
+#include "vec/common/hash_table/string_hash_map.h"
#include "vec/common/unaligned.h"
namespace doris::vectorized {
diff --git a/be/src/vec/common/hash_table/hash_table.h
b/be/src/vec/common/hash_table/hash_table.h
index 05b7af938c..ce6407f072 100644
--- a/be/src/vec/common/hash_table/hash_table.h
+++ b/be/src/vec/common/hash_table/hash_table.h
@@ -300,6 +300,66 @@ struct HashTableGrower {
}
};
+/** Determines the size of the hash table, and when and how much it should be
resized.
+ * This structure is aligned to cache line boundary and also occupies it all.
+ * Precalculates some values to speed up lookups and insertion into the
HashTable (and thus has bigger memory footprint than HashTableGrower).
+ */
+template <size_t initial_size_degree = 8>
+class alignas(64) HashTableGrowerWithPrecalculation {
+ /// The state of this structure is enough to get the buffer size of the
hash table.
+
+ doris::vectorized::UInt8 size_degree_ = initial_size_degree;
+ size_t precalculated_mask = (1ULL << initial_size_degree) - 1;
+ size_t precalculated_max_fill = 1ULL << (initial_size_degree - 1);
+
+public:
+ doris::vectorized::UInt8 size_degree() const { return size_degree_; }
+
+ void increase_size_degree(doris::vectorized::UInt8 delta) {
+ size_degree_ += delta;
+ precalculated_mask = (1ULL << size_degree_) - 1;
+ precalculated_max_fill = 1ULL << (size_degree_ - 1);
+ }
+
+ static constexpr auto initial_count = 1ULL << initial_size_degree;
+
+ /// If collision resolution chains are contiguous, we can implement erase
operation by moving the elements.
+ static constexpr auto performs_linear_probing_with_single_step = true;
+
+ /// The size of the hash table in the cells.
+ size_t buf_size() const { return 1ULL << size_degree_; }
+
+ /// From the hash value, get the cell number in the hash table.
+ size_t place(size_t x) const { return x & precalculated_mask; }
+
+ /// The next cell in the collision resolution chain.
+ size_t next(size_t pos) const { return (pos + 1) & precalculated_mask; }
+
+ /// Whether the hash table is sufficiently full. You need to increase the
size of the hash table, or remove something unnecessary from it.
+ bool overflow(size_t elems) const { return elems > precalculated_max_fill;
}
+
+ /// Increase the size of the hash table.
+ void increase_size() { increase_size_degree(size_degree_ >= 23 ? 1 : 2); }
+
+ /// Set the buffer size by the number of elements in the hash table. Used
when deserializing a hash table.
+ void set(size_t num_elems) {
+ size_degree_ =
+ num_elems <= 1
+ ? initial_size_degree
+ : ((initial_size_degree >
static_cast<size_t>(log2(num_elems - 1)) + 2)
+ ? initial_size_degree
+ : (static_cast<size_t>(log2(num_elems - 1))
+ 2));
+ increase_size_degree(0);
+ }
+
+ void set_buf_size(size_t buf_size_) {
+ size_degree_ = static_cast<size_t>(log2(buf_size_ - 1) + 1);
+ increase_size_degree(0);
+ }
+};
+
+static_assert(sizeof(HashTableGrowerWithPrecalculation<>) == 64);
+
/** When used as a Grower, it turns a hash table into something like a lookup
table.
* It remains non-optimal - the cells store the keys.
* Also, the compiler can not completely remove the code of passing through
the collision resolution chain, although it is not needed.
@@ -375,6 +435,9 @@ protected:
template <typename, typename, typename, typename, typename, typename,
size_t>
friend class TwoLevelHashTable;
+ template <typename SubMaps>
+ friend class StringHashTable;
+
using HashValue = size_t;
using Self = HashTable;
using cell_type = Cell;
diff --git a/be/src/vec/common/hash_table/string_hash_map.h
b/be/src/vec/common/hash_table/string_hash_map.h
new file mode 100644
index 0000000000..4e2ca786bb
--- /dev/null
+++ b/be/src/vec/common/hash_table/string_hash_map.h
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+//
https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/StringHashMap.h
+// and modified by Doris
+
+#pragma once
+
+#include "vec/common/hash_table/hash_map.h"
+#include "vec/common/hash_table/string_hash_table.h"
+
/// Cell of StringHashMap: (key, mapped) pair hashed with StringHashTableHash.
/// The primary template serves the unspecialized fixed-size key types (in this
/// file that is StringKey8, see StringHashMapSubMaps::T1).
template <typename Key, typename TMapped>
struct StringHashMapCell : public HashMapCell<Key, TMapped, StringHashTableHash, HashTableNoState> {
    using Base = HashMapCell<Key, TMapped, StringHashTableHash, HashTableNoState>;
    using value_type = typename Base::value_type;
    using Base::Base;
    /// Zero keys live in the dedicated m0 submap (StringHashTableEmpty), so the
    /// regular cells never need separate zero-value storage.
    static constexpr bool need_zero_value_storage = false;
    // external: reconstruct the original string view from the packed key
    const StringRef get_key() const { return to_string_ref(this->value.first); } /// NOLINT
    // internal: raw packed key
    static const Key& get_key(const value_type& value_) { return value_.first; }
};
+
/// Cell for 9..16 byte keys packed into a UInt128 (see StringHashTable::dispatch).
template <typename TMapped>
struct StringHashMapCell<StringKey16, TMapped>
        : public HashMapCell<StringKey16, TMapped, StringHashTableHash, HashTableNoState> {
    using Base = HashMapCell<StringKey16, TMapped, StringHashTableHash, HashTableNoState>;
    using value_type = typename Base::value_type;
    using Base::Base;
    static constexpr bool need_zero_value_storage = false;
    bool is_zero(const HashTableNoState& state) const { return is_zero(this->value.first, state); }

    // Zero means unoccupied cells in hash table. Use key with last word = 0 as
    // zero keys, because such keys are unrepresentable (no way to encode length).
    static bool is_zero(const StringKey16& key, const HashTableNoState&) { return key.high == 0; }
    void set_zero() { this->value.first.high = 0; }

    // external: reconstruct the original string view from the packed key
    const StringRef get_key() const { return to_string_ref(this->value.first); } /// NOLINT
    // internal: raw packed key
    static const StringKey16& get_key(const value_type& value_) { return value_.first; }
};
+
/// Cell for 17..24 byte keys packed into StringKey24 (see StringHashTable::dispatch).
template <typename TMapped>
struct StringHashMapCell<StringKey24, TMapped>
        : public HashMapCell<StringKey24, TMapped, StringHashTableHash, HashTableNoState> {
    using Base = HashMapCell<StringKey24, TMapped, StringHashTableHash, HashTableNoState>;
    using value_type = typename Base::value_type;
    using Base::Base;
    static constexpr bool need_zero_value_storage = false;
    bool is_zero(const HashTableNoState& state) const { return is_zero(this->value.first, state); }

    // Zero means unoccupied cells in hash table. Use key with last word = 0 as
    // zero keys, because such keys are unrepresentable (no way to encode length).
    static bool is_zero(const StringKey24& key, const HashTableNoState&) { return key.c == 0; }
    void set_zero() { this->value.first.c = 0; }

    // external: reconstruct the original string view from the packed key
    const StringRef get_key() const { return to_string_ref(this->value.first); } /// NOLINT
    // internal: raw packed key
    static const StringKey24& get_key(const value_type& value_) { return value_.first; }
};
+
/// Cell for long keys, stored as StringRef with the hash saved alongside
/// (HashMapCellWithSavedHash) to avoid re-hashing on rehash/compare.
template <typename TMapped>
struct StringHashMapCell<StringRef, TMapped>
        : public HashMapCellWithSavedHash<StringRef, TMapped, StringHashTableHash,
                                          HashTableNoState> {
    using Base =
            HashMapCellWithSavedHash<StringRef, TMapped, StringHashTableHash, HashTableNoState>;
    using value_type = typename Base::value_type;
    using Base::Base;
    static constexpr bool need_zero_value_storage = false;
    // external
    using Base::get_key;
    // internal
    static const StringRef& get_key(const value_type& value_) { return value_.first; }

    /// Converting constructor from a fixed-size-key cell (used when a cell from
    /// another submap is materialized as the generic cell type, e.g. by the
    /// StringHashTable iterator). NOTE: other.get_key() builds a StringRef that
    /// points into `other`'s own storage — the source cell must stay alive as
    /// long as this copy's key is read.
    template <typename Key>
    StringHashMapCell(const StringHashMapCell<Key, TMapped>& other) {
        assert(need_zero_value_storage == other.need_zero_value_storage);
        this->value.first = other.get_key();
        this->value.second = other.get_second();
    }

    /// Converting assignment; same aliasing caveat as the converting constructor.
    template <typename Key>
    StringHashMapCell& operator=(const StringHashMapCell<Key, TMapped>& other) {
        assert(need_zero_value_storage == other.need_zero_value_storage);
        this->value.first = other.get_key();
        this->value.second = other.get_second();
        return *this;
    }
};
+
/// Bundles the concrete submap types used by StringHashMap:
///   T0       - holds only the empty-string entry (StringHashTableEmpty),
///   T1/T2/T3 - maps keyed by 8 / 16 / 24-byte packed fixed-size keys,
///   Ts       - generic fallback map keyed by StringRef (with saved hash).
template <typename TMapped, typename Allocator>
struct StringHashMapSubMaps {
    using T0 = StringHashTableEmpty<StringHashMapCell<StringRef, TMapped>>;
    using T1 = HashMapTable<StringKey8, StringHashMapCell<StringKey8, TMapped>, StringHashTableHash,
                            StringHashTableGrower<>, Allocator>;
    using T2 = HashMapTable<StringKey16, StringHashMapCell<StringKey16, TMapped>,
                            StringHashTableHash, StringHashTableGrower<>, Allocator>;
    using T3 = HashMapTable<StringKey24, StringHashMapCell<StringKey24, TMapped>,
                            StringHashTableHash, StringHashTableGrower<>, Allocator>;
    using Ts = HashMapTable<StringRef, StringHashMapCell<StringRef, TMapped>, StringHashTableHash,
                            StringHashTableGrower<>, Allocator>;
};
+
+template <typename TMapped, typename Allocator = HashTableAllocator>
+class StringHashMap : public StringHashTable<StringHashMapSubMaps<TMapped,
Allocator>> {
+public:
+ using Key = StringRef;
+ using Base = StringHashTable<StringHashMapSubMaps<TMapped, Allocator>>;
+ using Self = StringHashMap;
+ using LookupResult = typename Base::LookupResult;
+
+ using Base::Base;
+
+ /// Merge every cell's value of current map into the destination map.
+ /// Func should have signature void(Mapped & dst, Mapped & src, bool
emplaced).
+ /// Each filled cell in current map will invoke func once. If that map
doesn't
+ /// have a key equals to the given cell, a new cell gets emplaced into
that map,
+ /// and func is invoked with the third argument emplaced set to true.
Otherwise
+ /// emplaced is set to false.
+ template <typename Func>
+ void ALWAYS_INLINE merge_to_via_emplace(Self& that, Func&& func) {
+ if (this->m0.hasZero() && that.m0.hasZero())
+ func(that.m0.zero_value()->get_mapped(),
this->m0.zero_value()->get_mapped(), false);
+ else if (this->m0.hasZero()) {
+ that.m0.setHasZero();
+ func(that.m0.zero_value()->get_mapped(),
this->m0.zero_value()->get_mapped(), true);
+ }
+ this->m1.merge_to_via_emplace(that.m1, func);
+ this->m2.merge_to_via_emplace(that.m2, func);
+ this->m3.merge_to_via_emplace(that.m3, func);
+ this->ms.merge_to_via_emplace(that.ms, func);
+ }
+
+ /// Merge every cell's value of current map into the destination map via
find.
+ /// Func should have signature void(Mapped & dst, Mapped & src, bool
exist).
+ /// Each filled cell in current map will invoke func once. If that map
doesn't
+ /// have a key equals to the given cell, func is invoked with the third
argument
+ /// exist set to false. Otherwise exist is set to true.
+ template <typename Func>
+ void ALWAYS_INLINE merge_to_via_find(Self& that, Func&& func) {
+ if (this->m0.size() && that.m0.size())
+ func(that.m0.zero_value()->get_mapped(),
this->m0.zero_value()->get_mapped(), true);
+ else if (this->m0.size())
+ func(this->m0.zero_value()->get_mapped(),
this->m0.zero_value()->get_mapped(), false);
+ this->m1.merge_to_via_find(that.m1, func);
+ this->m2.merge_to_via_find(that.m2, func);
+ this->m3.merge_to_via_find(that.m3, func);
+ this->ms.merge_to_via_find(that.ms, func);
+ }
+
+ TMapped& ALWAYS_INLINE operator[](const Key& x) {
+ LookupResult it;
+ bool inserted;
+ this->emplace(x, it, inserted);
+ if (inserted) new (&it->get_mapped()) TMapped();
+
+ return it->get_mapped();
+ }
+
+ template <typename Func>
+ void ALWAYS_INLINE for_each_value(Func&& func) {
+ if (this->m0.size()) {
+ func(StringRef {}, this->m0.zero_value()->get_second());
+ }
+
+ for (auto& v : this->m1) {
+ func(v.get_key(), v.get_second());
+ }
+
+ for (auto& v : this->m2) {
+ func(v.get_key(), v.get_second());
+ }
+
+ for (auto& v : this->m3) {
+ func(v.get_key(), v.get_second());
+ }
+
+ for (auto& v : this->ms) {
+ func(v.get_key(), v.get_second());
+ }
+ }
+
+ template <typename Func>
+ void ALWAYS_INLINE for_each_mapped(Func&& func) {
+ if (this->m0.size()) func(this->m0.zero_value()->get_second());
+ for (auto& v : this->m1) func(v.get_second());
+ for (auto& v : this->m2) func(v.get_second());
+ for (auto& v : this->m3) func(v.get_second());
+ for (auto& v : this->ms) func(v.get_second());
+ }
+
+ char* get_null_key_data() { return nullptr; }
+ bool has_null_key_data() const { return false; }
+};
diff --git a/be/src/vec/common/hash_table/string_hash_table.h
b/be/src/vec/common/hash_table/string_hash_table.h
new file mode 100644
index 0000000000..bffd0f1e80
--- /dev/null
+++ b/be/src/vec/common/hash_table/string_hash_table.h
@@ -0,0 +1,612 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+//
https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/StringHashTable.h
+// and modified by Doris
+
+#pragma once
+
+#include <new>
+#include <variant>
+
+#include "vec/common/hash_table/hash.h"
+
// Packed fixed-size keys for short strings: 1..8 bytes fit in a UInt64,
// 9..16 bytes in a UInt128, 17..24 bytes in StringKey24. The string length is
// implicit: unused high bytes of the last word are zero (see to_string_ref).
using StringKey8 = doris::vectorized::UInt64;
using StringKey16 = doris::vectorized::UInt128;
struct StringKey24 {
    doris::vectorized::UInt64 a;
    doris::vectorized::UInt64 b;
    doris::vectorized::UInt64 c;

    bool operator==(const StringKey24 rhs) const { return a == rhs.a && b == rhs.b && c == rhs.c; }
};
+
/// Recover a StringRef view from a packed fixed-size key. The length is derived
/// from the count of leading zero bytes in the last word (__builtin_clzll >> 3),
/// which is why a key whose last word is all-zero is unrepresentable and is
/// reserved to mark empty cells. NOTE: relies on little-endian byte order, as
/// does the packing in StringHashTable::dispatch.
inline StringRef ALWAYS_INLINE to_string_ref(const StringKey8& n) {
    assert(n != 0);
    return {reinterpret_cast<const char*>(&n), 8ul - (__builtin_clzll(n) >> 3)};
}
inline StringRef ALWAYS_INLINE to_string_ref(const StringKey16& n) {
    assert(n.high != 0);
    return {reinterpret_cast<const char*>(&n), 16ul - (__builtin_clzll(n.high) >> 3)};
}
inline StringRef ALWAYS_INLINE to_string_ref(const StringKey24& n) {
    assert(n.c != 0);
    return {reinterpret_cast<const char*>(&n), 24ul - (__builtin_clzll(n.c) >> 3)};
}
+
/// Hash functor covering every key flavor of the string hash table.
/// With SSE4.2 the fixed-size keys are hashed with the hardware CRC32
/// instruction; otherwise CityHash64 over the raw key bytes is used.
/// Generic StringRef keys always go through StringRefHash.
struct StringHashTableHash {
#if defined(__SSE4_2__)
    size_t ALWAYS_INLINE operator()(StringKey8 key) const {
        size_t res = -1ULL;
        res = _mm_crc32_u64(res, key);
        return res;
    }
    size_t ALWAYS_INLINE operator()(StringKey16 key) const {
        size_t res = -1ULL;
        res = _mm_crc32_u64(res, key.low);
        res = _mm_crc32_u64(res, key.high);
        return res;
    }
    size_t ALWAYS_INLINE operator()(StringKey24 key) const {
        size_t res = -1ULL;
        res = _mm_crc32_u64(res, key.a);
        res = _mm_crc32_u64(res, key.b);
        res = _mm_crc32_u64(res, key.c);
        return res;
    }
#else
    size_t ALWAYS_INLINE operator()(StringKey8 key) const {
        return util_hash::CityHash64(reinterpret_cast<const char*>(&key), 8);
    }
    size_t ALWAYS_INLINE operator()(StringKey16 key) const {
        return util_hash::CityHash64(reinterpret_cast<const char*>(&key), 16);
    }
    size_t ALWAYS_INLINE operator()(StringKey24 key) const {
        return util_hash::CityHash64(reinterpret_cast<const char*>(&key), 24);
    }
#endif
    size_t ALWAYS_INLINE operator()(StringRef key) const { return StringRefHash()(key); }
};
+
/// Degenerate "table" holding at most one element: the cell with the zero
/// (empty-string) key. StringHashTable routes empty keys here, so the real
/// submaps never need zero-key storage.
template <typename Cell>
struct StringHashTableEmpty //-V730
{
    using Self = StringHashTableEmpty;

    bool _has_zero = false;
    /// Raw storage for the single cell; constructed lazily via placement new.
    std::aligned_storage_t<sizeof(Cell), alignof(Cell)>
            zero_value_storage;

public:
    using LookupResult = Cell*;
    using ConstLookupResult = const Cell*;

    bool has_zero() const { return _has_zero; }

    /// Mark occupied and default-construct the cell in place.
    void set_has_zero() {
        _has_zero = true;
        new (zero_value()) Cell();
    }

    /// Mark occupied and copy-construct the cell in place.
    void set_has_zero(const Cell& other) {
        _has_zero = true;
        new (zero_value()) Cell(other);
    }

    void clear_has_zero() {
        _has_zero = false;
        if constexpr (!std::is_trivially_destructible_v<Cell>) {
            zero_value()->~Cell();
        }
    }

    Cell* zero_value() { return std::launder(reinterpret_cast<Cell*>(&zero_value_storage)); }
    const Cell* zero_value() const {
        return std::launder(reinterpret_cast<const Cell*>(&zero_value_storage));
    }

    /// The key is irrelevant: every insertion targets the single zero cell.
    template <typename KeyHolder>
    void ALWAYS_INLINE emplace(KeyHolder&&, LookupResult& it, bool& inserted, size_t = 0) {
        inserted = !_has_zero;
        if (inserted) {
            set_has_zero();
        }
        it = zero_value();
    }

    template <typename Key>
    LookupResult ALWAYS_INLINE find(const Key&, size_t = 0) {
        if (_has_zero) {
            return zero_value();
        }
        return nullptr;
    }

    template <typename Key>
    ConstLookupResult ALWAYS_INLINE find(const Key&, size_t = 0) const {
        if (_has_zero) {
            return zero_value();
        }
        return nullptr;
    }

    size_t size() const { return _has_zero ? 1 : 0; }
    bool empty() const { return !_has_zero; }
    size_t get_buffer_size_in_bytes() const { return sizeof(Cell); }
};
+
/// Grower for the string submaps: grows by a single power of two each time.
/// Note: this hides (does not override) the base class's non-virtual
/// increase_size(), so only direct template use of this type gets the behavior.
template <size_t initial_size_degree = 8>
struct StringHashTableGrower : public HashTableGrowerWithPrecalculation<initial_size_degree> {
    // Smooth growing for string maps
    void increase_size() { this->increase_size_degree(1); }
};
+
+template <typename Mapped>
+struct StringHashTableLookupResult {
+ Mapped* mapped_ptr;
+ StringHashTableLookupResult() : mapped_ptr(nullptr) {}
/// NOLINT
+ StringHashTableLookupResult(Mapped* mapped_ptr_) : mapped_ptr(mapped_ptr_)
{} /// NOLINT
+ StringHashTableLookupResult(std::nullptr_t) {}
/// NOLINT
+ const VoidKey getKey() const { return {}; }
/// NOLINT
+ auto& get_mapped() { return *mapped_ptr; }
+ auto& operator*() { return *this; }
+ auto& operator*() const { return *this; }
+ auto* operator->() { return this; }
+ auto* operator->() const { return this; }
+ explicit operator bool() const { return mapped_ptr; }
+ friend bool operator==(const StringHashTableLookupResult& a, const
std::nullptr_t&) {
+ return !a.mapped_ptr;
+ }
+ friend bool operator==(const std::nullptr_t&, const
StringHashTableLookupResult& b) {
+ return !b.mapped_ptr;
+ }
+ friend bool operator!=(const StringHashTableLookupResult& a, const
std::nullptr_t&) {
+ return a.mapped_ptr;
+ }
+ friend bool operator!=(const std::nullptr_t&, const
StringHashTableLookupResult& b) {
+ return b.mapped_ptr;
+ }
+};
+
/// Adapter so generic code can obtain a pointer to the mapped value from a
/// StringHashTable lookup result, mirroring the plain hash table API.
template <typename Mapped>
ALWAYS_INLINE inline auto lookup_result_get_mapped(StringHashTableLookupResult<Mapped*> cell) {
    return &cell.get_mapped();
}
+
+template <typename SubMaps>
+class StringHashTable : private boost::noncopyable {
+protected:
+ static constexpr size_t NUM_MAPS = 5;
+ // Map for storing empty string
+ using T0 = typename SubMaps::T0;
+
+ // Short strings are stored as numbers
+ using T1 = typename SubMaps::T1;
+ using T2 = typename SubMaps::T2;
+ using T3 = typename SubMaps::T3;
+
+ // Long strings are stored as StringRef along with saved hash
+ using Ts = typename SubMaps::Ts;
+ using Self = StringHashTable;
+
+ template <typename, typename, size_t>
+ friend class TwoLevelStringHashTable;
+
+ T0 m0;
+ T1 m1;
+ T2 m2;
+ T3 m3;
+ Ts ms;
+
+ using Cell = typename Ts::cell_type;
+
+ friend class const_iterator;
+ friend class iterator;
+
+ template <typename Derived, bool is_const>
+ class iterator_base {
+ using Container = std::conditional_t<is_const, const Self, Self>;
+
+ Container* container;
+ int sub_table_index;
+ typename T1::iterator iterator1;
+ typename T2::iterator iterator2;
+ typename T3::iterator iterator3;
+ typename Ts::iterator iterator4;
+
+ typename Ts::cell_type cell;
+
+ friend class StringHashTable;
+
+ public:
+ iterator_base() {}
+ iterator_base(Container* container_, bool end = false) :
container(container_) {
+ if (end) {
+ sub_table_index = 4;
+ iterator4 = container->ms.end();
+ } else {
+ sub_table_index = 0;
+ if (container->m0.size() == 0)
+ sub_table_index++;
+ else
+ return;
+
+ iterator1 = container->m1.begin();
+ if (iterator1 == container->m1.end())
+ sub_table_index++;
+ else
+ return;
+
+ iterator2 = container->m2.begin();
+ if (iterator2 == container->m2.end())
+ sub_table_index++;
+ else
+ return;
+
+ iterator3 = container->m3.begin();
+ if (iterator3 == container->m3.end())
+ sub_table_index++;
+ else
+ return;
+
+ iterator4 = container->ms.begin();
+ }
+ }
+
+ bool operator==(const iterator_base& rhs) const {
+ if (sub_table_index != rhs.sub_table_index) return false;
+ switch (sub_table_index) {
+ case 0: {
+ return true;
+ }
+ case 1: {
+ return iterator1 == rhs.iterator1;
+ }
+ case 2: {
+ return iterator2 == rhs.iterator2;
+ }
+ case 3: {
+ return iterator3 == rhs.iterator3;
+ }
+ case 4: {
+ return iterator4 == rhs.iterator4;
+ }
+ }
+ __builtin_unreachable();
+ }
+
+ bool operator!=(const iterator_base& rhs) const { return !(*this ==
rhs); }
+
+ Derived& operator++() {
+ bool need_switch_to_next = false;
+ switch (sub_table_index) {
+ case 0: {
+ need_switch_to_next = true;
+ break;
+ }
+ case 1: {
+ iterator1.template operator++();
+ if (iterator1 == container->m1.end()) {
+ need_switch_to_next = true;
+ }
+ break;
+ }
+ case 2: {
+ iterator2.template operator++();
+ if (iterator2 == container->m2.end()) {
+ need_switch_to_next = true;
+ }
+ break;
+ }
+ case 3: {
+ iterator3.template operator++();
+ if (iterator3 == container->m3.end()) {
+ need_switch_to_next = true;
+ }
+ break;
+ }
+ case 4: {
+ iterator4.template operator++();
+ break;
+ }
+ }
+
+ while (need_switch_to_next) {
+ sub_table_index++;
+ need_switch_to_next = false;
+ switch (sub_table_index) {
+ case 1: {
+ iterator1 = container->m1.begin();
+ if (iterator1 == container->m1.end()) {
+ need_switch_to_next = true;
+ }
+ break;
+ }
+ case 2: {
+ iterator2 = container->m2.begin();
+ if (iterator2 == container->m2.end()) {
+ need_switch_to_next = true;
+ }
+ break;
+ }
+ case 3: {
+ iterator3 = container->m3.begin();
+ if (iterator3 == container->m3.end()) {
+ need_switch_to_next = true;
+ }
+ break;
+ }
+ case 4: {
+ iterator4 = container->ms.begin();
+ break;
+ }
+ }
+ }
+ while (need_switch_to_next)
+ ;
+
+ return static_cast<Derived&>(*this);
+ }
+
+ auto& operator*() const {
+ switch (sub_table_index) {
+ case 0: {
+ const_cast<iterator_base*>(this)->cell =
*(container->m0.zero_value());
+ break;
+ }
+ case 1: {
+ const_cast<iterator_base*>(this)->cell = *iterator1;
+ break;
+ }
+ case 2: {
+ const_cast<iterator_base*>(this)->cell = *iterator2;
+ break;
+ }
+ case 3: {
+ const_cast<iterator_base*>(this)->cell = *iterator3;
+ break;
+ }
+ case 4: {
+ const_cast<iterator_base*>(this)->cell = *iterator4;
+ break;
+ }
+ }
+ return cell;
+ }
+ auto* operator->() const { return &(this->operator*()); }
+
+ auto get_ptr() const { return &(this->operator*()); }
+
+ size_t get_hash() const {
+ switch (sub_table_index) {
+ case 0: {
+ return container->m0.zero_value()->get_hash(container->m0);
+ }
+ case 1: {
+ return iterator1->get_hash(container->m1);
+ }
+ case 2: {
+ return iterator2->get_hash(container->m2);
+ }
+ case 3: {
+ return iterator3->get_hash(container->m3);
+ }
+ case 4: {
+ return iterator4->get_hash(container->ms);
+ }
+ }
+ }
+
+ size_t get_collision_chain_length() const { return 0; }
+
+ /**
+ * A hack for HashedDictionary.
+ *
+ * The problem: std-like find() returns an iterator, which has to be
+ * compared to end(). On the other hand, HashMap::find() returns
+ * LookupResult, which is compared to nullptr. HashedDictionary has to
+ * support both hash maps with the same code, hence the need for this
+ * hack.
+ *
+ * The proper way would be to remove iterator interface from our
+ * HashMap completely, change all its users to the existing internal
+ * iteration interface, and redefine end() to return LookupResult for
+ * compatibility with std find(). Unfortunately, now is not the time
to
+ * do this.
+ */
+ operator Cell*() const { return nullptr; }
+ };
+
+public:
+ using Key = StringRef;
+ using key_type = Key;
+ using mapped_type = typename Ts::mapped_type;
+ using value_type = typename Ts::value_type;
+ using cell_type = typename Ts::cell_type;
+
+ using LookupResult = StringHashTableLookupResult<typename
cell_type::mapped_type>;
+ using ConstLookupResult = StringHashTableLookupResult<const typename
cell_type::mapped_type>;
+
+ StringHashTable() = default;
+
+ explicit StringHashTable(size_t reserve_for_num_elements)
+ : m1 {reserve_for_num_elements / 4},
+ m2 {reserve_for_num_elements / 4},
+ m3 {reserve_for_num_elements / 4},
+ ms {reserve_for_num_elements / 4} {}
+
+ StringHashTable(StringHashTable&& rhs) noexcept
+ : m1(std::move(rhs.m1)),
+ m2(std::move(rhs.m2)),
+ m3(std::move(rhs.m3)),
+ ms(std::move(rhs.ms)) {}
+
+ ~StringHashTable() = default;
+
+ // Dispatch is written in a way that maximizes the performance:
+ // 1. Always memcpy 8 times bytes
+ // 2. Use switch case extension to generate fast dispatching table
+ // 3. Funcs are named callables that can be force_inlined
+ //
+ // NOTE: It relies on Little Endianness
+ //
+ // NOTE: It requires padded to 8 bytes keys (IOW you cannot pass
+ // std::string here, but you can pass i.e. ColumnString::getDataAt()),
+ // since it copies 8 bytes at a time.
+ template <typename Self, typename KeyHolder, typename Func>
+ static auto ALWAYS_INLINE dispatch(Self& self, KeyHolder&& key_holder,
Func&& func) {
+ StringHashTableHash hash;
+ const StringRef& x = key_holder_get_key(key_holder);
+ const size_t sz = x.size;
+ if (sz == 0) {
+ key_holder_discard_key(key_holder);
+ return func(self.m0, VoidKey {}, 0);
+ }
+
+ if (x.data[sz - 1] == 0) {
+ // Strings with trailing zeros are not representable as fixed-size
+ // string keys. Put them to the generic table.
+ return func(self.ms, std::forward<KeyHolder>(key_holder), hash(x));
+ }
+
+ const char* p = x.data;
+ // pending bits that needs to be shifted out
+ const char s = (-sz & 7) * 8;
+ union {
+ StringKey8 k8;
+ StringKey16 k16;
+ StringKey24 k24;
+ doris::vectorized::UInt64 n[3];
+ };
+ switch ((sz - 1) >> 3) {
+ case 0: // 1..8 bytes
+ {
+ // first half page
+ if ((reinterpret_cast<uintptr_t>(p) & 2048) == 0) {
+ memcpy(&n[0], p, 8);
+ n[0] &= -1ULL >> s;
+ } else {
+ const char* lp = x.data + x.size - 8;
+ memcpy(&n[0], lp, 8);
+ n[0] >>= s;
+ }
+ key_holder_discard_key(key_holder);
+ return func(self.m1, k8, hash(k8));
+ }
+ case 1: // 9..16 bytes
+ {
+ memcpy(&n[0], p, 8);
+ const char* lp = x.data + x.size - 8;
+ memcpy(&n[1], lp, 8);
+ n[1] >>= s;
+ key_holder_discard_key(key_holder);
+ return func(self.m2, k16, hash(k16));
+ }
+ case 2: // 17..24 bytes
+ {
+ memcpy(&n[0], p, 16);
+ const char* lp = x.data + x.size - 8;
+ memcpy(&n[2], lp, 8);
+ n[2] >>= s;
+ key_holder_discard_key(key_holder);
+ return func(self.m3, k24, hash(k24));
+ }
+ default: // >= 25 bytes
+ {
+ return func(self.ms, std::forward<KeyHolder>(key_holder), hash(x));
+ }
+ }
+ }
+
+ struct EmplaceCallable {
+ LookupResult& mapped;
+ bool& inserted;
+
+ EmplaceCallable(LookupResult& mapped_, bool& inserted_)
+ : mapped(mapped_), inserted(inserted_) {}
+
+ template <typename Map, typename KeyHolder>
+ void ALWAYS_INLINE operator()(Map& map, KeyHolder&& key_holder, size_t
hash) {
+ typename Map::LookupResult result;
+ map.emplace(key_holder, result, inserted, hash);
+ mapped = &result->get_second();
+ }
+ };
+
+ template <typename KeyHolder>
+ void ALWAYS_INLINE emplace(KeyHolder&& key_holder, LookupResult& it, bool&
inserted) {
+ this->dispatch(*this, key_holder, EmplaceCallable(it, inserted));
+ }
+
+ struct FindCallable {
+ // find() doesn't need any key memory management, so we don't work with
+ // any key holders here, only with normal keys. The key type is still
+ // different for every subtable, this is why it is a template
parameter.
+ template <typename Submap, typename SubmapKey>
+ auto ALWAYS_INLINE operator()(Submap& map, const SubmapKey& key,
size_t hash) {
+ auto it = map.find(key, hash);
+ if (!it)
+ return decltype(&it->get_mapped()) {};
+ else
+ return &it->get_mapped();
+ }
+ };
+
+ LookupResult ALWAYS_INLINE find(const Key& x) { return dispatch(*this, x,
FindCallable {}); }
+
+ ConstLookupResult ALWAYS_INLINE find(const Key& x) const {
+ return dispatch(*this, x, FindCallable {});
+ }
+
+ bool ALWAYS_INLINE has(const Key& x, size_t = 0) const {
+ return dispatch(*this, x, FindCallable {}) != nullptr;
+ }
+
+ size_t size() const { return m0.size() + m1.size() + m2.size() + m3.size()
+ ms.size(); }
+
+ bool empty() const {
+ return m0.empty() && m1.empty() && m2.empty() && m3.empty() &&
ms.empty();
+ }
+
+ size_t get_buffer_size_in_bytes() const {
+ return m0.get_buffer_size_in_bytes() + m1.get_buffer_size_in_bytes() +
+ m2.get_buffer_size_in_bytes() + m3.get_buffer_size_in_bytes() +
+ ms.get_buffer_size_in_bytes();
+ }
+
+ class iterator : public iterator_base<iterator, false> {
+ public:
+ using iterator_base<iterator, false>::iterator_base;
+ };
+
+ class const_iterator : public iterator_base<const_iterator, true> {
+ public:
+ using iterator_base<const_iterator, true>::iterator_base;
+ };
+
+ const_iterator begin() const { return const_iterator(this); }
+
+ const_iterator cbegin() const { return begin(); }
+
+ iterator begin() { return iterator(this); }
+
+ const_iterator end() const { return const_iterator(this, true); }
+ const_iterator cend() const { return end(); }
+ iterator end() { return iterator(this, true); }
+
+ bool add_elem_size_overflow(size_t add_size) const {
+ return m1.add_elem_size_overflow(add_size) ||
m2.add_elem_size_overflow(add_size) ||
+ m3.add_elem_size_overflow(add_size) ||
ms.add_elem_size_overflow(add_size);
+ }
+
+#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
+ size_t get_collisions() const { return 0; }
+#endif
+};
diff --git a/be/src/vec/exec/vaggregation_node.cpp
b/be/src/vec/exec/vaggregation_node.cpp
index e152e7d477..16f01902cc 100644
--- a/be/src/vec/exec/vaggregation_node.cpp
+++ b/be/src/vec/exec/vaggregation_node.cpp
@@ -170,6 +170,12 @@ void
AggregationNode::_init_hash_method(std::vector<VExprContext*>& probe_exprs)
}
return;
}
+ case TYPE_CHAR:
+ case TYPE_VARCHAR:
+ case TYPE_STRING: {
+ _agg_data.init(AggregatedDataVariants::Type::string_key,
is_nullable);
+ break;
+ }
default:
_agg_data.init(AggregatedDataVariants::Type::serialized);
}
diff --git a/be/src/vec/exec/vaggregation_node.h
b/be/src/vec/exec/vaggregation_node.h
index c2111905dc..cf49a3f9c7 100644
--- a/be/src/vec/exec/vaggregation_node.h
+++ b/be/src/vec/exec/vaggregation_node.h
@@ -107,6 +107,42 @@ private:
using AggregatedDataWithoutKey = AggregateDataPtr;
using AggregatedDataWithStringKey = PHHashMap<StringRef, AggregateDataPtr,
DefaultHash<StringRef>>;
+using AggregatedDataWithShortStringKey = StringHashMap<AggregateDataPtr>;
+
/// Aggregation method state for a single string key column, backed by a
/// string hash map (TData, e.g. StringHashMap<AggregateDataPtr>).
/// (The "NoCache" suffix follows the upstream ClickHouse naming — presumably
/// no per-key result cache; verify against upstream if it matters.)
template <typename TData>
struct AggregationMethodStringNoCache {
    using Data = TData;
    using Key = typename Data::key_type;
    using Mapped = typename Data::mapped_type;
    using Iterator = typename Data::iterator;

    Data data;
    // Output-phase iterator over `data`; valid only after init_once().
    Iterator iterator;
    // Guards one-time initialization of `iterator`.
    bool inited = false;

    AggregationMethodStringNoCache() = default;

    explicit AggregationMethodStringNoCache(size_t size_hint) : data(size_hint) {}

    template <typename Other>
    explicit AggregationMethodStringNoCache(const Other& other) : data(other.data) {}

    using State = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped, true, false>;

    static const bool low_cardinality_optimization = false;

    // Write the string key back into the (single) output key column.
    static void insert_key_into_columns(const StringRef& key, MutableColumns& key_columns,
                                        const Sizes&) {
        key_columns[0]->insert_data(key.data, key.size);
    }

    // Lazily position `iterator` at the start of the map, exactly once.
    void init_once() {
        if (!inited) {
            inited = true;
            iterator = data.begin();
        }
    }
};
/// For the case where there is one numeric key.
/// FieldType is UInt8/16/32/64 for any type with corresponding bit width.
@@ -290,6 +326,8 @@ using AggregatedDataWithNullableUInt8Key =
AggregationDataWithNullKey<Aggregated
using AggregatedDataWithNullableUInt16Key =
AggregationDataWithNullKey<AggregatedDataWithUInt16Key>;
using AggregatedDataWithNullableUInt32Key =
AggregationDataWithNullKey<AggregatedDataWithUInt32Key>;
using AggregatedDataWithNullableUInt64Key =
AggregationDataWithNullKey<AggregatedDataWithUInt64Key>;
+using AggregatedDataWithNullableShortStringKey =
+ AggregationDataWithNullKey<AggregatedDataWithShortStringKey>;
using AggregatedDataWithNullableUInt128Key =
AggregationDataWithNullKey<AggregatedDataWithUInt128Key>;
@@ -299,6 +337,7 @@ using AggregatedMethodVariants = std::variant<
AggregationMethodOneNumber<UInt16, AggregatedDataWithUInt16Key, false>,
AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt32Key>,
AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64Key>,
+ AggregationMethodStringNoCache<AggregatedDataWithShortStringKey>,
AggregationMethodOneNumber<UInt128, AggregatedDataWithUInt128Key>,
AggregationMethodSingleNullableColumn<
AggregationMethodOneNumber<UInt8,
AggregatedDataWithNullableUInt8Key, false>>,
@@ -310,6 +349,8 @@ using AggregatedMethodVariants = std::variant<
AggregationMethodOneNumber<UInt64,
AggregatedDataWithNullableUInt64Key>>,
AggregationMethodSingleNullableColumn<
AggregationMethodOneNumber<UInt128,
AggregatedDataWithNullableUInt128Key>>,
+ AggregationMethodSingleNullableColumn<
+
AggregationMethodStringNoCache<AggregatedDataWithNullableShortStringKey>>,
AggregationMethodKeysFixed<AggregatedDataWithUInt64Key, false>,
AggregationMethodKeysFixed<AggregatedDataWithUInt64Key, true>,
AggregationMethodKeysFixed<AggregatedDataWithUInt128Key, false>,
@@ -336,7 +377,8 @@ struct AggregatedDataVariants {
int128_key,
int64_keys,
int128_keys,
- int256_keys
+ int256_keys,
+ string_key,
};
Type _type = Type::EMPTY;
@@ -425,6 +467,16 @@ struct AggregatedDataVariants {
.emplace<AggregationMethodKeysFixed<AggregatedDataWithUInt256Key, false>>();
}
break;
+ case Type::string_key:
+ if (is_nullable) {
+ _aggregated_method_variant.emplace<
+
AggregationMethodSingleNullableColumn<AggregationMethodStringNoCache<
+ AggregatedDataWithNullableShortStringKey>>>();
+ } else {
+ _aggregated_method_variant.emplace<
+
AggregationMethodStringNoCache<AggregatedDataWithShortStringKey>>();
+ }
+ break;
default:
DCHECK(false) << "Do not have a rigth agg data type";
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]