This is an automated email from the ASF dual-hosted git repository.
panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 245f4a8ff76 [Improvement](hash) add int96 int104 to hash method
(#58770)
245f4a8ff76 is described below
commit 245f4a8ff763ffd9911e4bb1f5903a336dedc9f1
Author: Pxl <[email protected]>
AuthorDate: Wed Dec 10 11:51:26 2025 +0800
[Improvement](hash) add int96 int104 to hash method (#58770)
Example query used for the before/after comparison below:
```sql
select
count(*)
from
(
select
brand_id,
class_id,
category_id
from
(
SELECT
iss.i_brand_id brand_id,
iss.i_class_id class_id,
iss.i_category_id category_id
FROM
store_sales,
item iss,
date_dim d1
WHERE
(ss_item_sk = iss.i_item_sk)
AND (ss_sold_date_sk = d1.d_date_sk)
AND (
d1.d_year BETWEEN 1999
AND (1999 + 2)
)
)tmp
group by
brand_id,
class_id,
category_id
)tmp2;
```
before:
<img width="904" height="262" alt="QQ_1764934227381"
src="https://github.com/user-attachments/assets/771a51d7-049d-49a0-a4af-eab318047c2d"
/>
after:
<img width="808" height="250" alt="QQ_1764934235361"
src="https://github.com/user-attachments/assets/56ea2e41-04d4-4cd7-a3a9-3c1f8eab596c"
/>
This pull request adds two new fixed-width hash key types, `UInt96` and
`UInt104`, to the aggregation, join, set, partition, and dictionary hash
map utilities. Previously, a fixed-width key that was too large for
`UInt72` fell through to `UInt128`; keys that fit into the packed
12-byte `UInt96` or 13-byte `UInt104` now use those smaller types
instead. The new types are wired into the relevant variant types, hash
functions, and tests, improving flexibility and performance for
scenarios that require these key sizes.
**Support for new hash key types (`UInt96` and `UInt104`):**
* Added new struct definitions for `UInt96` and `UInt104` in
`uint128.h`, including equality operators.
* Updated the `HashKeyType` enum and `get_hash_key_type_with_fixed`
function to include `fixed96` and `fixed104` options.
[[1]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144R41-R42)
[[2]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144R67-R70)
* Implemented `HashCRC32` specializations for `UInt96` and `UInt104` to
enable CRC32 hashing for these types.
**Integration into aggregation, set, join, partition, and dictionary
utilities:**
* Extended the variant types and initialization logic in aggregation
(`agg_utils.h`), distinct aggregation (`distinct_agg_utils.h`), set
(`set_utils.h`), join (`join_utils.h`), partition sort
(`partition_sort_utils.h`), and dictionary hash map
(`complex_dict_hash_map.h`) utilities to support the new key types.
[[1]](diffhunk://#diff-50d8f62236d4e1f81d52e945edee5377b7b22d52e04128eea2c8b7f679b37254R85-R86)
[[2]](diffhunk://#diff-50d8f62236d4e1f81d52e945edee5377b7b22d52e04128eea2c8b7f679b37254R147-R154)
[[3]](diffhunk://#diff-62ad0a1cb1b62de5393935298725cfd2e9766215bdd7653d84cd1fd5e7f59fe3R109-R110)
[[4]](diffhunk://#diff-62ad0a1cb1b62de5393935298725cfd2e9766215bdd7653d84cd1fd5e7f59fe3R166-R173)
[[5]](diffhunk://#diff-8b095a1e764b3856129d9fd06fb9122a7e9eb16bc5c293d8dcaa4ff841a587edR71-R72)
[[6]](diffhunk://#diff-8b095a1e764b3856129d9fd06fb9122a7e9eb16bc5c293d8dcaa4ff841a587edR115-R122)
[[7]](diffhunk://#diff-66cf4052118abf5abbef2e0d9193df3c35a46f70db35853c5884d56d4118a963R70)
[[8]](diffhunk://#diff-66cf4052118abf5abbef2e0d9193df3c35a46f70db35853c5884d56d4118a963R112-R119)
[[9]](diffhunk://#diff-c557434b23ebbb39ef2851b7926d61af5be4bf8f56b83a92b98f9a574f805a90R144-R145)
[[10]](diffhunk://#diff-c557434b23ebbb39ef2851b7926d61af5be4bf8f56b83a92b98f9a574f805a90R209-R216)
[[11]](diffhunk://#diff-60243aa7720001b0983bd282c74f77c8a8542a9a6fed08d80061c4f25847b650R51)
[[12]](diffhunk://#diff-60243aa7720001b0983bd282c74f77c8a8542a9a6fed08d80061c4f25847b650R91-R97)
* Added explicit template instantiations for the new key types
(`FixedKeyHashTableContext<UInt96>` and `FixedKeyHashTableContext<UInt104>`)
in the join probe implementation (`process_hash_table_probe_impl.h`).
**Test coverage:**
* Added test cases to verify initialization and type handling for the
new key types in set and distinct aggregation utilities.
[[1]](diffhunk://#diff-9e0e850ab93037077da8e96f7d72b1d45c40835221ccca205cc20ef571115603R167-R176)
[[2]](diffhunk://#diff-96eb9173d84e4c838fcef6dcef716e5e4519ea678f842b4a512c66bbd2f275b1R100-R107)
---
be/src/pipeline/common/agg_utils.h | 10 ++++++++++
be/src/pipeline/common/distinct_agg_utils.h | 10 ++++++++++
be/src/pipeline/common/join_utils.h | 9 +++++++++
be/src/pipeline/common/partition_sort_utils.h | 10 ++++++++++
be/src/pipeline/common/set_utils.h | 10 ++++++++++
.../exec/join/process_hash_table_probe_impl.h | 2 ++
be/src/vec/common/hash_table/hash.h | 21 +++++++++++++++++++++
be/src/vec/common/hash_table/hash_key_type.h | 6 ++++++
be/src/vec/common/uint128.h | 19 +++++++++++++++++++
be/src/vec/functions/complex_dict_hash_map.h | 8 ++++++++
be/test/pipeline/common/distinct_agg_utils_test.cpp | 10 ++++++++++
be/test/pipeline/common/set_utils_test.cpp | 8 ++++++++
12 files changed, 123 insertions(+)
diff --git a/be/src/pipeline/common/agg_utils.h
b/be/src/pipeline/common/agg_utils.h
index f0cf0a17f2a..f676013c27f 100644
--- a/be/src/pipeline/common/agg_utils.h
+++ b/be/src/pipeline/common/agg_utils.h
@@ -82,6 +82,8 @@ using AggregatedMethodVariants = std::variant<
vectorized::MethodStringNoCache<AggregatedDataWithNullableShortStringKey>>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>,
+ vectorized::MethodKeysFixed<AggData<vectorized::UInt96>>,
+ vectorized::MethodKeysFixed<AggData<vectorized::UInt104>>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>>;
@@ -142,6 +144,14 @@ struct AggregatedDataVariants
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed96:
+
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt96>>>(
+ get_key_sizes(data_types));
+ break;
+ case HashKeyType::fixed104:
+
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt104>>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/distinct_agg_utils.h
b/be/src/pipeline/common/distinct_agg_utils.h
index 3c95a2793fc..17ec246be16 100644
--- a/be/src/pipeline/common/distinct_agg_utils.h
+++ b/be/src/pipeline/common/distinct_agg_utils.h
@@ -106,6 +106,8 @@ using DistinctMethodVariants = std::variant<
vectorized::DataWithNullKey<DistinctDataWithShortStringKey>>>,
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>,
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>,
+ vectorized::MethodKeysFixed<DistinctData<vectorized::UInt96>>,
+ vectorized::MethodKeysFixed<DistinctData<vectorized::UInt104>>,
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>,
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>,
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>>;
@@ -161,6 +163,14 @@ struct DistinctDataVariants
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed96:
+
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt96>>>(
+ get_key_sizes(data_types));
+ break;
+ case HashKeyType::fixed104:
+
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt104>>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/join_utils.h
b/be/src/pipeline/common/join_utils.h
index 08708f037ba..8d46c317fb5 100644
--- a/be/src/pipeline/common/join_utils.h
+++ b/be/src/pipeline/common/join_utils.h
@@ -67,6 +67,7 @@ using HashTableVariants = std::variant<
DirectPrimaryTypeHashTableContext<vectorized::UInt64>,
DirectPrimaryTypeHashTableContext<vectorized::UInt128>,
FixedKeyHashTableContext<vectorized::UInt64>,
FixedKeyHashTableContext<vectorized::UInt72>,
+ FixedKeyHashTableContext<vectorized::UInt96>,
FixedKeyHashTableContext<vectorized::UInt104>,
FixedKeyHashTableContext<vectorized::UInt128>,
FixedKeyHashTableContext<vectorized::UInt136>,
FixedKeyHashTableContext<vectorized::UInt256>, MethodOneString>;
@@ -108,6 +109,14 @@ struct JoinDataVariants {
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt72>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed96:
+
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt96>>(
+ get_key_sizes(data_types));
+ break;
+ case HashKeyType::fixed104:
+
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt104>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt128>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/partition_sort_utils.h
b/be/src/pipeline/common/partition_sort_utils.h
index ccd1b6a144d..e1eebddf1c9 100644
--- a/be/src/pipeline/common/partition_sort_utils.h
+++ b/be/src/pipeline/common/partition_sort_utils.h
@@ -141,6 +141,8 @@ using PartitionedMethodVariants = std::variant<
PartitionDataSingleNullable<vectorized::UInt256>,
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>,
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>,
+ vectorized::MethodKeysFixed<PartitionData<vectorized::UInt96>>,
+ vectorized::MethodKeysFixed<PartitionData<vectorized::UInt104>>,
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>,
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt136>>,
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
@@ -204,6 +206,14 @@ struct PartitionedHashMapVariants
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed96:
+
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt96>>>(
+ get_key_sizes(data_types));
+ break;
+ case HashKeyType::fixed104:
+
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt104>>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/set_utils.h
b/be/src/pipeline/common/set_utils.h
index 665a7710fa8..d08ad883b83 100644
--- a/be/src/pipeline/common/set_utils.h
+++ b/be/src/pipeline/common/set_utils.h
@@ -68,6 +68,8 @@ using SetHashTableVariants =
SetPrimaryTypeHashTableContext<vectorized::UInt256>,
SetFixedKeyHashTableContext<vectorized::UInt64>,
SetFixedKeyHashTableContext<vectorized::UInt72>,
+ SetFixedKeyHashTableContext<vectorized::UInt96>,
+ SetFixedKeyHashTableContext<vectorized::UInt104>,
SetFixedKeyHashTableContext<vectorized::UInt128>,
SetFixedKeyHashTableContext<vectorized::UInt256>,
SetFixedKeyHashTableContext<vectorized::UInt136>>;
@@ -110,6 +112,14 @@ struct SetDataVariants
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt72>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed96:
+
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt96>>(
+ get_key_sizes(data_types));
+ break;
+ case HashKeyType::fixed104:
+
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt104>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt128>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
index 1253afabdbd..60261223cdf 100644
--- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
+++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
@@ -813,6 +813,8 @@ struct ExtractType<T(U)> {
INSTANTIATION(JoinOpType,
(PrimaryTypeHashTableContext<vectorized::UInt256>)); \
INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt64>));
\
INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt72>));
\
+ INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt96>));
\
+ INSTANTIATION(JoinOpType,
(FixedKeyHashTableContext<vectorized::UInt104>)); \
INSTANTIATION(JoinOpType,
(FixedKeyHashTableContext<vectorized::UInt128>)); \
INSTANTIATION(JoinOpType,
(FixedKeyHashTableContext<vectorized::UInt136>)); \
INSTANTIATION(JoinOpType,
(FixedKeyHashTableContext<vectorized::UInt256>)); \
diff --git a/be/src/vec/common/hash_table/hash.h
b/be/src/vec/common/hash_table/hash.h
index 6817d7e091d..348cb5d4555 100644
--- a/be/src/vec/common/hash_table/hash.h
+++ b/be/src/vec/common/hash_table/hash.h
@@ -201,6 +201,27 @@ struct HashCRC32<doris::vectorized::UInt72> {
}
};
+template <>
+struct HashCRC32<doris::vectorized::UInt96> {
+ size_t operator()(const doris::vectorized::UInt96& x) const {
+ doris::vectorized::UInt64 crc = -1ULL;
+ crc = _mm_crc32_u32(crc, x.a);
+ crc = _mm_crc32_u64(crc, x.b);
+ return crc;
+ }
+};
+
+template <>
+struct HashCRC32<doris::vectorized::UInt104> {
+ size_t operator()(const doris::vectorized::UInt104& x) const {
+ doris::vectorized::UInt64 crc = -1ULL;
+ crc = _mm_crc32_u8(crc, x.a);
+ crc = _mm_crc32_u32(crc, x.b);
+ crc = _mm_crc32_u64(crc, x.c);
+ return crc;
+ }
+};
+
template <>
struct HashCRC32<doris::vectorized::UInt136> {
size_t operator()(const doris::vectorized::UInt136& x) const {
diff --git a/be/src/vec/common/hash_table/hash_key_type.h
b/be/src/vec/common/hash_table/hash_key_type.h
index 7a04137324e..025af1bdc40 100644
--- a/be/src/vec/common/hash_table/hash_key_type.h
+++ b/be/src/vec/common/hash_table/hash_key_type.h
@@ -38,6 +38,8 @@ enum class HashKeyType {
string_key,
fixed64,
fixed72,
+ fixed96,
+ fixed104,
fixed128,
fixed136,
fixed256
@@ -62,6 +64,10 @@ inline HashKeyType get_hash_key_type_with_fixed(size_t size)
{
return HashKeyType::fixed64;
} else if (size <= sizeof(UInt72)) {
return HashKeyType::fixed72;
+ } else if (size <= sizeof(UInt96)) {
+ return HashKeyType::fixed96;
+ } else if (size <= sizeof(UInt104)) {
+ return HashKeyType::fixed104;
} else if (size <= sizeof(UInt128)) {
return HashKeyType::fixed128;
} else if (size <= sizeof(UInt136)) {
diff --git a/be/src/vec/common/uint128.h b/be/src/vec/common/uint128.h
index 2a6bb70177d..58db42868a9 100644
--- a/be/src/vec/common/uint128.h
+++ b/be/src/vec/common/uint128.h
@@ -70,6 +70,25 @@ struct UInt72 {
};
#pragma pack()
+#pragma pack(1)
+struct UInt96 {
+ UInt32 a;
+ UInt64 b;
+
+ bool operator==(const UInt96& rhs) const { return a == rhs.a && b ==
rhs.b; }
+};
+#pragma pack()
+
+#pragma pack(1)
+struct UInt104 {
+ UInt8 a;
+ UInt32 b;
+ UInt64 c;
+
+ bool operator==(const UInt104& rhs) const { return a == rhs.a && b ==
rhs.b && c == rhs.c; }
+};
+#pragma pack()
+
#pragma pack(1)
struct UInt136 {
UInt8 a;
diff --git a/be/src/vec/functions/complex_dict_hash_map.h
b/be/src/vec/functions/complex_dict_hash_map.h
index d815cbb0904..15db68240cf 100644
--- a/be/src/vec/functions/complex_dict_hash_map.h
+++ b/be/src/vec/functions/complex_dict_hash_map.h
@@ -48,6 +48,7 @@ using DictHashMapVariants = std::variant<
MethodOneNumber<UInt256, DictHashMap<UInt256>>,
MethodKeysFixed<DictHashMap<UInt64>>,
MethodKeysFixed<DictHashMap<UInt72>>,
+ MethodKeysFixed<DictHashMap<UInt96>>,
MethodKeysFixed<DictHashMap<UInt104>>,
MethodKeysFixed<DictHashMap<UInt128>>,
MethodKeysFixed<DictHashMap<UInt136>>,
MethodKeysFixed<DictHashMap<UInt256>>>;
@@ -87,6 +88,13 @@ struct DictionaryHashMapMethod
case HashKeyType::fixed72:
method_variant.emplace<MethodKeysFixed<DictHashMap<UInt72>>>(get_key_sizes(data_types));
break;
+ case HashKeyType::fixed96:
+
method_variant.emplace<MethodKeysFixed<DictHashMap<UInt96>>>(get_key_sizes(data_types));
+ break;
+ case HashKeyType::fixed104:
+ method_variant.emplace<MethodKeysFixed<DictHashMap<UInt104>>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<MethodKeysFixed<DictHashMap<UInt128>>>(
get_key_sizes(data_types));
diff --git a/be/test/pipeline/common/distinct_agg_utils_test.cpp
b/be/test/pipeline/common/distinct_agg_utils_test.cpp
index 33a572455e2..788c69e3e53 100644
--- a/be/test/pipeline/common/distinct_agg_utils_test.cpp
+++ b/be/test/pipeline/common/distinct_agg_utils_test.cpp
@@ -164,6 +164,16 @@ TEST_F(DistinctAggUtilsTest,
TestDistinctDataVariantsInitFixedKeys) {
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>>(
variants.method_variant));
break;
+ case HashKeyType::fixed96:
+ ASSERT_TRUE(std::holds_alternative<
+
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt96>>>(
+ variants.method_variant));
+ break;
+ case HashKeyType::fixed104:
+ ASSERT_TRUE(std::holds_alternative<
+
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt104>>>(
+ variants.method_variant));
+ break;
case HashKeyType::fixed128:
ASSERT_TRUE(std::holds_alternative<
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>>(
diff --git a/be/test/pipeline/common/set_utils_test.cpp
b/be/test/pipeline/common/set_utils_test.cpp
index bb12a8edb6f..8766c1dd5b1 100644
--- a/be/test/pipeline/common/set_utils_test.cpp
+++ b/be/test/pipeline/common/set_utils_test.cpp
@@ -97,6 +97,14 @@ TEST_F(SetUtilsTest, TestSetDataVariantsInitFixedKeys) {
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt72>>(
variants.method_variant));
break;
+ case HashKeyType::fixed96:
+
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt96>>(
+ variants.method_variant));
+ break;
+ case HashKeyType::fixed104:
+
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt104>>(
+ variants.method_variant));
+ break;
case HashKeyType::fixed128:
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt128>>(
variants.method_variant));
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]