This is an automated email from the ASF dual-hosted git repository.

zclll pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new f4fdf54c871 [Improvement](hash) remove nullable when 
_serialize_null_into_key is false and add int72 (#58316)
f4fdf54c871 is described below

commit f4fdf54c8716e10c034cc4f7d9bf8ac19f31905e
Author: Pxl <[email protected]>
AuthorDate: Thu Nov 27 18:44:27 2025 +0800

    [Improvement](hash) remove nullable when _serialize_null_into_key is false and add int72 (#58316)
    
    tpcds q97 8.8s -> 8.3s, q2 9s -> 8.7s
    
    This pull request adds support for a new fixed-width hash key type,
    `UInt72`, across the codebase. This enables more efficient handling of
    hash keys that are 72 bits wide in various data processing components,
    including aggregation, joins, sets, partitioning, and dictionary hash
    maps. The changes involve updating type variants, hash key type
    detection, and hash functions to accommodate the new type.
    
    ### Hash Key Type Support
    
    * Introduced the new packed `UInt72` struct (defined in `uint128.h`) and
    added a matching `HashKeyType::fixed72` enumerator and size-based
    detection branch in `hash_key_type.h`, allowing the system to recognize
    and use 72-bit hash keys.
    
[[1]](diffhunk://#diff-0dea38f1f0f0f99ad74d97d77e100557d743ad599b3f5f75c825baf9c13ecdbfR64-R72)
    
[[2]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144R40)
    
[[3]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144R63-R64)
    
    ### Variant and Method Updates
    
    * Added `UInt72`-based variants to all major hash table, aggregation,
    distinct, set, partition, and dictionary hash map method variant
    definitions and their corresponding initialization logic, ensuring that
    all relevant components can utilize the new key type.
    
[[1]](diffhunk://#diff-50d8f62236d4e1f81d52e945edee5377b7b22d52e04128eea2c8b7f679b37254R84-R87)
    
[[2]](diffhunk://#diff-50d8f62236d4e1f81d52e945edee5377b7b22d52e04128eea2c8b7f679b37254R141-R144)
    
[[3]](diffhunk://#diff-62ad0a1cb1b62de5393935298725cfd2e9766215bdd7653d84cd1fd5e7f59fe3R108-R111)
    
[[4]](diffhunk://#diff-62ad0a1cb1b62de5393935298725cfd2e9766215bdd7653d84cd1fd5e7f59fe3R160-R163)
    
[[5]](diffhunk://#diff-66cf4052118abf5abbef2e0d9193df3c35a46f70db35853c5884d56d4118a963L69-R70)
    
[[6]](diffhunk://#diff-66cf4052118abf5abbef2e0d9193df3c35a46f70db35853c5884d56d4118a963R107-R110)
    
[[7]](diffhunk://#diff-c557434b23ebbb39ef2851b7926d61af5be4bf8f56b83a92b98f9a574f805a90R143-R146)
    
[[8]](diffhunk://#diff-c557434b23ebbb39ef2851b7926d61af5be4bf8f56b83a92b98f9a574f805a90R203-R206)
    
[[9]](diffhunk://#diff-8b095a1e764b3856129d9fd06fb9122a7e9eb16bc5c293d8dcaa4ff841a587edR70)
    
[[10]](diffhunk://#diff-8b095a1e764b3856129d9fd06fb9122a7e9eb16bc5c293d8dcaa4ff841a587edR109-R112)
    
[[11]](diffhunk://#diff-60243aa7720001b0983bd282c74f77c8a8542a9a6fed08d80061c4f25847b650L50-R52)
    
[[12]](diffhunk://#diff-60243aa7720001b0983bd282c74f77c8a8542a9a6fed08d80061c4f25847b650R87-R89)
    
    ### Hash Function Implementation
    
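    * Implemented a specialized CRC32 hash function for `UInt72` in
    `hash.h`, ensuring proper hashing behavior for the new type. The
    existing `UInt136` specialization now calls the CRC32 intrinsics
    unconditionally: its `#if defined(__SSE4_2__) || defined(__aarch64__)`
    guard and the `Hash128to64` fallback branch were removed.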
    
    ### Code Generation and Instantiation
    
    * Updated template instantiations and code generation macros to include
    `FixedKeyHashTableContext<vectorized::UInt72>`, ensuring that join and
    hash table probing logic supports the new key type.
    
    ### Minor Logic Adjustment
    
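    * Refactored build key column handling in `hashjoin_build_sink.cpp`:
    when `_serialize_null_into_key[i]` is set, the key data type is made
    nullable; otherwise its nullability is stripped with `remove_nullable`,
    because in that case the null map is used to represent null values.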
---
 be/src/pipeline/common/agg_utils.h                  |  9 +++++++--
 be/src/pipeline/common/distinct_agg_utils.h         |  9 +++++++--
 be/src/pipeline/common/join_utils.h                 |  7 ++++++-
 be/src/pipeline/common/partition_sort_utils.h       |  7 ++++++-
 be/src/pipeline/common/set_utils.h                  |  5 +++++
 be/src/pipeline/exec/hashjoin_build_sink.cpp        |  6 ++++--
 .../exec/join/process_hash_table_probe_impl.h       |  1 +
 be/src/vec/common/hash_table/hash.h                 | 21 ++++++++++++++-------
 be/src/vec/common/hash_table/hash_key_type.h        |  3 +++
 be/src/vec/common/uint128.h                         |  9 +++++++++
 be/src/vec/functions/complex_dict_hash_map.h        |  8 ++++++--
 be/test/pipeline/common/distinct_agg_utils_test.cpp |  7 ++++++-
 be/test/pipeline/common/set_utils_test.cpp          |  6 +++++-
 13 files changed, 79 insertions(+), 19 deletions(-)

diff --git a/be/src/pipeline/common/agg_utils.h 
b/be/src/pipeline/common/agg_utils.h
index 146649f96b1..f0cf0a17f2a 100644
--- a/be/src/pipeline/common/agg_utils.h
+++ b/be/src/pipeline/common/agg_utils.h
@@ -81,9 +81,10 @@ using AggregatedMethodVariants = std::variant<
         vectorized::MethodSingleNullableColumn<
                 
vectorized::MethodStringNoCache<AggregatedDataWithNullableShortStringKey>>,
         vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>,
+        vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>,
         vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>,
-        vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>,
-        vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>>;
+        vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>,
+        vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>>;
 
 struct AggregatedDataVariants
         : public DataVariants<AggregatedMethodVariants, 
vectorized::MethodSingleNullableColumn,
@@ -137,6 +138,10 @@ struct AggregatedDataVariants
             
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/distinct_agg_utils.h 
b/be/src/pipeline/common/distinct_agg_utils.h
index 592132eba6b..3c95a2793fc 100644
--- a/be/src/pipeline/common/distinct_agg_utils.h
+++ b/be/src/pipeline/common/distinct_agg_utils.h
@@ -105,9 +105,10 @@ using DistinctMethodVariants = std::variant<
         vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
                 vectorized::DataWithNullKey<DistinctDataWithShortStringKey>>>,
         vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>,
+        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>,
         vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>,
-        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>,
-        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>>;
+        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>,
+        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>>;
 
 struct DistinctDataVariants
         : public DataVariants<DistinctMethodVariants, 
vectorized::MethodSingleNullableColumn,
@@ -156,6 +157,10 @@ struct DistinctDataVariants
             
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/join_utils.h 
b/be/src/pipeline/common/join_utils.h
index c10b748f82f..08708f037ba 100644
--- a/be/src/pipeline/common/join_utils.h
+++ b/be/src/pipeline/common/join_utils.h
@@ -66,7 +66,8 @@ using HashTableVariants = std::variant<
         DirectPrimaryTypeHashTableContext<vectorized::UInt32>,
         DirectPrimaryTypeHashTableContext<vectorized::UInt64>,
         DirectPrimaryTypeHashTableContext<vectorized::UInt128>,
-        FixedKeyHashTableContext<vectorized::UInt64>, 
FixedKeyHashTableContext<vectorized::UInt128>,
+        FixedKeyHashTableContext<vectorized::UInt64>, 
FixedKeyHashTableContext<vectorized::UInt72>,
+        FixedKeyHashTableContext<vectorized::UInt128>,
         FixedKeyHashTableContext<vectorized::UInt136>,
         FixedKeyHashTableContext<vectorized::UInt256>, MethodOneString>;
 
@@ -103,6 +104,10 @@ struct JoinDataVariants {
             
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt64>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt72>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt128>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/partition_sort_utils.h 
b/be/src/pipeline/common/partition_sort_utils.h
index 381dd3ec42b..ccd1b6a144d 100644
--- a/be/src/pipeline/common/partition_sort_utils.h
+++ b/be/src/pipeline/common/partition_sort_utils.h
@@ -140,9 +140,10 @@ using PartitionedMethodVariants = std::variant<
         PartitionDataSingleNullable<vectorized::UInt128>,
         PartitionDataSingleNullable<vectorized::UInt256>,
         vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>,
+        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>,
         vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>,
-        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
         vectorized::MethodKeysFixed<PartitionData<vectorized::UInt136>>,
+        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
         vectorized::MethodStringNoCache<PartitionDataWithShortStringKey>,
         vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
                 
vectorized::DataWithNullKey<PartitionDataWithShortStringKey>>>>;
@@ -199,6 +200,10 @@ struct PartitionedHashMapVariants
             
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/set_utils.h 
b/be/src/pipeline/common/set_utils.h
index d9f70b1e457..665a7710fa8 100644
--- a/be/src/pipeline/common/set_utils.h
+++ b/be/src/pipeline/common/set_utils.h
@@ -67,6 +67,7 @@ using SetHashTableVariants =
                      SetPrimaryTypeHashTableContext<vectorized::UInt128>,
                      SetPrimaryTypeHashTableContext<vectorized::UInt256>,
                      SetFixedKeyHashTableContext<vectorized::UInt64>,
+                     SetFixedKeyHashTableContext<vectorized::UInt72>,
                      SetFixedKeyHashTableContext<vectorized::UInt128>,
                      SetFixedKeyHashTableContext<vectorized::UInt256>,
                      SetFixedKeyHashTableContext<vectorized::UInt136>>;
@@ -105,6 +106,10 @@ struct SetDataVariants
             
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt64>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt72>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt128>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp 
b/be/src/pipeline/exec/hashjoin_build_sink.cpp
index cbc22f7168d..778409087ee 100644
--- a/be/src/pipeline/exec/hashjoin_build_sink.cpp
+++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp
@@ -450,9 +450,11 @@ Status 
HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state,
         /// For 'null safe equal' join,
         /// the build key column maybe be converted to nullable from 
non-nullable.
         if (p._serialize_null_into_key[i]) {
-            data_type = vectorized::make_nullable(data_type);
+            data_types.emplace_back(vectorized::make_nullable(data_type));
+        } else {
+            // in this case, we use nullmap to represent null value
+            data_types.emplace_back(vectorized::remove_nullable(data_type));
         }
-        data_types.emplace_back(std::move(data_type));
     }
     if (_build_expr_ctxs.size() == 1) {
         p._should_keep_hash_key_column = true;
diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h 
b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
index 1f1edec4335..6753052f61c 100644
--- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
+++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
@@ -803,6 +803,7 @@ struct ExtractType<T(U)> {
     INSTANTIATION(JoinOpType, 
(PrimaryTypeHashTableContext<vectorized::UInt128>));       \
     INSTANTIATION(JoinOpType, 
(PrimaryTypeHashTableContext<vectorized::UInt256>));       \
     INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt64>)); 
          \
+    INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt72>)); 
          \
     INSTANTIATION(JoinOpType, 
(FixedKeyHashTableContext<vectorized::UInt128>));          \
     INSTANTIATION(JoinOpType, 
(FixedKeyHashTableContext<vectorized::UInt136>));          \
     INSTANTIATION(JoinOpType, 
(FixedKeyHashTableContext<vectorized::UInt256>));          \
diff --git a/be/src/vec/common/hash_table/hash.h 
b/be/src/vec/common/hash_table/hash.h
index 4b0e20a01a0..6817d7e091d 100644
--- a/be/src/vec/common/hash_table/hash.h
+++ b/be/src/vec/common/hash_table/hash.h
@@ -189,20 +189,27 @@ struct HashCRC32<wide::Int256> {
     }
 };
 
+#include "common/compile_check_avoid_begin.h"
+
+template <>
+struct HashCRC32<doris::vectorized::UInt72> {
+    size_t operator()(const doris::vectorized::UInt72& x) const {
+        doris::vectorized::UInt64 crc = -1ULL;
+        crc = _mm_crc32_u8(crc, x.a);
+        crc = _mm_crc32_u64(crc, x.b);
+        return crc;
+    }
+};
+
 template <>
 struct HashCRC32<doris::vectorized::UInt136> {
     size_t operator()(const doris::vectorized::UInt136& x) const {
-#if defined(__SSE4_2__) || defined(__aarch64__)
         doris::vectorized::UInt64 crc = -1ULL;
-#include "common/compile_check_avoid_begin.h"
-        //_mm_crc32_u8 does not provide a u64 interface, so there is an 
unavoidable conversion from u64 to u32 here.
         crc = _mm_crc32_u8(crc, x.a);
-#include "common/compile_check_avoid_end.h"
         crc = _mm_crc32_u64(crc, x.b);
         crc = _mm_crc32_u64(crc, x.c);
         return crc;
-#else
-        return Hash128to64({Hash128to64({x.a, x.b}), x.c});
-#endif
     }
 };
+
+#include "common/compile_check_avoid_end.h"
diff --git a/be/src/vec/common/hash_table/hash_key_type.h 
b/be/src/vec/common/hash_table/hash_key_type.h
index 52d264371cb..7a04137324e 100644
--- a/be/src/vec/common/hash_table/hash_key_type.h
+++ b/be/src/vec/common/hash_table/hash_key_type.h
@@ -37,6 +37,7 @@ enum class HashKeyType {
     int256_key,
     string_key,
     fixed64,
+    fixed72,
     fixed128,
     fixed136,
     fixed256
@@ -59,6 +60,8 @@ inline HashKeyType get_hash_key_type_with_fixed(size_t size) {
     using namespace vectorized;
     if (size <= sizeof(UInt64)) {
         return HashKeyType::fixed64;
+    } else if (size <= sizeof(UInt72)) {
+        return HashKeyType::fixed72;
     } else if (size <= sizeof(UInt128)) {
         return HashKeyType::fixed128;
     } else if (size <= sizeof(UInt136)) {
diff --git a/be/src/vec/common/uint128.h b/be/src/vec/common/uint128.h
index 961a4958955..2a6bb70177d 100644
--- a/be/src/vec/common/uint128.h
+++ b/be/src/vec/common/uint128.h
@@ -61,6 +61,15 @@ struct UInt128TrivialHash {
 
 using UInt256 = wide::UInt256;
 
+#pragma pack(1)
+struct UInt72 {
+    UInt8 a;
+    UInt64 b;
+
+    bool operator==(const UInt72& rhs) const { return a == rhs.a && b == 
rhs.b; }
+};
+#pragma pack()
+
 #pragma pack(1)
 struct UInt136 {
     UInt8 a;
diff --git a/be/src/vec/functions/complex_dict_hash_map.h 
b/be/src/vec/functions/complex_dict_hash_map.h
index de06ce3568e..d815cbb0904 100644
--- a/be/src/vec/functions/complex_dict_hash_map.h
+++ b/be/src/vec/functions/complex_dict_hash_map.h
@@ -47,8 +47,9 @@ using DictHashMapVariants = std::variant<
         MethodOneNumber<UInt128, DictHashMap<UInt128>>,
         MethodOneNumber<UInt256, DictHashMap<UInt256>>,
 
-        MethodKeysFixed<DictHashMap<UInt64>>, 
MethodKeysFixed<DictHashMap<UInt128>>,
-        MethodKeysFixed<DictHashMap<UInt256>>, 
MethodKeysFixed<DictHashMap<UInt136>>>;
+        MethodKeysFixed<DictHashMap<UInt64>>, 
MethodKeysFixed<DictHashMap<UInt72>>,
+        MethodKeysFixed<DictHashMap<UInt128>>, 
MethodKeysFixed<DictHashMap<UInt136>>,
+        MethodKeysFixed<DictHashMap<UInt256>>>;
 
 struct DictionaryHashMapMethod
         : public DataVariants<DictHashMapVariants, 
vectorized::MethodSingleNullableColumn,
@@ -83,6 +84,9 @@ struct DictionaryHashMapMethod
         case HashKeyType::fixed64:
             
method_variant.emplace<MethodKeysFixed<DictHashMap<UInt64>>>(get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            
method_variant.emplace<MethodKeysFixed<DictHashMap<UInt72>>>(get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             method_variant.emplace<MethodKeysFixed<DictHashMap<UInt128>>>(
                     get_key_sizes(data_types));
diff --git a/be/test/pipeline/common/distinct_agg_utils_test.cpp 
b/be/test/pipeline/common/distinct_agg_utils_test.cpp
index 8d0c5bcb98f..33a572455e2 100644
--- a/be/test/pipeline/common/distinct_agg_utils_test.cpp
+++ b/be/test/pipeline/common/distinct_agg_utils_test.cpp
@@ -159,6 +159,11 @@ TEST_F(DistinctAggUtilsTest, 
TestDistinctDataVariantsInitFixedKeys) {
                         
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>>(
                     variants.method_variant));
             break;
+        case HashKeyType::fixed72:
+            ASSERT_TRUE(std::holds_alternative<
+                        
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>>(
+                    variants.method_variant));
+            break;
         case HashKeyType::fixed128:
             ASSERT_TRUE(std::holds_alternative<
                         
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>>(
@@ -190,7 +195,7 @@ TEST_F(DistinctAggUtilsTest, 
TestDistinctDataVariantsInitFixedKeys) {
 
     test_block({std::make_shared<vectorized::DataTypeInt64>(),
                 std::make_shared<vectorized::DataTypeUInt8>()},
-               HashKeyType::fixed128);
+               HashKeyType::fixed72);
 
     test_block({std::make_shared<vectorized::DataTypeInt64>(),
                 std::make_shared<vectorized::DataTypeInt64>()},
diff --git a/be/test/pipeline/common/set_utils_test.cpp 
b/be/test/pipeline/common/set_utils_test.cpp
index 89bd2e175c5..bb12a8edb6f 100644
--- a/be/test/pipeline/common/set_utils_test.cpp
+++ b/be/test/pipeline/common/set_utils_test.cpp
@@ -93,6 +93,10 @@ TEST_F(SetUtilsTest, TestSetDataVariantsInitFixedKeys) {
             
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt64>>(
                     variants.method_variant));
             break;
+        case HashKeyType::fixed72:
+            
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt72>>(
+                    variants.method_variant));
+            break;
         case HashKeyType::fixed128:
             
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt128>>(
                     variants.method_variant));
@@ -121,7 +125,7 @@ TEST_F(SetUtilsTest, TestSetDataVariantsInitFixedKeys) {
 
     test_block({std::make_shared<vectorized::DataTypeInt64>(),
                 std::make_shared<vectorized::DataTypeUInt8>()},
-               HashKeyType::fixed128);
+               HashKeyType::fixed72);
 
     test_block({std::make_shared<vectorized::DataTypeInt64>(),
                 std::make_shared<vectorized::DataTypeInt64>()},


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to