This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch tpc_preview2
in repository https://gitbox.apache.org/repos/asf/doris.git

commit c83e49a4977b68b357e68d8cad2ebd6375d76cb3
Author: BiteTheDDDDt <[email protected]>
AuthorDate: Mon Nov 24 20:59:28 2025 +0800

    remove nullable when _serialize_null_into_key is false and add int72
---
 be/src/pipeline/common/agg_utils.h                  |  9 +++++++--
 be/src/pipeline/common/distinct_agg_utils.h         |  9 +++++++--
 be/src/pipeline/common/join_utils.h                 |  7 ++++++-
 be/src/pipeline/common/partition_sort_utils.h       |  7 ++++++-
 be/src/pipeline/common/set_utils.h                  |  5 +++++
 be/src/pipeline/exec/hashjoin_build_sink.cpp        |  6 ++++--
 .../exec/join/process_hash_table_probe_impl.h       |  1 +
 be/src/vec/common/hash_table/hash.h                 | 21 ++++++++++++++-------
 be/src/vec/common/hash_table/hash_key_type.h        |  3 +++
 be/src/vec/common/hash_table/hash_map_context.h     |  1 -
 be/src/vec/common/hash_table/join_hash_table.h      |  6 +++++-
 be/src/vec/common/uint128.h                         | 10 ++++++++++
 be/src/vec/functions/complex_dict_hash_map.h        |  8 ++++++--
 13 files changed, 74 insertions(+), 19 deletions(-)

diff --git a/be/src/pipeline/common/agg_utils.h b/be/src/pipeline/common/agg_utils.h
index 146649f96b1..f0cf0a17f2a 100644
--- a/be/src/pipeline/common/agg_utils.h
+++ b/be/src/pipeline/common/agg_utils.h
@@ -81,9 +81,10 @@ using AggregatedMethodVariants = std::variant<
         vectorized::MethodSingleNullableColumn<
                 vectorized::MethodStringNoCache<AggregatedDataWithNullableShortStringKey>>,
         vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>,
+        vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>,
         vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>,
-        vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>,
-        vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>>;
+        vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>,
+        vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>>;
 
 struct AggregatedDataVariants
         : public DataVariants<AggregatedMethodVariants, vectorized::MethodSingleNullableColumn,
@@ -137,6 +138,10 @@ struct AggregatedDataVariants
             method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/distinct_agg_utils.h b/be/src/pipeline/common/distinct_agg_utils.h
index 592132eba6b..3c95a2793fc 100644
--- a/be/src/pipeline/common/distinct_agg_utils.h
+++ b/be/src/pipeline/common/distinct_agg_utils.h
@@ -105,9 +105,10 @@ using DistinctMethodVariants = std::variant<
         vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
                 vectorized::DataWithNullKey<DistinctDataWithShortStringKey>>>,
         vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>,
+        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>,
         vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>,
-        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>,
-        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>>;
+        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>,
+        vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>>;
 
 struct DistinctDataVariants
         : public DataVariants<DistinctMethodVariants, vectorized::MethodSingleNullableColumn,
@@ -156,6 +157,10 @@ struct DistinctDataVariants
             method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/join_utils.h b/be/src/pipeline/common/join_utils.h
index c10b748f82f..08708f037ba 100644
--- a/be/src/pipeline/common/join_utils.h
+++ b/be/src/pipeline/common/join_utils.h
@@ -66,7 +66,8 @@ using HashTableVariants = std::variant<
         DirectPrimaryTypeHashTableContext<vectorized::UInt32>,
         DirectPrimaryTypeHashTableContext<vectorized::UInt64>,
         DirectPrimaryTypeHashTableContext<vectorized::UInt128>,
-        FixedKeyHashTableContext<vectorized::UInt64>, FixedKeyHashTableContext<vectorized::UInt128>,
+        FixedKeyHashTableContext<vectorized::UInt64>, FixedKeyHashTableContext<vectorized::UInt72>,
+        FixedKeyHashTableContext<vectorized::UInt128>,
         FixedKeyHashTableContext<vectorized::UInt136>,
         FixedKeyHashTableContext<vectorized::UInt256>, MethodOneString>;
 
@@ -103,6 +104,10 @@ struct JoinDataVariants {
             method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt64>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt72>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt128>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/partition_sort_utils.h b/be/src/pipeline/common/partition_sort_utils.h
index 381dd3ec42b..ccd1b6a144d 100644
--- a/be/src/pipeline/common/partition_sort_utils.h
+++ b/be/src/pipeline/common/partition_sort_utils.h
@@ -140,9 +140,10 @@ using PartitionedMethodVariants = std::variant<
         PartitionDataSingleNullable<vectorized::UInt128>,
         PartitionDataSingleNullable<vectorized::UInt256>,
         vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>,
+        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>,
         vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>,
-        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
         vectorized::MethodKeysFixed<PartitionData<vectorized::UInt136>>,
+        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
         vectorized::MethodStringNoCache<PartitionDataWithShortStringKey>,
         vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
                 vectorized::DataWithNullKey<PartitionDataWithShortStringKey>>>>;
@@ -199,6 +200,10 @@ struct PartitionedHashMapVariants
             method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/set_utils.h b/be/src/pipeline/common/set_utils.h
index d9f70b1e457..665a7710fa8 100644
--- a/be/src/pipeline/common/set_utils.h
+++ b/be/src/pipeline/common/set_utils.h
@@ -67,6 +67,7 @@ using SetHashTableVariants =
                      SetPrimaryTypeHashTableContext<vectorized::UInt128>,
                      SetPrimaryTypeHashTableContext<vectorized::UInt256>,
                      SetFixedKeyHashTableContext<vectorized::UInt64>,
+                     SetFixedKeyHashTableContext<vectorized::UInt72>,
                      SetFixedKeyHashTableContext<vectorized::UInt128>,
                      SetFixedKeyHashTableContext<vectorized::UInt256>,
                      SetFixedKeyHashTableContext<vectorized::UInt136>>;
@@ -105,6 +106,10 @@ struct SetDataVariants
             method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt64>>(
                     get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt72>>(
+                    get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt128>>(
                     get_key_sizes(data_types));
diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp
index cbc22f7168d..778409087ee 100644
--- a/be/src/pipeline/exec/hashjoin_build_sink.cpp
+++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp
@@ -450,9 +450,11 @@ Status HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state,
         /// For 'null safe equal' join,
         /// the build key column maybe be converted to nullable from non-nullable.
         if (p._serialize_null_into_key[i]) {
-            data_type = vectorized::make_nullable(data_type);
+            data_types.emplace_back(vectorized::make_nullable(data_type));
+        } else {
+            // in this case, we use nullmap to represent null value
+            data_types.emplace_back(vectorized::remove_nullable(data_type));
         }
-        data_types.emplace_back(std::move(data_type));
     }
     if (_build_expr_ctxs.size() == 1) {
         p._should_keep_hash_key_column = true;
diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
index 1f1edec4335..6753052f61c 100644
--- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
+++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
@@ -803,6 +803,7 @@ struct ExtractType<T(U)> {
     INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext<vectorized::UInt128>));       \
     INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext<vectorized::UInt256>));       \
     INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt64>));           \
+    INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt72>));           \
     INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt128>));          \
     INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt136>));          \
     INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt256>));          \
diff --git a/be/src/vec/common/hash_table/hash.h b/be/src/vec/common/hash_table/hash.h
index 4b0e20a01a0..6817d7e091d 100644
--- a/be/src/vec/common/hash_table/hash.h
+++ b/be/src/vec/common/hash_table/hash.h
@@ -189,20 +189,27 @@ struct HashCRC32<wide::Int256> {
     }
 };
 
+#include "common/compile_check_avoid_begin.h"
+
+template <>
+struct HashCRC32<doris::vectorized::UInt72> {
+    size_t operator()(const doris::vectorized::UInt72& x) const {
+        doris::vectorized::UInt64 crc = -1ULL;
+        crc = _mm_crc32_u8(crc, x.a);
+        crc = _mm_crc32_u64(crc, x.b);
+        return crc;
+    }
+};
+
 template <>
 struct HashCRC32<doris::vectorized::UInt136> {
     size_t operator()(const doris::vectorized::UInt136& x) const {
-#if defined(__SSE4_2__) || defined(__aarch64__)
         doris::vectorized::UInt64 crc = -1ULL;
-#include "common/compile_check_avoid_begin.h"
-        //_mm_crc32_u8 does not provide a u64 interface, so there is an unavoidable conversion from u64 to u32 here.
         crc = _mm_crc32_u8(crc, x.a);
-#include "common/compile_check_avoid_end.h"
         crc = _mm_crc32_u64(crc, x.b);
         crc = _mm_crc32_u64(crc, x.c);
         return crc;
-#else
-        return Hash128to64({Hash128to64({x.a, x.b}), x.c});
-#endif
     }
 };
+
+#include "common/compile_check_avoid_end.h"
diff --git a/be/src/vec/common/hash_table/hash_key_type.h b/be/src/vec/common/hash_table/hash_key_type.h
index 52d264371cb..7a04137324e 100644
--- a/be/src/vec/common/hash_table/hash_key_type.h
+++ b/be/src/vec/common/hash_table/hash_key_type.h
@@ -37,6 +37,7 @@ enum class HashKeyType {
     int256_key,
     string_key,
     fixed64,
+    fixed72,
     fixed128,
     fixed136,
     fixed256
@@ -59,6 +60,8 @@ inline HashKeyType get_hash_key_type_with_fixed(size_t size) {
     using namespace vectorized;
     if (size <= sizeof(UInt64)) {
         return HashKeyType::fixed64;
+    } else if (size <= sizeof(UInt72)) {
+        return HashKeyType::fixed72;
     } else if (size <= sizeof(UInt128)) {
         return HashKeyType::fixed128;
     } else if (size <= sizeof(UInt136)) {
diff --git a/be/src/vec/common/hash_table/hash_map_context.h b/be/src/vec/common/hash_table/hash_map_context.h
index 0140a30653f..704e2a44833 100644
--- a/be/src/vec/common/hash_table/hash_map_context.h
+++ b/be/src/vec/common/hash_table/hash_map_context.h
@@ -251,7 +251,6 @@ struct MethodSerialized : public MethodBase<TData> {
             for (size_t i = 0; i < num_rows; ++i) {
                 input_keys[i].data =
                         reinterpret_cast<char*>(serialized_key_buffer + i * max_one_row_byte_size);
-                input_keys[i].size = 0;
             }
 
             for (const auto& column : key_columns) {
diff --git a/be/src/vec/common/hash_table/join_hash_table.h b/be/src/vec/common/hash_table/join_hash_table.h
index 9426829e056..0c3d3951431 100644
--- a/be/src/vec/common/hash_table/join_hash_table.h
+++ b/be/src/vec/common/hash_table/join_hash_table.h
@@ -367,7 +367,7 @@ private:
         const auto batch_size = max_batch_size;
 
         auto do_the_probe = [&]() {
-            while (build_idx && matched_cnt < batch_size) {
+            while (build_idx) {
                 if (_eq(keys[probe_idx], build_keys[build_idx])) {
                     probe_idxs[matched_cnt] = probe_idx;
                     build_idxs[matched_cnt] = build_idx;
@@ -378,6 +378,10 @@ private:
                             visited[build_idx] = 1;
                         }
                     }
+                    if (matched_cnt > batch_size) {
+                        build_idx = next[build_idx];
+                        break;
+                    }
                 }
                 build_idx = next[build_idx];
             }
diff --git a/be/src/vec/common/uint128.h b/be/src/vec/common/uint128.h
index 961a4958955..a8eca75fb87 100644
--- a/be/src/vec/common/uint128.h
+++ b/be/src/vec/common/uint128.h
@@ -61,6 +61,16 @@ struct UInt128TrivialHash {
 
 using UInt256 = wide::UInt256;
 
+#pragma pack(1)
+struct UInt72 {
+    UInt8 a;
+    UInt64 b;
+
+    bool operator==(const UInt72& rhs) const { return a == rhs.a && b == rhs.b; }
+};
+#pragma pack()
+
+
 #pragma pack(1)
 struct UInt136 {
     UInt8 a;
diff --git a/be/src/vec/functions/complex_dict_hash_map.h b/be/src/vec/functions/complex_dict_hash_map.h
index de06ce3568e..d815cbb0904 100644
--- a/be/src/vec/functions/complex_dict_hash_map.h
+++ b/be/src/vec/functions/complex_dict_hash_map.h
@@ -47,8 +47,9 @@ using DictHashMapVariants = std::variant<
         MethodOneNumber<UInt128, DictHashMap<UInt128>>,
         MethodOneNumber<UInt256, DictHashMap<UInt256>>,
 
-        MethodKeysFixed<DictHashMap<UInt64>>, MethodKeysFixed<DictHashMap<UInt128>>,
-        MethodKeysFixed<DictHashMap<UInt256>>, MethodKeysFixed<DictHashMap<UInt136>>>;
+        MethodKeysFixed<DictHashMap<UInt64>>, MethodKeysFixed<DictHashMap<UInt72>>,
+        MethodKeysFixed<DictHashMap<UInt128>>, MethodKeysFixed<DictHashMap<UInt136>>,
+        MethodKeysFixed<DictHashMap<UInt256>>>;
 
 struct DictionaryHashMapMethod
         : public DataVariants<DictHashMapVariants, vectorized::MethodSingleNullableColumn,
@@ -83,6 +84,9 @@ struct DictionaryHashMapMethod
         case HashKeyType::fixed64:
             method_variant.emplace<MethodKeysFixed<DictHashMap<UInt64>>>(get_key_sizes(data_types));
             break;
+        case HashKeyType::fixed72:
+            method_variant.emplace<MethodKeysFixed<DictHashMap<UInt72>>>(get_key_sizes(data_types));
+            break;
         case HashKeyType::fixed128:
             method_variant.emplace<MethodKeysFixed<DictHashMap<UInt128>>>(
                     get_key_sizes(data_types));


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to