This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch tpc_preview2 in repository https://gitbox.apache.org/repos/asf/doris.git
commit c83e49a4977b68b357e68d8cad2ebd6375d76cb3 Author: BiteTheDDDDt <[email protected]> AuthorDate: Mon Nov 24 20:59:28 2025 +0800 remove nullable when _serialize_null_into_key is false and add int72 --- be/src/pipeline/common/agg_utils.h | 9 +++++++-- be/src/pipeline/common/distinct_agg_utils.h | 9 +++++++-- be/src/pipeline/common/join_utils.h | 7 ++++++- be/src/pipeline/common/partition_sort_utils.h | 7 ++++++- be/src/pipeline/common/set_utils.h | 5 +++++ be/src/pipeline/exec/hashjoin_build_sink.cpp | 6 ++++-- .../exec/join/process_hash_table_probe_impl.h | 1 + be/src/vec/common/hash_table/hash.h | 21 ++++++++++++++------- be/src/vec/common/hash_table/hash_key_type.h | 3 +++ be/src/vec/common/hash_table/hash_map_context.h | 1 - be/src/vec/common/hash_table/join_hash_table.h | 6 +++++- be/src/vec/common/uint128.h | 10 ++++++++++ be/src/vec/functions/complex_dict_hash_map.h | 8 ++++++-- 13 files changed, 74 insertions(+), 19 deletions(-) diff --git a/be/src/pipeline/common/agg_utils.h b/be/src/pipeline/common/agg_utils.h index 146649f96b1..f0cf0a17f2a 100644 --- a/be/src/pipeline/common/agg_utils.h +++ b/be/src/pipeline/common/agg_utils.h @@ -81,9 +81,10 @@ using AggregatedMethodVariants = std::variant< vectorized::MethodSingleNullableColumn< vectorized::MethodStringNoCache<AggregatedDataWithNullableShortStringKey>>, vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>, + vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>, vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>, - vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>, - vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>>; + vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>, + vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>>; struct AggregatedDataVariants : public DataVariants<AggregatedMethodVariants, vectorized::MethodSingleNullableColumn, @@ -137,6 +138,10 @@ struct AggregatedDataVariants method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>>( get_key_sizes(data_types)); break; + case HashKeyType::fixed72: + method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>>( + get_key_sizes(data_types)); + break; case HashKeyType::fixed128: method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>>( get_key_sizes(data_types)); diff --git a/be/src/pipeline/common/distinct_agg_utils.h b/be/src/pipeline/common/distinct_agg_utils.h index 592132eba6b..3c95a2793fc 100644 --- a/be/src/pipeline/common/distinct_agg_utils.h +++ b/be/src/pipeline/common/distinct_agg_utils.h @@ -105,9 +105,10 @@ using DistinctMethodVariants = std::variant< vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache< vectorized::DataWithNullKey<DistinctDataWithShortStringKey>>>, vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>, + vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>, vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>, - vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>, - vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>>; + vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>, + vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>>; struct DistinctDataVariants : public DataVariants<DistinctMethodVariants, vectorized::MethodSingleNullableColumn, @@ -156,6 +157,10 @@ struct DistinctDataVariants method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>>( get_key_sizes(data_types)); break; + case HashKeyType::fixed72: + method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>>( + get_key_sizes(data_types)); + break; case HashKeyType::fixed128: method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>>( get_key_sizes(data_types)); diff --git a/be/src/pipeline/common/join_utils.h b/be/src/pipeline/common/join_utils.h index c10b748f82f..08708f037ba 100644 --- a/be/src/pipeline/common/join_utils.h +++ b/be/src/pipeline/common/join_utils.h @@ -66,7 +66,8 @@ using HashTableVariants = std::variant< DirectPrimaryTypeHashTableContext<vectorized::UInt32>, DirectPrimaryTypeHashTableContext<vectorized::UInt64>, DirectPrimaryTypeHashTableContext<vectorized::UInt128>, - FixedKeyHashTableContext<vectorized::UInt64>, FixedKeyHashTableContext<vectorized::UInt128>, + FixedKeyHashTableContext<vectorized::UInt64>, FixedKeyHashTableContext<vectorized::UInt72>, + FixedKeyHashTableContext<vectorized::UInt128>, FixedKeyHashTableContext<vectorized::UInt136>, FixedKeyHashTableContext<vectorized::UInt256>, MethodOneString>; @@ -103,6 +104,10 @@ struct JoinDataVariants { method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt64>>( get_key_sizes(data_types)); break; + case HashKeyType::fixed72: + method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt72>>( + get_key_sizes(data_types)); + break; case HashKeyType::fixed128: method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt128>>( get_key_sizes(data_types)); diff --git a/be/src/pipeline/common/partition_sort_utils.h b/be/src/pipeline/common/partition_sort_utils.h index 381dd3ec42b..ccd1b6a144d 100644 --- a/be/src/pipeline/common/partition_sort_utils.h +++ b/be/src/pipeline/common/partition_sort_utils.h @@ -140,9 +140,10 @@ using PartitionedMethodVariants = std::variant< PartitionDataSingleNullable<vectorized::UInt128>, PartitionDataSingleNullable<vectorized::UInt256>, vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>, + vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>, vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>, - vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>, vectorized::MethodKeysFixed<PartitionData<vectorized::UInt136>>, + vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>, vectorized::MethodStringNoCache<PartitionDataWithShortStringKey>, vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache< vectorized::DataWithNullKey<PartitionDataWithShortStringKey>>>>; @@ -199,6 +200,10 @@ struct PartitionedHashMapVariants method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>>( get_key_sizes(data_types)); break; + case HashKeyType::fixed72: + method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>>( + get_key_sizes(data_types)); + break; case HashKeyType::fixed128: method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>>( get_key_sizes(data_types)); diff --git a/be/src/pipeline/common/set_utils.h b/be/src/pipeline/common/set_utils.h index d9f70b1e457..665a7710fa8 100644 --- a/be/src/pipeline/common/set_utils.h +++ b/be/src/pipeline/common/set_utils.h @@ -67,6 +67,7 @@ using SetHashTableVariants = SetPrimaryTypeHashTableContext<vectorized::UInt128>, SetPrimaryTypeHashTableContext<vectorized::UInt256>, SetFixedKeyHashTableContext<vectorized::UInt64>, + SetFixedKeyHashTableContext<vectorized::UInt72>, SetFixedKeyHashTableContext<vectorized::UInt128>, SetFixedKeyHashTableContext<vectorized::UInt256>, SetFixedKeyHashTableContext<vectorized::UInt136>>; @@ -105,6 +106,10 @@ struct SetDataVariants method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt64>>( get_key_sizes(data_types)); break; + case HashKeyType::fixed72: + method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt72>>( + get_key_sizes(data_types)); + break; case HashKeyType::fixed128: method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt128>>( get_key_sizes(data_types)); diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index cbc22f7168d..778409087ee 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -450,9 +450,11 @@ Status HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state, /// For 'null safe equal' join, /// the build key column maybe be converted to nullable from non-nullable. if (p._serialize_null_into_key[i]) { - data_type = vectorized::make_nullable(data_type); + data_types.emplace_back(vectorized::make_nullable(data_type)); + } else { + // in this case, we use nullmap to represent null value + data_types.emplace_back(vectorized::remove_nullable(data_type)); } - data_types.emplace_back(std::move(data_type)); } if (_build_expr_ctxs.size() == 1) { p._should_keep_hash_key_column = true; diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 1f1edec4335..6753052f61c 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -803,6 +803,7 @@ struct ExtractType<T(U)> { INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext<vectorized::UInt128>)); \ INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext<vectorized::UInt256>)); \ INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt64>)); \ + INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt72>)); \ INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt128>)); \ INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt136>)); \ INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt256>)); \ diff --git a/be/src/vec/common/hash_table/hash.h b/be/src/vec/common/hash_table/hash.h index 4b0e20a01a0..6817d7e091d 100644 --- a/be/src/vec/common/hash_table/hash.h +++ b/be/src/vec/common/hash_table/hash.h @@ -189,20 +189,27 @@ struct HashCRC32<wide::Int256> { } }; +#include "common/compile_check_avoid_begin.h" + +template <> +struct HashCRC32<doris::vectorized::UInt72> { + size_t operator()(const doris::vectorized::UInt72& x) const { + doris::vectorized::UInt64 crc = -1ULL; + crc = _mm_crc32_u8(crc, x.a); + crc = _mm_crc32_u64(crc, x.b); + return crc; + } +}; + template <> struct HashCRC32<doris::vectorized::UInt136> { size_t operator()(const doris::vectorized::UInt136& x) const { -#if defined(__SSE4_2__) || defined(__aarch64__) doris::vectorized::UInt64 crc = -1ULL; -#include "common/compile_check_avoid_begin.h" - //_mm_crc32_u8 does not provide a u64 interface, so there is an unavoidable conversion from u64 to u32 here. crc = _mm_crc32_u8(crc, x.a); -#include "common/compile_check_avoid_end.h" crc = _mm_crc32_u64(crc, x.b); crc = _mm_crc32_u64(crc, x.c); return crc; -#else - return Hash128to64({Hash128to64({x.a, x.b}), x.c}); -#endif } }; + +#include "common/compile_check_avoid_end.h" diff --git a/be/src/vec/common/hash_table/hash_key_type.h b/be/src/vec/common/hash_table/hash_key_type.h index 52d264371cb..7a04137324e 100644 --- a/be/src/vec/common/hash_table/hash_key_type.h +++ b/be/src/vec/common/hash_table/hash_key_type.h @@ -37,6 +37,7 @@ enum class HashKeyType { int256_key, string_key, fixed64, + fixed72, fixed128, fixed136, fixed256 @@ -59,6 +60,8 @@ inline HashKeyType get_hash_key_type_with_fixed(size_t size) { using namespace vectorized; if (size <= sizeof(UInt64)) { return HashKeyType::fixed64; + } else if (size <= sizeof(UInt72)) { + return HashKeyType::fixed72; } else if (size <= sizeof(UInt128)) { return HashKeyType::fixed128; } else if (size <= sizeof(UInt136)) { diff --git a/be/src/vec/common/hash_table/hash_map_context.h b/be/src/vec/common/hash_table/hash_map_context.h index 0140a30653f..704e2a44833 100644 --- a/be/src/vec/common/hash_table/hash_map_context.h +++ b/be/src/vec/common/hash_table/hash_map_context.h @@ -251,7 +251,6 @@ struct MethodSerialized : public MethodBase<TData> { for (size_t i = 0; i < num_rows; ++i) { input_keys[i].data = reinterpret_cast<char*>(serialized_key_buffer + i * max_one_row_byte_size); - input_keys[i].size = 0; } for (const auto& column : key_columns) { diff --git a/be/src/vec/common/hash_table/join_hash_table.h b/be/src/vec/common/hash_table/join_hash_table.h index 9426829e056..0c3d3951431 100644 --- a/be/src/vec/common/hash_table/join_hash_table.h +++ b/be/src/vec/common/hash_table/join_hash_table.h @@ -367,7 +367,7 @@ private: const auto batch_size = max_batch_size; auto do_the_probe = [&]() { - while (build_idx && matched_cnt < batch_size) { + while (build_idx) { if (_eq(keys[probe_idx], build_keys[build_idx])) { probe_idxs[matched_cnt] = probe_idx; build_idxs[matched_cnt] = build_idx; @@ -378,6 +378,10 @@ private: visited[build_idx] = 1; } } + if (matched_cnt > batch_size) { + build_idx = next[build_idx]; + break; + } } build_idx = next[build_idx]; } diff --git a/be/src/vec/common/uint128.h b/be/src/vec/common/uint128.h index 961a4958955..a8eca75fb87 100644 --- a/be/src/vec/common/uint128.h +++ b/be/src/vec/common/uint128.h @@ -61,6 +61,16 @@ struct UInt128TrivialHash { using UInt256 = wide::UInt256; +#pragma pack(1) +struct UInt72 { + UInt8 a; + UInt64 b; + + bool operator==(const UInt72& rhs) const { return a == rhs.a && b == rhs.b; } +}; +#pragma pack() + + #pragma pack(1) struct UInt136 { UInt8 a; diff --git a/be/src/vec/functions/complex_dict_hash_map.h b/be/src/vec/functions/complex_dict_hash_map.h index de06ce3568e..d815cbb0904 100644 --- a/be/src/vec/functions/complex_dict_hash_map.h +++ b/be/src/vec/functions/complex_dict_hash_map.h @@ -47,8 +47,9 @@ using DictHashMapVariants = std::variant< MethodOneNumber<UInt128, DictHashMap<UInt128>>, MethodOneNumber<UInt256, DictHashMap<UInt256>>, - MethodKeysFixed<DictHashMap<UInt64>>, MethodKeysFixed<DictHashMap<UInt128>>, - MethodKeysFixed<DictHashMap<UInt256>>, MethodKeysFixed<DictHashMap<UInt136>>>; + MethodKeysFixed<DictHashMap<UInt64>>, MethodKeysFixed<DictHashMap<UInt72>>, + MethodKeysFixed<DictHashMap<UInt128>>, MethodKeysFixed<DictHashMap<UInt136>>, + MethodKeysFixed<DictHashMap<UInt256>>>; struct DictionaryHashMapMethod : public DataVariants<DictHashMapVariants, vectorized::MethodSingleNullableColumn, @@ -83,6 +84,9 @@ struct DictionaryHashMapMethod case HashKeyType::fixed64: method_variant.emplace<MethodKeysFixed<DictHashMap<UInt64>>>(get_key_sizes(data_types)); break; + case HashKeyType::fixed72: + method_variant.emplace<MethodKeysFixed<DictHashMap<UInt72>>>(get_key_sizes(data_types)); + break; case HashKeyType::fixed128: method_variant.emplace<MethodKeysFixed<DictHashMap<UInt128>>>( get_key_sizes(data_types)); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
