linrrzqqq commented on code in PR #63196:
URL: https://github.com/apache/doris/pull/63196#discussion_r3257951288


##########
be/src/exprs/function/function_hash.cpp:
##########
@@ -133,6 +133,107 @@ using FunctionMurmurHash3_64_V2 =
 using FunctionMurmurHash3U64V2 =
         FunctionVariadicArgumentsBase<DataTypeInt128, 
MurmurHash3Impl<TYPE_LARGEINT, true>>;
 
+struct MurmurHash3128Impl {
+    static constexpr auto name = "murmur_hash3_128";
+
+    static Status empty_apply(IColumn& icolumn, size_t input_rows_count) {
+        ColumnVector<TYPE_LARGEINT>& vec_to = 
assert_cast<ColumnVector<TYPE_LARGEINT>&>(icolumn);
+        vec_to.get_data().assign(input_rows_count, pack_hash(emtpy_value, 
emtpy_value));
+        return Status::OK();
+    }
+
+    static Status first_apply(const IDataType* type, const IColumn* column, 
size_t input_rows_count,
+                              IColumn& icolumn) {
+        return execute<true>(type, column, input_rows_count, icolumn);
+    }
+
+    static Status combine_apply(const IDataType* type, const IColumn* column,
+                                size_t input_rows_count, IColumn& icolumn) {
+        return execute<false>(type, column, input_rows_count, icolumn);
+    }
+
+    template <bool first>
+    static Status execute(const IDataType* type, const IColumn* column, size_t 
input_rows_count,
+                          IColumn& col_to) {
+        auto& to_column = assert_cast<ColumnVector<TYPE_LARGEINT>&>(col_to);
+        if constexpr (first) {
+            // The first argument initializes one 128-bit hash state per row. 
Later arguments reuse
+            // the same result column and update the saved state in place.
+            to_column.insert_many_defaults(input_rows_count);
+        }
+        auto& col_to_data = to_column.get_data();
+        if (const auto* col_from = check_and_get_column<ColumnString>(column)) 
{
+            const typename ColumnString::Chars& data = col_from->get_chars();
+            const typename ColumnString::Offsets& offsets = 
col_from->get_offsets();
+            size_t size = offsets.size();
+            ColumnString::Offset current_offset = 0;
+            for (size_t i = 0; i < size; ++i) {
+                if constexpr (first) {
+                    init_hash(col_to_data[i], reinterpret_cast<const 
char*>(&data[current_offset]),
+                              offsets[i] - current_offset);
+                } else {
+                    update_hash(col_to_data[i],
+                                reinterpret_cast<const 
char*>(&data[current_offset]),
+                                offsets[i] - current_offset);
+                }
+                current_offset = offsets[i];
+            }
+        } else if (const ColumnConst* col_from_const =
+                           
check_and_get_column_const_string_or_fixedstring(column)) {
+            auto value = col_from_const->get_value<TYPE_STRING>();
+            for (size_t i = 0; i < input_rows_count; ++i) {
+                if constexpr (first) {
+                    init_hash(col_to_data[i], value.data(), value.size());
+                } else {
+                    update_hash(col_to_data[i], value.data(), value.size());
+                }
+            }
+        } else {
+            DCHECK(false);
+            return Status::NotSupported("Illegal column {} of argument of 
function {}",
+                                        column->get_name(), name);
+        }
+        return Status::OK();
+    }
+
+private:
+    static __int128_t pack_hash(uint64_t h1, uint64_t h2) {
+        static_assert(sizeof(__int128_t) == sizeof(uint64_t) * 2);
+        // Store the two MurmurHash3 x64 128-bit lanes in a single LARGEINT 
value. Keep h1 in the
+        // low 64 bits and h2 in the high 64 bits to match 
murmur_hash3_x64_128's out[0]/out[1].
+        const auto value =
+                (static_cast<unsigned __int128>(h2) << 64) | 
static_cast<unsigned __int128>(h1);
+        return static_cast<__int128_t>(value);

Review Comment:
   [python ](https://mmh3.readthedocs.io/en/stable/api.html#mmh3.hash128) 和 
[go](https://github.com/twmb/murmur3/blob/v1.1.8/murmur128.go)的默认实现都是无符号的,这里可能得跟
 mmh3_64一样,加一个`murmur_hash3_u128`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to