This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 9457dc6b48b9cd780921df0f896664531d19c255
Author: amory <[email protected]>
AuthorDate: Tue Jul 11 14:40:40 2023 +0800

    [Improve](hash-fun)improve nested hash with range #21699
    
    Issue Number: close #xxx
    
    1. When calculating an array hash, the element count does not need to be fed into the hash:
    hash = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size),
                                                       sizeof(elem_size), hash);
    However, we must still distinguish [[], [1]] from [[1], []]: when an array
    nests another array and the nested array is empty, we feed its (zero) size
    into the hash so that the two values hash differently.
    2. Hash a range of rows into one hash value to avoid a virtual function call
    on every loop iteration, which doubles the performance. I measured this in a unit test:
    
    column: array[int64]
    50 rows, and each array has 10w (i.e. 100,000) elements
---
 be/src/vec/columns/column.h                   |  8 ++-
 be/src/vec/columns/column_array.cpp           | 77 +++++++++++++++++++-------
 be/src/vec/columns/column_array.h             |  6 +-
 be/src/vec/columns/column_const.h             |  8 ++-
 be/src/vec/columns/column_decimal.cpp         | 50 ++++++++++++-----
 be/src/vec/columns/column_decimal.h           | 14 ++++-
 be/src/vec/columns/column_map.cpp             | 80 ++++++++++++++++++++-------
 be/src/vec/columns/column_map.h               |  6 +-
 be/src/vec/columns/column_nullable.cpp        | 34 ++++++++----
 be/src/vec/columns/column_nullable.h          |  6 +-
 be/src/vec/columns/column_string.h            | 42 +++++++++++---
 be/src/vec/columns/column_struct.cpp          | 10 ++--
 be/src/vec/columns/column_struct.h            |  6 +-
 be/src/vec/columns/column_vector.h            | 42 +++++++++++---
 be/test/vec/columns/column_hash_func_test.cpp | 79 ++++++++++++++++++++++++++
 15 files changed, 371 insertions(+), 97 deletions(-)

diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index b291ba2443..0ff14a1653 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -386,7 +386,9 @@ public:
         LOG(FATAL) << get_name() << " update_hashes_with_value xxhash not 
supported";
     }
 
-    virtual void update_xxHash_with_value(size_t n, uint64_t& hash) const {
+    // use range for one hash value to avoid virtual function call in loop
+    virtual void update_xxHash_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                          const uint8_t* __restrict null_data) 
const {
         LOG(FATAL) << get_name() << " update_hash_with_value xxhash not 
supported";
     }
 
@@ -398,7 +400,9 @@ public:
         LOG(FATAL) << get_name() << "update_crcs_with_value not supported";
     }
 
-    virtual void update_crc_with_value(size_t n, uint64_t& hash) const {
+    // use range for one hash value to avoid virtual function call in loop
+    virtual void update_crc_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                       const uint8_t* __restrict null_data) 
const {
         LOG(FATAL) << get_name() << " update_crc_with_value not supported";
     }
 
diff --git a/be/src/vec/columns/column_array.cpp 
b/be/src/vec/columns/column_array.cpp
index c5e35e5bb9..b575f7bf15 100644
--- a/be/src/vec/columns/column_array.cpp
+++ b/be/src/vec/columns/column_array.cpp
@@ -277,25 +277,64 @@ void 
ColumnArray::update_hashes_with_value(std::vector<SipHash>& hashes,
 }
 
 // for every array row calculate xxHash
-void ColumnArray::update_xxHash_with_value(size_t n, uint64_t& hash) const {
-    size_t elem_size = size_at(n);
-    size_t offset = offset_at(n);
-    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&elem_size), sizeof(elem_size),
-                                      hash);
-    for (auto i = 0; i < elem_size; ++i) {
-        get_data().update_xxHash_with_value(offset + i, hash);
+void ColumnArray::update_xxHash_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                           const uint8_t* __restrict 
null_data) const {
+    auto& offsets_column = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+                if (elem_size == 0) {
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&elem_size),
+                                                      sizeof(elem_size), hash);
+                } else {
+                    get_data().update_crc_with_value(offsets_column[i - 1], 
offsets_column[i], hash,
+                                                     nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+            if (elem_size == 0) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&elem_size),
+                                                  sizeof(elem_size), hash);
+            } else {
+                get_data().update_crc_with_value(offsets_column[i - 1], 
offsets_column[i], hash,
+                                                 nullptr);
+            }
+        }
     }
 }
 
 // for every array row calculate crcHash
-void ColumnArray::update_crc_with_value(size_t n, uint64_t& crc) const {
-    size_t elem_size = size_at(n);
-    size_t offset = offset_at(n);
-
-    crc = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size), 
sizeof(elem_size),
-                                  crc);
-    for (auto i = 0; i < elem_size; ++i) {
-        get_data().update_crc_with_value(offset + i, crc);
+void ColumnArray::update_crc_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                        const uint8_t* __restrict null_data) 
const {
+    auto& offsets_column = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+                if (elem_size == 0) {
+                    hash = HashUtil::zlib_crc_hash(reinterpret_cast<const 
char*>(&elem_size),
+                                                   sizeof(elem_size), hash);
+                } else {
+                    get_data().update_crc_with_value(offsets_column[i - 1], 
offsets_column[i], hash,
+                                                     nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+            if (elem_size == 0) {
+                hash = HashUtil::zlib_crc_hash(reinterpret_cast<const 
char*>(&elem_size),
+                                               sizeof(elem_size), hash);
+            } else {
+                get_data().update_crc_with_value(offsets_column[i - 1], 
offsets_column[i], hash,
+                                                 nullptr);
+            }
+        }
     }
 }
 
@@ -305,12 +344,12 @@ void ColumnArray::update_hashes_with_value(uint64_t* 
__restrict hashes,
     if (null_data) {
         for (size_t i = 0; i < s; ++i) {
             if (null_data[i] == 0) {
-                update_xxHash_with_value(i, hashes[i]);
+                update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
             }
         }
     } else {
         for (size_t i = 0; i < s; ++i) {
-            update_xxHash_with_value(i, hashes[i]);
+            update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
         }
     }
 }
@@ -324,12 +363,12 @@ void 
ColumnArray::update_crcs_with_value(std::vector<uint64_t>& hash, PrimitiveT
         for (size_t i = 0; i < s; ++i) {
             // every row
             if (null_data[i] == 0) {
-                update_crc_with_value(i, hash[i]);
+                update_crc_with_value(i, i + 1, hash[i], nullptr);
             }
         }
     } else {
         for (size_t i = 0; i < s; ++i) {
-            update_crc_with_value(i, hash[i]);
+            update_crc_with_value(i, i + 1, hash[i], nullptr);
         }
     }
 }
diff --git a/be/src/vec/columns/column_array.h 
b/be/src/vec/columns/column_array.h
index 2e1c96a2c5..4fe1827e17 100644
--- a/be/src/vec/columns/column_array.h
+++ b/be/src/vec/columns/column_array.h
@@ -139,8 +139,10 @@ public:
     StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& 
begin) const override;
     const char* deserialize_and_insert_from_arena(const char* pos) override;
     void update_hash_with_value(size_t n, SipHash& hash) const override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override;
 
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const 
override;
diff --git a/be/src/vec/columns/column_const.h 
b/be/src/vec/columns/column_const.h
index feeb0608a2..7554e773b9 100644
--- a/be/src/vec/columns/column_const.h
+++ b/be/src/vec/columns/column_const.h
@@ -152,7 +152,8 @@ public:
         data->serialize_vec(keys, num_rows, max_row_byte_size);
     }
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override {
         auto real_data = data->get_data_at(0);
         if (real_data.data == nullptr) {
             hash = HashUtil::xxHash64NullWithSeed(hash);
@@ -161,8 +162,9 @@ public:
         }
     }
 
-    void update_crc_with_value(size_t n, uint64_t& crc) const override {
-        get_data_column_ptr()->update_crc_with_value(n, crc);
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override {
+        get_data_column_ptr()->update_crc_with_value(start, end, hash, 
nullptr);
     }
 
     void serialize_vec_with_null_map(std::vector<StringRef>& keys, size_t 
num_rows,
diff --git a/be/src/vec/columns/column_decimal.cpp 
b/be/src/vec/columns/column_decimal.cpp
index e0b8fef056..a73be249eb 100644
--- a/be/src/vec/columns/column_decimal.cpp
+++ b/be/src/vec/columns/column_decimal.cpp
@@ -138,16 +138,27 @@ void 
ColumnDecimal<T>::update_hashes_with_value(std::vector<SipHash>& hashes,
 }
 
 template <typename T>
-void ColumnDecimal<T>::update_crc_with_value(size_t n, uint64_t& crc) const {
-    if constexpr (!IsDecimalV2<T>) {
-        crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+void ColumnDecimal<T>::update_crc_with_value(size_t start, size_t end, 
uint64_t& hash,
+                                             const uint8_t* __restrict 
null_data) const {
+    if (null_data == nullptr) {
+        for (size_t i = start; i < end; i++) {
+            if constexpr (!IsDecimalV2<T>) {
+                hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash);
+            } else {
+                decimalv2_do_crc(i, hash);
+            }
+        }
     } else {
-        const DecimalV2Value& dec_val = (const DecimalV2Value&)data[n];
-        int64_t int_val = dec_val.int_value();
-        int32_t frac_val = dec_val.frac_value();
-        crc = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), crc);
-        crc = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), crc);
-    };
+        for (size_t i = start; i < end; i++) {
+            if (null_data[i] == 0) {
+                if constexpr (!IsDecimalV2<T>) {
+                    hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash);
+                } else {
+                    decimalv2_do_crc(i, hash);
+                }
+            }
+        }
+    }
 }
 
 template <typename T>
@@ -161,19 +172,32 @@ void 
ColumnDecimal<T>::update_crcs_with_value(std::vector<uint64_t>& hashes, Pri
     } else {
         if (null_data == nullptr) {
             for (size_t i = 0; i < s; i++) {
-                update_crc_with_value(i, hashes[i]);
+                decimalv2_do_crc(i, hashes[i]);
             }
         } else {
             for (size_t i = 0; i < s; i++) {
-                if (null_data[i] == 0) update_crc_with_value(i, hashes[i]);
+                if (null_data[i] == 0) decimalv2_do_crc(i, hashes[i]);
             }
         }
     }
 }
 
 template <typename T>
-void ColumnDecimal<T>::update_xxHash_with_value(size_t n, uint64_t& hash) 
const {
-    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[n]), 
sizeof(T), hash);
+void ColumnDecimal<T>::update_xxHash_with_value(size_t start, size_t end, 
uint64_t& hash,
+                                                const uint8_t* __restrict 
null_data) const {
+    if (null_data) {
+        for (size_t i = start; i < end; i++) {
+            if (null_data[i] == 0) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[i]),
+                                                  sizeof(T), hash);
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; i++) {
+            hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[i]), sizeof(T),
+                                              hash);
+        }
+    }
 }
 
 template <typename T>
diff --git a/be/src/vec/columns/column_decimal.h 
b/be/src/vec/columns/column_decimal.h
index 973f0bea68..efc260ba90 100644
--- a/be/src/vec/columns/column_decimal.h
+++ b/be/src/vec/columns/column_decimal.h
@@ -184,8 +184,10 @@ public:
     void update_crcs_with_value(std::vector<uint64_t>& hashes, PrimitiveType 
type,
                                 const uint8_t* __restrict null_data) const 
override;
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override;
 
     int compare_at(size_t n, size_t m, const IColumn& rhs_, int 
nan_direction_hint) const override;
     void get_permutation(bool reverse, size_t limit, int nan_direction_hint,
@@ -302,6 +304,14 @@ protected:
             std::partial_sort(res.begin(), sort_end, res.end(),
                               [this](size_t a, size_t b) { return data[a] < 
data[b]; });
     }
+
+    void ALWAYS_INLINE decimalv2_do_crc(size_t i, uint64_t& hash) const {
+        const DecimalV2Value& dec_val = (const DecimalV2Value&)data[i];
+        int64_t int_val = dec_val.int_value();
+        int32_t frac_val = dec_val.frac_value();
+        hash = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), hash);
+        hash = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), hash);
+    };
 };
 
 template <typename>
diff --git a/be/src/vec/columns/column_map.cpp 
b/be/src/vec/columns/column_map.cpp
index 1924e2ba46..ac7c5da1a9 100644
--- a/be/src/vec/columns/column_map.cpp
+++ b/be/src/vec/columns/column_map.cpp
@@ -253,26 +253,64 @@ void 
ColumnMap::update_hashes_with_value(std::vector<SipHash>& hashes,
     SIP_HASHES_FUNCTION_COLUMN_IMPL();
 }
 
-void ColumnMap::update_xxHash_with_value(size_t n, uint64_t& hash) const {
-    size_t kv_size = size_at(n);
-    size_t offset = offset_at(n);
-
-    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&kv_size), 
sizeof(kv_size),
-                                      hash);
-    for (auto i = 0; i < kv_size; ++i) {
-        get_keys().update_xxHash_with_value(offset + i, hash);
-        get_values().update_xxHash_with_value(offset + i, hash);
+void ColumnMap::update_xxHash_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                         const uint8_t* __restrict null_data) 
const {
+    auto& offsets = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t kv_size = offsets[i] - offsets[i - 1];
+                if (kv_size == 0) {
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&kv_size),
+                                                      sizeof(kv_size), hash);
+                } else {
+                    get_keys().update_xxHash_with_value(offsets[i - 1], 
offsets[i], hash, nullptr);
+                    get_values().update_xxHash_with_value(offsets[i - 1], 
offsets[i], hash,
+                                                          nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t kv_size = offsets[i] - offsets[i - 1];
+            if (kv_size == 0) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&kv_size),
+                                                  sizeof(kv_size), hash);
+            } else {
+                get_keys().update_xxHash_with_value(offsets[i - 1], 
offsets[i], hash, nullptr);
+                get_values().update_xxHash_with_value(offsets[i - 1], 
offsets[i], hash, nullptr);
+            }
+        }
     }
 }
 
-void ColumnMap::update_crc_with_value(size_t n, uint64_t& crc) const {
-    size_t kv_size = size_at(n);
-    size_t offset = offset_at(n);
-
-    crc = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&kv_size), 
sizeof(kv_size), crc);
-    for (size_t i = 0; i < kv_size; ++i) {
-        get_keys().update_crc_with_value(offset + i, crc);
-        get_values().update_crc_with_value(offset + i, crc);
+void ColumnMap::update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                                      const uint8_t* __restrict null_data) 
const {
+    auto& offsets = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t kv_size = offsets[i] - offsets[i - 1];
+                if (kv_size == 0) {
+                    hash = HashUtil::zlib_crc_hash(reinterpret_cast<const 
char*>(&kv_size),
+                                                   sizeof(kv_size), hash);
+                } else {
+                    get_keys().update_crc_with_value(offsets[i - 1], 
offsets[i], hash, nullptr);
+                    get_values().update_crc_with_value(offsets[i - 1], 
offsets[i], hash, nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t kv_size = offsets[i] - offsets[i - 1];
+            if (kv_size == 0) {
+                hash = HashUtil::zlib_crc_hash(reinterpret_cast<const 
char*>(&kv_size),
+                                               sizeof(kv_size), hash);
+            } else {
+                get_keys().update_crc_with_value(offsets[i - 1], offsets[i], 
hash, nullptr);
+                get_values().update_crc_with_value(offsets[i - 1], offsets[i], 
hash, nullptr);
+            }
+        }
     }
 }
 
@@ -282,12 +320,12 @@ void ColumnMap::update_hashes_with_value(uint64_t* 
hashes, const uint8_t* null_d
         for (size_t i = 0; i < s; ++i) {
             // every row
             if (null_data[i] == 0) {
-                update_xxHash_with_value(i, hashes[i]);
+                update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
             }
         }
     } else {
         for (size_t i = 0; i < s; ++i) {
-            update_xxHash_with_value(i, hashes[i]);
+            update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
         }
     }
 }
@@ -301,12 +339,12 @@ void 
ColumnMap::update_crcs_with_value(std::vector<uint64_t>& hash, PrimitiveTyp
         for (size_t i = 0; i < s; ++i) {
             // every row
             if (null_data[i] == 0) {
-                update_crc_with_value(i, hash[i]);
+                update_crc_with_value(i, i + 1, hash[i], nullptr);
             }
         }
     } else {
         for (size_t i = 0; i < s; ++i) {
-            update_crc_with_value(i, hash[i]);
+            update_crc_with_value(i, i + 1, hash[i], nullptr);
         }
     }
 }
diff --git a/be/src/vec/columns/column_map.h b/be/src/vec/columns/column_map.h
index 0d7bb2d0a7..91c9eb0177 100644
--- a/be/src/vec/columns/column_map.h
+++ b/be/src/vec/columns/column_map.h
@@ -167,8 +167,10 @@ public:
     size_t allocated_bytes() const override;
     void protect() override;
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override;
 
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const 
override;
diff --git a/be/src/vec/columns/column_nullable.cpp 
b/be/src/vec/columns/column_nullable.cpp
index ce5b68f3fb..538bcd27a6 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -65,21 +65,35 @@ MutableColumnPtr ColumnNullable::get_shrinked_column() {
                                   get_null_map_column_ptr());
 }
 
-void ColumnNullable::update_xxHash_with_value(size_t n, uint64_t& hash) const {
-    auto* __restrict real_null_data = assert_cast<const 
ColumnUInt8&>(*null_map).get_data().data();
-    if (real_null_data[n] != 0) {
-        hash = HashUtil::xxHash64NullWithSeed(hash);
+void ColumnNullable::update_xxHash_with_value(size_t start, size_t end, 
uint64_t& hash,
+                                              const uint8_t* __restrict 
null_data) const {
+    if (!has_null()) {
+        nested_column->update_xxHash_with_value(start, end, hash, nullptr);
     } else {
-        nested_column->update_xxHash_with_value(n, hash);
+        auto* __restrict real_null_data =
+                assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
+        for (int i = start; i < end; ++i) {
+            if (real_null_data[i] != 0) {
+                hash = HashUtil::xxHash64NullWithSeed(hash);
+            }
+        }
+        nested_column->update_xxHash_with_value(start, end, hash, 
real_null_data);
     }
 }
 
-void ColumnNullable::update_crc_with_value(size_t n, uint64_t& crc) const {
-    auto* __restrict real_null_data = assert_cast<const 
ColumnUInt8&>(*null_map).get_data().data();
-    if (real_null_data[n] != 0) {
-        crc = HashUtil::zlib_crc_hash_null(crc);
+void ColumnNullable::update_crc_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                           const uint8_t* __restrict 
null_data) const {
+    if (!has_null()) {
+        nested_column->update_crc_with_value(start, end, hash, nullptr);
     } else {
-        nested_column->update_xxHash_with_value(n, crc);
+        auto* __restrict real_null_data =
+                assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
+        for (int i = start; i < end; ++i) {
+            if (real_null_data[i] != 0) {
+                hash = HashUtil::zlib_crc_hash_null(hash);
+            }
+        }
+        nested_column->update_crc_with_value(start, end, hash, real_null_data);
     }
 }
 
diff --git a/be/src/vec/columns/column_nullable.h 
b/be/src/vec/columns/column_nullable.h
index be9ba72399..11c24be294 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -215,8 +215,10 @@ public:
     ColumnPtr replicate(const Offsets& replicate_offsets) const override;
     void replicate(const uint32_t* counts, size_t target_size, IColumn& 
column, size_t begin = 0,
                    int count_sz = -1) const override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override;
 
     void update_hash_with_value(size_t n, SipHash& hash) const override;
     void update_hashes_with_value(std::vector<SipHash>& hashes,
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index 703826cd24..f79d378cf6 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -398,16 +398,42 @@ public:
     void deserialize_vec_with_null_map(std::vector<StringRef>& keys, const 
size_t num_rows,
                                        const uint8_t* null_map) override;
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
-        size_t string_size = size_at(n);
-        size_t offset = offset_at(n);
-        hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&chars[offset]),
-                                          string_size, hash);
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override {
+        if (null_data) {
+            for (size_t i = start; i < end; ++i) {
+                if (null_data[i] == 0) {
+                    size_t string_size = size_at(i);
+                    size_t offset = offset_at(i);
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&chars[offset]),
+                                                      string_size, hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; ++i) {
+                size_t string_size = size_at(i);
+                size_t offset = offset_at(i);
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&chars[offset]),
+                                                  string_size, hash);
+            }
+        }
     }
 
-    void update_crc_with_value(size_t n, uint64_t& crc) const override {
-        auto data_ref = get_data_at(n);
-        crc = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, crc);
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override {
+        if (null_data) {
+            for (size_t i = start; i < end; ++i) {
+                if (null_data[i] == 0) {
+                    auto data_ref = get_data_at(i);
+                    hash = HashUtil::zlib_crc_hash(data_ref.data, 
data_ref.size, hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; ++i) {
+                auto data_ref = get_data_at(i);
+                hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, 
hash);
+            }
+        }
     }
 
     void update_hash_with_value(size_t n, SipHash& hash) const override {
diff --git a/be/src/vec/columns/column_struct.cpp 
b/be/src/vec/columns/column_struct.cpp
index 58f5a4abaf..0b3bcb24e8 100644
--- a/be/src/vec/columns/column_struct.cpp
+++ b/be/src/vec/columns/column_struct.cpp
@@ -196,15 +196,17 @@ void 
ColumnStruct::update_hashes_with_value(std::vector<SipHash>& hashes,
     SIP_HASHES_FUNCTION_COLUMN_IMPL();
 }
 
-void ColumnStruct::update_xxHash_with_value(size_t n, uint64_t& hash) const {
+void ColumnStruct::update_xxHash_with_value(size_t start, size_t end, 
uint64_t& hash,
+                                            const uint8_t* __restrict 
null_data) const {
     for (const auto& column : columns) {
-        column->update_xxHash_with_value(n, hash);
+        column->update_xxHash_with_value(start, end, hash, nullptr);
     }
 }
 
-void ColumnStruct::update_crc_with_value(size_t n, uint64_t& crc) const {
+void ColumnStruct::update_crc_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                         const uint8_t* __restrict null_data) 
const {
     for (const auto& column : columns) {
-        column->update_crc_with_value(n, crc);
+        column->update_crc_with_value(start, end, hash, nullptr);
     }
 }
 
diff --git a/be/src/vec/columns/column_struct.h 
b/be/src/vec/columns/column_struct.h
index 3771d29e48..9073725e81 100644
--- a/be/src/vec/columns/column_struct.h
+++ b/be/src/vec/columns/column_struct.h
@@ -106,8 +106,10 @@ public:
     const char* deserialize_and_insert_from_arena(const char* pos) override;
 
     void update_hash_with_value(size_t n, SipHash& hash) const override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override;
 
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const 
override;
diff --git a/be/src/vec/columns/column_vector.h 
b/be/src/vec/columns/column_vector.h
index 67f2827c92..48228822f3 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -274,21 +274,49 @@ public:
                                      const uint8_t* null_map,
                                      size_t max_row_byte_size) const override;
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
-        hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[n]), sizeof(T), hash);
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override {
+        if (null_data) {
+            for (size_t i = start; i < end; i++) {
+                if (null_data[i] == 0) {
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[i]),
+                                                      sizeof(T), hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; i++) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[i]),
+                                                  sizeof(T), hash);
+            }
+        }
     }
 
-    void update_crc_with_value(size_t n, uint64_t& crc) const override {
+    void ALWAYS_INLINE update_crc_with_value_without_null(size_t idx, 
uint64_t& hash) const {
         if constexpr (!std::is_same_v<T, Int64>) {
-            crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+            hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash);
         } else {
             if (this->is_date_type() || this->is_datetime_type()) {
                 char buf[64];
-                const VecDateTimeValue& date_val = (const 
VecDateTimeValue&)data[n];
+                const VecDateTimeValue& date_val = (const 
VecDateTimeValue&)data[idx];
                 auto len = date_val.to_buffer(buf);
-                crc = HashUtil::zlib_crc_hash(buf, len, crc);
+                hash = HashUtil::zlib_crc_hash(buf, len, hash);
             } else {
-                crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+                hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash);
+            }
+        }
+    }
+
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override {
+        if (null_data) {
+            for (size_t i = start; i < end; i++) {
+                if (null_data[i] == 0) {
+                    update_crc_with_value_without_null(i, hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; i++) {
+                update_crc_with_value_without_null(i, hash);
             }
         }
     }
diff --git a/be/test/vec/columns/column_hash_func_test.cpp 
b/be/test/vec/columns/column_hash_func_test.cpp
index 0e409b4640..f80edb035f 100644
--- a/be/test/vec/columns/column_hash_func_test.cpp
+++ b/be/test/vec/columns/column_hash_func_test.cpp
@@ -20,6 +20,8 @@
 #include <gtest/gtest-test-part.h>
 
 #include "gtest/gtest_pred_impl.h"
+#include "util/runtime_profile.h"
+#include "vec/columns/column_array.h"
 #include "vec/columns/column_const.h"
 #include "vec/core/field.h"
 #include "vec/data_types/data_type.h"
@@ -86,6 +88,83 @@ TEST(HashFuncTest, ArrayTypeTest) {
     }
 }
 
+TEST(HashFuncTest, ArraySimpleBenchmarkTest) {
+    DataTypes dataTypes = create_scala_data_types();
+
+    DataTypePtr d = std::make_shared<DataTypeInt64>();
+    DataTypePtr array_ptr = std::make_shared<DataTypeArray>(d);
+    MutableColumnPtr array_mutable_col = array_ptr->create_column();
+
+    int r_num = 50;
+    for (int r = 0; r < r_num; ++r) {
+        Array a;
+        for (int i = 0; i < 10000; ++i) {
+            a.push_back(Int64(i));
+        }
+        array_mutable_col->insert(a);
+    }
+    std::vector<uint64_t> crc_hash_vals(r_num);
+    int64_t time_t = 0;
+    {
+        SCOPED_RAW_TIMER(&time_t);
+        EXPECT_NO_FATAL_FAILURE(array_mutable_col->update_crcs_with_value(
+                crc_hash_vals, PrimitiveType::TYPE_ARRAY));
+    }
+    std::cout << time_t << "ns" << std::endl;
+}
+
+TEST(HashFuncTest, ArrayNestedArrayTest) {
+    DataTypes dataTypes = create_scala_data_types();
+
+    DataTypePtr d = std::make_shared<DataTypeInt64>();
+    MutableColumnPtr scala_mutable_col = d->create_column();
+    DataTypePtr nested_array_ptr = std::make_shared<DataTypeArray>(d);
+    DataTypePtr array_ptr = std::make_shared<DataTypeArray>(nested_array_ptr);
+    MutableColumnPtr array_mutable_col = array_ptr->create_column();
+
+    Array a, a1, a2, a3, nested, nested1;
+    nested.push_back(Int64(1));
+    nested1.push_back(Int64(2));
+
+    // a: [[1], [2]]
+    a.push_back(nested);
+    a.push_back(nested1);
+    // a1: [[2], [1]]
+    a1.push_back(nested1);
+    a1.push_back(nested);
+
+    // a2: [[], [1]]
+    a2.push_back(Array());
+    a2.push_back(nested);
+    // a3: [[1], []]
+    a3.push_back(nested);
+    a3.push_back(Array());
+
+    array_mutable_col->insert(a);
+    array_mutable_col->insert(a1);
+    array_mutable_col->insert(a2);
+    array_mutable_col->insert(a3);
+
+    auto nested_col =
+            reinterpret_cast<vectorized::ColumnArray*>(array_mutable_col.get())->get_data_ptr();
+    EXPECT_EQ(nested_col->size(), 8);
+
+    std::vector<uint64_t> xx_hash_vals(4);
+    std::vector<uint64_t> crc_hash_vals(4);
+    auto* __restrict xx_hashes = xx_hash_vals.data();
+    auto* __restrict crc_hashes = crc_hash_vals.data();
+
+    // xxHash
+    EXPECT_NO_FATAL_FAILURE(array_mutable_col->update_hashes_with_value(xx_hashes));
+    EXPECT_TRUE(xx_hashes[0] != xx_hashes[1]);
+    EXPECT_TRUE(xx_hashes[2] != xx_hashes[3]);
+    // crcHash
+    EXPECT_NO_FATAL_FAILURE(
+            array_mutable_col->update_crcs_with_value(crc_hash_vals, PrimitiveType::TYPE_ARRAY));
+    EXPECT_TRUE(crc_hashes[0] != crc_hashes[1]);
+    EXPECT_TRUE(crc_hashes[2] != crc_hashes[3]);
+}
+
 TEST(HashFuncTest, ArrayCornerCaseTest) {
     DataTypes dataTypes = create_scala_data_types();
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to