This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-15.0.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit faeab94e486f9c3cfacec2ab27ea3b6c8c0c9e27
Author: Rossi Sun <[email protected]>
AuthorDate: Fri Jan 26 22:43:08 2024 +0800

    GH-39778: [C++] Fix tail-byte access cross buffer boundary in key hash avx2 
(#39800)
    
    
    
    ### Rationale for this change
    
    Issue #39778 seems to be caused by a careless (but hard to spot) bug in key hash 
avx2.
    
    ### What changes are included in this PR?
    
    Fix the careless bug.
    
    ### Are these changes tested?
    
    UT included.
    
    ### Are there any user-facing changes?
    
    No.
    
    * Closes: #39778
    
    Authored-by: Ruoxi Sun <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/compute/key_hash.cc      | 142 +++++++++++++++++----------------
 cpp/src/arrow/compute/key_hash.h       |  22 ++---
 cpp/src/arrow/compute/key_hash_avx2.cc |   2 +-
 cpp/src/arrow/compute/key_hash_test.cc |  59 ++++++++++++++
 4 files changed, 145 insertions(+), 80 deletions(-)

diff --git a/cpp/src/arrow/compute/key_hash.cc 
b/cpp/src/arrow/compute/key_hash.cc
index f5867b405e..1902b9ce9a 100644
--- a/cpp/src/arrow/compute/key_hash.cc
+++ b/cpp/src/arrow/compute/key_hash.cc
@@ -105,23 +105,23 @@ inline void Hashing32::StripeMask(int i, uint32_t* mask1, 
uint32_t* mask2,
 }
 
 template <bool T_COMBINE_HASHES>
-void Hashing32::HashFixedLenImp(uint32_t num_rows, uint64_t length, const 
uint8_t* keys,
-                                uint32_t* hashes) {
+void Hashing32::HashFixedLenImp(uint32_t num_rows, uint64_t key_length,
+                                const uint8_t* keys, uint32_t* hashes) {
   // Calculate the number of rows that skip the last 16 bytes
   //
   uint32_t num_rows_safe = num_rows;
-  while (num_rows_safe > 0 && (num_rows - num_rows_safe) * length < 
kStripeSize) {
+  while (num_rows_safe > 0 && (num_rows - num_rows_safe) * key_length < 
kStripeSize) {
     --num_rows_safe;
   }
 
   // Compute masks for the last 16 byte stripe
   //
-  uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize);
+  uint64_t num_stripes = bit_util::CeilDiv(key_length, kStripeSize);
   uint32_t mask1, mask2, mask3, mask4;
-  StripeMask(((length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, &mask3, 
&mask4);
+  StripeMask(((key_length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, 
&mask3, &mask4);
 
   for (uint32_t i = 0; i < num_rows_safe; ++i) {
-    const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+    const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
     uint32_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
     ProcessLastStripe(mask1, mask2, mask3, mask4, key + (num_stripes - 1) * 
kStripeSize,
@@ -138,11 +138,11 @@ void Hashing32::HashFixedLenImp(uint32_t num_rows, 
uint64_t length, const uint8_
 
   uint32_t last_stripe_copy[4];
   for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
-    const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+    const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
     uint32_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
     memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
-           length - (num_stripes - 1) * kStripeSize);
+           key_length - (num_stripes - 1) * kStripeSize);
     ProcessLastStripe(mask1, mask2, mask3, mask4,
                       reinterpret_cast<const uint8_t*>(last_stripe_copy), 
&acc1, &acc2,
                       &acc3, &acc4);
@@ -168,15 +168,16 @@ void Hashing32::HashVarLenImp(uint32_t num_rows, const T* 
offsets,
   }
 
   for (uint32_t i = 0; i < num_rows_safe; ++i) {
-    uint64_t length = offsets[i + 1] - offsets[i];
+    uint64_t key_length = offsets[i + 1] - offsets[i];
 
     // Compute masks for the last 16 byte stripe.
     // For an empty string set number of stripes to 1 but mask to all zeroes.
     //
-    int is_non_empty = length == 0 ? 0 : 1;
-    uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 - 
is_non_empty);
+    int is_non_empty = key_length == 0 ? 0 : 1;
+    uint64_t num_stripes =
+        bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
     uint32_t mask1, mask2, mask3, mask4;
-    StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, 
&mask1,
+    StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) + 
is_non_empty, &mask1,
                &mask2, &mask3, &mask4);
 
     const uint8_t* key = concatenated_keys + offsets[i];
@@ -198,23 +199,24 @@ void Hashing32::HashVarLenImp(uint32_t num_rows, const T* 
offsets,
 
   uint32_t last_stripe_copy[4];
   for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
-    uint64_t length = offsets[i + 1] - offsets[i];
+    uint64_t key_length = offsets[i + 1] - offsets[i];
 
     // Compute masks for the last 16 byte stripe.
     // For an empty string set number of stripes to 1 but mask to all zeroes.
     //
-    int is_non_empty = length == 0 ? 0 : 1;
-    uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 - 
is_non_empty);
+    int is_non_empty = key_length == 0 ? 0 : 1;
+    uint64_t num_stripes =
+        bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
     uint32_t mask1, mask2, mask3, mask4;
-    StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, 
&mask1,
+    StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) + 
is_non_empty, &mask1,
                &mask2, &mask3, &mask4);
 
     const uint8_t* key = concatenated_keys + offsets[i];
     uint32_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
-    if (length > 0) {
+    if (key_length > 0) {
       memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
-             length - (num_stripes - 1) * kStripeSize);
+             key_length - (num_stripes - 1) * kStripeSize);
     }
     if (num_stripes > 0) {
       ProcessLastStripe(mask1, mask2, mask3, mask4,
@@ -309,9 +311,9 @@ void Hashing32::HashIntImp(uint32_t num_keys, const T* 
keys, uint32_t* hashes) {
   }
 }
 
-void Hashing32::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t 
length_key,
+void Hashing32::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t 
key_length,
                         const uint8_t* keys, uint32_t* hashes) {
-  switch (length_key) {
+  switch (key_length) {
     case sizeof(uint8_t):
       if (combine_hashes) {
         HashIntImp<true, uint8_t>(num_keys, keys, hashes);
@@ -352,27 +354,27 @@ void Hashing32::HashInt(bool combine_hashes, uint32_t 
num_keys, uint64_t length_
   }
 }
 
-void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes, 
uint32_t num_rows,
-                          uint64_t length, const uint8_t* keys, uint32_t* 
hashes,
-                          uint32_t* hashes_temp_for_combine) {
-  if (ARROW_POPCOUNT64(length) == 1 && length <= sizeof(uint64_t)) {
-    HashInt(combine_hashes, num_rows, length, keys, hashes);
+void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes, 
uint32_t num_keys,
+                          uint64_t key_length, const uint8_t* keys, uint32_t* 
hashes,
+                          uint32_t* temp_hashes_for_combine) {
+  if (ARROW_POPCOUNT64(key_length) == 1 && key_length <= sizeof(uint64_t)) {
+    HashInt(combine_hashes, num_keys, key_length, keys, hashes);
     return;
   }
 
   uint32_t num_processed = 0;
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-    num_processed = HashFixedLen_avx2(combine_hashes, num_rows, length, keys, 
hashes,
-                                      hashes_temp_for_combine);
+    num_processed = HashFixedLen_avx2(combine_hashes, num_keys, key_length, 
keys, hashes,
+                                      temp_hashes_for_combine);
   }
 #endif
   if (combine_hashes) {
-    HashFixedLenImp<true>(num_rows - num_processed, length, keys + length * 
num_processed,
-                          hashes + num_processed);
+    HashFixedLenImp<true>(num_keys - num_processed, key_length,
+                          keys + key_length * num_processed, hashes + 
num_processed);
   } else {
-    HashFixedLenImp<false>(num_rows - num_processed, length,
-                           keys + length * num_processed, hashes + 
num_processed);
+    HashFixedLenImp<false>(num_keys - num_processed, key_length,
+                           keys + key_length * num_processed, hashes + 
num_processed);
   }
 }
 
@@ -423,13 +425,13 @@ void Hashing32::HashMultiColumn(const 
std::vector<KeyColumnArray>& cols,
       }
 
       if (cols[icol].metadata().is_fixed_length) {
-        uint32_t col_width = cols[icol].metadata().fixed_length;
-        if (col_width == 0) {
+        uint32_t key_length = cols[icol].metadata().fixed_length;
+        if (key_length == 0) {
           HashBit(icol > 0, cols[icol].bit_offset(1), batch_size_next,
                   cols[icol].data(1) + first_row / 8, hashes + first_row);
         } else {
-          HashFixed(ctx->hardware_flags, icol > 0, batch_size_next, col_width,
-                    cols[icol].data(1) + first_row * col_width, hashes + 
first_row,
+          HashFixed(ctx->hardware_flags, icol > 0, batch_size_next, key_length,
+                    cols[icol].data(1) + first_row * key_length, hashes + 
first_row,
                     hash_temp);
         }
       } else if (cols[icol].metadata().fixed_length == sizeof(uint32_t)) {
@@ -463,8 +465,9 @@ void Hashing32::HashMultiColumn(const 
std::vector<KeyColumnArray>& cols,
 Status Hashing32::HashBatch(const ExecBatch& key_batch, uint32_t* hashes,
                             std::vector<KeyColumnArray>& column_arrays,
                             int64_t hardware_flags, util::TempVectorStack* 
temp_stack,
-                            int64_t offset, int64_t length) {
-  RETURN_NOT_OK(ColumnArraysFromExecBatch(key_batch, offset, length, 
&column_arrays));
+                            int64_t start_rows, int64_t num_rows) {
+  RETURN_NOT_OK(
+      ColumnArraysFromExecBatch(key_batch, start_rows, num_rows, 
&column_arrays));
 
   LightContext ctx;
   ctx.hardware_flags = hardware_flags;
@@ -574,23 +577,23 @@ inline void Hashing64::StripeMask(int i, uint64_t* mask1, 
uint64_t* mask2,
 }
 
 template <bool T_COMBINE_HASHES>
-void Hashing64::HashFixedLenImp(uint32_t num_rows, uint64_t length, const 
uint8_t* keys,
-                                uint64_t* hashes) {
+void Hashing64::HashFixedLenImp(uint32_t num_rows, uint64_t key_length,
+                                const uint8_t* keys, uint64_t* hashes) {
   // Calculate the number of rows that skip the last 32 bytes
   //
   uint32_t num_rows_safe = num_rows;
-  while (num_rows_safe > 0 && (num_rows - num_rows_safe) * length < 
kStripeSize) {
+  while (num_rows_safe > 0 && (num_rows - num_rows_safe) * key_length < 
kStripeSize) {
     --num_rows_safe;
   }
 
   // Compute masks for the last 32 byte stripe
   //
-  uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize);
+  uint64_t num_stripes = bit_util::CeilDiv(key_length, kStripeSize);
   uint64_t mask1, mask2, mask3, mask4;
-  StripeMask(((length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, &mask3, 
&mask4);
+  StripeMask(((key_length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, 
&mask3, &mask4);
 
   for (uint32_t i = 0; i < num_rows_safe; ++i) {
-    const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+    const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
     uint64_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
     ProcessLastStripe(mask1, mask2, mask3, mask4, key + (num_stripes - 1) * 
kStripeSize,
@@ -607,11 +610,11 @@ void Hashing64::HashFixedLenImp(uint32_t num_rows, 
uint64_t length, const uint8_
 
   uint64_t last_stripe_copy[4];
   for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
-    const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+    const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
     uint64_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
     memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
-           length - (num_stripes - 1) * kStripeSize);
+           key_length - (num_stripes - 1) * kStripeSize);
     ProcessLastStripe(mask1, mask2, mask3, mask4,
                       reinterpret_cast<const uint8_t*>(last_stripe_copy), 
&acc1, &acc2,
                       &acc3, &acc4);
@@ -637,15 +640,16 @@ void Hashing64::HashVarLenImp(uint32_t num_rows, const T* 
offsets,
   }
 
   for (uint32_t i = 0; i < num_rows_safe; ++i) {
-    uint64_t length = offsets[i + 1] - offsets[i];
+    uint64_t key_length = offsets[i + 1] - offsets[i];
 
     // Compute masks for the last 32 byte stripe.
     // For an empty string set number of stripes to 1 but mask to all zeroes.
     //
-    int is_non_empty = length == 0 ? 0 : 1;
-    uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 - 
is_non_empty);
+    int is_non_empty = key_length == 0 ? 0 : 1;
+    uint64_t num_stripes =
+        bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
     uint64_t mask1, mask2, mask3, mask4;
-    StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, 
&mask1,
+    StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) + 
is_non_empty, &mask1,
                &mask2, &mask3, &mask4);
 
     const uint8_t* key = concatenated_keys + offsets[i];
@@ -667,22 +671,23 @@ void Hashing64::HashVarLenImp(uint32_t num_rows, const T* 
offsets,
 
   uint64_t last_stripe_copy[4];
   for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
-    uint64_t length = offsets[i + 1] - offsets[i];
+    uint64_t key_length = offsets[i + 1] - offsets[i];
 
     // Compute masks for the last 32 byte stripe
     //
-    int is_non_empty = length == 0 ? 0 : 1;
-    uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 - 
is_non_empty);
+    int is_non_empty = key_length == 0 ? 0 : 1;
+    uint64_t num_stripes =
+        bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
     uint64_t mask1, mask2, mask3, mask4;
-    StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty, 
&mask1,
+    StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) + 
is_non_empty, &mask1,
                &mask2, &mask3, &mask4);
 
     const uint8_t* key = concatenated_keys + offsets[i];
     uint64_t acc1, acc2, acc3, acc4;
     ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
-    if (length > 0) {
+    if (key_length > 0) {
       memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
-             length - (num_stripes - 1) * kStripeSize);
+             key_length - (num_stripes - 1) * kStripeSize);
     }
     if (num_stripes > 0) {
       ProcessLastStripe(mask1, mask2, mask3, mask4,
@@ -759,9 +764,9 @@ void Hashing64::HashIntImp(uint32_t num_keys, const T* 
keys, uint64_t* hashes) {
   }
 }
 
-void Hashing64::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t 
length_key,
+void Hashing64::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t 
key_length,
                         const uint8_t* keys, uint64_t* hashes) {
-  switch (length_key) {
+  switch (key_length) {
     case sizeof(uint8_t):
       if (combine_hashes) {
         HashIntImp<true, uint8_t>(num_keys, keys, hashes);
@@ -802,17 +807,17 @@ void Hashing64::HashInt(bool combine_hashes, uint32_t 
num_keys, uint64_t length_
   }
 }
 
-void Hashing64::HashFixed(bool combine_hashes, uint32_t num_rows, uint64_t 
length,
+void Hashing64::HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t 
key_length,
                           const uint8_t* keys, uint64_t* hashes) {
-  if (ARROW_POPCOUNT64(length) == 1 && length <= sizeof(uint64_t)) {
-    HashInt(combine_hashes, num_rows, length, keys, hashes);
+  if (ARROW_POPCOUNT64(key_length) == 1 && key_length <= sizeof(uint64_t)) {
+    HashInt(combine_hashes, num_keys, key_length, keys, hashes);
     return;
   }
 
   if (combine_hashes) {
-    HashFixedLenImp<true>(num_rows, length, keys, hashes);
+    HashFixedLenImp<true>(num_keys, key_length, keys, hashes);
   } else {
-    HashFixedLenImp<false>(num_rows, length, keys, hashes);
+    HashFixedLenImp<false>(num_keys, key_length, keys, hashes);
   }
 }
 
@@ -860,13 +865,13 @@ void Hashing64::HashMultiColumn(const 
std::vector<KeyColumnArray>& cols,
       }
 
       if (cols[icol].metadata().is_fixed_length) {
-        uint64_t col_width = cols[icol].metadata().fixed_length;
-        if (col_width == 0) {
+        uint64_t key_length = cols[icol].metadata().fixed_length;
+        if (key_length == 0) {
           HashBit(icol > 0, cols[icol].bit_offset(1), batch_size_next,
                   cols[icol].data(1) + first_row / 8, hashes + first_row);
         } else {
-          HashFixed(icol > 0, batch_size_next, col_width,
-                    cols[icol].data(1) + first_row * col_width, hashes + 
first_row);
+          HashFixed(icol > 0, batch_size_next, key_length,
+                    cols[icol].data(1) + first_row * key_length, hashes + 
first_row);
         }
       } else if (cols[icol].metadata().fixed_length == sizeof(uint32_t)) {
         HashVarLen(icol > 0, batch_size_next, cols[icol].offsets() + first_row,
@@ -897,8 +902,9 @@ void Hashing64::HashMultiColumn(const 
std::vector<KeyColumnArray>& cols,
 Status Hashing64::HashBatch(const ExecBatch& key_batch, uint64_t* hashes,
                             std::vector<KeyColumnArray>& column_arrays,
                             int64_t hardware_flags, util::TempVectorStack* 
temp_stack,
-                            int64_t offset, int64_t length) {
-  RETURN_NOT_OK(ColumnArraysFromExecBatch(key_batch, offset, length, 
&column_arrays));
+                            int64_t start_row, int64_t num_rows) {
+  RETURN_NOT_OK(
+      ColumnArraysFromExecBatch(key_batch, start_row, num_rows, 
&column_arrays));
 
   LightContext ctx;
   ctx.hardware_flags = hardware_flags;
diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h
index b193716c9b..1173df5ed1 100644
--- a/cpp/src/arrow/compute/key_hash.h
+++ b/cpp/src/arrow/compute/key_hash.h
@@ -51,10 +51,10 @@ class ARROW_EXPORT Hashing32 {
   static Status HashBatch(const ExecBatch& key_batch, uint32_t* hashes,
                           std::vector<KeyColumnArray>& column_arrays,
                           int64_t hardware_flags, util::TempVectorStack* 
temp_stack,
-                          int64_t offset, int64_t length);
+                          int64_t start_row, int64_t num_rows);
 
   static void HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t 
num_keys,
-                        uint64_t length_key, const uint8_t* keys, uint32_t* 
hashes,
+                        uint64_t key_length, const uint8_t* keys, uint32_t* 
hashes,
                         uint32_t* temp_hashes_for_combine);
 
  private:
@@ -100,7 +100,7 @@ class ARROW_EXPORT Hashing32 {
   static inline void StripeMask(int i, uint32_t* mask1, uint32_t* mask2, 
uint32_t* mask3,
                                 uint32_t* mask4);
   template <bool T_COMBINE_HASHES>
-  static void HashFixedLenImp(uint32_t num_rows, uint64_t length, const 
uint8_t* keys,
+  static void HashFixedLenImp(uint32_t num_rows, uint64_t key_length, const 
uint8_t* keys,
                               uint32_t* hashes);
   template <typename T, bool T_COMBINE_HASHES>
   static void HashVarLenImp(uint32_t num_rows, const T* offsets,
@@ -112,7 +112,7 @@ class ARROW_EXPORT Hashing32 {
                       const uint8_t* keys, uint32_t* hashes);
   template <bool T_COMBINE_HASHES, typename T>
   static void HashIntImp(uint32_t num_keys, const T* keys, uint32_t* hashes);
-  static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t 
length_key,
+  static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t 
key_length,
                       const uint8_t* keys, uint32_t* hashes);
 
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
@@ -129,11 +129,11 @@ class ARROW_EXPORT Hashing32 {
                                             __m256i mask_last_stripe, const 
uint8_t* keys,
                                             int64_t offset_A, int64_t 
offset_B);
   template <bool T_COMBINE_HASHES>
-  static uint32_t HashFixedLenImp_avx2(uint32_t num_rows, uint64_t length,
+  static uint32_t HashFixedLenImp_avx2(uint32_t num_rows, uint64_t key_length,
                                        const uint8_t* keys, uint32_t* hashes,
                                        uint32_t* hashes_temp_for_combine);
   static uint32_t HashFixedLen_avx2(bool combine_hashes, uint32_t num_rows,
-                                    uint64_t length, const uint8_t* keys,
+                                    uint64_t key_length, const uint8_t* keys,
                                     uint32_t* hashes, uint32_t* 
hashes_temp_for_combine);
   template <typename T, bool T_COMBINE_HASHES>
   static uint32_t HashVarLenImp_avx2(uint32_t num_rows, const T* offsets,
@@ -164,9 +164,9 @@ class ARROW_EXPORT Hashing64 {
   static Status HashBatch(const ExecBatch& key_batch, uint64_t* hashes,
                           std::vector<KeyColumnArray>& column_arrays,
                           int64_t hardware_flags, util::TempVectorStack* 
temp_stack,
-                          int64_t offset, int64_t length);
+                          int64_t start_row, int64_t num_rows);
 
-  static void HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t 
length_key,
+  static void HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t 
key_length,
                         const uint8_t* keys, uint64_t* hashes);
 
  private:
@@ -203,7 +203,7 @@ class ARROW_EXPORT Hashing64 {
   static inline void StripeMask(int i, uint64_t* mask1, uint64_t* mask2, 
uint64_t* mask3,
                                 uint64_t* mask4);
   template <bool T_COMBINE_HASHES>
-  static void HashFixedLenImp(uint32_t num_rows, uint64_t length, const 
uint8_t* keys,
+  static void HashFixedLenImp(uint32_t num_rows, uint64_t key_length, const 
uint8_t* keys,
                               uint64_t* hashes);
   template <typename T, bool T_COMBINE_HASHES>
   static void HashVarLenImp(uint32_t num_rows, const T* offsets,
@@ -211,11 +211,11 @@ class ARROW_EXPORT Hashing64 {
   template <bool T_COMBINE_HASHES>
   static void HashBitImp(int64_t bit_offset, uint32_t num_keys, const uint8_t* 
keys,
                          uint64_t* hashes);
-  static void HashBit(bool T_COMBINE_HASHES, int64_t bit_offset, uint32_t 
num_keys,
+  static void HashBit(bool combine_hashes, int64_t bit_offset, uint32_t 
num_keys,
                       const uint8_t* keys, uint64_t* hashes);
   template <bool T_COMBINE_HASHES, typename T>
   static void HashIntImp(uint32_t num_keys, const T* keys, uint64_t* hashes);
-  static void HashInt(bool T_COMBINE_HASHES, uint32_t num_keys, uint64_t 
length_key,
+  static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t 
key_length,
                       const uint8_t* keys, uint64_t* hashes);
 };
 
diff --git a/cpp/src/arrow/compute/key_hash_avx2.cc 
b/cpp/src/arrow/compute/key_hash_avx2.cc
index 1b444b5767..aec2800c64 100644
--- a/cpp/src/arrow/compute/key_hash_avx2.cc
+++ b/cpp/src/arrow/compute/key_hash_avx2.cc
@@ -190,7 +190,7 @@ uint32_t Hashing32::HashFixedLenImp_avx2(uint32_t num_rows, 
uint64_t length,
   // Do not process rows that could read past the end of the buffer using 16
   // byte loads. Round down number of rows to process to multiple of 2.
   //
-  uint64_t num_rows_to_skip = bit_util::CeilDiv(length, kStripeSize);
+  uint64_t num_rows_to_skip = bit_util::CeilDiv(kStripeSize, length);
   uint32_t num_rows_to_process =
       (num_rows_to_skip > num_rows)
           ? 0
diff --git a/cpp/src/arrow/compute/key_hash_test.cc 
b/cpp/src/arrow/compute/key_hash_test.cc
index 3e6d41525c..c998df7169 100644
--- a/cpp/src/arrow/compute/key_hash_test.cc
+++ b/cpp/src/arrow/compute/key_hash_test.cc
@@ -252,5 +252,64 @@ TEST(VectorHash, BasicString) { 
RunTestVectorHash<StringType>(); }
 
 TEST(VectorHash, BasicLargeString) { RunTestVectorHash<LargeStringType>(); }
 
+void HashFixedLengthFrom(int key_length, int num_rows, int start_row) {
+  int num_rows_to_hash = num_rows - start_row;
+  auto num_bytes_aligned = arrow::bit_util::RoundUpToMultipleOf64(key_length * 
num_rows);
+
+  const auto hardware_flags_for_testing = HardwareFlagsForTesting();
+  ASSERT_GT(hardware_flags_for_testing.size(), 0);
+
+  std::vector<std::vector<uint32_t>> 
hashes32(hardware_flags_for_testing.size());
+  std::vector<std::vector<uint64_t>> 
hashes64(hardware_flags_for_testing.size());
+  for (auto& h : hashes32) {
+    h.resize(num_rows_to_hash);
+  }
+  for (auto& h : hashes64) {
+    h.resize(num_rows_to_hash);
+  }
+
+  FixedSizeBinaryBuilder keys_builder(fixed_size_binary(key_length));
+  for (int j = 0; j < num_rows; ++j) {
+    ASSERT_OK(keys_builder.Append(std::string(key_length, 42)));
+  }
+  ASSERT_OK_AND_ASSIGN(auto keys, keys_builder.Finish());
+  // Make sure the buffer is aligned as expected.
+  ASSERT_EQ(keys->data()->buffers[1]->capacity(), num_bytes_aligned);
+
+  constexpr int mini_batch_size = 1024;
+  std::vector<uint32_t> temp_buffer;
+  temp_buffer.resize(mini_batch_size * 4);
+
+  for (int i = 0; i < static_cast<int>(hardware_flags_for_testing.size()); 
++i) {
+    const auto hardware_flags = hardware_flags_for_testing[i];
+    Hashing32::HashFixed(hardware_flags,
+                         /*combine_hashes=*/false, num_rows_to_hash, 
key_length,
+                         keys->data()->GetValues<uint8_t>(1) + start_row * 
key_length,
+                         hashes32[i].data(), temp_buffer.data());
+    Hashing64::HashFixed(
+        /*combine_hashes=*/false, num_rows_to_hash, key_length,
+        keys->data()->GetValues<uint8_t>(1) + start_row * key_length, 
hashes64[i].data());
+  }
+
+  // Verify that all implementations (scalar, SIMD) give the same hashes.
+  for (int i = 1; i < static_cast<int>(hardware_flags_for_testing.size()); 
++i) {
+    for (int j = 0; j < num_rows_to_hash; ++j) {
+      ASSERT_EQ(hashes32[i][j], hashes32[0][j])
+          << "scalar and simd approaches yielded different 32-bit hashes";
+      ASSERT_EQ(hashes64[i][j], hashes64[0][j])
+          << "scalar and simd approaches yielded different 64-bit hashes";
+    }
+  }
+}
+
+// Some carefully chosen cases that may cause troubles like GH-39778.
+TEST(VectorHash, FixedLengthTailByteSafety) {
+  // Two cases of key_length < stripe (16-byte).
+  HashFixedLengthFrom(/*key_length=*/3, /*num_rows=*/1450, /*start_row=*/1447);
+  HashFixedLengthFrom(/*key_length=*/5, /*num_rows=*/883, /*start_row=*/858);
+  // Case of key_length > stripe (16-byte).
+  HashFixedLengthFrom(/*key_length=*/19, /*num_rows=*/64, /*start_row=*/63);
+}
+
 }  // namespace compute
 }  // namespace arrow

Reply via email to