This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 13b22346d3 GH-39778: [C++] Fix tail-byte access cross buffer boundary
in key hash avx2 (#39800)
13b22346d3 is described below
commit 13b22346d36b9952df5c988c9425b9e5bc4f09c4
Author: Rossi Sun <[email protected]>
AuthorDate: Fri Jan 26 22:43:08 2024 +0800
GH-39778: [C++] Fix tail-byte access cross buffer boundary in key hash avx2
(#39800)
### Rationale for this change
Issue #39778 seems to be caused by a careless (but hard to spot) bug in key hash
avx2.
### What changes are included in this PR?
Fix the careless bug.
### Are these changes tested?
UT included.
### Are there any user-facing changes?
No.
* Closes: #39778
Authored-by: Ruoxi Sun <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/compute/key_hash.cc | 142 +++++++++++++++++----------------
cpp/src/arrow/compute/key_hash.h | 22 ++---
cpp/src/arrow/compute/key_hash_avx2.cc | 2 +-
cpp/src/arrow/compute/key_hash_test.cc | 59 ++++++++++++++
4 files changed, 145 insertions(+), 80 deletions(-)
diff --git a/cpp/src/arrow/compute/key_hash.cc
b/cpp/src/arrow/compute/key_hash.cc
index f5867b405e..1902b9ce9a 100644
--- a/cpp/src/arrow/compute/key_hash.cc
+++ b/cpp/src/arrow/compute/key_hash.cc
@@ -105,23 +105,23 @@ inline void Hashing32::StripeMask(int i, uint32_t* mask1,
uint32_t* mask2,
}
template <bool T_COMBINE_HASHES>
-void Hashing32::HashFixedLenImp(uint32_t num_rows, uint64_t length, const
uint8_t* keys,
- uint32_t* hashes) {
+void Hashing32::HashFixedLenImp(uint32_t num_rows, uint64_t key_length,
+ const uint8_t* keys, uint32_t* hashes) {
// Calculate the number of rows that skip the last 16 bytes
//
uint32_t num_rows_safe = num_rows;
- while (num_rows_safe > 0 && (num_rows - num_rows_safe) * length <
kStripeSize) {
+ while (num_rows_safe > 0 && (num_rows - num_rows_safe) * key_length <
kStripeSize) {
--num_rows_safe;
}
// Compute masks for the last 16 byte stripe
//
- uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize);
+ uint64_t num_stripes = bit_util::CeilDiv(key_length, kStripeSize);
uint32_t mask1, mask2, mask3, mask4;
- StripeMask(((length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, &mask3,
&mask4);
+ StripeMask(((key_length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2,
&mask3, &mask4);
for (uint32_t i = 0; i < num_rows_safe; ++i) {
- const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+ const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
uint32_t acc1, acc2, acc3, acc4;
ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
ProcessLastStripe(mask1, mask2, mask3, mask4, key + (num_stripes - 1) *
kStripeSize,
@@ -138,11 +138,11 @@ void Hashing32::HashFixedLenImp(uint32_t num_rows,
uint64_t length, const uint8_
uint32_t last_stripe_copy[4];
for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
- const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+ const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
uint32_t acc1, acc2, acc3, acc4;
ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
- length - (num_stripes - 1) * kStripeSize);
+ key_length - (num_stripes - 1) * kStripeSize);
ProcessLastStripe(mask1, mask2, mask3, mask4,
reinterpret_cast<const uint8_t*>(last_stripe_copy),
&acc1, &acc2,
&acc3, &acc4);
@@ -168,15 +168,16 @@ void Hashing32::HashVarLenImp(uint32_t num_rows, const T*
offsets,
}
for (uint32_t i = 0; i < num_rows_safe; ++i) {
- uint64_t length = offsets[i + 1] - offsets[i];
+ uint64_t key_length = offsets[i + 1] - offsets[i];
// Compute masks for the last 16 byte stripe.
// For an empty string set number of stripes to 1 but mask to all zeroes.
//
- int is_non_empty = length == 0 ? 0 : 1;
- uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 -
is_non_empty);
+ int is_non_empty = key_length == 0 ? 0 : 1;
+ uint64_t num_stripes =
+ bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
uint32_t mask1, mask2, mask3, mask4;
- StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty,
&mask1,
+ StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) +
is_non_empty, &mask1,
&mask2, &mask3, &mask4);
const uint8_t* key = concatenated_keys + offsets[i];
@@ -198,23 +199,24 @@ void Hashing32::HashVarLenImp(uint32_t num_rows, const T*
offsets,
uint32_t last_stripe_copy[4];
for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
- uint64_t length = offsets[i + 1] - offsets[i];
+ uint64_t key_length = offsets[i + 1] - offsets[i];
// Compute masks for the last 16 byte stripe.
// For an empty string set number of stripes to 1 but mask to all zeroes.
//
- int is_non_empty = length == 0 ? 0 : 1;
- uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 -
is_non_empty);
+ int is_non_empty = key_length == 0 ? 0 : 1;
+ uint64_t num_stripes =
+ bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
uint32_t mask1, mask2, mask3, mask4;
- StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty,
&mask1,
+ StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) +
is_non_empty, &mask1,
&mask2, &mask3, &mask4);
const uint8_t* key = concatenated_keys + offsets[i];
uint32_t acc1, acc2, acc3, acc4;
ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
- if (length > 0) {
+ if (key_length > 0) {
memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
- length - (num_stripes - 1) * kStripeSize);
+ key_length - (num_stripes - 1) * kStripeSize);
}
if (num_stripes > 0) {
ProcessLastStripe(mask1, mask2, mask3, mask4,
@@ -309,9 +311,9 @@ void Hashing32::HashIntImp(uint32_t num_keys, const T*
keys, uint32_t* hashes) {
}
}
-void Hashing32::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t
length_key,
+void Hashing32::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t
key_length,
const uint8_t* keys, uint32_t* hashes) {
- switch (length_key) {
+ switch (key_length) {
case sizeof(uint8_t):
if (combine_hashes) {
HashIntImp<true, uint8_t>(num_keys, keys, hashes);
@@ -352,27 +354,27 @@ void Hashing32::HashInt(bool combine_hashes, uint32_t
num_keys, uint64_t length_
}
}
-void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes,
uint32_t num_rows,
- uint64_t length, const uint8_t* keys, uint32_t*
hashes,
- uint32_t* hashes_temp_for_combine) {
- if (ARROW_POPCOUNT64(length) == 1 && length <= sizeof(uint64_t)) {
- HashInt(combine_hashes, num_rows, length, keys, hashes);
+void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes,
uint32_t num_keys,
+ uint64_t key_length, const uint8_t* keys, uint32_t*
hashes,
+ uint32_t* temp_hashes_for_combine) {
+ if (ARROW_POPCOUNT64(key_length) == 1 && key_length <= sizeof(uint64_t)) {
+ HashInt(combine_hashes, num_keys, key_length, keys, hashes);
return;
}
uint32_t num_processed = 0;
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- num_processed = HashFixedLen_avx2(combine_hashes, num_rows, length, keys,
hashes,
- hashes_temp_for_combine);
+ num_processed = HashFixedLen_avx2(combine_hashes, num_keys, key_length,
keys, hashes,
+ temp_hashes_for_combine);
}
#endif
if (combine_hashes) {
- HashFixedLenImp<true>(num_rows - num_processed, length, keys + length *
num_processed,
- hashes + num_processed);
+ HashFixedLenImp<true>(num_keys - num_processed, key_length,
+ keys + key_length * num_processed, hashes +
num_processed);
} else {
- HashFixedLenImp<false>(num_rows - num_processed, length,
- keys + length * num_processed, hashes +
num_processed);
+ HashFixedLenImp<false>(num_keys - num_processed, key_length,
+ keys + key_length * num_processed, hashes +
num_processed);
}
}
@@ -423,13 +425,13 @@ void Hashing32::HashMultiColumn(const
std::vector<KeyColumnArray>& cols,
}
if (cols[icol].metadata().is_fixed_length) {
- uint32_t col_width = cols[icol].metadata().fixed_length;
- if (col_width == 0) {
+ uint32_t key_length = cols[icol].metadata().fixed_length;
+ if (key_length == 0) {
HashBit(icol > 0, cols[icol].bit_offset(1), batch_size_next,
cols[icol].data(1) + first_row / 8, hashes + first_row);
} else {
- HashFixed(ctx->hardware_flags, icol > 0, batch_size_next, col_width,
- cols[icol].data(1) + first_row * col_width, hashes +
first_row,
+ HashFixed(ctx->hardware_flags, icol > 0, batch_size_next, key_length,
+ cols[icol].data(1) + first_row * key_length, hashes +
first_row,
hash_temp);
}
} else if (cols[icol].metadata().fixed_length == sizeof(uint32_t)) {
@@ -463,8 +465,9 @@ void Hashing32::HashMultiColumn(const
std::vector<KeyColumnArray>& cols,
Status Hashing32::HashBatch(const ExecBatch& key_batch, uint32_t* hashes,
std::vector<KeyColumnArray>& column_arrays,
int64_t hardware_flags, util::TempVectorStack*
temp_stack,
- int64_t offset, int64_t length) {
- RETURN_NOT_OK(ColumnArraysFromExecBatch(key_batch, offset, length,
&column_arrays));
+ int64_t start_rows, int64_t num_rows) {
+ RETURN_NOT_OK(
+ ColumnArraysFromExecBatch(key_batch, start_rows, num_rows,
&column_arrays));
LightContext ctx;
ctx.hardware_flags = hardware_flags;
@@ -574,23 +577,23 @@ inline void Hashing64::StripeMask(int i, uint64_t* mask1,
uint64_t* mask2,
}
template <bool T_COMBINE_HASHES>
-void Hashing64::HashFixedLenImp(uint32_t num_rows, uint64_t length, const
uint8_t* keys,
- uint64_t* hashes) {
+void Hashing64::HashFixedLenImp(uint32_t num_rows, uint64_t key_length,
+ const uint8_t* keys, uint64_t* hashes) {
// Calculate the number of rows that skip the last 32 bytes
//
uint32_t num_rows_safe = num_rows;
- while (num_rows_safe > 0 && (num_rows - num_rows_safe) * length <
kStripeSize) {
+ while (num_rows_safe > 0 && (num_rows - num_rows_safe) * key_length <
kStripeSize) {
--num_rows_safe;
}
// Compute masks for the last 32 byte stripe
//
- uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize);
+ uint64_t num_stripes = bit_util::CeilDiv(key_length, kStripeSize);
uint64_t mask1, mask2, mask3, mask4;
- StripeMask(((length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2, &mask3,
&mask4);
+ StripeMask(((key_length - 1) & (kStripeSize - 1)) + 1, &mask1, &mask2,
&mask3, &mask4);
for (uint32_t i = 0; i < num_rows_safe; ++i) {
- const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+ const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
uint64_t acc1, acc2, acc3, acc4;
ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
ProcessLastStripe(mask1, mask2, mask3, mask4, key + (num_stripes - 1) *
kStripeSize,
@@ -607,11 +610,11 @@ void Hashing64::HashFixedLenImp(uint32_t num_rows,
uint64_t length, const uint8_
uint64_t last_stripe_copy[4];
for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
- const uint8_t* key = keys + static_cast<uint64_t>(i) * length;
+ const uint8_t* key = keys + static_cast<uint64_t>(i) * key_length;
uint64_t acc1, acc2, acc3, acc4;
ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
- length - (num_stripes - 1) * kStripeSize);
+ key_length - (num_stripes - 1) * kStripeSize);
ProcessLastStripe(mask1, mask2, mask3, mask4,
reinterpret_cast<const uint8_t*>(last_stripe_copy),
&acc1, &acc2,
&acc3, &acc4);
@@ -637,15 +640,16 @@ void Hashing64::HashVarLenImp(uint32_t num_rows, const T*
offsets,
}
for (uint32_t i = 0; i < num_rows_safe; ++i) {
- uint64_t length = offsets[i + 1] - offsets[i];
+ uint64_t key_length = offsets[i + 1] - offsets[i];
// Compute masks for the last 32 byte stripe.
// For an empty string set number of stripes to 1 but mask to all zeroes.
//
- int is_non_empty = length == 0 ? 0 : 1;
- uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 -
is_non_empty);
+ int is_non_empty = key_length == 0 ? 0 : 1;
+ uint64_t num_stripes =
+ bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
uint64_t mask1, mask2, mask3, mask4;
- StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty,
&mask1,
+ StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) +
is_non_empty, &mask1,
&mask2, &mask3, &mask4);
const uint8_t* key = concatenated_keys + offsets[i];
@@ -667,22 +671,23 @@ void Hashing64::HashVarLenImp(uint32_t num_rows, const T*
offsets,
uint64_t last_stripe_copy[4];
for (uint32_t i = num_rows_safe; i < num_rows; ++i) {
- uint64_t length = offsets[i + 1] - offsets[i];
+ uint64_t key_length = offsets[i + 1] - offsets[i];
// Compute masks for the last 32 byte stripe
//
- int is_non_empty = length == 0 ? 0 : 1;
- uint64_t num_stripes = bit_util::CeilDiv(length, kStripeSize) + (1 -
is_non_empty);
+ int is_non_empty = key_length == 0 ? 0 : 1;
+ uint64_t num_stripes =
+ bit_util::CeilDiv(key_length, kStripeSize) + (1 - is_non_empty);
uint64_t mask1, mask2, mask3, mask4;
- StripeMask(((length - is_non_empty) & (kStripeSize - 1)) + is_non_empty,
&mask1,
+ StripeMask(((key_length - is_non_empty) & (kStripeSize - 1)) +
is_non_empty, &mask1,
&mask2, &mask3, &mask4);
const uint8_t* key = concatenated_keys + offsets[i];
uint64_t acc1, acc2, acc3, acc4;
ProcessFullStripes(num_stripes, key, &acc1, &acc2, &acc3, &acc4);
- if (length > 0) {
+ if (key_length > 0) {
memcpy(last_stripe_copy, key + (num_stripes - 1) * kStripeSize,
- length - (num_stripes - 1) * kStripeSize);
+ key_length - (num_stripes - 1) * kStripeSize);
}
if (num_stripes > 0) {
ProcessLastStripe(mask1, mask2, mask3, mask4,
@@ -759,9 +764,9 @@ void Hashing64::HashIntImp(uint32_t num_keys, const T*
keys, uint64_t* hashes) {
}
}
-void Hashing64::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t
length_key,
+void Hashing64::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t
key_length,
const uint8_t* keys, uint64_t* hashes) {
- switch (length_key) {
+ switch (key_length) {
case sizeof(uint8_t):
if (combine_hashes) {
HashIntImp<true, uint8_t>(num_keys, keys, hashes);
@@ -802,17 +807,17 @@ void Hashing64::HashInt(bool combine_hashes, uint32_t
num_keys, uint64_t length_
}
}
-void Hashing64::HashFixed(bool combine_hashes, uint32_t num_rows, uint64_t
length,
+void Hashing64::HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t
key_length,
const uint8_t* keys, uint64_t* hashes) {
- if (ARROW_POPCOUNT64(length) == 1 && length <= sizeof(uint64_t)) {
- HashInt(combine_hashes, num_rows, length, keys, hashes);
+ if (ARROW_POPCOUNT64(key_length) == 1 && key_length <= sizeof(uint64_t)) {
+ HashInt(combine_hashes, num_keys, key_length, keys, hashes);
return;
}
if (combine_hashes) {
- HashFixedLenImp<true>(num_rows, length, keys, hashes);
+ HashFixedLenImp<true>(num_keys, key_length, keys, hashes);
} else {
- HashFixedLenImp<false>(num_rows, length, keys, hashes);
+ HashFixedLenImp<false>(num_keys, key_length, keys, hashes);
}
}
@@ -860,13 +865,13 @@ void Hashing64::HashMultiColumn(const
std::vector<KeyColumnArray>& cols,
}
if (cols[icol].metadata().is_fixed_length) {
- uint64_t col_width = cols[icol].metadata().fixed_length;
- if (col_width == 0) {
+ uint64_t key_length = cols[icol].metadata().fixed_length;
+ if (key_length == 0) {
HashBit(icol > 0, cols[icol].bit_offset(1), batch_size_next,
cols[icol].data(1) + first_row / 8, hashes + first_row);
} else {
- HashFixed(icol > 0, batch_size_next, col_width,
- cols[icol].data(1) + first_row * col_width, hashes +
first_row);
+ HashFixed(icol > 0, batch_size_next, key_length,
+ cols[icol].data(1) + first_row * key_length, hashes +
first_row);
}
} else if (cols[icol].metadata().fixed_length == sizeof(uint32_t)) {
HashVarLen(icol > 0, batch_size_next, cols[icol].offsets() + first_row,
@@ -897,8 +902,9 @@ void Hashing64::HashMultiColumn(const
std::vector<KeyColumnArray>& cols,
Status Hashing64::HashBatch(const ExecBatch& key_batch, uint64_t* hashes,
std::vector<KeyColumnArray>& column_arrays,
int64_t hardware_flags, util::TempVectorStack*
temp_stack,
- int64_t offset, int64_t length) {
- RETURN_NOT_OK(ColumnArraysFromExecBatch(key_batch, offset, length,
&column_arrays));
+ int64_t start_row, int64_t num_rows) {
+ RETURN_NOT_OK(
+ ColumnArraysFromExecBatch(key_batch, start_row, num_rows,
&column_arrays));
LightContext ctx;
ctx.hardware_flags = hardware_flags;
diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h
index b193716c9b..1173df5ed1 100644
--- a/cpp/src/arrow/compute/key_hash.h
+++ b/cpp/src/arrow/compute/key_hash.h
@@ -51,10 +51,10 @@ class ARROW_EXPORT Hashing32 {
static Status HashBatch(const ExecBatch& key_batch, uint32_t* hashes,
std::vector<KeyColumnArray>& column_arrays,
int64_t hardware_flags, util::TempVectorStack*
temp_stack,
- int64_t offset, int64_t length);
+ int64_t start_row, int64_t num_rows);
static void HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t
num_keys,
- uint64_t length_key, const uint8_t* keys, uint32_t*
hashes,
+ uint64_t key_length, const uint8_t* keys, uint32_t*
hashes,
uint32_t* temp_hashes_for_combine);
private:
@@ -100,7 +100,7 @@ class ARROW_EXPORT Hashing32 {
static inline void StripeMask(int i, uint32_t* mask1, uint32_t* mask2,
uint32_t* mask3,
uint32_t* mask4);
template <bool T_COMBINE_HASHES>
- static void HashFixedLenImp(uint32_t num_rows, uint64_t length, const
uint8_t* keys,
+ static void HashFixedLenImp(uint32_t num_rows, uint64_t key_length, const
uint8_t* keys,
uint32_t* hashes);
template <typename T, bool T_COMBINE_HASHES>
static void HashVarLenImp(uint32_t num_rows, const T* offsets,
@@ -112,7 +112,7 @@ class ARROW_EXPORT Hashing32 {
const uint8_t* keys, uint32_t* hashes);
template <bool T_COMBINE_HASHES, typename T>
static void HashIntImp(uint32_t num_keys, const T* keys, uint32_t* hashes);
- static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t
length_key,
+ static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t
key_length,
const uint8_t* keys, uint32_t* hashes);
#if defined(ARROW_HAVE_RUNTIME_AVX2)
@@ -129,11 +129,11 @@ class ARROW_EXPORT Hashing32 {
__m256i mask_last_stripe, const
uint8_t* keys,
int64_t offset_A, int64_t
offset_B);
template <bool T_COMBINE_HASHES>
- static uint32_t HashFixedLenImp_avx2(uint32_t num_rows, uint64_t length,
+ static uint32_t HashFixedLenImp_avx2(uint32_t num_rows, uint64_t key_length,
const uint8_t* keys, uint32_t* hashes,
uint32_t* hashes_temp_for_combine);
static uint32_t HashFixedLen_avx2(bool combine_hashes, uint32_t num_rows,
- uint64_t length, const uint8_t* keys,
+ uint64_t key_length, const uint8_t* keys,
uint32_t* hashes, uint32_t*
hashes_temp_for_combine);
template <typename T, bool T_COMBINE_HASHES>
static uint32_t HashVarLenImp_avx2(uint32_t num_rows, const T* offsets,
@@ -164,9 +164,9 @@ class ARROW_EXPORT Hashing64 {
static Status HashBatch(const ExecBatch& key_batch, uint64_t* hashes,
std::vector<KeyColumnArray>& column_arrays,
int64_t hardware_flags, util::TempVectorStack*
temp_stack,
- int64_t offset, int64_t length);
+ int64_t start_row, int64_t num_rows);
- static void HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t
length_key,
+ static void HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t
key_length,
const uint8_t* keys, uint64_t* hashes);
private:
@@ -203,7 +203,7 @@ class ARROW_EXPORT Hashing64 {
static inline void StripeMask(int i, uint64_t* mask1, uint64_t* mask2,
uint64_t* mask3,
uint64_t* mask4);
template <bool T_COMBINE_HASHES>
- static void HashFixedLenImp(uint32_t num_rows, uint64_t length, const
uint8_t* keys,
+ static void HashFixedLenImp(uint32_t num_rows, uint64_t key_length, const
uint8_t* keys,
uint64_t* hashes);
template <typename T, bool T_COMBINE_HASHES>
static void HashVarLenImp(uint32_t num_rows, const T* offsets,
@@ -211,11 +211,11 @@ class ARROW_EXPORT Hashing64 {
template <bool T_COMBINE_HASHES>
static void HashBitImp(int64_t bit_offset, uint32_t num_keys, const uint8_t*
keys,
uint64_t* hashes);
- static void HashBit(bool T_COMBINE_HASHES, int64_t bit_offset, uint32_t
num_keys,
+ static void HashBit(bool combine_hashes, int64_t bit_offset, uint32_t
num_keys,
const uint8_t* keys, uint64_t* hashes);
template <bool T_COMBINE_HASHES, typename T>
static void HashIntImp(uint32_t num_keys, const T* keys, uint64_t* hashes);
- static void HashInt(bool T_COMBINE_HASHES, uint32_t num_keys, uint64_t
length_key,
+ static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t
key_length,
const uint8_t* keys, uint64_t* hashes);
};
diff --git a/cpp/src/arrow/compute/key_hash_avx2.cc
b/cpp/src/arrow/compute/key_hash_avx2.cc
index 1b444b5767..aec2800c64 100644
--- a/cpp/src/arrow/compute/key_hash_avx2.cc
+++ b/cpp/src/arrow/compute/key_hash_avx2.cc
@@ -190,7 +190,7 @@ uint32_t Hashing32::HashFixedLenImp_avx2(uint32_t num_rows,
uint64_t length,
// Do not process rows that could read past the end of the buffer using 16
// byte loads. Round down number of rows to process to multiple of 2.
//
- uint64_t num_rows_to_skip = bit_util::CeilDiv(length, kStripeSize);
+ uint64_t num_rows_to_skip = bit_util::CeilDiv(kStripeSize, length);
uint32_t num_rows_to_process =
(num_rows_to_skip > num_rows)
? 0
diff --git a/cpp/src/arrow/compute/key_hash_test.cc
b/cpp/src/arrow/compute/key_hash_test.cc
index 3e6d41525c..c998df7169 100644
--- a/cpp/src/arrow/compute/key_hash_test.cc
+++ b/cpp/src/arrow/compute/key_hash_test.cc
@@ -252,5 +252,64 @@ TEST(VectorHash, BasicString) {
RunTestVectorHash<StringType>(); }
TEST(VectorHash, BasicLargeString) { RunTestVectorHash<LargeStringType>(); }
+void HashFixedLengthFrom(int key_length, int num_rows, int start_row) {
+ int num_rows_to_hash = num_rows - start_row;
+ auto num_bytes_aligned = arrow::bit_util::RoundUpToMultipleOf64(key_length *
num_rows);
+
+ const auto hardware_flags_for_testing = HardwareFlagsForTesting();
+ ASSERT_GT(hardware_flags_for_testing.size(), 0);
+
+ std::vector<std::vector<uint32_t>>
hashes32(hardware_flags_for_testing.size());
+ std::vector<std::vector<uint64_t>>
hashes64(hardware_flags_for_testing.size());
+ for (auto& h : hashes32) {
+ h.resize(num_rows_to_hash);
+ }
+ for (auto& h : hashes64) {
+ h.resize(num_rows_to_hash);
+ }
+
+ FixedSizeBinaryBuilder keys_builder(fixed_size_binary(key_length));
+ for (int j = 0; j < num_rows; ++j) {
+ ASSERT_OK(keys_builder.Append(std::string(key_length, 42)));
+ }
+ ASSERT_OK_AND_ASSIGN(auto keys, keys_builder.Finish());
+ // Make sure the buffer is aligned as expected.
+ ASSERT_EQ(keys->data()->buffers[1]->capacity(), num_bytes_aligned);
+
+ constexpr int mini_batch_size = 1024;
+ std::vector<uint32_t> temp_buffer;
+ temp_buffer.resize(mini_batch_size * 4);
+
+ for (int i = 0; i < static_cast<int>(hardware_flags_for_testing.size());
++i) {
+ const auto hardware_flags = hardware_flags_for_testing[i];
+ Hashing32::HashFixed(hardware_flags,
+ /*combine_hashes=*/false, num_rows_to_hash,
key_length,
+ keys->data()->GetValues<uint8_t>(1) + start_row *
key_length,
+ hashes32[i].data(), temp_buffer.data());
+ Hashing64::HashFixed(
+ /*combine_hashes=*/false, num_rows_to_hash, key_length,
+ keys->data()->GetValues<uint8_t>(1) + start_row * key_length,
hashes64[i].data());
+ }
+
+ // Verify that all implementations (scalar, SIMD) give the same hashes.
+ for (int i = 1; i < static_cast<int>(hardware_flags_for_testing.size());
++i) {
+ for (int j = 0; j < num_rows_to_hash; ++j) {
+ ASSERT_EQ(hashes32[i][j], hashes32[0][j])
+ << "scalar and simd approaches yielded different 32-bit hashes";
+ ASSERT_EQ(hashes64[i][j], hashes64[0][j])
+ << "scalar and simd approaches yielded different 64-bit hashes";
+ }
+ }
+}
+
+// Some carefully chosen cases that may cause trouble like GH-39778.
+TEST(VectorHash, FixedLengthTailByteSafety) {
+  // Two cases of key_length < stripe (16-byte).
+ HashFixedLengthFrom(/*key_length=*/3, /*num_rows=*/1450, /*start_row=*/1447);
+ HashFixedLengthFrom(/*key_length=*/5, /*num_rows=*/883, /*start_row=*/858);
+ // Case of key_length > stripe (16-byte).
+ HashFixedLengthFrom(/*key_length=*/19, /*num_rows=*/64, /*start_row=*/63);
+}
+
} // namespace compute
} // namespace arrow