alamb commented on code in PR #6816:
URL: https://github.com/apache/arrow-datafusion/pull/6816#discussion_r1249453560
##########
datafusion/physical-expr/src/hash_utils.rs:
##########
@@ -84,35 +84,75 @@ macro_rules! hash_float_value {
}
hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
+fn hash_array_primitive<T>(
+ array: &PrimitiveArray<T>,
+ random_state: &RandomState,
+ hashes_buffer: &mut [u64],
+ rehash: bool,
+) where
+ T: ArrowPrimitiveType,
+ <T as arrow_array::ArrowPrimitiveType>::Native: HashValue,
+{
+ if array.null_count() == 0 {
+ if rehash {
+ for (hash, &value) in
hashes_buffer.iter_mut().zip(array.values().iter()) {
+ *hash = combine_hashes(value.hash_one(random_state), *hash);
+ }
+ } else {
+ for (hash, &value) in
hashes_buffer.iter_mut().zip(array.values().iter()) {
+ *hash = value.hash_one(random_state);
+ }
+ }
+ } else if rehash {
+ for (i, hash) in hashes_buffer.iter_mut().enumerate() {
+ if !array.is_null(i) {
+ let value = unsafe { array.value_unchecked(i) };
+ *hash = combine_hashes(value.hash_one(random_state), *hash);
+ }
+ }
+ } else {
+ for (i, hash) in hashes_buffer.iter_mut().enumerate() {
+ if !array.is_null(i) {
+ let value = unsafe { array.value_unchecked(i) };
+ *hash = value.hash_one(random_state);
+ }
+ }
+ }
+}
+
fn hash_array<T>(
Review Comment:
Could you please add some documentation comments here expalining what
`rehash` means ? I think it would make the code easier to understand on a
subsequent read
##########
datafusion/physical-expr/src/hash_utils.rs:
##########
@@ -84,35 +84,75 @@ macro_rules! hash_float_value {
}
hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
+fn hash_array_primitive<T>(
+ array: &PrimitiveArray<T>,
+ random_state: &RandomState,
+ hashes_buffer: &mut [u64],
+ rehash: bool,
+) where
+ T: ArrowPrimitiveType,
+ <T as arrow_array::ArrowPrimitiveType>::Native: HashValue,
+{
+ if array.null_count() == 0 {
+ if rehash {
+ for (hash, &value) in
hashes_buffer.iter_mut().zip(array.values().iter()) {
+ *hash = combine_hashes(value.hash_one(random_state), *hash);
+ }
+ } else {
+ for (hash, &value) in
hashes_buffer.iter_mut().zip(array.values().iter()) {
+ *hash = value.hash_one(random_state);
+ }
+ }
+ } else if rehash {
+ for (i, hash) in hashes_buffer.iter_mut().enumerate() {
+ if !array.is_null(i) {
+ let value = unsafe { array.value_unchecked(i) };
+ *hash = combine_hashes(value.hash_one(random_state), *hash);
+ }
+ }
+ } else {
+ for (i, hash) in hashes_buffer.iter_mut().enumerate() {
+ if !array.is_null(i) {
+ let value = unsafe { array.value_unchecked(i) };
+ *hash = value.hash_one(random_state);
+ }
+ }
+ }
+}
+
fn hash_array<T>(
array: T,
random_state: &RandomState,
hashes_buffer: &mut [u64],
- multi_col: bool,
+ rehash: bool,
) where
T: ArrayAccessor,
T::Item: HashValue,
{
if array.null_count() == 0 {
- if multi_col {
+ if rehash {
Review Comment:
Perhaps we can add some comments with a safety justification here for using
`unsafe`
It would make me feel better too to check the lengths at least onece
like
```rust
assert_eq!(hashes_buffer.len(), array.len())
for (i, hash) in hashes_buffer.iter_mut().enumerate() {
// SAFETY: Validated length above
let value = unsafe { array.value_unchecked(i) };
*hash = combine_hashes(value.hash_one(random_state),
*hash);```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]