This is an automated email from the ASF dual-hosted git repository. dheres pushed a commit to branch create_hashes_primitive in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
commit b6ccfa450b86ed31814aaaa1df9a2c52a7513fcb Author: Daniël Heres <[email protected]> AuthorDate: Sat Jul 1 09:47:08 2023 +0200 Only rehash col >=1 --- datafusion/physical-expr/src/hash_utils.rs | 68 ++++++++++++++++-------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/datafusion/physical-expr/src/hash_utils.rs b/datafusion/physical-expr/src/hash_utils.rs index de3526992f..e3cb891902 100644 --- a/datafusion/physical-expr/src/hash_utils.rs +++ b/datafusion/physical-expr/src/hash_utils.rs @@ -88,31 +88,33 @@ fn hash_array_primitve<T>( array: &PrimitiveArray<T>, random_state: &RandomState, hashes_buffer: &mut [u64], - multi_col: bool, + rehash: bool, ) where T: ArrowPrimitiveType, <T as arrow_array::ArrowPrimitiveType>::Native: HashValue, { if array.null_count() == 0 { - if multi_col { - for (hash, &val) in hashes_buffer.iter_mut().zip(array.values().iter()) { - *hash = combine_hashes(val.hash_one(&random_state), *hash); + if rehash { + for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) { + *hash = combine_hashes(value.hash_one(&random_state), *hash); } } else { - for (hash, &val) in hashes_buffer.iter_mut().zip(array.values().iter()) { - *hash = val.hash_one(&random_state); + for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) { + *hash = value.hash_one(&random_state); } } - } else if multi_col { + } else if rehash { for (i, hash) in hashes_buffer.iter_mut().enumerate() { if !array.is_null(i) { - *hash = combine_hashes(array.value(i).hash_one(random_state), *hash); + let value = unsafe {array.value_unchecked(i)}; + *hash = combine_hashes(value.hash_one(random_state), *hash); } } } else { for (i, hash) in hashes_buffer.iter_mut().enumerate() { if !array.is_null(i) { - *hash = array.value(i).hash_one(random_state); + let value = unsafe {array.value_unchecked(i)}; + *hash = value.hash_one(random_state); } } } @@ -122,31 +124,35 @@ fn hash_array<T>( array: T, random_state: &RandomState, 
hashes_buffer: &mut [u64], - multi_col: bool, + rehash: bool, ) where T: ArrayAccessor, T::Item: HashValue, { if array.null_count() == 0 { - if multi_col { + if rehash { for (i, hash) in hashes_buffer.iter_mut().enumerate() { - *hash = combine_hashes(array.value(i).hash_one(random_state), *hash); + let value = unsafe {array.value_unchecked(i)}; + *hash = combine_hashes(value.hash_one(random_state), *hash); } } else { for (i, hash) in hashes_buffer.iter_mut().enumerate() { - *hash = array.value(i).hash_one(random_state); + let value= unsafe {array.value_unchecked(i)}; + *hash = value.hash_one(random_state); } } - } else if multi_col { + } else if rehash { for (i, hash) in hashes_buffer.iter_mut().enumerate() { if !array.is_null(i) { - *hash = combine_hashes(array.value(i).hash_one(random_state), *hash); + let value= unsafe {array.value_unchecked(i)}; + *hash = combine_hashes(value.hash_one(random_state), *hash); } } } else { for (i, hash) in hashes_buffer.iter_mut().enumerate() { if !array.is_null(i) { - *hash = array.value(i).hash_one(random_state); + let value= unsafe {array.value_unchecked(i)}; + *hash = value.hash_one(random_state); } } } @@ -242,34 +248,32 @@ pub fn create_hashes<'a>( random_state: &RandomState, hashes_buffer: &'a mut Vec<u64>, ) -> Result<&'a mut Vec<u64>> { - // combine hashes with `combine_hashes` if we have more than 1 column - - let multi_col = arrays.len() > 1; - - for col in arrays { + for (i, col) in arrays.iter().enumerate() { let array = col.as_ref(); + // combine hashes with `combine_hashes` for all columns besides the first + let rehash = i >= 1; downcast_primitive_array! 
{ - array => hash_array_primitve(array, random_state, hashes_buffer, multi_col), - DataType::Null => hash_null(random_state, hashes_buffer, multi_col), - DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, multi_col), - DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, multi_col), - DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, multi_col), - DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, multi_col), - DataType::LargeBinary => hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, multi_col), + array => hash_array_primitve(array, random_state, hashes_buffer, rehash), + DataType::Null => hash_null(random_state, hashes_buffer, rehash), + DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash), + DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash), + DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash), + DataType::LargeBinary => hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash), DataType::FixedSizeBinary(_) => { let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap(); - hash_array(array, random_state, hashes_buffer, multi_col) + hash_array(array, random_state, hashes_buffer, rehash) } DataType::Decimal128(_, _) => { let array = as_primitive_array::<Decimal128Type>(array)?; - hash_array_primitve(array, random_state, hashes_buffer, multi_col) + hash_array_primitve(array, random_state, hashes_buffer, rehash) } DataType::Decimal256(_, _) => { let array = as_primitive_array::<Decimal256Type>(array)?; - hash_array_primitve(array, random_state, hashes_buffer, multi_col) + hash_array_primitve(array, 
random_state, hashes_buffer, rehash) } DataType::Dictionary(_, _) => downcast_dictionary_array! { - array => hash_dictionary(array, random_state, hashes_buffer, multi_col)?, + array => hash_dictionary(array, random_state, hashes_buffer, rehash)?, _ => unreachable!() } _ => {
