martin-g commented on code in PR #18981:
URL: https://github.com/apache/datafusion/pull/18981#discussion_r2571734759
##########
datafusion/common/src/hash_utils.rs:
##########
@@ -484,6 +485,40 @@ fn hash_fixed_list_array(
Ok(())
}
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_run_array<R: RunEndIndexType>(
+ array: &RunArray<R>,
+ random_state: &RandomState,
+ hashes_buffer: &mut [u64],
+ rehash: bool,
+) -> Result<()> {
+ let values = array.values();
+ let values_len = values.len();
+ let mut values_hashes = vec![0u64; values_len];
+ create_hashes(&[Arc::clone(values)], random_state, &mut values_hashes)?;
+
+ let run_ends = array.run_ends();
+ let mut prev_run_end = 0;
+
+ for (i, value_hash) in values_hashes.iter().enumerate().take(values_len) {
+ let run_end = run_ends.values()[i].as_usize();
+
+ if rehash {
+ for hash in
hashes_buffer.iter_mut().take(run_end).skip(prev_run_end) {
+ *hash = combine_hashes(*value_hash, *hash);
+ }
+ } else {
+ for hash in
hashes_buffer.iter_mut().take(run_end).skip(prev_run_end) {
Review Comment:
This loop could be replaced with
`hashes_buffer[prev_run_end..run_end].fill(*value_hash)` (SIMD friendly!).
Note that `value_hash` is a `&u64` here, so it has to be dereferenced.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]