brancz commented on code in PR #18981:
URL: https://github.com/apache/datafusion/pull/18981#discussion_r2581549902


##########
datafusion/common/src/hash_utils.rs:
##########
@@ -484,6 +484,90 @@ fn hash_fixed_list_array(
     Ok(())
 }
 
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_run_array<R: RunEndIndexType>(
+    array: &RunArray<R>,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+    rehash: bool,
+) -> Result<()> {
+    // We find the relevant runs that cover potentially sliced arrays, so we 
can only hash those
+    // values. Then we find the runs that refer to the original runs and 
ensure that we apply
+    // hashes correctly to the sliced, whether sliced at the start, end, or 
both.
+    let array_offset = array.offset();
+    let array_len = array.len();
+
+    if array_len == 0 {
+        return Ok(());
+    }
+
+    let run_ends = array.run_ends();
+    let run_ends_values = run_ends.values();
+    let values = array.values();
+
+    let start_physical_index = array.get_start_physical_index();
+    // get_end_physical_index returns the inclusive last index, but we need 
the exclusive range end
+    // for the operations we use below.
+    let end_physical_index = array.get_end_physical_index() + 1;
+
+    let sliced_values = values.slice(
+        start_physical_index,
+        end_physical_index - start_physical_index,
+    );
+    let mut values_hashes = vec![0u64; sliced_values.len()];
+    create_hashes(
+        std::slice::from_ref(&sliced_values),
+        random_state,
+        &mut values_hashes,
+    )?;
+
+    let mut logical_position = 0;
+    for (adjusted_physical_index, &absolute_run_end) in run_ends_values
+        [start_physical_index..end_physical_index]
+        .iter()
+        .enumerate()
+    {
+        let is_null_value = sliced_values.is_null(adjusted_physical_index);
+        let absolute_run_end = absolute_run_end.as_usize();
+
+        let start_in_slice = if absolute_run_end > array_offset {
+            logical_position
+        } else {
+            continue;
+        };

Review Comment:
   looking at it with fresh eyes, I think this must have been a defensive 
leftover from when I was calculating/handling slicing/offset manually, but I 
think we need to rely on arrow being correct and I think with the tests we have 
we're well covered



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to