Jefffrey commented on code in PR #19500:
URL: https://github.com/apache/datafusion/pull/19500#discussion_r2647783500
##########
datafusion/common/src/hash_utils.rs:
##########
@@ -513,24 +514,41 @@ fn hash_list_array<OffsetSize>(
where
OffsetSize: OffsetSizeTrait,
{
- let values = array.values();
- let offsets = array.value_offsets();
- let nulls = array.nulls();
- let mut values_hashes = vec![0u64; values.len()];
- create_hashes([values], random_state, &mut values_hashes)?;
- if let Some(nulls) = nulls {
- for (i, (start, stop)) in
offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
- if nulls.is_valid(i) {
+ // In case values is sliced, hash only the bytes used by the offsets of
this ListArray
+ let first_offset =
array.value_offsets().first().cloned().unwrap_or_default();
+ let last_offset =
array.value_offsets().last().cloned().unwrap_or_default();
+ let value_bytes_len = (last_offset - first_offset).as_usize();
+ let mut values_hashes = vec![0u64; value_bytes_len];
+ create_hashes(
+ [array
+ .values()
+ .slice(first_offset.as_usize(), value_bytes_len)],
+ random_state,
+ &mut values_hashes,
+ )?;
+
+ if array.null_count() > 0 {
+ for (i, (start, stop)) in
array.value_offsets().iter().tuple_windows().enumerate()
Review Comment:
Using `tuple_windows` from itertools makes iterating over adjacent offset pairs more ergonomic.
##########
datafusion/common/src/hash_utils.rs:
##########
@@ -513,24 +514,41 @@ fn hash_list_array<OffsetSize>(
where
OffsetSize: OffsetSizeTrait,
{
- let values = array.values();
- let offsets = array.value_offsets();
- let nulls = array.nulls();
- let mut values_hashes = vec![0u64; values.len()];
- create_hashes([values], random_state, &mut values_hashes)?;
- if let Some(nulls) = nulls {
- for (i, (start, stop)) in
offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
- if nulls.is_valid(i) {
+ // In case values is sliced, hash only the bytes used by the offsets of
this ListArray
+ let first_offset =
array.value_offsets().first().cloned().unwrap_or_default();
+ let last_offset =
array.value_offsets().last().cloned().unwrap_or_default();
+ let value_bytes_len = (last_offset - first_offset).as_usize();
+ let mut values_hashes = vec![0u64; value_bytes_len];
+ create_hashes(
+ [array
+ .values()
+ .slice(first_offset.as_usize(), value_bytes_len)],
+ random_state,
+ &mut values_hashes,
+ )?;
+
+ if array.null_count() > 0 {
Review Comment:
Switching to this count probably doesn't affect much (how often do we see a
null buffer that's present but has all bits valid?), but it's consistent with how
we check for nulls in the other functions.
##########
datafusion/common/src/hash_utils.rs:
##########
@@ -513,24 +514,41 @@ fn hash_list_array<OffsetSize>(
where
OffsetSize: OffsetSizeTrait,
{
- let values = array.values();
- let offsets = array.value_offsets();
- let nulls = array.nulls();
- let mut values_hashes = vec![0u64; values.len()];
- create_hashes([values], random_state, &mut values_hashes)?;
- if let Some(nulls) = nulls {
- for (i, (start, stop)) in
offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
- if nulls.is_valid(i) {
+ // In case values is sliced, hash only the bytes used by the offsets of
this ListArray
+ let first_offset =
array.value_offsets().first().cloned().unwrap_or_default();
+ let last_offset =
array.value_offsets().last().cloned().unwrap_or_default();
+ let value_bytes_len = (last_offset - first_offset).as_usize();
+ let mut values_hashes = vec![0u64; value_bytes_len];
+ create_hashes(
+ [array
+ .values()
+ .slice(first_offset.as_usize(), value_bytes_len)],
+ random_state,
+ &mut values_hashes,
+ )?;
Review Comment:
Main change here
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]