This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 7316274443 Minor: Fix incorrect indices for hashing struct (#8775)
7316274443 is described below
commit 73162744431317f19abd562484372b113ffc2846
Author: Jay Zhan <[email protected]>
AuthorDate: Sat Jan 6 22:34:57 2024 +0800
Minor: Fix incorrect indices for hashing struct (#8775)
* fix bug
Signed-off-by: jayzhan211 <[email protected]>
* fmt
Signed-off-by: jayzhan211 <[email protected]>
* add rowsort
Signed-off-by: jayzhan211 <[email protected]>
---------
Signed-off-by: jayzhan211 <[email protected]>
---
datafusion/common/src/hash_utils.rs | 46 +++++++++++++++++++----
datafusion/sqllogictest/test_files/dictionary.slt | 2 +-
2 files changed, 39 insertions(+), 9 deletions(-)
diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs
index 5c36f41a6e..8dcc00ca1c 100644
--- a/datafusion/common/src/hash_utils.rs
+++ b/datafusion/common/src/hash_utils.rs
@@ -214,22 +214,19 @@ fn hash_struct_array(
hashes_buffer: &mut [u64],
) -> Result<()> {
let nulls = array.nulls();
- let num_columns = array.num_columns();
+ let row_len = array.len();
- // Skip null columns
- let valid_indices: Vec<usize> = if let Some(nulls) = nulls {
+ let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls {
nulls.valid_indices().collect()
} else {
- (0..num_columns).collect()
+ (0..row_len).collect()
};
// Create hashes for each row that combines the hashes over all the column at that row.
- // array.len() is the number of rows.
- let mut values_hashes = vec![0u64; array.len()];
+ let mut values_hashes = vec![0u64; row_len];
create_hashes(array.columns(), random_state, &mut values_hashes)?;
- // Skip the null columns, nulls should get hash value 0.
- for i in valid_indices {
+ for i in valid_row_indices {
let hash = &mut hashes_buffer[i];
*hash = combine_hashes(*hash, values_hashes[i]);
}
@@ -601,6 +598,39 @@ mod tests {
assert_eq!(hashes[4], hashes[5]);
}
+ #[test]
+ // Tests actual values of hashes, which are different if forcing collisions
+ #[cfg(not(feature = "force_hash_collisions"))]
+ fn create_hashes_for_struct_arrays_more_column_than_row() {
+ let struct_array = StructArray::from(vec![
+ (
+ Arc::new(Field::new("bool", DataType::Boolean, false)),
+ Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("i32-1", DataType::Int32, false)),
+ Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("i32-2", DataType::Int32, false)),
+ Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("i32-3", DataType::Int32, false)),
+ Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
+ ),
+ ]);
+
+ assert!(struct_array.is_valid(0));
+ assert!(struct_array.is_valid(1));
+
+ let array = Arc::new(struct_array) as ArrayRef;
+ let random_state = RandomState::with_seeds(0, 0, 0, 0);
+ let mut hashes = vec![0; array.len()];
+ create_hashes(&[array], &random_state, &mut hashes).unwrap();
+ assert_eq!(hashes[0], hashes[1]);
+ }
+
#[test]
// Tests actual values of hashes, which are different if forcing collisions
#[cfg(not(feature = "force_hash_collisions"))]
diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt
index d4ad46711b..b7f375dd6c 100644
--- a/datafusion/sqllogictest/test_files/dictionary.slt
+++ b/datafusion/sqllogictest/test_files/dictionary.slt
@@ -148,7 +148,7 @@ select count(*) from m1 where tag_id = '1000' and time < '2024-01-03T14:46:35+01
----
10
-query RRR
+query RRR rowsort
select min(f5), max(f5), avg(f5) from m2 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00' group by type;
----
100 600 350