This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 7316274443 Minor: Fix incorrect indices for hashing struct (#8775)
7316274443 is described below

commit 73162744431317f19abd562484372b113ffc2846
Author: Jay Zhan <[email protected]>
AuthorDate: Sat Jan 6 22:34:57 2024 +0800

    Minor: Fix incorrect indices for hashing struct (#8775)
    
    * fix bug
    
    Signed-off-by: jayzhan211 <[email protected]>
    
    * fmt
    
    Signed-off-by: jayzhan211 <[email protected]>
    
    * add rowsort
    
    Signed-off-by: jayzhan211 <[email protected]>
    
    ---------
    
    Signed-off-by: jayzhan211 <[email protected]>
---
 datafusion/common/src/hash_utils.rs               | 46 +++++++++++++++++++----
 datafusion/sqllogictest/test_files/dictionary.slt |  2 +-
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs
index 5c36f41a6e..8dcc00ca1c 100644
--- a/datafusion/common/src/hash_utils.rs
+++ b/datafusion/common/src/hash_utils.rs
@@ -214,22 +214,19 @@ fn hash_struct_array(
     hashes_buffer: &mut [u64],
 ) -> Result<()> {
     let nulls = array.nulls();
-    let num_columns = array.num_columns();
+    let row_len = array.len();
 
-    // Skip null columns
-    let valid_indices: Vec<usize> = if let Some(nulls) = nulls {
+    let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls {
         nulls.valid_indices().collect()
     } else {
-        (0..num_columns).collect()
+        (0..row_len).collect()
     };
 
     // Create hashes for each row that combines the hashes over all the column at that row.
-    // array.len() is the number of rows.
-    let mut values_hashes = vec![0u64; array.len()];
+    let mut values_hashes = vec![0u64; row_len];
     create_hashes(array.columns(), random_state, &mut values_hashes)?;
 
-    // Skip the null columns, nulls should get hash value 0.
-    for i in valid_indices {
+    for i in valid_row_indices {
         let hash = &mut hashes_buffer[i];
         *hash = combine_hashes(*hash, values_hashes[i]);
     }
@@ -601,6 +598,39 @@ mod tests {
         assert_eq!(hashes[4], hashes[5]);
     }
 
+    #[test]
+    // Tests actual values of hashes, which are different if forcing collisions
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_struct_arrays_more_column_than_row() {
+        let struct_array = StructArray::from(vec![
+            (
+                Arc::new(Field::new("bool", DataType::Boolean, false)),
+                Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
+            ),
+            (
+                Arc::new(Field::new("i32-1", DataType::Int32, false)),
+                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
+            ),
+            (
+                Arc::new(Field::new("i32-2", DataType::Int32, false)),
+                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
+            ),
+            (
+                Arc::new(Field::new("i32-3", DataType::Int32, false)),
+                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
+            ),
+        ]);
+
+        assert!(struct_array.is_valid(0));
+        assert!(struct_array.is_valid(1));
+
+        let array = Arc::new(struct_array) as ArrayRef;
+        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let mut hashes = vec![0; array.len()];
+        create_hashes(&[array], &random_state, &mut hashes).unwrap();
+        assert_eq!(hashes[0], hashes[1]);
+    }
+
     #[test]
     // Tests actual values of hashes, which are different if forcing collisions
     #[cfg(not(feature = "force_hash_collisions"))]
diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt
index d4ad46711b..b7f375dd6c 100644
--- a/datafusion/sqllogictest/test_files/dictionary.slt
+++ b/datafusion/sqllogictest/test_files/dictionary.slt
@@ -148,7 +148,7 @@ select count(*) from m1 where tag_id = '1000' and time < '2024-01-03T14:46:35+01
 ----
 10
 
-query RRR
+query RRR rowsort
 select min(f5), max(f5), avg(f5) from m2 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00' group by type;
 ----
 100 600 350

Reply via email to