This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new cfa7154a69 fix `null_count` on `compute_record_batch_statistics` to 
report null counts across partitions (#10468)
cfa7154a69 is described below

commit cfa7154a695cb1a6b3feb348d7e7958c5c94ae8a
Author: Samuel Colvin <[email protected]>
AuthorDate: Thu May 16 20:33:50 2024 -0400

    fix `null_count` on `compute_record_batch_statistics` to report null counts 
across partitions (#10468)
    
    * fix null_count on compute_record_batch_statistics
    
    * fmt
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/physical-plan/src/common.rs | 44 +++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/datafusion/physical-plan/src/common.rs 
b/datafusion/physical-plan/src/common.rs
index 9e2216ae0a..c61e9a05bf 100644
--- a/datafusion/physical-plan/src/common.rs
+++ b/datafusion/physical-plan/src/common.rs
@@ -153,16 +153,23 @@ pub fn compute_record_batch_statistics(
         })
         .sum();
 
-    let mut column_statistics = vec![ColumnStatistics::new_unknown(); 
projection.len()];
+    let mut null_counts = vec![0; projection.len()];
 
     for partition in batches.iter() {
         for batch in partition {
             for (stat_index, col_index) in projection.iter().enumerate() {
-                column_statistics[stat_index].null_count =
-                    Precision::Exact(batch.column(*col_index).null_count());
+                null_counts[stat_index] += 
batch.column(*col_index).null_count();
             }
         }
     }
+    let column_statistics = null_counts
+        .into_iter()
+        .map(|null_count| {
+            let mut s = ColumnStatistics::new_unknown();
+            s.null_count = Precision::Exact(null_count);
+            s
+        })
+        .collect();
 
     Statistics {
         num_rows: Precision::Exact(nb_rows),
@@ -687,4 +694,35 @@ mod tests {
         assert_eq!(actual, expected);
         Ok(())
     }
+
+    #[test]
+    fn test_compute_record_batch_statistics_null() -> Result<()> {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("u64", DataType::UInt64, 
true)]));
+        let batch1 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(UInt64Array::from(vec![Some(1), None, None]))],
+        )?;
+        let batch2 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(UInt64Array::from(vec![Some(1), Some(2), None]))],
+        )?;
+        let byte_size = batch1.get_array_memory_size() + 
batch2.get_array_memory_size();
+        let actual =
+            compute_record_batch_statistics(&[vec![batch1], vec![batch2]], 
&schema, None);
+
+        let expected = Statistics {
+            num_rows: Precision::Exact(6),
+            total_byte_size: Precision::Exact(byte_size),
+            column_statistics: vec![ColumnStatistics {
+                distinct_count: Precision::Absent,
+                max_value: Precision::Absent,
+                min_value: Precision::Absent,
+                null_count: Precision::Exact(3),
+            }],
+        };
+
+        assert_eq!(actual, expected);
+        Ok(())
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to