This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new cfa7154a69 fix `null_count` on `compute_record_batch_statistics` to report null counts across partitions (#10468)
cfa7154a69 is described below
commit cfa7154a695cb1a6b3feb348d7e7958c5c94ae8a
Author: Samuel Colvin <[email protected]>
AuthorDate: Thu May 16 20:33:50 2024 -0400
fix `null_count` on `compute_record_batch_statistics` to report null counts across partitions (#10468)
* fix null_count on compute_record_batch_statistics
* fmt
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/physical-plan/src/common.rs | 44 +++++++++++++++++++++++++++++++---
1 file changed, 41 insertions(+), 3 deletions(-)
diff --git a/datafusion/physical-plan/src/common.rs b/datafusion/physical-plan/src/common.rs
index 9e2216ae0a..c61e9a05bf 100644
--- a/datafusion/physical-plan/src/common.rs
+++ b/datafusion/physical-plan/src/common.rs
@@ -153,16 +153,23 @@ pub fn compute_record_batch_statistics(
})
.sum();
- let mut column_statistics = vec![ColumnStatistics::new_unknown(); projection.len()];
+ let mut null_counts = vec![0; projection.len()];
for partition in batches.iter() {
for batch in partition {
for (stat_index, col_index) in projection.iter().enumerate() {
- column_statistics[stat_index].null_count =
- Precision::Exact(batch.column(*col_index).null_count());
+ null_counts[stat_index] += batch.column(*col_index).null_count();
}
}
}
+ let column_statistics = null_counts
+ .into_iter()
+ .map(|null_count| {
+ let mut s = ColumnStatistics::new_unknown();
+ s.null_count = Precision::Exact(null_count);
+ s
+ })
+ .collect();
Statistics {
num_rows: Precision::Exact(nb_rows),
@@ -687,4 +694,35 @@ mod tests {
assert_eq!(actual, expected);
Ok(())
}
+
+ #[test]
+ fn test_compute_record_batch_statistics_null() -> Result<()> {
+ let schema =
+ Arc::new(Schema::new(vec![Field::new("u64", DataType::UInt64, true)]));
+ let batch1 = RecordBatch::try_new(
+ Arc::clone(&schema),
+ vec![Arc::new(UInt64Array::from(vec![Some(1), None, None]))],
+ )?;
+ let batch2 = RecordBatch::try_new(
+ Arc::clone(&schema),
+ vec![Arc::new(UInt64Array::from(vec![Some(1), Some(2), None]))],
+ )?;
+ let byte_size = batch1.get_array_memory_size() + batch2.get_array_memory_size();
+ let actual =
+ compute_record_batch_statistics(&[vec![batch1], vec![batch2]], &schema, None);
+
+ let expected = Statistics {
+ num_rows: Precision::Exact(6),
+ total_byte_size: Precision::Exact(byte_size),
+ column_statistics: vec![ColumnStatistics {
+ distinct_count: Precision::Absent,
+ max_value: Precision::Absent,
+ min_value: Precision::Absent,
+ null_count: Precision::Exact(3),
+ }],
+ };
+
+ assert_eq!(actual, expected);
+ Ok(())
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]