This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 003813ae30 Optimize Statistics::projection (#13225)
003813ae30 is described below
commit 003813ae307aa24a0d8369dba9977355431b799d
Author: Andrew Lamb <[email protected]>
AuthorDate: Tue Nov 5 10:54:33 2024 -0500
Optimize Statistics::projection (#13225)
---
datafusion/common/src/stats.rs | 78 +++++++++++++++++++++++++++++++++++++++---
1 file changed, 73 insertions(+), 5 deletions(-)
diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index 1aa42705e7..d2ce965c5c 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -268,13 +268,35 @@ impl Statistics {
return self;
};
- // todo: it would be nice to avoid cloning column statistics if
- // possible (e.g. if the projection did not contain duplicates)
- self.column_statistics = projection
- .iter()
- .map(|&i| self.column_statistics[i].clone())
+ enum Slot {
+ /// The column is taken and put into the specified statistics location
+ Taken(usize),
+ /// The original column is present
+ Present(ColumnStatistics),
+ }
+
+ // Convert to Vec<Slot> so we can avoid copying the statistics
+ let mut columns: Vec<_> = std::mem::take(&mut self.column_statistics)
+ .into_iter()
+ .map(Slot::Present)
.collect();
+ for idx in projection {
+ let next_idx = self.column_statistics.len();
+ let slot = std::mem::replace(
+ columns.get_mut(*idx).expect("projection out of bounds"),
+ Slot::Taken(next_idx),
+ );
+ match slot {
+ // The column was there, so just move it
+ Slot::Present(col) => self.column_statistics.push(col),
+ // The column was taken, so copy from the previous location
+ Slot::Taken(prev_idx) => self
+ .column_statistics
+ .push(self.column_statistics[prev_idx].clone()),
+ }
+ }
+
self
}
@@ -581,4 +603,50 @@ mod tests {
let p2 = precision.clone();
assert_eq!(precision, p2);
}
+
+ #[test]
+ fn test_project_none() {
+ let projection = None;
+ let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
+ assert_eq!(stats, make_stats(vec![10, 20, 30]));
+ }
+
+ #[test]
+ fn test_project_empty() {
+ let projection = Some(vec![]);
+ let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
+ assert_eq!(stats, make_stats(vec![]));
+ }
+
+ #[test]
+ fn test_project_swap() {
+ let projection = Some(vec![2, 1]);
+ let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
+ assert_eq!(stats, make_stats(vec![30, 20]));
+ }
+
+ #[test]
+ fn test_project_repeated() {
+ let projection = Some(vec![1, 2, 1, 1, 0, 2]);
+ let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
+ assert_eq!(stats, make_stats(vec![20, 30, 20, 20, 10, 30]));
+ }
+
+ // Make a Statistics structure with the specified null counts for each column
+ fn make_stats(counts: impl IntoIterator<Item = usize>) -> Statistics {
+ Statistics {
+ num_rows: Precision::Exact(42),
+ total_byte_size: Precision::Exact(500),
+ column_statistics: counts.into_iter().map(col_stats_i64).collect(),
+ }
+ }
+
+ fn col_stats_i64(null_count: usize) -> ColumnStatistics {
+ ColumnStatistics {
+ null_count: Precision::Exact(null_count),
+ max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
+ min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
+ distinct_count: Precision::Exact(100),
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]