This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 003813ae30 Optimize Statistics::projection (#13225)
003813ae30 is described below

commit 003813ae307aa24a0d8369dba9977355431b799d
Author: Andrew Lamb <[email protected]>
AuthorDate: Tue Nov 5 10:54:33 2024 -0500

    Optimize Statistics::projection (#13225)
---
 datafusion/common/src/stats.rs | 78 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 5 deletions(-)

diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index 1aa42705e7..d2ce965c5c 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -268,13 +268,35 @@ impl Statistics {
             return self;
         };
 
-        // todo: it would be nice to avoid cloning column statistics if
-        // possible (e.g. if the projection did not contain duplicates)
-        self.column_statistics = projection
-            .iter()
-            .map(|&i| self.column_statistics[i].clone())
+        enum Slot {
+            /// The column is taken and put into the specified statistics 
location
+            Taken(usize),
+            /// The original columns is present
+            Present(ColumnStatistics),
+        }
+
+        // Convert to Vec<Slot> so we can avoid copying the statistics
+        let mut columns: Vec<_> = std::mem::take(&mut self.column_statistics)
+            .into_iter()
+            .map(Slot::Present)
             .collect();
 
+        for idx in projection {
+            let next_idx = self.column_statistics.len();
+            let slot = std::mem::replace(
+                columns.get_mut(*idx).expect("projection out of bounds"),
+                Slot::Taken(next_idx),
+            );
+            match slot {
+                // The column was there, so just move it
+                Slot::Present(col) => self.column_statistics.push(col),
+                // The column was taken, so copy from the previous location
+                Slot::Taken(prev_idx) => self
+                    .column_statistics
+                    .push(self.column_statistics[prev_idx].clone()),
+            }
+        }
+
         self
     }
 
@@ -581,4 +603,50 @@ mod tests {
         let p2 = precision.clone();
         assert_eq!(precision, p2);
     }
+
+    #[test]
+    fn test_project_none() {
+        let projection = None;
+        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
+        assert_eq!(stats, make_stats(vec![10, 20, 30]));
+    }
+
+    #[test]
+    fn test_project_empty() {
+        let projection = Some(vec![]);
+        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
+        assert_eq!(stats, make_stats(vec![]));
+    }
+
+    #[test]
+    fn test_project_swap() {
+        let projection = Some(vec![2, 1]);
+        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
+        assert_eq!(stats, make_stats(vec![30, 20]));
+    }
+
+    #[test]
+    fn test_project_repeated() {
+        let projection = Some(vec![1, 2, 1, 1, 0, 2]);
+        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
+        assert_eq!(stats, make_stats(vec![20, 30, 20, 20, 10, 30]));
+    }
+
+    // Make a Statistics structure with the specified null counts for each 
column
+    fn make_stats(counts: impl IntoIterator<Item = usize>) -> Statistics {
+        Statistics {
+            num_rows: Precision::Exact(42),
+            total_byte_size: Precision::Exact(500),
+            column_statistics: counts.into_iter().map(col_stats_i64).collect(),
+        }
+    }
+
+    fn col_stats_i64(null_count: usize) -> ColumnStatistics {
+        ColumnStatistics {
+            null_count: Precision::Exact(null_count),
+            max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
+            min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
+            distinct_count: Precision::Exact(100),
+        }
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to