This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 190634bee1 Remove redundant statistics from FileScanConfig (#14955)
190634bee1 is described below

commit 190634bee1093d9d71786aa9c98ec207be05ea72
Author: Alan Tang <jmtan...@gmail.com>
AuthorDate: Thu Apr 3 19:34:00 2025 +0800

    Remove redundant statistics from FileScanConfig (#14955)
    
    * enhance: Remove redundant statistics from FileScanConfig
    
    Signed-off-by: Alan Tang <jmtan...@gmail.com>
    
    * chore: fix some ci error
    
    Signed-off-by: Alan Tang <jmtan...@gmail.com>
    
    chore: fix fmt and clippy errors
    
    ---------
    
    Signed-off-by: Alan Tang <jmtan...@gmail.com>
---
 datafusion/datasource/src/file_scan_config.rs  | 45 +++++++++++++++-----------
 datafusion/proto/src/physical_plan/to_proto.rs |  2 +-
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/datafusion/datasource/src/file_scan_config.rs 
b/datafusion/datasource/src/file_scan_config.rs
index 729283289c..6720d77dea 100644
--- a/datafusion/datasource/src/file_scan_config.rs
+++ b/datafusion/datasource/src/file_scan_config.rs
@@ -151,9 +151,6 @@ pub struct FileScanConfig {
     pub file_groups: Vec<FileGroup>,
     /// Table constraints
     pub constraints: Constraints,
-    /// Estimated overall statistics of the files, taking `filters` into 
account.
-    /// Defaults to [`Statistics::new_unknown`].
-    pub statistics: Statistics,
     /// Columns on which to project the data. Indexes that are higher than the
     /// number of columns of `file_schema` refer to `table_partition_cols`.
     pub projection: Option<Vec<usize>>,
@@ -412,7 +409,6 @@ impl FileScanConfigBuilder {
             table_partition_cols,
             constraints,
             file_groups,
-            statistics,
             output_ordering,
             file_compression_type,
             new_lines_in_values,
@@ -426,9 +422,9 @@ impl From<FileScanConfig> for FileScanConfigBuilder {
         Self {
             object_store_url: config.object_store_url,
             file_schema: config.file_schema,
-            file_source: config.file_source,
+            file_source: Arc::<dyn FileSource>::clone(&config.file_source),
             file_groups: config.file_groups,
-            statistics: Some(config.statistics),
+            statistics: config.file_source.statistics().ok(),
             output_ordering: config.output_ordering,
             file_compression_type: Some(config.file_compression_type),
             new_lines_in_values: Some(config.new_lines_in_values),
@@ -610,7 +606,6 @@ impl FileScanConfig {
             file_schema,
             file_groups: vec![],
             constraints: Constraints::empty(),
-            statistics,
             projection: None,
             limit: None,
             table_partition_cols: vec![],
@@ -625,7 +620,8 @@ impl FileScanConfig {
     /// Set the file source
     #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
     pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self {
-        self.file_source = 
file_source.with_statistics(self.statistics.clone());
+        self.file_source =
+            
file_source.with_statistics(Statistics::new_unknown(&self.file_schema));
         self
     }
 
@@ -639,7 +635,6 @@ impl FileScanConfig {
     /// Set the statistics of the files
     #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
     pub fn with_statistics(mut self, statistics: Statistics) -> Self {
-        self.statistics = statistics.clone();
         self.file_source = self.file_source.with_statistics(statistics);
         self
     }
@@ -654,10 +649,7 @@ impl FileScanConfig {
     }
 
     fn projected_stats(&self) -> Statistics {
-        let statistics = self
-            .file_source
-            .statistics()
-            .unwrap_or(self.statistics.clone());
+        let statistics = self.file_source.statistics().unwrap();
 
         let table_cols_stats = self
             .projection_indices()
@@ -804,7 +796,7 @@ impl FileScanConfig {
             return (
                 Arc::clone(&self.file_schema),
                 self.constraints.clone(),
-                self.statistics.clone(),
+                self.file_source.statistics().unwrap().clone(),
                 self.output_ordering.clone(),
             );
         }
@@ -949,7 +941,11 @@ impl Debug for FileScanConfig {
         write!(f, "FileScanConfig {{")?;
         write!(f, "object_store_url={:?}, ", self.object_store_url)?;
 
-        write!(f, "statistics={:?}, ", self.statistics)?;
+        write!(
+            f,
+            "statistics={:?}, ",
+            self.file_source.statistics().unwrap()
+        )?;
 
         DisplayAs::fmt_as(self, DisplayFormatType::Verbose, f)?;
         write!(f, "}}")
@@ -2161,13 +2157,24 @@ mod tests {
         assert!(config.constraints.is_empty());
 
         // Verify statistics are set to unknown
-        assert_eq!(config.statistics.num_rows, Precision::Absent);
-        assert_eq!(config.statistics.total_byte_size, Precision::Absent);
         assert_eq!(
-            config.statistics.column_statistics.len(),
+            config.file_source.statistics().unwrap().num_rows,
+            Precision::Absent
+        );
+        assert_eq!(
+            config.file_source.statistics().unwrap().total_byte_size,
+            Precision::Absent
+        );
+        assert_eq!(
+            config
+                .file_source
+                .statistics()
+                .unwrap()
+                .column_statistics
+                .len(),
             file_schema.fields().len()
         );
-        for stat in config.statistics.column_statistics {
+        for stat in config.file_source.statistics().unwrap().column_statistics 
{
             assert_eq!(stat.distinct_count, Precision::Absent);
             assert_eq!(stat.min_value, Precision::Absent);
             assert_eq!(stat.max_value, Precision::Absent);
diff --git a/datafusion/proto/src/physical_plan/to_proto.rs 
b/datafusion/proto/src/physical_plan/to_proto.rs
index 1384e6c0c3..f6546ff3f2 100644
--- a/datafusion/proto/src/physical_plan/to_proto.rs
+++ b/datafusion/proto/src/physical_plan/to_proto.rs
@@ -507,7 +507,7 @@ pub fn serialize_file_scan_config(
 
     Ok(protobuf::FileScanExecConf {
         file_groups,
-        statistics: Some((&conf.statistics).into()),
+        statistics: Some((&conf.file_source.statistics().unwrap()).into()),
         limit: conf.limit.map(|l| protobuf::ScanLimit { limit: l as u32 }),
         projection: conf
             .projection


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org

Reply via email to