This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 6e568972e Add setting for statistics collection (#3846)
6e568972e is described below

commit 6e568972ed2def586892aaaee12934ea7dba1db6
Author: Daniël Heres <[email protected]>
AuthorDate: Tue Oct 18 15:36:00 2022 +0200

    Add setting for statistics collection (#3846)
    
    * Add setting for statistics collection
    
    * fmt
---
 datafusion/core/src/execution/context.rs | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs
index 8468483eb..f7fb0eb90 100644
--- a/datafusion/core/src/execution/context.rs
+++ b/datafusion/core/src/execution/context.rs
@@ -484,7 +484,7 @@ impl SessionContext {
                 };
                 let options = ListingOptions {
                     format: file_format,
-                    collect_stat: false,
+                    collect_stat: self.copied_config().collect_statistics,
                     file_extension: file_extension.to_owned(),
                     target_partitions: self.copied_config().target_partitions,
                     table_partition_cols: cmd.table_partition_cols.clone(),
@@ -1085,6 +1085,8 @@ pub const REPARTITION_AGGREGATIONS: &str = "repartition_aggregations";
 pub const REPARTITION_WINDOWS: &str = "repartition_windows";
 /// Session Configuration entry name for 'PARQUET_PRUNING'
 pub const PARQUET_PRUNING: &str = "parquet_pruning";
+/// Session Configuration entry name for 'COLLECT_STATISTICS'
+pub const COLLECT_STATISTICS: &str = "collect_statistics";
 
 /// Map that holds opaque objects indexed by their type.
 ///
@@ -1142,6 +1144,8 @@ pub struct SessionConfig {
     pub repartition_windows: bool,
     /// Should DataFusion parquet reader using the predicate to prune data
     pub parquet_pruning: bool,
+    /// Should DataFusion collect statistics after listing files
+    pub collect_statistics: bool,
     /// Configuration options
     pub config_options: Arc<RwLock<ConfigOptions>>,
     /// Opaque extensions.
@@ -1160,6 +1164,7 @@ impl Default for SessionConfig {
             repartition_aggregations: true,
             repartition_windows: true,
             parquet_pruning: true,
+            collect_statistics: false,
             config_options: Arc::new(RwLock::new(ConfigOptions::new())),
             // Assume no extensions by default.
             extensions: HashMap::with_capacity_and_hasher(
@@ -1262,6 +1267,12 @@ impl SessionConfig {
         self
     }
 
+    /// Enables or disables the collection of statistics after listing files
+    pub fn with_collect_statistics(mut self, enabled: bool) -> Self {
+        self.collect_statistics = enabled;
+        self
+    }
+
     /// Get the currently configured batch size
     pub fn batch_size(&self) -> usize {
         self.config_options
@@ -1305,6 +1316,11 @@ impl SessionConfig {
             PARQUET_PRUNING.to_owned(),
             format!("{}", self.parquet_pruning),
         );
+        map.insert(
+            COLLECT_STATISTICS.to_owned(),
+            format!("{}", self.collect_statistics),
+        );
+
         map
     }
 
@@ -1764,6 +1780,9 @@ impl TaskContext {
                         .with_parquet_pruning(
                             props.get(PARQUET_PRUNING).unwrap().parse().unwrap(),
                         )
+                        .with_collect_statistics(
+                            props.get(COLLECT_STATISTICS).unwrap().parse().unwrap(),
+                        )
                 }
             }
             TaskProperties::SessionConfig(session_config) => session_config.clone(),

Reply via email to