(arrow-datafusion) branch main updated: Dont call multiunzip when no stats (#9220)

alamb Thu, 15 Feb 2024 08:32:55 -0800

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 8aaea5d6eb Dont call multiunzip when no stats (#9220)
8aaea5d6eb is described below

commit 8aaea5d6eb8e918bb876b3b2374e9f847087ce6d
Author: Matthew Turner <[email protected]>
AuthorDate: Thu Feb 15 11:30:59 2024 -0500

    Dont call multiunzip when no stats (#9220)
    
    * Dont call multiunzip when no stats
    
    * Update docstring
---
 datafusion/core/benches/sql_planner.rs          |  2 +-
 datafusion/core/src/datasource/listing/table.rs |  9 +++++++--
 datafusion/core/src/datasource/statistics.rs    | 10 ++++++++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs
index 4615d0a0f5..6f54054530 100644
--- a/datafusion/core/benches/sql_planner.rs
+++ b/datafusion/core/benches/sql_planner.rs
@@ -234,7 +234,7 @@ fn criterion_benchmark(c: &mut Criterion) {
         let sql = std::fs::read_to_string(format!("../../benchmarks/queries/{}.sql", q))
             .unwrap();
         c.bench_function(&format!("physical_plan_tpch_{}", q), |b| {
-            b.iter(|| logical_plan(&ctx, &sql))
+            b.iter(|| physical_plan(&ctx, &sql))
         });
     }
 
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 094b26bfbd..56e64f556c 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -880,8 +880,13 @@ impl ListingTable {
             .boxed()
             .buffered(ctx.config_options().execution.meta_fetch_concurrency);
 
-        let (files, statistics) =
-            get_statistics_with_limit(files, self.schema(), limit).await?;
+        let (files, statistics) = get_statistics_with_limit(
+            files,
+            self.schema(),
+            limit,
+            self.options.collect_stat,
+        )
+        .await?;
 
         Ok((
             split_files(files, self.options.target_partitions),
diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs
index 73896f8eb7..c67227f966 100644
--- a/datafusion/core/src/datasource/statistics.rs
+++ b/datafusion/core/src/datasource/statistics.rs
@@ -29,12 +29,15 @@ use itertools::izip;
 use itertools::multiunzip;
 
 /// Get all files as well as the file level summary statistics (no statistic for partition columns).
-/// If the optional `limit` is provided, includes only sufficient files.
-/// Needed to read up to `limit` number of rows.
+/// If the optional `limit` is provided, includes only sufficient files. Needed to read up to
+/// `limit` number of rows. `collect_stats` is passed down from the configuration parameter on
+/// `ListingTable`. If it is false we only construct bare statistics and skip a potentially expensive
+///  call to `multiunzip` for constructing file level summary statistics.
 pub async fn get_statistics_with_limit(
     all_files: impl Stream<Item = Result<(PartitionedFile, Statistics)>>,
     file_schema: SchemaRef,
     limit: Option<usize>,
+    collect_stats: bool,
 ) -> Result<(Vec<PartitionedFile>, Statistics)> {
     let mut result_files = vec![];
     // These statistics can be calculated as long as at least one file provides
@@ -78,6 +81,9 @@ pub async fn get_statistics_with_limit(
             while let Some(current) = all_files.next().await {
                 let (file, file_stats) = current?;
                 result_files.push(file);
+                if !collect_stats {
+                    continue;
+                }
 
                 // We accumulate the number of rows, total byte size and null
                 // counts across all the files in question. If any file does not

(arrow-datafusion) branch main updated: Dont call multiunzip when no stats (#9220)

Reply via email to