This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 8aaea5d6eb Dont call multiunzip when no stats (#9220)
8aaea5d6eb is described below
commit 8aaea5d6eb8e918bb876b3b2374e9f847087ce6d
Author: Matthew Turner <[email protected]>
AuthorDate: Thu Feb 15 11:30:59 2024 -0500
Dont call multiunzip when no stats (#9220)
* Dont call multiunzip when no stats
* Update docstring
---
datafusion/core/benches/sql_planner.rs | 2 +-
datafusion/core/src/datasource/listing/table.rs | 9 +++++++--
datafusion/core/src/datasource/statistics.rs | 10 ++++++++--
3 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs
index 4615d0a0f5..6f54054530 100644
--- a/datafusion/core/benches/sql_planner.rs
+++ b/datafusion/core/benches/sql_planner.rs
@@ -234,7 +234,7 @@ fn criterion_benchmark(c: &mut Criterion) {
let sql =
std::fs::read_to_string(format!("../../benchmarks/queries/{}.sql", q))
.unwrap();
c.bench_function(&format!("physical_plan_tpch_{}", q), |b| {
- b.iter(|| logical_plan(&ctx, &sql))
+ b.iter(|| physical_plan(&ctx, &sql))
});
}
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 094b26bfbd..56e64f556c 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -880,8 +880,13 @@ impl ListingTable {
.boxed()
.buffered(ctx.config_options().execution.meta_fetch_concurrency);
- let (files, statistics) =
- get_statistics_with_limit(files, self.schema(), limit).await?;
+ let (files, statistics) = get_statistics_with_limit(
+ files,
+ self.schema(),
+ limit,
+ self.options.collect_stat,
+ )
+ .await?;
Ok((
split_files(files, self.options.target_partitions),
diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs
index 73896f8eb7..c67227f966 100644
--- a/datafusion/core/src/datasource/statistics.rs
+++ b/datafusion/core/src/datasource/statistics.rs
@@ -29,12 +29,15 @@ use itertools::izip;
use itertools::multiunzip;
/// Get all files as well as the file level summary statistics (no statistic for partition columns).
-/// If the optional `limit` is provided, includes only sufficient files.
-/// Needed to read up to `limit` number of rows.
+/// If the optional `limit` is provided, includes only sufficient files. Needed to read up to
+/// `limit` number of rows. `collect_stats` is passed down from the configuration parameter on
+/// `ListingTable`. If it is false we only construct bare statistics and skip a potentially expensive
+/// call to `multiunzip` for constructing file level summary statistics.
pub async fn get_statistics_with_limit(
all_files: impl Stream<Item = Result<(PartitionedFile, Statistics)>>,
file_schema: SchemaRef,
limit: Option<usize>,
+ collect_stats: bool,
) -> Result<(Vec<PartitionedFile>, Statistics)> {
let mut result_files = vec![];
// These statistics can be calculated as long as at least one file provides
@@ -78,6 +81,9 @@ pub async fn get_statistics_with_limit(
while let Some(current) = all_files.next().await {
let (file, file_stats) = current?;
result_files.push(file);
+ if !collect_stats {
+ continue;
+ }
// We accumulate the number of rows, total byte size and null
// counts across all the files in question. If any file does not