This is an automated email from the ASF dual-hosted git repository. blaginin pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push: new 2d7ae09262 Set the default value of `datafusion.execution.collect_statistics` to `true` (#16447) 2d7ae09262 is described below commit 2d7ae09262f7a1338c30192b33efbe1b2d1d9829 Author: Adam Gutglick <a...@spiraldb.com> AuthorDate: Thu Jun 19 19:09:42 2025 +0100 Set the default value of `datafusion.execution.collect_statistics` to `true` (#16447) * fix sqllogicaltests * Add upgrade note --- datafusion/common/src/config.rs | 4 +- datafusion/core/src/execution/context/parquet.rs | 27 +- datafusion/core/tests/parquet/row_group_pruning.rs | 4 +- .../sqllogictest/test_files/explain_tree.slt | 383 +++++++++------------ .../sqllogictest/test_files/information_schema.slt | 4 +- datafusion/sqllogictest/test_files/limit.slt | 2 +- .../test_files/parquet_filter_pushdown.slt | 11 +- .../sqllogictest/test_files/parquet_statistics.slt | 12 +- datafusion/sqllogictest/test_files/repartition.slt | 4 +- docs/source/library-user-guide/upgrading.md | 20 +- docs/source/user-guide/configs.md | 2 +- docs/source/user-guide/sql/ddl.md | 8 +- 12 files changed, 211 insertions(+), 270 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index b7aca9e270..c6ac2f0b50 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -294,8 +294,8 @@ config_namespace! { /// Should DataFusion collect statistics when first creating a table. /// Has no effect after the table is created. Applies to the default - /// `ListingTableProvider` in DataFusion. Defaults to false. - pub collect_statistics: bool, default = false + /// `ListingTableProvider` in DataFusion. Defaults to true. + pub collect_statistics: bool, default = true /// Number of partitions for query execution. Increasing partitions can increase /// concurrency. diff --git a/datafusion/core/src/execution/context/parquet.rs b/datafusion/core/src/execution/context/parquet.rs index eea2b80477..2fb763bee4 100644 --- a/datafusion/core/src/execution/context/parquet.rs +++ b/datafusion/core/src/execution/context/parquet.rs @@ -34,13 +34,12 @@ impl SessionContext { /// /// # Note: Statistics /// - /// NOTE: by default, statistics are not collected when reading the Parquet - /// files as this can slow down the initial DataFrame creation. However, - /// collecting statistics can greatly accelerate queries with certain - /// filters. + /// NOTE: by default, statistics are collected when reading the Parquet + /// files. This can slow down the initial DataFrame creation while + /// greatly accelerating queries with certain filters. /// - /// To enable collect statistics, set the [config option] - /// `datafusion.execution.collect_statistics` to `true`. See + /// To disable statistics collection, set the [config option] + /// `datafusion.execution.collect_statistics` to `false`. See /// [`ConfigOptions`] and [`ExecutionOptions::collect_statistics`] for more /// details.
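For library users, the same opt-out can be applied when building a context. A minimal sketch, assuming only the `SessionConfig`/`SessionContext` API that the test hunk below also exercises (`new_with_config` is the standard constructor that takes a config):

```rust
use datafusion::prelude::{SessionConfig, SessionContext};

fn main() {
    // Opt out of the new default: skip statistics collection when
    // tables are first created (the pre-49.0.0 behavior).
    let mut config = SessionConfig::new();
    config.options_mut().execution.collect_statistics = false;

    // Tables registered on this context will not gather statistics.
    let _ctx = SessionContext::new_with_config(config);
}
```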
/// @@ -171,28 +170,28 @@ mod tests { #[tokio::test] async fn register_parquet_respects_collect_statistics_config() -> Result<()> { - // The default is false + // The default is true let mut config = SessionConfig::new(); config.options_mut().explain.physical_plan_only = true; config.options_mut().explain.show_statistics = true; let content = explain_query_all_with_config(config).await?; - assert_contains!(content, "statistics=[Rows=Absent,"); + assert_contains!(content, "statistics=[Rows=Exact("); - // Explicitly set to false + // Explicitly set to true let mut config = SessionConfig::new(); config.options_mut().explain.physical_plan_only = true; config.options_mut().explain.show_statistics = true; - config.options_mut().execution.collect_statistics = false; + config.options_mut().execution.collect_statistics = true; let content = explain_query_all_with_config(config).await?; - assert_contains!(content, "statistics=[Rows=Absent,"); + assert_contains!(content, "statistics=[Rows=Exact("); - // Explicitly set to true + // Explicitly set to false let mut config = SessionConfig::new(); config.options_mut().explain.physical_plan_only = true; config.options_mut().explain.show_statistics = true; - config.options_mut().execution.collect_statistics = true; + config.options_mut().execution.collect_statistics = false; let content = explain_query_all_with_config(config).await?; - assert_contains!(content, "statistics=[Rows=Exact(10),"); + assert_contains!(content, "statistics=[Rows=Absent,"); Ok(()) } diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 5a85f47c01..a88c0773e0 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -421,7 +421,7 @@ macro_rules! 
int_tests { .with_query(&format!("SELECT * FROM t where i{} in (100)", $bits)) .with_expected_errors(Some(0)) .with_matched_by_stats(Some(0)) - .with_pruned_by_stats(Some(4)) + .with_pruned_by_stats(Some(0)) .with_matched_by_bloom_filter(Some(0)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(0) @@ -1316,7 +1316,7 @@ async fn test_row_group_with_null_values() { .with_query("SELECT * FROM t WHERE \"i32\" > 7") .with_expected_errors(Some(0)) .with_matched_by_stats(Some(0)) - .with_pruned_by_stats(Some(3)) + .with_pruned_by_stats(Some(0)) .with_expected_rows(0) .with_matched_by_bloom_filter(Some(0)) .with_pruned_by_bloom_filter(Some(0)) diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt b/datafusion/sqllogictest/test_files/explain_tree.slt index 8096c8cacf..7d4171a3e8 100644 --- a/datafusion/sqllogictest/test_files/explain_tree.slt +++ b/datafusion/sqllogictest/test_files/explain_tree.slt @@ -291,47 +291,40 @@ explain SELECT table1.string_col, table2.date_col FROM table1 JOIN table2 ON tab ---- physical_plan 01)┌───────────────────────────┐ -02)│ CoalesceBatchesExec │ +02)│ ProjectionExec │ 03)│ -------------------- │ -04)│ target_batch_size: │ -05)│ 8192 │ -06)└─────────────┬─────────────┘ -07)┌─────────────┴─────────────┐ -08)│ HashJoinExec │ -09)│ -------------------- │ -10)│ on: ├──────────────┐ -11)│ (int_col = int_col) │ │ -12)└─────────────┬─────────────┘ │ -13)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -14)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -15)│ -------------------- ││ -------------------- │ -16)│ target_batch_size: ││ target_batch_size: │ -17)│ 8192 ││ 8192 │ -18)└─────────────┬─────────────┘└─────────────┬─────────────┘ -19)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -20)│ RepartitionExec ││ RepartitionExec │ -21)│ -------------------- ││ -------------------- │ -22)│ partition_count(in->out): ││ partition_count(in->out): │ -23)│ 4 -> 4 ││ 4 -> 4 │ -24)│ ││ │ -25)│ partitioning_scheme: ││ partitioning_scheme: │ -26)│ Hash([int_col@0], 4) ││ Hash([int_col@0], 4) │ -27)└─────────────┬─────────────┘└─────────────┬─────────────┘ -28)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -29)│ RepartitionExec ││ RepartitionExec │ -30)│ -------------------- ││ -------------------- │ -31)│ partition_count(in->out): ││ partition_count(in->out): │ -32)│ 1 -> 4 ││ 1 -> 4 │ -33)│ ││ │ -34)│ partitioning_scheme: ││ partitioning_scheme: │ -35)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ -36)└─────────────┬─────────────┘└─────────────┬─────────────┘ -37)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -38)│ DataSourceExec ││ DataSourceExec │ -39)│ -------------------- ││ -------------------- │ -40)│ files: 1 ││ files: 1 │ -41)│ format: csv ││ format: parquet │ -42)└───────────────────────────┘└───────────────────────────┘ +04)│ date_col: date_col │ +05)│ │ +06)│ string_col: │ +07)│ string_col │ +08)└─────────────┬─────────────┘ +09)┌─────────────┴─────────────┐ +10)│ CoalesceBatchesExec │ +11)│ -------------------- │ +12)│ target_batch_size: │ +13)│ 8192 │ +14)└─────────────┬─────────────┘ +15)┌─────────────┴─────────────┐ +16)│ HashJoinExec │ +17)│ -------------------- │ +18)│ on: ├──────────────┐ +19)│ (int_col = int_col) │ │ +20)└─────────────┬─────────────┘ │ +21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +22)│ DataSourceExec ││ RepartitionExec │ +23)│ -------------------- ││ -------------------- │ +24)│ files: 1 ││ partition_count(in->out): │ +25)│ format: parquet ││ 1 -> 4 │ +26)│ ││ │ +27)│ ││ 
partitioning_scheme: │ +28)│ ││ RoundRobinBatch(4) │ +29)└───────────────────────────┘└─────────────┬─────────────┘ +30)-----------------------------┌─────────────┴─────────────┐ +31)-----------------------------│ DataSourceExec │ +32)-----------------------------│ -------------------- │ +33)-----------------------------│ files: 1 │ +34)-----------------------------│ format: csv │ +35)-----------------------------└───────────────────────────┘ # 3 Joins query TT @@ -365,48 +358,41 @@ physical_plan 19)│ (int_col = int_col) │ │ 20)└─────────────┬─────────────┘ │ 21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -22)│ DataSourceExec ││ CoalesceBatchesExec │ +22)│ DataSourceExec ││ ProjectionExec │ 23)│ -------------------- ││ -------------------- │ -24)│ bytes: 536 ││ target_batch_size: │ -25)│ format: memory ││ 8192 │ +24)│ bytes: 536 ││ date_col: date_col │ +25)│ format: memory ││ int_col: int_col │ 26)│ rows: 1 ││ │ -27)└───────────────────────────┘└─────────────┬─────────────┘ -28)-----------------------------┌─────────────┴─────────────┐ -29)-----------------------------│ HashJoinExec │ -30)-----------------------------│ -------------------- │ -31)-----------------------------│ on: ├──────────────┐ -32)-----------------------------│ (int_col = int_col) │ │ -33)-----------------------------└─────────────┬─────────────┘ │ -34)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -35)-----------------------------│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -36)-----------------------------│ -------------------- ││ -------------------- │ -37)-----------------------------│ target_batch_size: ││ target_batch_size: │ -38)-----------------------------│ 8192 ││ 8192 │ -39)-----------------------------└─────────────┬─────────────┘└─────────────┬─────────────┘ -40)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -41)-----------------------------│ RepartitionExec ││ RepartitionExec │ -42)-----------------------------│ -------------------- ││ -------------------- │ -43)-----------------------------│ partition_count(in->out): ││ partition_count(in->out): │ -44)-----------------------------│ 4 -> 4 ││ 4 -> 4 │ -45)-----------------------------│ ││ │ -46)-----------------------------│ partitioning_scheme: ││ partitioning_scheme: │ -47)-----------------------------│ Hash([int_col@0], 4) ││ Hash([int_col@0], 4) │ -48)-----------------------------└─────────────┬─────────────┘└─────────────┬─────────────┘ -49)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -50)-----------------------------│ RepartitionExec ││ RepartitionExec │ -51)-----------------------------│ -------------------- ││ -------------------- │ -52)-----------------------------│ partition_count(in->out): ││ partition_count(in->out): │ -53)-----------------------------│ 1 -> 4 ││ 1 -> 4 │ -54)-----------------------------│ ││ │ -55)-----------------------------│ partitioning_scheme: ││ partitioning_scheme: │ -56)-----------------------------│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ -57)-----------------------------└─────────────┬─────────────┘└─────────────┬─────────────┘ -58)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -59)-----------------------------│ DataSourceExec ││ DataSourceExec │ -60)-----------------------------│ -------------------- ││ -------------------- │ -61)-----------------------------│ files: 1 ││ files: 1 │ -62)-----------------------------│ format: csv ││ 
format: parquet │ -63)-----------------------------└───────────────────────────┘└───────────────────────────┘ +27)│ ││ string_col: │ +28)│ ││ string_col │ +29)└───────────────────────────┘└─────────────┬─────────────┘ +30)-----------------------------┌─────────────┴─────────────┐ +31)-----------------------------│ CoalesceBatchesExec │ +32)-----------------------------│ -------------------- │ +33)-----------------------------│ target_batch_size: │ +34)-----------------------------│ 8192 │ +35)-----------------------------└─────────────┬─────────────┘ +36)-----------------------------┌─────────────┴─────────────┐ +37)-----------------------------│ HashJoinExec │ +38)-----------------------------│ -------------------- │ +39)-----------------------------│ on: ├──────────────┐ +40)-----------------------------│ (int_col = int_col) │ │ +41)-----------------------------└─────────────┬─────────────┘ │ +42)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +43)-----------------------------│ DataSourceExec ││ RepartitionExec │ +44)-----------------------------│ -------------------- ││ -------------------- │ +45)-----------------------------│ files: 1 ││ partition_count(in->out): │ +46)-----------------------------│ format: parquet ││ 1 -> 4 │ +47)-----------------------------│ ││ │ +48)-----------------------------│ ││ partitioning_scheme: │ +49)-----------------------------│ ││ RoundRobinBatch(4) │ +50)-----------------------------└───────────────────────────┘└─────────────┬─────────────┘ +51)----------------------------------------------------------┌─────────────┴─────────────┐ +52)----------------------------------------------------------│ DataSourceExec │ +53)----------------------------------------------------------│ -------------------- │ +54)----------------------------------------------------------│ files: 1 │ +55)----------------------------------------------------------│ format: csv │ +56)----------------------------------------------------------└───────────────────────────┘ # Long Filter (demonstrate what happens with wrapping) query TT @@ -1029,21 +1015,11 @@ physical_plan 11)│ bigint_col │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ -14)│ RepartitionExec │ +14)│ DataSourceExec │ 15)│ -------------------- │ -16)│ partition_count(in->out): │ -17)│ 1 -> 4 │ -18)│ │ -19)│ partitioning_scheme: │ -20)│ RoundRobinBatch(4) │ -21)└─────────────┬─────────────┘ -22)┌─────────────┴─────────────┐ -23)│ DataSourceExec │ -24)│ -------------------- │ -25)│ files: 1 │ -26)│ format: parquet │ -27)└───────────────────────────┘ - +16)│ files: 1 │ +17)│ format: parquet │ +18)└───────────────────────────┘ # Query with projection on memory query TT @@ -1186,51 +1162,46 @@ explain select * from table1 inner join table2 on table1.int_col = table2.int_co ---- physical_plan 01)┌───────────────────────────┐ -02)│ CoalesceBatchesExec │ +02)│ ProjectionExec │ 03)│ -------------------- │ -04)│ target_batch_size: │ -05)│ 8192 │ -06)└─────────────┬─────────────┘ -07)┌─────────────┴─────────────┐ -08)│ HashJoinExec │ -09)│ -------------------- │ -10)│ on: │ -11)│ (int_col = int_col), ├──────────────┐ -12)│ (string_col = │ │ -13)│ string_col) │ │ -14)└─────────────┬─────────────┘ │ -15)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -16)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -17)│ -------------------- ││ -------------------- │ -18)│ target_batch_size: ││ target_batch_size: │ -19)│ 8192 ││ 8192 │ 
-20)└─────────────┬─────────────┘└─────────────┬─────────────┘ -21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -22)│ RepartitionExec ││ RepartitionExec │ -23)│ -------------------- ││ -------------------- │ -24)│ partition_count(in->out): ││ partition_count(in->out): │ -25)│ 4 -> 4 ││ 4 -> 4 │ -26)│ ││ │ -27)│ partitioning_scheme: ││ partitioning_scheme: │ -28)│ Hash([int_col@0, ││ Hash([int_col@0, │ -29)│ string_col@1], ││ string_col@1], │ -30)│ 4) ││ 4) │ -31)└─────────────┬─────────────┘└─────────────┬─────────────┘ -32)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -33)│ RepartitionExec ││ RepartitionExec │ -34)│ -------------------- ││ -------------------- │ -35)│ partition_count(in->out): ││ partition_count(in->out): │ -36)│ 1 -> 4 ││ 1 -> 4 │ -37)│ ││ │ -38)│ partitioning_scheme: ││ partitioning_scheme: │ -39)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ -40)└─────────────┬─────────────┘└─────────────┬─────────────┘ -41)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -42)│ DataSourceExec ││ DataSourceExec │ -43)│ -------------------- ││ -------------------- │ -44)│ files: 1 ││ files: 1 │ -45)│ format: csv ││ format: parquet │ -46)└───────────────────────────┘└───────────────────────────┘ +04)│ bigint_col: │ +05)│ bigint_col │ +06)│ │ +07)│ date_col: date_col │ +08)│ int_col: int_col │ +09)│ │ +10)│ string_col: │ +11)│ string_col │ +12)└─────────────┬─────────────┘ +13)┌─────────────┴─────────────┐ +14)│ CoalesceBatchesExec │ +15)│ -------------------- │ +16)│ target_batch_size: │ +17)│ 8192 │ +18)└─────────────┬─────────────┘ +19)┌─────────────┴─────────────┐ +20)│ HashJoinExec │ +21)│ -------------------- │ +22)│ on: │ +23)│ (int_col = int_col), ├──────────────┐ +24)│ (string_col = │ │ +25)│ string_col) │ │ +26)└─────────────┬─────────────┘ │ +27)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +28)│ DataSourceExec ││ RepartitionExec │ +29)│ -------------------- ││ -------------------- │ +30)│ files: 1 ││ partition_count(in->out): │ +31)│ format: parquet ││ 1 -> 4 │ +32)│ ││ │ +33)│ ││ partitioning_scheme: │ +34)│ ││ RoundRobinBatch(4) │ +35)└───────────────────────────┘└─────────────┬─────────────┘ +36)-----------------------------┌─────────────┴─────────────┐ +37)-----------------------------│ DataSourceExec │ +38)-----------------------------│ -------------------- │ +39)-----------------------------│ files: 1 │ +40)-----------------------------│ format: csv │ +41)-----------------------------└───────────────────────────┘ # Query with outer hash join. 
query TT @@ -1238,53 +1209,48 @@ explain select * from table1 left outer join table2 on table1.int_col = table2.i ---- physical_plan 01)┌───────────────────────────┐ -02)│ CoalesceBatchesExec │ +02)│ ProjectionExec │ 03)│ -------------------- │ -04)│ target_batch_size: │ -05)│ 8192 │ -06)└─────────────┬─────────────┘ -07)┌─────────────┴─────────────┐ -08)│ HashJoinExec │ -09)│ -------------------- │ -10)│ join_type: Left │ -11)│ │ -12)│ on: ├──────────────┐ -13)│ (int_col = int_col), │ │ -14)│ (string_col = │ │ -15)│ string_col) │ │ -16)└─────────────┬─────────────┘ │ -17)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -18)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -19)│ -------------------- ││ -------------------- │ -20)│ target_batch_size: ││ target_batch_size: │ -21)│ 8192 ││ 8192 │ -22)└─────────────┬─────────────┘└─────────────┬─────────────┘ -23)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -24)│ RepartitionExec ││ RepartitionExec │ -25)│ -------------------- ││ -------------------- │ -26)│ partition_count(in->out): ││ partition_count(in->out): │ -27)│ 4 -> 4 ││ 4 -> 4 │ -28)│ ││ │ -29)│ partitioning_scheme: ││ partitioning_scheme: │ -30)│ Hash([int_col@0, ││ Hash([int_col@0, │ -31)│ string_col@1], ││ string_col@1], │ -32)│ 4) ││ 4) │ -33)└─────────────┬─────────────┘└─────────────┬─────────────┘ -34)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -35)│ RepartitionExec ││ RepartitionExec │ -36)│ -------------------- ││ -------------------- │ -37)│ partition_count(in->out): ││ partition_count(in->out): │ -38)│ 1 -> 4 ││ 1 -> 4 │ -39)│ ││ │ -40)│ partitioning_scheme: ││ partitioning_scheme: │ -41)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ -42)└─────────────┬─────────────┘└─────────────┬─────────────┘ -43)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -44)│ DataSourceExec ││ DataSourceExec │ -45)│ -------------------- ││ -------------------- │ -46)│ files: 1 ││ files: 1 │ -47)│ format: csv ││ format: parquet │ -48)└───────────────────────────┘└───────────────────────────┘ +04)│ bigint_col: │ +05)│ bigint_col │ +06)│ │ +07)│ date_col: date_col │ +08)│ int_col: int_col │ +09)│ │ +10)│ string_col: │ +11)│ string_col │ +12)└─────────────┬─────────────┘ +13)┌─────────────┴─────────────┐ +14)│ CoalesceBatchesExec │ +15)│ -------------------- │ +16)│ target_batch_size: │ +17)│ 8192 │ +18)└─────────────┬─────────────┘ +19)┌─────────────┴─────────────┐ +20)│ HashJoinExec │ +21)│ -------------------- │ +22)│ join_type: Right │ +23)│ │ +24)│ on: ├──────────────┐ +25)│ (int_col = int_col), │ │ +26)│ (string_col = │ │ +27)│ string_col) │ │ +28)└─────────────┬─────────────┘ │ +29)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +30)│ DataSourceExec ││ RepartitionExec │ +31)│ -------------------- ││ -------------------- │ +32)│ files: 1 ││ partition_count(in->out): │ +33)│ format: parquet ││ 1 -> 4 │ +34)│ ││ │ +35)│ ││ partitioning_scheme: │ +36)│ ││ RoundRobinBatch(4) │ +37)└───────────────────────────┘└─────────────┬─────────────┘ +38)-----------------------------┌─────────────┴─────────────┐ +39)-----------------------------│ DataSourceExec │ +40)-----------------------------│ -------------------- │ +41)-----------------------------│ files: 1 │ +42)-----------------------------│ format: csv │ +43)-----------------------------└───────────────────────────┘ # Query with nested loop join. 
query TT @@ -1303,35 +1269,8 @@ physical_plan 10)│ format: csv ││ │ 11)└───────────────────────────┘└─────────────┬─────────────┘ 12)-----------------------------┌─────────────┴─────────────┐ -13)-----------------------------│ AggregateExec │ -14)-----------------------------│ -------------------- │ -15)-----------------------------│ aggr: count(1) │ -16)-----------------------------│ mode: Final │ -17)-----------------------------└─────────────┬─────────────┘ -18)-----------------------------┌─────────────┴─────────────┐ -19)-----------------------------│ CoalescePartitionsExec │ -20)-----------------------------└─────────────┬─────────────┘ -21)-----------------------------┌─────────────┴─────────────┐ -22)-----------------------------│ AggregateExec │ -23)-----------------------------│ -------------------- │ -24)-----------------------------│ aggr: count(1) │ -25)-----------------------------│ mode: Partial │ -26)-----------------------------└─────────────┬─────────────┘ -27)-----------------------------┌─────────────┴─────────────┐ -28)-----------------------------│ RepartitionExec │ -29)-----------------------------│ -------------------- │ -30)-----------------------------│ partition_count(in->out): │ -31)-----------------------------│ 1 -> 4 │ -32)-----------------------------│ │ -33)-----------------------------│ partitioning_scheme: │ -34)-----------------------------│ RoundRobinBatch(4) │ -35)-----------------------------└─────────────┬─────────────┘ -36)-----------------------------┌─────────────┴─────────────┐ -37)-----------------------------│ DataSourceExec │ -38)-----------------------------│ -------------------- │ -39)-----------------------------│ files: 1 │ -40)-----------------------------│ format: parquet │ -41)-----------------------------└───────────────────────────┘ +13)-----------------------------│ PlaceholderRowExec │ +14)-----------------------------└───────────────────────────┘ # Query with cross join. query TT @@ -1342,21 +1281,11 @@ physical_plan 02)│ CrossJoinExec ├──────────────┐ 03)└─────────────┬─────────────┘ │ 04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -05)│ DataSourceExec ││ RepartitionExec │ +05)│ DataSourceExec ││ DataSourceExec │ 06)│ -------------------- ││ -------------------- │ -07)│ files: 1 ││ partition_count(in->out): │ -08)│ format: csv ││ 1 -> 4 │ -09)│ ││ │ -10)│ ││ partitioning_scheme: │ -11)│ ││ RoundRobinBatch(4) │ -12)└───────────────────────────┘└─────────────┬─────────────┘ -13)-----------------------------┌─────────────┴─────────────┐ -14)-----------------------------│ DataSourceExec │ -15)-----------------------------│ -------------------- │ -16)-----------------------------│ files: 1 │ -17)-----------------------------│ format: parquet │ -18)-----------------------------└───────────────────────────┘ - +07)│ files: 1 ││ files: 1 │ +08)│ format: csv ││ format: parquet │ +09)└───────────────────────────┘└───────────────────────────┘ # Query with sort merge join. 
statement ok diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index e2c166bc7d..bf9ae20e7b 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -216,7 +216,7 @@ datafusion.catalog.location NULL datafusion.catalog.newlines_in_values false datafusion.execution.batch_size 8192 datafusion.execution.coalesce_batches true -datafusion.execution.collect_statistics false +datafusion.execution.collect_statistics true datafusion.execution.enable_recursive_ctes true datafusion.execution.enforce_batch_size_in_joins false datafusion.execution.keep_partition_by_columns false @@ -328,7 +328,7 @@ datafusion.catalog.location NULL Location scanned to load tables for `default` s datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting -datafusion.execution.collect_statistics false Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to false. +datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs datafusion.execution.enforce_batch_size_in_joins false Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. 
datafusion.execution.keep_partition_by_columns false Should DataFusion keep the columns used for partition_by in the output RecordBatches diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index 3398fa2901..b46d15cb96 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -854,7 +854,7 @@ physical_plan 01)ProjectionExec: expr=[1 as foo] 02)--SortPreservingMergeExec: [part_key@0 ASC NULLS LAST], fetch=1 03)----SortExec: TopK(fetch=1), expr=[part_key@0 ASC NULLS LAST], preserve_partitioning=[true] -04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:0..794]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] query I with selection as ( diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index 5c0419b69d..ed3bed1c20 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -212,10 +212,9 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: val@0 != part@1 -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 -04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet +03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet -# If we reference only a partition column it gets evaluted during the listing phase +# If we reference only a partition column it gets evaluated during the listing phase query TT EXPLAIN select * from t_pushdown where part != 'a'; ---- @@ -257,8 +256,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: val@0 = part@1 -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet query TT select val, part from t_pushdown where part = 'a' AND part = val; @@ -274,8 +272,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: val@0 = part@1 -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet query TT select val, part from t_pushdown where part = val AND part = 'a'; diff --git a/datafusion/sqllogictest/test_files/parquet_statistics.slt b/datafusion/sqllogictest/test_files/parquet_statistics.slt index c707b9f5bb..efbe69bd85 100644 --- a/datafusion/sqllogictest/test_files/parquet_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_statistics.slt @@ -46,7 +46,7 @@ statement ok set datafusion.explain.show_statistics = true; ###### -# By default, the statistics are not gathered +# By default, the statistics are gathered ###### # Recreate the table to pick up the current setting @@ -59,18 +59,18 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]] -02)--FilterExec: column1@0 = 1, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)))]] -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]] +01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] 04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)] -05), statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]] +05), statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] # cleanup statement ok DROP TABLE test_table; ###### -# When the setting is true, the statistics are gathered +# When the setting is true, statistics are gathered ###### statement ok diff --git 
a/datafusion/sqllogictest/test_files/repartition.slt b/datafusion/sqllogictest/test_files/repartition.slt index 70666346e2..29d20d10b6 100644 --- a/datafusion/sqllogictest/test_files/repartition.slt +++ b/datafusion/sqllogictest/test_files/repartition.slt @@ -46,8 +46,8 @@ physical_plan 01)AggregateExec: mode=FinalPartitioned, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----RepartitionExec: partitioning=Hash([column1@0], 4), input_partitions=4 -04)------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)] -05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +05)--------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition/parquet_table/2.parquet]]}, projection=[column1, column2], file_type=parquet # disable round robin repartitioning diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 646753aca4..700b05a7ff 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -21,6 +21,23 @@ ## DataFusion `49.0.0` +### `datafusion.execution.collect_statistics` now defaults to `true` + +The default value of the `datafusion.execution.collect_statistics` configuration +setting is now `true`. This change impacts users who use that value directly and relied +on its default value being `false`. + +This change also restores the previous default behavior of `ListingTable`. If you use it directly, +you can keep the old default (no statistics collection) by overriding the value in your code. + +```rust +# /* comment to avoid running +ListingOptions::new(Arc::new(ParquetFormat::default())) + .with_collect_stat(false) + // other options +# */ +``` + ### Metadata is now represented by `FieldMetadata` Metadata from the Arrow `Field` is now stored using the `FieldMetadata` @@ -139,7 +156,7 @@ match expr { [details on #16207]: https://github.com/apache/datafusion/pull/16207#issuecomment-2922659103 -### The `VARCHAR` SQL type is now represented as `Utf8View` in Arrow. +### The `VARCHAR` SQL type is now represented as `Utf8View` in Arrow The mapping of the SQL `VARCHAR` type has been changed from `Utf8` to `Utf8View` which improves performance for many string operations. You can read more about @@ -291,7 +308,6 @@ Additionally `ObjectStore::list` and `ObjectStore::list_with_offset` have been c [#6619]: https://github.com/apache/arrow-rs/pull/6619 [#7371]: https://github.com/apache/arrow-rs/pull/7371 -[#7328]: https://github.com/apache/arrow-rs/pull/6961 This requires converting from `usize` to `u64` occasionally as well as changes to `ObjectStore` implementations such as diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 1b8233a541..b55e63293f 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -47,7 +47,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement.
Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which [...] | datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption [...] | datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting [...] -| datafusion.execution.collect_statistics | false | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to false. [...] +| datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. [...] | datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system [...] | datafusion.execution.time_zone | +00:00 | The default time zone Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this time zone, and then extract the hour [...] | datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. [...] diff --git a/docs/source/user-guide/sql/ddl.md b/docs/source/user-guide/sql/ddl.md index ff8fa9bac0..1d971594ad 100644 --- a/docs/source/user-guide/sql/ddl.md +++ b/docs/source/user-guide/sql/ddl.md @@ -95,14 +95,14 @@ LOCATION '/mnt/nyctaxi/tripdata.parquet'; :::{note} Statistics -: By default, when a table is created, DataFusion will _NOT_ read the files +: By default, when a table is created, DataFusion will read the files to gather statistics, which can be expensive but can accelerate subsequent -queries substantially. If you want to gather statistics +queries substantially. If you don't want to gather statistics when creating a table, set the `datafusion.execution.collect_statistics` -configuration option to `true` before creating the table. For example: +configuration option to `false` before creating the table. For example: ```sql -SET datafusion.execution.collect_statistics = true; +SET datafusion.execution.collect_statistics = false; ``` See the [config settings docs](../configs.md) for more details.
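To confirm which default a running session actually picked up, the setting can also be inspected with `SHOW` (a sketch; assumes `information_schema` is enabled, as it is by default in `datafusion-cli`):

```sql
-- Inspect the effective value; expected to be true on 49.0.0 and later
-- unless explicitly overridden.
SHOW datafusion.execution.collect_statistics;
```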