This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 9e012a6c1c feat: add column statistics into explain (#8112)
9e012a6c1c is described below

commit 9e012a6c1c495ebfba8c863755cdcb069e31d410
Author: Nga Tran <[email protected]>
AuthorDate: Sun Nov 12 06:20:09 2023 -0500

    feat: add column statistics into explain (#8112)
    
    * feat: add column statistics into explain
    
    * feat: only show non-absent statistics
    
    * fix: update test output
---
 datafusion/common/src/stats.rs                 | 39 +++++++++++++++++++++++++-
 datafusion/core/tests/sql/explain_analyze.rs   |  5 +++-
 datafusion/sqllogictest/test_files/explain.slt |  8 +++---
 3 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index 2e799c92be..1c7a4fd4d5 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -257,7 +257,44 @@ impl Statistics {
 
 impl Display for Statistics {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Rows={}, Bytes={}", self.num_rows, self.total_byte_size)?;
+        // string of column statistics
+        let column_stats = self
+            .column_statistics
+            .iter()
+            .enumerate()
+            .map(|(i, cs)| {
+                let s = format!("(Col[{}]:", i);
+                let s = if cs.min_value != Precision::Absent {
+                    format!("{} Min={}", s, cs.min_value)
+                } else {
+                    s
+                };
+                let s = if cs.max_value != Precision::Absent {
+                    format!("{} Max={}", s, cs.max_value)
+                } else {
+                    s
+                };
+                let s = if cs.null_count != Precision::Absent {
+                    format!("{} Null={}", s, cs.null_count)
+                } else {
+                    s
+                };
+                let s = if cs.distinct_count != Precision::Absent {
+                    format!("{} Distinct={}", s, cs.distinct_count)
+                } else {
+                    s
+                };
+
+                s + ")"
+            })
+            .collect::<Vec<_>>()
+            .join(",");
+
+        write!(
+            f,
+            "Rows={}, Bytes={}, [{}]",
+            self.num_rows, self.total_byte_size, column_stats
+        )?;
 
         Ok(())
     }
diff --git a/datafusion/core/tests/sql/explain_analyze.rs 
b/datafusion/core/tests/sql/explain_analyze.rs
index 2436e82f3c..0ebd3a0c69 100644
--- a/datafusion/core/tests/sql/explain_analyze.rs
+++ b/datafusion/core/tests/sql/explain_analyze.rs
@@ -827,5 +827,8 @@ async fn csv_explain_analyze_with_statistics() {
         .to_string();
 
     // should contain scan statistics
-    assert_contains!(&formatted, ", statistics=[Rows=Absent, Bytes=Absent]");
+    assert_contains!(
+        &formatted,
+        ", statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]"
+    );
 }
diff --git a/datafusion/sqllogictest/test_files/explain.slt 
b/datafusion/sqllogictest/test_files/explain.slt
index 911ede678b..1db24efd9b 100644
--- a/datafusion/sqllogictest/test_files/explain.slt
+++ b/datafusion/sqllogictest/test_files/explain.slt
@@ -274,8 +274,8 @@ query TT
 EXPLAIN SELECT a, b, c FROM simple_explain_test limit 10;
 ----
 physical_plan
-GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Inexact(10), Bytes=Absent]
---CsvExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, 
c], limit=10, has_header=true, statistics=[Rows=Absent, Bytes=Absent]
+GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Inexact(10), Bytes=Absent, 
[(Col[0]:),(Col[1]:),(Col[2]:)]]
+--CsvExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, 
c], limit=10, has_header=true, statistics=[Rows=Absent, Bytes=Absent, 
[(Col[0]:),(Col[1]:),(Col[2]:)]]
 
 # Parquet scan with statistics collected
 statement ok
@@ -288,8 +288,8 @@ query TT
 EXPLAIN SELECT * FROM alltypes_plain limit 10;
 ----
 physical_plan
-GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent]
---ParquetExec: file_groups={1 group: 
[[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, 
projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, 
float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, 
statistics=[Rows=Exact(8), Bytes=Absent]
+GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, 
[(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
+--ParquetExec: file_groups={1 group: 
[[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, 
projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, 
float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, 
statistics=[Rows=Exact(8), Bytes=Absent, 
[(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
 
 statement ok
 set datafusion.execution.collect_statistics = false;

Reply via email to