This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 9e012a6c1c feat: add column statistics into explain (#8112)
9e012a6c1c is described below
commit 9e012a6c1c495ebfba8c863755cdcb069e31d410
Author: Nga Tran <[email protected]>
AuthorDate: Sun Nov 12 06:20:09 2023 -0500
feat: add column statistics into explain (#8112)
* feat: add column statistics into explain
* feat: only show non-absent statistics
* fix: update test output
---
datafusion/common/src/stats.rs | 39 +++++++++++++++++++++++++-
datafusion/core/tests/sql/explain_analyze.rs | 5 +++-
datafusion/sqllogictest/test_files/explain.slt | 8 +++---
3 files changed, 46 insertions(+), 6 deletions(-)
diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index 2e799c92be..1c7a4fd4d5 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -257,7 +257,44 @@ impl Statistics {
impl Display for Statistics {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- write!(f, "Rows={}, Bytes={}", self.num_rows, self.total_byte_size)?;
+ // string of column statistics
+ let column_stats = self
+ .column_statistics
+ .iter()
+ .enumerate()
+ .map(|(i, cs)| {
+ let s = format!("(Col[{}]:", i);
+ let s = if cs.min_value != Precision::Absent {
+ format!("{} Min={}", s, cs.min_value)
+ } else {
+ s
+ };
+ let s = if cs.max_value != Precision::Absent {
+ format!("{} Max={}", s, cs.max_value)
+ } else {
+ s
+ };
+ let s = if cs.null_count != Precision::Absent {
+ format!("{} Null={}", s, cs.null_count)
+ } else {
+ s
+ };
+ let s = if cs.distinct_count != Precision::Absent {
+ format!("{} Distinct={}", s, cs.distinct_count)
+ } else {
+ s
+ };
+
+ s + ")"
+ })
+ .collect::<Vec<_>>()
+ .join(",");
+
+ write!(
+ f,
+ "Rows={}, Bytes={}, [{}]",
+ self.num_rows, self.total_byte_size, column_stats
+ )?;
Ok(())
}
diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs
index 2436e82f3c..0ebd3a0c69 100644
--- a/datafusion/core/tests/sql/explain_analyze.rs
+++ b/datafusion/core/tests/sql/explain_analyze.rs
@@ -827,5 +827,8 @@ async fn csv_explain_analyze_with_statistics() {
.to_string();
// should contain scan statistics
- assert_contains!(&formatted, ", statistics=[Rows=Absent, Bytes=Absent]");
+ assert_contains!(
+ &formatted,
+ ", statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]"
+ );
}
diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt
index 911ede678b..1db24efd9b 100644
--- a/datafusion/sqllogictest/test_files/explain.slt
+++ b/datafusion/sqllogictest/test_files/explain.slt
@@ -274,8 +274,8 @@ query TT
EXPLAIN SELECT a, b, c FROM simple_explain_test limit 10;
----
physical_plan
-GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Inexact(10), Bytes=Absent]
---CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], limit=10, has_header=true, statistics=[Rows=Absent, Bytes=Absent]
+GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Inexact(10), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]]
+--CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], limit=10, has_header=true, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]]
# Parquet scan with statistics collected
statement ok
@@ -288,8 +288,8 @@ query TT
EXPLAIN SELECT * FROM alltypes_plain limit 10;
----
physical_plan
-GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent]
---ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent]
+GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
+--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
statement ok
set datafusion.execution.collect_statistics = false;