This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 2fd14a4aaf fix: Correct null_count in describe() (#10260)
2fd14a4aaf is described below
commit 2fd14a4aaf28949ab90393e434eb29253ba3a431
Author: Alex Huang <[email protected]>
AuthorDate: Tue Apr 30 22:12:12 2024 +0800
fix: Correct null_count in describe() (#10260)
* fix: Correct null_count in describe()
* chore: fix fmt
* chore: Fix ci
* fix: Update comment
* fix: refactor null_count calculation in describe() and add test
* chore
---
datafusion/core/src/dataframe/mod.rs | 12 ++++++++--
datafusion/core/tests/dataframe/describe.rs | 34 +++++++++++++++++++++++++++--
2 files changed, 42 insertions(+), 4 deletions(-)
diff --git a/datafusion/core/src/dataframe/mod.rs
b/datafusion/core/src/dataframe/mod.rs
index f877b7d698..4644e15feb 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -48,10 +48,12 @@ use datafusion_common::config::{CsvOptions, FormatOptions,
JsonOptions};
use datafusion_common::{
plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError,
UnnestOptions,
};
+use datafusion_expr::lit;
use datafusion_expr::{
- avg, count, is_null, max, median, min, stddev, utils::COUNT_STAR_EXPANSION,
+ avg, count, max, median, min, stddev, utils::COUNT_STAR_EXPANSION,
TableProviderFilterPushDown, UNNAMED_TABLE,
};
+use datafusion_expr::{case, is_null, sum};
use async_trait::async_trait;
@@ -534,7 +536,13 @@ impl DataFrame {
vec![],
original_schema_fields
.clone()
- .map(|f| count(is_null(col(f.name()))).alias(f.name()))
+ .map(|f| {
+ sum(case(is_null(col(f.name())))
+ .when(lit(true), lit(1))
+ .otherwise(lit(0))
+ .unwrap())
+ .alias(f.name())
+ })
.collect::<Vec<_>>(),
),
// mean aggregation
diff --git a/datafusion/core/tests/dataframe/describe.rs
b/datafusion/core/tests/dataframe/describe.rs
index e82c06efd6..e446d71473 100644
--- a/datafusion/core/tests/dataframe/describe.rs
+++ b/datafusion/core/tests/dataframe/describe.rs
@@ -39,7 +39,7 @@ async fn describe() -> Result<()> {
"| describe | id | bool_col | tinyint_col |
smallint_col | int_col | bigint_col | float_col
| double_col | date_string_col | string_col | timestamp_col
| year | month |",
"+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+",
"| count | 7300.0 | 7300 | 7300.0 |
7300.0 | 7300.0 | 7300.0 | 7300.0
| 7300.0 | 7300 | 7300 | 7300
| 7300.0 | 7300.0 |",
- "| null_count | 7300.0 | 7300 | 7300.0 |
7300.0 | 7300.0 | 7300.0 | 7300.0
| 7300.0 | 7300 | 7300 | 7300
| 7300.0 | 7300.0 |",
+ "| null_count | 0.0 | 0 | 0.0 |
0.0 | 0.0 | 0.0 | 0.0
| 0.0 | 0 | 0 | 0
| 0.0 | 0.0 |",
"| mean | 3649.5 | null | 4.5 |
4.5 | 4.5 | 45.0 |
4.949999964237213 | 45.45 | null | null | null
| 2009.5 | 6.526027397260274 |",
"| std | 2107.472815166704 | null | 2.8724780750809518 |
2.8724780750809518 | 2.8724780750809518 | 28.724780750809533 |
3.1597258182544645 | 29.012028558317645 | null | null | null
| 0.5000342500942125 | 3.44808750051728 |",
"| min | 0.0 | null | 0.0 |
0.0 | 0.0 | 0.0 | 0.0
| 0.0 | 01/01/09 | 0 | 2008-12-31T23:00:00
| 2009.0 | 1.0 |",
@@ -69,7 +69,7 @@ async fn describe_boolean_binary() -> Result<()> {
"| describe | a | b |",
"+------------+------+------+",
"| count | 1 | 1 |",
- "| null_count | 1 | 1 |",
+ "| null_count | 0 | 0 |",
"| mean | null | null |",
"| std | null | null |",
"| min | a | null |",
@@ -81,6 +81,36 @@ async fn describe_boolean_binary() -> Result<()> {
Ok(())
}
+#[tokio::test]
+async fn describe_null() -> Result<()> {
+ let ctx = parquet_context().await;
+
+ // add test case for a column that contains only null values
+ let result = ctx
+ .sql("select 'a' as a, null as b")
+ .await?
+ .describe()
+ .await?
+ .collect()
+ .await?;
+ #[rustfmt::skip]
+ let expected = [
+ "+------------+------+------+",
+ "| describe | a | b |",
+ "+------------+------+------+",
+ "| count | 1 | 0 |",
+ "| null_count | 0 | 1 |",
+ "| mean | null | null |",
+ "| std | null | null |",
+ "| min | null | null |",
+ "| max | null | null |",
+ "| median | null | null |",
+ "+------------+------+------+"
+ ];
+ assert_batches_eq!(expected, &result);
+ Ok(())
+}
+
/// Return a SessionContext with parquet file registered
async fn parquet_context() -> SessionContext {
let ctx = SessionContext::new();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]