This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 2fd14a4aaf fix: Correct null_count in describe() (#10260)
2fd14a4aaf is described below

commit 2fd14a4aaf28949ab90393e434eb29253ba3a431
Author: Alex Huang <[email protected]>
AuthorDate: Tue Apr 30 22:12:12 2024 +0800

    fix: Correct null_count in describe() (#10260)
    
    * fix: Correct null_count in describe()
    
    * chore: fix fmt
    
    * chore: Fix ci
    
    * fix: Update comment
    
    * fix: refactor null_count calculation in describe() and add test
    
    * chore
---
 datafusion/core/src/dataframe/mod.rs        | 12 ++++++++--
 datafusion/core/tests/dataframe/describe.rs | 34 +++++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/datafusion/core/src/dataframe/mod.rs 
b/datafusion/core/src/dataframe/mod.rs
index f877b7d698..4644e15feb 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -48,10 +48,12 @@ use datafusion_common::config::{CsvOptions, FormatOptions, 
JsonOptions};
 use datafusion_common::{
     plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, 
UnnestOptions,
 };
+use datafusion_expr::lit;
 use datafusion_expr::{
-    avg, count, is_null, max, median, min, stddev, utils::COUNT_STAR_EXPANSION,
+    avg, count, max, median, min, stddev, utils::COUNT_STAR_EXPANSION,
     TableProviderFilterPushDown, UNNAMED_TABLE,
 };
+use datafusion_expr::{case, is_null, sum};
 
 use async_trait::async_trait;
 
@@ -534,7 +536,13 @@ impl DataFrame {
                 vec![],
                 original_schema_fields
                     .clone()
-                    .map(|f| count(is_null(col(f.name()))).alias(f.name()))
+                    .map(|f| {
+                        sum(case(is_null(col(f.name())))
+                            .when(lit(true), lit(1))
+                            .otherwise(lit(0))
+                            .unwrap())
+                        .alias(f.name())
+                    })
                     .collect::<Vec<_>>(),
             ),
             // mean aggregation
diff --git a/datafusion/core/tests/dataframe/describe.rs 
b/datafusion/core/tests/dataframe/describe.rs
index e82c06efd6..e446d71473 100644
--- a/datafusion/core/tests/dataframe/describe.rs
+++ b/datafusion/core/tests/dataframe/describe.rs
@@ -39,7 +39,7 @@ async fn describe() -> Result<()> {
         "| describe   | id                | bool_col | tinyint_col        | 
smallint_col       | int_col            | bigint_col         | float_col        
  | double_col         | date_string_col | string_col | timestamp_col           
| year               | month             |",
         
"+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+",
         "| count      | 7300.0            | 7300     | 7300.0             | 
7300.0             | 7300.0             | 7300.0             | 7300.0           
  | 7300.0             | 7300            | 7300       | 7300                    
| 7300.0             | 7300.0            |",
-        "| null_count | 7300.0            | 7300     | 7300.0             | 
7300.0             | 7300.0             | 7300.0             | 7300.0           
  | 7300.0             | 7300            | 7300       | 7300                    
| 7300.0             | 7300.0            |",
+        "| null_count | 0.0               | 0        | 0.0                | 
0.0                | 0.0                | 0.0                | 0.0              
  | 0.0                | 0               | 0          | 0                       
| 0.0                | 0.0               |",
         "| mean       | 3649.5            | null     | 4.5                | 
4.5                | 4.5                | 45.0               | 
4.949999964237213  | 45.45              | null            | null       | null   
                 | 2009.5             | 6.526027397260274 |",
         "| std        | 2107.472815166704 | null     | 2.8724780750809518 | 
2.8724780750809518 | 2.8724780750809518 | 28.724780750809533 | 
3.1597258182544645 | 29.012028558317645 | null            | null       | null   
                 | 0.5000342500942125 | 3.44808750051728  |",
         "| min        | 0.0               | null     | 0.0                | 
0.0                | 0.0                | 0.0                | 0.0              
  | 0.0                | 01/01/09        | 0          | 2008-12-31T23:00:00     
| 2009.0             | 1.0               |",
@@ -69,7 +69,7 @@ async fn describe_boolean_binary() -> Result<()> {
         "| describe   | a    | b    |",
         "+------------+------+------+",
         "| count      | 1    | 1    |",
-        "| null_count | 1    | 1    |",
+        "| null_count | 0    | 0    |",
         "| mean       | null | null |",
         "| std        | null | null |",
         "| min        | a    | null |",
@@ -81,6 +81,36 @@ async fn describe_boolean_binary() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn describe_null() -> Result<()> {
+    let ctx = parquet_context().await;
+
+    // add test case for a column that contains only null values
+    let result = ctx
+        .sql("select 'a' as a, null as b")
+        .await?
+        .describe()
+        .await?
+        .collect()
+        .await?;
+    #[rustfmt::skip]
+    let expected = [
+        "+------------+------+------+",
+        "| describe   | a    | b    |",
+        "+------------+------+------+",
+        "| count      | 1    | 0    |",
+        "| null_count | 0    | 1    |",
+        "| mean       | null | null |",
+        "| std        | null | null |",
+        "| min        | null | null |",
+        "| max        | null | null |",
+        "| median     | null | null |",
+        "+------------+------+------+"
+    ];
+    assert_batches_eq!(expected, &result);
+    Ok(())
+}
+
 /// Return a SessionContext with parquet file registered
 async fn parquet_context() -> SessionContext {
     let ctx = SessionContext::new();


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to