alamb commented on code in PR #5226: URL: https://github.com/apache/arrow-datafusion/pull/5226#discussion_r1120419464
########## datafusion/core/src/dataframe.rs: ########## @@ -302,6 +305,155 @@ impl DataFrame { )) } + /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and + /// returns nulls for non numeric datatypes. Try in keep output similar to pandas + /// + /// ``` + /// # use datafusion::prelude::*; + /// # use datafusion::error::Result; + /// # use arrow::util::pretty; + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// let ctx = SessionContext::new(); + /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?; + /// df.describe().await.unwrap(); + /// + /// # Ok(()) + /// # } + /// ``` + pub async fn describe(self) -> Result<Self> { + //the functions now supported + let supported_describe_functions = vec!["count", "null_count", "max", "min"]; + + let fields_iter = self.schema().fields().iter(); + + //define describe column + let mut describe_schemas = fields_iter + .clone() + .map(|field| { + if field.data_type().is_numeric() { + Field::new(field.name(), DataType::Float64, true) + } else { + Field::new(field.name(), DataType::Utf8, true) + } + }) + .collect::<Vec<_>>(); + describe_schemas.insert(0, Field::new("describe", DataType::Utf8, false)); + + //collect recordBatch + let describe_record_batch = vec![ + // count aggregation + self.clone() + .aggregate( + vec![], + fields_iter + .clone() + .map(|f| datafusion_expr::count(col(f.name())).alias(f.name())) + .collect::<Vec<_>>(), + )? + .collect() + .await?, + // null_count aggregation + self.clone() + .aggregate( + vec![], + fields_iter + .clone() + .map(|f| { + datafusion_expr::count(datafusion_expr::is_null( + col(f.name()), + )) + .alias(f.name()) + }) + .collect::<Vec<_>>(), + )? + .collect() + .await?, + // max aggregation + self.clone() + .aggregate( + vec![], + fields_iter + .clone() + .filter(|f| { + !matches!(f.data_type(), DataType::Binary | DataType::Boolean) + }) + .map(|f| datafusion_expr::max(col(f.name())).alias(f.name())) + .collect::<Vec<_>>(), + )? + .collect() + .await?, + // min aggregation + self.clone() + .aggregate( + vec![], + fields_iter + .clone() + .filter(|f| { + !matches!(f.data_type(), DataType::Binary | DataType::Boolean) + }) + .map(|f| datafusion_expr::min(col(f.name())).alias(f.name())) + .collect::<Vec<_>>(), + )? + .collect() + .await?, + ]; + + let mut array_ref_vec: Vec<ArrayRef> = vec![]; + for field in fields_iter { + let mut array_datas = vec![]; + for record_batch in describe_record_batch.iter() { + let column = record_batch.get(0).unwrap().column_by_name(field.name()); + match column { + Some(c) => { + if field.data_type().is_numeric() { + array_datas.push(cast(c, &DataType::Float64)?); + } else { + array_datas.push(cast(c, &DataType::Utf8)?); + } + } + //if None mean the column cannot be min/max aggregation + None => { + array_datas.push(Arc::new(StringArray::from_slice(["null"]))); + } + } + } + + array_ref_vec.push(concat( + array_datas + .iter() + .map(|af| af.as_ref()) + .collect::<Vec<_>>() + .as_slice(), + )?); + } + + //insert first column with function names + array_ref_vec.insert( + 0, + Arc::new(StringArray::from_slice( + supported_describe_functions.clone(), + )), + ); + + let describe_record_batch = Review Comment: 👍 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org