jiangzhx commented on code in PR #5226:
URL: https://github.com/apache/arrow-datafusion/pull/5226#discussion_r1118296489


##########
datafusion/core/src/dataframe.rs:
##########
@@ -302,6 +306,177 @@ impl DataFrame {
         ))
     }
 
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes 
at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to 
pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", 
CsvReadOptions::new()).await?;    
+    /// df.describe().await?;
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn describe(self) -> Result<()> {
+        Ok(print_batches(
+            &self.clone().collect_describe().await.unwrap(),
+        )?)
+    }
+
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes 
at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to 
pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", 
CsvReadOptions::new()).await?;    
+    /// df.collect_describe().await.unwrap();
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn collect_describe(self) -> Result<Vec<RecordBatch>> {
+        //the functions now supported
+        let supported_describe_functions = vec!["count", "null_count", "max", 
"min"]; //"count",  "max", "min",
+
+        let fields_iter = self.schema().fields().iter();
+
+        //define describe column
+        let mut describe_schemas = fields_iter
+            .clone()
+            .map(|field| {
+                if field.data_type().is_numeric() {
+                    Field::new(field.name(), DataType::Float64, true)
+                } else {
+                    Field::new(field.name(), DataType::Utf8, true)

Review Comment:
   The `describe` method returns a schema like this:
   <img width="1392" alt="image" 
src="https://user-images.githubusercontent.com/494507/221468185-82ede8f7-9efd-4678-8fe8-bf5af713256b.png">
   
   
   Each column should have a single datatype.
   For example:
   - `bool_col`: `count`/`null_count` return Int64, but `min`/`max` raise an 
error, so the `bool_col` datatype is `Utf8`.
   - `float_col`: `count`/`null_count` return Int64 and `min`/`max` return a 
float, so the `float_col` datatype is `Float64`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to