drusso commented on a change in pull request #8836:
URL: https://github.com/apache/arrow/pull/8836#discussion_r536760938



##########
File path: rust/datafusion/src/sql/utils.rs
##########
@@ -0,0 +1,305 @@
+use crate::error::{DataFusionError, Result};
+use crate::logical_plan::{Expr, LogicalPlan};
+use arrow::datatypes::Schema;
+
+/// Resolves an `Expr::Wildcard` to a collection of `Expr::Column`'s.
+pub(crate) fn expand_wildcard(expr: &Expr, schema: &Schema) -> Vec<Expr> {
+    match expr {
+        Expr::Wildcard => schema
+            .fields()
+            .iter()
+            .map(|f| Expr::Column(f.name().to_string()))
+            .collect::<Vec<Expr>>(),
+        _ => vec![expr.clone()],
+    }
+}
+
+/// Collect all deeply nested `Expr::AggregateFunction` and
+/// `Expr::AggregateUDF`. They are returned in order of occurrence (depth
+/// first), with duplicates omitted.
+pub(crate) fn find_aggregate_exprs(exprs: &Vec<Expr>) -> Vec<Expr> {
+    find_exprs_in_exprs(exprs, &|nested_expr| match nested_expr {
+        Expr::AggregateFunction { .. } | Expr::AggregateUDF { .. } => true,
+        _ => false,
+    })
+}
+
+/// Collect all deeply nested `Expr::Column`'s. They are returned in order of
+/// appearance (depth first), with duplicates omitted.
+pub(crate) fn find_column_exprs(exprs: &Vec<Expr>) -> Vec<Expr> {
+    find_exprs_in_exprs(exprs, &|nested_expr| match nested_expr {
+        Expr::Column(_) => true,
+        _ => false,
+    })
+}
+
+/// Search the provided `Expr`'s, and all of their nested `Expr`, for any that
+/// pass the provided test. The returned `Expr`'s are deduplicated and returned
+/// in order of appearance (depth first).
+fn find_exprs_in_exprs<F>(exprs: &Vec<Expr>, test_fn: &F) -> Vec<Expr>
+where
+    F: Fn(&Expr) -> bool,
+{
+    exprs
+        .iter()
+        .flat_map(|expr| find_exprs_in_expr(expr, test_fn))
+        .fold(vec![], |mut acc, expr| {
+            if !acc.contains(&expr) {
+                acc.push(expr)
+            }
+            acc
+        })
+}
+
+/// Search an `Expr`, and all of its nested `Expr`'s, for any that pass the
+/// provided test. The returned `Expr`'s are deduplicated and returned in order
+/// of appearance (depth first).
+fn find_exprs_in_expr<F>(expr: &Expr, test_fn: &F) -> Vec<Expr>
+where
+    F: Fn(&Expr) -> bool,
+{
+    let matched_exprs = if test_fn(expr) {
+        vec![expr.clone()]
+    } else {
+        match expr {
+            Expr::AggregateFunction { args, .. } => find_exprs_in_exprs(&args, 
test_fn),
+            Expr::AggregateUDF { args, .. } => find_exprs_in_exprs(&args, 
test_fn),
+            Expr::Alias(nested_expr, _) => {
+                find_exprs_in_expr(nested_expr.as_ref(), test_fn)
+            }
+            Expr::BinaryExpr { left, right, .. } => {
+                let mut matches = vec![];
+                matches.extend(find_exprs_in_expr(left.as_ref(), test_fn));
+                matches.extend(find_exprs_in_expr(right.as_ref(), test_fn));
+                matches
+            }
+            Expr::Case {
+                expr: case_expr_opt,
+                when_then_expr,
+                else_expr: else_expr_opt,
+            } => {
+                let mut matches = vec![];
+
+                if let Some(case_expr) = case_expr_opt {
+                    matches.extend(find_exprs_in_expr(case_expr.as_ref(), 
test_fn));
+                }
+
+                matches.extend(
+                    when_then_expr
+                        .iter()
+                        .flat_map(|(a, b)| vec![a, b])
+                        .flat_map(|expr| find_exprs_in_expr(expr.as_ref(), 
test_fn))
+                        .collect::<Vec<Expr>>(),
+                );
+
+                if let Some(else_expr) = else_expr_opt {
+                    matches.extend(find_exprs_in_expr(else_expr.as_ref(), 
test_fn));
+                }
+
+                matches
+            }
+            Expr::Cast {
+                expr: nested_expr, ..
+            } => find_exprs_in_expr(nested_expr.as_ref(), test_fn),
+            Expr::IsNotNull(nested_expr) => {
+                find_exprs_in_expr(nested_expr.as_ref(), test_fn)
+            }
+            Expr::IsNull(nested_expr) => {
+                find_exprs_in_expr(nested_expr.as_ref(), test_fn)
+            }
+            Expr::Not(nested_expr) => find_exprs_in_expr(nested_expr.as_ref(), 
test_fn),
+            Expr::ScalarFunction { args, .. } => find_exprs_in_exprs(&args, 
test_fn),
+            Expr::ScalarUDF { args, .. } => find_exprs_in_exprs(&args, 
test_fn),
+            Expr::Sort {
+                expr: nested_expr, ..
+            } => find_exprs_in_expr(nested_expr.as_ref(), test_fn),
+
+            // These expressions don't nest other expressions.
+            Expr::Column(_)
+            | Expr::Literal(_)
+            | Expr::ScalarVariable(_)
+            | Expr::Wildcard => vec![],
+        }
+    };
+
+    matched_exprs.into_iter().fold(vec![], |mut acc, expr| {
+        if !acc.contains(&expr) {

Review comment:
       The function maintains the order in which the expressions appear in the 
query, which wouldn't be possible with a `HashSet`.
   
   Given a query:
   
   ```
   SELECT a, b, MIN(c), MIN(d) FROM t GROUP BY a, b
   ```
   The aggregation plan would be as follows. Note the ordering of the columns 
is the same as authored in the query. 
   
   ```
   Aggregate: groupBy=[[#a, #b]], aggr=[[MIN(#c), MIN(#d)]]
   ```
   
   Of course, maintaining ordering is not strictly required. The plans are 
logically equivalent regardless of the column ordering. However, maintaining 
the order does make the plans a little more user-friendly. 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to