alamb commented on a change in pull request #2054:
URL: https://github.com/apache/arrow-datafusion/pull/2054#discussion_r836793959
##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1162,6 +1164,117 @@ pub(crate) fn expand_qualified_wildcard(
expand_wildcard(&qualifier_schema, plan)
}
+/// check whether the logical plan we are building is valid
+fn check_plan_invalid(plan: &LogicalPlan) -> Result<()> {
+ match plan {
+ LogicalPlan::Projection(Projection { expr, input, .. })
+ | LogicalPlan::Sort(Sort { expr, input })
+ | LogicalPlan::Window(Window {
+ window_expr: expr,
+ input,
+ ..
+ }) => check_plan_invalid(input)
+ .and_then(|_| check_any_invalid_expr(expr, input.schema())),
+
+ LogicalPlan::Filter(Filter {
+ predicate: expr,
+ input,
+ }) => check_plan_invalid(input)
+ .and_then(|_| check_invalid_expr(expr, input.schema())),
+
+ LogicalPlan::Aggregate(Aggregate {
+ input,
+ group_expr,
+ aggr_expr,
+ ..
+ }) => {
+ let schema = input.schema();
+ check_plan_invalid(input)
+ .and_then(|_| check_any_invalid_expr(group_expr, schema))
+ .and_then(|_| check_any_invalid_expr(aggr_expr, schema))
+ }
+
+ LogicalPlan::Join(Join { left, right, .. })
+ | LogicalPlan::CrossJoin(CrossJoin { left, right, .. }) => {
+ check_plan_invalid(left).and_then(|_| check_plan_invalid(right))
+ }
+
+ LogicalPlan::Repartition(Repartition { input, .. })
+ | LogicalPlan::Limit(Limit { input, .. })
+ | LogicalPlan::Explain(Explain { plan: input, .. })
+ | LogicalPlan::Analyze(Analyze { input, .. }) =>
check_plan_invalid(input),
+
+ LogicalPlan::Union(Union { inputs, .. }) => {
+ inputs.iter().try_for_each(check_plan_invalid)
+ }
+
+ LogicalPlan::TableScan(TableScan {
+ table_name: _,
+ source,
+ projection,
+ projected_schema,
+ filters,
+ limit: _,
+ }) => {
+ if let Some(projection) = projection {
+ if projection.len() > projected_schema.fields().len() {
+ return Err(DataFusionError::Plan(
+ "projection contains columns that doesnt belong to
projected schema"
+ .to_owned(),
+ ));
+ }
+ }
+ let schema = &source.schema().to_dfschema_ref()?;
+ check_any_invalid_expr(filters, schema)
+ }
+ _ => Ok(()),
+ }
+}
+
+/// find first error in the exprs
+fn check_any_invalid_expr(exprs: &[Expr], schema: &DFSchemaRef) -> Result<()> {
+ exprs.iter().try_for_each(|e| check_invalid_expr(e, schema))
+}
+
+/// do some checks for exprs in a logical plan
+fn check_invalid_expr(expr: &Expr, schema: &DFSchemaRef) -> Result<()> {
Review comment:
I think I may have missed it, but as written I don't think this recurses
into the `expr` -- so while it will find nonsense like
```
NOT "foo"
```
I don't think it would find
```
A = 5 OR (NOT "foo")
```
We could use an `ExpressionVisitor` for
thishttps://github.com/apache/arrow-datafusion/blob/634252b44cec8dc904e48926d0aa54feeb4d48af/datafusion/src/logical_plan/expr_visitor.rs#L32-L47
Here is an example of using `ExpressionVisitor` for something similar:
https://github.com/apache/arrow-datafusion/blob/2d6addd4c435123e934ce04564d8cc77bf101c37/datafusion/src/datasource/listing/helpers.rs#L55-L119
##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1395,4 +1508,58 @@ mod tests {
assert!(stringified_plan.should_display(true));
assert!(!stringified_plan.should_display(false));
}
+
+ #[test]
+ fn invalid_plan_with_not_non_boolean_expression() -> Result<()> {
+ let mut plan = LogicalPlanBuilder::scan_empty(
+ Some("employee_csv"),
+ &employee_schema(),
+ Some(vec![0, 1, 2, 3, 4]),
+ )?;
+ plan = plan.project(vec![Expr::Not(Box::new(Expr::Column(Column {
+ relation: None,
+ name: "id".to_string(),
+ })))])?;
Review comment:
I think the "builder" API for expressions makes for much easier reading:
```suggestion
plan = plan.project(vec![col("id").not()])?;
```
##########
File path: datafusion/src/sql/planner.rs
##########
@@ -2347,9 +2347,9 @@ mod tests {
#[test]
fn select_neg_filter() {
let sql = "SELECT id, first_name, last_name \
Review comment:
👍
##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1162,6 +1164,117 @@ pub(crate) fn expand_qualified_wildcard(
expand_wildcard(&qualifier_schema, plan)
}
+/// check whether the logical plan we are building is valid
Review comment:
As a follow on PR, it would be awesome to encapsulate this `LogicalPlan`
tree walking code into some sort of visitor.
##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1395,4 +1508,58 @@ mod tests {
assert!(stringified_plan.should_display(true));
assert!(!stringified_plan.should_display(false));
}
+
+ #[test]
+ fn invalid_plan_with_not_non_boolean_expression() -> Result<()> {
+ let mut plan = LogicalPlanBuilder::scan_empty(
+ Some("employee_csv"),
+ &employee_schema(),
+ Some(vec![0, 1, 2, 3, 4]),
+ )?;
+ plan = plan.project(vec![Expr::Not(Box::new(Expr::Column(Column {
+ relation: None,
+ name: "id".to_string(),
+ })))])?;
+ match plan.build() {
+ Ok(_) => panic!("Unexpect Ok"),
+ Err(e) => {
+ assert_eq!(
+ e.to_string(),
+ "Error during planning: \
+ Invalid Not Expression(NOT #employee_csv.id) -- \
+ Type(Int32) of expression(#employee_csv.id) must be of a
boolean type"
+ )
+ }
+ }
+ Ok(())
+ }
+
+ #[test]
+ fn invalid_plan_cast_utf8_boolean() -> Result<()> {
+ let mut plan = LogicalPlanBuilder::scan_empty(
+ Some("employee_csv"),
+ &employee_schema(),
+ Some(vec![0, 1, 2, 3, 4]),
+ )?;
+ let col = Expr::Column(Column {
+ relation: None,
+ name: "first_name".to_owned(),
+ });
Review comment:
```suggestion
let col = col("first_name");
```
##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1395,4 +1508,58 @@ mod tests {
assert!(stringified_plan.should_display(true));
assert!(!stringified_plan.should_display(false));
}
+
+ #[test]
+ fn invalid_plan_with_not_non_boolean_expression() -> Result<()> {
+ let mut plan = LogicalPlanBuilder::scan_empty(
+ Some("employee_csv"),
+ &employee_schema(),
+ Some(vec![0, 1, 2, 3, 4]),
+ )?;
+ plan = plan.project(vec![Expr::Not(Box::new(Expr::Column(Column {
+ relation: None,
+ name: "id".to_string(),
+ })))])?;
+ match plan.build() {
+ Ok(_) => panic!("Unexpect Ok"),
+ Err(e) => {
+ assert_eq!(
+ e.to_string(),
+ "Error during planning: \
+ Invalid Not Expression(NOT #employee_csv.id) -- \
+ Type(Int32) of expression(#employee_csv.id) must be of a
boolean type"
+ )
+ }
+ }
+ Ok(())
+ }
+
+ #[test]
+ fn invalid_plan_cast_utf8_boolean() -> Result<()> {
+ let mut plan = LogicalPlanBuilder::scan_empty(
+ Some("employee_csv"),
+ &employee_schema(),
+ Some(vec![0, 1, 2, 3, 4]),
+ )?;
+ let col = Expr::Column(Column {
+ relation: None,
+ name: "first_name".to_owned(),
+ });
+ plan = plan.project(vec![Expr::Cast {
+ expr: Box::new(col),
+ data_type: DataType::Boolean,
+ }])?;
+ match plan.build() {
+ Ok(_) => panic!("Unexpect Ok"),
+ Err(e) => {
+ assert_eq!(
+ e.to_string(),
+ "Error during planning: \
+ Invalid Cast Expression(CAST(#employee_csv.first_name AS
Boolean)) -- \
+ Utf8 cannot cast to Boolean"
+ )
+ }
+ }
+ Ok(())
+ }
Review comment:
Can we also please add a test for using wildcards incorrectly -- like
`SELECT .. GROUP BY *` and `SELECT .. GROUP BY foo*`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]