alamb commented on code in PR #12943: URL: https://github.com/apache/datafusion/pull/12943#discussion_r1817818906
########## datafusion/optimizer/src/push_down_filter.rs: ########## @@ -628,6 +627,103 @@ fn infer_join_predicates( .collect::<Result<Vec<_>>>() } +/// Check whether the given expression depends only on the columns in the given schema. +/// Meaning for a certain Review Comment: this sentence appears to end abruptly ########## datafusion/optimizer/src/push_down_filter.rs: ########## @@ -628,6 +627,103 @@ fn infer_join_predicates( .collect::<Result<Vec<_>>>() } +/// Check whether the given expression depends only on the columns in the given schema. +/// Meaning for a certain +fn expr_depends_only_on_schema(schema: &DFSchema, expr: &Expr) -> bool { + let mut is_applicable = true; + expr.apply(|expr| match expr { + Expr::Column(column) => { + is_applicable &= schema.has_column(column); + if is_applicable { + Ok(TreeNodeRecursion::Jump) + } else { + Ok(TreeNodeRecursion::Stop) + } + } + Expr::Literal(_) + | Expr::Alias(_) + | Expr::OuterReferenceColumn(_, _) + | Expr::ScalarVariable(_, _) + | Expr::Not(_) + | Expr::IsNotNull(_) + | Expr::IsNull(_) + | Expr::IsTrue(_) + | Expr::IsFalse(_) + | Expr::IsUnknown(_) + | Expr::IsNotTrue(_) + | Expr::IsNotFalse(_) + | Expr::IsNotUnknown(_) + | Expr::Negative(_) + | Expr::Cast(_) + | Expr::TryCast(_) + | Expr::BinaryExpr(_) + | Expr::Between(_) + | Expr::Like(_) + | Expr::SimilarTo(_) + | Expr::InList(_) + | Expr::Exists(_) + | Expr::InSubquery(_) + | Expr::ScalarSubquery(_) + | Expr::GroupingSet(_) + | Expr::Case(_) => Ok(TreeNodeRecursion::Continue), + + Expr::ScalarFunction(scalar_function) => { + match scalar_function.func.signature().volatility { + Volatility::Immutable => Ok(TreeNodeRecursion::Continue), + // TODO: Stable functions could be `applicable`, but that would require access to the context + Volatility::Stable | Volatility::Volatile => { + is_applicable = false; + Ok(TreeNodeRecursion::Stop) + } + } + } + + // TODO other expressions are not handled yet: + // - AGGREGATE and WINDOW should not end up in filter conditions, except maybe in some edge cases + // - Can `Wildcard` be considered as a `Literal`? + // - ScalarVariable could be `applicable`, but that would require access to the context + Expr::AggregateFunction { .. } + | Expr::WindowFunction { .. } + | Expr::Wildcard { .. } + | Expr::Unnest { .. } + | Expr::Placeholder(_) => { + is_applicable = false; + Ok(TreeNodeRecursion::Stop) + } + }) + .unwrap(); + is_applicable +} + +fn check_if_expr_depends_only_on_distinct_on_columns( + distinct_on: &DistinctOn, + expr: &Expr, +) -> Result<bool> { + let distinct_on_cols: HashSet<&Column> = distinct_on + .on_expr + .iter() + .flat_map(|e| e.column_refs()) + .collect(); + let distinct_on_input_schema = distinct_on.input.schema(); + let distinct_on_qualified_fields: Vec<_> = distinct_on_cols Review Comment: there is a lot of copying going on in this code (`qualified_field_from_column` etc) Is there any way to avoid this? I wonder if we could check each reference in the filter expression and see if it had any column that wasn't in the distinct on clause ########## datafusion/optimizer/src/push_down_filter.rs: ########## @@ -628,6 +627,103 @@ fn infer_join_predicates( .collect::<Result<Vec<_>>>() } +/// Check whether the given expression depends only on the columns in the given schema. +/// Meaning for a certain Review Comment: Also this appears to be a copy of https://github.com/apache/datafusion/blob/b098893a34f83f1a1df290168377d7622938b3f5/datafusion/core/src/datasource/listing/helpers.rs#L55-L54 Perhaps instead of copy/pasting the code, you could move it to a common location (e.g. datafusion-expr)? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org