neilconway commented on code in PR #21362:
URL: https://github.com/apache/datafusion/pull/21362#discussion_r3036954195
##########
datafusion/common/src/functional_dependencies.rs:
##########
@@ -590,6 +590,46 @@ pub fn get_required_group_by_exprs_indices(
.collect()
}
+/// Returns indices for the minimal subset of ORDER BY expressions that are
+/// functionally equivalent to the original set of ORDER BY expressions.
+pub fn get_required_sort_exprs_indices(
+ schema: &DFSchema,
+ sort_expr_names: &[String],
+) -> Option<Vec<usize>> {
+ let dependencies = schema.functional_dependencies();
+ let field_names = schema.field_names();
+ let sort_expr_indices = sort_expr_names
+ .iter()
+ .map(|sort_expr_name| {
+ field_names
+ .iter()
+ .position(|field_name| field_name == sort_expr_name)
+ })
+ .collect::<Option<Vec<_>>>()?;
Review Comment:
Possible improvement: as written, the optimization bails out entirely if
there are any ORDER BY expressions that don't match column names. That's a bit
conservative; e.g.,
```sql
SELECT deptno, SUM(sal) AS total_sal
FROM emp
GROUP BY deptno
ORDER BY deptno, total_sal, ABS(deptno)
```
We could still simplify this to `ORDER BY deptno, ABS(deptno)`. I'd guess
this case isn't super common in practice though?
##########
datafusion/optimizer/src/eliminate_duplicated_expr.rs:
##########
@@ -76,12 +76,50 @@ impl OptimizerRule for EliminateDuplicatedExpr {
.map(|wrapper| wrapper.0)
.collect();
- let transformed = if len != unique_exprs.len() {
+ let sort_expr_names = unique_exprs
+ .iter()
+ .map(|sort_expr| sort_expr.expr.schema_name().to_string())
+ .collect::<Vec<_>>();
+ let required_indices = get_required_sort_exprs_indices(
+ sort.input.schema().as_ref(),
+ &sort_expr_names,
+ );
+
+ let (unique_exprs, fd_transformed) =
+ if let Some(required_indices) = required_indices {
+ if required_indices.len() != unique_exprs.len() {
+ (
+ required_indices
+ .into_iter()
+ .map(|idx| unique_exprs[idx].clone())
+ .collect(),
+ true,
+ )
+ } else {
+ (unique_exprs, false)
+ }
+ } else {
+ (unique_exprs, false)
+ };
+
+ let transformed = if len != unique_exprs.len() ||
fd_transformed {
Transformed::yes
} else {
Transformed::no
};
Review Comment:
Could be simplified, e.g.,:
```suggestion
let unique_exprs = match required_indices {
Some(indices) if indices.len() < unique_exprs.len() =>
indices
.into_iter()
.map(|idx| unique_exprs[idx].clone())
.collect(),
_ => unique_exprs,
};
let transformed = if len != unique_exprs.len() {
Transformed::yes
} else {
Transformed::no
};
```
##########
datafusion/optimizer/src/eliminate_duplicated_expr.rs:
##########
@@ -76,12 +76,50 @@ impl OptimizerRule for EliminateDuplicatedExpr {
.map(|wrapper| wrapper.0)
.collect();
- let transformed = if len != unique_exprs.len() {
+ let sort_expr_names = unique_exprs
+ .iter()
+ .map(|sort_expr| sort_expr.expr.schema_name().to_string())
+ .collect::<Vec<_>>();
+ let required_indices = get_required_sort_exprs_indices(
+ sort.input.schema().as_ref(),
+ &sort_expr_names,
+ );
+
+ let (unique_exprs, fd_transformed) =
+ if let Some(required_indices) = required_indices {
+ if required_indices.len() != unique_exprs.len() {
+ (
+ required_indices
+ .into_iter()
+ .map(|idx| unique_exprs[idx].clone())
+ .collect(),
+ true,
+ )
+ } else {
+ (unique_exprs, false)
+ }
+ } else {
+ (unique_exprs, false)
+ };
+
+ let transformed = if len != unique_exprs.len() ||
fd_transformed {
Transformed::yes
} else {
Transformed::no
};
+ if unique_exprs.is_empty() {
Review Comment:
`unique_exprs` can't be empty because FDs can never get us down to zero sort
columns, right?
##########
datafusion/optimizer/src/eliminate_duplicated_expr.rs:
##########
@@ -175,4 +214,40 @@ mod tests {
TableScan: test
")
}
+
+ #[test]
+ fn eliminate_fd_redundant_sort_expr() -> Result<()> {
+ let table_scan = test_table_scan().unwrap();
+ let plan = LogicalPlanBuilder::from(table_scan)
+ .aggregate(vec![col("a")], vec![sum(col("b")).alias("total_sal")])?
+ .sort(vec![
+ col("a").sort(true, true),
+ col("total_sal").sort(true, true),
+ ])?
+ .build()?;
+
+ assert_optimized_plan_equal!(plan, @r"
+ Sort: test.a ASC NULLS FIRST
+ Aggregate: groupBy=[[test.a]], aggr=[[sum(test.b) AS total_sal]]
+ TableScan: test
+ ")
+ }
+
+ #[test]
+ fn keep_order_by_when_dependency_comes_later() -> Result<()> {
+ let table_scan = test_table_scan().unwrap();
+ let plan = LogicalPlanBuilder::from(table_scan)
+ .aggregate(vec![col("a")], vec![sum(col("b")).alias("total_sal")])?
+ .sort(vec![
+ col("total_sal").sort(true, true),
+ col("a").sort(true, true),
+ ])?
+ .build()?;
+
+ assert_optimized_plan_equal!(plan, @r"
+ Sort: total_sal ASC NULLS FIRST, test.a ASC NULLS FIRST
+ Aggregate: groupBy=[[test.a]], aggr=[[sum(test.b) AS total_sal]]
+ TableScan: test
+ ")
+ }
Review Comment:
Personally my preference is always to add tests as SLT if all we're doing is
stuff like checking `EXPLAIN` output, because it is easier to read, can be
evaluated in parallel, and doesn't have to be compiled.
##########
datafusion/sqllogictest/test_files/window.slt:
##########
Review Comment:
This comment needs updating.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]