This is an automated email from the ASF dual-hosted git repository.
jakevin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new d0bd28eef feat: eliminate the duplicated sort keys in Order By clause
(#5462)
d0bd28eef is described below
commit d0bd28eef34ca001d5e43d2732761a1b4cf09c71
Author: jakevin <[email protected]>
AuthorDate: Mon Mar 6 22:55:16 2023 +0800
feat: eliminate the duplicated sort keys in Order By clause (#5462)
---
.../core/tests/sqllogictests/test_files/order.slt | 117 ++++++++++++++++++++-
.../optimizer/src/eliminate_duplicated_expr.rs | 103 ++++++++++++++++++
datafusion/optimizer/src/lib.rs | 7 +-
datafusion/optimizer/src/optimizer.rs | 2 +
4 files changed, 225 insertions(+), 4 deletions(-)
diff --git a/datafusion/core/tests/sqllogictests/test_files/order.slt
b/datafusion/core/tests/sqllogictests/test_files/order.slt
index 6c2bb3abc..d42d2cf62 100644
--- a/datafusion/core/tests/sqllogictests/test_files/order.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/order.slt
@@ -106,7 +106,6 @@ SELECT arrow_typeof(c1), arrow_typeof(c2), arrow_typeof(c3)
FROM test LIMIT 1;
----
Int32 Int64 Boolean
-
query II
SELECT c1, c2 FROM test ORDER BY c1 DESC, c2 ASC
----
@@ -155,6 +154,122 @@ SELECT c1, c2 FROM test ORDER BY c1 DESC, c2 ASC
0 9
0 10
+# eliminate duplicated sorted expr
+query TT
+explain SELECT c1, c2 FROM aggregate_test_100 ORDER BY c2, c3, c2
+----
+logical_plan
+Projection: aggregate_test_100.c1, aggregate_test_100.c2
+ Sort: aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS
LAST
+ TableScan: aggregate_test_100 projection=[c1, c2, c3]
+physical_plan
+ProjectionExec: expr=[c1@0 as c1, c2@1 as c2]
+ SortExec: expr=[c2@1 ASC NULLS LAST,c3@2 ASC NULLS LAST]
+ CsvExec: files={1 group:
[[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true,
limit=None, projection=[c1, c2, c3]
+
+query II
+SELECT c2, c3 FROM aggregate_test_100 ORDER BY c2, c3, c2
+----
+1 -99
+1 -98
+1 -85
+1 -72
+1 -56
+1 -25
+1 -24
+1 -8
+1 -5
+1 12
+1 29
+1 36
+1 38
+1 41
+1 54
+1 57
+1 70
+1 71
+1 83
+1 103
+1 120
+1 125
+2 -117
+2 -107
+2 -106
+2 -61
+2 -60
+2 -60
+2 -48
+2 -43
+2 -29
+2 1
+2 29
+2 31
+2 45
+2 49
+2 52
+2 52
+2 63
+2 68
+2 93
+2 97
+2 113
+2 122
+3 -101
+3 -95
+3 -76
+3 -72
+3 -12
+3 -2
+3 13
+3 13
+3 14
+3 17
+3 17
+3 22
+3 71
+3 73
+3 77
+3 97
+3 104
+3 112
+3 123
+4 -117
+4 -111
+4 -101
+4 -90
+4 -79
+4 -59
+4 -56
+4 -54
+4 -53
+4 -38
+4 3
+4 5
+4 17
+4 30
+4 47
+4 55
+4 65
+4 73
+4 74
+4 96
+4 97
+4 102
+4 123
+5 -101
+5 -94
+5 -86
+5 -82
+5 -59
+5 -44
+5 -40
+5 -31
+5 -5
+5 36
+5 62
+5 64
+5 68
+5 118
# sort_empty
diff --git a/datafusion/optimizer/src/eliminate_duplicated_expr.rs
b/datafusion/optimizer/src/eliminate_duplicated_expr.rs
new file mode 100644
index 000000000..5a882108e
--- /dev/null
+++ b/datafusion/optimizer/src/eliminate_duplicated_expr.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::optimizer::ApplyOrder;
+use crate::{OptimizerConfig, OptimizerRule};
+use datafusion_common::Result;
+use datafusion_expr::logical_plan::LogicalPlan;
+use datafusion_expr::Sort;
+use hashbrown::HashSet;
+
+/// Optimization rule that eliminate duplicated expr.
+#[derive(Default)]
+pub struct EliminateDuplicatedExpr;
+
+impl EliminateDuplicatedExpr {
+ #[allow(missing_docs)]
+ pub fn new() -> Self {
+ Self {}
+ }
+}
+
+impl OptimizerRule for EliminateDuplicatedExpr {
+ fn try_optimize(
+ &self,
+ plan: &LogicalPlan,
+ _config: &dyn OptimizerConfig,
+ ) -> Result<Option<LogicalPlan>> {
+ match plan {
+ LogicalPlan::Sort(sort) => {
+ // dedup sort.expr and keep order
+ let mut dedup_expr = Vec::new();
+ let mut dedup_set = HashSet::new();
+ for expr in &sort.expr {
+ if !dedup_set.contains(expr) {
+ dedup_expr.push(expr);
+ dedup_set.insert(expr.clone());
+ }
+ }
+ if dedup_expr.len() == sort.expr.len() {
+ Ok(None)
+ } else {
+ Ok(Some(LogicalPlan::Sort(Sort {
+ expr:
dedup_expr.into_iter().cloned().collect::<Vec<_>>(),
+ input: sort.input.clone(),
+ fetch: sort.fetch,
+ })))
+ }
+ }
+ _ => Ok(None),
+ }
+ }
+
+ fn name(&self) -> &str {
+ "eliminate_duplicated_expr"
+ }
+
+ fn apply_order(&self) -> Option<ApplyOrder> {
+ Some(ApplyOrder::TopDown)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::test::*;
+ use datafusion_expr::{col, logical_plan::builder::LogicalPlanBuilder};
+ use std::sync::Arc;
+
+ fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) ->
Result<()> {
+ crate::test::assert_optimized_plan_eq(
+ Arc::new(EliminateDuplicatedExpr::new()),
+ plan,
+ expected,
+ )
+ }
+
+ #[test]
+ fn eliminate_sort_expr() -> Result<()> {
+ let table_scan = test_table_scan().unwrap();
+ let plan = LogicalPlanBuilder::from(table_scan)
+ .sort(vec![col("a"), col("a"), col("b"), col("c")])?
+ .limit(5, Some(10))?
+ .build()?;
+ let expected = "Limit: skip=5, fetch=10\
+ \n Sort: test.a, test.b, test.c\
+ \n TableScan: test";
+ assert_optimized_plan_eq(&plan, expected)
+ }
+}
diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs
index ca0611d4e..4bbbb4645 100644
--- a/datafusion/optimizer/src/lib.rs
+++ b/datafusion/optimizer/src/lib.rs
@@ -20,6 +20,7 @@ pub mod common_subexpr_eliminate;
pub mod decorrelate_where_exists;
pub mod decorrelate_where_in;
pub mod eliminate_cross_join;
+pub mod eliminate_duplicated_expr;
pub mod eliminate_filter;
pub mod eliminate_limit;
pub mod eliminate_outer_join;
@@ -33,17 +34,17 @@ pub mod propagate_empty_relation;
pub mod push_down_filter;
pub mod push_down_limit;
pub mod push_down_projection;
+pub mod replace_distinct_aggregate;
+pub mod rewrite_disjunctive_predicate;
pub mod scalar_subquery_to_join;
pub mod simplify_expressions;
pub mod single_distinct_to_groupby;
pub mod type_coercion;
+pub mod unwrap_cast_in_comparison;
pub mod utils;
-pub mod replace_distinct_aggregate;
-pub mod rewrite_disjunctive_predicate;
#[cfg(test)]
pub mod test;
-pub mod unwrap_cast_in_comparison;
pub use optimizer::{OptimizerConfig, OptimizerContext, OptimizerRule};
pub use utils::optimize_children;
diff --git a/datafusion/optimizer/src/optimizer.rs
b/datafusion/optimizer/src/optimizer.rs
index 5077bed90..c1baa25d4 100644
--- a/datafusion/optimizer/src/optimizer.rs
+++ b/datafusion/optimizer/src/optimizer.rs
@@ -21,6 +21,7 @@ use crate::common_subexpr_eliminate::CommonSubexprEliminate;
use crate::decorrelate_where_exists::DecorrelateWhereExists;
use crate::decorrelate_where_in::DecorrelateWhereIn;
use crate::eliminate_cross_join::EliminateCrossJoin;
+use crate::eliminate_duplicated_expr::EliminateDuplicatedExpr;
use crate::eliminate_filter::EliminateFilter;
use crate::eliminate_limit::EliminateLimit;
use crate::eliminate_outer_join::EliminateOuterJoin;
@@ -222,6 +223,7 @@ impl Optimizer {
Arc::new(SimplifyExpressions::new()),
Arc::new(MergeProjection::new()),
Arc::new(RewriteDisjunctivePredicate::new()),
+ Arc::new(EliminateDuplicatedExpr::new()),
Arc::new(EliminateFilter::new()),
Arc::new(EliminateCrossJoin::new()),
Arc::new(CommonSubexprEliminate::new()),