This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new b5e034b  Remove unnecessary projection in logical plan optimization phase (#747)
b5e034b is described below

commit b5e034b1a4f47a47de68d176b98042a1e4df7d58
Author: Ruihang Xia <[email protected]>
AuthorDate: Tue Jul 20 00:29:27 2021 +0800

    Remove unnecessary projection in logical plan optimization phase (#747)
    
    * eliminate super-set project
    
    Signed-off-by: Ruihang Xia <[email protected]>
    
    * keep projection right before table scan
    
    Signed-off-by: Ruihang Xia <[email protected]>
    
    * tidy
    
    Signed-off-by: Ruihang Xia <[email protected]>
---
 datafusion/src/optimizer/projection_push_down.rs | 69 ++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 3 deletions(-)

diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs
index 0272b9f..089dca2 100644
--- a/datafusion/src/optimizer/projection_push_down.rs
+++ b/datafusion/src/optimizer/projection_push_down.rs
@@ -173,7 +173,17 @@ fn optimize_plan(
                 true,
                 execution_props,
             )?;
-            if new_fields.is_empty() {
+
+            let new_required_columns_optimized = new_input
+                .schema()
+                .fields()
+                .iter()
+                .map(|f| f.qualified_column())
+                .collect::<HashSet<Column>>();
+
+            if new_fields.is_empty()
+                || (has_projection && &new_required_columns_optimized == required_columns)
+            {
                 // no need for an expression at all
                 Ok(new_input)
             } else {
@@ -497,6 +507,60 @@ mod tests {
     }
 
     #[test]
+    fn redundunt_project() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("a"), col("b"), col("c")])?
+            .project(vec![col("a"), col("c"), col("b")])?
+            .build()?;
+        let expected = "Projection: #test.a, #test.c, #test.b\
+        \n  TableScan: test projection=Some([0, 1, 2])";
+
+        assert_optimized_plan_eq(&plan, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn reorder_projection() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("c"), col("b"), col("a")])?
+            .build()?;
+        let expected = "Projection: #test.c, #test.b, #test.a\
+        \n  TableScan: test projection=Some([0, 1, 2])";
+
+        assert_optimized_plan_eq(&plan, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn noncontiguous_redundunt_projection() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("c"), col("b"), col("a")])?
+            .filter(col("c").gt(lit(1)))?
+            .project(vec![col("c"), col("a"), col("b")])?
+            .filter(col("b").gt(lit(1)))?
+            .filter(col("a").gt(lit(1)))?
+            .project(vec![col("a"), col("c"), col("b")])?
+            .build()?;
+        let expected = "Projection: #test.a, #test.c, #test.b\
+        \n  Filter: #test.a Gt Int32(1)\
+        \n    Filter: #test.b Gt Int32(1)\
+        \n      Filter: #test.c Gt Int32(1)\
+        \n        TableScan: test projection=Some([0, 1, 2])";
+
+        assert_optimized_plan_eq(&plan, expected);
+
+        Ok(())
+    }
+
+    #[test]
     fn join_schema_trim_full_join_column_projection() -> Result<()> {
         let table_scan = test_table_scan()?;
 
@@ -812,8 +876,7 @@ mod tests {
 
         assert_fields_eq(&plan, vec!["c", "a", "MAX(test.b)"]);
 
-        let expected = "\
-        Projection: #test.c, #test.a, #MAX(test.b)\
+        let expected = "Projection: #test.c, #test.a, #MAX(test.b)\
         \n  Filter: #test.c Gt Int32(1)\
         \n    Aggregate: groupBy=[[#test.a, #test.c]], aggr=[[MAX(#test.b)]]\
         \n      TableScan: test projection=Some([0, 1, 2])";

Reply via email to