alamb commented on code in PR #4487:
URL: https://github.com/apache/arrow-datafusion/pull/4487#discussion_r1040000009


##########
datafusion/optimizer/src/push_down_projection.rs:
##########
@@ -94,23 +94,19 @@ fn optimize_plan(
 
             let mut new_expr = Vec::new();
             let mut new_fields = Vec::new();
+            let mut new_required_columns = HashSet::new();

Review Comment:
   Since this is shadowing a variable in the outer scope, I wonder if it would 
make sense to comment why we are starting with an empty set in this case rather 
than the pushed down columns?



##########
datafusion/core/tests/sql/explain_analyze.rs:
##########
@@ -744,7 +744,7 @@ async fn test_physical_plan_display_indent_multi_children() 
{
         "        RepartitionExec: partitioning=Hash([Column { name: \"c2\", 
index: 0 }], 9000)",
         "          ProjectionExec: expr=[c1@0 as c2]",
         "            RepartitionExec: partitioning=RoundRobinBatch(9000)",
-        "              CsvExec: files={1 group: 
[[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, 
projection=[c1, c2]",
+        "              CsvExec: files={1 group: 
[[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, 
projection=[c1]",

Review Comment:
   👍 



##########
datafusion/core/tests/sql/predicates.rs:
##########
@@ -589,19 +588,15 @@ async fn multiple_or_predicates() -> Result<()> {
     // factored out and appear only once in the following plan
     let expected = vec![
         "Explain [plan_type:Utf8, plan:Utf8]",
-        "  Projection: #lineitem.l_partkey [l_partkey:Int64]",
-        "    Projection: #part.p_size >= Int32(1) AS #part.p_size >= 
Int32(1)Int32(1)#part.p_size, #lineitem.l_partkey, #lineitem.l_quantity, 
#part.p_brand, #part.p_size [#part.p_size >= 
Int32(1)Int32(1)#part.p_size:Boolean;N, l_partkey:Int64, 
l_quantity:Decimal128(15, 2), p_brand:Utf8, p_size:Int32]",
-        "      Filter: #part.p_brand = Utf8(\"Brand#12\") AND 
#lineitem.l_quantity >= Decimal128(Some(100),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(1100),15,2) AND #part.p_size <= Int32(5) OR #part.p_brand = 
Utf8(\"Brand#23\") AND #lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND 
#lineitem.l_quantity <= Decimal128(Some(2000),15,2) AND #part.p_size <= 
Int32(10) OR #part.p_brand = Utf8(\"Brand#34\") AND #lineitem.l_quantity >= 
Decimal128(Some(2000),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(3000),15,2) AND #part.p_size <= Int32(15) [l_partkey:Int64, 
l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8, p_size:Int32]",
-        "        Inner Join: #lineitem.l_partkey = #part.p_partkey 
[l_partkey:Int64, l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8, 
p_size:Int32]",
-        "          Filter: #lineitem.l_quantity >= Decimal128(Some(100),15,2) 
AND #lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR #lineitem.l_quantity 
>= Decimal128(Some(1000),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(2000),15,2) OR #lineitem.l_quantity >= 
Decimal128(Some(2000),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(3000),15,2) [l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
-        "            TableScan: lineitem projection=[l_partkey, l_quantity], 
partial_filters=[#lineitem.l_quantity >= Decimal128(Some(100),15,2) AND 
#lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR #lineitem.l_quantity >= 
Decimal128(Some(1000),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(2000),15,2) OR #lineitem.l_quantity >= 
Decimal128(Some(2000),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(3000),15,2)] [l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
-        "          Filter: #part.p_size >= Int32(1) AND #part.p_brand = 
Utf8(\"Brand#12\") AND #part.p_size <= Int32(5) OR #part.p_brand = 
Utf8(\"Brand#23\") AND #part.p_size <= Int32(10) OR #part.p_brand = 
Utf8(\"Brand#34\") AND #part.p_size <= Int32(15) [p_partkey:Int64, 
p_brand:Utf8, p_size:Int32]",
-        "            TableScan: part projection=[p_partkey, p_brand, p_size], 
partial_filters=[#part.p_size >= Int32(1), #part.p_brand = Utf8(\"Brand#12\") 
AND #part.p_size <= Int32(5) OR #part.p_brand = Utf8(\"Brand#23\") AND 
#part.p_size <= Int32(10) OR #part.p_brand = Utf8(\"Brand#34\") AND 
#part.p_size <= Int32(15)] [p_partkey:Int64, p_brand:Utf8, p_size:Int32]",
         "  Projection: lineitem.l_partkey [l_partkey:Int64]",
-        "    Filter: part.p_brand = Utf8(\"Brand#12\") AND lineitem.l_quantity 
>= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(1100),15,2) AND CAST(part.p_size AS Int64) BETWEEN Int64(1) AND 
Int64(5) OR part.p_brand = Utf8(\"Brand#23\") AND lineitem.l_quantity >= 
Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(2000),15,2) AND CAST(part.p_size AS Int64) BETWEEN Int64(1) AND 
Int64(10) OR part.p_brand = Utf8(\"Brand#34\") AND lineitem.l_quantity >= 
Decimal128(Some(2000),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(3000),15,2) AND CAST(part.p_size AS Int64) BETWEEN Int64(1) AND 
Int64(15) [l_partkey:Int64, l_quantity:Decimal128(15, 2), p_partkey:Int64, 
p_brand:Utf8, p_size:Int32]",
+        "    Filter: part.p_brand = Utf8(\"Brand#12\") AND lineitem.l_quantity 
>= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(1100),15,2) AND part.p_size <= Int32(5) OR part.p_brand = 
Utf8(\"Brand#23\") AND lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND 
lineitem.l_quantity <= Decimal128(Some(2000),15,2) AND part.p_size <= Int32(10) 
OR part.p_brand = Utf8(\"Brand#34\") AND lineitem.l_quantity >= 
Decimal128(Some(2000),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(3000),15,2) AND part.p_size <= Int32(15) [l_partkey:Int64, 
l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8, p_size:Int32]",
         "      Inner Join: lineitem.l_partkey = part.p_partkey 
[l_partkey:Int64, l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8, 
p_size:Int32]",
-        "        TableScan: lineitem projection=[l_partkey, l_quantity] 
[l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
-        "        TableScan: part projection=[p_partkey, p_brand, p_size] 
[p_partkey:Int64, p_brand:Utf8, p_size:Int32]",
+        "        Projection: lineitem.l_partkey, lineitem.l_quantity 
[l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
+        "          Filter: (lineitem.l_quantity >= 
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity AND 
lineitem.l_quantity <= 
Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity OR 
lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantity AND 
lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity OR 
lineitem.l_quantity >= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity AND 
lineitem.l_quantity <= 
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity) AND 
(lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
 >= Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity OR 
lineitem.l_quantity >= Decimal128(Some(2000),15,2)Decimal128(Some(20
 00),15,2)lineitem.l_quantity) AND (lineitem.l_quantity >= 
Decimal128(Some(100),15,2) OR lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
 >= Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity OR 
lineitem.l_quantity <= 
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity) AND 
(lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity
 >= Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity OR 
lineitem.l_quantity >= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity) AND 
(lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)Decimal12
 8(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity >= 
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity OR 
lineitem.l_quantity <= 
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity) AND 
(lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
 <= Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity 
OR lineitem.l_quantity >= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity) AND 
(lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
 <= Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity 
OR lineitem.l_quantity <= Decimal128(Some(3000),15,2)Decima
 l128(Some(3000),15,2)lineitem.l_quantity) AND (lineitem.l_quantity <= 
Decimal128(Some(1100),15,2) OR lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity
 <= Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity 
OR lineitem.l_quantity >= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity) AND 
(lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity
 <= Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity 
OR lineitem.l_quantity <= 
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity) 
[lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)lineitem.l_quantity <= Decimal128(Some(2000)
 ,15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity <= 
Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity:Boolean;N,
 lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
 <= 
Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity:Boolean;N,
 lineitem.l_quantity <= 
Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity:Boolean;N,
 lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity:Boolean;N,
 lineitem.l_quantity <= 
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity:Boolean;N,
 lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)lineitem.l_quantity <= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineit
 em.l_quantity >= 
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity:Boolean;N,
 lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
 >= 
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity:Boolean;N,
 lineitem.l_quantity >= 
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity:Boolean;N,
 lineitem.l_quantity >= 
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantity:Boolean;N,
 lineitem.l_quantity >= 
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity:Boolean;N,
 l_partkey:Int64, l_quantity:Decimal128(15, 2)]",

Review Comment:
   this filter looks pretty nasty -- perhaps it is related to 
https://github.com/apache/arrow-datafusion/pull/3903 🤔 
   
   On the other hand, there is a lot of pushed down filtering now 👍 
   cc @Ted-Jiang 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to