alamb commented on code in PR #4487:
URL: https://github.com/apache/arrow-datafusion/pull/4487#discussion_r1040000009
##########
datafusion/optimizer/src/push_down_projection.rs:
##########
@@ -94,23 +94,19 @@ fn optimize_plan(
let mut new_expr = Vec::new();
let mut new_fields = Vec::new();
+ let mut new_required_columns = HashSet::new();
Review Comment:
Since this is shadowing a variable in the outer scope, I wonder if it would
make sense to comment why we are starting with an empty set in this case rather
than the pushed down columns?
##########
datafusion/core/tests/sql/explain_analyze.rs:
##########
@@ -744,7 +744,7 @@ async fn test_physical_plan_display_indent_multi_children()
{
" RepartitionExec: partitioning=Hash([Column { name: \"c2\",
index: 0 }], 9000)",
" ProjectionExec: expr=[c1@0 as c2]",
" RepartitionExec: partitioning=RoundRobinBatch(9000)",
- " CsvExec: files={1 group:
[[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, has_header=true, limit=None,
projection=[c1, c2]",
+ " CsvExec: files={1 group:
[[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, has_header=true, limit=None,
projection=[c1]",
Review Comment:
👍
##########
datafusion/core/tests/sql/predicates.rs:
##########
@@ -589,19 +588,15 @@ async fn multiple_or_predicates() -> Result<()> {
// factored out and appear only once in the following plan
let expected = vec![
"Explain [plan_type:Utf8, plan:Utf8]",
- " Projection: #lineitem.l_partkey [l_partkey:Int64]",
- " Projection: #part.p_size >= Int32(1) AS #part.p_size >=
Int32(1)Int32(1)#part.p_size, #lineitem.l_partkey, #lineitem.l_quantity,
#part.p_brand, #part.p_size [#part.p_size >=
Int32(1)Int32(1)#part.p_size:Boolean;N, l_partkey:Int64,
l_quantity:Decimal128(15, 2), p_brand:Utf8, p_size:Int32]",
- " Filter: #part.p_brand = Utf8(\"Brand#12\") AND
#lineitem.l_quantity >= Decimal128(Some(100),15,2) AND #lineitem.l_quantity <=
Decimal128(Some(1100),15,2) AND #part.p_size <= Int32(5) OR #part.p_brand =
Utf8(\"Brand#23\") AND #lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND
#lineitem.l_quantity <= Decimal128(Some(2000),15,2) AND #part.p_size <=
Int32(10) OR #part.p_brand = Utf8(\"Brand#34\") AND #lineitem.l_quantity >=
Decimal128(Some(2000),15,2) AND #lineitem.l_quantity <=
Decimal128(Some(3000),15,2) AND #part.p_size <= Int32(15) [l_partkey:Int64,
l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8, p_size:Int32]",
- " Inner Join: #lineitem.l_partkey = #part.p_partkey
[l_partkey:Int64, l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8,
p_size:Int32]",
- " Filter: #lineitem.l_quantity >= Decimal128(Some(100),15,2)
AND #lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR #lineitem.l_quantity
>= Decimal128(Some(1000),15,2) AND #lineitem.l_quantity <=
Decimal128(Some(2000),15,2) OR #lineitem.l_quantity >=
Decimal128(Some(2000),15,2) AND #lineitem.l_quantity <=
Decimal128(Some(3000),15,2) [l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
- " TableScan: lineitem projection=[l_partkey, l_quantity],
partial_filters=[#lineitem.l_quantity >= Decimal128(Some(100),15,2) AND
#lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR #lineitem.l_quantity >=
Decimal128(Some(1000),15,2) AND #lineitem.l_quantity <=
Decimal128(Some(2000),15,2) OR #lineitem.l_quantity >=
Decimal128(Some(2000),15,2) AND #lineitem.l_quantity <=
Decimal128(Some(3000),15,2)] [l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
- " Filter: #part.p_size >= Int32(1) AND #part.p_brand =
Utf8(\"Brand#12\") AND #part.p_size <= Int32(5) OR #part.p_brand =
Utf8(\"Brand#23\") AND #part.p_size <= Int32(10) OR #part.p_brand =
Utf8(\"Brand#34\") AND #part.p_size <= Int32(15) [p_partkey:Int64,
p_brand:Utf8, p_size:Int32]",
- " TableScan: part projection=[p_partkey, p_brand, p_size],
partial_filters=[#part.p_size >= Int32(1), #part.p_brand = Utf8(\"Brand#12\")
AND #part.p_size <= Int32(5) OR #part.p_brand = Utf8(\"Brand#23\") AND
#part.p_size <= Int32(10) OR #part.p_brand = Utf8(\"Brand#34\") AND
#part.p_size <= Int32(15)] [p_partkey:Int64, p_brand:Utf8, p_size:Int32]",
" Projection: lineitem.l_partkey [l_partkey:Int64]",
- " Filter: part.p_brand = Utf8(\"Brand#12\") AND lineitem.l_quantity
>= Decimal128(Some(100),15,2) AND lineitem.l_quantity <=
Decimal128(Some(1100),15,2) AND CAST(part.p_size AS Int64) BETWEEN Int64(1) AND
Int64(5) OR part.p_brand = Utf8(\"Brand#23\") AND lineitem.l_quantity >=
Decimal128(Some(1000),15,2) AND lineitem.l_quantity <=
Decimal128(Some(2000),15,2) AND CAST(part.p_size AS Int64) BETWEEN Int64(1) AND
Int64(10) OR part.p_brand = Utf8(\"Brand#34\") AND lineitem.l_quantity >=
Decimal128(Some(2000),15,2) AND lineitem.l_quantity <=
Decimal128(Some(3000),15,2) AND CAST(part.p_size AS Int64) BETWEEN Int64(1) AND
Int64(15) [l_partkey:Int64, l_quantity:Decimal128(15, 2), p_partkey:Int64,
p_brand:Utf8, p_size:Int32]",
+ " Filter: part.p_brand = Utf8(\"Brand#12\") AND lineitem.l_quantity
>= Decimal128(Some(100),15,2) AND lineitem.l_quantity <=
Decimal128(Some(1100),15,2) AND part.p_size <= Int32(5) OR part.p_brand =
Utf8(\"Brand#23\") AND lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND
lineitem.l_quantity <= Decimal128(Some(2000),15,2) AND part.p_size <= Int32(10)
OR part.p_brand = Utf8(\"Brand#34\") AND lineitem.l_quantity >=
Decimal128(Some(2000),15,2) AND lineitem.l_quantity <=
Decimal128(Some(3000),15,2) AND part.p_size <= Int32(15) [l_partkey:Int64,
l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8, p_size:Int32]",
" Inner Join: lineitem.l_partkey = part.p_partkey
[l_partkey:Int64, l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8,
p_size:Int32]",
- " TableScan: lineitem projection=[l_partkey, l_quantity]
[l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
- " TableScan: part projection=[p_partkey, p_brand, p_size]
[p_partkey:Int64, p_brand:Utf8, p_size:Int32]",
+ " Projection: lineitem.l_partkey, lineitem.l_quantity
[l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
+ " Filter: (lineitem.l_quantity >=
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity AND
lineitem.l_quantity <=
Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity OR
lineitem.l_quantity >=
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantity AND
lineitem.l_quantity <=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity OR
lineitem.l_quantity >=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity AND
lineitem.l_quantity <=
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity) AND
(lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity >=
Decimal128(Some(1000),15,2)lineitem.l_quantity >=
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
>= Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity OR
lineitem.l_quantity >= Decimal128(Some(2000),15,2)Decimal128(Some(20
00),15,2)lineitem.l_quantity) AND (lineitem.l_quantity >=
Decimal128(Some(100),15,2) OR lineitem.l_quantity >=
Decimal128(Some(1000),15,2)lineitem.l_quantity >=
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
>= Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity OR
lineitem.l_quantity <=
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity) AND
(lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity <=
Decimal128(Some(2000),15,2)lineitem.l_quantity <=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity
>= Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity OR
lineitem.l_quantity >=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity) AND
(lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity <=
Decimal128(Some(2000),15,2)lineitem.l_quantity <=
Decimal128(Some(2000),15,2)Decimal12
8(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity >=
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity OR
lineitem.l_quantity <=
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity) AND
(lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >=
Decimal128(Some(1000),15,2)lineitem.l_quantity >=
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
<= Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity
OR lineitem.l_quantity >=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity) AND
(lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >=
Decimal128(Some(1000),15,2)lineitem.l_quantity >=
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
<= Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity
OR lineitem.l_quantity <= Decimal128(Some(3000),15,2)Decima
l128(Some(3000),15,2)lineitem.l_quantity) AND (lineitem.l_quantity <=
Decimal128(Some(1100),15,2) OR lineitem.l_quantity <=
Decimal128(Some(2000),15,2)lineitem.l_quantity <=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity
<= Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity
OR lineitem.l_quantity >=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity) AND
(lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity <=
Decimal128(Some(2000),15,2)lineitem.l_quantity <=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity
<= Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity
OR lineitem.l_quantity <=
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity)
[lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity <=
Decimal128(Some(2000),15,2)lineitem.l_quantity <= Decimal128(Some(2000)
,15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineitem.l_quantity <=
Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity:Boolean;N,
lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >=
Decimal128(Some(1000),15,2)lineitem.l_quantity >=
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
<=
Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity:Boolean;N,
lineitem.l_quantity <=
Decimal128(Some(1100),15,2)Decimal128(Some(1100),15,2)lineitem.l_quantity:Boolean;N,
lineitem.l_quantity <=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity:Boolean;N,
lineitem.l_quantity <=
Decimal128(Some(3000),15,2)Decimal128(Some(3000),15,2)lineitem.l_quantity:Boolean;N,
lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity <=
Decimal128(Some(2000),15,2)lineitem.l_quantity <=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantitylineit
em.l_quantity >=
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity:Boolean;N,
lineitem.l_quantity >= Decimal128(Some(100),15,2) OR lineitem.l_quantity >=
Decimal128(Some(1000),15,2)lineitem.l_quantity >=
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantitylineitem.l_quantity
>=
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity:Boolean;N,
lineitem.l_quantity >=
Decimal128(Some(100),15,2)Decimal128(Some(100),15,2)lineitem.l_quantity:Boolean;N,
lineitem.l_quantity >=
Decimal128(Some(1000),15,2)Decimal128(Some(1000),15,2)lineitem.l_quantity:Boolean;N,
lineitem.l_quantity >=
Decimal128(Some(2000),15,2)Decimal128(Some(2000),15,2)lineitem.l_quantity:Boolean;N,
l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
Review Comment:
this filter looks pretty nasty -- perhaps it is related to
https://github.com/apache/arrow-datafusion/pull/3903 🤔
On the other hand, there is a lot of pushed down filtering now 👍
cc @Ted-Jiang
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]