[ https://issues.apache.org/jira/browse/DRILL-6219?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Anton Gozhiy updated DRILL-6219: -------------------------------- Description: *Data set:* The data is generated using the attached file: *DRILL_6118_data_source.csv* Data gen commands: {code:sql} create table dfs.tmp.`DRILL_6118_parquet_partitioned_by_folders/d1` (c1, c2, c3, c4, c5) as select cast(columns[0] as int) c1, columns[1] c2, columns[2] c3, columns[3] c4, columns[4] c5 from dfs.tmp.`DRILL_6118_data_source.csv` where columns[0] in (1, 3); create table dfs.tmp.`DRILL_6118_parquet_partitioned_by_folders/d2` (c1, c2, c3, c4, c5) as select cast(columns[0] as int) c1, columns[1] c2, columns[2] c3, columns[3] c4, columns[4] c5 from dfs.tmp.`DRILL_6118_data_source.csv` where columns[0]=2; create table dfs.tmp.`DRILL_6118_parquet_partitioned_by_folders/d3` (c1, c2, c3, c4, c5) as select cast(columns[0] as int) c1, columns[1] c2, columns[2] c3, columns[3] c4, columns[4] c5 from dfs.tmp.`DRILL_6118_data_source.csv` where columns[0]>3; {code} *Steps:* # Execute the following query: {code:sql} select * from (select * from dfs.drillTestDir.`DRILL_6118_parquet_partitioned_by_folders` where c1>2) where c1>3{code} *Expected result:* Filters "c1>3" and "c1>2" should both be pushed down so that only the data from the folder "d3" is scanned. 
*Actual result:* The data from the folders "d1" and "d3" are being scanned because only the filter "c1>2" is pushed down *Physical plan:* {noformat} 00-00 Screen : rowType = RecordType(DYNAMIC_STAR **): rowcount = 10.0, cumulative cost = {201.0 rows, 581.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105545 00-01 Project(**=[$0]) : rowType = RecordType(DYNAMIC_STAR **): rowcount = 10.0, cumulative cost = {200.0 rows, 580.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105544 00-02 SelectionVectorRemover : rowType = RecordType(DYNAMIC_STAR T25¦¦**): rowcount = 10.0, cumulative cost = {190.0 rows, 570.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105543 00-03 Filter(condition=[>(ITEM($0, 'c1'), 3)]) : rowType = RecordType(DYNAMIC_STAR T25¦¦**): rowcount = 10.0, cumulative cost = {180.0 rows, 560.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105542 00-04 Project(T25¦¦**=[$0]) : rowType = RecordType(DYNAMIC_STAR T25¦¦**): rowcount = 20.0, cumulative cost = {160.0 rows, 440.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105541 00-05 SelectionVectorRemover : rowType = RecordType(DYNAMIC_STAR T25¦¦**, ANY c1): rowcount = 20.0, cumulative cost = {140.0 rows, 420.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105540 00-06 Filter(condition=[>($1, 2)]) : rowType = RecordType(DYNAMIC_STAR T25¦¦**, ANY c1): rowcount = 20.0, cumulative cost = {120.0 rows, 400.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105539 00-07 Project(T25¦¦**=[$0], c1=[$1]) : rowType = RecordType(DYNAMIC_STAR T25¦¦**, ANY c1): rowcount = 40.0, cumulative cost = {80.0 rows, 160.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105538 00-08 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=/drill/testdata/DRILL_6118_parquet_partitioned_by_folders/d1/0_0_0.parquet], ReadEntryWithPath [path=/drill/testdata/DRILL_6118_parquet_partitioned_by_folders/d3/0_0_0.parquet]], selectionRoot=maprfs:/drill/testdata/DRILL_6118_parquet_partitioned_by_folders, numFiles=2, numRowGroups=2, usedMetadataFile=false, 
columns=[`**`]]]) : rowType = RecordType(DYNAMIC_STAR **, ANY c1): rowcount = 40.0, cumulative cost = {40.0 rows, 80.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105537 {noformat} was: *Data set:* The data is generated used the attached file: *DRILL_6118_data_source.csv* Data gen commands: {code:sql} create table dfs.tmp.`DRILL_6118_parquet_partitioned_by_folders/d1` (c1, c2, c3, c4, c5) as select cast(columns[0] as int) c1, columns[1] c2, columns[2] c3, columns[3] c4, columns[4] c5 from dfs.tmp.`DRILL_6118_data_source.csv` where columns[0] in (1, 3); create table dfs.tmp.`DRILL_6118_parquet_partitioned_by_folders/d2` (c1, c2, c3, c4, c5) as select cast(columns[0] as int) c1, columns[1] c2, columns[2] c3, columns[3] c4, columns[4] c5 from dfs.tmp.`DRILL_6118_data_source.csv` where columns[0]=2; create table dfs.tmp.`DRILL_6118_parquet_partitioned_by_folders/d3` (c1, c2, c3, c4, c5) as select cast(columns[0] as int) c1, columns[1] c2, columns[2] c3, columns[3] c4, columns[4] c5 from dfs.tmp.`DRILL_6118_data_source.csv` where columns[0]>3; {code} *Steps:* # Execute the following query: {code:sql} select * from (select * from dfs.drillTestDir.`DRILL_6118_parquet_partitioned_by_folders` where c1>2) where c1>3{code} *Expected result:* Filrers "c1>3" and "c1>2" should both be pushed down so only the data from the folder "d3" should be scanned. 
*Actual result:* The data from the folders "d1" and "d3" are being scanned so as only filter "c1>2" is pushed down *Physical plan:* {code} 00-00 Screen : rowType = RecordType(DYNAMIC_STAR **): rowcount = 10.0, cumulative cost = {201.0 rows, 581.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105545 00-01 Project(**=[$0]) : rowType = RecordType(DYNAMIC_STAR **): rowcount = 10.0, cumulative cost = {200.0 rows, 580.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105544 00-02 SelectionVectorRemover : rowType = RecordType(DYNAMIC_STAR T25¦¦**): rowcount = 10.0, cumulative cost = {190.0 rows, 570.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105543 00-03 Filter(condition=[>(ITEM($0, 'c1'), 3)]) : rowType = RecordType(DYNAMIC_STAR T25¦¦**): rowcount = 10.0, cumulative cost = {180.0 rows, 560.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105542 00-04 Project(T25¦¦**=[$0]) : rowType = RecordType(DYNAMIC_STAR T25¦¦**): rowcount = 20.0, cumulative cost = {160.0 rows, 440.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105541 00-05 SelectionVectorRemover : rowType = RecordType(DYNAMIC_STAR T25¦¦**, ANY c1): rowcount = 20.0, cumulative cost = {140.0 rows, 420.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105540 00-06 Filter(condition=[>($1, 2)]) : rowType = RecordType(DYNAMIC_STAR T25¦¦**, ANY c1): rowcount = 20.0, cumulative cost = {120.0 rows, 400.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105539 00-07 Project(T25¦¦**=[$0], c1=[$1]) : rowType = RecordType(DYNAMIC_STAR T25¦¦**, ANY c1): rowcount = 40.0, cumulative cost = {80.0 rows, 160.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105538 00-08 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=/drill/testdata/DRILL_6118_parquet_partitioned_by_folders/d1/0_0_0.parquet], ReadEntryWithPath [path=/drill/testdata/DRILL_6118_parquet_partitioned_by_folders/d3/0_0_0.parquet]], selectionRoot=maprfs:/drill/testdata/DRILL_6118_parquet_partitioned_by_folders, numFiles=2, numRowGroups=2, usedMetadataFile=false, 
columns=[`**`]]]) : rowType = RecordType(DYNAMIC_STAR **, ANY c1): rowcount = 40.0, cumulative cost = {40.0 rows, 80.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105537 {code} > Filter pushdown doesn't work with star operator if there is a subquery with > its own filter > ------------------------------------------------------------------------------------------- > > Key: DRILL-6219 > URL: https://issues.apache.org/jira/browse/DRILL-6219 > Project: Apache Drill > Issue Type: Bug > Affects Versions: 1.13.0 > Reporter: Anton Gozhiy > Priority: Major > > *Data set:* > The data is generated using the attached file: *DRILL_6118_data_source.csv* > Data gen commands: > {code:sql} > create table dfs.tmp.`DRILL_6118_parquet_partitioned_by_folders/d1` (c1, c2, > c3, c4, c5) as select cast(columns[0] as int) c1, columns[1] c2, columns[2] > c3, columns[3] c4, columns[4] c5 from dfs.tmp.`DRILL_6118_data_source.csv` > where columns[0] in (1, 3); > create table dfs.tmp.`DRILL_6118_parquet_partitioned_by_folders/d2` (c1, c2, > c3, c4, c5) as select cast(columns[0] as int) c1, columns[1] c2, columns[2] > c3, columns[3] c4, columns[4] c5 from dfs.tmp.`DRILL_6118_data_source.csv` > where columns[0]=2; > create table dfs.tmp.`DRILL_6118_parquet_partitioned_by_folders/d3` (c1, c2, > c3, c4, c5) as select cast(columns[0] as int) c1, columns[1] c2, columns[2] > c3, columns[3] c4, columns[4] c5 from dfs.tmp.`DRILL_6118_data_source.csv` > where columns[0]>3; > {code} > *Steps:* > # Execute the following query: > {code:sql} > select * from (select * from > dfs.drillTestDir.`DRILL_6118_parquet_partitioned_by_folders` where c1>2) > where c1>3{code} > *Expected result:* > Filters "c1>3" and "c1>2" should both be pushed down so only the data from > the folder "d3" should be scanned. 
> *Actual result:* > The data from the folders "d1" and "d3" are being scanned so as only filter > "c1>2" is pushed down > *Physical plan:* > {noformat} > 00-00 Screen : rowType = RecordType(DYNAMIC_STAR **): rowcount = 10.0, > cumulative cost = {201.0 rows, 581.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, > id = 105545 > 00-01 Project(**=[$0]) : rowType = RecordType(DYNAMIC_STAR **): rowcount > = 10.0, cumulative cost = {200.0 rows, 580.0 cpu, 0.0 io, 0.0 network, 0.0 > memory}, id = 105544 > 00-02 SelectionVectorRemover : rowType = RecordType(DYNAMIC_STAR > T25¦¦**): rowcount = 10.0, cumulative cost = {190.0 rows, 570.0 cpu, 0.0 io, > 0.0 network, 0.0 memory}, id = 105543 > 00-03 Filter(condition=[>(ITEM($0, 'c1'), 3)]) : rowType = > RecordType(DYNAMIC_STAR T25¦¦**): rowcount = 10.0, cumulative cost = {180.0 > rows, 560.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105542 > 00-04 Project(T25¦¦**=[$0]) : rowType = RecordType(DYNAMIC_STAR > T25¦¦**): rowcount = 20.0, cumulative cost = {160.0 rows, 440.0 cpu, 0.0 io, > 0.0 network, 0.0 memory}, id = 105541 > 00-05 SelectionVectorRemover : rowType = RecordType(DYNAMIC_STAR > T25¦¦**, ANY c1): rowcount = 20.0, cumulative cost = {140.0 rows, 420.0 cpu, > 0.0 io, 0.0 network, 0.0 memory}, id = 105540 > 00-06 Filter(condition=[>($1, 2)]) : rowType = > RecordType(DYNAMIC_STAR T25¦¦**, ANY c1): rowcount = 20.0, cumulative cost = > {120.0 rows, 400.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105539 > 00-07 Project(T25¦¦**=[$0], c1=[$1]) : rowType = > RecordType(DYNAMIC_STAR T25¦¦**, ANY c1): rowcount = 40.0, cumulative cost = > {80.0 rows, 160.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105538 > 00-08 Scan(groupscan=[ParquetGroupScan > [entries=[ReadEntryWithPath > [path=/drill/testdata/DRILL_6118_parquet_partitioned_by_folders/d1/0_0_0.parquet], > ReadEntryWithPath > [path=/drill/testdata/DRILL_6118_parquet_partitioned_by_folders/d3/0_0_0.parquet]], > > 
selectionRoot=maprfs:/drill/testdata/DRILL_6118_parquet_partitioned_by_folders, > numFiles=2, numRowGroups=2, usedMetadataFile=false, columns=[`**`]]]) : > rowType = RecordType(DYNAMIC_STAR **, ANY c1): rowcount = 40.0, cumulative > cost = {40.0 rows, 80.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 105537 > {noformat} -- This message was sent by Atlassian JIRA (v7.6.3#76005)