[ https://issues.apache.org/jira/browse/DRILL-6118?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16352839#comment-16352839 ]
ASF GitHub Bot commented on DRILL-6118: --------------------------------------- Github user chunhui-shi commented on a diff in the pull request: https://github.com/apache/drill/pull/1104#discussion_r166088073 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillFilterItemStarReWriterRule.java --- @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.planner.logical; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import org.apache.calcite.adapter.enumerable.EnumerableTableScan; +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelOptRuleOperand; +import org.apache.calcite.plan.RelOptTable; +import org.apache.calcite.prepare.RelOptTableImpl; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.CorrelationId; +import org.apache.calcite.rel.core.Filter; +import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.core.TableScan; +import org.apache.calcite.rel.logical.LogicalFilter; +import org.apache.calcite.rel.logical.LogicalProject; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rel.type.RelDataTypeFactory; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexVisitorImpl; +import org.apache.calcite.schema.Table; +import org.apache.drill.exec.planner.types.RelDataTypeDrillImpl; +import org.apache.drill.exec.planner.types.RelDataTypeHolder; +import org.apache.drill.exec.util.Utilities; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.drill.exec.planner.logical.FieldsReWriterUtil.DesiredField; +import static org.apache.drill.exec.planner.logical.FieldsReWriterUtil.FieldsReWriter; + +/** + * Rule will transform filter -> project -> scan call with item star fields in filter + * into project -> filter -> project -> scan where item star fields are pushed into scan + * and replaced with actual field references. + * + * This will help partition pruning and push down rules to detect fields that can be pruned or push downed. 
+ * Item star operator appears when sub-select or cte with star are used as source. + */ +public class DrillFilterItemStarReWriterRule extends RelOptRule { + + public static final DrillFilterItemStarReWriterRule INSTANCE = new DrillFilterItemStarReWriterRule( + RelOptHelper.some(Filter.class, RelOptHelper.some(Project.class, RelOptHelper.any( TableScan.class))), + "DrillFilterItemStarReWriterRule"); + + private DrillFilterItemStarReWriterRule(RelOptRuleOperand operand, String id) { + super(operand, id); + } + + @Override + public void onMatch(RelOptRuleCall call) { + Filter filterRel = call.rel(0); + Project projectRel = call.rel(1); + TableScan scanRel = call.rel(2); + + ItemStarFieldsVisitor itemStarFieldsVisitor = new ItemStarFieldsVisitor(filterRel.getRowType().getFieldNames()); --- End diff -- To make sure this works for complex expressions, could you add tests similar to testPushProjectIntoScan to test some expressions that include more than two fields in expressions in project(e.g. "select a+b") and in filter(e.g. "where a+b < 5")? > Handle item star columns during project / filter push down and directory > pruning > ---------------------------------------------------------------------------------- > > Key: DRILL-6118 > URL: https://issues.apache.org/jira/browse/DRILL-6118 > Project: Apache Drill > Issue Type: Improvement > Affects Versions: 1.12.0 > Reporter: Arina Ielchiieva > Assignee: Arina Ielchiieva > Priority: Major > Labels: doc-impacting > Fix For: 1.13.0 > > > Project push down, filter push down and partition pruning does not work with > dynamically expanded column which is represented as star in ITEM operator: > _ITEM($0, 'column_name')_ where $0 is a star. > This often occurs when view, sub-select or cte with star is issued. > To solve this issue we can create {{DrillFilterItemStarReWriterRule}} which > will rewrite such ITEM operator before filter push down and directory > pruning. 
For project into scan push down logic will be handled separately in > already existing rule {{DrillPushProjectIntoScanRule}}. Basically, we can > consider the following queries the same: > {{select col1 from t}} > {{select col1 from (select * from t)}} > *Use cases* > Since item star columns were not considered during project / filter push > down and directory pruning, push down and pruning did not happen. This was > causing Drill to read all columns from file (when only several are needed) or > read all files instead. Views with star query is the most common example. > Such behavior significantly degrades performance for item star queries > compared to queries without item star. > *EXAMPLES* > *Data set* > will create table with three files each in dedicated sub-folder: > {noformat} > use dfs.tmp; > create table `order_ctas/t1` as select cast(o_orderdate as date) as > o_orderdate from cp.`tpch/orders.parquet` where o_orderdate between date > '1992-01-01' and date '1992-01-03'; > create table `order_ctas/t2` as select cast(o_orderdate as date) as > o_orderdate from cp.`tpch/orders.parquet` where o_orderdate between date > '1992-01-04' and date '1992-01-06'; > create table `order_ctas/t3` as select cast(o_orderdate as date) as > o_orderdate from cp.`tpch/orders.parquet` where o_orderdate between date > '1992-01-07' and date '1992-01-09'; > {noformat} > *Filter push down* > {{select * from order_ctas where o_orderdate = date '1992-01-01'}} will read > only one file > {noformat} > 00-00 Screen > 00-01 Project(**=[$0]) > 00-02 Project(T1¦¦**=[$0]) > 00-03 SelectionVectorRemover > 00-04 Filter(condition=[=($1, 1992-01-01)]) > 00-05 Project(T1¦¦**=[$0], o_orderdate=[$1]) > 00-06 Scan(groupscan=[ParquetGroupScan > [entries=[ReadEntryWithPath [path=/tmp/order_ctas/t1/0_0_0.parquet]], > selectionRoot=/tmp/order_ctas, numFiles=1, numRowGroups=1, > usedMetadataFile=false, columns=[`**`]]]) > {noformat} > {{select * from (select * from order_ctas) where o_orderdate = date > 
'1992-01-01'}} will read all three files > {noformat} > 00-00 Screen > 00-01 Project(**=[$0]) > 00-02 SelectionVectorRemover > 00-03 Filter(condition=[=(ITEM($0, 'o_orderdate'), 1992-01-01)]) > 00-04 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath > [path=/tmp/order_ctas/t1/0_0_0.parquet], ReadEntryWithPath > [path=/tmp/order_ctas/t2/0_0_0.parquet], ReadEntryWithPath > [path=/tmp/order_ctas/t3/0_0_0.parquet]], selectionRoot=/tmp/order_ctas, > numFiles=3, numRowGroups=3, usedMetadataFile=false, columns=[`**`]]]) > {noformat} > *Directory pruning* > {{select * from order_ctas where dir0 = 't1'}} will read data only from one > folder > {noformat} > 00-00 Screen > 00-01 Project(**=[$0]) > 00-02 Project(**=[$0]) > 00-03 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath > [path=/tmp/order_ctas/t1/0_0_0.parquet]], selectionRoot=/tmp/order_ctas, > numFiles=1, numRowGroups=1, usedMetadataFile=false, columns=[`**`]]]) > {noformat} > {{select * from (select * from order_ctas) where dir0 = 't1'}} will read > content of all three folders > {noformat} > 00-00 Screen > 00-01 Project(**=[$0]) > 00-02 SelectionVectorRemover > 00-03 Filter(condition=[=(ITEM($0, 'dir0'), 't1')]) > 00-04 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath > [path=/tmp/order_ctas/t1/0_0_0.parquet], ReadEntryWithPath > [path=/tmp/order_ctas/t2/0_0_0.parquet], ReadEntryWithPath > [path=/tmp/order_ctas/t3/0_0_0.parquet]], selectionRoot=/tmp/order_ctas, > numFiles=3, numRowGroups=3, usedMetadataFile=false, columns=[`**`]]]) > {noformat} > *Project into Scan push down* > {{select o_orderdate, count(1) from order_ctas group by o_orderdate}} will > read only one column from the files > {noformat} > 00-00 Screen > 00-01 Project(o_orderdate=[$0], EXPR$1=[$1]) > 00-02 HashAgg(group=[{0}], EXPR$1=[COUNT()]) > 00-03 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath > [path=/tmp/order_ctas/t1/0_0_0.parquet], ReadEntryWithPath > [path=/tmp/order_ctas/t2/0_0_0.parquet], 
ReadEntryWithPath > [path=/tmp/order_ctas/t3/0_0_0.parquet]], selectionRoot=/tmp/order_ctas, > numFiles=3, numRowGroups=3, usedMetadataFile=false, columns=[`o_orderdate`]]]) > {noformat} > {{select o_orderdate, count(1) from (select * from order_ctas) group by > o_orderdate}} will read all columns from the files > {noformat} > 00-00 Screen > 00-01 Project(col_vrchr=[$0], EXPR$1=[$1]) > 00-02 StreamAgg(group=[{0}], EXPR$1=[COUNT()]) > 00-03 Sort(sort0=[$0], dir0=[ASC]) > 00-04 Project(col_vrchr=[ITEM($0, 'o_orderdate')]) > 00-05 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath > [path=/tmp/order_ctas/t1/0_0_0.parquet], ReadEntryWithPath > [path=/tmp/order_ctas/t2/0_0_0.parquet], ReadEntryWithPath > [path=/tmp/order_ctas/t3/0_0_0.parquet]], selectionRoot=/tmp/order_ctas, > numFiles=3, numRowGroups=3, usedMetadataFile=false, columns=[`**`]]]) > {noformat} > This Jira aims to fix all three described cases above in order to improve > performance for queries with item star columns. > -- This message was sent by Atlassian JIRA (v7.6.3#76005)