Repository: hive Updated Branches: refs/heads/master d90d63859 -> f9141094e
HIVE-15369: Extend column pruner to account for residual filter expression in Join operator (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/f9141094 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/f9141094 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/f9141094 Branch: refs/heads/master Commit: f9141094e10d758ac9189907bf1318866c2fb8f2 Parents: d90d638 Author: Jesus Camacho Rodriguez <[email protected]> Authored: Tue Dec 6 16:00:26 2016 +0000 Committer: Jesus Camacho Rodriguez <[email protected]> Committed: Tue Dec 6 17:47:07 2016 +0000 ---------------------------------------------------------------------- .../ql/optimizer/ColumnPrunerProcFactory.java | 16 ++- ql/src/test/queries/clientpositive/join46.q | 12 ++ ql/src/test/results/clientpositive/join46.q.out | 98 ++++++++++++++++ .../results/clientpositive/llap/join46.q.out | 112 +++++++++++++++++++ 4 files changed, 234 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/f9141094/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java index 7681a83..00ec03e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java @@ -1083,11 +1083,12 @@ public final class ColumnPrunerProcFactory { LOG.info("JOIN " + op.getIdentifier() + " oldExprs: " + conf.getExprs()); - List<FieldNode> childColLists = cppCtx.genColLists(op); - if (childColLists == null) { + if (cppCtx.genColLists(op) == null) { return; } + List<FieldNode> neededColList = new ArrayList<>(cppCtx.genColLists(op)); + Map<Byte, List<FieldNode>> prunedColLists = new HashMap<>(); for (byte tag : conf.getTagOrder()) { prunedColLists.put(tag, new ArrayList<FieldNode>()); @@ -1107,6 +1108,13 @@ public final class ColumnPrunerProcFactory { } } + //add the columns in residual filters + if (conf.getResidualFilterExprs() != null) { + for (ExprNodeDesc desc : conf.getResidualFilterExprs()) { + neededColList = mergeFieldNodesWithDesc(neededColList, desc); + } + } + RowSchema joinRS = op.getSchema(); ArrayList<String> outputCols = new ArrayList<String>(); ArrayList<ColumnInfo> rs = new ArrayList<ColumnInfo>(); @@ -1116,7 +1124,7 @@ public final class ColumnPrunerProcFactory { String internalName = conf.getOutputColumnNames().get(i); ExprNodeDesc desc = columnExprMap.get(internalName); Byte tag = conf.getReversedExprs().get(internalName); - if (lookupColumn(childColLists, internalName) == null) { + if (lookupColumn(neededColList, internalName) == null) { int index = conf.getExprs().get(tag).indexOf(desc); if (index < 0) { continue; @@ -1174,7 +1182,7 @@ public final class ColumnPrunerProcFactory { for (Operator<? extends OperatorDesc> child : childOperators) { if (child instanceof ReduceSinkOperator) { - boolean[] flags = getPruneReduceSinkOpRetainFlags(toColumnNames(childColLists), + boolean[] flags = getPruneReduceSinkOpRetainFlags(toColumnNames(neededColList), (ReduceSinkOperator) child); pruneReduceSinkOperator(flags, (ReduceSinkOperator) child, cppCtx); } http://git-wip-us.apache.org/repos/asf/hive/blob/f9141094/ql/src/test/queries/clientpositive/join46.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/join46.q b/ql/src/test/queries/clientpositive/join46.q index e9e9850..a661c0f 100644 --- a/ql/src/test/queries/clientpositive/join46.q +++ b/ql/src/test/queries/clientpositive/join46.q @@ -118,6 +118,18 @@ ON (test1.value=test2.value AND (test1.key between 100 and 102 OR test2.key between 100 and 102)); +-- Complex condition, projecting columns +EXPLAIN +SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key); + +SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key); + -- Disjunction with pred on multiple inputs and single inputs (right outer join) EXPLAIN SELECT * http://git-wip-us.apache.org/repos/asf/hive/blob/f9141094/ql/src/test/results/clientpositive/join46.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/join46.q.out b/ql/src/test/results/clientpositive/join46.q.out index 90f0620..21ae29c 100644 --- a/ql/src/test/results/clientpositive/join46.q.out +++ b/ql/src/test/results/clientpositive/join46.q.out @@ -912,6 +912,104 @@ NULL NULL None NULL NULL NULL 101 2 Car 102 2 Del 99 2 Mat 102 2 Del Warning: Shuffle Join JOIN[6][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: -- Complex condition, projecting columns +EXPLAIN +SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key) +PREHOOK: type: QUERY +POSTHOOK: query: -- Complex condition, projecting columns +EXPLAIN +SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: test1 + Statistics: Num rows: 6 Data size: 56 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: int), col_1 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 56 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 6 Data size: 56 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: string) + TableScan + alias: test2 + Statistics: Num rows: 4 Data size: 38 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: int), col_2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 38 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 4 Data size: 38 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + residual filter predicates: {((_col1 = _col4) or (_col0 = _col3))} + Statistics: Num rows: 24 Data size: 476 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: string), _col5 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 24 Data size: 476 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 24 Data size: 476 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join JOIN[6][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key) +PREHOOK: type: QUERY +PREHOOK: Input: default@test1 +PREHOOK: Input: default@test2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test1 +POSTHOOK: Input: default@test2 +#### A masked pattern was here #### +Car Ema +Car Del +Bob NULL +Mat Ema +Mat Del +Alice NULL +None NULL +None NULL +Warning: Shuffle Join JOIN[6][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product PREHOOK: query: -- Disjunction with pred on multiple inputs and single inputs (right outer join) EXPLAIN SELECT * http://git-wip-us.apache.org/repos/asf/hive/blob/f9141094/ql/src/test/results/clientpositive/llap/join46.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/join46.q.out b/ql/src/test/results/clientpositive/llap/join46.q.out index fe1cf37..741f0e3 100644 --- a/ql/src/test/results/clientpositive/llap/join46.q.out +++ b/ql/src/test/results/clientpositive/llap/join46.q.out @@ -1038,6 +1038,118 @@ NULL NULL None NULL NULL NULL 101 2 Car 102 2 Del 101 2 Car 103 2 Ema Warning: Shuffle Join MERGEJOIN[9][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: -- Complex condition, projecting columns +EXPLAIN +SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key) +PREHOOK: type: QUERY +POSTHOOK: query: -- Complex condition, projecting columns +EXPLAIN +SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: test1 + Statistics: Num rows: 6 Data size: 56 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: int), col_1 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 56 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 6 Data size: 56 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: string) + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: test2 + Statistics: Num rows: 4 Data size: 38 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: int), col_2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 38 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 4 Data size: 38 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Outer Join0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + residual filter predicates: {((_col1 = _col4) or (_col0 = _col3))} + Statistics: Num rows: 24 Data size: 476 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: string), _col5 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 24 Data size: 476 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 24 Data size: 476 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[9][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key) +PREHOOK: type: QUERY +PREHOOK: Input: default@test1 +PREHOOK: Input: default@test2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT col_1, col_2 +FROM test1 LEFT OUTER JOIN test2 +ON (test1.value=test2.value + OR test1.key=test2.key) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test1 +POSTHOOK: Input: default@test2 +#### A masked pattern was here #### +None NULL +None NULL +Alice NULL +Mat Del +Mat Ema +Bob NULL +Car Del +Car Ema +Warning: Shuffle Join MERGEJOIN[9][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product PREHOOK: query: -- Disjunction with pred on multiple inputs and single inputs (right outer join) EXPLAIN SELECT *
