Repository: hive
Updated Branches:
  refs/heads/master ad1552745 -> df5c56bd6
HIVE-15680: Incorrect results when hive.optimize.index.filter=true and same ORC table is referenced twice in query (Anthony Hsu, Prasanth Jayachandran reviewed by Thejas Nair) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/df5c56bd Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/df5c56bd Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/df5c56bd Branch: refs/heads/master Commit: df5c56bd66c64200f675ce61058a0813f6b1f66f Parents: ad15527 Author: Prasanth Jayachandran <prasan...@apache.org> Authored: Fri Jan 26 15:58:07 2018 -0800 Committer: Prasanth Jayachandran <prasan...@apache.org> Committed: Fri Jan 26 15:58:07 2018 -0800 ---------------------------------------------------------------------- .../apache/hadoop/hive/ql/exec/FetchTask.java | 2 +- .../hadoop/hive/ql/exec/SMBMapJoinOperator.java | 2 +- .../hadoop/hive/ql/exec/mr/MapredLocalTask.java | 2 +- .../hadoop/hive/ql/io/HiveInputFormat.java | 16 +++-- .../orc_ppd_same_table_multiple_aliases.q | 17 ++++++ .../orc_ppd_same_table_multiple_aliases.q.out | 64 ++++++++++++++++++++ 6 files changed, 96 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/df5c56bd/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchTask.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchTask.java index 39c1a42..a7dace9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchTask.java @@ -77,7 +77,7 @@ public class FetchTask extends Task<FetchWork> implements Serializable { ColumnProjectionUtils.appendReadColumns( job, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths()); // push down filters - HiveInputFormat.pushFilters(job, 
ts); + HiveInputFormat.pushFilters(job, ts, null); AcidUtils.setAcidTableScan(job, ts.getConf().isAcidTable()); AcidUtils.setAcidOperationalProperties(job, ts.getConf().getAcidOperationalProperties()); http://git-wip-us.apache.org/repos/asf/hive/blob/df5c56bd/ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java index 5a7c23d..270b576 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java @@ -207,7 +207,7 @@ public class SMBMapJoinOperator extends AbstractMapJoinOperator<SMBJoinDesc> imp ColumnProjectionUtils.appendReadColumns( jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths()); // push down filters - HiveInputFormat.pushFilters(jobClone, ts); + HiveInputFormat.pushFilters(jobClone, ts, null); AcidUtils.setAcidTableScan(jobClone, ts.getConf().isAcidTable()); AcidUtils.setAcidOperationalProperties(jobClone, ts.getConf().getAcidOperationalProperties()); http://git-wip-us.apache.org/repos/asf/hive/blob/df5c56bd/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapredLocalTask.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapredLocalTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapredLocalTask.java index 5ac3d58..abd42ec 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapredLocalTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapredLocalTask.java @@ -483,7 +483,7 @@ public class MapredLocalTask extends Task<MapredLocalWork> implements Serializab ColumnProjectionUtils.appendReadColumns( jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths()); // push 
down filters - HiveInputFormat.pushFilters(jobClone, ts); + HiveInputFormat.pushFilters(jobClone, ts, null); AcidUtils.setAcidTableScan(jobClone, ts.getConf().isAcidTable()); AcidUtils.setAcidOperationalProperties(jobClone, ts.getConf().getAcidOperationalProperties()); http://git-wip-us.apache.org/repos/asf/hive/blob/df5c56bd/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java index 7d3ff36..c3b846c 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java @@ -475,7 +475,7 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable> } if (tableScan != null) { - pushFilters(conf, tableScan); + pushFilters(conf, tableScan, this.mrwork); } Path[] finalDirs = processPathsForMmRead(dirs, conf, validTxnList); @@ -633,7 +633,7 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable> tableScan.getNeededColumnIDs(), tableScan.getNeededColumns()); pushDownProjection = true; // push down filters - pushFilters(newjob, tableScan); + pushFilters(newjob, tableScan, this.mrwork); } } else { if (LOG.isDebugEnabled()) { @@ -729,7 +729,8 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable> return partDesc; } - public static void pushFilters(JobConf jobConf, TableScanOperator tableScan) { + public static void pushFilters(JobConf jobConf, TableScanOperator tableScan, + final MapWork mrwork) { // ensure filters are not set from previous pushFilters jobConf.unset(TableScanDesc.FILTER_TEXT_CONF_STR); @@ -753,6 +754,13 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable> return; } + // disable filter pushdown for mapreduce when there are more than one table aliases, + // since we don't 
clone jobConf per alias + if (mrwork != null && mrwork.getAliases() != null && mrwork.getAliases().size() > 1 && + jobConf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname).equals("mr")) { + return; + } + String serializedFilterObj = scanDesc.getSerializedFilterObject(); String serializedFilterExpr = scanDesc.getSerializedFilterExpr(); boolean hasObj = serializedFilterObj != null, hasExpr = serializedFilterExpr != null; @@ -849,7 +857,7 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable> ColumnProjectionUtils.appendReadColumns( jobConf, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths()); // push down filters - pushFilters(jobConf, ts); + pushFilters(jobConf, ts, this.mrwork); AcidUtils.setAcidTableScan(job, ts.getConf().isAcidTable()); AcidUtils.setAcidOperationalProperties(job, ts.getConf().getAcidOperationalProperties()); http://git-wip-us.apache.org/repos/asf/hive/blob/df5c56bd/ql/src/test/queries/clientpositive/orc_ppd_same_table_multiple_aliases.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/orc_ppd_same_table_multiple_aliases.q b/ql/src/test/queries/clientpositive/orc_ppd_same_table_multiple_aliases.q new file mode 100644 index 0000000..3b72c14 --- /dev/null +++ b/ql/src/test/queries/clientpositive/orc_ppd_same_table_multiple_aliases.q @@ -0,0 +1,17 @@ +-- SORT_QUERY_RESULTS; +set hive.optimize.index.filter=true; +create table test_table(number int) stored as ORC; + +-- Two insertions will create two files, with one stripe each +insert into table test_table VALUES (1); +insert into table test_table VALUES (2); + +-- This should return 2 records +select * from test_table; + +-- These should each return 1 record +select * from test_table where number = 1; +select * from test_table where number = 2; + +-- This should return 2 records +select * from test_table where number = 1 union all select * from test_table where number = 2; 
http://git-wip-us.apache.org/repos/asf/hive/blob/df5c56bd/ql/src/test/results/clientpositive/orc_ppd_same_table_multiple_aliases.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/orc_ppd_same_table_multiple_aliases.q.out b/ql/src/test/results/clientpositive/orc_ppd_same_table_multiple_aliases.q.out new file mode 100644 index 0000000..cc373a2 --- /dev/null +++ b/ql/src/test/results/clientpositive/orc_ppd_same_table_multiple_aliases.q.out @@ -0,0 +1,64 @@ +PREHOOK: query: create table test_table(number int) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_table +POSTHOOK: query: create table test_table(number int) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_table +PREHOOK: query: insert into table test_table VALUES (1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_table +POSTHOOK: query: insert into table test_table VALUES (1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_table +POSTHOOK: Lineage: test_table.number SCRIPT [] +PREHOOK: query: insert into table test_table VALUES (2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_table +POSTHOOK: query: insert into table test_table VALUES (2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_table +POSTHOOK: Lineage: test_table.number SCRIPT [] +PREHOOK: query: select * from test_table +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table +#### A masked pattern was here #### +1 +2 +PREHOOK: query: select * from test_table where number = 1 +PREHOOK: type: QUERY +PREHOOK: 
Input: default@test_table +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table where number = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table +#### A masked pattern was here #### +1 +PREHOOK: query: select * from test_table where number = 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table where number = 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table +#### A masked pattern was here #### +2 +PREHOOK: query: select * from test_table where number = 1 union all select * from test_table where number = 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table where number = 1 union all select * from test_table where number = 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table +#### A masked pattern was here #### +1 +2