Repository: hive Updated Branches: refs/heads/master-txnstats 4cc7819d7 -> b51b47c0e
HIVE-19975 : PART 1 fix issues masking the bug, add a test (Sergey Shelukhin) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/b51b47c0 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/b51b47c0 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/b51b47c0 Branch: refs/heads/master-txnstats Commit: b51b47c0edca838a0f2109a049efb05e62da2b7b Parents: 4cc7819 Author: sergey <[email protected]> Authored: Wed Jun 27 20:25:16 2018 -0700 Committer: sergey <[email protected]> Committed: Wed Jun 27 20:26:53 2018 -0700 ---------------------------------------------------------------------- .../org/apache/hadoop/hive/ql/io/AcidUtils.java | 28 +- .../apache/hadoop/hive/ql/metadata/Hive.java | 5 +- ql/src/test/queries/clientpositive/acid_stats.q | 46 +++ .../results/clientpositive/acid_stats.q.out | 277 +++++++++++++++++++ .../hadoop/hive/metastore/HiveMetaStore.java | 4 +- .../hadoop/hive/metastore/ObjectStore.java | 6 + 6 files changed, 347 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/b51b47c0/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java index caf886f..07ed709 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -1650,13 +1650,13 @@ public class AcidUtils { public void setValidWriteIdList(String validWriteIdList) { this.validWriteIdList = validWriteIdList; } - } - // TODO# remove - public static TableSnapshot getTableSnapshot( - Configuration conf, Table tbl) throws LockException { - return getTableSnapshot(conf, tbl, false); + @Override + public String toString() { + return "[txnId=" + txnId + ", validWriteIdList=" + validWriteIdList + "]"; + } } + /** * Create a TableShopshot with the given "conf" * for the table of the given "tbl". @@ -1667,7 +1667,7 @@ public class AcidUtils { * @throws LockException */ public static TableSnapshot getTableSnapshot( - Configuration conf, Table tbl, boolean isInTxnScope) throws LockException { + Configuration conf, Table tbl) throws LockException { if (!isTransactionalTable(tbl)) { return null; } else { @@ -1683,19 +1683,17 @@ public class AcidUtils { if (txnId > 0 && isTransactionalTable(tbl)) { validWriteIdList = getTableValidWriteIdList(conf, fullTableName); - // TODO: remove in_test filters? - if (validWriteIdList == null && !isInTxnScope - && !HiveConf.getBoolVar(conf, ConfVars.HIVE_IN_TEST)) { - LOG.warn("Obtaining write IDs from metastore for " + tbl.getTableName()); + + if (HiveConf.getBoolVar(conf, ConfVars.HIVE_IN_TEST) + && conf.get(ValidTxnList.VALID_TXNS_KEY) == null) { + return null; + } + if (validWriteIdList == null) { validWriteIdList = getTableValidWriteIdListWithTxnList( conf, tbl.getDbName(), tbl.getTableName()); } if (validWriteIdList == null) { - if (!HiveConf.getBoolVar(conf, ConfVars.HIVE_IN_TEST)) { - throw new AssertionError("Cannot find valid write ID list for " + tbl.getTableName()); - } else { - return null; - } + throw new AssertionError("Cannot find valid write ID list for " + tbl.getTableName()); } } return new TableSnapshot(txnId, http://git-wip-us.apache.org/repos/asf/hive/blob/b51b47c0/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index c0a9be0..37bcc03 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -769,7 +769,7 @@ public class Hive { try { AcidUtils.TableSnapshot tableSnapshot = null; if (transactional) { - tableSnapshot = AcidUtils.getTableSnapshot(conf, newParts.get(0).getTable(), true); + tableSnapshot = AcidUtils.getTableSnapshot(conf, newParts.get(0).getTable()); } // Remove the DDL time so that it gets refreshed for (Partition tmpPart: newParts) { @@ -2454,8 +2454,7 @@ private void constructOneLBLocationMap(FileStatus fSta, try { org.apache.hadoop.hive.metastore.api.Partition part = Partition.createMetaPartitionObject(tbl, partSpec, null); - AcidUtils.TableSnapshot tableSnapshot = - AcidUtils.getTableSnapshot(conf, tbl, false); + AcidUtils.TableSnapshot tableSnapshot = AcidUtils.getTableSnapshot(conf, tbl); part.setTxnId(tableSnapshot != null ? tableSnapshot.getTxnId() : 0); part.setValidWriteIdList(tableSnapshot != null ? tableSnapshot.getValidWriteIdList() : null); return new Partition(tbl, getMSC().add_partition(part)); http://git-wip-us.apache.org/repos/asf/hive/blob/b51b47c0/ql/src/test/queries/clientpositive/acid_stats.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/acid_stats.q b/ql/src/test/queries/clientpositive/acid_stats.q new file mode 100644 index 0000000..1e1c9b0 --- /dev/null +++ b/ql/src/test/queries/clientpositive/acid_stats.q @@ -0,0 +1,46 @@ +set hive.stats.dbclass=fs; +set hive.stats.fetch.column.stats=true; +set datanucleus.cache.collections=false; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +set hive.stats.autogather=true; +set hive.stats.column.autogather=true; +set hive.compute.query.using.stats=true; +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; + +set hive.fetch.task.conversion=none; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.query.results.cache.enabled=false; + +-- test simple partition case + +create table stats_part(key int,value string) partitioned by (p int) tblproperties ("transactional"="true", "transactional_properties"="insert_only"); + +insert into table stats_part partition(p=101) values (1, "foo"); +explain select count(key) from stats_part; +insert into table stats_part partition(p=102) values (1, "bar"); +explain select count(key) from stats_part; + +alter table stats_part drop partition (p=102); +explain select count(key) from stats_part; + +drop table stats_part; + +-- test the case where we insert without updating stats... just in case + +create table stats2(key int,value string) tblproperties ("transactional"="true", "transactional_properties"="insert_only"); +insert into table stats2 values (1, "foo"); +explain select count(*) from stats2; +insert into table stats2 values (1, "bar"); +explain select count(*) from stats2; + +set hive.stats.autogather=false; +set hive.stats.column.autogather=false; +insert into table stats2 values (1, "baz"); +explain select count(*) from stats2; + +drop table stats2; http://git-wip-us.apache.org/repos/asf/hive/blob/b51b47c0/ql/src/test/results/clientpositive/acid_stats.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/acid_stats.q.out b/ql/src/test/results/clientpositive/acid_stats.q.out new file mode 100644 index 0000000..969433e --- /dev/null +++ b/ql/src/test/results/clientpositive/acid_stats.q.out @@ -0,0 +1,277 @@ +PREHOOK: query: create table stats_part(key int,value string) partitioned by (p int) tblproperties ("transactional"="true", "transactional_properties"="insert_only") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@stats_part +POSTHOOK: query: create table stats_part(key int,value string) partitioned by (p int) tblproperties ("transactional"="true", "transactional_properties"="insert_only") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@stats_part +PREHOOK: query: insert into table stats_part partition(p=101) values (1, "foo") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@stats_part@p=101 +POSTHOOK: query: insert into table stats_part partition(p=101) values (1, "foo") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@stats_part@p=101 +POSTHOOK: Lineage: stats_part PARTITION(p=101).key SCRIPT [] +POSTHOOK: Lineage: stats_part PARTITION(p=101).value SCRIPT [] +PREHOOK: query: explain select count(key) from stats_part +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(key) from stats_part +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: insert into table stats_part partition(p=102) values (1, "bar") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@stats_part@p=102 +POSTHOOK: query: insert into table stats_part partition(p=102) values (1, "bar") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@stats_part@p=102 +POSTHOOK: Lineage: stats_part PARTITION(p=102).key SCRIPT [] +POSTHOOK: Lineage: stats_part PARTITION(p=102).value SCRIPT [] +PREHOOK: query: explain select count(key) from stats_part +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(key) from stats_part +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: stats_part + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: key (type: int) + outputColumnNames: key + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL + Group By Operator + aggregations: count(key) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: alter table stats_part drop partition (p=102) +PREHOOK: type: ALTERTABLE_DROPPARTS +PREHOOK: Input: default@stats_part +PREHOOK: Output: default@stats_part@p=102 +POSTHOOK: query: alter table stats_part drop partition (p=102) +POSTHOOK: type: ALTERTABLE_DROPPARTS +POSTHOOK: Input: default@stats_part +POSTHOOK: Output: default@stats_part@p=102 +PREHOOK: query: explain select count(key) from stats_part +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(key) from stats_part +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: stats_part + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: int) + outputColumnNames: key + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(key) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: drop table stats_part +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@stats_part +PREHOOK: Output: default@stats_part +POSTHOOK: query: drop table stats_part +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@stats_part +POSTHOOK: Output: default@stats_part +PREHOOK: query: create table stats2(key int,value string) tblproperties ("transactional"="true", "transactional_properties"="insert_only") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@stats2 +POSTHOOK: query: create table stats2(key int,value string) tblproperties ("transactional"="true", "transactional_properties"="insert_only") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@stats2 +PREHOOK: query: insert into table stats2 values (1, "foo") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@stats2 +POSTHOOK: query: insert into table stats2 values (1, "foo") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@stats2 +POSTHOOK: Lineage: stats2.key SCRIPT [] +POSTHOOK: Lineage: stats2.value SCRIPT [] +PREHOOK: query: explain select count(*) from stats2 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from stats2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: insert into table stats2 values (1, "bar") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@stats2 +POSTHOOK: query: insert into table stats2 values (1, "bar") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@stats2 +POSTHOOK: Lineage: stats2.key SCRIPT [] +POSTHOOK: Lineage: stats2.value SCRIPT [] +PREHOOK: query: explain select count(*) from stats2 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from stats2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: insert into table stats2 values (1, "baz") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@stats2 +POSTHOOK: query: insert into table stats2 values (1, "baz") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@stats2 +POSTHOOK: Lineage: stats2.key SCRIPT [] +POSTHOOK: Lineage: stats2.value SCRIPT [] +PREHOOK: query: explain select count(*) from stats2 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from stats2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: stats2 + Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: drop table stats2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@stats2 +PREHOOK: Output: default@stats2 +POSTHOOK: query: drop table stats2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@stats2 +POSTHOOK: Output: default@stats2 http://git-wip-us.apache.org/repos/asf/hive/blob/b51b47c0/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java index fb15cda..df2bf10 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java @@ -5697,7 +5697,9 @@ public class HiveMetaStore extends ThriftHiveMetastore { } try { List<ColumnStatistics> stats = getMS().getPartitionColumnStatistics( - catName, dbName, tblName, lowerCasePartNames, lowerCaseColNames); + catName, dbName, tblName, lowerCasePartNames, lowerCaseColNames, + request.isSetTxnId() ? request.getTxnId() : -1, + request.isSetValidWriteIdList() ? request.getValidWriteIdList() : null); Map<String, List<ColumnStatisticsObj>> map = new HashMap<>(); for (ColumnStatistics stat : stats) { map.put(stat.getStatsDesc().getPartName(), stat.getStatsObj()); http://git-wip-us.apache.org/repos/asf/hive/blob/b51b47c0/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java index 2c3554e..f4919e0 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java @@ -8631,6 +8631,8 @@ public class ObjectStore implements RawStore, Configurable { @Override public List<ColumnStatistics> getPartitionColumnStatistics(String catName, String dbName, String tableName, List<String> partNames, List<String> colNames) throws MetaException, NoSuchObjectException { + // TODO: this will get stats without verifying ACID. Not clear when this can be valid... + // We need to check that this is not called on a txn table. return getPartitionColumnStatisticsInternal( catName, dbName, tableName, partNames, colNames, true, true); } @@ -8641,6 +8643,7 @@ public class ObjectStore implements RawStore, Configurable { List<String> partNames, List<String> colNames, long txnId, String writeIdList) throws MetaException, NoSuchObjectException { + // If any of the current partition stats in the metastore doesn't comply with // the isolation level of the query, return null. if (writeIdList != null) { @@ -12281,6 +12284,9 @@ public class ObjectStore implements RawStore, Configurable { long queryTxnId, String queryValidWriteIdList, long statsWriteId, boolean checkConcurrentWrites) throws MetaException { + // Note: can be changed to debug/info to verify the calls. + LOG.trace("Called with stats {}, {}; query {}, {}; checkConcurrentWrites {}", + statsTxnId, statsWriteIdList, queryTxnId, queryValidWriteIdList, checkConcurrentWrites); // if statsWriteIdList is null, // return true since the stats does not seem to be transactional. if (statsWriteIdList == null) {
