Repository: hive Updated Branches: refs/heads/master 54c3db908 -> 723f2d369
HIVE-12381: analyze table compute stats for table with special characters will wipe out all the table stats (Pengcheng Xiong, reviewed by Ashutosh Chauhan) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/723f2d36 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/723f2d36 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/723f2d36 Branch: refs/heads/master Commit: 723f2d3695eed5e45bc61533fd229ec67cb77c5a Parents: 54c3db9 Author: Pengcheng Xiong <pxi...@apache.org> Authored: Wed Dec 2 22:31:45 2015 +0800 Committer: Pengcheng Xiong <pxi...@apache.org> Committed: Wed Dec 2 22:31:45 2015 +0800 ---------------------------------------------------------------------- .../org/apache/hadoop/hive/conf/HiveConf.java | 3 - .../src/test/queries/positive/hbase_stats3.q | 50 --- .../test/results/positive/hbase_stats3.q.out | 324 -------------- .../hadoop/hive/ql/exec/FileSinkOperator.java | 4 +- .../apache/hadoop/hive/ql/exec/StatsTask.java | 13 +- .../hadoop/hive/ql/exec/TableScanOperator.java | 3 +- .../apache/hadoop/hive/ql/exec/Utilities.java | 25 -- .../hadoop/hive/ql/exec/spark/SparkTask.java | 6 +- .../ql/io/rcfile/stats/PartialScanMapper.java | 5 +- .../hive/ql/optimizer/GenMapRedUtils.java | 2 - .../RewriteQueryUsingAggregateIndexCtx.java | 3 +- .../hadoop/hive/ql/parse/SemanticAnalyzer.java | 3 +- .../hive/ql/parse/spark/GenSparkUtils.java | 1 - .../hadoop/hive/ql/plan/FileSinkDesc.java | 10 - .../hadoop/hive/ql/plan/TableScanDesc.java | 10 - .../hadoop/hive/ql/stats/StatsFactory.java | 11 - .../special_character_in_tabnames_1.q | 9 +- .../special_character_in_tabnames_2.q | 2 +- ql/src/test/queries/clientpositive/stats19.q | 105 ----- .../queries/clientpositive/stats_list_bucket.q | 2 - .../special_character_in_tabnames_1.q.out | 59 +++ .../special_character_in_tabnames_2.q.out | 10 +- .../test/results/clientpositive/stats19.q.out | 430 ------------------- .../stats_list_bucket.q.java1.7.out | 6 +- 24 files changed, 91 insertions(+), 1005 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 9e805bd..e984b6e 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1391,9 +1391,6 @@ public class HiveConf extends Configuration { "A lower value for error indicates higher accuracy and a higher compute cost."), HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION("hive.metastore.stats.ndv.densityfunction", false, "Whether to use density function to estimate the NDV for the whole table based on the NDV of partitions"), - HIVE_STATS_KEY_PREFIX_MAX_LENGTH("hive.stats.key.prefix.max.length", 150, - "Determines if when the prefix of the key used for intermediate stats collection\n" + - "exceeds a certain length, a hash of the key is used instead. If the value < 0 then hashing"), HIVE_STATS_KEY_PREFIX("hive.stats.key.prefix", "", "", true), // internal usage only // if length of variable length data type cannot be determined this length will be used. HIVE_STATS_MAX_VARIABLE_LENGTH("hive.stats.max.variable.length", 100, http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/hbase-handler/src/test/queries/positive/hbase_stats3.q ---------------------------------------------------------------------- diff --git a/hbase-handler/src/test/queries/positive/hbase_stats3.q b/hbase-handler/src/test/queries/positive/hbase_stats3.q deleted file mode 100644 index c74fa08..0000000 --- a/hbase-handler/src/test/queries/positive/hbase_stats3.q +++ /dev/null @@ -1,50 +0,0 @@ -set datanucleus.cache.collections=false; -set hive.stats.autogather=true; -set hive.stats.atomic=false; -set hive.stats.collect.rawdatasize=false; - -create table stats_part like srcpart; - -set hive.stats.key.prefix.max.length=0; - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - -set hive.stats.key.prefix.max.length=200; - --- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - -set hive.stats.key.prefix.max.length=-1; - --- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - -set hive.stats.key.prefix.max.length=0; - --- Verify the stats are correct for dynamic partitions - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - -set hive.stats.key.prefix.max.length=200; - --- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - -set hive.stats.key.prefix.max.length=-1; - --- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/hbase-handler/src/test/results/positive/hbase_stats3.q.out ---------------------------------------------------------------------- diff --git a/hbase-handler/src/test/results/positive/hbase_stats3.q.out b/hbase-handler/src/test/results/positive/hbase_stats3.q.out deleted file mode 100644 index 063800f..0000000 --- a/hbase-handler/src/test/results/positive/hbase_stats3.q.out +++ /dev/null @@ -1,324 +0,0 @@ -PREHOOK: query: create table stats_part like srcpart -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@stats_part -POSTHOOK: query: create table stats_part like srcpart -POSTHOOK: type: CREATETABLE -POSTHOOK: Output: database:default -POSTHOOK: Output: default@stats_part -PREHOOK: query: -- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: query: -- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 0 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 0 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 0 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- Verify the stats are correct for dynamic partitions - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08 -POSTHOOK: query: -- Verify the stats are correct for dynamic partitions - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 0 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08 -POSTHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 0 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08 -POSTHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 0 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java index 7459bba..32bfcf5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java @@ -1148,8 +1148,6 @@ public class FileSinkOperator extends TerminalOperator<FileSinkDesc> implements String taskID = Utilities.getTaskIdFromFilename(Utilities.getTaskId(hconf)); String spSpec = conf.getStaticSpec(); - int maxKeyLength = conf.getMaxStatsKeyPrefixLength(); - for (Map.Entry<String, FSPaths> entry : valToPaths.entrySet()) { String fspKey = entry.getKey(); // DP/LB FSPaths fspValue = entry.getValue(); @@ -1176,7 +1174,7 @@ public class FileSinkOperator extends TerminalOperator<FileSinkDesc> implements // use lowercase table name as prefix here, as StatsTask get table name from metastore to fetch counter. String prefix = conf.getTableInfo().getTableName().toLowerCase(); prefix = Utilities.join(prefix, spSpec, dpSpec); - prefix = Utilities.getHashedStatsPrefix(prefix, maxKeyLength); + prefix = prefix.endsWith(Path.SEPARATOR) ? prefix : prefix + Path.SEPARATOR; Map<String, String> statsToPublish = new HashMap<String, String>(); for (String statType : fspValue.stat.getStoredStats()) { http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java index c50d5b6..edf69fe 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java @@ -28,6 +28,7 @@ import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; @@ -155,8 +156,6 @@ public class StatsTask extends Task<StatsWork> implements Serializable { String tableFullName = table.getDbName() + "." + table.getTableName(); - int maxPrefixLength = StatsFactory.getMaxPrefixLength(conf); - if (partitions == null) { org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable(); Map<String, String> parameters = tTable.getParameters(); @@ -173,7 +172,7 @@ public class StatsTask extends Task<StatsWork> implements Serializable { if (statsAggregator != null) { String prefix = getAggregationPrefix(table, null); - updateStats(statsAggregator, parameters, prefix, maxPrefixLength, atomic); + updateStats(statsAggregator, parameters, prefix, atomic); } updateQuickStats(wh, parameters, tTable.getSd()); @@ -209,7 +208,7 @@ public class StatsTask extends Task<StatsWork> implements Serializable { if (statsAggregator != null) { String prefix = getAggregationPrefix(table, partn); - updateStats(statsAggregator, parameters, prefix, maxPrefixLength, atomic); + updateStats(statsAggregator, parameters, prefix, atomic); } updateQuickStats(wh, parameters, tPart.getSd()); @@ -252,7 +251,7 @@ public class StatsTask extends Task<StatsWork> implements Serializable { throws MetaException { // prefix is of the form dbName.tblName - String prefix = table.getDbName()+"."+table.getTableName(); + String prefix = table.getDbName() + "." + MetaStoreUtils.encodeTableName(table.getTableName()); if (partition != null) { return Utilities.join(prefix, Warehouse.makePartPath(partition.getSpec())); } @@ -301,10 +300,10 @@ public class StatsTask extends Task<StatsWork> implements Serializable { } private void updateStats(StatsAggregator statsAggregator, - Map<String, String> parameters, String prefix, int maxPrefixLength, boolean atomic) + Map<String, String> parameters, String prefix, boolean atomic) throws HiveException { - String aggKey = Utilities.getHashedStatsPrefix(prefix, maxPrefixLength); + String aggKey = prefix.endsWith(Path.SEPARATOR) ? prefix : prefix + Path.SEPARATOR; for (String statType : StatsSetupConst.statsRequireCompute) { String value = statsAggregator.aggregateStats(aggKey, statType); http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java index 90c83e6..32806dc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java @@ -312,8 +312,7 @@ public class TableScanOperator extends Operator<TableScanDesc> implements statsToPublish.clear(); String prefix = Utilities.join(conf.getStatsAggPrefix(), pspecs); - int maxKeyLength = conf.getMaxStatsKeyPrefixLength(); - String key = Utilities.getHashedStatsPrefix(prefix, maxKeyLength); + String key = prefix.endsWith(Path.SEPARATOR) ? prefix : prefix + Path.SEPARATOR; for(String statType : stats.get(pspecs).getStoredStats()) { statsToPublish.put(statType, Long.toString(stats.get(pspecs).getStat(statType))); } http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 4eb46ff..dacb80f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -3026,31 +3026,6 @@ public final class Utilities { return factory == null ? null : factory.getStatsPublisher(); } - /** - * If statsPrefix's length is greater than maxPrefixLength and maxPrefixLength > 0, - * then it returns an MD5 hash of statsPrefix followed by path separator, otherwise - * it returns statsPrefix - * - * @param statsPrefix prefix of stats key - * @param maxPrefixLength max length of stats key - * @return if the length of prefix is longer than max, return MD5 hashed value of the prefix - */ - public static String getHashedStatsPrefix(String statsPrefix, int maxPrefixLength) { - // todo: this might return possibly longer prefix than - // maxPrefixLength (if set) when maxPrefixLength - postfixLength < 17, - // which would make stat values invalid (especially for 'counter' type) - if (maxPrefixLength >= 0 && statsPrefix.length() > maxPrefixLength) { - try { - MessageDigest digester = MessageDigest.getInstance("MD5"); - digester.update(statsPrefix.getBytes()); - return new String(digester.digest()) + Path.SEPARATOR; // 17 byte - } catch (NoSuchAlgorithmException e) { - throw new RuntimeException(e); - } - } - return statsPrefix.endsWith(Path.SEPARATOR) ? statsPrefix : statsPrefix + Path.SEPARATOR; - } - public static String join(String... elements) { StringBuilder builder = new StringBuilder(); for (String element : elements) { http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkTask.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkTask.java index 336d490..31eee45 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkTask.java @@ -30,6 +30,7 @@ import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.Warehouse; @@ -272,14 +273,13 @@ public class SparkTask extends Task<SparkWork> { StatsWork statsWork = statsTask.getWork(); String tablePrefix = getTablePrefix(statsWork); List<Map<String, String>> partitionSpecs = getPartitionSpecs(statsWork); - int maxPrefixLength = StatsFactory.getMaxPrefixLength(conf); if (partitionSpecs == null) { - prefixs.add(Utilities.getHashedStatsPrefix(tablePrefix, maxPrefixLength)); + prefixs.add(tablePrefix.endsWith(Path.SEPARATOR) ? tablePrefix : tablePrefix + Path.SEPARATOR); } else { for (Map<String, String> partitionSpec : partitionSpecs) { String prefixWithPartition = Utilities.join(tablePrefix, Warehouse.makePartPath(partitionSpec)); - prefixs.add(Utilities.getHashedStatsPrefix(prefixWithPartition, maxPrefixLength)); + prefixs.add(prefixWithPartition.endsWith(Path.SEPARATOR) ? prefixWithPartition : prefixWithPartition + Path.SEPARATOR); } } http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/stats/PartialScanMapper.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/stats/PartialScanMapper.java b/ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/stats/PartialScanMapper.java index 8a5360e..09e4a47 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/stats/PartialScanMapper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/stats/PartialScanMapper.java @@ -24,6 +24,7 @@ import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.ErrorMsg; @@ -153,9 +154,9 @@ public class PartialScanMapper extends MapReduceBase implements throw new HiveException(ErrorMsg.STATSPUBLISHER_CONNECTION_ERROR.getErrorCodedMsg()); } - int maxPrefixLength = StatsFactory.getMaxPrefixLength(jc); // construct key used to store stats in intermediate db - String key = Utilities.getHashedStatsPrefix(statsAggKeyPrefix, maxPrefixLength); + String key = statsAggKeyPrefix.endsWith(Path.SEPARATOR) ? statsAggKeyPrefix : statsAggKeyPrefix + + Path.SEPARATOR; // construct statistics to be stored Map<String, String> statsToPublish = new HashMap<String, String>(); http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java index 82514d4..ecdaa55 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java @@ -110,7 +110,6 @@ import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.TezWork; -import org.apache.hadoop.hive.ql.stats.StatsFactory; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; @@ -1498,7 +1497,6 @@ public final class GenMapRedUtils { // mark the MapredWork and FileSinkOperator for gathering stats nd.getConf().setGatherStats(true); nd.getConf().setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE)); - nd.getConf().setMaxStatsKeyPrefixLength(StatsFactory.getMaxPrefixLength(hconf)); // mrWork.addDestinationTable(nd.getConf().getTableInfo().getTableName()); // subscribe feeds from the MoveTask so that MoveTask can forward the list http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java index 9acc7b7..d0f28d8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteQueryUsingAggregateIndexCtx.java @@ -27,6 +27,7 @@ import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.GroupByOperator; @@ -180,7 +181,7 @@ public final class RewriteQueryUsingAggregateIndexCtx implements NodeProcessorC TableScanDesc indexTableScanDesc = new TableScanDesc(indexTableHandle); indexTableScanDesc.setGatherStats(false); - String k = indexTableName + Path.SEPARATOR; + String k = MetaStoreUtils.encodeTableName(indexTableName) + Path.SEPARATOR; indexTableScanDesc.setStatsAggPrefix(k); scanOperator.setConf(indexTableScanDesc); http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 0ff6001..9caffb6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -9491,7 +9491,6 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { } tsDesc.setGatherStats(true); tsDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - tsDesc.setMaxStatsKeyPrefixLength(StatsFactory.getMaxPrefixLength(conf)); // append additional virtual columns for storing statistics Iterator<VirtualColumn> vcs = VirtualColumn.getStatsRegistry(conf).iterator(); @@ -9520,7 +9519,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { // db_name.table_name + partitionSec // as the prefix for easy of read during explain and debugging. // Currently, partition spec can only be static partition. - String k = tblName + Path.SEPARATOR; + String k = MetaStoreUtils.encodeTableName(tblName) + Path.SEPARATOR; tsDesc.setStatsAggPrefix(tab.getDbName()+"."+k); // set up WriteEntity for replication http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java index e87701f..40c23a5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java @@ -332,7 +332,6 @@ public class GenSparkUtils { for (FileSinkOperator fsOp : fileSinkList) { fsOp.getConf().setGatherStats(fileSink.getConf().isGatherStats()); fsOp.getConf().setStatsReliable(fileSink.getConf().isStatsReliable()); - fsOp.getConf().setMaxStatsKeyPrefixLength(fileSink.getConf().getMaxStatsKeyPrefixLength()); } } http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java index 9d6318a..40a8477 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java @@ -85,7 +85,6 @@ public class FileSinkDesc extends AbstractOperatorDesc { private boolean statsReliable; private ListBucketingCtx lbCtx; - private int maxStatsKeyPrefixLength = -1; private String statsTmpDir; private boolean statsCollectRawDataSize; @@ -152,7 +151,6 @@ public class FileSinkDesc extends AbstractOperatorDesc { ret.setParentDir(parentDir); ret.setLinkedFileSinkDesc(linkedFileSinkDesc); ret.setStatsReliable(statsReliable); - ret.setMaxStatsKeyPrefixLength(maxStatsKeyPrefixLength); ret.setStatsCollectRawDataSize(statsCollectRawDataSize); ret.setDpSortState(dpSortState); ret.setWriteType(writeType); @@ -400,14 +398,6 @@ public class FileSinkDesc extends AbstractOperatorDesc { this.linkedFileSinkDesc = linkedFileSinkDesc; } - public int getMaxStatsKeyPrefixLength() { - return maxStatsKeyPrefixLength; - } - - public void setMaxStatsKeyPrefixLength(int maxStatsKeyPrefixLength) { - this.maxStatsKeyPrefixLength = maxStatsKeyPrefixLength; - } - public boolean isStatsCollectRawDataSize() { return statsCollectRawDataSize; } http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java index b354f98..43bf7c5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java @@ -28,7 +28,6 @@ import org.apache.hadoop.hive.ql.exec.PTFUtils; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; -import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.parse.TableSample; import org.apache.hadoop.hive.ql.plan.Explain.Level; @@ -71,7 +70,6 @@ public class TableScanDesc extends AbstractOperatorDesc { */ private boolean gatherStats; private boolean statsReliable; - private int maxStatsKeyPrefixLength = -1; private String tmpStatsDir; private ExprNodeGenericFuncDesc filterExpr; @@ -256,14 +254,6 @@ public class TableScanDesc extends AbstractOperatorDesc { this.statsReliable = statsReliable; } - public int getMaxStatsKeyPrefixLength() { - return maxStatsKeyPrefixLength; - } - - public void setMaxStatsKeyPrefixLength(int maxStatsKeyPrefixLength) { - this.maxStatsKeyPrefixLength = maxStatsKeyPrefixLength; - } - public void setRowLimit(int rowLimit) { this.rowLimit = rowLimit; } http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsFactory.java index 9f4ed67..97bad32 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsFactory.java @@ -29,7 +29,6 @@ import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.util.ReflectionUtils; import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVESTATSDBCLASS; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH; /** * A factory of stats publisher and aggregator implementations of the @@ -43,16 +42,6 @@ public final class StatsFactory { private Class <? extends Serializable> aggregatorImplementation; private final Configuration jobConf; - public static int getMaxPrefixLength(Configuration conf) { - - if (HiveConf.getVar(conf, HIVESTATSDBCLASS).equalsIgnoreCase(StatDB.fs.name())) { - // no limit on prefix for fs. - return -1; - } - int maxPrefixLength = HiveConf.getIntVar(conf, HIVE_STATS_KEY_PREFIX_MAX_LENGTH); - return maxPrefixLength; - } - public static StatsFactory newFactory(Configuration conf) { return newFactory(HiveConf.getVar(conf, HIVESTATSDBCLASS), conf); } http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/test/queries/clientpositive/special_character_in_tabnames_1.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/special_character_in_tabnames_1.q b/ql/src/test/queries/clientpositive/special_character_in_tabnames_1.q index 7867ae1..799a66b 100644 --- a/ql/src/test/queries/clientpositive/special_character_in_tabnames_1.q +++ b/ql/src/test/queries/clientpositive/special_character_in_tabnames_1.q @@ -1,4 +1,4 @@ -set hive.cbo.enable=false; +set hive.cbo.enable=true; set hive.exec.check.crossproducts=false; set hive.stats.fetch.column.stats=true; set hive.auto.convert.join=false; @@ -1072,4 +1072,9 @@ insert overwrite table `src/_/cbo` select * from src; select * from `src/_/cbo` limit 1; - +drop table `t//`; +create table `t//` (col string); +insert into `t//` values(1); +insert into `t//` values(null); +analyze table `t//` compute statistics; +explain select * from `t//`; http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/test/queries/clientpositive/special_character_in_tabnames_2.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/special_character_in_tabnames_2.q b/ql/src/test/queries/clientpositive/special_character_in_tabnames_2.q index 6110279..d7010e9 100644 --- a/ql/src/test/queries/clientpositive/special_character_in_tabnames_2.q +++ b/ql/src/test/queries/clientpositive/special_character_in_tabnames_2.q @@ -1,4 +1,4 @@ -set hive.cbo.enable=false; +set hive.cbo.enable=true; -- try the query without indexing, with manual indexing, and with automatic indexing -- SORT_QUERY_RESULTS http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/test/queries/clientpositive/stats19.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/stats19.q b/ql/src/test/queries/clientpositive/stats19.q deleted file mode 100644 index a60be03..0000000 --- a/ql/src/test/queries/clientpositive/stats19.q +++ /dev/null @@ -1,105 +0,0 @@ -set datanucleus.cache.collections=false; -set hive.stats.autogather=true; -set hive.stats.reliable=true; -set hive.stats.dbclass=custom; -set hive.stats.default.publisher=org.apache.hadoop.hive.ql.stats.DummyStatsPublisher; -set hive.stats.default.aggregator=org.apache.hadoop.hive.ql.stats.KeyVerifyingStatsAggregator; - --- Note, its important that the partitions created below have a name greater than 16 characters in --- length since KeyVerifyingStatsAggregator depends on checking that a keyPrefix is hashed by the --- length of the keyPrefix, having a partition name greather than 16 characters guarantees no false --- positives. - -create table stats_part like srcpart; - -set hive.stats.key.prefix.max.length=0; - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src; - -set hive.stats.key.prefix.max.length=4000; - --- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src; - -set hive.stats.key.prefix.max.length=-1; - --- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src; - -set hive.stats.dbclass=fs; -set hive.stats.default.publisher=; -set hive.stats.default.aggregator=; - -set hive.stats.key.prefix.max.length=0; - --- Run the tests again and verify the stats are correct, this should verify that the stats publisher --- is hashing as well where appropriate - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - -set hive.stats.key.prefix.max.length=4000; - --- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - -set hive.stats.key.prefix.max.length=-1; - --- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - - -set hive.stats.dbclass=custom; -set hive.stats.default.publisher=org.apache.hadoop.hive.ql.stats.DummyStatsPublisher; -set hive.stats.default.aggregator=org.apache.hadoop.hive.ql.stats.KeyVerifyingStatsAggregator; -set hive.stats.key.prefix.max.length=0; - --- Do the same for dynamic partitions - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src; - -set hive.stats.key.prefix.max.length=4000; - --- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src; - -set hive.stats.key.prefix.max.length=-1; - --- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src; - -set hive.stats.dbclass=fs; -set hive.stats.default.publisher=; -set hive.stats.default.aggregator=; - -set hive.stats.key.prefix.max.length=0; - --- Run the tests again and verify the stats are correct, this should verify that the stats publisher --- is hashing as well where appropriate - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - -set hive.stats.key.prefix.max.length=200; - --- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); - -set hive.stats.key.prefix.max.length=-1; - --- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src; - -desc formatted stats_part partition (ds='2010-04-08', hr = '13'); http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/test/queries/clientpositive/stats_list_bucket.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/stats_list_bucket.q b/ql/src/test/queries/clientpositive/stats_list_bucket.q index 73403ab..51137a8 100644 --- a/ql/src/test/queries/clientpositive/stats_list_bucket.q +++ b/ql/src/test/queries/clientpositive/stats_list_bucket.q @@ -13,9 +13,7 @@ skewed by (c1, c2) on (('466','val_466'),('287','val_287'),('82','val_82')) stored as directories stored as rcfile; -set hive.stats.key.prefix.max.length=1; --- Make sure we use hashed IDs during stats publishing. -- Try partitioned table with list bucketing. -- The stats should show 500 rows loaded, as many rows as the src table has. http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/test/results/clientpositive/special_character_in_tabnames_1.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/special_character_in_tabnames_1.q.out b/ql/src/test/results/clientpositive/special_character_in_tabnames_1.q.out index bd0088a..cb949e4 100644 --- a/ql/src/test/results/clientpositive/special_character_in_tabnames_1.q.out +++ b/ql/src/test/results/clientpositive/special_character_in_tabnames_1.q.out @@ -19548,3 +19548,62 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@src/_/cbo #### A masked pattern was here #### 238 val_238 +PREHOOK: query: drop table `t//` +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table `t//` +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table `t//` (col string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t// +POSTHOOK: query: create table `t//` (col string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t// +PREHOOK: query: insert into `t//` values(1) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@t// +POSTHOOK: query: insert into `t//` values(1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@t// +POSTHOOK: Lineage: t//.col SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into `t//` values(null) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@t// +POSTHOOK: query: insert into `t//` values(null) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@t// +POSTHOOK: Lineage: t//.col SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: analyze table `t//` compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@t// +PREHOOK: Output: default@t// +POSTHOOK: query: analyze table `t//` compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t// +POSTHOOK: Output: default@t// +PREHOOK: query: explain select * from `t//` +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from `t//` +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: t// + Statistics: Num rows: 2 Data size: 3 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: col (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 3 Basic stats: COMPLETE Column stats: NONE + ListSink + http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/test/results/clientpositive/special_character_in_tabnames_2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/special_character_in_tabnames_2.q.out b/ql/src/test/results/clientpositive/special_character_in_tabnames_2.q.out index 1cc672e..51d31e0 100644 --- a/ql/src/test/results/clientpositive/special_character_in_tabnames_2.q.out +++ b/ql/src/test/results/clientpositive/special_character_in_tabnames_2.q.out @@ -113,7 +113,7 @@ STAGE PLANS: alias: s/c Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((key > 80) and (key < 100)) (type: boolean) + predicate: ((UDFToDouble(key) > 80.0) and (UDFToDouble(key) < 100.0)) (type: boolean) Statistics: Num rows: 55 Data size: 584 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: string), value (type: string) @@ -183,9 +183,9 @@ STAGE PLANS: Map Operator Tree: TableScan alias: default__s/c_src_index__ - filterExpr: ((key > 80) and (key < 100)) (type: boolean) + filterExpr: ((UDFToDouble(key) > 80.0) and (UDFToDouble(key) < 100.0)) (type: boolean) Filter Operator - predicate: ((key > 80) and (key < 100)) (type: boolean) + predicate: ((UDFToDouble(key) > 80.0) and (UDFToDouble(key) < 100.0)) (type: boolean) Select Operator expressions: _bucketname (type: string), _offsets (type: array<bigint>) outputColumnNames: _col0, _col1 @@ -216,10 +216,10 @@ STAGE PLANS: Map Operator Tree: TableScan alias: s/c - filterExpr: ((key > 80) and (key < 100)) (type: boolean) + filterExpr: ((UDFToDouble(key) > 80.0) and (UDFToDouble(key) < 100.0)) (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((key > 80) and (key < 100)) (type: boolean) + predicate: ((UDFToDouble(key) > 80.0) and (UDFToDouble(key) < 100.0)) (type: boolean) Statistics: Num rows: 55 Data size: 584 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: string), value (type: string) http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/test/results/clientpositive/stats19.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/stats19.q.out b/ql/src/test/results/clientpositive/stats19.q.out deleted file mode 100644 index ea56f3a..0000000 --- a/ql/src/test/results/clientpositive/stats19.q.out +++ /dev/null @@ -1,430 +0,0 @@ -PREHOOK: query: -- Note, its important that the partitions created below have a name greater than 16 characters in --- length since KeyVerifyingStatsAggregator depends on checking that a keyPrefix is hashed by the --- length of the keyPrefix, having a partition name greather than 16 characters guarantees no false --- positives. - -create table stats_part like srcpart -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@stats_part -POSTHOOK: query: -- Note, its important that the partitions created below have a name greater than 16 characters in --- length since KeyVerifyingStatsAggregator depends on checking that a keyPrefix is hashed by the --- length of the keyPrefix, having a partition name greather than 16 characters guarantees no false --- positives. - -create table stats_part like srcpart -POSTHOOK: type: CREATETABLE -POSTHOOK: Output: database:default -POSTHOOK: Output: default@stats_part -PREHOOK: query: -- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -Stats prefix is hashed: true -Stats prefix is hashed: true -POSTHOOK: query: -- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -Stats prefix is hashed: false -Stats prefix is hashed: false -POSTHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -Stats prefix is hashed: false -Stats prefix is hashed: false -POSTHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- Run the tests again and verify the stats are correct, this should verify that the stats publisher --- is hashing as well where appropriate - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: query: -- Run the tests again and verify the stats are correct, this should verify that the stats publisher --- is hashing as well where appropriate - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 5312 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 5312 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr = '13') select key, value from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 5312 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- Do the same for dynamic partitions - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08 -Stats prefix is hashed: true -Stats prefix is hashed: true -POSTHOOK: query: -- Do the same for dynamic partitions - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08 -Stats prefix is hashed: false -Stats prefix is hashed: false -POSTHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08 -Stats prefix is hashed: false -Stats prefix is hashed: false -POSTHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: -- Run the tests again and verify the stats are correct, this should verify that the stats publisher --- is hashing as well where appropriate - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08 -POSTHOOK: query: -- Run the tests again and verify the stats are correct, this should verify that the stats publisher --- is hashing as well where appropriate - --- The stats key should be hashed since the max length is too small -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 5312 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08 -POSTHOOK: query: -- The stats key should not be hashed since the max length is large enough -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 5312 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 -PREHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: default@stats_part@ds=2010-04-08 -POSTHOOK: query: -- The stats key should not be hashed since negative values should imply hashing is turned off -insert overwrite table stats_part partition (ds='2010-04-08', hr) select key, value, '13' from src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: default@stats_part@ds=2010-04-08/hr=13 -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] -POSTHOOK: Lineage: stats_part PARTITION(ds=2010-04-08,hr=13).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] -PREHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -PREHOOK: type: DESCTABLE -PREHOOK: Input: default@stats_part -POSTHOOK: query: desc formatted stats_part partition (ds='2010-04-08', hr = '13') -POSTHOOK: type: DESCTABLE -POSTHOOK: Input: default@stats_part -# col_name data_type comment - -key string default -value string default - -# Partition Information -# col_name data_type comment - -ds string -hr string - -# Detailed Partition Information -Partition Value: [2010-04-08, 13] -Database: default -Table: stats_part -#### A masked pattern was here #### -Partition Parameters: - COLUMN_STATS_ACCURATE true - numFiles 1 - numRows 500 - rawDataSize 5312 - totalSize 5812 -#### A masked pattern was here #### - -# Storage Information -SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -InputFormat: org.apache.hadoop.mapred.TextInputFormat -OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat -Compressed: No -Num Buckets: -1 -Bucket Columns: [] -Sort Columns: [] -Storage Desc Params: - serialization.format 1 http://git-wip-us.apache.org/repos/asf/hive/blob/723f2d36/ql/src/test/results/clientpositive/stats_list_bucket.q.java1.7.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/stats_list_bucket.q.java1.7.out b/ql/src/test/results/clientpositive/stats_list_bucket.q.java1.7.out index 63372c5..eb0ab90 100644 --- a/ql/src/test/results/clientpositive/stats_list_bucket.q.java1.7.out +++ b/ql/src/test/results/clientpositive/stats_list_bucket.q.java1.7.out @@ -32,8 +32,7 @@ stored as rcfile POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@stats_list_bucket -PREHOOK: query: -- Make sure we use hashed IDs during stats publishing. --- Try partitioned table with list bucketing. +PREHOOK: query: -- Try partitioned table with list bucketing. -- The stats should show 500 rows loaded, as many rows as the src table has. insert overwrite table stats_list_bucket partition (ds = '2008-04-08', hr = '11') @@ -41,8 +40,7 @@ insert overwrite table stats_list_bucket partition (ds = '2008-04-08', hr = '11 PREHOOK: type: QUERY PREHOOK: Input: default@src PREHOOK: Output: default@stats_list_bucket@ds=2008-04-08/hr=11 -POSTHOOK: query: -- Make sure we use hashed IDs during stats publishing. --- Try partitioned table with list bucketing. +POSTHOOK: query: -- Try partitioned table with list bucketing. -- The stats should show 500 rows loaded, as many rows as the src table has. insert overwrite table stats_list_bucket partition (ds = '2008-04-08', hr = '11')