Repository: hive Updated Branches: refs/heads/master df722342a -> e5ba2690f
HIVE-13616: Investigate renaming a table without invalidating the column stats (Reviewed by Chaoyu Tang, Ashutosh Chauhan) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e5ba2690 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e5ba2690 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e5ba2690 Branch: refs/heads/master Commit: e5ba2690f1886952ee366de0310ea2aec031f347 Parents: df72234 Author: Aihua Xu <aihu...@apache.org> Authored: Wed May 11 15:11:20 2016 -0400 Committer: Aihua Xu <aihu...@apache.org> Committed: Fri May 20 09:26:25 2016 -0400 ---------------------------------------------------------------------- .../hadoop/hive/metastore/HiveAlterHandler.java | 85 ++++-- .../rename_table_update_column_stats.q | 55 ++++ .../rename_table_update_column_stats.q.out | 280 +++++++++++++++++++ 3 files changed, 390 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/e5ba2690/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java index 0652b9d..68c6e44 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.EnvironmentContext; @@ -237,9 +238,8 @@ public class HiveAlterHandler implements AlterHandler { // alterPartition() MetaStoreUtils.updateTableStatsFast(db, newt, wh, false, true, environmentContext); } - updateTableColumnStatsForAlterTable(msdb, oldt, newt); - // now finally call alter table - msdb.alterTable(dbname, name, newt); + + alterTableUpdateTableColumnStats(msdb, oldt, newt); // commit the changes success = msdb.commitTransaction(); } catch (InvalidObjectException e) { @@ -644,48 +644,73 @@ public class HiveAlterHandler implements AlterHandler { } } - private void updateTableColumnStatsForAlterTable(RawStore msdb, Table oldTable, Table newTable) + private void alterTableUpdateTableColumnStats(RawStore msdb, + Table oldTable, Table newTable) throws MetaException, InvalidObjectException { - String dbName = oldTable.getDbName(); - String tableName = oldTable.getTableName(); - String newDbName = HiveStringUtils.normalizeIdentifier(newTable.getDbName()); + String dbName = oldTable.getDbName().toLowerCase(); + String tableName = HiveStringUtils.normalizeIdentifier(oldTable.getTableName()); + String newDbName = newTable.getDbName().toLowerCase(); String newTableName = HiveStringUtils.normalizeIdentifier(newTable.getTableName()); try { - if (!dbName.equals(newDbName) || !tableName.equals(newTableName)) { - msdb.deleteTableColumnStatistics(dbName, tableName, null); - } else { - List<FieldSchema> oldCols = oldTable.getSd().getCols(); - List<FieldSchema> newCols = newTable.getSd().getCols(); - if (!MetaStoreUtils.areSameColumns(oldCols, newCols)) { + List<FieldSchema> oldCols = oldTable.getSd().getCols(); + List<FieldSchema> newCols = newTable.getSd().getCols(); + List<ColumnStatisticsObj> newStatsObjs = new ArrayList<ColumnStatisticsObj>(); + ColumnStatistics colStats = null; + boolean updateColumnStats = true; + + // Nothing to update if everything is the same + if (newDbName.equals(dbName) && + newTableName.equals(tableName) && + MetaStoreUtils.areSameColumns(oldCols, newCols)) { + updateColumnStats = false; + } + + if (updateColumnStats) { List<String> oldColNames = new ArrayList<String>(oldCols.size()); for (FieldSchema oldCol : oldCols) { oldColNames.add(oldCol.getName()); } - ColumnStatistics cs = msdb.getTableColumnStatistics(dbName, tableName, oldColNames); - if (cs == null) { - return; - } + // Collect column stats which need to be rewritten and remove old stats + colStats = msdb.getTableColumnStatistics(dbName, tableName, oldColNames); + if (colStats == null) { + updateColumnStats = false; + } else { + List<ColumnStatisticsObj> statsObjs = colStats.getStatsObj(); + if (statsObjs != null) { + for (ColumnStatisticsObj statsObj : statsObjs) { + boolean found = false; + for (FieldSchema newCol : newCols) { + if (statsObj.getColName().equalsIgnoreCase(newCol.getName()) + && statsObj.getColType().equals(newCol.getType())) { + found = true; + break; + } + } - List<ColumnStatisticsObj> statsObjs = cs.getStatsObj(); - if (statsObjs != null) { - for (ColumnStatisticsObj statsObj : statsObjs) { - boolean found = false; - for (FieldSchema newCol : newCols) { - if (statsObj.getColName().equalsIgnoreCase(newCol.getName()) - && statsObj.getColType().equals(newCol.getType())) { - found = true; - break; + if (found) { + if (!newDbName.equals(dbName) || !newTableName.equals(tableName)) { + msdb.deleteTableColumnStatistics(dbName, tableName, statsObj.getColName()); + newStatsObjs.add(statsObj); + } + } else { + msdb.deleteTableColumnStatistics(dbName, tableName, statsObj.getColName()); } } - if (!found) { - msdb.deleteTableColumnStatistics(dbName, tableName, statsObj.getColName()); - } } } } - } + + // Change to new table and append stats for the new table + msdb.alterTable(dbName, tableName, newTable); + if (updateColumnStats && !newStatsObjs.isEmpty()) { + ColumnStatisticsDesc statsDesc = colStats.getStatsDesc(); + statsDesc.setDbName(newDbName); + statsDesc.setTableName(newTableName); + colStats.setStatsObj(newStatsObjs); + msdb.updateTableColumnStatistics(colStats); + } } catch (NoSuchObjectException nsoe) { LOG.debug("Could not find db entry." + nsoe); } catch (InvalidInputException e) { http://git-wip-us.apache.org/repos/asf/hive/blob/e5ba2690/ql/src/test/queries/clientpositive/rename_table_update_column_stats.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/rename_table_update_column_stats.q b/ql/src/test/queries/clientpositive/rename_table_update_column_stats.q new file mode 100644 index 0000000..8e5766c --- /dev/null +++ b/ql/src/test/queries/clientpositive/rename_table_update_column_stats.q @@ -0,0 +1,55 @@ +set hive.mapred.mode=nonstrict; +set hive.metastore.try.direct.sql=true; + +drop database if exists statsdb1; +create database statsdb1; +drop database if exists statsdb2; +create database statsdb2; + +create table statsdb1.testtable1 (col1 int, col2 string, col3 string); +insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10; + +use statsdb1; + +analyze table testtable1 compute statistics for columns; + +describe formatted statsdb1.testtable1 col1; +describe formatted statsdb1.testtable1 col2; +describe formatted statsdb1.testtable1 col3; + +alter table statsdb1.testtable1 rename to statsdb2.testtable2; +describe formatted statsdb2.testtable2 col1; +describe formatted statsdb2.testtable2 col2; +describe formatted statsdb2.testtable2 col3; + +use default; +drop database statsdb1 cascade; +drop database statsdb2 cascade; + + +set hive.metastore.try.direct.sql=false; + +drop database if exists statsdb1; +create database statsdb1; +drop database if exists statsdb2; +create database statsdb2; + +create table statsdb1.testtable1 (col1 int, col2 string, col3 string); +insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10; + +use statsdb1; + +analyze table testtable1 compute statistics for columns; + +describe formatted statsdb1.testtable1 col1; +describe formatted statsdb1.testtable1 col2; +describe formatted statsdb1.testtable1 col3; + +alter table statsdb1.testtable1 rename to statsdb2.testtable2; +describe formatted statsdb2.testtable2 col1; +describe formatted statsdb2.testtable2 col2; +describe formatted statsdb2.testtable2 col3; + +use default; +drop database statsdb1 cascade; +drop database statsdb2 cascade; http://git-wip-us.apache.org/repos/asf/hive/blob/e5ba2690/ql/src/test/results/clientpositive/rename_table_update_column_stats.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/rename_table_update_column_stats.q.out b/ql/src/test/results/clientpositive/rename_table_update_column_stats.q.out new file mode 100644 index 0000000..c025cfa --- /dev/null +++ b/ql/src/test/results/clientpositive/rename_table_update_column_stats.q.out @@ -0,0 +1,280 @@ +PREHOOK: query: drop database if exists statsdb1 +PREHOOK: type: DROPDATABASE +POSTHOOK: query: drop database if exists statsdb1 +POSTHOOK: type: DROPDATABASE +PREHOOK: query: create database statsdb1 +PREHOOK: type: CREATEDATABASE +PREHOOK: Output: database:statsdb1 +POSTHOOK: query: create database statsdb1 +POSTHOOK: type: CREATEDATABASE +POSTHOOK: Output: database:statsdb1 +PREHOOK: query: drop database if exists statsdb2 +PREHOOK: type: DROPDATABASE +POSTHOOK: query: drop database if exists statsdb2 +POSTHOOK: type: DROPDATABASE +PREHOOK: query: create database statsdb2 +PREHOOK: type: CREATEDATABASE +PREHOOK: Output: database:statsdb2 +POSTHOOK: query: create database statsdb2 +POSTHOOK: type: CREATEDATABASE +POSTHOOK: Output: database:statsdb2 +PREHOOK: query: create table statsdb1.testtable1 (col1 int, col2 string, col3 string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:statsdb1 +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: create table statsdb1.testtable1 (col1 int, col2 string, col3 string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:statsdb1 +POSTHOOK: Output: statsdb1@testtable1 +PREHOOK: query: insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: statsdb1@testtable1 +POSTHOOK: Lineage: testtable1.col1 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: testtable1.col2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: testtable1.col3 SIMPLE [] +PREHOOK: query: use statsdb1 +PREHOOK: type: SWITCHDATABASE +PREHOOK: Input: database:statsdb1 +POSTHOOK: query: use statsdb1 +POSTHOOK: type: SWITCHDATABASE +POSTHOOK: Input: database:statsdb1 +PREHOOK: query: analyze table testtable1 compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: statsdb1@testtable1 +#### A masked pattern was here #### +POSTHOOK: query: analyze table testtable1 compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: statsdb1@testtable1 +#### A masked pattern was here #### +PREHOOK: query: describe formatted statsdb1.testtable1 col1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col1 int 27 484 0 8 from deserializer +PREHOOK: query: describe formatted statsdb1.testtable1 col2 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col2 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col2 string 0 12 6.7 7 from deserializer +PREHOOK: query: describe formatted statsdb1.testtable1 col3 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col3 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col3 string 0 1 4.0 4 from deserializer +PREHOOK: query: alter table statsdb1.testtable1 rename to statsdb2.testtable2 +PREHOOK: type: ALTERTABLE_RENAME +PREHOOK: Input: statsdb1@testtable1 +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: alter table statsdb1.testtable1 rename to statsdb2.testtable2 +POSTHOOK: type: ALTERTABLE_RENAME +POSTHOOK: Input: statsdb1@testtable1 +POSTHOOK: Output: statsdb1@testtable1 +POSTHOOK: Output: statsdb2@testtable2 +PREHOOK: query: describe formatted statsdb2.testtable2 col1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col1 int 27 484 0 8 from deserializer +PREHOOK: query: describe formatted statsdb2.testtable2 col2 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col2 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col2 string 0 12 6.7 7 from deserializer +PREHOOK: query: describe formatted statsdb2.testtable2 col3 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col3 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col3 string 0 1 4.0 4 from deserializer +PREHOOK: query: use default +PREHOOK: type: SWITCHDATABASE +PREHOOK: Input: database:default +POSTHOOK: query: use default +POSTHOOK: type: SWITCHDATABASE +POSTHOOK: Input: database:default +PREHOOK: query: drop database statsdb1 cascade +PREHOOK: type: DROPDATABASE +PREHOOK: Input: database:statsdb1 +PREHOOK: Output: database:statsdb1 +POSTHOOK: query: drop database statsdb1 cascade +POSTHOOK: type: DROPDATABASE +POSTHOOK: Input: database:statsdb1 +POSTHOOK: Output: database:statsdb1 +PREHOOK: query: drop database statsdb2 cascade +PREHOOK: type: DROPDATABASE +PREHOOK: Input: database:statsdb2 +PREHOOK: Output: database:statsdb2 +PREHOOK: Output: statsdb2@testtable2 +POSTHOOK: query: drop database statsdb2 cascade +POSTHOOK: type: DROPDATABASE +POSTHOOK: Input: database:statsdb2 +POSTHOOK: Output: database:statsdb2 +POSTHOOK: Output: statsdb2@testtable2 +PREHOOK: query: drop database if exists statsdb1 +PREHOOK: type: DROPDATABASE +POSTHOOK: query: drop database if exists statsdb1 +POSTHOOK: type: DROPDATABASE +PREHOOK: query: create database statsdb1 +PREHOOK: type: CREATEDATABASE +PREHOOK: Output: database:statsdb1 +POSTHOOK: query: create database statsdb1 +POSTHOOK: type: CREATEDATABASE +POSTHOOK: Output: database:statsdb1 +PREHOOK: query: drop database if exists statsdb2 +PREHOOK: type: DROPDATABASE +POSTHOOK: query: drop database if exists statsdb2 +POSTHOOK: type: DROPDATABASE +PREHOOK: query: create database statsdb2 +PREHOOK: type: CREATEDATABASE +PREHOOK: Output: database:statsdb2 +POSTHOOK: query: create database statsdb2 +POSTHOOK: type: CREATEDATABASE +POSTHOOK: Output: database:statsdb2 +PREHOOK: query: create table statsdb1.testtable1 (col1 int, col2 string, col3 string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:statsdb1 +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: create table statsdb1.testtable1 (col1 int, col2 string, col3 string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:statsdb1 +POSTHOOK: Output: statsdb1@testtable1 +PREHOOK: query: insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: insert into statsdb1.testtable1 select key, value, 'val3' from src limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: statsdb1@testtable1 +POSTHOOK: Lineage: testtable1.col1 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: testtable1.col2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: testtable1.col3 SIMPLE [] +PREHOOK: query: use statsdb1 +PREHOOK: type: SWITCHDATABASE +PREHOOK: Input: database:statsdb1 +POSTHOOK: query: use statsdb1 +POSTHOOK: type: SWITCHDATABASE +POSTHOOK: Input: database:statsdb1 +PREHOOK: query: analyze table testtable1 compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: statsdb1@testtable1 +#### A masked pattern was here #### +POSTHOOK: query: analyze table testtable1 compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: statsdb1@testtable1 +#### A masked pattern was here #### +PREHOOK: query: describe formatted statsdb1.testtable1 col1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col1 int 27 484 0 8 from deserializer +PREHOOK: query: describe formatted statsdb1.testtable1 col2 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col2 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col2 string 0 12 6.7 7 from deserializer +PREHOOK: query: describe formatted statsdb1.testtable1 col3 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb1@testtable1 +POSTHOOK: query: describe formatted statsdb1.testtable1 col3 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb1@testtable1 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col3 string 0 1 4.0 4 from deserializer +PREHOOK: query: alter table statsdb1.testtable1 rename to statsdb2.testtable2 +PREHOOK: type: ALTERTABLE_RENAME +PREHOOK: Input: statsdb1@testtable1 +PREHOOK: Output: statsdb1@testtable1 +POSTHOOK: query: alter table statsdb1.testtable1 rename to statsdb2.testtable2 +POSTHOOK: type: ALTERTABLE_RENAME +POSTHOOK: Input: statsdb1@testtable1 +POSTHOOK: Output: statsdb1@testtable1 +POSTHOOK: Output: statsdb2@testtable2 +PREHOOK: query: describe formatted statsdb2.testtable2 col1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col1 int 27 484 0 8 from deserializer +PREHOOK: query: describe formatted statsdb2.testtable2 col2 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col2 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col2 string 0 12 6.7 7 from deserializer +PREHOOK: query: describe formatted statsdb2.testtable2 col3 +PREHOOK: type: DESCTABLE +PREHOOK: Input: statsdb2@testtable2 +POSTHOOK: query: describe formatted statsdb2.testtable2 col3 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: statsdb2@testtable2 +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +col3 string 0 1 4.0 4 from deserializer +PREHOOK: query: use default +PREHOOK: type: SWITCHDATABASE +PREHOOK: Input: database:default +POSTHOOK: query: use default +POSTHOOK: type: SWITCHDATABASE +POSTHOOK: Input: database:default +PREHOOK: query: drop database statsdb1 cascade +PREHOOK: type: DROPDATABASE +PREHOOK: Input: database:statsdb1 +PREHOOK: Output: database:statsdb1 +POSTHOOK: query: drop database statsdb1 cascade +POSTHOOK: type: DROPDATABASE +POSTHOOK: Input: database:statsdb1 +POSTHOOK: Output: database:statsdb1 +PREHOOK: query: drop database statsdb2 cascade +PREHOOK: type: DROPDATABASE +PREHOOK: Input: database:statsdb2 +PREHOOK: Output: database:statsdb2 +PREHOOK: Output: statsdb2@testtable2 +POSTHOOK: query: drop database statsdb2 cascade +POSTHOOK: type: DROPDATABASE +POSTHOOK: Input: database:statsdb2 +POSTHOOK: Output: database:statsdb2 +POSTHOOK: Output: statsdb2@testtable2