Repository: hive
Updated Branches:
  refs/heads/branch-1.0 d687bfb81 -> 8026f3914
backport HIVE-9720: Metastore does not properly migrate column stats when renaming a table across databases (Chaoyu via Xuefu)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/8026f391
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/8026f391
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/8026f391

Branch: refs/heads/branch-1.0
Commit: 8026f3914013825e224e62b6c540b3745459cb9e
Parents: d687bfb
Author: Pengcheng Xiong <pxi...@apache.org>
Authored: Mon Aug 31 14:26:35 2015 -0700
Committer: Pengcheng Xiong <pxi...@apache.org>
Committed: Mon Aug 31 14:26:35 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hive/metastore/HiveAlterHandler.java | 161 +++++++++++++++++++
 .../hadoop/hive/metastore/MetaStoreUtils.java   |  23 +++
 .../alter_table_invalidate_column_stats_2.q     |   6 +
 .../alter_table_invalidate_column_stats_2.q.out |  51 ++++++
 4 files changed, 241 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/8026f391/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java
index fc6215a..09c1c56 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java
@@ -33,8 +33,11 @@ import org.apache.hadoop.hive.common.FileUtils;
 import org.apache.hadoop.hive.common.ObjectPair;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.AlreadyExistsException;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.Database;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.InvalidInputException;
 import org.apache.hadoop.hive.metastore.api.InvalidObjectException;
 import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
 import org.apache.hadoop.hive.metastore.api.MetaException;
@@ -42,6 +45,9 @@ import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
 import org.apache.hadoop.hive.metastore.api.Partition;
 import org.apache.hadoop.hive.metastore.api.Table;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hive.common.util.HiveStringUtils;
+
+import com.google.common.collect.Lists;
 
 /**
  * Hive specific implementation of alter
@@ -191,6 +197,13 @@ public class HiveAlterHandler implements AlterHandler {
             Path newPartLocPath = new Path(oldUri.getScheme(), oldUri.getAuthority(), newPath);
             altps.add(ObjectPair.create(part, part.getSd().getLocation()));
             part.getSd().setLocation(newPartLocPath.toString());
+            String oldPartName = Warehouse.makePartName(oldt.getPartitionKeys(), part.getValues());
+            try {
+              //existing partition column stats is no longer valid, remove them
+              msdb.deletePartitionColumnStatistics(dbname, name, oldPartName, part.getValues(), null);
+            } catch (InvalidInputException iie) {
+              throw new InvalidOperationException("Unable to update partition stats in table rename." + iie);
+            }
             msdb.alterPartition(dbname, name, part.getValues(), part);
           }
         }
@@ -201,6 +214,7 @@ public class HiveAlterHandler implements AlterHandler {
         // alterPartition()
         MetaStoreUtils.updateUnpartitionedTableStatsFast(db, newt, wh, false, true);
       }
+      updateTableColumnStatsForAlterTable(msdb, oldt, newt);
       // now finally call alter table
       msdb.alterTable(dbname, name, newt);
       // commit the changes
@@ -293,6 +307,7 @@ public class HiveAlterHandler implements AlterHandler {
       if (MetaStoreUtils.requireCalStats(hiveConf, oldPart, new_part, tbl)) {
         MetaStoreUtils.updatePartitionStatsFast(new_part, wh, false, true);
       }
+      updatePartColumnStats(msdb, dbname, name, new_part.getValues(), new_part);
       msdb.alterPartition(dbname, name, new_part.getValues(), new_part);
     } catch (InvalidObjectException e) {
       throw new InvalidOperationException("alter is not possible");
@@ -331,6 +346,15 @@ public class HiveAlterHandler implements AlterHandler {
       // if the external partition is renamed, the file should not change
       if (tbl.getTableType().equals(TableType.EXTERNAL_TABLE.toString())) {
         new_part.getSd().setLocation(oldPart.getSd().getLocation());
+        String oldPartName = Warehouse.makePartName(tbl.getPartitionKeys(), oldPart.getValues());
+        try {
+          //existing partition column stats is no longer valid, remove
+          msdb.deletePartitionColumnStatistics(dbname, name, oldPartName, oldPart.getValues(), null);
+        } catch (NoSuchObjectException nsoe) {
+          //ignore
+        } catch (InvalidInputException iie) {
+          throw new InvalidOperationException("Unable to update partition stats in table rename." + iie);
+        }
         msdb.alterPartition(dbname, name, part_vals, new_part);
       } else {
         try {
@@ -377,6 +401,15 @@ public class HiveAlterHandler implements AlterHandler {
           if (MetaStoreUtils.requireCalStats(hiveConf, oldPart, new_part, tbl)) {
             MetaStoreUtils.updatePartitionStatsFast(new_part, wh, false, true);
           }
+          String oldPartName = Warehouse.makePartName(tbl.getPartitionKeys(), oldPart.getValues());
+          try {
+            //existing partition column stats is no longer valid, remove
+            msdb.deletePartitionColumnStatistics(dbname, name, oldPartName, oldPart.getValues(), null);
+          } catch (NoSuchObjectException nsoe) {
+            //ignore
+          } catch (InvalidInputException iie) {
+            throw new InvalidOperationException("Unable to update partition stats in table rename." + iie);
+          }
           msdb.alterPartition(dbname, name, part_vals, new_part);
       }
     }
@@ -443,6 +476,7 @@ public class HiveAlterHandler implements AlterHandler {
         if (MetaStoreUtils.requireCalStats(hiveConf, oldTmpPart, tmpPart, tbl)) {
           MetaStoreUtils.updatePartitionStatsFast(tmpPart, wh, false, true);
         }
+        updatePartColumnStats(msdb, dbname, name, oldTmpPart.getValues(), tmpPart);
       }
       msdb.alterPartitions(dbname, name, partValsList, new_parts);
     } catch (InvalidObjectException e) {
@@ -490,4 +524,131 @@ public class HiveAlterHandler implements AlterHandler {
     return new Path(currentUri.getScheme(), currentUri.getAuthority(),
         defaultNewPath.toUri().getPath());
   }
+
+  private void updatePartColumnStatsForAlterColumns(RawStore msdb, Partition oldPartition,
+      String oldPartName, List<String> partVals, List<FieldSchema> oldCols, Partition newPart)
+          throws MetaException, InvalidObjectException {
+    String dbName = oldPartition.getDbName();
+    String tableName = oldPartition.getTableName();
+    try {
+      List<String> oldPartNames = Lists.newArrayList(oldPartName);
+      List<String> oldColNames = new ArrayList<String>(oldCols.size());
+      for (FieldSchema oldCol : oldCols) {
+        oldColNames.add(oldCol.getName());
+      }
+      List<FieldSchema> newCols = newPart.getSd().getCols();
+      List<ColumnStatistics> partsColStats = msdb.getPartitionColumnStatistics(dbName, tableName,
+          oldPartNames, oldColNames);
+      assert (partsColStats.size() <= 1);
+      for (ColumnStatistics partColStats : partsColStats) { //actually only at most one loop
+        List<ColumnStatisticsObj> statsObjs = partColStats.getStatsObj();
+        for (ColumnStatisticsObj statsObj : statsObjs) {
+          boolean found = false;
+          for (FieldSchema newCol : newCols) {
+            if (statsObj.getColName().equals(newCol.getName())
+                && statsObj.getColType().equals(newCol.getType())) {
+              found = true;
+              break;
+            }
+          }
+          if (!found) {
+            msdb.deletePartitionColumnStatistics(dbName, tableName, oldPartName, partVals,
+                statsObj.getColName());
+          }
+        }
+      }
+    } catch (NoSuchObjectException nsoe) {
+      LOG.debug("Could not find db entry." + nsoe);
+      //ignore
+    } catch (InvalidInputException iie) {
+      throw new InvalidObjectException
+          ("Invalid input to update partition column stats in alter table change columns" + iie);
+    }
+  }
+
+  private void updatePartColumnStats(RawStore msdb, String dbName, String tableName,
+      List<String> partVals, Partition newPart) throws MetaException, InvalidObjectException {
+    dbName = HiveStringUtils.normalizeIdentifier(dbName);
+    tableName = HiveStringUtils.normalizeIdentifier(tableName);
+    String newDbName = HiveStringUtils.normalizeIdentifier(newPart.getDbName());
+    String newTableName = HiveStringUtils.normalizeIdentifier(newPart.getTableName());
+
+    Table oldTable = msdb.getTable(dbName, tableName);
+    if (oldTable == null) {
+      return;
+    }
+
+    try {
+      String oldPartName = Warehouse.makePartName(oldTable.getPartitionKeys(), partVals);
+      String newPartName = Warehouse.makePartName(oldTable.getPartitionKeys(), newPart.getValues());
+      if (!dbName.equals(newDbName) || !tableName.equals(newTableName)
+          || !oldPartName.equals(newPartName)) {
+        msdb.deletePartitionColumnStatistics(dbName, tableName, oldPartName, partVals, null);
+      } else {
+        Partition oldPartition = msdb.getPartition(dbName, tableName, partVals);
+        if (oldPartition == null) {
+          return;
+        }
+        if (oldPartition.getSd() != null && newPart.getSd() != null) {
+          List<FieldSchema> oldCols = oldPartition.getSd().getCols();
+          if (!MetaStoreUtils.areSameColumns(oldCols, newPart.getSd().getCols())) {
+            updatePartColumnStatsForAlterColumns(msdb, oldPartition, oldPartName, partVals, oldCols, newPart);
+          }
+        }
+      }
+    } catch (NoSuchObjectException nsoe) {
+      LOG.debug("Could not find db entry." + nsoe);
+      //ignore
+    } catch (InvalidInputException iie) {
+      throw new InvalidObjectException("Invalid input to update partition column stats." + iie);
+    }
+  }
+
+  private void updateTableColumnStatsForAlterTable(RawStore msdb, Table oldTable, Table newTable)
+      throws MetaException, InvalidObjectException {
+    String dbName = oldTable.getDbName();
+    String tableName = oldTable.getTableName();
+    String newDbName = HiveStringUtils.normalizeIdentifier(newTable.getDbName());
+    String newTableName = HiveStringUtils.normalizeIdentifier(newTable.getTableName());
+
+    try {
+      if (!dbName.equals(newDbName) || !tableName.equals(newTableName)) {
+        msdb.deleteTableColumnStatistics(dbName, tableName, null);
+      } else {
+        List<FieldSchema> oldCols = oldTable.getSd().getCols();
+        List<FieldSchema> newCols = newTable.getSd().getCols();
+        if (!MetaStoreUtils.areSameColumns(oldCols, newCols)) {
+          List<String> oldColNames = new ArrayList<String>(oldCols.size());
+          for (FieldSchema oldCol : oldCols) {
+            oldColNames.add(oldCol.getName());
+          }
+
+          ColumnStatistics cs = msdb.getTableColumnStatistics(dbName, tableName, oldColNames);
+          if (cs == null) {
+            return;
+          }
+
+          List<ColumnStatisticsObj> statsObjs = cs.getStatsObj();
+          for (ColumnStatisticsObj statsObj : statsObjs) {
+            boolean found = false;
+            for (FieldSchema newCol : newCols) {
+              if (statsObj.getColName().equalsIgnoreCase(newCol.getName())
+                  && statsObj.getColType().equals(newCol.getType())) {
+                found = true;
+                break;
+              }
+            }
+            if (!found) {
+              msdb.deleteTableColumnStatistics(dbName, tableName, statsObj.getColName());
+            }
+          }
+        }
+      }
+    } catch (NoSuchObjectException nsoe) {
+      LOG.debug("Could not find db entry." + nsoe);
+    } catch (InvalidInputException e) {
+      //should not happen since the input were verified before passed in
+      throw new InvalidObjectException("Invalid inputs to update table column stats: " + e);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/8026f391/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java
index 3cf9f17..35e39b3 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java
@@ -552,6 +552,29 @@ public class MetaStoreUtils {
     }
   }
 
+  static boolean isCascadeNeededInAlterTable(Table oldTable, Table newTable) {
+    //currently cascade only supports add/replace columns and
+    //changing column type/position/name/comments
+    List<FieldSchema> oldCols = oldTable.getSd().getCols();
+    List<FieldSchema> newCols = newTable.getSd().getCols();
+    return !areSameColumns(oldCols, newCols);
+  }
+
+  static boolean areSameColumns(List<FieldSchema> oldCols, List<FieldSchema> newCols) {
+    if (oldCols.size() != newCols.size()) {
+      return false;
+    } else {
+      for (int i = 0; i < oldCols.size(); i++) {
+        FieldSchema oldCol = oldCols.get(i);
+        FieldSchema newCol = newCols.get(i);
+        if (!oldCol.equals(newCol)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
   /**
    * @return true if oldType and newType are compatible.
    * Two types are compatible if we have internal functions to cast one to another.

http://git-wip-us.apache.org/repos/asf/hive/blob/8026f391/ql/src/test/queries/clientpositive/alter_table_invalidate_column_stats_2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/alter_table_invalidate_column_stats_2.q b/ql/src/test/queries/clientpositive/alter_table_invalidate_column_stats_2.q
new file mode 100644
index 0000000..5dd82c6
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/alter_table_invalidate_column_stats_2.q
@@ -0,0 +1,6 @@
+create table testpart1(id int) partitioned by (dept string);
+alter table testpart1 add partition(dept='a');
+insert into table testpart1 partition(dept='a') values (1);
+analyze table testpart1 partition(dept='a') compute statistics for columns;
+alter table testpart1 rename to testpart1_rename;
+drop table testpart1_rename;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hive/blob/8026f391/ql/src/test/results/clientpositive/alter_table_invalidate_column_stats_2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/alter_table_invalidate_column_stats_2.q.out b/ql/src/test/results/clientpositive/alter_table_invalidate_column_stats_2.q.out
new file mode 100644
index 0000000..8ff9e81
--- /dev/null
+++ b/ql/src/test/results/clientpositive/alter_table_invalidate_column_stats_2.q.out
@@ -0,0 +1,51 @@
+PREHOOK: query: create table testpart1(id int) partitioned by (dept string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testpart1
+POSTHOOK: query: create table testpart1(id int) partitioned by (dept string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testpart1
+PREHOOK: query: alter table testpart1 add partition(dept='a')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@testpart1
+POSTHOOK: query: alter table testpart1 add partition(dept='a')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@testpart1
+POSTHOOK: Output: default@testpart1@dept=a
+PREHOOK: query: insert into table testpart1 partition(dept='a') values (1)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@testpart1@dept=a
+POSTHOOK: query: insert into table testpart1 partition(dept='a') values (1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@testpart1@dept=a
+POSTHOOK: Lineage: testpart1 PARTITION(dept=a).id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: analyze table testpart1 partition(dept='a') compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testpart1
+PREHOOK: Input: default@testpart1@dept=a
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table testpart1 partition(dept='a') compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testpart1
+POSTHOOK: Input: default@testpart1@dept=a
+#### A masked pattern was here ####
+PREHOOK: query: alter table testpart1 rename to testpart1_rename
+PREHOOK: type: ALTERTABLE_RENAME
+PREHOOK: Input: default@testpart1
+PREHOOK: Output: default@testpart1
+POSTHOOK: query: alter table testpart1 rename to testpart1_rename
+POSTHOOK: type: ALTERTABLE_RENAME
+POSTHOOK: Input: default@testpart1
+POSTHOOK: Output: default@testpart1
+POSTHOOK: Output: default@testpart1_rename
+PREHOOK: query: drop table testpart1_rename
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@testpart1_rename
+PREHOOK: Output: default@testpart1_rename
+POSTHOOK: query: drop table testpart1_rename
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@testpart1_rename
+POSTHOOK: Output: default@testpart1_rename
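
For context, the q-test added here exercises only a same-database rename. A minimal HiveQL sketch of the cross-database rename scenario named in the HIVE-9720 subject might look like the following (the db1, db2, and tab1 identifiers are illustrative and are not part of this commit):

    create database db1;
    create database db2;
    use db1;
    create table tab1 (id int);
    insert into table tab1 values (1);
    -- gather column statistics so the metastore has entries to handle on rename
    analyze table tab1 compute statistics for columns;
    -- move the table into another database; before this patch, stale column
    -- statistics could be left behind keyed to the old db1.tab1 name
    alter table tab1 rename to db2.tab1;

With this backport, HiveAlterHandler deletes column statistics that are no longer valid when a table or partition is renamed, rather than leaving stale entries under the old database and table name.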