This is an automated email from the ASF dual-hosted git repository. dbecker pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 52b11ab6aa0dbd2d99e6d583d740858b9f892b5f Author: Sai Hemanth Gantasala <[email protected]> AuthorDate: Wed Feb 7 17:44:42 2024 -0800 IMPALA-12487: Skip reloading file metadata for ALTER_TABLE events with trivial changes in StorageDescriptor IMPALA-11534 skips reloading file metadata for some trivial ALTER_TABLE events. However, ALTER_TABLE events that have trivial changes in StorageDescriptor are not handled in IMPALA-11534. The only changes that require file metadata reload are: location, rowformat, fileformat, serde, and storedAsSubDirectories. The file metadata reload can be skipped for all other changes in SD. Testing: 1) Manual testing by changing SD parameters in local environment. 2) Added unit tests for the same in MetastoreEventsProcessorTest class. Change-Id: I6fd9a9504bf93d2529dc7accbf436ad83e51d8ac Reviewed-on: http://gerrit.cloudera.org:8080/21019 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../org/apache/impala/compat/MetastoreShim.java | 2 +- .../impala/catalog/CatalogServiceCatalog.java | 13 ++- .../main/java/org/apache/impala/catalog/Table.java | 6 +- .../impala/catalog/events/MetastoreEvents.java | 76 ++++++++++++--- .../events/MetastoreEventsProcessorTest.java | 108 +++++++++++++++++++++ 5 files changed, 186 insertions(+), 19 deletions(-) diff --git a/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java b/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java index 000dccdf8..9ecad6284 100644 --- a/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java +++ b/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java @@ -945,7 +945,7 @@ public class MetastoreShim extends Hive3MetastoreShimBase { } } else { catalog_.reloadTableIfExists(entry.getKey().getDb(), entry.getKey().getTbl(), - "CommitTxnEvent", getEventId(), /*isSkipFileMetadataReload*/false); + "CommitTxnEvent", getEventId(), /*isSkipFileMetadataReload*/false); } } } diff --git a/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java b/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java index 817f97f50..b0f9d5181 100644 --- a/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java +++ b/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java @@ -2761,6 +2761,7 @@ public class CatalogServiceCatalog extends Catalog { } catalogTimeline.markEvent("Loaded table"); } + boolean isFullReloadOnTable = !isSkipFileMetadataReload; if (currentHmsEventId != -1 && syncToLatestEventId) { // fetch latest event id from HMS before starting table load and set that event // id as table's last synced id. It may happen that while the table was being @@ -2770,16 +2771,20 @@ public class CatalogServiceCatalog extends Catalog { // We are not replaying new events for the table in the end because certain // events like ALTER_TABLE in MetastoreEventProcessor call this method which // might trigger an endless loop cycle - tbl.setLastSyncedEventId(currentHmsEventId); + if (isFullReloadOnTable) { + tbl.setLastSyncedEventId(currentHmsEventId); + } else { + tbl.setLastSyncedEventId(eventId); + } } tbl.setCatalogVersion(newCatalogVersion); LOG.info(String.format("Refreshed table metadata: %s", tbl.getFullName())); // Set the last refresh event id as current HMS event id since all the metadata // until the current HMS event id is refreshed at this point. - if (currentHmsEventId > eventId) { - tbl.setLastRefreshEventId(currentHmsEventId); + if (currentHmsEventId > eventId && isFullReloadOnTable) { + tbl.setLastRefreshEventId(currentHmsEventId, isFullReloadOnTable); } else { - tbl.setLastRefreshEventId(eventId); + tbl.setLastRefreshEventId(eventId, isFullReloadOnTable); } return tbl.toTCatalogObject(resultType); } finally { diff --git a/fe/src/main/java/org/apache/impala/catalog/Table.java b/fe/src/main/java/org/apache/impala/catalog/Table.java index 09c5086e0..8fd187d95 100644 --- a/fe/src/main/java/org/apache/impala/catalog/Table.java +++ b/fe/src/main/java/org/apache/impala/catalog/Table.java @@ -1103,6 +1103,10 @@ public abstract class Table extends CatalogObjectImpl implements FeTable { public long getLastRefreshEventId() { return lastRefreshEventId_; } public void setLastRefreshEventId(long eventId) { + setLastRefreshEventId(eventId, true); + } + + public void setLastRefreshEventId(long eventId, boolean isSetLastSyncEventId) { if (eventId > lastRefreshEventId_) { lastRefreshEventId_ = eventId; } @@ -1111,7 +1115,7 @@ public abstract class Table extends CatalogObjectImpl implements FeTable { // TODO: Should we reset lastSyncedEvent Id if it is less than event Id? // If we don't reset it - we may start syncing table from an event id which // is less than refresh event id - if (lastSyncedEventId_ < eventId) { + if (lastSyncedEventId_ < eventId && isSetLastSyncEventId) { setLastSyncedEventId(eventId); } } diff --git a/fe/src/main/java/org/apache/impala/catalog/events/MetastoreEvents.java b/fe/src/main/java/org/apache/impala/catalog/events/MetastoreEvents.java index d5324004b..2bb6d8682 100644 --- a/fe/src/main/java/org/apache/impala/catalog/events/MetastoreEvents.java +++ b/fe/src/main/java/org/apache/impala/catalog/events/MetastoreEvents.java @@ -40,6 +40,7 @@ import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.NotificationEvent; import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.TxnToWriteId; import org.apache.hadoop.hive.metastore.messaging.AbortTxnMessage; @@ -1892,12 +1893,19 @@ public class MetastoreEvents { // There are lot of other alter statements which doesn't require file metadata // reload but these are the most common types for alter statements. + boolean skipFileMetadata = false; if (isFieldSchemaChanged(beforeTable, afterTable) || - isTableOwnerChanged(beforeTable.getOwner(), afterTable.getOwner()) || - !isCustomTblPropsChanged(whitelistedTblProperties, beforeTable, afterTable)) { - return true; + isTableOwnerChanged(beforeTable.getOwner(), afterTable.getOwner())) { + skipFileMetadata = true; + } else if (!Objects.equals(beforeTable.getSd(), afterTable.getSd())) { + if (isTrivialSdPropsChanged(beforeTable.getSd(), afterTable.getSd())) { + skipFileMetadata = true; + } + } else if (!isCustomTblPropsChanged(whitelistedTblProperties, beforeTable, + afterTable)) { + skipFileMetadata = true; } - return false; + return skipFileMetadata; } private boolean isFieldSchemaChanged( @@ -1907,8 +1915,9 @@ public class MetastoreEvents { List<FieldSchema> afterCols = afterTable.getSd().getCols(); // check if columns are added or removed if (beforeCols.size() != afterCols.size()) { - infoLog("Change in number of columns detected for table {}.{} from {} to {}", - dbName_, tblName_, beforeCols.size(), afterCols.size()); + infoLog("Change in number of columns detected for table {}.{} from {} to {}. " + + "So file metadata reload can be skipped.", dbName_, tblName_, + beforeCols.size(), afterCols.size()); return true; } // check if columns are replaced or column definition is changed @@ -1916,10 +1925,10 @@ public class MetastoreEvents { for (int i = 0; i < beforeCols.size(); i++) { if (!beforeCols.get(i).getName().equals(afterCols.get(i).getName()) || !beforeCols.get(i).getType().equals(afterCols.get(i).getType())) { - infoLog("Change in table schema detected for table {}.{} from {} to {} ", - tblName_, dbName_, beforeCols.get(i).getName() + " (" + - beforeCols.get(i).getType() +")", afterCols.get(i).getName() + " (" + - afterCols.get(i).getType() + ")"); + infoLog("Change in table schema detected for table {}.{} from {} ({}) " + + "to {} ({}). So file metadata reload can be skipped.", dbName_, tblName_, + beforeCols.get(i).getName(), beforeCols.get(i).getType(), + afterCols.get(i).getName(), afterCols.get(i).getType()); return true; } } @@ -1929,7 +1938,8 @@ public class MetastoreEvents { private boolean isTableOwnerChanged(String ownerBefore, String ownerAfter) { if (!Objects.equals(ownerBefore, ownerAfter)) { infoLog("Change in Ownership detected for table {}.{}, oldOwner: {}, " + - "newOwner: {}", dbName_, tblName_, ownerBefore, ownerAfter); + "newOwner: {}. So file metadata reload can be skipped.", + dbName_, tblName_, ownerBefore, ownerAfter); return true; } return false; @@ -1944,14 +1954,54 @@ public class MetastoreEvents { String configAfter = afterTable.getParameters().get(whitelistConfig); if (!Objects.equals(configBefore, configAfter)) { infoLog("Change in whitelisted table properties detected for table {}.{} " + - "whitelisted config: {}, value before: {}, value after: {}", dbName_, - tblName_, whitelistConfig, configBefore, configAfter); + "whitelisted config: {}, value before: {}, value after: {}. So file " + + "metadata should be reloaded.", dbName_, tblName_, whitelistConfig, + configBefore, configAfter); return true; } } return false; } + // Check if the trivial SD properties are changed during the alter statement. + // Also, the caller should make sure that 'beforeSd' and 'afterSd' are not equal. + private boolean isTrivialSdPropsChanged(StorageDescriptor beforeSd, + StorageDescriptor afterSd) { + Preconditions.checkNotNull(beforeSd, "beforeSd is null"); + Preconditions.checkNotNull(afterSd, "afterSd is null"); + StorageDescriptor previousSD = normalizeStorageDescriptor(beforeSd.deepCopy()); + StorageDescriptor currentSD = normalizeStorageDescriptor(afterSd.deepCopy()); + if (!Objects.equals(previousSD, currentSD)) { + infoLog("Non-trivial change in table storage descriptor (SD) detected for " + + "table {}.{}. So file metadata should be reloaded. SD before: {}, SD " + + "after: {}", dbName_, tblName_, beforeSd.toString(), afterSd.toString()); + return false; + } + infoLog("Trivial changes in table storage descriptor (SD) detected for table " + + "{}.{}. So file metadata reload can be skipped.", dbName_, tblName_); + return true; + } + + /** + * Normalize the storage descriptor by unsetting the trivial fields in SD like + * columns, compressed, numBuckets, bucketCols,SkewedInfo, and + * setStoredAsSubDirectories. + */ + private StorageDescriptor normalizeStorageDescriptor(StorageDescriptor sd) { + sd.unsetCols(); + sd.unsetCompressed(); + sd.unsetNumBuckets(); + sd.unsetBucketCols(); + sd.unsetSortCols(); + sd.unsetSkewedInfo(); + // setStoredAsSubDirectories = null or 'false' are trivial changes, so we normalize + // this value to false if it is null. + if (!sd.isSetStoredAsSubDirectories()) { + sd.setStoredAsSubDirectories(false); + } + return sd; + } + /** * Detects a event sync flag was turned on in this event */ diff --git a/fe/src/test/java/org/apache/impala/catalog/events/MetastoreEventsProcessorTest.java b/fe/src/test/java/org/apache/impala/catalog/events/MetastoreEventsProcessorTest.java index 6820292e7..81461e633 100644 --- a/fe/src/test/java/org/apache/impala/catalog/events/MetastoreEventsProcessorTest.java +++ b/fe/src/test/java/org/apache/impala/catalog/events/MetastoreEventsProcessorTest.java @@ -176,6 +176,8 @@ import org.junit.Assume; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; @@ -197,6 +199,8 @@ public class MetastoreEventsProcessorTest { private static CatalogServiceTestCatalog catalog_; private static CatalogOpExecutor catalogOpExecutor_; private static MetastoreEventsProcessor eventsProcessor_; + private static final Logger LOG = + LoggerFactory.getLogger(MetastoreEventsProcessorTest.class); @BeforeClass public static void setUpTestEnvironment() throws TException, ImpalaException { @@ -3084,6 +3088,110 @@ public class MetastoreEventsProcessorTest { assertNotEquals(fileMetadataLoadAfter, fileMetadataLoadBefore); } + /** + * Some of the the alter table events on Storage Descriptor doesn't require to reload + * file metadata, this test asserts that file metadata is not reloaded for such alter + * table events + */ + @Test + public void testAlterTableSdVerifyMetadataReload() throws Exception { + createDatabase(TEST_DB_NAME, null); + final String testTblName = "testSdFileMetadataReload"; + createTable(testTblName, true); + List<List<String>> partVals = new ArrayList<>(); + partVals.add(Arrays.asList("1")); + addPartitions(TEST_DB_NAME, testTblName, partVals); + eventsProcessor_.processEvents(); + // load the table first + loadTable(testTblName); + HdfsTable tbl = (HdfsTable) catalog_.getTable(TEST_DB_NAME, testTblName); + // get file metadata load counter before altering the partition + long fileMetadataLoadBefore = + tbl.getMetrics().getCounter(HdfsTable.NUM_LOAD_FILEMETADATA_METRIC).getCount(); + + // Test 1: Set file format + LOG.info("Test changes in file format for an Alter table statement"); + org.apache.hadoop.hive.metastore.api.Table hmsTbl = catalog_.getTable(TEST_DB_NAME, + testTblName).getMetaStoreTable(); + hmsTbl.getSd().setInputFormat("org.apache.hadoop.mapred.TextInputFormat"); + hmsTbl.getSd().setOutputFormat( + "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"); + long fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, hmsTbl); + assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter); + fileMetadataLoadBefore = fileMetadataLoadAfter; + + // Test 2: set rowformat + LOG.info("Test changes in row format for Alter table statement"); + hmsTbl = catalog_.getTable(TEST_DB_NAME, testTblName).getMetaStoreTable(); + hmsTbl.getSd().getSerdeInfo().setSerializerClass( + "org.apache.hadoop.hive.contrib.serde2.RegexSerDe"); + fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, hmsTbl); + assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter); + fileMetadataLoadBefore = fileMetadataLoadAfter; + + // Test 3: set location + LOG.info("Test changes in table location for Alter table statement"); + hmsTbl = catalog_.getTable(TEST_DB_NAME, testTblName).getMetaStoreTable(); + assertNotNull("Location is expected to be set to proceed forward in the test", + hmsTbl.getSd().getLocation()); + String tblLocation = hmsTbl.getSd().getLocation() + "_changed"; + hmsTbl.getSd().setLocation(tblLocation); + fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, hmsTbl); + assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter); + fileMetadataLoadBefore = fileMetadataLoadAfter; + + // Test 4: set storedAsSubDirectories from false to unset + LOG.info("Test changes in storedAsSubDirectories from false to unset"); + hmsTbl = tbl.getMetaStoreTable().deepCopy(); + hmsTbl.getSd().unsetStoredAsSubDirectories(); + fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, hmsTbl); + assertEquals(fileMetadataLoadBefore, fileMetadataLoadAfter); + + // Test 5: set storedAsSubDirectories to true + LOG.info("Test changes in storedAsSubDirectories from unset to true"); + hmsTbl.getSd().setStoredAsSubDirectories(true); + fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, hmsTbl); + assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter); + fileMetadataLoadBefore = fileMetadataLoadAfter; + + // Test 6: set storedAsSubDirectories from true to false + LOG.info("Test changes in storedAsSubDirectories from true to false"); + hmsTbl.getSd().setStoredAsSubDirectories(false); + fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, hmsTbl); + assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter); + + // Test 7: set storedAsSubDirectories from unset to false + LOG.info("Test changes in storedAsSubDirectories from unset to false"); + // first set the value to unset from false + hmsTbl.getSd().unsetStoredAsSubDirectories(); + fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, hmsTbl); + fileMetadataLoadBefore = fileMetadataLoadAfter; + hmsTbl.getSd().setStoredAsSubDirectories(false); + fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, hmsTbl); + assertEquals(fileMetadataLoadBefore, fileMetadataLoadAfter); + + // Test 8: keep SD unchanged by setting the same value and also change non-trivial + // TblProperties to verify file metadata is reloaded + LOG.info("Test changes in non-trivial tbl props and same SD"); + hmsTbl = tbl.getMetaStoreTable().deepCopy(); + hmsTbl.getSd().setStoredAsSubDirectories(false); + hmsTbl.getParameters().put("EXTERNAL", "false"); + fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, hmsTbl); + assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter); + } + + private long processAlterTableAndReturnMetric(String testTblName, + org.apache.hadoop.hive.metastore.api.Table msTbl) throws Exception { + try (MetaStoreClient msClient = catalog_.getMetaStoreClient()) { + msClient.getHiveClient().alter_table_with_environmentContext( + TEST_DB_NAME, testTblName, msTbl, null); + } + eventsProcessor_.processEvents(); + HdfsTable tbl = (HdfsTable) catalog_.getTable(TEST_DB_NAME, testTblName); + return + tbl.getMetrics().getCounter(HdfsTable.NUM_LOAD_FILEMETADATA_METRIC).getCount(); + } + /** * For an external table, the test asserts file metadata is not reloaded during * AlterPartitionEvent processing if only partition parameters are changed
