IMPALA-4943: Speed up block md loading for add/recover partition calls.

This change makes alter table add/recover partitions calls use the
per directory block metadata loading routines instead of doing it
per file. This is done since these calls always load the entire
partition directory from scratch and there is no advantage in
loading them incrementally on a per-file basis.

Tests: Ran core tests and the metadata benchmark tests.

(I) Improvement: METADATA-BENCHMARKS()
100K-PARTITIONS-1M-FILES-03-RECOVER [text / none / none] (718.62s ->
549.91s [-23.48%])

(I) Improvement: METADATA-BENCHMARKS()
100K-PARTITIONS-1M-FILES-08-ADD-PARTITION [text / none / none] (46.92s
-> 26.20s [-44.15%])

Change-Id: I331f1f090518f317bcd7df069e480edbd8f039f1
Reviewed-on: http://gerrit.cloudera.org:8080/6651
Reviewed-by: Bharath Vissapragada <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/8bd854df
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/8bd854df
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/8bd854df

Branch: refs/heads/master
Commit: 8bd854dfa6f40bd32e8fcd6f284c15b045b4f1ee
Parents: 7555316
Author: Bharath Vissapragada <[email protected]>
Authored: Fri Apr 14 12:42:45 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Fri Apr 21 20:53:26 2017 +0000

----------------------------------------------------------------------
 .../org/apache/impala/catalog/HdfsTable.java    | 25 ++++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8bd854df/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 30241b0..143e2b1 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -835,6 +835,19 @@ public class HdfsTable extends Table {
   }
 
   /**
+   * Helper method to load the partition file metadata from scratch. This method is
+   * optimized for loading newly added partitions. For refreshing existing partitions
+   * use refreshFileMetadata(HdfsPartition).
+   */
+  private void loadFileMetadataFromScratch(HdfsPartition partition) {
+    Path partitionDirPath = partition.getLocationPath();
+    Set<Path> dirsToLoad = Sets.newHashSet(partitionDirPath);
+    HashMap<Path, List<HdfsPartition>> partsByPath = Maps.newHashMap();
+    partsByPath.put(partitionDirPath, Lists.newArrayList(partition));
+    loadMetadataAndDiskIds(dirsToLoad, partsByPath);
+  }
+
+  /**
    * Helper method to load the block locations from each directory in 'locations'
    * and filtering only the paths from 'partsByPath'. Also loads the disk IDs
    * corresponding to these block locations.
@@ -903,7 +916,7 @@ public class HdfsTable extends Table {
       org.apache.hadoop.hive.metastore.api.Partition msPartition)
       throws CatalogException {
     HdfsPartition hdfsPartition = createPartition(storageDescriptor, msPartition);
-    refreshFileMetadata(hdfsPartition);
+    loadFileMetadataFromScratch(hdfsPartition);
     return hdfsPartition;
   }
 
@@ -1513,7 +1526,9 @@ public class HdfsTable extends Table {
   }
 
   /**
-   * Loads the file descriptors and block metadata of a list of partitions.
+   * Loads the file descriptors and block metadata of a list of partitions. This function
+   * is optimized for incremental loading of the partition file metadata. To load it from
+   * scratch, use loadFileMetadataFromScratch(HdfsPartition).
    */
   private void loadPartitionFileMetadata(List<HdfsPartition> partitions)
       throws Exception {
@@ -1548,8 +1563,7 @@ public class HdfsTable extends Table {
   /**
    * Loads the file descriptors and block metadata of a partition from its
    * StorageDescriptor. If 'partition' does not have an entry in the Hive Metastore,
-   * 'storageDescriptor' is the StorageDescriptor of the associated table. Populates
-   * 'perFsFileBlocks' with file block info and updates table metadata.
+   * 'storageDescriptor' is the StorageDescriptor of the associated table.
    */
   private void loadPartitionFileMetadata(StorageDescriptor storageDescriptor,
       HdfsPartition partition) throws Exception {
@@ -1994,8 +2008,9 @@ public class HdfsTable extends Table {
    */
   public void reloadPartition(HdfsPartition oldPartition, Partition hmsPartition)
       throws CatalogException {
-    HdfsPartition refreshedPartition = createAndLoadPartition(
+    HdfsPartition refreshedPartition = createPartition(
         hmsPartition.getSd(), hmsPartition);
+    refreshFileMetadata(refreshedPartition);
     Preconditions.checkArgument(oldPartition == null
         || oldPartition.compareTo(refreshedPartition) == 0);
     dropPartition(oldPartition);

Reply via email to