Repository: hive Updated Branches: refs/heads/branch-3 490041dd3 -> 4c73511f3
HIVE-21040 : msck does unnecessary file listing at last level of directory tree (Vihang Karajgaonkar, reviewed by Prasanth Jayachandran) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4c73511f Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4c73511f Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4c73511f Branch: refs/heads/branch-3 Commit: 4c73511f3f59144fb8cc306117a1bf1f3d6dd071 Parents: 490041d Author: Vihang Karajgaonkar <vihan...@apache.org> Authored: Mon Dec 17 17:13:56 2018 -0800 Committer: Vihang Karajgaonkar <vihan...@apache.org> Committed: Wed Jan 2 11:16:18 2019 -0800 ---------------------------------------------------------------------- .../hive/ql/metadata/HiveMetaStoreChecker.java | 18 ++-- .../ql/metadata/TestHiveMetaStoreChecker.java | 106 +++++++++++++++++++ 2 files changed, 116 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/4c73511f/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java index 598bb2e..9339094 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hive.ql.log.PerfLogger; import org.apache.hadoop.hive.ql.session.SessionState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -470,10 +471,13 @@ public class HiveMetaStoreChecker { throws IOException, HiveException, InterruptedException { final Path currentPath = pd.p; final int currentDepth = pd.depth; + if (currentDepth == partColNames.size()) { + return currentPath; + } FileStatus[] fileStatuses = fs.listStatus(currentPath, FileUtils.HIDDEN_FILES_PATH_FILTER); // found no files under a sub-directory under table base path; it is possible that the table // is empty and hence there are no partition sub-directories created under base path - if (fileStatuses.length == 0 && currentDepth > 0 && currentDepth < partColNames.size()) { + if (fileStatuses.length == 0 && currentDepth > 0) { // since maxDepth is not yet reached, we are missing partition // columns in currentPath logOrThrowExceptionWithMsg( @@ -481,12 +485,12 @@ public class HiveMetaStoreChecker { } else { // found files under currentPath add them to the queue if it is a directory for (FileStatus fileStatus : fileStatuses) { - if (!fileStatus.isDirectory() && currentDepth < partColNames.size()) { + if (!fileStatus.isDirectory()) { // found a file at depth which is less than number of partition keys logOrThrowExceptionWithMsg( "MSCK finds a file rather than a directory when it searches for " + fileStatus.getPath().toString()); - } else if (fileStatus.isDirectory() && currentDepth < partColNames.size()) { + } else { // found a sub-directory at a depth less than number of partition keys // validate if the partition directory name matches with the corresponding // partition colName at currentDepth @@ -503,9 +507,6 @@ public class HiveMetaStoreChecker { } } } - if (currentDepth == partColNames.size()) { - return currentPath; - } } return null; } @@ -528,7 +529,8 @@ public class HiveMetaStoreChecker { } } - private void checkPartitionDirs(final ExecutorService executor, + @VisibleForTesting + void checkPartitionDirs(final ExecutorService executor, final Path basePath, final Set<Path> result, final FileSystem fs, final List<String> partColNames) throws HiveException { try { @@ -559,7 +561,7 @@ public class HiveMetaStoreChecker { nextLevel = tempQueue; } } catch (InterruptedException | ExecutionException e) { - LOG.error(e.getMessage()); + LOG.error("Exception received while listing partition directories", e); executor.shutdownNow(); throw new HiveException(e.getCause()); } http://git-wip-us.apache.org/repos/asf/hive/blob/4c73511f/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java index ff411f6..46f6ad8 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java @@ -18,16 +18,29 @@ package org.apache.hadoop.hive.ql.metadata; import static org.junit.Assert.*; +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.hive.common.FileUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; import org.apache.hadoop.hive.metastore.api.Database; @@ -39,10 +52,12 @@ import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.thrift.TException; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; import com.google.common.collect.Lists; +import org.mockito.Mockito; /** * TestHiveMetaStoreChecker. @@ -575,6 +590,94 @@ public class TestHiveMetaStoreChecker { } assertTrue("Expected HiveException", exception!=null && exception instanceof HiveException); } + + /** + * Test counts the number of listStatus calls in the msck core method of + * listing sub-directories. This is important to check since it unnecessary + * listStatus calls could cause performance degradation in remote filesystems + * like S3. The test creates a mock FileSystem object and a mock directory structure + * to simulate a table which has 2 partition keys and 2 partition values at each level. + * In the end it counts how many times the listStatus is called on the mock filesystem + * and confirm its equal to the current theoretical value. + * + * @throws IOException + * @throws HiveException + */ + @Test + public void testNumberOfListStatusCalls() throws IOException, HiveException { + LocalFileSystem mockFs = Mockito.mock(LocalFileSystem.class); + Path tableLocation = new Path("mock:///tmp/testTable"); + + Path countryUS = new Path(tableLocation, "country=US"); + Path countryIND = new Path(tableLocation, "country=IND"); + + Path cityPA = new Path(countryUS, "city=PA"); + Path citySF = new Path(countryUS, "city=SF"); + Path cityBOM = new Path(countryIND, "city=BOM"); + Path cityDEL = new Path(countryIND, "city=DEL"); + + Path paData = new Path(cityPA, "datafile"); + Path sfData = new Path(citySF, "datafile"); + Path bomData = new Path(cityBOM, "datafile"); + Path delData = new Path(cityDEL, "datafile"); + + //level 1 listing + FileStatus[] allCountries = getMockFileStatus(countryUS, countryIND); + when(mockFs.listStatus(tableLocation, FileUtils.HIDDEN_FILES_PATH_FILTER)) + .thenReturn(allCountries); + + //level 2 listing + FileStatus[] filesInUS = getMockFileStatus(cityPA, citySF); + when(mockFs.listStatus(countryUS, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInUS); + + FileStatus[] filesInInd = getMockFileStatus(cityBOM, cityDEL); + when(mockFs.listStatus(countryIND, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInInd); + + //level 3 listing + FileStatus[] paFiles = getMockFileStatus(paData); + when(mockFs.listStatus(cityPA, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(paFiles); + + FileStatus[] sfFiles = getMockFileStatus(sfData); + when(mockFs.listStatus(citySF, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(sfFiles); + + FileStatus[] bomFiles = getMockFileStatus(bomData); + when(mockFs.listStatus(cityBOM, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(bomFiles); + + FileStatus[] delFiles = getMockFileStatus(delData); + when(mockFs.listStatus(cityDEL, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(delFiles); + + HiveMetaStoreChecker checker = new HiveMetaStoreChecker(hive); + ExecutorService executorService = Executors.newFixedThreadPool(2); + Set<Path> result = new HashSet<>(); + checker.checkPartitionDirs(executorService, tableLocation, result, mockFs, + Arrays.asList("country", "city")); + // if there are n partition columns, then number of times listStatus should be called + // must be equal + // to (numDirsAtLevel1) + (numDirsAtLevel2) + ... + (numDirAtLeveln-1) + // in this case it should 1 (table level) + 2 (US, IND) + verify(mockFs, times(3)).listStatus(any(Path.class), any(PathFilter.class)); + Assert.assertEquals("msck should have found 4 unknown partitions", 4, result.size()); + } + + private FileStatus[] getMockFileStatus(Path... paths) throws IOException { + FileStatus[] result = new FileStatus[paths.length]; + int i = 0; + for (Path p : paths) { + result[i++] = createMockFileStatus(p); + } + return result; + } + + private FileStatus createMockFileStatus(Path p) { + FileStatus mock = Mockito.mock(FileStatus.class); + when(mock.getPath()).thenReturn(p); + if (p.toString().contains("datafile")) { + when(mock.isDirectory()).thenReturn(false); + } else { + when(mock.isDirectory()).thenReturn(true); + } + return mock; + } /** * Creates a test partitioned table with the required level of nested partitions and number of * partitions @@ -702,6 +805,9 @@ public class TestHiveMetaStoreChecker { private void createDirectory(String partPath) throws IOException { Path part = new Path(partPath); fs.mkdirs(part); + // create files under partitions to simulate real partitions + fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata1")); + fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata2")); fs.deleteOnExit(part); } }