This is an automated email from the ASF dual-hosted git repository.

pvary pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new ae55a3049b4 HIVE-25980: Reduce fs calls in HiveMetaStoreChecker.checkTable (Chiran Ravani reviewed by Syed Shameerur Rahman and Peter Vary)(#3053)
ae55a3049b4 is described below

commit ae55a3049b4100cea92ec4ac6374a8bc0f16e4a6
Author: Chiran Ravani <[email protected]>
AuthorDate: Sat Jun 25 05:34:52 2022 -0400

    HIVE-25980: Reduce fs calls in HiveMetaStoreChecker.checkTable (Chiran Ravani reviewed by Syed Shameerur Rahman and Peter Vary)(#3053)
---
 .../clientpositive/msck_repair_hive_25980.q        |  33 +++++++
 .../llap/msck_repair_hive_25980.q.out              |  86 +++++++++++++++++
 .../hive/metastore/HiveMetaStoreChecker.java       | 105 ++++++++++-----------
 3 files changed, 169 insertions(+), 55 deletions(-)

diff --git a/ql/src/test/queries/clientpositive/msck_repair_hive_25980.q b/ql/src/test/queries/clientpositive/msck_repair_hive_25980.q
new file mode 100644
index 00000000000..a769ad7dafb
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/msck_repair_hive_25980.q
@@ -0,0 +1,33 @@
+DROP TABLE IF EXISTS repairtable_hive_25980;
+
+CREATE TABLE repairtable_hive_25980(id int, name string) partitioned by(year int,month int);
+
+MSCK REPAIR TABLE repairtable_hive_25980;
+
+SHOW PARTITIONS repairtable_hive_25980;
+
+dfs ${system:test.dfs.mkdir} ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2022/month=01;
+dfs ${system:test.dfs.mkdir} ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2022/month=03;
+dfs ${system:test.dfs.mkdir} ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2022/month=04;
+dfs ${system:test.dfs.mkdir} ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=02;
+dfs ${system:test.dfs.mkdir} ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=01;
+dfs ${system:test.dfs.mkdir} ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=03;
+
+MSCK REPAIR TABLE repairtable_hive_25980;
+
+SHOW PARTITIONS repairtable_hive_25980;
+
+dfs -rmdir ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=02;
+dfs -rmdir ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=01;
+dfs -rmdir ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=03;
+dfs -rmdir ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021;
+dfs ${system:test.dfs.mkdir} file:///tmp/repairtable_hive_25980_external_dir/year=2022/month=02;
+dfs ${system:test.dfs.mkdir} file:///tmp/repairtable_hive_25980_external_dir/year=2021/month=04;
+dfs ${system:test.dfs.mkdir} ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2022/month=12;
+
+alter table repairtable_hive_25980 add partition(year=2022,month=02) location 'file:///tmp/repairtable_hive_25980_external_dir/year=2022/month=02';
+alter table repairtable_hive_25980 add partition(year=2021,month=04) location 'file:///tmp/repairtable_hive_25980_external_dir/year=2021/month=04';
+
+MSCK REPAIR TABLE repairtable_hive_25980 SYNC PARTITIONS;
+
+SHOW PARTITIONS repairtable_hive_25980;
diff --git a/ql/src/test/results/clientpositive/llap/msck_repair_hive_25980.q.out b/ql/src/test/results/clientpositive/llap/msck_repair_hive_25980.q.out
new file mode 100644
index 00000000000..cc3b799e5d4
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/msck_repair_hive_25980.q.out
@@ -0,0 +1,86 @@
+PREHOOK: query: DROP TABLE IF EXISTS repairtable_hive_25980
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS repairtable_hive_25980
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE repairtable_hive_25980(id int, name string) partitioned by(year int,month int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: query: CREATE TABLE repairtable_hive_25980(id int, name string) partitioned by(year int,month int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@repairtable_hive_25980
+PREHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980
+PREHOOK: type: MSCK
+PREHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980
+POSTHOOK: type: MSCK
+POSTHOOK: Output: default@repairtable_hive_25980
+PREHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+PREHOOK: type: SHOWPARTITIONS
+PREHOOK: Input: default@repairtable_hive_25980
+POSTHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Input: default@repairtable_hive_25980
+PREHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980
+PREHOOK: type: MSCK
+PREHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980
+POSTHOOK: type: MSCK
+POSTHOOK: Output: default@repairtable_hive_25980
+Partitions not in metastore:   repairtable_hive_25980:year=2021/month=1        repairtable_hive_25980:year=2021/month=2        repairtable_hive_25980:year=2021/month=3        repairtable_hive_25980:year=2022/month=1        repairtable_hive_25980:year=2022/month=3        repairtable_hive_25980:year=2022/month=4
+#### A masked pattern was here ####
+PREHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+PREHOOK: type: SHOWPARTITIONS
+PREHOOK: Input: default@repairtable_hive_25980
+POSTHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Input: default@repairtable_hive_25980
+year=2021/month=1
+year=2021/month=2
+year=2021/month=3
+year=2022/month=1
+year=2022/month=3
+year=2022/month=4
+#### A masked pattern was here ####
+PREHOOK: type: ALTERTABLE_ADDPARTS
+#### A masked pattern was here ####
+PREHOOK: Output: default@repairtable_hive_25980
+#### A masked pattern was here ####
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+#### A masked pattern was here ####
+POSTHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: Output: default@repairtable_hive_25980@year=2022/month=2
+#### A masked pattern was here ####
+PREHOOK: type: ALTERTABLE_ADDPARTS
+#### A masked pattern was here ####
+PREHOOK: Output: default@repairtable_hive_25980
+#### A masked pattern was here ####
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+#### A masked pattern was here ####
+POSTHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: Output: default@repairtable_hive_25980@year=2021/month=4
+PREHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980 SYNC PARTITIONS
+PREHOOK: type: MSCK
+PREHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980 SYNC PARTITIONS
+POSTHOOK: type: MSCK
+POSTHOOK: Output: default@repairtable_hive_25980
+Partitions not in metastore:   repairtable_hive_25980:year=2022/month=12
+Partitions missing from filesystem:    repairtable_hive_25980:year=2021/month=1        repairtable_hive_25980:year=2021/month=2        repairtable_hive_25980:year=2021/month=3
+#### A masked pattern was here ####
+Repair: Dropped partition from metastore hive.default.repairtable_hive_25980:year=2021/month=1
+Repair: Dropped partition from metastore hive.default.repairtable_hive_25980:year=2021/month=2
+Repair: Dropped partition from metastore hive.default.repairtable_hive_25980:year=2021/month=3
+PREHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+PREHOOK: type: SHOWPARTITIONS
+PREHOOK: Input: default@repairtable_hive_25980
+POSTHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Input: default@repairtable_hive_25980
+year=2021/month=4
+year=2022/month=1
+year=2022/month=12
+year=2022/month=2
+year=2022/month=3
+year=2022/month=4
diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
index 06783848247..5644cd8e0c8 100644
--- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
@@ -309,7 +309,33 @@ public class HiveMetaStoreChecker {
       return;
     }
 
-    Set<Path> partPaths = new HashSet<>();
+    // now check the table folder and see if we find anything
+    // that isn't in the metastore
+    Set<Path> allPartDirs = new HashSet<>();
+    List<FieldSchema> partColumns = table.getPartitionKeys();
+    checkPartitionDirs(tablePath, allPartDirs, Collections.unmodifiableList(getPartColNames(table)));
+    String tablePathStr = tablePath.toString();
+    int tablePathLength = tablePathStr.length();
+
+    if (filterExp != null) {
+      PartitionExpressionProxy expressionProxy = createExpressionProxy(conf);
+      List<String> partitions = new ArrayList<>();
+      Set<Path> partDirs = new HashSet<Path>();
+      boolean tablePathStrEndsWith = tablePathStr.endsWith("/");
+      int tablePathStrLen = tablePathStr.endsWith("/") ? tablePathStr.length() : tablePathStr.length() + 1;
+      allPartDirs.stream().forEach(path -> partitions.add(path.toString().substring(tablePathStrLen)));
+
+      // Remove all partition paths which does not matches the filter expression.
+      expressionProxy.filterPartitionsByExpr(partColumns, filterExp,
+              conf.get(MetastoreConf.ConfVars.DEFAULTPARTITIONNAME.getVarname()), partitions);
+
+      // now the partition list will contain all the paths that matches the filter expression.
+      // add them back to partDirs.
+      for (String path : partitions) {
+        partDirs.add(new Path(tablePath, path));
+      }
+      allPartDirs = partDirs;
+    }
 
     // check that the partition folders exist on disk
     for (Partition partition : parts) {
@@ -326,10 +352,24 @@ public class HiveMetaStoreChecker {
       CheckResult.PartitionResult prFromMetastore = new CheckResult.PartitionResult();
       prFromMetastore.setPartitionName(getPartitionName(table, partition));
       prFromMetastore.setTableName(partition.getTableName());
-      if (!fs.exists(partPath)) {
-        result.getPartitionsNotOnFs().add(prFromMetastore);
-      } else {
+      if (allPartDirs.remove(partPath)) {
         result.getCorrectPartitions().add(prFromMetastore);
+      } else {
+        // There can be edge case where user can define partition directory outside of table directory
+        // to avoid eviction of such partitions
+        // we check existence of partition path which are not in table directory
+        if (!partPath.toString().contains(tablePathStr)) {
+          if (!fs.exists(partPath)) {
+            result.getPartitionsNotOnFs().add(prFromMetastore);
+          } else {
+            result.getCorrectPartitions().add(prFromMetastore);
+          }
+        } else {
+          // If Partition Path contains table path, we assume to be non-existent partition since
+          // Partition spec has to be in format FS://<TablePath>/<PartKeyName>=<PartValue>
+          // otherwise partition discovery may fail.
+          result.getPartitionsNotOnFs().add(prFromMetastore);
+        }
       }
 
       if (partitionExpirySeconds > 0) {
@@ -348,15 +388,9 @@ public class HiveMetaStoreChecker {
           }
         }
       }
-
-      for (int i = 0; i < getPartitionSpec(table, partition).size(); i++) {
-        Path qualifiedPath = partPath.makeQualified(fs);
-        partPaths.add(qualifiedPath);
-        partPath = partPath.getParent();
-      }
     }
 
-    findUnknownPartitions(table, partPaths, filterExp, result);
+    findUnknownPartitions(table, allPartDirs, result);
 
     if (!isPartitioned(table) && TxnUtils.isTransactionalTable(table)) {
       // Check for writeIds in the table directory
@@ -368,22 +402,19 @@ public class HiveMetaStoreChecker {
   }
 
   /**
-   * Find partitions on the fs that are unknown to the metastore.
+   * Add partitions on the fs that are unknown to the metastore.
    *
    * @param table
    *          Table where the partitions would be located
-   * @param partPaths
-   *          Paths of the partitions the ms knows about
-   * @param filterExp
-   *          Filter expression which is used to prune th partition from the
-   *          metastore and FileSystem.
+   * @param missingPartDirs
+   *          Paths of the partitions that ms does not know about
    * @param result
    *          Result object
    * @throws IOException
    *           Thrown if we fail at fetching listings from the fs.
    * @throws MetastoreException ex
    */
-  void findUnknownPartitions(Table table, Set<Path> partPaths, byte[] filterExp,
+  void findUnknownPartitions(Table table, Set<Path> missingPartDirs,
       CheckResult result) throws IOException, MetastoreException, MetaException {
 
     Path tablePath = getPath(table);
@@ -391,42 +422,6 @@ public class HiveMetaStoreChecker {
       return;
     }
     boolean transactionalTable = TxnUtils.isTransactionalTable(table);
-    // now check the table folder and see if we find anything
-    // that isn't in the metastore
-    Set<Path> allPartDirs = new HashSet<>();
-    List<FieldSchema> partColumns = table.getPartitionKeys();
-    checkPartitionDirs(tablePath, allPartDirs, Collections.unmodifiableList(getPartColNames(table)));
-
-    if (filterExp != null) {
-      PartitionExpressionProxy expressionProxy = createExpressionProxy(conf);
-      List<String> paritions = new ArrayList<>();
-      Set<Path> partDirs = new HashSet<Path>();
-      String tablePathStr = tablePath.toString();
-      for (Path path : allPartDirs) {
-        // remove the table's path from the partition path
-        // eg: <tablePath>/p1=1/p2=2/p3=3 ---> p1=1/p2=2/p3=3
-        if (tablePathStr.endsWith("/")) {
-          paritions.add(path.toString().substring(tablePathStr.length()));
-        } else {
-          paritions.add(path.toString().substring(tablePathStr.length() + 1));
-        }
-      }
-      // Remove all partition paths which does not matches the filter expression.
-      expressionProxy.filterPartitionsByExpr(partColumns, filterExp,
-          conf.get(MetastoreConf.ConfVars.DEFAULTPARTITIONNAME.getVarname()), paritions);
-
-      // now the partition list will contain all the paths that matches the filter expression.
-      // add them back to partDirs.
-      for (String path : paritions) {
-        partDirs.add(new Path(tablePath, path));
-      }
-      allPartDirs = partDirs;
-    }
-    // don't want the table dir
-    allPartDirs.remove(tablePath);
-
-    // remove the partition paths we know about
-    allPartDirs.removeAll(partPaths);
 
     Set<String> partColNames = Sets.newHashSet();
     for(FieldSchema fSchema : getPartCols(table)) {
@@ -435,7 +430,7 @@ public class HiveMetaStoreChecker {
 
     Map<String, String> partitionColToTypeMap = getPartitionColtoTypeMap(table.getPartitionKeys());
     // we should now only have the unexpected folders left
-    for (Path partPath : allPartDirs) {
+    for (Path partPath : missingPartDirs) {
       FileSystem fs = partPath.getFileSystem(conf);
       String partitionName = getPartitionName(fs.makeQualified(tablePath),
           partPath, partColNames, partitionColToTypeMap);

Reply via email to