This is an automated email from the ASF dual-hosted git repository.
pvary pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new ae55a3049b4 HIVE-25980: Reduce fs calls in
HiveMetaStoreChecker.checkTable (Chiran Ravani reviewed by Syed Shameerur
Rahman and Peter Vary)(#3053)
ae55a3049b4 is described below
commit ae55a3049b4100cea92ec4ac6374a8bc0f16e4a6
Author: Chiran Ravani <[email protected]>
AuthorDate: Sat Jun 25 05:34:52 2022 -0400
HIVE-25980: Reduce fs calls in HiveMetaStoreChecker.checkTable (Chiran
Ravani reviewed by Syed Shameerur Rahman and Peter Vary)(#3053)
---
.../clientpositive/msck_repair_hive_25980.q | 33 +++++++
.../llap/msck_repair_hive_25980.q.out | 86 +++++++++++++++++
.../hive/metastore/HiveMetaStoreChecker.java | 105 ++++++++++-----------
3 files changed, 169 insertions(+), 55 deletions(-)
diff --git a/ql/src/test/queries/clientpositive/msck_repair_hive_25980.q
b/ql/src/test/queries/clientpositive/msck_repair_hive_25980.q
new file mode 100644
index 00000000000..a769ad7dafb
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/msck_repair_hive_25980.q
@@ -0,0 +1,33 @@
+DROP TABLE IF EXISTS repairtable_hive_25980;
+
+CREATE TABLE repairtable_hive_25980(id int, name string) partitioned by(year
int,month int);
+
+MSCK REPAIR TABLE repairtable_hive_25980;
+
+SHOW PARTITIONS repairtable_hive_25980;
+
+dfs ${system:test.dfs.mkdir}
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2022/month=01;
+dfs ${system:test.dfs.mkdir}
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2022/month=03;
+dfs ${system:test.dfs.mkdir}
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2022/month=04;
+dfs ${system:test.dfs.mkdir}
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=02;
+dfs ${system:test.dfs.mkdir}
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=01;
+dfs ${system:test.dfs.mkdir}
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=03;
+
+MSCK REPAIR TABLE repairtable_hive_25980;
+
+SHOW PARTITIONS repairtable_hive_25980;
+
+dfs -rmdir
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=02;
+dfs -rmdir
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=01;
+dfs -rmdir
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021/month=03;
+dfs -rmdir ${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2021;
+dfs ${system:test.dfs.mkdir}
file:///tmp/repairtable_hive_25980_external_dir/year=2022/month=02;
+dfs ${system:test.dfs.mkdir}
file:///tmp/repairtable_hive_25980_external_dir/year=2021/month=04;
+dfs ${system:test.dfs.mkdir}
${system:test.local.warehouse.dir}/repairtable_hive_25980/year=2022/month=12;
+
+alter table repairtable_hive_25980 add partition(year=2022,month=02) location
'file:///tmp/repairtable_hive_25980_external_dir/year=2022/month=02';
+alter table repairtable_hive_25980 add partition(year=2021,month=04) location
'file:///tmp/repairtable_hive_25980_external_dir/year=2021/month=04';
+
+MSCK REPAIR TABLE repairtable_hive_25980 SYNC PARTITIONS;
+
+SHOW PARTITIONS repairtable_hive_25980;
diff --git
a/ql/src/test/results/clientpositive/llap/msck_repair_hive_25980.q.out
b/ql/src/test/results/clientpositive/llap/msck_repair_hive_25980.q.out
new file mode 100644
index 00000000000..cc3b799e5d4
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/msck_repair_hive_25980.q.out
@@ -0,0 +1,86 @@
+PREHOOK: query: DROP TABLE IF EXISTS repairtable_hive_25980
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS repairtable_hive_25980
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE repairtable_hive_25980(id int, name string)
partitioned by(year int,month int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: query: CREATE TABLE repairtable_hive_25980(id int, name string)
partitioned by(year int,month int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@repairtable_hive_25980
+PREHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980
+PREHOOK: type: MSCK
+PREHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980
+POSTHOOK: type: MSCK
+POSTHOOK: Output: default@repairtable_hive_25980
+PREHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+PREHOOK: type: SHOWPARTITIONS
+PREHOOK: Input: default@repairtable_hive_25980
+POSTHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Input: default@repairtable_hive_25980
+PREHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980
+PREHOOK: type: MSCK
+PREHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980
+POSTHOOK: type: MSCK
+POSTHOOK: Output: default@repairtable_hive_25980
+Partitions not in metastore: repairtable_hive_25980:year=2021/month=1
repairtable_hive_25980:year=2021/month=2
repairtable_hive_25980:year=2021/month=3
repairtable_hive_25980:year=2022/month=1
repairtable_hive_25980:year=2022/month=3
repairtable_hive_25980:year=2022/month=4
+#### A masked pattern was here ####
+PREHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+PREHOOK: type: SHOWPARTITIONS
+PREHOOK: Input: default@repairtable_hive_25980
+POSTHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Input: default@repairtable_hive_25980
+year=2021/month=1
+year=2021/month=2
+year=2021/month=3
+year=2022/month=1
+year=2022/month=3
+year=2022/month=4
+#### A masked pattern was here ####
+PREHOOK: type: ALTERTABLE_ADDPARTS
+#### A masked pattern was here ####
+PREHOOK: Output: default@repairtable_hive_25980
+#### A masked pattern was here ####
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+#### A masked pattern was here ####
+POSTHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: Output: default@repairtable_hive_25980@year=2022/month=2
+#### A masked pattern was here ####
+PREHOOK: type: ALTERTABLE_ADDPARTS
+#### A masked pattern was here ####
+PREHOOK: Output: default@repairtable_hive_25980
+#### A masked pattern was here ####
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+#### A masked pattern was here ####
+POSTHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: Output: default@repairtable_hive_25980@year=2021/month=4
+PREHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980 SYNC PARTITIONS
+PREHOOK: type: MSCK
+PREHOOK: Output: default@repairtable_hive_25980
+POSTHOOK: query: MSCK REPAIR TABLE repairtable_hive_25980 SYNC PARTITIONS
+POSTHOOK: type: MSCK
+POSTHOOK: Output: default@repairtable_hive_25980
+Partitions not in metastore: repairtable_hive_25980:year=2022/month=12
+Partitions missing from filesystem:
repairtable_hive_25980:year=2021/month=1
repairtable_hive_25980:year=2021/month=2
repairtable_hive_25980:year=2021/month=3
+#### A masked pattern was here ####
+Repair: Dropped partition from metastore
hive.default.repairtable_hive_25980:year=2021/month=1
+Repair: Dropped partition from metastore
hive.default.repairtable_hive_25980:year=2021/month=2
+Repair: Dropped partition from metastore
hive.default.repairtable_hive_25980:year=2021/month=3
+PREHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+PREHOOK: type: SHOWPARTITIONS
+PREHOOK: Input: default@repairtable_hive_25980
+POSTHOOK: query: SHOW PARTITIONS repairtable_hive_25980
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Input: default@repairtable_hive_25980
+year=2021/month=4
+year=2022/month=1
+year=2022/month=12
+year=2022/month=2
+year=2022/month=3
+year=2022/month=4
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
index 06783848247..5644cd8e0c8 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
@@ -309,7 +309,33 @@ public class HiveMetaStoreChecker {
return;
}
- Set<Path> partPaths = new HashSet<>();
+ // now check the table folder and see if we find anything
+ // that isn't in the metastore
+ Set<Path> allPartDirs = new HashSet<>();
+ List<FieldSchema> partColumns = table.getPartitionKeys();
+ checkPartitionDirs(tablePath, allPartDirs,
Collections.unmodifiableList(getPartColNames(table)));
+ String tablePathStr = tablePath.toString();
+ int tablePathLength = tablePathStr.length();
+
+ if (filterExp != null) {
+ PartitionExpressionProxy expressionProxy = createExpressionProxy(conf);
+ List<String> partitions = new ArrayList<>();
+ Set<Path> partDirs = new HashSet<Path>();
+ boolean tablePathStrEndsWith = tablePathStr.endsWith("/");
+ int tablePathStrLen = tablePathStr.endsWith("/") ? tablePathStr.length()
: tablePathStr.length() + 1;
+ allPartDirs.stream().forEach(path ->
partitions.add(path.toString().substring(tablePathStrLen)));
+
+ // Remove all partition paths which do not match the filter
expression.
+ expressionProxy.filterPartitionsByExpr(partColumns, filterExp,
+
conf.get(MetastoreConf.ConfVars.DEFAULTPARTITIONNAME.getVarname()), partitions);
+
+ // now the partition list will contain all the paths that match the
filter expression.
+ // add them back to partDirs.
+ for (String path : partitions) {
+ partDirs.add(new Path(tablePath, path));
+ }
+ allPartDirs = partDirs;
+ }
// check that the partition folders exist on disk
for (Partition partition : parts) {
@@ -326,10 +352,24 @@ public class HiveMetaStoreChecker {
CheckResult.PartitionResult prFromMetastore = new
CheckResult.PartitionResult();
prFromMetastore.setPartitionName(getPartitionName(table, partition));
prFromMetastore.setTableName(partition.getTableName());
- if (!fs.exists(partPath)) {
- result.getPartitionsNotOnFs().add(prFromMetastore);
- } else {
+ if (allPartDirs.remove(partPath)) {
result.getCorrectPartitions().add(prFromMetastore);
+ } else {
+ // There can be an edge case where the user defines a partition directory
outside of the table directory.
+ // To avoid evicting such partitions,
+ // we check the existence of partition paths which are not in the table
directory.
+ if (!partPath.toString().contains(tablePathStr)) {
+ if (!fs.exists(partPath)) {
+ result.getPartitionsNotOnFs().add(prFromMetastore);
+ } else {
+ result.getCorrectPartitions().add(prFromMetastore);
+ }
+ } else {
+ // If the partition path contains the table path, we assume it to be a
non-existent partition, since
+ // the partition spec has to be in the format
FS://<TablePath>/<PartKeyName>=<PartValue>;
+ // otherwise partition discovery may fail.
+ result.getPartitionsNotOnFs().add(prFromMetastore);
+ }
}
if (partitionExpirySeconds > 0) {
@@ -348,15 +388,9 @@ public class HiveMetaStoreChecker {
}
}
}
-
- for (int i = 0; i < getPartitionSpec(table, partition).size(); i++) {
- Path qualifiedPath = partPath.makeQualified(fs);
- partPaths.add(qualifiedPath);
- partPath = partPath.getParent();
- }
}
- findUnknownPartitions(table, partPaths, filterExp, result);
+ findUnknownPartitions(table, allPartDirs, result);
if (!isPartitioned(table) && TxnUtils.isTransactionalTable(table)) {
// Check for writeIds in the table directory
@@ -368,22 +402,19 @@ public class HiveMetaStoreChecker {
}
/**
- * Find partitions on the fs that are unknown to the metastore.
+ * Add partitions on the fs that are unknown to the metastore.
*
* @param table
* Table where the partitions would be located
- * @param partPaths
- * Paths of the partitions the ms knows about
- * @param filterExp
- * Filter expression which is used to prune th partition from the
- * metastore and FileSystem.
+ * @param missingPartDirs
+ * Paths of the partitions that ms does not know about
* @param result
* Result object
* @throws IOException
* Thrown if we fail at fetching listings from the fs.
* @throws MetastoreException ex
*/
- void findUnknownPartitions(Table table, Set<Path> partPaths, byte[]
filterExp,
+ void findUnknownPartitions(Table table, Set<Path> missingPartDirs,
CheckResult result) throws IOException, MetastoreException,
MetaException {
Path tablePath = getPath(table);
@@ -391,42 +422,6 @@ public class HiveMetaStoreChecker {
return;
}
boolean transactionalTable = TxnUtils.isTransactionalTable(table);
- // now check the table folder and see if we find anything
- // that isn't in the metastore
- Set<Path> allPartDirs = new HashSet<>();
- List<FieldSchema> partColumns = table.getPartitionKeys();
- checkPartitionDirs(tablePath, allPartDirs,
Collections.unmodifiableList(getPartColNames(table)));
-
- if (filterExp != null) {
- PartitionExpressionProxy expressionProxy = createExpressionProxy(conf);
- List<String> paritions = new ArrayList<>();
- Set<Path> partDirs = new HashSet<Path>();
- String tablePathStr = tablePath.toString();
- for (Path path : allPartDirs) {
- // remove the table's path from the partition path
- // eg: <tablePath>/p1=1/p2=2/p3=3 ---> p1=1/p2=2/p3=3
- if (tablePathStr.endsWith("/")) {
- paritions.add(path.toString().substring(tablePathStr.length()));
- } else {
- paritions.add(path.toString().substring(tablePathStr.length() + 1));
- }
- }
- // Remove all partition paths which does not matches the filter
expression.
- expressionProxy.filterPartitionsByExpr(partColumns, filterExp,
- conf.get(MetastoreConf.ConfVars.DEFAULTPARTITIONNAME.getVarname()),
paritions);
-
- // now the partition list will contain all the paths that matches the
filter expression.
- // add them back to partDirs.
- for (String path : paritions) {
- partDirs.add(new Path(tablePath, path));
- }
- allPartDirs = partDirs;
- }
- // don't want the table dir
- allPartDirs.remove(tablePath);
-
- // remove the partition paths we know about
- allPartDirs.removeAll(partPaths);
Set<String> partColNames = Sets.newHashSet();
for(FieldSchema fSchema : getPartCols(table)) {
@@ -435,7 +430,7 @@ public class HiveMetaStoreChecker {
Map<String, String> partitionColToTypeMap =
getPartitionColtoTypeMap(table.getPartitionKeys());
// we should now only have the unexpected folders left
- for (Path partPath : allPartDirs) {
+ for (Path partPath : missingPartDirs) {
FileSystem fs = partPath.getFileSystem(conf);
String partitionName = getPartitionName(fs.makeQualified(tablePath),
partPath, partColNames, partitionColToTypeMap);