This is an automated email from the ASF dual-hosted git repository.
danny0405 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new ddfa681b6ea [HUDI-7758] Only consider files in Hudi partitions when
initializing MDT (#11219)
ddfa681b6ea is described below
commit ddfa681b6ea1840c4f58f97cf47bdbcbc0df79b4
Author: Tim Brown <[email protected]>
AuthorDate: Thu Jul 4 03:17:11 2024 -0500
[HUDI-7758] Only consider files in Hudi partitions when initializing MDT
(#11219)
---
.../hudi/client/functional/TestHoodieBackedMetadata.java | 2 ++
.../src/main/java/org/apache/hudi/common/fs/FSUtils.java | 6 +++++-
.../java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java | 10 +++++-----
.../java/org/apache/hudi/common/testutils/FileCreateUtils.java | 3 ++-
.../org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java | 6 +++++-
5 files changed, 19 insertions(+), 8 deletions(-)
diff --git
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java
index b73ed53e62f..fd940737880 100644
---
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java
+++
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java
@@ -461,6 +461,8 @@ public class TestHoodieBackedMetadata extends
TestHoodieMetadataBase {
// Create an empty directory which is not a partition directory (lacks
partition metadata)
final String nonPartitionDirectory =
HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-nonpartition";
Files.createDirectories(Paths.get(basePath, nonPartitionDirectory));
+ // Write random file to assert it is not added to the view
+ Files.createFile(Paths.get(basePath, nonPartitionDirectory,
"randomFile.parquet"));
// Three directories which are partitions but will be ignored due to filter
final String filterDirRegex = ".*-filterDir\\d|\\..*";
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
index 0b0828903ba..4fb8149ed56 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
@@ -80,6 +80,7 @@ public class FSUtils {
public static final Pattern LOG_FILE_PATTERN =
Pattern.compile("^\\.(.+)_(.*)\\.(log|archive)\\.(\\d+)(_((\\d+)-(\\d+)-(\\d+))(.cdc)?)?");
public static final Pattern PREFIX_BY_FILE_ID_PATTERN =
Pattern.compile("^(.+)-(\\d+)");
+ private static final Pattern BASE_FILE_PATTERN =
Pattern.compile("[a-zA-Z0-9-]+_[a-zA-Z0-9-]+_[0-9]+\\.[a-zA-Z0-9]+");
private static final String LOG_FILE_EXTENSION = ".log";
@@ -428,7 +429,10 @@ public class FSUtils {
public static boolean isBaseFile(StoragePath path) {
String extension = getFileExtension(path.getName());
- return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(extension);
+ if (HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(extension)) {
+ return BASE_FILE_PATTERN.matcher(path.getName()).matches();
+ }
+ return false;
}
public static boolean isLogFile(StoragePath logPath) {
diff --git
a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
index cc73a3bd953..624525da71a 100644
---
a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
+++
b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
@@ -2149,16 +2149,16 @@ public class HoodieTableMetadataUtil {
// Pre-allocate with the maximum length possible
filenameToSizeMap = new HashMap<>(pathInfos.size());
+ // Presence of partition meta file implies this is a HUDI partition
+ isHoodiePartition = pathInfos.stream().anyMatch(status ->
status.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX));
for (StoragePathInfo pathInfo : pathInfos) {
- if (pathInfo.isDirectory()) {
+ // Do not attempt to search for more subdirectories inside directories
that are partitions
+ if (!isHoodiePartition && pathInfo.isDirectory()) {
// Ignore .hoodie directory as there cannot be any partitions inside
it
if
(!pathInfo.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME)) {
this.subDirectories.add(pathInfo.getPath());
}
- } else if
(pathInfo.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX))
{
- // Presence of partition meta file implies this is a HUDI partition
- this.isHoodiePartition = true;
- } else if (FSUtils.isDataFile(pathInfo.getPath())) {
+ } else if (isHoodiePartition &&
FSUtils.isDataFile(pathInfo.getPath())) {
// Regular HUDI data file (base file or log file)
String dataFileCommitTime =
FSUtils.getCommitTime(pathInfo.getPath().getName());
// Limit the file listings to files which were created by successful
commits before the maxInstant time.
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
index bf4d69ac249..3ad3ce54265 100644
---
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
+++
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
@@ -381,7 +381,7 @@ public class FileCreateUtils {
createMetaFile(basePath, instantTime,
HoodieTimeline.INFLIGHT_SAVEPOINT_EXTENSION);
}
- public static void createPartitionMetaFile(String basePath, String
partitionPath) throws IOException {
+ public static URI createPartitionMetaFile(String basePath, String
partitionPath) throws IOException {
Path metaFilePath;
try {
Path parentPath = Paths.get(new URI(basePath).getPath(), partitionPath);
@@ -390,6 +390,7 @@ public class FileCreateUtils {
if (Files.notExists(metaFilePath)) {
Files.createFile(metaFilePath);
}
+ return metaFilePath.toUri();
} catch (URISyntaxException e) {
throw new HoodieException("Error creating partition meta file", e);
}
diff --git
a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
index ac0e6a84cc8..83b6abe12e5 100644
---
a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
+++
b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
@@ -27,6 +27,7 @@ import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.testutils.FileCreateUtils;
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestTable;
@@ -40,6 +41,7 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
+import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -98,6 +100,8 @@ public class TestHoodieTableMetadataUtil extends
HoodieCommonTestHarness {
// Generate 10 inserts for each partition and populate
partitionBaseFilePairs and recordKeys.
DATE_PARTITIONS.forEach(p -> {
try {
+ URI partitionMetaFile =
FileCreateUtils.createPartitionMetaFile(basePath, p);
+ StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile);
String fileId1 = UUID.randomUUID().toString();
FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1);
StoragePath storagePath1 = new
StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri());
@@ -122,7 +126,7 @@ public class TestHoodieTableMetadataUtil extends
HoodieCommonTestHarness {
fileSlice2.setBaseFile(baseFile2);
partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo(
p,
-
metaClient.getStorage().listDirectEntries(Arrays.asList(storagePath1,
storagePath2)),
+
metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath,
storagePath1, storagePath2)),
instant2,
Collections.emptySet()));
} catch (Exception e) {