This is an automated email from the ASF dual-hosted git repository.

danny0405 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new ddfa681b6ea [HUDI-7758] Only consider files in Hudi partitions when initializing MDT (#11219)
ddfa681b6ea is described below

commit ddfa681b6ea1840c4f58f97cf47bdbcbc0df79b4
Author: Tim Brown <[email protected]>
AuthorDate: Thu Jul 4 03:17:11 2024 -0500

    [HUDI-7758] Only consider files in Hudi partitions when initializing MDT (#11219)
---
 .../hudi/client/functional/TestHoodieBackedMetadata.java       |  2 ++
 .../src/main/java/org/apache/hudi/common/fs/FSUtils.java       |  6 +++++-
 .../java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java | 10 +++++-----
 .../java/org/apache/hudi/common/testutils/FileCreateUtils.java |  3 ++-
 .../org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java  |  6 +++++-
 5 files changed, 19 insertions(+), 8 deletions(-)

diff --git 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java
 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java
index b73ed53e62f..fd940737880 100644
--- 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java
+++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java
@@ -461,6 +461,8 @@ public class TestHoodieBackedMetadata extends 
TestHoodieMetadataBase {
     // Create an empty directory which is not a partition directory (lacks partition metadata)
     final String nonPartitionDirectory = 
HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-nonpartition";
     Files.createDirectories(Paths.get(basePath, nonPartitionDirectory));
+    // Write random file to assert it is not added to the view
+    Files.createFile(Paths.get(basePath, nonPartitionDirectory, 
"randomFile.parquet"));
 
     // Three directories which are partitions but will be ignored due to filter
     final String filterDirRegex = ".*-filterDir\\d|\\..*";
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java 
b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
index 0b0828903ba..4fb8149ed56 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
@@ -80,6 +80,7 @@ public class FSUtils {
   public static final Pattern LOG_FILE_PATTERN =
       
Pattern.compile("^\\.(.+)_(.*)\\.(log|archive)\\.(\\d+)(_((\\d+)-(\\d+)-(\\d+))(.cdc)?)?");
   public static final Pattern PREFIX_BY_FILE_ID_PATTERN = 
Pattern.compile("^(.+)-(\\d+)");
+  private static final Pattern BASE_FILE_PATTERN = 
Pattern.compile("[a-zA-Z0-9-]+_[a-zA-Z0-9-]+_[0-9]+\\.[a-zA-Z0-9]+");
 
   private static final String LOG_FILE_EXTENSION = ".log";
 
@@ -428,7 +429,10 @@ public class FSUtils {
 
   public static boolean isBaseFile(StoragePath path) {
     String extension = getFileExtension(path.getName());
-    return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(extension);
+    if (HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(extension)) {
+      return BASE_FILE_PATTERN.matcher(path.getName()).matches();
+    }
+    return false;
   }
 
   public static boolean isLogFile(StoragePath logPath) {
diff --git 
a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
 
b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
index cc73a3bd953..624525da71a 100644
--- 
a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
+++ 
b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
@@ -2149,16 +2149,16 @@ public class HoodieTableMetadataUtil {
       // Pre-allocate with the maximum length possible
       filenameToSizeMap = new HashMap<>(pathInfos.size());
 
+      // Presence of partition meta file implies this is a HUDI partition
+      isHoodiePartition = pathInfos.stream().anyMatch(status -> 
status.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX));
       for (StoragePathInfo pathInfo : pathInfos) {
-        if (pathInfo.isDirectory()) {
+        // Do not attempt to search for more subdirectories inside directories that are partitions
+        if (!isHoodiePartition && pathInfo.isDirectory()) {
           // Ignore .hoodie directory as there cannot be any partitions inside it
           if 
(!pathInfo.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME)) {
             this.subDirectories.add(pathInfo.getPath());
           }
-        } else if 
(pathInfo.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX))
 {
-          // Presence of partition meta file implies this is a HUDI partition
-          this.isHoodiePartition = true;
-        } else if (FSUtils.isDataFile(pathInfo.getPath())) {
+        } else if (isHoodiePartition && 
FSUtils.isDataFile(pathInfo.getPath())) {
           // Regular HUDI data file (base file or log file)
           String dataFileCommitTime = 
FSUtils.getCommitTime(pathInfo.getPath().getName());
           // Limit the file listings to files which were created by successful commits before the maxInstant time.
diff --git 
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
 
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
index bf4d69ac249..3ad3ce54265 100644
--- 
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
+++ 
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
@@ -381,7 +381,7 @@ public class FileCreateUtils {
     createMetaFile(basePath, instantTime, 
HoodieTimeline.INFLIGHT_SAVEPOINT_EXTENSION);
   }
 
-  public static void createPartitionMetaFile(String basePath, String 
partitionPath) throws IOException {
+  public static URI createPartitionMetaFile(String basePath, String 
partitionPath) throws IOException {
     Path metaFilePath;
     try {
       Path parentPath = Paths.get(new URI(basePath).getPath(), partitionPath);
@@ -390,6 +390,7 @@ public class FileCreateUtils {
       if (Files.notExists(metaFilePath)) {
         Files.createFile(metaFilePath);
       }
+      return metaFilePath.toUri();
     } catch (URISyntaxException e) {
       throw new HoodieException("Error creating partition meta file", e);
     }
diff --git 
a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
 
b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
index ac0e6a84cc8..83b6abe12e5 100644
--- 
a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
+++ 
b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestHoodieTableMetadataUtil.java
@@ -27,6 +27,7 @@ import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.testutils.FileCreateUtils;
 import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
 import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
 import org.apache.hudi.common.testutils.HoodieTestTable;
@@ -40,6 +41,7 @@ import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
+import java.net.URI;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -98,6 +100,8 @@ public class TestHoodieTableMetadataUtil extends 
HoodieCommonTestHarness {
     // Generate 10 inserts for each partition and populate partitionBaseFilePairs and recordKeys.
     DATE_PARTITIONS.forEach(p -> {
       try {
+        URI partitionMetaFile = 
FileCreateUtils.createPartitionMetaFile(basePath, p);
+        StoragePath partitionMetadataPath = new StoragePath(partitionMetaFile);
         String fileId1 = UUID.randomUUID().toString();
         FileSlice fileSlice1 = new FileSlice(p, instant1, fileId1);
         StoragePath storagePath1 = new 
StoragePath(hoodieTestTable.getBaseFilePath(p, fileId1).toUri());
@@ -122,7 +126,7 @@ public class TestHoodieTableMetadataUtil extends 
HoodieCommonTestHarness {
         fileSlice2.setBaseFile(baseFile2);
         partitionInfoList.add(new HoodieTableMetadataUtil.DirectoryInfo(
             p,
-            
metaClient.getStorage().listDirectEntries(Arrays.asList(storagePath1, 
storagePath2)),
+            
metaClient.getStorage().listDirectEntries(Arrays.asList(partitionMetadataPath, 
storagePath1, storagePath2)),
             instant2,
             Collections.emptySet()));
       } catch (Exception e) {

Reply via email to