danny0405 commented on code in PR #13007:
URL: https://github.com/apache/hudi/pull/13007#discussion_r2013411057


##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/DirectWriteMarkers.java:
##########
@@ -115,15 +108,64 @@ public Set<String> 
createdAndMergedDataPaths(HoodieEngineContext context, int pa
         HoodieStorage storage = HoodieStorageUtils.getStorage(path, 
storageConf);
         return storage.listFiles(path).stream()
             .map(pathInfo -> pathInfo.getPath().toString())
-            .filter(pathStr -> 
pathStr.contains(HoodieTableMetaClient.MARKER_EXTN)
-                && !pathStr.endsWith(IOType.APPEND.name()))
+            .filter(pathStr -> NOT_APPEND_MARKER_PREDICATE.test(pathStr))
             .map(this::translateMarkerToDataPath);
       }, parallelism));
     }
 
     return dataFiles;
   }
 
+  public Set<String> getAppendedLogPaths(HoodieEngineContext context, int 
parallelism) throws IOException {
+    Set<String> logFiles = new HashSet<>();
+    List<String> subDirectories = 
getSubDirectoriesByMarkerCondition(storage.listDirectEntries(markerDirPath), 
logFiles, APPEND_MARKER_PREDICATE);
+
+    if (subDirectories.size() > 0) {
+      parallelism = Math.min(subDirectories.size(), parallelism);
+      StorageConfiguration<?> storageConf = storage.getConf();
+      context.setJobStatus(this.getClass().getSimpleName(), "Obtaining marker 
files for all created, merged paths");
+      logFiles.addAll(context.flatMap(subDirectories, directory -> {
+        Queue<StoragePath> candidatesDirs = new LinkedList<>();
+        candidatesDirs.add(new StoragePath(directory));
+        List<String> result = new ArrayList<>();
+        while (!candidatesDirs.isEmpty()) {
+          StoragePath path = candidatesDirs.remove();
+          HoodieStorage storage = HoodieStorageUtils.getStorage(path, 
storageConf);
+          List<StoragePathInfo> storagePathInfos = 
storage.listDirectEntries(path);
+          for (StoragePathInfo pathInfo : storagePathInfos) {
+            if (pathInfo.isDirectory()) {
+              candidatesDirs.add(pathInfo.getPath());
+            } else {
+              String pathStr = pathInfo.getPath().toString();
+              if (APPEND_MARKER_PREDICATE.test(pathStr)) {
+                result.add(translateMarkerToDataPath(pathStr));
+              }
+            }
+          }
+        }
+        return result.stream();
+      }, parallelism));
+    }
+
+    return logFiles;
+  }
+
+  private List<String> 
getSubDirectoriesByMarkerCondition(List<StoragePathInfo> topLevelInfoList, 
Set<String> dataFiles, Predicate<String> pathCondition) {
+    List<String> subDirectories = new ArrayList<>();
+    for (StoragePathInfo topLevelInfo: topLevelInfoList) {
+      if (topLevelInfo.isFile()) {
+        String pathStr = topLevelInfo.getPath().toString();
+        if (pathCondition.test(pathStr)) {
+          dataFiles.add(translateMarkerToDataPath(pathStr));

Review Comment:
   Can we avoid modifying the passed-in collections?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to