karuppayya commented on code in PR #5495:
URL: https://github.com/apache/iceberg/pull/5495#discussion_r945038993


##########
spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java:
##########
@@ -426,65 +405,159 @@ private static Map<String, String> 
flattenMap(Map<String, String> map) {
     if (map != null) {
       for (String key : map.keySet()) {
         String value = map.get(key);
-        for (String splitKey : COMMA.split(key)) {
+        for (String splitKey : COMMA_SPLITTER.split(key)) {
           flattenedMap.put(splitKey.trim(), value.trim());
         }
       }
     }
     return flattenedMap;
   }
 
-  private static MapPartitionsFunction<Tuple2<FileMetadata, FileMetadata>, 
String> findOrphanFiles(
-      PrefixMismatchMode mode, SetAccumulator<Pair<String, String>> conflicts) 
{
-    return rows -> {
-      Iterator<String> transformed =
-          Iterators.transform(
-              rows,
-              row -> {
-                FileMetadata actual = row._1;
-                FileMetadata valid = row._2;
-
-                if (valid == null) {
-                  return actual.location;
-                }
-
-                boolean schemeMatch =
-                    Strings.isNullOrEmpty(valid.scheme)
-                        || valid.scheme.equalsIgnoreCase(actual.scheme);
-                boolean authorityMatch =
-                    Strings.isNullOrEmpty(valid.authority)
-                        || valid.authority.equalsIgnoreCase(actual.authority);
-
-                if ((!schemeMatch || !authorityMatch) && mode == 
PrefixMismatchMode.DELETE) {
-                  return actual.location;
-                } else {
-                  if (!schemeMatch) {
-                    conflicts.add(Pair.of(valid.scheme, actual.scheme));
-                  }
-                  if (!authorityMatch) {
-                    conflicts.add(Pair.of(valid.authority, actual.authority));
-                  }
-                }
-
-                return null;
-              });
-      return Iterators.filter(transformed, Objects::nonNull);
-    };
+  private static class ListDirsRecursively implements 
FlatMapFunction<Iterator<String>, String> {
+
+    private final Broadcast<SerializableConfiguration> hadoopConf;
+    private final long olderThanTimestamp;
+    private final PathFilter pathFilter;
+
+    ListDirsRecursively(
+        Broadcast<SerializableConfiguration> hadoopConf,
+        long olderThanTimestamp,
+        PathFilter pathFilter) {
+
+      this.hadoopConf = hadoopConf;
+      this.olderThanTimestamp = olderThanTimestamp;
+      this.pathFilter = pathFilter;
+    }
+
+    @Override
+    public Iterator<String> call(Iterator<String> dirs) throws Exception {
+      List<String> subDirs = Lists.newArrayList();
+      List<String> files = Lists.newArrayList();
+
+      Predicate<FileStatus> predicate = file -> file.getModificationTime() < 
olderThanTimestamp;
+
+      dirs.forEachRemaining(
+          dir -> {
+            listDirRecursively(
+                dir,
+                predicate,
+                hadoopConf.value().value(),
+                MAX_EXECUTOR_LISTING_DEPTH,
+                MAX_EXECUTOR_LISTING_DIRECT_SUB_DIRS,
+                subDirs,
+                pathFilter,
+                files);
+          });
+
+      if (!subDirs.isEmpty()) {
+        throw new RuntimeException(
+            "Could not list sub directories, reached maximum depth: " + 
MAX_EXECUTOR_LISTING_DEPTH);
+      }
+
+      return files.iterator();
+    }
   }
 
-  private static MapPartitionsFunction<Row, FileMetadata> toFileMetadata(
-      Map<String, String> equalSchemesMap, Map<String, String> 
equalAuthoritiesMap) {
-    return rows ->
-        Iterators.transform(
-            rows,
-            row -> {
-              String location = row.getString(0);
-              URI uri = new Path(location).toUri();
-              String scheme = equalSchemesMap.getOrDefault(uri.getScheme(), 
uri.getScheme());
-              String authority =
-                  equalAuthoritiesMap.getOrDefault(uri.getAuthority(), 
uri.getAuthority());
-              return new FileMetadata(scheme, authority, uri.getPath(), 
location);
-            });
+  private static class FindOrphanFiles

Review Comment:
   ![Screen Shot 2022-08-12 at 8 06 39 
PM](https://user-images.githubusercontent.com/5082742/184466125-46438171-0f8d-44cc-b913-002d8526cb82.png)
   nit: Given the total number of classes, may be better to have a separate 
package and have these package private.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to