karuppayya commented on code in PR #4652:
URL: https://github.com/apache/iceberg/pull/4652#discussion_r927881922
##########
spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java:
##########
@@ -326,6 +346,96 @@ private static FlatMapFunction<Iterator<String>, String> listDirsRecursively(
};
}
+ @VisibleForTesting
+ static List<String> findOrphanFiles(
+ SparkSession spark, Dataset<Row> actualFileDF, Dataset<Row> validFileDF,
+ Map<String, String> equalSchemes, Map<String, String> equalAuthorities,
+ PrefixMismatchMode prefixMismatchMode) {
+ Dataset<FileMetadata> actualFileMetadataDS = actualFileDF.mapPartitions(
+ toFileMetadata(equalSchemes, equalAuthorities),
+ Encoders.bean(FileMetadata.class));
+ Dataset<FileMetadata> validFileMetadataDS = validFileDF.mapPartitions(
+ toFileMetadata(equalSchemes, equalAuthorities),
+ Encoders.bean(FileMetadata.class));
+
+ SetAccumulator<Pair<String, String>> conflicts = new SetAccumulator<>();
+ spark.sparkContext().register(conflicts);
+
+ Column joinCond = actualFileMetadataDS.col("path").equalTo(validFileMetadataDS.col("path"));
+
+ List<String> orphanFiles = actualFileMetadataDS.joinWith(validFileMetadataDS,
+ joinCond, "leftouter")
Review Comment:
done
##########
spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java:
##########
@@ -326,6 +346,96 @@ private static FlatMapFunction<Iterator<String>, String> listDirsRecursively(
};
}
+ @VisibleForTesting
+ static List<String> findOrphanFiles(
+ SparkSession spark, Dataset<Row> actualFileDF, Dataset<Row> validFileDF,
+ Map<String, String> equalSchemes, Map<String, String> equalAuthorities,
+ PrefixMismatchMode prefixMismatchMode) {
+ Dataset<FileMetadata> actualFileMetadataDS = actualFileDF.mapPartitions(
+ toFileMetadata(equalSchemes, equalAuthorities),
+ Encoders.bean(FileMetadata.class));
+ Dataset<FileMetadata> validFileMetadataDS = validFileDF.mapPartitions(
+ toFileMetadata(equalSchemes, equalAuthorities),
+ Encoders.bean(FileMetadata.class));
+
+ SetAccumulator<Pair<String, String>> conflicts = new SetAccumulator<>();
+ spark.sparkContext().register(conflicts);
+
+ Column joinCond = actualFileMetadataDS.col("path").equalTo(validFileMetadataDS.col("path"));
+
+ List<String> orphanFiles = actualFileMetadataDS.joinWith(validFileMetadataDS,
+ joinCond, "leftouter")
+ .mapPartitions(findOrphanFiles(prefixMismatchMode, conflicts), Encoders.STRING())
+ .collectAsList();
+
+ if (prefixMismatchMode == PrefixMismatchMode.ERROR && !conflicts.value().isEmpty()) {
+ throw new ValidationException("Unable to determine whether certain files are orphan. " +
+ "Metadata references files that match listed/provided files except for authority/scheme. " +
+ "Please, inspect the conflicting authorities/schemes and provide which of them are equal " +
+ "by further configuring the action via equalSchemes() and equalAuthorities() methods. " +
+ "Set the prefix mismatch mode to 'NONE' to ignore remaining locations with conflicting " +
+ "authorities/schemes or to 'DELETE' iff you are ABSOLUTELY confident that remaining conflicting " +
+ "authorities/schemes are different. It will be impossible to recover deleted files. " +
+ "Conflicting authorities/schemes: %s.", conflicts.value());
+ }
+ return orphanFiles;
+ }
+
+ private static Map<String, String> flattenMap(Map<String, String> map) {
+ Map<String, String> flattenedMap = Maps.newHashMap();
+ if (map != null) {
+ for (String key : map.keySet()) {
+ String value = map.get(key);
+ for (String splitKey : COMMA.split(key)) {
+ flattenedMap.put(splitKey.trim(), value.trim());
+ }
+ }
+ }
+ return flattenedMap;
+ }
+
+ private static MapPartitionsFunction<Tuple2<FileMetadata, FileMetadata>, String> findOrphanFiles(
+ PrefixMismatchMode mode,
+ SetAccumulator<Pair<String, String>> conflicts) {
+ return rows -> {
+ Iterator<String> transformed = Iterators.transform(rows, row -> {
+ FileMetadata actual = row._1;
+ FileMetadata valid = row._2;
+ if (valid == null) {
Review Comment:
done
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]