nastra commented on code in PR #13245:
URL: https://github.com/apache/iceberg/pull/13245#discussion_r2179246847
##########
spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java:
##########
@@ -538,6 +542,75 @@ public void testBinPackWithDeletes() throws IOException {
assertThat(actualRecords).as("7 rows are removed").hasSize(total - 7);
}
+ @TestTemplate
+ public void removeOrphanedDVsFromDeleteManifest() throws Exception {
+ assumeThat(formatVersion).isGreaterThanOrEqualTo(3);
+ Table table = createTable();
+ int numDataFiles = 5;
+ // 100 / 5 = 20 records per data file
+ writeRecords(numDataFiles, 100);
+ int numPositionsToDelete = 10;
+ table.refresh();
+ List<DataFile> dataFiles = TestHelpers.dataFiles(table);
+ assertThat(dataFiles).hasSize(numDataFiles);
+
+ RowDelta rowDelta = table.newRowDelta();
+ for (DataFile dataFile : dataFiles) {
+ writeDV(table, dataFile.partition(), dataFile.location(),
numPositionsToDelete)
+ .forEach(rowDelta::addDeletes);
+ }
+
+ rowDelta.commit();
+
+ Set<DeleteFile> deleteFiles = TestHelpers.deleteFiles(table);
+ assertThat(deleteFiles).hasSize(numDataFiles);
+
+ for (ManifestFile manifestFile :
table.currentSnapshot().deleteManifests(table.io())) {
+ Set<String> validDataFilePaths =
+ TestHelpers.dataFiles(table).stream()
+ .map(ContentFile::location)
+ .collect(Collectors.toSet());
+ ManifestReader<DeleteFile> reader =
+ ManifestFiles.readDeleteManifest(
+ manifestFile, table.io(), ((BaseTable)
table).operations().current().specsById());
+ for (DeleteFile deleteFile : reader) {
+ // make sure there are no orphaned DVs
+
assertThat(validDataFilePaths).contains(deleteFile.referencedDataFile());
+ }
+ }
+
+ // all data files should be rewritten. Set MIN_INPUT_FILES > to the number
of data files so that
+ // compaction is only triggered when the delete ratio of >= 30% is hit
+ RewriteDataFiles.Result result =
+ SparkActions.get(spark)
+ .rewriteDataFiles(table)
+ .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "10")
+ .option(SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "0")
+ .execute();
+
+ assertThat(result.rewrittenDataFilesCount()).isEqualTo(numDataFiles);
+ assertThat(result.removedDeleteFilesCount()).isEqualTo(numDataFiles);
+
+ table.refresh();
+ assertThat(TestHelpers.dataFiles(table)).hasSize(1);
+ assertThat(TestHelpers.deleteFiles(table)).isEmpty();
+
+ Set<String> validDataFilePaths =
Review Comment:
yes this is needed. Even if we assert that `deleteFiles` is empty, there
could still be orphaned DVs in the delete manifests, which in fact was the case
without propagating orphaned DVs
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]