This is an automated email from the ASF dual-hosted git repository.
amoghj pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
new 3aa0fcd7cd Core: Remove statistics files in CatalogUtil:dropTableData (#9305)
3aa0fcd7cd is described below
commit 3aa0fcd7cd3e0d1836395ca02ba473da71154fe4
Author: Hongyue/Steve Zhang <[email protected]>
AuthorDate: Wed Jan 3 18:12:08 2024 -0800
Core: Remove statistics files in CatalogUtil:dropTableData (#9305)
Co-authored-by: Steve Zhang <[email protected]>
---
.../main/java/org/apache/iceberg/CatalogUtil.java | 5 ++
.../iceberg/hadoop/TestCatalogUtilDropTable.java | 62 ++++++++++++++++++++--
2 files changed, 63 insertions(+), 4 deletions(-)
diff --git a/core/src/main/java/org/apache/iceberg/CatalogUtil.java b/core/src/main/java/org/apache/iceberg/CatalogUtil.java
index 72b4813a1f..2d16b19fc9 100644
--- a/core/src/main/java/org/apache/iceberg/CatalogUtil.java
+++ b/core/src/main/java/org/apache/iceberg/CatalogUtil.java
@@ -117,6 +117,11 @@ public class CatalogUtil {
Iterables.transform(metadata.previousFiles(), TableMetadata.MetadataLogEntry::file),
"previous metadata",
true);
+ deleteFiles(
+ io,
+ Iterables.transform(metadata.statisticsFiles(), StatisticsFile::path),
+ "statistics",
+ true);
deleteFile(io, metadata.metadataFileLocation(), "metadata");
}
diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java b/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java
index 478ac3a8c2..b8511bef52 100644
--- a/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java
+++ b/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java
@@ -18,16 +18,27 @@
*/
package org.apache.iceberg.hadoop;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;
+import java.util.UUID;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.iceberg.CatalogUtil;
+import org.apache.iceberg.GenericBlobMetadata;
+import org.apache.iceberg.GenericStatisticsFile;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.StatisticsFile;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.puffin.Blob;
+import org.apache.iceberg.puffin.Puffin;
+import org.apache.iceberg.puffin.PuffinWriter;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.Test;
@@ -38,19 +49,28 @@ import org.mockito.Mockito;
public class TestCatalogUtilDropTable extends HadoopTableTestBase {
@Test
- public void dropTableDataDeletesExpectedFiles() {
+ public void dropTableDataDeletesExpectedFiles() throws IOException {
table.newFastAppend().appendFile(FILE_A).commit();
table.newAppend().appendFile(FILE_B).commit();
+ StatisticsFile statisticsFile =
+ writeStatsFile(
+ table.currentSnapshot().snapshotId(),
+ table.currentSnapshot().sequenceNumber(),
+ tableLocation + "/metadata/" + UUID.randomUUID() + ".stats",
+ table.io());
+ table.updateStatistics().setStatistics(statisticsFile.snapshotId(), statisticsFile).commit();
- TableMetadata tableMetadata = readMetadataVersion(3);
+ TableMetadata tableMetadata = readMetadataVersion(4);
Set<Snapshot> snapshotSet = Sets.newHashSet(table.snapshots());
Set<String> manifestListLocations = manifestListLocations(snapshotSet);
Set<String> manifestLocations = manifestLocations(snapshotSet, table.io());
Set<String> dataLocations = dataLocations(snapshotSet, table.io());
Set<String> metadataLocations = metadataLocations(tableMetadata);
+ Set<String> statsLocations = statsLocations(tableMetadata);
Assertions.assertThat(manifestListLocations).as("should have 2 manifest lists").hasSize(2);
- Assertions.assertThat(metadataLocations).as("should have 3 metadata locations").hasSize(3);
+ Assertions.assertThat(metadataLocations).as("should have 4 metadata locations").hasSize(4);
+ Assertions.assertThat(statsLocations).as("should have 1 stats file").hasSize(1);
FileIO fileIO = Mockito.mock(FileIO.class);
Mockito.when(fileIO.newInputFile(Mockito.anyString()))
@@ -69,7 +89,8 @@ public class TestCatalogUtilDropTable extends HadoopTableTestBase {
manifestListLocations.size()
+ manifestLocations.size()
+ dataLocations.size()
- + metadataLocations.size()))
+ + metadataLocations.size()
+ + statsLocations.size()))
.deleteFile(argumentCaptor.capture());
List<String> deletedPaths = argumentCaptor.getAllValues();
@@ -85,6 +106,9 @@ public class TestCatalogUtilDropTable extends HadoopTableTestBase {
Assertions.assertThat(deletedPaths)
.as("should contain all created metadata locations")
.containsAll(metadataLocations);
+ Assertions.assertThat(deletedPaths)
+ .as("should contain all created statistic")
+ .containsAll(statsLocations);
}
@Test
@@ -181,4 +205,34 @@ public class TestCatalogUtilDropTable extends HadoopTableTestBase {
metadataLocations.add(tableMetadata.metadataFileLocation());
return metadataLocations;
}
+
+ private Set<String> statsLocations(TableMetadata tableMetadata) {
+ return tableMetadata.statisticsFiles().stream()
+ .map(StatisticsFile::path)
+ .collect(Collectors.toSet());
+ }
+
+ private StatisticsFile writeStatsFile(
+ long snapshotId, long snapshotSequenceNumber, String statsLocation, FileIO fileIO)
+ throws IOException {
+ try (PuffinWriter puffinWriter = Puffin.write(fileIO.newOutputFile(statsLocation)).build()) {
+ puffinWriter.add(
+ new Blob(
+ "some-blob-type",
+ ImmutableList.of(1),
+ snapshotId,
+ snapshotSequenceNumber,
+ ByteBuffer.wrap("blob content".getBytes(StandardCharsets.UTF_8))));
+ puffinWriter.finish();
+
+ return new GenericStatisticsFile(
+ snapshotId,
+ statsLocation,
+ puffinWriter.fileSize(),
+ puffinWriter.footerSize(),
+ puffinWriter.writtenBlobsMetadata().stream()
+ .map(GenericBlobMetadata::from)
+ .collect(ImmutableList.toImmutableList()));
+ }
+ }
}