This is an automated email from the ASF dual-hosted git repository.

amoghj pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/main by this push:
     new 3aa0fcd7cd Core: Remove statistics files in CatalogUtil:dropTableData (#9305)
3aa0fcd7cd is described below

commit 3aa0fcd7cd3e0d1836395ca02ba473da71154fe4
Author: Hongyue/Steve Zhang <[email protected]>
AuthorDate: Wed Jan 3 18:12:08 2024 -0800

    Core: Remove statistics files in CatalogUtil:dropTableData (#9305)
    
    Co-authored-by: Steve Zhang <[email protected]>
---
 .../main/java/org/apache/iceberg/CatalogUtil.java  |  5 ++
 .../iceberg/hadoop/TestCatalogUtilDropTable.java   | 62 ++++++++++++++++++++--
 2 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/CatalogUtil.java b/core/src/main/java/org/apache/iceberg/CatalogUtil.java
index 72b4813a1f..2d16b19fc9 100644
--- a/core/src/main/java/org/apache/iceberg/CatalogUtil.java
+++ b/core/src/main/java/org/apache/iceberg/CatalogUtil.java
@@ -117,6 +117,11 @@ public class CatalogUtil {
         Iterables.transform(metadata.previousFiles(), TableMetadata.MetadataLogEntry::file),
         "previous metadata",
         true);
+    deleteFiles(
+        io,
+        Iterables.transform(metadata.statisticsFiles(), StatisticsFile::path),
+        "statistics",
+        true);
     deleteFile(io, metadata.metadataFileLocation(), "metadata");
   }
 
diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java b/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java
index 478ac3a8c2..b8511bef52 100644
--- a/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java
+++ b/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java
@@ -18,16 +18,27 @@
  */
 package org.apache.iceberg.hadoop;
 
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.Set;
+import java.util.UUID;
 import java.util.stream.Collectors;
 import java.util.stream.StreamSupport;
 import org.apache.iceberg.CatalogUtil;
+import org.apache.iceberg.GenericBlobMetadata;
+import org.apache.iceberg.GenericStatisticsFile;
 import org.apache.iceberg.ManifestFile;
 import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.StatisticsFile;
 import org.apache.iceberg.TableMetadata;
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.puffin.Blob;
+import org.apache.iceberg.puffin.Puffin;
+import org.apache.iceberg.puffin.PuffinWriter;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.assertj.core.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -38,19 +49,28 @@ import org.mockito.Mockito;
 public class TestCatalogUtilDropTable extends HadoopTableTestBase {
 
   @Test
-  public void dropTableDataDeletesExpectedFiles() {
+  public void dropTableDataDeletesExpectedFiles() throws IOException {
     table.newFastAppend().appendFile(FILE_A).commit();
     table.newAppend().appendFile(FILE_B).commit();
+    StatisticsFile statisticsFile =
+        writeStatsFile(
+            table.currentSnapshot().snapshotId(),
+            table.currentSnapshot().sequenceNumber(),
+            tableLocation + "/metadata/" + UUID.randomUUID() + ".stats",
+            table.io());
+    table.updateStatistics().setStatistics(statisticsFile.snapshotId(), statisticsFile).commit();
 
-    TableMetadata tableMetadata = readMetadataVersion(3);
+    TableMetadata tableMetadata = readMetadataVersion(4);
     Set<Snapshot> snapshotSet = Sets.newHashSet(table.snapshots());
 
     Set<String> manifestListLocations = manifestListLocations(snapshotSet);
     Set<String> manifestLocations = manifestLocations(snapshotSet, table.io());
     Set<String> dataLocations = dataLocations(snapshotSet, table.io());
     Set<String> metadataLocations = metadataLocations(tableMetadata);
+    Set<String> statsLocations = statsLocations(tableMetadata);
     Assertions.assertThat(manifestListLocations).as("should have 2 manifest lists").hasSize(2);
-    Assertions.assertThat(metadataLocations).as("should have 3 metadata locations").hasSize(3);
+    Assertions.assertThat(metadataLocations).as("should have 4 metadata locations").hasSize(4);
+    Assertions.assertThat(statsLocations).as("should have 1 stats file").hasSize(1);
 
     FileIO fileIO = Mockito.mock(FileIO.class);
     Mockito.when(fileIO.newInputFile(Mockito.anyString()))
@@ -69,7 +89,8 @@ public class TestCatalogUtilDropTable extends HadoopTableTestBase {
                 manifestListLocations.size()
                     + manifestLocations.size()
                     + dataLocations.size()
-                    + metadataLocations.size()))
+                    + metadataLocations.size()
+                    + statsLocations.size()))
         .deleteFile(argumentCaptor.capture());
 
     List<String> deletedPaths = argumentCaptor.getAllValues();
@@ -85,6 +106,9 @@ public class TestCatalogUtilDropTable extends HadoopTableTestBase {
     Assertions.assertThat(deletedPaths)
         .as("should contain all created metadata locations")
         .containsAll(metadataLocations);
+    Assertions.assertThat(deletedPaths)
+        .as("should contain all created statistic")
+        .containsAll(statsLocations);
   }
 
   @Test
@@ -181,4 +205,34 @@ public class TestCatalogUtilDropTable extends HadoopTableTestBase {
     metadataLocations.add(tableMetadata.metadataFileLocation());
     return metadataLocations;
   }
+
+  private Set<String> statsLocations(TableMetadata tableMetadata) {
+    return tableMetadata.statisticsFiles().stream()
+        .map(StatisticsFile::path)
+        .collect(Collectors.toSet());
+  }
+
+  private StatisticsFile writeStatsFile(
+      long snapshotId, long snapshotSequenceNumber, String statsLocation, FileIO fileIO)
+      throws IOException {
+    try (PuffinWriter puffinWriter = Puffin.write(fileIO.newOutputFile(statsLocation)).build()) {
+      puffinWriter.add(
+          new Blob(
+              "some-blob-type",
+              ImmutableList.of(1),
+              snapshotId,
+              snapshotSequenceNumber,
+              ByteBuffer.wrap("blob content".getBytes(StandardCharsets.UTF_8))));
+      puffinWriter.finish();
+
+      return new GenericStatisticsFile(
+          snapshotId,
+          statsLocation,
+          puffinWriter.fileSize(),
+          puffinWriter.footerSize(),
+          puffinWriter.writtenBlobsMetadata().stream()
+              .map(GenericBlobMetadata::from)
+              .collect(ImmutableList.toImmutableList()));
+    }
+  }
 }

Reply via email to