This is an automated email from the ASF dual-hosted git repository.

aokolnychyi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new da2ad389fd Core: Don't persist counts for paths and positions in 
position delete files (#8590)
da2ad389fd is described below

commit da2ad389fd9ba8222f6fb3f57922209c239a7045
Author: Anton Okolnychyi <[email protected]>
AuthorDate: Wed Sep 20 14:30:31 2023 -0700

    Core: Don't persist counts for paths and positions in position delete files 
(#8590)
---
 .../main/java/org/apache/iceberg/MetricsUtil.java  | 30 ++++++++++++---
 .../iceberg/deletes/PositionDeleteWriter.java      |  6 +--
 .../apache/iceberg/io/TestFileWriterFactory.java   | 44 ++++++++++++++++++++++
 3 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/MetricsUtil.java 
b/core/src/main/java/org/apache/iceberg/MetricsUtil.java
index 2cd001b5c4..2d23121bb0 100644
--- a/core/src/main/java/org/apache/iceberg/MetricsUtil.java
+++ b/core/src/main/java/org/apache/iceberg/MetricsUtil.java
@@ -41,18 +41,36 @@ public class MetricsUtil {
   private MetricsUtil() {}
 
   /**
-   * Copies a metrics object without lower and upper bounds for given fields.
+   * Copies a metrics object without value, NULL and NaN counts for given 
fields.
    *
-   * @param excludedFieldIds field IDs for which the lower and upper bounds 
must be dropped
+   * @param excludedFieldIds field IDs for which the counts must be dropped
+   * @return a new metrics object without counts for given fields
+   */
+  public static Metrics copyWithoutFieldCounts(Metrics metrics, Set<Integer> 
excludedFieldIds) {
+    return new Metrics(
+        metrics.recordCount(),
+        metrics.columnSizes(),
+        copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
+        metrics.lowerBounds(),
+        metrics.upperBounds());
+  }
+
+  /**
+   * Copies a metrics object without counts and bounds for given fields.
+   *
+   * @param excludedFieldIds field IDs for which the counts and bounds must be 
dropped
    * @return a new metrics object without lower and upper bounds for given 
fields
    */
-  public static Metrics copyWithoutFieldBounds(Metrics metrics, Set<Integer> 
excludedFieldIds) {
+  public static Metrics copyWithoutFieldCountsAndBounds(
+      Metrics metrics, Set<Integer> excludedFieldIds) {
     return new Metrics(
         metrics.recordCount(),
         metrics.columnSizes(),
-        metrics.valueCounts(),
-        metrics.nullValueCounts(),
-        metrics.nanValueCounts(),
+        copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
         copyWithoutKeys(metrics.lowerBounds(), excludedFieldIds),
         copyWithoutKeys(metrics.upperBounds(), excludedFieldIds));
   }
diff --git 
a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java 
b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
index 4f799b4349..c8193755f5 100644
--- a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
+++ b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
@@ -47,7 +47,7 @@ import org.apache.iceberg.util.CharSequenceSet;
  * records, consider using {@link SortingPositionOnlyDeleteWriter} instead.
  */
 public class PositionDeleteWriter<T> implements FileWriter<PositionDelete<T>, 
DeleteWriteResult> {
-  private static final Set<Integer> SINGLE_REFERENCED_FILE_BOUNDS_ONLY =
+  private static final Set<Integer> FILE_AND_POS_FIELD_IDS =
       ImmutableSet.of(DELETE_FILE_PATH.fieldId(), DELETE_FILE_POS.fieldId());
 
   private final FileAppender<StructLike> appender;
@@ -121,9 +121,9 @@ public class PositionDeleteWriter<T> implements 
FileWriter<PositionDelete<T>, De
   private Metrics metrics() {
     Metrics metrics = appender.metrics();
     if (referencedDataFiles.size() > 1) {
-      return MetricsUtil.copyWithoutFieldBounds(metrics, 
SINGLE_REFERENCED_FILE_BOUNDS_ONLY);
+      return MetricsUtil.copyWithoutFieldCountsAndBounds(metrics, 
FILE_AND_POS_FIELD_IDS);
     } else {
-      return metrics;
+      return MetricsUtil.copyWithoutFieldCounts(metrics, 
FILE_AND_POS_FIELD_IDS);
     }
   }
 }
diff --git 
a/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java 
b/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
index 7910c666b4..e25a179edb 100644
--- a/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
+++ b/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
@@ -46,6 +46,7 @@ import org.apache.iceberg.orc.ORC;
 import org.apache.iceberg.parquet.Parquet;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.CharSequenceSet;
 import org.apache.iceberg.util.Pair;
 import org.apache.iceberg.util.StructLikeSet;
@@ -232,14 +233,20 @@ public abstract class TestFileWriterFactory<T> extends 
WriterTestBase<T> {
     if (fileFormat == FileFormat.AVRO) {
       Assert.assertNull(deleteFile.lowerBounds());
       Assert.assertNull(deleteFile.upperBounds());
+      Assert.assertNull(deleteFile.columnSizes());
     } else {
       Assert.assertEquals(1, referencedDataFiles.size());
       Assert.assertEquals(2, deleteFile.lowerBounds().size());
       
Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
       Assert.assertEquals(2, deleteFile.upperBounds().size());
       
Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+      Assert.assertEquals(2, deleteFile.columnSizes().size());
     }
 
+    Assert.assertNull(deleteFile.valueCounts());
+    Assert.assertNull(deleteFile.nullValueCounts());
+    Assert.assertNull(deleteFile.nanValueCounts());
+
     // verify the written delete file
     GenericRecord deleteRecord = 
GenericRecord.create(DeleteSchemaUtil.pathPosSchema());
     List<Record> expectedDeletes =
@@ -281,6 +288,34 @@ public abstract class TestFileWriterFactory<T> extends 
WriterTestBase<T> {
     DeleteFile deleteFile = result.first();
     CharSequenceSet referencedDataFiles = result.second();
 
+    if (fileFormat == FileFormat.AVRO) {
+      Assert.assertNull(deleteFile.lowerBounds());
+      Assert.assertNull(deleteFile.upperBounds());
+      Assert.assertNull(deleteFile.columnSizes());
+      Assert.assertNull(deleteFile.valueCounts());
+      Assert.assertNull(deleteFile.nullValueCounts());
+      Assert.assertNull(deleteFile.nanValueCounts());
+    } else {
+      Assert.assertEquals(1, referencedDataFiles.size());
+      Assert.assertEquals(4, deleteFile.lowerBounds().size());
+      
Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+      
Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_POS.fieldId()));
+      for (Types.NestedField column : table.schema().columns()) {
+        
Assert.assertTrue(deleteFile.lowerBounds().containsKey(column.fieldId()));
+      }
+      Assert.assertEquals(4, deleteFile.upperBounds().size());
+      
Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+      
Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_POS.fieldId()));
+      for (Types.NestedField column : table.schema().columns()) {
+        
Assert.assertTrue(deleteFile.upperBounds().containsKey(column.fieldId()));
+      }
+      // ORC also contains metrics for the deleted row struct, not just actual 
data fields
+      Assert.assertTrue(deleteFile.columnSizes().size() >= 4);
+      Assert.assertTrue(deleteFile.valueCounts().size() >= 2);
+      Assert.assertTrue(deleteFile.nullValueCounts().size() >= 2);
+      Assert.assertNull(deleteFile.nanValueCounts());
+    }
+
     // verify the written delete file
     GenericRecord deletedRow = GenericRecord.create(table.schema());
     Schema positionDeleteSchema = 
DeleteSchemaUtil.posDeleteSchema(table.schema());
@@ -336,6 +371,15 @@ public abstract class TestFileWriterFactory<T> extends 
WriterTestBase<T> {
     Assert.assertEquals(2, referencedDataFiles.size());
     Assert.assertNull(deleteFile.lowerBounds());
     Assert.assertNull(deleteFile.upperBounds());
+    Assert.assertNull(deleteFile.valueCounts());
+    Assert.assertNull(deleteFile.nullValueCounts());
+    Assert.assertNull(deleteFile.nanValueCounts());
+
+    if (fileFormat == FileFormat.AVRO) {
+      Assert.assertNull(deleteFile.columnSizes());
+    } else {
+      Assert.assertEquals(2, deleteFile.columnSizes().size());
+    }
 
     // commit the data and delete files
     table

Reply via email to