This is an automated email from the ASF dual-hosted git repository.
aokolnychyi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new da2ad389fd Core: Don't persist counts for paths and positions in
position delete files (#8590)
da2ad389fd is described below
commit da2ad389fd9ba8222f6fb3f57922209c239a7045
Author: Anton Okolnychyi <[email protected]>
AuthorDate: Wed Sep 20 14:30:31 2023 -0700
Core: Don't persist counts for paths and positions in position delete files
(#8590)
---
.../main/java/org/apache/iceberg/MetricsUtil.java | 30 ++++++++++++---
.../iceberg/deletes/PositionDeleteWriter.java | 6 +--
.../apache/iceberg/io/TestFileWriterFactory.java | 44 ++++++++++++++++++++++
3 files changed, 71 insertions(+), 9 deletions(-)
diff --git a/core/src/main/java/org/apache/iceberg/MetricsUtil.java
b/core/src/main/java/org/apache/iceberg/MetricsUtil.java
index 2cd001b5c4..2d23121bb0 100644
--- a/core/src/main/java/org/apache/iceberg/MetricsUtil.java
+++ b/core/src/main/java/org/apache/iceberg/MetricsUtil.java
@@ -41,18 +41,36 @@ public class MetricsUtil {
private MetricsUtil() {}
/**
- * Copies a metrics object without lower and upper bounds for given fields.
+ * Copies a metrics object without value, NULL and NaN counts for given
fields.
*
- * @param excludedFieldIds field IDs for which the lower and upper bounds
must be dropped
+ * @param excludedFieldIds field IDs for which the counts must be dropped
+ * @return a new metrics object without counts for given fields
+ */
+ public static Metrics copyWithoutFieldCounts(Metrics metrics, Set<Integer>
excludedFieldIds) {
+ return new Metrics(
+ metrics.recordCount(),
+ metrics.columnSizes(),
+ copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
+ copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
+ copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
+ metrics.lowerBounds(),
+ metrics.upperBounds());
+ }
+
+ /**
+ * Copies a metrics object without counts and bounds for given fields.
+ *
+ * @param excludedFieldIds field IDs for which the counts and bounds must be
dropped
* @return a new metrics object without lower and upper bounds for given
fields
*/
- public static Metrics copyWithoutFieldBounds(Metrics metrics, Set<Integer>
excludedFieldIds) {
+ public static Metrics copyWithoutFieldCountsAndBounds(
+ Metrics metrics, Set<Integer> excludedFieldIds) {
return new Metrics(
metrics.recordCount(),
metrics.columnSizes(),
- metrics.valueCounts(),
- metrics.nullValueCounts(),
- metrics.nanValueCounts(),
+ copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
+ copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
+ copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
copyWithoutKeys(metrics.lowerBounds(), excludedFieldIds),
copyWithoutKeys(metrics.upperBounds(), excludedFieldIds));
}
diff --git
a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
index 4f799b4349..c8193755f5 100644
--- a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
+++ b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
@@ -47,7 +47,7 @@ import org.apache.iceberg.util.CharSequenceSet;
* records, consider using {@link SortingPositionOnlyDeleteWriter} instead.
*/
public class PositionDeleteWriter<T> implements FileWriter<PositionDelete<T>,
DeleteWriteResult> {
- private static final Set<Integer> SINGLE_REFERENCED_FILE_BOUNDS_ONLY =
+ private static final Set<Integer> FILE_AND_POS_FIELD_IDS =
ImmutableSet.of(DELETE_FILE_PATH.fieldId(), DELETE_FILE_POS.fieldId());
private final FileAppender<StructLike> appender;
@@ -121,9 +121,9 @@ public class PositionDeleteWriter<T> implements
FileWriter<PositionDelete<T>, De
private Metrics metrics() {
Metrics metrics = appender.metrics();
if (referencedDataFiles.size() > 1) {
- return MetricsUtil.copyWithoutFieldBounds(metrics,
SINGLE_REFERENCED_FILE_BOUNDS_ONLY);
+ return MetricsUtil.copyWithoutFieldCountsAndBounds(metrics,
FILE_AND_POS_FIELD_IDS);
} else {
- return metrics;
+ return MetricsUtil.copyWithoutFieldCounts(metrics,
FILE_AND_POS_FIELD_IDS);
}
}
}
diff --git
a/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
b/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
index 7910c666b4..e25a179edb 100644
--- a/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
+++ b/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
@@ -46,6 +46,7 @@ import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.CharSequenceSet;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.StructLikeSet;
@@ -232,14 +233,20 @@ public abstract class TestFileWriterFactory<T> extends
WriterTestBase<T> {
if (fileFormat == FileFormat.AVRO) {
Assert.assertNull(deleteFile.lowerBounds());
Assert.assertNull(deleteFile.upperBounds());
+ Assert.assertNull(deleteFile.columnSizes());
} else {
Assert.assertEquals(1, referencedDataFiles.size());
Assert.assertEquals(2, deleteFile.lowerBounds().size());
Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
Assert.assertEquals(2, deleteFile.upperBounds().size());
Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+ Assert.assertEquals(2, deleteFile.columnSizes().size());
}
+ Assert.assertNull(deleteFile.valueCounts());
+ Assert.assertNull(deleteFile.nullValueCounts());
+ Assert.assertNull(deleteFile.nanValueCounts());
+
// verify the written delete file
GenericRecord deleteRecord =
GenericRecord.create(DeleteSchemaUtil.pathPosSchema());
List<Record> expectedDeletes =
@@ -281,6 +288,34 @@ public abstract class TestFileWriterFactory<T> extends
WriterTestBase<T> {
DeleteFile deleteFile = result.first();
CharSequenceSet referencedDataFiles = result.second();
+ if (fileFormat == FileFormat.AVRO) {
+ Assert.assertNull(deleteFile.lowerBounds());
+ Assert.assertNull(deleteFile.upperBounds());
+ Assert.assertNull(deleteFile.columnSizes());
+ Assert.assertNull(deleteFile.valueCounts());
+ Assert.assertNull(deleteFile.nullValueCounts());
+ Assert.assertNull(deleteFile.nanValueCounts());
+ } else {
+ Assert.assertEquals(1, referencedDataFiles.size());
+ Assert.assertEquals(4, deleteFile.lowerBounds().size());
+
Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+
Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_POS.fieldId()));
+ for (Types.NestedField column : table.schema().columns()) {
+
Assert.assertTrue(deleteFile.lowerBounds().containsKey(column.fieldId()));
+ }
+ Assert.assertEquals(4, deleteFile.upperBounds().size());
+
Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+
Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_POS.fieldId()));
+ for (Types.NestedField column : table.schema().columns()) {
+
Assert.assertTrue(deleteFile.upperBounds().containsKey(column.fieldId()));
+ }
+ // ORC also contains metrics for the deleted row struct, not just actual
data fields
+ Assert.assertTrue(deleteFile.columnSizes().size() >= 4);
+ Assert.assertTrue(deleteFile.valueCounts().size() >= 2);
+ Assert.assertTrue(deleteFile.nullValueCounts().size() >= 2);
+ Assert.assertNull(deleteFile.nanValueCounts());
+ }
+
// verify the written delete file
GenericRecord deletedRow = GenericRecord.create(table.schema());
Schema positionDeleteSchema =
DeleteSchemaUtil.posDeleteSchema(table.schema());
@@ -336,6 +371,15 @@ public abstract class TestFileWriterFactory<T> extends
WriterTestBase<T> {
Assert.assertEquals(2, referencedDataFiles.size());
Assert.assertNull(deleteFile.lowerBounds());
Assert.assertNull(deleteFile.upperBounds());
+ Assert.assertNull(deleteFile.valueCounts());
+ Assert.assertNull(deleteFile.nullValueCounts());
+ Assert.assertNull(deleteFile.nanValueCounts());
+
+ if (fileFormat == FileFormat.AVRO) {
+ Assert.assertNull(deleteFile.columnSizes());
+ } else {
+ Assert.assertEquals(2, deleteFile.columnSizes().size());
+ }
// commit the data and delete files
table