This is an automated email from the ASF dual-hosted git repository.
etudenhoefner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
new 72b5f8e1ea API, Core: Only include required stats fields (#15739)
72b5f8e1ea is described below
commit 72b5f8e1eab614a9b946308d8a6eb9f20a90ea3e
Author: Eduard Tudenhoefner <[email protected]>
AuthorDate: Mon Mar 23 19:34:07 2026 +0100
API, Core: Only include required stats fields (#15739)
---
.palantir/revapi.yml | 4 +
.../org/apache/iceberg/stats/FieldStatistic.java | 74 ++++++---
.../java/org/apache/iceberg/stats/StatsUtil.java | 2 +-
.../org/apache/iceberg/stats/TestStatsUtil.java | 182 +++++++++++++++++----
.../iceberg/avro/SupportsIndexProjection.java | 5 +
.../org/apache/iceberg/stats/BaseContentStats.java | 18 +-
.../org/apache/iceberg/stats/BaseFieldStats.java | 54 +++++-
.../org/apache/iceberg/stats/TestContentStats.java | 95 +++++++++--
.../org/apache/iceberg/stats/TestFieldStats.java | 4 +-
9 files changed, 351 insertions(+), 87 deletions(-)
diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml
index dc5951256c..cd4afe6fdc 100644
--- a/.palantir/revapi.yml
+++ b/.palantir/revapi.yml
@@ -1370,6 +1370,10 @@ acceptedBreaks:
new: "class org.apache.iceberg.encryption.EncryptingFileIO"
justification: "New method for Manifest List reading"
org.apache.iceberg:iceberg-core:
+ - code: "java.class.defaultSerializationChanged"
+ old: "class org.apache.iceberg.avro.SupportsIndexProjection"
+ new: "class org.apache.iceberg.avro.SupportsIndexProjection"
+ justification: "Serialization across versions is not guaranteed"
- code: "java.class.defaultSerializationChanged"
old: "class org.apache.iceberg.hadoop.SerializableConfiguration"
new: "class org.apache.iceberg.hadoop.SerializableConfiguration"
diff --git a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java b/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
index 8d13ba5567..72058e5253 100644
--- a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
+++ b/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
@@ -20,6 +20,8 @@ package org.apache.iceberg.stats;
import static org.apache.iceberg.types.Types.NestedField.optional;
+import java.util.List;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
@@ -88,39 +90,61 @@ public enum FieldStatistic {
};
}
- public static Types.StructType fieldStatsFor(Type type, int baseFieldId) {
- return Types.StructType.of(
+ public static Types.StructType fieldStatsFor(Types.NestedField field, int
baseFieldId) {
+ List<Types.NestedField> fields = Lists.newArrayListWithCapacity(8);
+ Type type = field.type();
+
+ fields.add(
optional(
baseFieldId + VALUE_COUNT.offset(),
VALUE_COUNT.fieldName(),
Types.LongType.get(),
- "Total value count, including null and NaN"),
- optional(
- baseFieldId + NULL_VALUE_COUNT.offset(),
- NULL_VALUE_COUNT.fieldName(),
- Types.LongType.get(),
- "Total null value count"),
- optional(
- baseFieldId + NAN_VALUE_COUNT.offset(),
- NAN_VALUE_COUNT.fieldName(),
- Types.LongType.get(),
- "Total NaN value count"),
- optional(
- baseFieldId + AVG_VALUE_SIZE.offset(),
- AVG_VALUE_SIZE.fieldName(),
- Types.IntegerType.get(),
- "Avg value size of variable-length types (String, Binary)"),
- optional(
- baseFieldId + MAX_VALUE_SIZE.offset(),
- MAX_VALUE_SIZE.fieldName(),
- Types.IntegerType.get(),
- "Max value size of variable-length types (String, Binary)"),
- optional(baseFieldId + LOWER_BOUND.offset(), LOWER_BOUND.fieldName(),
type, "Lower bound"),
- optional(baseFieldId + UPPER_BOUND.offset(), UPPER_BOUND.fieldName(),
type, "Upper bound"),
+ "Total value count, including null and NaN"));
+
+ if (field.isOptional()) {
+ fields.add(
+ optional(
+ baseFieldId + NULL_VALUE_COUNT.offset(),
+ NULL_VALUE_COUNT.fieldName(),
+ Types.LongType.get(),
+ "Total null value count"));
+ }
+
+ if (type.typeId() == Type.TypeID.FLOAT || type.typeId() ==
Type.TypeID.DOUBLE) {
+ fields.add(
+ optional(
+ baseFieldId + NAN_VALUE_COUNT.offset(),
+ NAN_VALUE_COUNT.fieldName(),
+ Types.LongType.get(),
+ "Total NaN value count"));
+ }
+
+ if (type.typeId() == Type.TypeID.STRING || type.typeId() ==
Type.TypeID.BINARY) {
+ fields.add(
+ optional(
+ baseFieldId + AVG_VALUE_SIZE.offset(),
+ AVG_VALUE_SIZE.fieldName(),
+ Types.IntegerType.get(),
+ "Avg value size of variable-length types (String, Binary)"));
+ fields.add(
+ optional(
+ baseFieldId + MAX_VALUE_SIZE.offset(),
+ MAX_VALUE_SIZE.fieldName(),
+ Types.IntegerType.get(),
+ "Max value size of variable-length types (String, Binary)"));
+ }
+
+ fields.add(
+ optional(baseFieldId + LOWER_BOUND.offset(), LOWER_BOUND.fieldName(),
type, "Lower bound"));
+ fields.add(
+ optional(baseFieldId + UPPER_BOUND.offset(), UPPER_BOUND.fieldName(),
type, "Upper bound"));
+ fields.add(
optional(
baseFieldId + EXACT_BOUNDS.offset(),
EXACT_BOUNDS.fieldName(),
Types.BooleanType.get(),
"Whether the upper/lower bound is exact or not"));
+
+ return Types.StructType.of(fields);
}
}
diff --git a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java b/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
index 349f9fe75b..2ff52f92bd 100644
--- a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
+++ b/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
@@ -178,7 +178,7 @@ public class StatsUtil {
int fieldId = StatsUtil.statsFieldIdForField(field.fieldId());
if (fieldId >= 0) {
- Types.StructType structType =
FieldStatistic.fieldStatsFor(field.type(), fieldId);
+ Types.StructType structType = FieldStatistic.fieldStatsFor(field,
fieldId);
return optional(fieldId, Integer.toString(field.fieldId()),
structType);
} else {
skippedFieldIds.add(field.fieldId());
diff --git a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java b/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
index 4a17081ab7..62c7c0ea75 100644
--- a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
+++ b/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
@@ -18,10 +18,19 @@
*/
package org.apache.iceberg.stats;
+import static org.apache.iceberg.stats.FieldStatistic.AVG_VALUE_SIZE;
+import static org.apache.iceberg.stats.FieldStatistic.EXACT_BOUNDS;
+import static org.apache.iceberg.stats.FieldStatistic.LOWER_BOUND;
+import static org.apache.iceberg.stats.FieldStatistic.MAX_VALUE_SIZE;
+import static org.apache.iceberg.stats.FieldStatistic.NAN_VALUE_COUNT;
+import static org.apache.iceberg.stats.FieldStatistic.NULL_VALUE_COUNT;
+import static org.apache.iceberg.stats.FieldStatistic.UPPER_BOUND;
+import static org.apache.iceberg.stats.FieldStatistic.VALUE_COUNT;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;
import static org.assertj.core.api.Assertions.assertThat;
+import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;
@@ -139,37 +148,38 @@ public class TestStatsUtil {
@Test
public void contentStatsForSimpleSchema() {
- Schema schema =
- new Schema(
- required(0, "i", Types.IntegerType.get()),
- required(2, "f", Types.FloatType.get()),
- required(4, "s", Types.StringType.get()),
- required(6, "b", Types.BooleanType.get()),
- required(1_000_000, "u", Types.UUIDType.get()));
+ Types.NestedField intField = required(0, "i", Types.IntegerType.get());
+ Types.NestedField floatField = required(2, "f", Types.FloatType.get());
+ Types.NestedField stringField = required(4, "s", Types.StringType.get());
+ Types.NestedField booleanField = required(6, "b", Types.BooleanType.get());
+ Types.NestedField uuidField = required(1_000_000, "u",
Types.UUIDType.get());
+ Schema schema = new Schema(intField, floatField, stringField,
booleanField, uuidField);
Schema expectedStatsSchema =
new Schema(
optional(
146,
"content_stats",
Types.StructType.of(
- optional(
- 10000, "0",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10000)),
- optional(
- 10400, "2",
FieldStatistic.fieldStatsFor(Types.FloatType.get(), 10400)),
- optional(
- 10800, "4",
FieldStatistic.fieldStatsFor(Types.StringType.get(), 10800)),
- optional(
- 11200, "6",
FieldStatistic.fieldStatsFor(Types.BooleanType.get(), 11200)),
+ optional(10000, "0",
FieldStatistic.fieldStatsFor(intField, 10000)),
+ optional(10400, "2",
FieldStatistic.fieldStatsFor(floatField, 10400)),
+ optional(10800, "4",
FieldStatistic.fieldStatsFor(stringField, 10800)),
+ optional(11200, "6",
FieldStatistic.fieldStatsFor(booleanField, 11200)),
optional(
200010000,
"1000000",
- FieldStatistic.fieldStatsFor(Types.UUIDType.get(),
200010000)))));
+ FieldStatistic.fieldStatsFor(uuidField, 200010000)))));
Schema statsSchema = new Schema(StatsUtil.contentStatsFor(schema));
assertThat(statsSchema.asStruct()).isEqualTo(expectedStatsSchema.asStruct());
}
@Test
public void contentStatsForComplexSchema() {
+ Types.NestedField listElement = optional(3, "element",
Types.IntegerType.get());
+ Types.NestedField structInt = optional(7, "int", Types.IntegerType.get());
+ Types.NestedField structString = optional(8, "string",
Types.StringType.get());
+ Types.NestedField mapKey = required(22, "key", Types.IntegerType.get());
+ Types.NestedField mapValue = optional(24, "value", Types.StringType.get());
+ Types.NestedField uuidField = required(100_000, "u", Types.UUIDType.get());
Schema schema =
new Schema(
required(0, "i", Types.IntegerType.get()),
@@ -185,7 +195,7 @@ public class TestStatsUtil {
"b",
Types.MapType.ofOptional(22, 24, Types.IntegerType.get(),
Types.StringType.get())),
required(30, "variant", Types.VariantType.get()),
- required(100_000, "u", Types.UUIDType.get()));
+ uuidField);
Schema expectedStatsSchema =
new Schema(
optional(
@@ -193,22 +203,132 @@ public class TestStatsUtil {
"content_stats",
Types.StructType.of(
optional(
- 10000, "0",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10000)),
- optional(
- 10600, "3",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10600)),
+ 10000,
+ "0",
+ FieldStatistic.fieldStatsFor(
+ required(0, "i", Types.IntegerType.get()), 10000)),
+ optional(10600, "3",
FieldStatistic.fieldStatsFor(listElement, 10600)),
+ optional(11400, "7",
FieldStatistic.fieldStatsFor(structInt, 11400)),
+ optional(11600, "8",
FieldStatistic.fieldStatsFor(structString, 11600)),
+ optional(14400, "22", FieldStatistic.fieldStatsFor(mapKey,
14400)),
+ optional(14800, "24",
FieldStatistic.fieldStatsFor(mapValue, 14800)),
optional(
- 11400, "7",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 11400)),
- optional(
- 11600, "8",
FieldStatistic.fieldStatsFor(Types.StringType.get(), 11600)),
- optional(
- 14400, "22",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 14400)),
- optional(
- 14800, "24",
FieldStatistic.fieldStatsFor(Types.StringType.get(), 14800)),
- optional(
- 20010000,
- "100000",
- FieldStatistic.fieldStatsFor(Types.UUIDType.get(),
20010000)))));
+ 20010000, "100000",
FieldStatistic.fieldStatsFor(uuidField, 20010000)))));
Schema statsSchema = new Schema(StatsUtil.contentStatsFor(schema));
assertThat(statsSchema.asStruct()).isEqualTo(expectedStatsSchema.asStruct());
}
+
+ @Test
+ public void conditionalFieldInclusionForInteger() {
+ assertThat(
+ fieldStatsNames(
+ FieldStatistic.fieldStatsFor(required(1, "x",
Types.IntegerType.get()), 10000)))
+ .containsExactly(
+ VALUE_COUNT.fieldName(),
+ LOWER_BOUND.fieldName(),
+ UPPER_BOUND.fieldName(),
+ EXACT_BOUNDS.fieldName())
+ .doesNotContain(
+ NULL_VALUE_COUNT.fieldName(),
+ NAN_VALUE_COUNT.fieldName(),
+ AVG_VALUE_SIZE.fieldName(),
+ MAX_VALUE_SIZE.fieldName());
+
+ assertThat(
+ fieldStatsNames(
+ FieldStatistic.fieldStatsFor(optional(1, "x",
Types.IntegerType.get()), 10000)))
+ .containsExactly(
+ VALUE_COUNT.fieldName(),
+ NULL_VALUE_COUNT.fieldName(),
+ LOWER_BOUND.fieldName(),
+ UPPER_BOUND.fieldName(),
+ EXACT_BOUNDS.fieldName())
+ .doesNotContain(
+ NAN_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE.fieldName(),
MAX_VALUE_SIZE.fieldName());
+ }
+
+ @Test
+ public void conditionalFieldInclusionForFloatAndDouble() {
+ assertThat(
+ fieldStatsNames(
+ FieldStatistic.fieldStatsFor(required(1, "x",
Types.FloatType.get()), 10000)))
+ .containsExactly(
+ VALUE_COUNT.fieldName(),
+ NAN_VALUE_COUNT.fieldName(),
+ LOWER_BOUND.fieldName(),
+ UPPER_BOUND.fieldName(),
+ EXACT_BOUNDS.fieldName())
+ .doesNotContain(
+ NULL_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE.fieldName(),
MAX_VALUE_SIZE.fieldName());
+
+ assertThat(
+ fieldStatsNames(
+ FieldStatistic.fieldStatsFor(optional(1, "x",
Types.DoubleType.get()), 10000)))
+ .containsExactly(
+ VALUE_COUNT.fieldName(),
+ NULL_VALUE_COUNT.fieldName(),
+ NAN_VALUE_COUNT.fieldName(),
+ LOWER_BOUND.fieldName(),
+ UPPER_BOUND.fieldName(),
+ EXACT_BOUNDS.fieldName());
+ }
+
+ @Test
+ public void conditionalFieldInclusionForString() {
+ assertThat(
+ fieldStatsNames(
+ FieldStatistic.fieldStatsFor(required(1, "x",
Types.StringType.get()), 10000)))
+ .containsExactly(
+ VALUE_COUNT.fieldName(),
+ AVG_VALUE_SIZE.fieldName(),
+ MAX_VALUE_SIZE.fieldName(),
+ LOWER_BOUND.fieldName(),
+ UPPER_BOUND.fieldName(),
+ EXACT_BOUNDS.fieldName())
+ .doesNotContain(NULL_VALUE_COUNT.fieldName(),
NAN_VALUE_COUNT.fieldName());
+
+ assertThat(
+ fieldStatsNames(
+ FieldStatistic.fieldStatsFor(optional(1, "x",
Types.StringType.get()), 10000)))
+ .containsExactly(
+ VALUE_COUNT.fieldName(),
+ NULL_VALUE_COUNT.fieldName(),
+ AVG_VALUE_SIZE.fieldName(),
+ MAX_VALUE_SIZE.fieldName(),
+ LOWER_BOUND.fieldName(),
+ UPPER_BOUND.fieldName(),
+ EXACT_BOUNDS.fieldName());
+ }
+
+ @Test
+ public void conditionalFieldInclusionForBinary() {
+ assertThat(
+ fieldStatsNames(
+ FieldStatistic.fieldStatsFor(optional(1, "x",
Types.BinaryType.get()), 10000)))
+ .containsExactly(
+ VALUE_COUNT.fieldName(),
+ NULL_VALUE_COUNT.fieldName(),
+ AVG_VALUE_SIZE.fieldName(),
+ MAX_VALUE_SIZE.fieldName(),
+ LOWER_BOUND.fieldName(),
+ UPPER_BOUND.fieldName(),
+ EXACT_BOUNDS.fieldName())
+ .doesNotContain(NAN_VALUE_COUNT.fieldName());
+
+ assertThat(
+ fieldStatsNames(
+ FieldStatistic.fieldStatsFor(required(1, "x",
Types.BinaryType.get()), 10000)))
+ .containsExactly(
+ VALUE_COUNT.fieldName(),
+ AVG_VALUE_SIZE.fieldName(),
+ MAX_VALUE_SIZE.fieldName(),
+ LOWER_BOUND.fieldName(),
+ UPPER_BOUND.fieldName(),
+ EXACT_BOUNDS.fieldName())
+ .doesNotContain(NULL_VALUE_COUNT.fieldName(),
NAN_VALUE_COUNT.fieldName());
+ }
+
+ private List<String> fieldStatsNames(Types.StructType structType) {
+ return structType.fields().stream().map(Types.NestedField::name).toList();
+ }
}
diff --git a/core/src/main/java/org/apache/iceberg/avro/SupportsIndexProjection.java b/core/src/main/java/org/apache/iceberg/avro/SupportsIndexProjection.java
index fa4ffa5aec..41b5d3e608 100644
--- a/core/src/main/java/org/apache/iceberg/avro/SupportsIndexProjection.java
+++ b/core/src/main/java/org/apache/iceberg/avro/SupportsIndexProjection.java
@@ -55,6 +55,11 @@ public abstract class SupportsIndexProjection implements
StructLike, Serializabl
}
}
+ /** Constructor with a precomputed position mapping */
+ protected SupportsIndexProjection(int[] fromProjectionPos) {
+ this.fromProjectionPos = fromProjectionPos;
+ }
+
/** Copy constructor */
protected SupportsIndexProjection(SupportsIndexProjection toCopy) {
this.fromProjectionPos = toCopy.fromProjectionPos;
diff --git a/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java b/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java
index 9e20f2ad25..be56c411b6 100644
--- a/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java
+++ b/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java
@@ -59,6 +59,7 @@ public class BaseContentStats implements ContentStats,
Serializable {
fieldStats.add(
BaseFieldStats.builder()
.fieldId(StatsUtil.fieldIdForStatsField(field.fieldId()))
+ .statsStruct(structType)
.type(type)
.build());
}
@@ -243,6 +244,7 @@ public class BaseContentStats implements ContentStats,
Serializable {
return this;
}
+ @SuppressWarnings("rawtypes")
public BaseContentStats build() {
Preconditions.checkArgument(
null != statsStruct || null != schema, "Either stats struct or table
schema must be set");
@@ -252,7 +254,21 @@ public class BaseContentStats implements ContentStats,
Serializable {
this.statsStruct =
StatsUtil.contentStatsFor(schema).type().asStructType();
}
- return new BaseContentStats(statsStruct, stats);
+ List<FieldStats<?>> resolvedStats =
Lists.newArrayListWithCapacity(stats.size());
+ for (FieldStats<?> stat : stats) {
+ int statsFieldId = StatsUtil.statsFieldIdForField(stat.fieldId());
+ Types.NestedField statsField = statsStruct.field(statsFieldId);
+ if (null != statsField && statsField.type().isStructType()) {
+ resolvedStats.add(
+ ((BaseFieldStats.Builder) BaseFieldStats.buildFrom(stat))
+ .statsStruct(statsField.type().asStructType())
+ .build());
+ } else {
+ resolvedStats.add(stat);
+ }
+ }
+
+ return new BaseContentStats(statsStruct, resolvedStats);
}
}
}
diff --git a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java b/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
index f26294213c..470303179b 100644
--- a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
+++ b/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
@@ -18,16 +18,18 @@
*/
package org.apache.iceberg.stats;
-import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.util.Objects;
+import org.apache.iceberg.avro.SupportsIndexProjection;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ByteBuffers;
-public class BaseFieldStats<T> implements FieldStats<T>, Serializable {
+public class BaseFieldStats<T> extends SupportsIndexProjection implements
FieldStats<T> {
+ private static final int[] IDENTITY_MAPPING = identityMapping();
private final int fieldId;
private final Type type;
private final Long valueCount;
@@ -41,6 +43,7 @@ public class BaseFieldStats<T> implements FieldStats<T>,
Serializable {
private BaseFieldStats(
int fieldId,
+ int[] fromProjectionPos,
Type type,
Long valueCount,
Long nullValueCount,
@@ -50,6 +53,7 @@ public class BaseFieldStats<T> implements FieldStats<T>,
Serializable {
T lowerBound,
T upperBound,
boolean hasExactBounds) {
+ super(fromProjectionPos != null ? fromProjectionPos : IDENTITY_MAPPING);
this.fieldId = fieldId;
this.type = type;
this.valueCount = valueCount;
@@ -62,6 +66,36 @@ public class BaseFieldStats<T> implements FieldStats<T>,
Serializable {
this.hasExactBounds = hasExactBounds;
}
+ private static int[] identityMapping() {
+ int numStats = FieldStatistic.values().length;
+ int[] mapping = new int[numStats];
+ for (int i = 0; i < numStats; i++) {
+ mapping[i] = i;
+ }
+
+ return mapping;
+ }
+
+ /**
+ * Computes a position mapping from the column-specific stats struct to the full 8-field struct.
+ * Each entry maps a projected position to its base position (0-based) using the field ID offsets
+ * from the column's base stats field ID.
+ */
+ private static int[] projectionMapping(Types.StructType statsStruct, int
dataFieldId) {
+ if (statsStruct == null) {
+ return null;
+ }
+
+ int baseStatsFieldId = StatsUtil.statsFieldIdForField(dataFieldId);
+ int[] mapping = new int[statsStruct.fields().size()];
+ for (int i = 0; i < mapping.length; i++) {
+ // offset is 1-based (matching FieldStatistic.offset()), position is 0-based
+ mapping[i] = statsStruct.fields().get(i).fieldId() - baseStatsFieldId -
1;
+ }
+
+ return mapping;
+ }
+
@Override
public int fieldId() {
return fieldId;
@@ -144,12 +178,7 @@ public class BaseFieldStats<T> implements FieldStats<T>,
Serializable {
}
@Override
- public int size() {
- return 8;
- }
-
- @Override
- public <X> X get(int pos, Class<X> javaClass) {
+ protected <X> X internalGet(int pos, Class<X> javaClass) {
return switch (FieldStatistic.fromPosition(pos)) {
case VALUE_COUNT -> javaClass.cast(valueCount);
case NULL_VALUE_COUNT -> javaClass.cast(nullValueCount);
@@ -164,7 +193,7 @@ public class BaseFieldStats<T> implements FieldStats<T>,
Serializable {
}
@Override
- public void set(int pos, Object value) {
+ protected <X> void internalSet(int pos, X value) {
throw new UnsupportedOperationException("set() not supported");
}
@@ -239,6 +268,7 @@ public class BaseFieldStats<T> implements FieldStats<T>,
Serializable {
public static class Builder<T> {
private int fieldId;
+ private int[] fromProjectionPos;
private Type type;
private Long valueCount;
private Long nullValueCount;
@@ -251,6 +281,11 @@ public class BaseFieldStats<T> implements FieldStats<T>,
Serializable {
private Builder() {}
+ public Builder<T> statsStruct(Types.StructType statsStruct) {
+ this.fromProjectionPos = projectionMapping(statsStruct, fieldId);
+ return this;
+ }
+
public Builder<T> type(Type newType) {
this.type = newType;
return this;
@@ -333,6 +368,7 @@ public class BaseFieldStats<T> implements FieldStats<T>,
Serializable {
return new BaseFieldStats<>(
fieldId,
+ fromProjectionPos,
type,
valueCount,
nullValueCount,
diff --git a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java b/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
index d083e73065..6baff7dfe6 100644
--- a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
+++ b/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
@@ -241,34 +241,93 @@ public class TestContentStats {
}
@Test
- public void setByPosition() {
+ public void setByPositionOptionalString() {
+ Schema tableSchema = new Schema(optional(1, "s", Types.StringType.get()));
+ Types.StructType rootStatsStruct =
StatsUtil.contentStatsFor(tableSchema).type().asStructType();
+ Types.StructType statsStructForFieldId =
rootStatsStruct.fields().get(0).type().asStructType();
+ assertThat(statsStructForFieldId.fields()).hasSize(7);
+
+ GenericRecord record = GenericRecord.create(statsStructForFieldId);
+ BaseFieldStats<String> fieldStats =
+ BaseFieldStats.<String>builder()
+ .type(Types.StringType.get())
+ .fieldId(1)
+ .valueCount(10L)
+ .nullValueCount(2L)
+ .avgValueSize(3)
+ .maxValueSize(10)
+ .lowerBound("aa")
+ .upperBound("zzz")
+ .hasExactBounds()
+ .build();
+
+ record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount());
+ record.setField(NULL_VALUE_COUNT.fieldName(), fieldStats.nullValueCount());
+ record.setField(AVG_VALUE_SIZE.fieldName(), fieldStats.avgValueSize());
+ record.setField(MAX_VALUE_SIZE.fieldName(), fieldStats.maxValueSize());
+ record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound());
+ record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound());
+ record.setField(EXACT_BOUNDS.fieldName(), fieldStats.hasExactBounds());
+
+ BaseContentStats stats = new BaseContentStats(rootStatsStruct);
+ stats.set(0, record);
+ assertThat(stats.fieldStats()).containsExactly(fieldStats);
+ }
+
+ @Test
+ public void setByPositionOptionalDouble() {
+ Schema tableSchema = new Schema(optional(1, "d", Types.DoubleType.get()));
+ Types.StructType rootStatsStruct =
StatsUtil.contentStatsFor(tableSchema).type().asStructType();
+ Types.StructType statsStructForFieldId =
rootStatsStruct.fields().get(0).type().asStructType();
+ assertThat(statsStructForFieldId.fields()).hasSize(6);
+
+ GenericRecord record = GenericRecord.create(statsStructForFieldId);
+ BaseFieldStats<Double> fieldStats =
+ BaseFieldStats.<Double>builder()
+ .type(Types.DoubleType.get())
+ .fieldId(1)
+ .valueCount(10L)
+ .nullValueCount(2L)
+ .nanValueCount(3L)
+ .lowerBound(5.0)
+ .upperBound(20.0)
+ .hasExactBounds()
+ .build();
+
+ record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount());
+ record.setField(NULL_VALUE_COUNT.fieldName(), fieldStats.nullValueCount());
+ record.setField(NAN_VALUE_COUNT.fieldName(), fieldStats.nanValueCount());
+ record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound());
+ record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound());
+ record.setField(EXACT_BOUNDS.fieldName(), fieldStats.hasExactBounds());
+
+ BaseContentStats stats = new BaseContentStats(rootStatsStruct);
+ stats.set(0, record);
+ assertThat(stats.fieldStats()).containsExactly(fieldStats);
+ }
+
+ @Test
+ public void setByPositionRequiredInteger() {
Schema tableSchema = new Schema(required(1, "id",
Types.IntegerType.get()));
Types.StructType rootStatsStruct =
StatsUtil.contentStatsFor(tableSchema).type().asStructType();
- Types.StructType statsStructForIdField =
rootStatsStruct.fields().get(0).type().asStructType();
+ Types.StructType statsStructForFieldId =
rootStatsStruct.fields().get(0).type().asStructType();
+ assertThat(statsStructForFieldId.fields()).hasSize(4);
- GenericRecord record = GenericRecord.create(statsStructForIdField);
+ GenericRecord record = GenericRecord.create(statsStructForFieldId);
BaseFieldStats<Integer> fieldStats =
BaseFieldStats.<Integer>builder()
.type(Types.IntegerType.get())
.fieldId(1)
.valueCount(10L)
- .nullValueCount(2L)
- .nanValueCount(3L)
- .avgValueSize(30)
- .maxValueSize(70)
.lowerBound(5)
.upperBound(20)
.hasExactBounds()
.build();
- record.set(VALUE_COUNT.position(), fieldStats.valueCount());
- record.set(NULL_VALUE_COUNT.position(), fieldStats.nullValueCount());
- record.set(NAN_VALUE_COUNT.position(), fieldStats.nanValueCount());
- record.set(AVG_VALUE_SIZE.position(), fieldStats.avgValueSize());
- record.set(MAX_VALUE_SIZE.position(), fieldStats.maxValueSize());
- record.set(LOWER_BOUND.position(), fieldStats.lowerBound());
- record.set(UPPER_BOUND.position(), fieldStats.upperBound());
- record.set(EXACT_BOUNDS.position(), fieldStats.hasExactBounds());
+ record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount());
+ record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound());
+ record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound());
+ record.setField(EXACT_BOUNDS.fieldName(), fieldStats.hasExactBounds());
// this is typically called by Avro reflection code
BaseContentStats stats = new BaseContentStats(rootStatsStruct);
@@ -287,17 +346,17 @@ public class TestContentStats {
BaseContentStats stats = new BaseContentStats(rootStatsStruct);
// invalid lower bound
- record.set(LOWER_BOUND.position(), 5.0);
+ record.setField(LOWER_BOUND.fieldName(), 5.0);
assertThatThrownBy(() -> stats.set(0, record))
.isInstanceOf(IllegalArgumentException.class)
.hasMessage(
"Invalid lower bound type, expected a subtype of class
java.lang.Integer: java.lang.Double");
// set valid lower bound so that upper bound is evaluated
- record.set(LOWER_BOUND.position(), 5);
+ record.setField(LOWER_BOUND.fieldName(), 5);
// invalid upper bound
- record.set(UPPER_BOUND.position(), "20");
+ record.setField(UPPER_BOUND.fieldName(), "20");
assertThatThrownBy(() -> stats.set(0, record))
.isInstanceOf(IllegalArgumentException.class)
.hasMessage(
diff --git a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
b/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
index ffd91efd8a..be5f316694 100644
--- a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
+++ b/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
@@ -211,8 +211,8 @@ public class TestFieldStats {
assertThat(fieldStats.get(EXACT_BOUNDS.position(),
Boolean.class)).isEqualTo(true);
assertThatThrownBy(() -> assertThat(fieldStats.get(10, Long.class)))
- .isInstanceOf(IllegalArgumentException.class)
- .hasMessage("Invalid statistic position: 10");
+ .isInstanceOf(ArrayIndexOutOfBoundsException.class)
+ .hasMessage("Index 10 out of bounds for length 8");
assertThatThrownBy(() -> assertThat(fieldStats.get(VALUE_COUNT.position(),
Double.class)))
.isInstanceOf(ClassCastException.class)
.hasMessage("Cannot cast java.lang.Long to java.lang.Double");