This is an automated email from the ASF dual-hosted git repository.

etudenhoefner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/main by this push:
     new 72b5f8e1ea API, Core: Only include required stats fields (#15739)
72b5f8e1ea is described below

commit 72b5f8e1eab614a9b946308d8a6eb9f20a90ea3e
Author: Eduard Tudenhoefner <[email protected]>
AuthorDate: Mon Mar 23 19:34:07 2026 +0100

    API, Core: Only include required stats fields (#15739)
---
 .palantir/revapi.yml                               |   4 +
 .../org/apache/iceberg/stats/FieldStatistic.java   |  74 ++++++---
 .../java/org/apache/iceberg/stats/StatsUtil.java   |   2 +-
 .../org/apache/iceberg/stats/TestStatsUtil.java    | 182 +++++++++++++++++----
 .../iceberg/avro/SupportsIndexProjection.java      |   5 +
 .../org/apache/iceberg/stats/BaseContentStats.java |  18 +-
 .../org/apache/iceberg/stats/BaseFieldStats.java   |  54 +++++-
 .../org/apache/iceberg/stats/TestContentStats.java |  95 +++++++++--
 .../org/apache/iceberg/stats/TestFieldStats.java   |   4 +-
 9 files changed, 351 insertions(+), 87 deletions(-)

diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml
index dc5951256c..cd4afe6fdc 100644
--- a/.palantir/revapi.yml
+++ b/.palantir/revapi.yml
@@ -1370,6 +1370,10 @@ acceptedBreaks:
         new: "class org.apache.iceberg.encryption.EncryptingFileIO"
         justification: "New method for Manifest List reading"
     org.apache.iceberg:iceberg-core:
+      - code: "java.class.defaultSerializationChanged"
+        old: "class org.apache.iceberg.avro.SupportsIndexProjection"
+        new: "class org.apache.iceberg.avro.SupportsIndexProjection"
+        justification: "Serialization across versions is not guaranteed"
       - code: "java.class.defaultSerializationChanged"
         old: "class org.apache.iceberg.hadoop.SerializableConfiguration"
         new: "class org.apache.iceberg.hadoop.SerializableConfiguration"
diff --git a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java b/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
index 8d13ba5567..72058e5253 100644
--- a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
+++ b/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
@@ -20,6 +20,8 @@ package org.apache.iceberg.stats;
 
 import static org.apache.iceberg.types.Types.NestedField.optional;
 
+import java.util.List;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.types.Type;
 import org.apache.iceberg.types.Types;
 
@@ -88,39 +90,61 @@ public enum FieldStatistic {
     };
   }
 
-  public static Types.StructType fieldStatsFor(Type type, int baseFieldId) {
-    return Types.StructType.of(
+  public static Types.StructType fieldStatsFor(Types.NestedField field, int 
baseFieldId) {
+    List<Types.NestedField> fields = Lists.newArrayListWithCapacity(8);
+    Type type = field.type();
+
+    fields.add(
         optional(
             baseFieldId + VALUE_COUNT.offset(),
             VALUE_COUNT.fieldName(),
             Types.LongType.get(),
-            "Total value count, including null and NaN"),
-        optional(
-            baseFieldId + NULL_VALUE_COUNT.offset(),
-            NULL_VALUE_COUNT.fieldName(),
-            Types.LongType.get(),
-            "Total null value count"),
-        optional(
-            baseFieldId + NAN_VALUE_COUNT.offset(),
-            NAN_VALUE_COUNT.fieldName(),
-            Types.LongType.get(),
-            "Total NaN value count"),
-        optional(
-            baseFieldId + AVG_VALUE_SIZE.offset(),
-            AVG_VALUE_SIZE.fieldName(),
-            Types.IntegerType.get(),
-            "Avg value size of variable-length types (String, Binary)"),
-        optional(
-            baseFieldId + MAX_VALUE_SIZE.offset(),
-            MAX_VALUE_SIZE.fieldName(),
-            Types.IntegerType.get(),
-            "Max value size of variable-length types (String, Binary)"),
-        optional(baseFieldId + LOWER_BOUND.offset(), LOWER_BOUND.fieldName(), 
type, "Lower bound"),
-        optional(baseFieldId + UPPER_BOUND.offset(), UPPER_BOUND.fieldName(), 
type, "Upper bound"),
+            "Total value count, including null and NaN"));
+
+    if (field.isOptional()) {
+      fields.add(
+          optional(
+              baseFieldId + NULL_VALUE_COUNT.offset(),
+              NULL_VALUE_COUNT.fieldName(),
+              Types.LongType.get(),
+              "Total null value count"));
+    }
+
+    if (type.typeId() == Type.TypeID.FLOAT || type.typeId() == 
Type.TypeID.DOUBLE) {
+      fields.add(
+          optional(
+              baseFieldId + NAN_VALUE_COUNT.offset(),
+              NAN_VALUE_COUNT.fieldName(),
+              Types.LongType.get(),
+              "Total NaN value count"));
+    }
+
+    if (type.typeId() == Type.TypeID.STRING || type.typeId() == 
Type.TypeID.BINARY) {
+      fields.add(
+          optional(
+              baseFieldId + AVG_VALUE_SIZE.offset(),
+              AVG_VALUE_SIZE.fieldName(),
+              Types.IntegerType.get(),
+              "Avg value size of variable-length types (String, Binary)"));
+      fields.add(
+          optional(
+              baseFieldId + MAX_VALUE_SIZE.offset(),
+              MAX_VALUE_SIZE.fieldName(),
+              Types.IntegerType.get(),
+              "Max value size of variable-length types (String, Binary)"));
+    }
+
+    fields.add(
+        optional(baseFieldId + LOWER_BOUND.offset(), LOWER_BOUND.fieldName(), 
type, "Lower bound"));
+    fields.add(
+        optional(baseFieldId + UPPER_BOUND.offset(), UPPER_BOUND.fieldName(), 
type, "Upper bound"));
+    fields.add(
         optional(
             baseFieldId + EXACT_BOUNDS.offset(),
             EXACT_BOUNDS.fieldName(),
             Types.BooleanType.get(),
             "Whether the upper/lower bound is exact or not"));
+
+    return Types.StructType.of(fields);
   }
 }
diff --git a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java b/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
index 349f9fe75b..2ff52f92bd 100644
--- a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
+++ b/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
@@ -178,7 +178,7 @@ public class StatsUtil {
 
       int fieldId = StatsUtil.statsFieldIdForField(field.fieldId());
       if (fieldId >= 0) {
-        Types.StructType structType = 
FieldStatistic.fieldStatsFor(field.type(), fieldId);
+        Types.StructType structType = FieldStatistic.fieldStatsFor(field, 
fieldId);
         return optional(fieldId, Integer.toString(field.fieldId()), 
structType);
       } else {
         skippedFieldIds.add(field.fieldId());
diff --git a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java b/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
index 4a17081ab7..62c7c0ea75 100644
--- a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
+++ b/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
@@ -18,10 +18,19 @@
  */
 package org.apache.iceberg.stats;
 
+import static org.apache.iceberg.stats.FieldStatistic.AVG_VALUE_SIZE;
+import static org.apache.iceberg.stats.FieldStatistic.EXACT_BOUNDS;
+import static org.apache.iceberg.stats.FieldStatistic.LOWER_BOUND;
+import static org.apache.iceberg.stats.FieldStatistic.MAX_VALUE_SIZE;
+import static org.apache.iceberg.stats.FieldStatistic.NAN_VALUE_COUNT;
+import static org.apache.iceberg.stats.FieldStatistic.NULL_VALUE_COUNT;
+import static org.apache.iceberg.stats.FieldStatistic.UPPER_BOUND;
+import static org.apache.iceberg.stats.FieldStatistic.VALUE_COUNT;
 import static org.apache.iceberg.types.Types.NestedField.optional;
 import static org.apache.iceberg.types.Types.NestedField.required;
 import static org.assertj.core.api.Assertions.assertThat;
 
+import java.util.List;
 import java.util.concurrent.ThreadLocalRandom;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.types.Types;
@@ -139,37 +148,38 @@ public class TestStatsUtil {
 
   @Test
   public void contentStatsForSimpleSchema() {
-    Schema schema =
-        new Schema(
-            required(0, "i", Types.IntegerType.get()),
-            required(2, "f", Types.FloatType.get()),
-            required(4, "s", Types.StringType.get()),
-            required(6, "b", Types.BooleanType.get()),
-            required(1_000_000, "u", Types.UUIDType.get()));
+    Types.NestedField intField = required(0, "i", Types.IntegerType.get());
+    Types.NestedField floatField = required(2, "f", Types.FloatType.get());
+    Types.NestedField stringField = required(4, "s", Types.StringType.get());
+    Types.NestedField booleanField = required(6, "b", Types.BooleanType.get());
+    Types.NestedField uuidField = required(1_000_000, "u", 
Types.UUIDType.get());
+    Schema schema = new Schema(intField, floatField, stringField, 
booleanField, uuidField);
     Schema expectedStatsSchema =
         new Schema(
             optional(
                 146,
                 "content_stats",
                 Types.StructType.of(
-                    optional(
-                        10000, "0", 
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10000)),
-                    optional(
-                        10400, "2", 
FieldStatistic.fieldStatsFor(Types.FloatType.get(), 10400)),
-                    optional(
-                        10800, "4", 
FieldStatistic.fieldStatsFor(Types.StringType.get(), 10800)),
-                    optional(
-                        11200, "6", 
FieldStatistic.fieldStatsFor(Types.BooleanType.get(), 11200)),
+                    optional(10000, "0", 
FieldStatistic.fieldStatsFor(intField, 10000)),
+                    optional(10400, "2", 
FieldStatistic.fieldStatsFor(floatField, 10400)),
+                    optional(10800, "4", 
FieldStatistic.fieldStatsFor(stringField, 10800)),
+                    optional(11200, "6", 
FieldStatistic.fieldStatsFor(booleanField, 11200)),
                     optional(
                         200010000,
                         "1000000",
-                        FieldStatistic.fieldStatsFor(Types.UUIDType.get(), 
200010000)))));
+                        FieldStatistic.fieldStatsFor(uuidField, 200010000)))));
     Schema statsSchema = new Schema(StatsUtil.contentStatsFor(schema));
     
assertThat(statsSchema.asStruct()).isEqualTo(expectedStatsSchema.asStruct());
   }
 
   @Test
   public void contentStatsForComplexSchema() {
+    Types.NestedField listElement = optional(3, "element", 
Types.IntegerType.get());
+    Types.NestedField structInt = optional(7, "int", Types.IntegerType.get());
+    Types.NestedField structString = optional(8, "string", 
Types.StringType.get());
+    Types.NestedField mapKey = required(22, "key", Types.IntegerType.get());
+    Types.NestedField mapValue = optional(24, "value", Types.StringType.get());
+    Types.NestedField uuidField = required(100_000, "u", Types.UUIDType.get());
     Schema schema =
         new Schema(
             required(0, "i", Types.IntegerType.get()),
@@ -185,7 +195,7 @@ public class TestStatsUtil {
                 "b",
                 Types.MapType.ofOptional(22, 24, Types.IntegerType.get(), 
Types.StringType.get())),
             required(30, "variant", Types.VariantType.get()),
-            required(100_000, "u", Types.UUIDType.get()));
+            uuidField);
     Schema expectedStatsSchema =
         new Schema(
             optional(
@@ -193,22 +203,132 @@ public class TestStatsUtil {
                 "content_stats",
                 Types.StructType.of(
                     optional(
-                        10000, "0", 
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10000)),
-                    optional(
-                        10600, "3", 
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10600)),
+                        10000,
+                        "0",
+                        FieldStatistic.fieldStatsFor(
+                            required(0, "i", Types.IntegerType.get()), 10000)),
+                    optional(10600, "3", 
FieldStatistic.fieldStatsFor(listElement, 10600)),
+                    optional(11400, "7", 
FieldStatistic.fieldStatsFor(structInt, 11400)),
+                    optional(11600, "8", 
FieldStatistic.fieldStatsFor(structString, 11600)),
+                    optional(14400, "22", FieldStatistic.fieldStatsFor(mapKey, 
14400)),
+                    optional(14800, "24", 
FieldStatistic.fieldStatsFor(mapValue, 14800)),
                     optional(
-                        11400, "7", 
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 11400)),
-                    optional(
-                        11600, "8", 
FieldStatistic.fieldStatsFor(Types.StringType.get(), 11600)),
-                    optional(
-                        14400, "22", 
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 14400)),
-                    optional(
-                        14800, "24", 
FieldStatistic.fieldStatsFor(Types.StringType.get(), 14800)),
-                    optional(
-                        20010000,
-                        "100000",
-                        FieldStatistic.fieldStatsFor(Types.UUIDType.get(), 
20010000)))));
+                        20010000, "100000", 
FieldStatistic.fieldStatsFor(uuidField, 20010000)))));
     Schema statsSchema = new Schema(StatsUtil.contentStatsFor(schema));
     
assertThat(statsSchema.asStruct()).isEqualTo(expectedStatsSchema.asStruct());
   }
+
+  @Test
+  public void conditionalFieldInclusionForInteger() {
+    assertThat(
+            fieldStatsNames(
+                FieldStatistic.fieldStatsFor(required(1, "x", 
Types.IntegerType.get()), 10000)))
+        .containsExactly(
+            VALUE_COUNT.fieldName(),
+            LOWER_BOUND.fieldName(),
+            UPPER_BOUND.fieldName(),
+            EXACT_BOUNDS.fieldName())
+        .doesNotContain(
+            NULL_VALUE_COUNT.fieldName(),
+            NAN_VALUE_COUNT.fieldName(),
+            AVG_VALUE_SIZE.fieldName(),
+            MAX_VALUE_SIZE.fieldName());
+
+    assertThat(
+            fieldStatsNames(
+                FieldStatistic.fieldStatsFor(optional(1, "x", 
Types.IntegerType.get()), 10000)))
+        .containsExactly(
+            VALUE_COUNT.fieldName(),
+            NULL_VALUE_COUNT.fieldName(),
+            LOWER_BOUND.fieldName(),
+            UPPER_BOUND.fieldName(),
+            EXACT_BOUNDS.fieldName())
+        .doesNotContain(
+            NAN_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE.fieldName(), 
MAX_VALUE_SIZE.fieldName());
+  }
+
+  @Test
+  public void conditionalFieldInclusionForFloatAndDouble() {
+    assertThat(
+            fieldStatsNames(
+                FieldStatistic.fieldStatsFor(required(1, "x", 
Types.FloatType.get()), 10000)))
+        .containsExactly(
+            VALUE_COUNT.fieldName(),
+            NAN_VALUE_COUNT.fieldName(),
+            LOWER_BOUND.fieldName(),
+            UPPER_BOUND.fieldName(),
+            EXACT_BOUNDS.fieldName())
+        .doesNotContain(
+            NULL_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE.fieldName(), 
MAX_VALUE_SIZE.fieldName());
+
+    assertThat(
+            fieldStatsNames(
+                FieldStatistic.fieldStatsFor(optional(1, "x", 
Types.DoubleType.get()), 10000)))
+        .containsExactly(
+            VALUE_COUNT.fieldName(),
+            NULL_VALUE_COUNT.fieldName(),
+            NAN_VALUE_COUNT.fieldName(),
+            LOWER_BOUND.fieldName(),
+            UPPER_BOUND.fieldName(),
+            EXACT_BOUNDS.fieldName());
+  }
+
+  @Test
+  public void conditionalFieldInclusionForString() {
+    assertThat(
+            fieldStatsNames(
+                FieldStatistic.fieldStatsFor(required(1, "x", 
Types.StringType.get()), 10000)))
+        .containsExactly(
+            VALUE_COUNT.fieldName(),
+            AVG_VALUE_SIZE.fieldName(),
+            MAX_VALUE_SIZE.fieldName(),
+            LOWER_BOUND.fieldName(),
+            UPPER_BOUND.fieldName(),
+            EXACT_BOUNDS.fieldName())
+        .doesNotContain(NULL_VALUE_COUNT.fieldName(), 
NAN_VALUE_COUNT.fieldName());
+
+    assertThat(
+            fieldStatsNames(
+                FieldStatistic.fieldStatsFor(optional(1, "x", 
Types.StringType.get()), 10000)))
+        .containsExactly(
+            VALUE_COUNT.fieldName(),
+            NULL_VALUE_COUNT.fieldName(),
+            AVG_VALUE_SIZE.fieldName(),
+            MAX_VALUE_SIZE.fieldName(),
+            LOWER_BOUND.fieldName(),
+            UPPER_BOUND.fieldName(),
+            EXACT_BOUNDS.fieldName());
+  }
+
+  @Test
+  public void conditionalFieldInclusionForBinary() {
+    assertThat(
+            fieldStatsNames(
+                FieldStatistic.fieldStatsFor(optional(1, "x", 
Types.BinaryType.get()), 10000)))
+        .containsExactly(
+            VALUE_COUNT.fieldName(),
+            NULL_VALUE_COUNT.fieldName(),
+            AVG_VALUE_SIZE.fieldName(),
+            MAX_VALUE_SIZE.fieldName(),
+            LOWER_BOUND.fieldName(),
+            UPPER_BOUND.fieldName(),
+            EXACT_BOUNDS.fieldName())
+        .doesNotContain(NAN_VALUE_COUNT.fieldName());
+
+    assertThat(
+            fieldStatsNames(
+                FieldStatistic.fieldStatsFor(required(1, "x", 
Types.BinaryType.get()), 10000)))
+        .containsExactly(
+            VALUE_COUNT.fieldName(),
+            AVG_VALUE_SIZE.fieldName(),
+            MAX_VALUE_SIZE.fieldName(),
+            LOWER_BOUND.fieldName(),
+            UPPER_BOUND.fieldName(),
+            EXACT_BOUNDS.fieldName())
+        .doesNotContain(NULL_VALUE_COUNT.fieldName(), 
NAN_VALUE_COUNT.fieldName());
+  }
+
+  private List<String> fieldStatsNames(Types.StructType structType) {
+    return structType.fields().stream().map(Types.NestedField::name).toList();
+  }
 }
diff --git a/core/src/main/java/org/apache/iceberg/avro/SupportsIndexProjection.java b/core/src/main/java/org/apache/iceberg/avro/SupportsIndexProjection.java
index fa4ffa5aec..41b5d3e608 100644
--- a/core/src/main/java/org/apache/iceberg/avro/SupportsIndexProjection.java
+++ b/core/src/main/java/org/apache/iceberg/avro/SupportsIndexProjection.java
@@ -55,6 +55,11 @@ public abstract class SupportsIndexProjection implements 
StructLike, Serializabl
     }
   }
 
+  /** Constructor with a precomputed position mapping */
+  protected SupportsIndexProjection(int[] fromProjectionPos) {
+    this.fromProjectionPos = fromProjectionPos;
+  }
+
   /** Copy constructor */
   protected SupportsIndexProjection(SupportsIndexProjection toCopy) {
     this.fromProjectionPos = toCopy.fromProjectionPos;
diff --git a/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java b/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java
index 9e20f2ad25..be56c411b6 100644
--- a/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java
+++ b/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java
@@ -59,6 +59,7 @@ public class BaseContentStats implements ContentStats, 
Serializable {
       fieldStats.add(
           BaseFieldStats.builder()
               .fieldId(StatsUtil.fieldIdForStatsField(field.fieldId()))
+              .statsStruct(structType)
               .type(type)
               .build());
     }
@@ -243,6 +244,7 @@ public class BaseContentStats implements ContentStats, 
Serializable {
       return this;
     }
 
+    @SuppressWarnings("rawtypes")
     public BaseContentStats build() {
       Preconditions.checkArgument(
           null != statsStruct || null != schema, "Either stats struct or table 
schema must be set");
@@ -252,7 +254,21 @@ public class BaseContentStats implements ContentStats, 
Serializable {
         this.statsStruct = 
StatsUtil.contentStatsFor(schema).type().asStructType();
       }
 
-      return new BaseContentStats(statsStruct, stats);
+      List<FieldStats<?>> resolvedStats = 
Lists.newArrayListWithCapacity(stats.size());
+      for (FieldStats<?> stat : stats) {
+        int statsFieldId = StatsUtil.statsFieldIdForField(stat.fieldId());
+        Types.NestedField statsField = statsStruct.field(statsFieldId);
+        if (null != statsField && statsField.type().isStructType()) {
+          resolvedStats.add(
+              ((BaseFieldStats.Builder) BaseFieldStats.buildFrom(stat))
+                  .statsStruct(statsField.type().asStructType())
+                  .build());
+        } else {
+          resolvedStats.add(stat);
+        }
+      }
+
+      return new BaseContentStats(statsStruct, resolvedStats);
     }
   }
 }
diff --git a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java b/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
index f26294213c..470303179b 100644
--- a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
+++ b/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
@@ -18,16 +18,18 @@
  */
 package org.apache.iceberg.stats;
 
-import java.io.Serializable;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.util.Objects;
+import org.apache.iceberg.avro.SupportsIndexProjection;
 import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.ByteBuffers;
 
-public class BaseFieldStats<T> implements FieldStats<T>, Serializable {
+public class BaseFieldStats<T> extends SupportsIndexProjection implements 
FieldStats<T> {
+  private static final int[] IDENTITY_MAPPING = identityMapping();
   private final int fieldId;
   private final Type type;
   private final Long valueCount;
@@ -41,6 +43,7 @@ public class BaseFieldStats<T> implements FieldStats<T>, 
Serializable {
 
   private BaseFieldStats(
       int fieldId,
+      int[] fromProjectionPos,
       Type type,
       Long valueCount,
       Long nullValueCount,
@@ -50,6 +53,7 @@ public class BaseFieldStats<T> implements FieldStats<T>, 
Serializable {
       T lowerBound,
       T upperBound,
       boolean hasExactBounds) {
+    super(fromProjectionPos != null ? fromProjectionPos : IDENTITY_MAPPING);
     this.fieldId = fieldId;
     this.type = type;
     this.valueCount = valueCount;
@@ -62,6 +66,36 @@ public class BaseFieldStats<T> implements FieldStats<T>, 
Serializable {
     this.hasExactBounds = hasExactBounds;
   }
 
+  private static int[] identityMapping() {
+    int numStats = FieldStatistic.values().length;
+    int[] mapping = new int[numStats];
+    for (int i = 0; i < numStats; i++) {
+      mapping[i] = i;
+    }
+
+    return mapping;
+  }
+
+  /**
+   * Computes a position mapping from the column-specific stats struct to the full 8-field struct.
+   * Each entry maps a projected position to its base position (0-based) using the field ID offsets
+   * from the column's base stats field ID.
+   */
+  private static int[] projectionMapping(Types.StructType statsStruct, int 
dataFieldId) {
+    if (statsStruct == null) {
+      return null;
+    }
+
+    int baseStatsFieldId = StatsUtil.statsFieldIdForField(dataFieldId);
+    int[] mapping = new int[statsStruct.fields().size()];
+    for (int i = 0; i < mapping.length; i++) {
+      // offset is 1-based (matching FieldStatistic.offset()), position is 0-based
+      mapping[i] = statsStruct.fields().get(i).fieldId() - baseStatsFieldId - 1;
+    }
+
+    return mapping;
+  }
+
   @Override
   public int fieldId() {
     return fieldId;
@@ -144,12 +178,7 @@ public class BaseFieldStats<T> implements FieldStats<T>, 
Serializable {
   }
 
   @Override
-  public int size() {
-    return 8;
-  }
-
-  @Override
-  public <X> X get(int pos, Class<X> javaClass) {
+  protected <X> X internalGet(int pos, Class<X> javaClass) {
     return switch (FieldStatistic.fromPosition(pos)) {
       case VALUE_COUNT -> javaClass.cast(valueCount);
       case NULL_VALUE_COUNT -> javaClass.cast(nullValueCount);
@@ -164,7 +193,7 @@ public class BaseFieldStats<T> implements FieldStats<T>, 
Serializable {
   }
 
   @Override
-  public void set(int pos, Object value) {
+  protected <X> void internalSet(int pos, X value) {
     throw new UnsupportedOperationException("set() not supported");
   }
 
@@ -239,6 +268,7 @@ public class BaseFieldStats<T> implements FieldStats<T>, 
Serializable {
 
   public static class Builder<T> {
     private int fieldId;
+    private int[] fromProjectionPos;
     private Type type;
     private Long valueCount;
     private Long nullValueCount;
@@ -251,6 +281,11 @@ public class BaseFieldStats<T> implements FieldStats<T>, 
Serializable {
 
     private Builder() {}
 
+    public Builder<T> statsStruct(Types.StructType statsStruct) {
+      this.fromProjectionPos = projectionMapping(statsStruct, fieldId);
+      return this;
+    }
+
     public Builder<T> type(Type newType) {
       this.type = newType;
       return this;
@@ -333,6 +368,7 @@ public class BaseFieldStats<T> implements FieldStats<T>, 
Serializable {
 
       return new BaseFieldStats<>(
           fieldId,
+          fromProjectionPos,
           type,
           valueCount,
           nullValueCount,
diff --git a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java b/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
index d083e73065..6baff7dfe6 100644
--- a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
+++ b/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
@@ -241,34 +241,93 @@ public class TestContentStats {
   }
 
   @Test
-  public void setByPosition() {
+  public void setByPositionOptionalString() {
+    Schema tableSchema = new Schema(optional(1, "s", Types.StringType.get()));
+    Types.StructType rootStatsStruct = 
StatsUtil.contentStatsFor(tableSchema).type().asStructType();
+    Types.StructType statsStructForFieldId = 
rootStatsStruct.fields().get(0).type().asStructType();
+    assertThat(statsStructForFieldId.fields()).hasSize(7);
+
+    GenericRecord record = GenericRecord.create(statsStructForFieldId);
+    BaseFieldStats<String> fieldStats =
+        BaseFieldStats.<String>builder()
+            .type(Types.StringType.get())
+            .fieldId(1)
+            .valueCount(10L)
+            .nullValueCount(2L)
+            .avgValueSize(3)
+            .maxValueSize(10)
+            .lowerBound("aa")
+            .upperBound("zzz")
+            .hasExactBounds()
+            .build();
+
+    record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount());
+    record.setField(NULL_VALUE_COUNT.fieldName(), fieldStats.nullValueCount());
+    record.setField(AVG_VALUE_SIZE.fieldName(), fieldStats.avgValueSize());
+    record.setField(MAX_VALUE_SIZE.fieldName(), fieldStats.maxValueSize());
+    record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound());
+    record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound());
+    record.setField(EXACT_BOUNDS.fieldName(), fieldStats.hasExactBounds());
+
+    BaseContentStats stats = new BaseContentStats(rootStatsStruct);
+    stats.set(0, record);
+    assertThat(stats.fieldStats()).containsExactly(fieldStats);
+  }
+
+  @Test
+  public void setByPositionOptionalDouble() {
+    Schema tableSchema = new Schema(optional(1, "d", Types.DoubleType.get()));
+    Types.StructType rootStatsStruct = 
StatsUtil.contentStatsFor(tableSchema).type().asStructType();
+    Types.StructType statsStructForFieldId = 
rootStatsStruct.fields().get(0).type().asStructType();
+    assertThat(statsStructForFieldId.fields()).hasSize(6);
+
+    GenericRecord record = GenericRecord.create(statsStructForFieldId);
+    BaseFieldStats<Double> fieldStats =
+        BaseFieldStats.<Double>builder()
+            .type(Types.DoubleType.get())
+            .fieldId(1)
+            .valueCount(10L)
+            .nullValueCount(2L)
+            .nanValueCount(3L)
+            .lowerBound(5.0)
+            .upperBound(20.0)
+            .hasExactBounds()
+            .build();
+
+    record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount());
+    record.setField(NULL_VALUE_COUNT.fieldName(), fieldStats.nullValueCount());
+    record.setField(NAN_VALUE_COUNT.fieldName(), fieldStats.nanValueCount());
+    record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound());
+    record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound());
+    record.setField(EXACT_BOUNDS.fieldName(), fieldStats.hasExactBounds());
+
+    BaseContentStats stats = new BaseContentStats(rootStatsStruct);
+    stats.set(0, record);
+    assertThat(stats.fieldStats()).containsExactly(fieldStats);
+  }
+
+  @Test
+  public void setByPositionRequiredInteger() {
     Schema tableSchema = new Schema(required(1, "id", 
Types.IntegerType.get()));
     Types.StructType rootStatsStruct = 
StatsUtil.contentStatsFor(tableSchema).type().asStructType();
-    Types.StructType statsStructForIdField = 
rootStatsStruct.fields().get(0).type().asStructType();
+    Types.StructType statsStructForFieldId = 
rootStatsStruct.fields().get(0).type().asStructType();
+    assertThat(statsStructForFieldId.fields()).hasSize(4);
 
-    GenericRecord record = GenericRecord.create(statsStructForIdField);
+    GenericRecord record = GenericRecord.create(statsStructForFieldId);
     BaseFieldStats<Integer> fieldStats =
         BaseFieldStats.<Integer>builder()
             .type(Types.IntegerType.get())
             .fieldId(1)
             .valueCount(10L)
-            .nullValueCount(2L)
-            .nanValueCount(3L)
-            .avgValueSize(30)
-            .maxValueSize(70)
             .lowerBound(5)
             .upperBound(20)
             .hasExactBounds()
             .build();
 
-    record.set(VALUE_COUNT.position(), fieldStats.valueCount());
-    record.set(NULL_VALUE_COUNT.position(), fieldStats.nullValueCount());
-    record.set(NAN_VALUE_COUNT.position(), fieldStats.nanValueCount());
-    record.set(AVG_VALUE_SIZE.position(), fieldStats.avgValueSize());
-    record.set(MAX_VALUE_SIZE.position(), fieldStats.maxValueSize());
-    record.set(LOWER_BOUND.position(), fieldStats.lowerBound());
-    record.set(UPPER_BOUND.position(), fieldStats.upperBound());
-    record.set(EXACT_BOUNDS.position(), fieldStats.hasExactBounds());
+    record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount());
+    record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound());
+    record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound());
+    record.setField(EXACT_BOUNDS.fieldName(), fieldStats.hasExactBounds());
 
     // this is typically called by Avro reflection code
     BaseContentStats stats = new BaseContentStats(rootStatsStruct);
@@ -287,17 +346,17 @@ public class TestContentStats {
     BaseContentStats stats = new BaseContentStats(rootStatsStruct);
 
     // invalid lower bound
-    record.set(LOWER_BOUND.position(), 5.0);
+    record.setField(LOWER_BOUND.fieldName(), 5.0);
     assertThatThrownBy(() -> stats.set(0, record))
         .isInstanceOf(IllegalArgumentException.class)
         .hasMessage(
             "Invalid lower bound type, expected a subtype of class 
java.lang.Integer: java.lang.Double");
 
     // set valid lower bound so that upper bound is evaluated
-    record.set(LOWER_BOUND.position(), 5);
+    record.setField(LOWER_BOUND.fieldName(), 5);
 
     // invalid upper bound
-    record.set(UPPER_BOUND.position(), "20");
+    record.setField(UPPER_BOUND.fieldName(), "20");
     assertThatThrownBy(() -> stats.set(0, record))
         .isInstanceOf(IllegalArgumentException.class)
         .hasMessage(
diff --git a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java b/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
index ffd91efd8a..be5f316694 100644
--- a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
+++ b/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
@@ -211,8 +211,8 @@ public class TestFieldStats {
     assertThat(fieldStats.get(EXACT_BOUNDS.position(), 
Boolean.class)).isEqualTo(true);
 
     assertThatThrownBy(() -> assertThat(fieldStats.get(10, Long.class)))
-        .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Invalid statistic position: 10");
+        .isInstanceOf(ArrayIndexOutOfBoundsException.class)
+        .hasMessage("Index 10 out of bounds for length 8");
     assertThatThrownBy(() -> assertThat(fieldStats.get(VALUE_COUNT.position(), 
Double.class)))
         .isInstanceOf(ClassCastException.class)
         .hasMessage("Cannot cast java.lang.Long to java.lang.Double");

Reply via email to