This is an automated email from the ASF dual-hosted git repository. etudenhoefner pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push: new c05f6c98d6 API: required nested fields within optional structs can produce null (#13804) c05f6c98d6 is described below commit c05f6c98d66a5a05ec5dca0b7a1c28036187317d Author: Dejan Gvozdenac <d.gvozdena...@gmail.com> AuthorDate: Fri Sep 19 02:53:17 2025 -0400 API: required nested fields within optional structs can produce null (#13804) --- api/src/main/java/org/apache/iceberg/Accessor.java | 5 + .../main/java/org/apache/iceberg/Accessors.java | 50 +++++++--- .../apache/iceberg/expressions/BoundReference.java | 4 +- .../iceberg/expressions/TestBoundReference.java | 108 +++++++++++++++++++++ .../iceberg/parquet/TestBloomRowGroupFilter.java | 8 +- .../org/apache/iceberg/spark/sql/TestSelect.java | 17 ++++ .../org/apache/iceberg/spark/sql/TestSelect.java | 17 ++++ .../org/apache/iceberg/spark/sql/TestSelect.java | 17 ++++ 8 files changed, 209 insertions(+), 17 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/Accessor.java b/api/src/main/java/org/apache/iceberg/Accessor.java index 2a20a04df9..20b09bf910 100644 --- a/api/src/main/java/org/apache/iceberg/Accessor.java +++ b/api/src/main/java/org/apache/iceberg/Accessor.java @@ -25,4 +25,9 @@ public interface Accessor<T> extends Serializable { Object get(T container); Type type(); + + /** Returns true if the current field or any ancestor in the access path is optional. */ + default boolean hasOptionalFieldInPath() { + return false; + } } diff --git a/api/src/main/java/org/apache/iceberg/Accessors.java b/api/src/main/java/org/apache/iceberg/Accessors.java index 0b36730fbb..06ee0a916c 100644 --- a/api/src/main/java/org/apache/iceberg/Accessors.java +++ b/api/src/main/java/org/apache/iceberg/Accessors.java @@ -59,11 +59,13 @@ public class Accessors { private final int position; private final Type type; private final Class<?> javaClass; + private final boolean hasOptionalFieldInPath; - PositionAccessor(int pos, Type type) { + PositionAccessor(int pos, Type type, boolean isOptional) { this.position = pos; this.type = type; this.javaClass = type.typeId().javaClass(); + this.hasOptionalFieldInPath = isOptional; } @Override @@ -84,6 +86,11 @@ public class Accessors { return javaClass; } + @Override + public boolean hasOptionalFieldInPath() { + return hasOptionalFieldInPath; + } + @Override public String toString() { return "Accessor(positions=[" + position + "], type=" + type + ")"; @@ -95,12 +102,14 @@ public class Accessors { private final int p1; private final Type type; private final Class<?> javaClass; + private final boolean hasOptionalFieldInPath; - Position2Accessor(int pos, PositionAccessor wrapped) { + Position2Accessor(int pos, PositionAccessor wrapped, boolean isOptional) { this.p0 = pos; this.p1 = wrapped.position(); this.type = wrapped.type(); this.javaClass = wrapped.javaClass(); + this.hasOptionalFieldInPath = isOptional || wrapped.hasOptionalFieldInPath(); } @Override @@ -117,6 +126,11 @@ public class Accessors { return javaClass; } + @Override + public boolean hasOptionalFieldInPath() { + return hasOptionalFieldInPath; + } + @Override public String toString() { return "Accessor(positions=[" + p0 + ", " + p1 + "], type=" + type + ")"; @@ -129,13 +143,15 @@ public class Accessors { private final int p2; private final Type type; private final Class<?> javaClass; + private final boolean hasOptionalFieldInPath; - Position3Accessor(int pos, Position2Accessor wrapped) { + Position3Accessor(int pos, Position2Accessor wrapped, boolean isOptional) { this.p0 = pos; this.p1 = wrapped.p0; this.p2 = wrapped.p1; this.type = wrapped.type(); this.javaClass = wrapped.javaClass(); + this.hasOptionalFieldInPath = isOptional || wrapped.hasOptionalFieldInPath(); } @Override @@ -148,6 +164,11 @@ public class Accessors { return type; } + @Override + public boolean hasOptionalFieldInPath() { + return hasOptionalFieldInPath; + } + @Override public String toString() { return "Accessor(positions=[" + p0 + ", " + p1 + ", " + p2 + "], type=" + type + ")"; @@ -157,10 +178,12 @@ public class Accessors { private static class WrappedPositionAccessor implements Accessor<StructLike> { private final int position; private final Accessor<StructLike> accessor; + private final boolean hasOptionalFieldInPath; - WrappedPositionAccessor(int pos, Accessor<StructLike> accessor) { + WrappedPositionAccessor(int pos, Accessor<StructLike> accessor, boolean isOptional) { this.position = pos; this.accessor = accessor; + this.hasOptionalFieldInPath = isOptional || accessor.hasOptionalFieldInPath(); } @Override @@ -177,27 +200,32 @@ public class Accessors { return accessor.type(); } + @Override + public boolean hasOptionalFieldInPath() { + return hasOptionalFieldInPath; + } + @Override public String toString() { return "WrappedAccessor(position=" + position + ", wrapped=" + accessor + ")"; } } - private static Accessor<StructLike> newAccessor(int pos, Type type) { - return new PositionAccessor(pos, type); + private static Accessor<StructLike> newAccessor(int pos, boolean isOptional, Type type) { + return new PositionAccessor(pos, type, isOptional); } private static Accessor<StructLike> newAccessor( int pos, boolean isOptional, Accessor<StructLike> accessor) { if (isOptional) { // the wrapped position handles null layers - return new WrappedPositionAccessor(pos, accessor); + return new WrappedPositionAccessor(pos, accessor, isOptional); } else if (accessor.getClass() == PositionAccessor.class) { - return new Position2Accessor(pos, (PositionAccessor) accessor); + return new Position2Accessor(pos, (PositionAccessor) accessor, isOptional); } else if (accessor instanceof Position2Accessor) { - return new Position3Accessor(pos, (Position2Accessor) accessor); + return new Position3Accessor(pos, (Position2Accessor) accessor, isOptional); } else { - return new WrappedPositionAccessor(pos, accessor); + return new WrappedPositionAccessor(pos, accessor, isOptional); } } @@ -226,7 +254,7 @@ public class Accessors { } // Add an accessor for this field as an Object (may or may not be primitive). - accessors.put(field.fieldId(), newAccessor(i, field.type())); + accessors.put(field.fieldId(), newAccessor(i, field.isOptional(), field.type())); } return accessors; diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundReference.java b/api/src/main/java/org/apache/iceberg/expressions/BoundReference.java index 0295fe518f..decda85f2e 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundReference.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundReference.java @@ -57,7 +57,9 @@ public class BoundReference<T> implements BoundTerm<T>, Reference<T> { @Override public boolean producesNull() { - return field.isOptional(); + // A leaf required field can evaluate to null if it is optional itself or any + // ancestor on the path is optional. + return accessor.hasOptionalFieldInPath(); } @Override diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestBoundReference.java b/api/src/test/java/org/apache/iceberg/expressions/TestBoundReference.java new file mode 100644 index 0000000000..ed921b248f --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/expressions/TestBoundReference.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.expressions; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Stream; +import org.apache.iceberg.Accessor; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +public class TestBoundReference { + // Build a schema with a single nested struct with optionalList.size() levels with the following + // structure: + // s1: struct(s2: struct(s3: struct(..., sn: struct(leaf: int)))) + // where each s{i} is an optional struct if optionalList.get(i) is true and a required struct if + // false + private static Schema buildSchemaFromOptionalList(List<Boolean> optionalList, String leafName) { + Preconditions.checkArgument( + optionalList != null && !optionalList.isEmpty(), "optionalList must not be null or empty"); + Types.NestedField leaf = + optionalList.get(optionalList.size() - 1) + ? optional(optionalList.size(), leafName, Types.IntegerType.get()) + : required(optionalList.size(), leafName, Types.IntegerType.get()); + + Types.StructType current = Types.StructType.of(leaf); + + for (int i = optionalList.size() - 2; i >= 0; i--) { + int id = i + 1; + String name = "s" + (i + 1); + current = + Types.StructType.of( + optionalList.get(i) ? optional(id, name, current) : required(id, name, current)); + } + + return new Schema(current.fields()); + } + + private static Stream<Arguments> producesNullCases() { + // the test cases specify two arguments: + // - the first is a list of booleans that indicate whether fields in the nested sequence of + // structs are optional or required. For example, [false, true, false] will construct a + // struct like s1.s2.s3 with s1 being required, s2 being optional, and s3 being required. + // - the second is a boolean that indicates whether calling producesNull() on the BoundReference + // of the leaf field should return true or false. + return Stream.of( + // basic fields, no struct levels + Arguments.of(Arrays.asList(false), false), + Arguments.of(Arrays.asList(true), true), + // one level + Arguments.of(Arrays.asList(false, false), false), + Arguments.of(Arrays.asList(false, true), true), + Arguments.of(Arrays.asList(true, false), true), + // two levels + Arguments.of(Arrays.asList(false, false, false), false), + Arguments.of(Arrays.asList(false, false, true), true), + Arguments.of(Arrays.asList(true, false, false), true), + Arguments.of(Arrays.asList(false, true, false), true), + // three levels + Arguments.of(Arrays.asList(false, false, false, false), false), + Arguments.of(Arrays.asList(false, false, false, true), true), + Arguments.of(Arrays.asList(true, false, false, false), true), + Arguments.of(Arrays.asList(false, true, false, false), true), + // four levels + Arguments.of(Arrays.asList(false, false, false, false, false), false), + Arguments.of(Arrays.asList(false, false, false, false, true), true), + Arguments.of(Arrays.asList(true, false, false, false, false), true), + Arguments.of(Arrays.asList(false, true, true, true, false), true)); + } + + @ParameterizedTest + @MethodSource("producesNullCases") + public void testProducesNull(List<Boolean> optionalList, boolean expectedProducesNull) { + String leafName = "leaf"; + Schema schema = buildSchemaFromOptionalList(optionalList, leafName); + int leafId = optionalList.size(); + Types.NestedField leafField = schema.findField(leafId); + Accessor<StructLike> accessor = schema.accessorForField(leafId); + + BoundReference<Integer> ref = new BoundReference<>(leafField, accessor, leafName); + assertThat(ref.producesNull()).isEqualTo(expectedProducesNull); + } +} diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java index f54ddfcc5a..694c72876c 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java @@ -297,9 +297,7 @@ public class TestBloomRowGroupFilter { shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notNull("struct_not_null.int_field")) .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); - assertThat(shouldRead) - .as("Should read: this field is required and are always not-null") - .isTrue(); + assertThat(shouldRead).as("Should read: bloom filter doesn't help").isTrue(); } @Test @@ -323,8 +321,8 @@ public class TestBloomRowGroupFilter { new ParquetBloomRowGroupFilter(SCHEMA, isNull("struct_not_null.int_field")) .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); assertThat(shouldRead) - .as("Should skip: this field is required and are always not-null") - .isFalse(); + .as("Should read: required nested field can still be null if any ancestor is optional") + .isTrue(); } @Test diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java index ab15139820..a21dfd388d 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java @@ -613,4 +613,21 @@ public class TestSelect extends CatalogTestBase { assertEquals("Should return all expected rows", ImmutableList.of(row(1)), result); sql("DROP TABLE IF EXISTS %s", complexTypeTableName); } + + @TestTemplate + public void testRequiredNestedFieldInOptionalStructFilter() { + String nestedStructTable = tableName("nested_struct_table"); + sql( + "CREATE TABLE %s (id INT NOT NULL, address STRUCT<street: STRING NOT NULL>) " + + "USING iceberg", + nestedStructTable); + sql("INSERT INTO %s VALUES (0, NULL)", nestedStructTable); + sql("INSERT INTO %s VALUES (1, STRUCT('123 Main St'))", nestedStructTable); + + List<Object[]> result = + sql("SELECT id FROM %s WHERE address.street IS NULL", nestedStructTable); + + assertEquals("Should return all expected rows", ImmutableList.of(row(0)), result); + sql("DROP TABLE IF EXISTS %s", nestedStructTable); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java index 240622750f..68b93be479 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java @@ -613,4 +613,21 @@ public class TestSelect extends CatalogTestBase { assertEquals("Should return all expected rows", ImmutableList.of(row(1)), result); sql("DROP TABLE IF EXISTS %s", complexTypeTableName); } + + @TestTemplate + public void testRequiredNestedFieldInOptionalStructFilter() { + String nestedStructTable = tableName("nested_struct_table"); + sql( + "CREATE TABLE %s (id INT NOT NULL, address STRUCT<street: STRING NOT NULL>) " + + "USING iceberg", + nestedStructTable); + sql("INSERT INTO %s VALUES (0, NULL)", nestedStructTable); + sql("INSERT INTO %s VALUES (1, STRUCT('123 Main St'))", nestedStructTable); + + List<Object[]> result = + sql("SELECT id FROM %s WHERE address.street IS NULL", nestedStructTable); + + assertEquals("Should return all expected rows", ImmutableList.of(row(0)), result); + sql("DROP TABLE IF EXISTS %s", nestedStructTable); + } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java index 240622750f..68b93be479 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java @@ -613,4 +613,21 @@ public class TestSelect extends CatalogTestBase { assertEquals("Should return all expected rows", ImmutableList.of(row(1)), result); sql("DROP TABLE IF EXISTS %s", complexTypeTableName); } + + @TestTemplate + public void testRequiredNestedFieldInOptionalStructFilter() { + String nestedStructTable = tableName("nested_struct_table"); + sql( + "CREATE TABLE %s (id INT NOT NULL, address STRUCT<street: STRING NOT NULL>) " + + "USING iceberg", + nestedStructTable); + sql("INSERT INTO %s VALUES (0, NULL)", nestedStructTable); + sql("INSERT INTO %s VALUES (1, STRUCT('123 Main St'))", nestedStructTable); + + List<Object[]> result = + sql("SELECT id FROM %s WHERE address.street IS NULL", nestedStructTable); + + assertEquals("Should return all expected rows", ImmutableList.of(row(0)), result); + sql("DROP TABLE IF EXISTS %s", nestedStructTable); + } }