This is an automated email from the ASF dual-hosted git repository.
etudenhoefner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
     new ef4007964e Data, Parquet: Fix UUID ClassCastException when reading Parquet files with UUIDs (#14027)
ef4007964e is described below
commit ef4007964e364566d6b8ee4951dd3eb7ea70a8c9
Author: Andre Luis Anastacio <[email protected]>
AuthorDate: Wed Oct 15 12:05:40 2025 -0300
    Data, Parquet: Fix UUID ClassCastException when reading Parquet files with UUIDs (#14027)
---
.../iceberg/data/TestMetricsRowGroupFilter.java | 65 ++++++++++++++-
.../apache/iceberg/parquet/ParquetConversions.java | 2 +
.../parquet/TestDictionaryRowGroupFilter.java | 94 +++++++++++++++++++++-
3 files changed, 157 insertions(+), 4 deletions(-)
diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
index e12015d5eb..c871c25c93 100644
--- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
+++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
@@ -119,7 +119,8 @@ public class TestMetricsRowGroupFilter {
optional(14, "all_nans", DoubleType.get()),
optional(15, "some_nans", FloatType.get()),
optional(16, "no_nans", DoubleType.get()),
- optional(17, "some_double_nans", DoubleType.get()));
+ optional(17, "some_double_nans", DoubleType.get()),
+ optional(18, "uuid_col", Types.UUIDType.get()));
private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
Types.StructType.of(Types.NestedField.required(8, "_int_field",
IntegerType.get()));
@@ -137,7 +138,8 @@ public class TestMetricsRowGroupFilter {
optional(14, "_all_nans", Types.DoubleType.get()),
optional(15, "_some_nans", FloatType.get()),
optional(16, "_no_nans", Types.DoubleType.get()),
- optional(17, "_some_double_nans", Types.DoubleType.get()));
+ optional(17, "_some_double_nans", Types.DoubleType.get()),
+ optional(18, "_uuid_col", Types.UUIDType.get()));
private static final Schema VARIANT_SCHEMA =
new Schema(
@@ -157,6 +159,9 @@ public class TestMetricsRowGroupFilter {
private static final int INT_MIN_VALUE = 30;
private static final int INT_MAX_VALUE = 79;
+  private static final UUID UUID_WITH_ZEROS =
+      UUID.fromString("00000000-0000-0000-0000-000000000000");
+
private File orcFile = null;
private MessageType parquetSchema = null;
private BlockMetaData rowGroupMetadata = null;
@@ -210,6 +215,7 @@ public class TestMetricsRowGroupFilter {
GenericRecord structNotNull =
GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE);
structNotNull.setField("_int_field", INT_MIN_VALUE + i);
record.setField("_struct_not_null", structNotNull); // struct with int
+ record.setField("_uuid_col", (i % 2 == 0) ? UUID_WITH_ZEROS : null);
appender.add(record);
}
@@ -248,6 +254,8 @@ public class TestMetricsRowGroupFilter {
GenericRecord structNotNull =
GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE);
structNotNull.setField("_int_field", INT_MIN_VALUE + i);
builder.setField("_struct_not_null", structNotNull); // struct with int
+ builder.setField("_uuid_col", (i % 2 == 0) ? UUID_WITH_ZEROS : null);
+
records.add(builder);
}
@@ -1063,6 +1071,59 @@ public class TestMetricsRowGroupFilter {
}
}
+  @TestTemplate
+  public void testUUID() {
+    assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET);
+
+    UUID nonExistentUuid = UUID.fromString("99999999-9999-9999-9999-999999999999");
+
+    boolean shouldRead = shouldRead(equal("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should read: column contains the value").isTrue();
+
+    shouldRead = shouldRead(equal("uuid_col", nonExistentUuid));
+    assertThat(shouldRead).as("Should skip: column does not contain the value").isFalse();
+
+    shouldRead = shouldRead(notEqual("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should read: column contains nulls").isTrue();
+
+    shouldRead = shouldRead(notEqual("uuid_col", nonExistentUuid));
+    assertThat(shouldRead).as("Should read: column contains non-matching values").isTrue();
+
+    shouldRead = shouldRead(lessThan("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should skip: no values lower").isFalse();
+
+    shouldRead = shouldRead(lessThanOrEqual("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should read: column contains the value").isTrue();
+
+    shouldRead = shouldRead(greaterThan("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should skip: no values greater").isFalse();
+
+    shouldRead = shouldRead(greaterThanOrEqual("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should read: column contains the value").isTrue();
+
+    shouldRead = shouldRead(isNull("uuid_col"));
+    assertThat(shouldRead).as("Should read: column contains null values").isTrue();
+
+    shouldRead = shouldRead(notNull("uuid_col"));
+    assertThat(shouldRead).as("Should read: column contains non-null values").isTrue();
+
+    shouldRead = shouldRead(in("uuid_col", UUID_WITH_ZEROS, nonExistentUuid));
+    assertThat(shouldRead).as("Should read: column contains one of the values").isTrue();
+
+    shouldRead = shouldRead(in("uuid_col", nonExistentUuid));
+    assertThat(shouldRead).as("Should skip: column contains none of the values").isFalse();
+
+    shouldRead = shouldRead(notIn("uuid_col", nonExistentUuid));
+    assertThat(shouldRead)
+        .as("Should read: column contains values not in the exclusion list")
+        .isTrue();
+
+    shouldRead = shouldRead(notIn("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead)
+        .as("Should read: column contains null values not in the exclusion list")
+        .isTrue();
+  }
+
private boolean shouldRead(Expression expression) {
return shouldRead(expression, true);
}
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java
index 3a70198a1a..1e5ed1fb9b 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java
@@ -83,6 +83,8 @@ class ParquetConversions {
     } else if (icebergType.typeId() == Type.TypeID.DOUBLE
         && parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FLOAT) {
       return value -> ((Float) fromParquet.apply(value)).doubleValue();
+    } else if (icebergType.typeId() == Type.TypeID.UUID) {
+      return binary -> UUIDUtil.convert(((Binary) binary).toByteBuffer());
     }
}
diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
index ac6e41347d..48cc2b0992 100644
--- a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
+++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
@@ -111,7 +111,8 @@ public class TestDictionaryRowGroupFilter {
14,
"decimal_fixed",
DecimalType.of(20, 10)), // >18 precision to enforce
FIXED_LEN_BYTE_ARRAY
- optional(15, "_nans_and_nulls", DoubleType.get()));
+ optional(15, "_nans_and_nulls", DoubleType.get()),
+ optional(16, "uuid_col", Types.UUIDType.get()));
private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
Types.StructType.of(Types.NestedField.required(9, "_int_field",
IntegerType.get()));
@@ -133,7 +134,8 @@ public class TestDictionaryRowGroupFilter {
14,
"_decimal_fixed",
DecimalType.of(20, 10)), // >18 precision to enforce
FIXED_LEN_BYTE_ARRAY
- optional(15, "_nans_and_nulls", DoubleType.get()));
+ optional(15, "_nans_and_nulls", DoubleType.get()),
+ optional(16, "_uuid_col", Types.UUIDType.get()));
private static final String TOO_LONG_FOR_STATS;
@@ -153,6 +155,9 @@ public class TestDictionaryRowGroupFilter {
.subtract(DECIMAL_MIN_VALUE)
.divide(new BigDecimal(INT_MAX_VALUE - INT_MIN_VALUE),
RoundingMode.HALF_UP);
+  private static final UUID UUID_WITH_ZEROS =
+      UUID.fromString("00000000-0000-0000-0000-000000000000");
+
private MessageType parquetSchema = null;
private BlockMetaData rowGroupMetadata = null;
private DictionaryPageReadStore dictionaryStore = null;
@@ -203,6 +208,8 @@ public class TestDictionaryRowGroupFilter {
structNotNull.put("_int_field", INT_MIN_VALUE + i);
builder.set("_struct_not_null", structNotNull); // struct with int
+ builder.set("_uuid_col", (i % 2 == 0) ? UUID_WITH_ZEROS : null);
+
appender.add(builder.build());
}
}
@@ -1267,6 +1274,89 @@ public class TestDictionaryRowGroupFilter {
.isTrue();
}
+  @TestTemplate
+  public void testUUID() {
+    assumeThat(getColumnForName(rowGroupMetadata, "_uuid_col").getEncodings())
+        .contains(Encoding.RLE_DICTIONARY);
+
+    UUID nonExistentUuid = UUID.fromString("99999999-9999-9999-9999-999999999999");
+
+    boolean shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains the value").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should skip: column does not contain the value").isFalse();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains nulls").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("uuid_col", nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains non-matching values").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should skip: no uuid less than lower bound").isFalse();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should skip: no uuid greater than upper bound").isFalse();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("uuid_col"))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains null values").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("uuid_col"))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains non-null values").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(
+                SCHEMA, in("uuid_col", UUID_WITH_ZEROS, nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains one of the values").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, in("uuid_col", nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should skip: column contains none of the values").isFalse();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("uuid_col", nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead)
+        .as("Should read: column contains values not in the exclusion list")
+        .isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead)
+        .as("Should read: column contains null values not in the exclusion list")
+        .isTrue();
+  }
+
  private ColumnChunkMetaData getColumnForName(BlockMetaData rowGroup, String columnName) {
ColumnPath columnPath = ColumnPath.fromDotString(columnName);
for (ColumnChunkMetaData column : rowGroup.getColumns()) {