This is an automated email from the ASF dual-hosted git repository.
huaxingao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
new b6b89268de Add variant type support to ParquetTypeVisitor (#14588)
b6b89268de is described below
commit b6b89268de26b119817db9ce20b540baa349c541
Author: Tamas Mate <[email protected]>
AuthorDate: Tue Nov 18 19:39:14 2025 +0100
Add variant type support to ParquetTypeVisitor (#14588)
* Parquet: Add variant type support to ParquetTypeVisitor
Implement variant(GroupType) method in ParquetTypeVisitor and all
subclasses to enable proper handling of Parquet variant logical types
during schema conversion and manipulation operations.
* Address review comments
- Replace variant spec version with constant
- Add variant tests
* Column id reordering
* Address review comments
---
.../apache/iceberg/parquet/ApplyNameMapping.java | 6 ++
.../apache/iceberg/parquet/MessageTypeToType.java | 5 ++
.../apache/iceberg/parquet/ParquetSchemaUtil.java | 5 ++
.../apache/iceberg/parquet/ParquetTypeVisitor.java | 12 +++
.../java/org/apache/iceberg/parquet/RemoveIds.java | 10 +++
.../iceberg/parquet/TestParquetSchemaUtil.java | 88 ++++++++++++++++++++++
6 files changed, 126 insertions(+)
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java b/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java
index 3ad11ab6e5..b6569a6eb6 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java
@@ -103,6 +103,12 @@ class ApplyNameMapping extends ParquetTypeVisitor<Type> {
return field == null ? primitive : primitive.withId(field.id());
}
+ @Override
+ public Type variant(GroupType variant) {
+ MappedField field = nameMapping.find(currentPath());
+ return field == null ? variant : variant.withId(field.id());
+ }
+
@Override
public void beforeField(Type type) {
fieldNames.push(type.getName());
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java
index 26ef6e468e..841777152e 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java
@@ -179,6 +179,11 @@ class MessageTypeToType extends ParquetTypeVisitor<Type> {
throw new UnsupportedOperationException("Cannot convert unknown primitive type: " + primitive);
}
+ @Override
+ public Type variant(GroupType variant) {
+ return Types.VariantType.get();
+ }
+
private static class ParquetLogicalTypeVisitor
implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<Type> {
private static final ParquetLogicalTypeVisitor INSTANCE = new ParquetLogicalTypeVisitor();
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java
index f4760738df..9a81626827 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java
@@ -217,6 +217,11 @@ public class ParquetSchemaUtil {
public Boolean primitive(PrimitiveType primitive) {
return primitive.getId() != null;
}
+
+ @Override
+ public Boolean variant(GroupType variant) {
+ return variant.getId() != null;
+ }
}
public static Type determineListElementType(GroupType array) {
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java
index 43c62a8c77..271d9e8bf8 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java
@@ -22,6 +22,7 @@ import java.util.Deque;
import java.util.List;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.variants.Variant;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
@@ -46,6 +47,9 @@ public class ParquetTypeVisitor<T> {
return visitList(group, visitor);
} else if (LogicalTypeAnnotation.mapType().equals(annotation)) {
return visitMap(group, visitor);
+ } else if (LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)
+ .equals(annotation)) {
+ return visitVariant(group, visitor);
}
return visitor.struct(group, visitFields(group, visitor));
@@ -168,6 +172,10 @@ public class ParquetTypeVisitor<T> {
}
}
+ private static <T> T visitVariant(GroupType variant, ParquetTypeVisitor<T> visitor) {
+ return visitor.variant(variant);
+ }
+
private static <T> List<T> visitFields(GroupType group, ParquetTypeVisitor<T> visitor) {
List<T> results = Lists.newArrayListWithExpectedSize(group.getFieldCount());
for (Type field : group.getFields()) {
@@ -202,6 +210,10 @@ public class ParquetTypeVisitor<T> {
return null;
}
+ public T variant(GroupType variant) {
+ return null;
+ }
+
public void beforeField(Type type) {
fieldNames.push(type.getName());
}
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java b/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java
index 37ead23a67..b1d1d91bd5 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java
@@ -77,6 +77,16 @@ public class RemoveIds extends ParquetTypeVisitor<Type> {
.named(primitive.getName());
}
+ @Override
+ public Type variant(GroupType variant) {
+ Types.GroupBuilder<GroupType> builder =
+ Types.buildGroup(variant.getRepetition()).as(variant.getLogicalTypeAnnotation());
+ for (Type field : variant.getFields()) {
+ builder.addField(field);
+ }
+ return builder.named(variant.getName());
+ }
+
public static MessageType removeIds(MessageType type) {
return (MessageType) ParquetTypeVisitor.visit(type, new RemoveIds());
}
diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java
index 84fbf2a7d9..1df904f13c 100644
--- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java
+++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java
@@ -28,7 +28,9 @@ import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
+import org.apache.iceberg.variants.Variant;
import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.PrimitiveType;
@@ -127,6 +129,37 @@ public class TestParquetSchemaUtil {
assertThat(messageTypeWithIdsFromNameMapping).isEqualTo(messageTypeWithIds);
}
+ @Test
+ public void testAssignIdsToVariantTypesByNameMapping() {
+ Types.StructType structType =
+ Types.StructType.of(
+ optional(30, "variant_col", Types.VariantType.get()),
+ optional(
+ 31,
+ "struct_with_variant",
+ Types.StructType.of(
+ Types.NestedField.required(32, "id", Types.IntegerType.get()),
+ Types.NestedField.optional(33, "data", Types.VariantType.get()))),
+ required(
+ 34, "list_of_variants", Types.ListType.ofOptional(35, Types.VariantType.get())),
+ required(
+ 36,
+ "map_with_variant_value",
+ Types.MapType.ofOptional(37, 38, Types.StringType.get(), Types.VariantType.get())));
+
+ Schema schema =
+ new Schema(
+ TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet)
+ .asStructType()
+ .fields());
+ NameMapping nameMapping = MappingUtil.create(schema);
+ MessageType messageTypeWithIds = ParquetSchemaUtil.convert(schema, "parquet_type");
+ MessageType messageTypeWithIdsFromNameMapping =
+ ParquetSchemaUtil.applyNameMapping(RemoveIds.removeIds(messageTypeWithIds), nameMapping);
+
+ assertThat(messageTypeWithIdsFromNameMapping).isEqualTo(messageTypeWithIds);
+ }
+
@Test
public void testSchemaConversionWithoutAssigningIds() {
MessageType messageType =
@@ -264,6 +297,47 @@ public class TestParquetSchemaUtil {
.isEqualTo(expectedSchema.asStruct());
}
+ @Test
+ public void testVariantTypesWithoutAssigningIds() {
+ MessageType messageType =
+ new MessageType(
+ "test",
+ variant(30, "variant_col_1", Repetition.OPTIONAL),
+ variant(null, "variant_col_2", Repetition.REQUIRED),
+ struct(
+ 31,
+ "struct_col_3",
+ Repetition.REQUIRED,
+ primitive(32, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED),
+ variant(null, "variant_field", Repetition.OPTIONAL)),
+ list(33, "list_col_6", Repetition.OPTIONAL, variant(34, "v", Repetition.OPTIONAL)),
+ map(
+ 35,
+ "map_col_6",
+ Repetition.REQUIRED,
+ primitive(36, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED),
+ variant(37, "v", Repetition.OPTIONAL)));
+
+ Schema expectedSchema =
+ new Schema(
+ optional(30, "variant_col_1", Types.VariantType.get()),
+ required(
+ 31,
+ "struct_col_3",
+ Types.StructType.of(required(32, "n1", Types.IntegerType.get()))),
+ optional(33, "list_col_6", Types.ListType.ofOptional(34, Types.VariantType.get())),
+ required(
+ 35,
+ "map_col_6",
+ Types.MapType.ofOptional(
+ 36, 37, Types.IntegerType.get(), Types.VariantType.get())));
+
+ Schema actualSchema = ParquetSchemaUtil.convertAndPrune(messageType);
+ assertThat(actualSchema.asStruct())
+ .as("Schema must match")
+ .isEqualTo(expectedSchema.asStruct());
+ }
+
@Test
public void testSchemaConversionForHiveStyleLists() {
String parquetSchemaString =
@@ -464,4 +538,18 @@ public class TestParquetSchemaUtil {
}
return builder.named(name);
}
+
+ private Type variant(Integer id, String name, Repetition repetition) {
+ GroupBuilder<GroupType> builder =
+ org.apache.parquet.schema.Types.buildGroup(repetition)
+ .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION))
+ .primitive(PrimitiveTypeName.BINARY, Repetition.REQUIRED)
+ .named("metadata")
+ .primitive(PrimitiveTypeName.BINARY, Repetition.REQUIRED)
+ .named("value");
+ if (id != null) {
+ builder.id(id);
+ }
+ return builder.named(name);
+ }
}