This is an automated email from the ASF dual-hosted git repository. yihua pushed a commit to branch release-1.2.0 in repository https://gitbox.apache.org/repos/asf/hudi.git
commit 37df6e10f41ecf3fdd4fb8005d5f9040e629c08f Author: Rahil C <[email protected]> AuthorDate: Tue May 19 01:09:20 2026 -0700 fix(lance): fail fast when write schema contains VARIANT columns (#18775) Lance does not currently support VARIANT in its file format (https://lance.org/guide/data_types/#arrow-type-system). Without a guard, writes that include VARIANT-typed columns fail deep inside the Avro-to-Arrow conversion layer (LanceArrowUtils.toArrowSchema) with a cryptic error. Add a recursive Avro-schema walk in HoodieSparkLanceWriter that throws HoodieNotSupportedException up front with a user-friendly message naming the offending column path. Invoke it from HoodieSparkFileWriterFactory.newLanceFileWriter so every Lance write path (Spark DataSource, DeltaStreamer, bootstrap, async clustering, async compaction) is covered before any Arrow allocator is opened. Mirrors the existing VECTOR element-type guard in the same writer. --- .../io/storage/HoodieSparkFileWriterFactory.java | 1 + .../hudi/io/storage/HoodieSparkLanceWriter.java | 44 +++++++++++++ .../io/storage/TestHoodieSparkLanceWriter.java | 72 ++++++++++++++++++++++ 3 files changed, 117 insertions(+) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java index e8faef103e7c..7b3571d8df20 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java @@ -115,6 +115,7 @@ public class HoodieSparkFileWriterFactory extends HoodieFileWriterFactory { @Override protected HoodieFileWriter newLanceFileWriter(String instantTime, StoragePath path, HoodieConfig config, HoodieSchema schema, TaskContextSupplier taskContextSupplier) throws IOException { + HoodieSparkLanceWriter.validateNoVariantColumns(schema); boolean populateMetaFields = config.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS); StructType structType = HoodieInternalRowUtils.getCachedSchema(schema); boolean enableBloomFilter = enableBloomFilter(populateMetaFields, config); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java index fe109a1ffe60..3bdf12d9059b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.schema.HoodieSchema; +import org.apache.hudi.common.schema.HoodieSchemaField; import org.apache.hudi.common.schema.HoodieSchemaType; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieNotSupportedException; @@ -196,6 +197,49 @@ public class HoodieSparkLanceWriter extends HoodieBaseLanceWriter<InternalRow, U * <p>Only top-level fields are inspected; Hudi BLOB and VECTOR are top-level types in the Hudi * schema model. */ + /** + * Fail fast if the write schema contains any VARIANT-typed column. The Lance file format + * does not currently support VARIANT (see https://lance.org/guide/data_types/#arrow-type-system); + * without this guard the write would fail deep in the Avro-to-Arrow conversion layer with a + * cryptic error. Walks the schema recursively so nested VARIANT fields (inside records, unions, + * arrays, maps) are also caught. + */ + static void validateNoVariantColumns(HoodieSchema schema) { + checkNoVariant(schema, ""); + } + + private static void checkNoVariant(HoodieSchema schema, String path) { + HoodieSchemaType type = schema.getType(); + if (type == HoodieSchemaType.VARIANT) { + throw new HoodieNotSupportedException( + "Lance base-file format does not currently support VARIANT columns " + + "(see https://lance.org/guide/data_types/#arrow-type-system). " + + "Found VARIANT field at '" + (path.isEmpty() ? "<root>" : path) + "'. " + + "Use Parquet for tables with VARIANT columns."); + } + switch (type) { + case RECORD: + for (HoodieSchemaField f : schema.getFields()) { + String childPath = path.isEmpty() ? f.name() : path + "." + f.name(); + checkNoVariant(f.schema(), childPath); + } + break; + case UNION: + for (HoodieSchema branch : schema.getTypes()) { + checkNoVariant(branch, path); + } + break; + case ARRAY: + checkNoVariant(schema.getElementType(), path + "[]"); + break; + case MAP: + checkNoVariant(schema.getValueType(), path + ".<value>"); + break; + default: + // Primitive or BLOB / VECTOR — nothing to recurse into for VARIANT detection. + } + } + private static StructType enrichSparkSchemaForLance(StructType sparkSchema) { Map<Integer, HoodieSchema.Vector> vectorColumns = VectorConversionUtils.detectVectorColumnsFromMetadata(sparkSchema); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java index 2f425b2158e4..722833a7756c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java @@ -23,8 +23,12 @@ import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.schema.HoodieSchema; +import org.apache.hudi.common.schema.HoodieSchemaField; +import org.apache.hudi.common.schema.HoodieSchemaType; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.io.memory.HoodieArrowAllocator; import org.apache.hudi.storage.HoodieStorage; import org.apache.hudi.storage.StoragePath; @@ -57,6 +61,8 @@ import org.lance.file.LanceFileReader; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.List; import static org.apache.hudi.common.bloom.BloomFilterTypeCode.SIMPLE; @@ -567,4 +573,70 @@ public class TestHoodieSparkLanceWriter { return false; } } + + // ----- VARIANT-on-Lance guard tests ----- + + @Test + public void testValidateNoVariantColumns_noVariant_succeeds() { + HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, Arrays.asList( + HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT)), + HoodieSchemaField.of("name", HoodieSchema.create(HoodieSchemaType.STRING)))); + HoodieSparkLanceWriter.validateNoVariantColumns(record); + } + + @Test + public void testValidateNoVariantColumns_topLevelVariant_throws() { + HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, Collections.singletonList( + HoodieSchemaField.of("payload", HoodieSchema.createVariant()))); + HoodieNotSupportedException ex = assertThrows( + HoodieNotSupportedException.class, + () -> HoodieSparkLanceWriter.validateNoVariantColumns(record)); + assertTrue(ex.getMessage().contains("Lance")); + assertTrue(ex.getMessage().contains("VARIANT")); + assertTrue(ex.getMessage().contains("payload"), "Error should name the offending field: " + ex.getMessage()); + } + + @Test + public void testValidateNoVariantColumns_variantInNestedRecord_throws() { + HoodieSchema nested = HoodieSchema.createRecord("Nested", "ns", null, Collections.singletonList( + HoodieSchemaField.of("v", HoodieSchema.createVariant()))); + HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, Collections.singletonList( + HoodieSchemaField.of("inner", nested))); + HoodieNotSupportedException ex = assertThrows( + HoodieNotSupportedException.class, + () -> HoodieSparkLanceWriter.validateNoVariantColumns(record)); + assertTrue(ex.getMessage().contains("inner.v"), "Error should point at nested path: " + ex.getMessage()); + } + + @Test + public void testValidateNoVariantColumns_variantInArray_throws() { + HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, Collections.singletonList( + HoodieSchemaField.of("items", HoodieSchema.createArray(HoodieSchema.createVariant())))); + HoodieNotSupportedException ex = assertThrows( + HoodieNotSupportedException.class, + () -> HoodieSparkLanceWriter.validateNoVariantColumns(record)); + assertTrue(ex.getMessage().contains("items[]"), "Error should point at array element path: " + ex.getMessage()); + } + + @Test + public void testValidateNoVariantColumns_variantInMap_throws() { + HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, Collections.singletonList( + HoodieSchemaField.of("attrs", HoodieSchema.createMap(HoodieSchema.createVariant())))); + HoodieNotSupportedException ex = assertThrows( + HoodieNotSupportedException.class, + () -> HoodieSparkLanceWriter.validateNoVariantColumns(record)); + assertTrue(ex.getMessage().contains("attrs.<value>"), "Error should point at map value path: " + ex.getMessage()); + } + + @Test + public void testValidateNoVariantColumns_variantInNullableUnion_throws() { + HoodieSchema nullableVariant = HoodieSchema.createUnion( + HoodieSchema.NULL_SCHEMA, HoodieSchema.createVariant()); + HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, Collections.singletonList( + HoodieSchemaField.of("payload", nullableVariant))); + HoodieNotSupportedException ex = assertThrows( + HoodieNotSupportedException.class, + () -> HoodieSparkLanceWriter.validateNoVariantColumns(record)); + assertTrue(ex.getMessage().contains("payload"), "Error should name the field: " + ex.getMessage()); + } }
