This is an automated email from the ASF dual-hosted git repository.

voonhous pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 162cac2910ee fix(lance): fail fast when write schema contains VARIANT columns (#18775)
162cac2910ee is described below

commit 162cac2910ee299fb112a8604a54553082481b7c
Author: Rahil C <[email protected]>
AuthorDate: Tue May 19 01:09:20 2026 -0700

    fix(lance): fail fast when write schema contains VARIANT columns (#18775)
    
    Lance does not currently support VARIANT in its file format
    (https://lance.org/guide/data_types/#arrow-type-system). Without a
    guard, writes that include VARIANT-typed columns fail deep inside the
    Avro-to-Arrow conversion layer (LanceArrowUtils.toArrowSchema) with a
    cryptic error.
    
    Add a recursive Avro-schema walk in HoodieSparkLanceWriter that throws
    HoodieNotSupportedException up front with a user-friendly message
    naming the offending column path. Invoke it from
    HoodieSparkFileWriterFactory.newLanceFileWriter so every Lance write
    path (Spark DataSource, DeltaStreamer, bootstrap, async clustering,
    async compaction) is covered before any Arrow allocator is opened.
    
    Mirrors the existing VECTOR element-type guard in the same writer.
---
 .../io/storage/HoodieSparkFileWriterFactory.java   |  1 +
 .../hudi/io/storage/HoodieSparkLanceWriter.java    | 44 +++++++++++++
 .../io/storage/TestHoodieSparkLanceWriter.java     | 72 ++++++++++++++++++++++
 3 files changed, 117 insertions(+)

diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
index e8faef103e7c..7b3571d8df20 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
@@ -115,6 +115,7 @@ public class HoodieSparkFileWriterFactory extends HoodieFileWriterFactory {
   @Override
   protected HoodieFileWriter newLanceFileWriter(String instantTime, 
StoragePath path, HoodieConfig config, HoodieSchema schema,
                                                 TaskContextSupplier 
taskContextSupplier) throws IOException {
+    HoodieSparkLanceWriter.validateNoVariantColumns(schema);
     boolean populateMetaFields = 
config.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS);
     StructType structType = HoodieInternalRowUtils.getCachedSchema(schema);
     boolean enableBloomFilter = enableBloomFilter(populateMetaFields, config);
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java
index fe109a1ffe60..3bdf12d9059b 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java
@@ -24,6 +24,7 @@ import org.apache.hudi.common.engine.TaskContextSupplier;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.schema.HoodieSchema;
+import org.apache.hudi.common.schema.HoodieSchemaField;
 import org.apache.hudi.common.schema.HoodieSchemaType;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.exception.HoodieNotSupportedException;
@@ -196,6 +197,49 @@ public class HoodieSparkLanceWriter extends HoodieBaseLanceWriter<InternalRow, U
    * <p>Only top-level fields are inspected; Hudi BLOB and VECTOR are top-level types in the Hudi
    * schema model.
    */
+  /**
+   * Fail fast if the write schema contains any VARIANT-typed column. The Lance file format
+   * does not currently support VARIANT (see https://lance.org/guide/data_types/#arrow-type-system);
+   * without this guard the write would fail deep in the Avro-to-Arrow conversion layer with a
+   * cryptic error. Walks the schema recursively so nested VARIANT fields (inside records, unions,
+   * arrays, maps) are also caught.
+   */
+  static void validateNoVariantColumns(HoodieSchema schema) {
+    checkNoVariant(schema, "");
+  }
+
+  private static void checkNoVariant(HoodieSchema schema, String path) {
+    HoodieSchemaType type = schema.getType();
+    if (type == HoodieSchemaType.VARIANT) {
+      throw new HoodieNotSupportedException(
+          "Lance base-file format does not currently support VARIANT columns "
+              + "(see https://lance.org/guide/data_types/#arrow-type-system). "
+              + "Found VARIANT field at '" + (path.isEmpty() ? "<root>" : 
path) + "'. "
+              + "Use Parquet for tables with VARIANT columns.");
+    }
+    switch (type) {
+      case RECORD:
+        for (HoodieSchemaField f : schema.getFields()) {
+          String childPath = path.isEmpty() ? f.name() : path + "." + f.name();
+          checkNoVariant(f.schema(), childPath);
+        }
+        break;
+      case UNION:
+        for (HoodieSchema branch : schema.getTypes()) {
+          checkNoVariant(branch, path);
+        }
+        break;
+      case ARRAY:
+        checkNoVariant(schema.getElementType(), path + "[]");
+        break;
+      case MAP:
+        checkNoVariant(schema.getValueType(), path + ".<value>");
+        break;
+      default:
+        // Primitive or BLOB / VECTOR — nothing to recurse into for VARIANT detection.
+    }
+  }
+
   private static StructType enrichSparkSchemaForLance(StructType sparkSchema) {
     Map<Integer, HoodieSchema.Vector> vectorColumns =
         VectorConversionUtils.detectVectorColumnsFromMetadata(sparkSchema);
diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java
index 2f425b2158e4..722833a7756c 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java
+++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java
@@ -23,8 +23,12 @@ import org.apache.hudi.common.bloom.BloomFilter;
 import org.apache.hudi.common.bloom.BloomFilterFactory;
 import org.apache.hudi.common.config.HoodieStorageConfig;
 import org.apache.hudi.common.model.HoodieKey;
+import org.apache.hudi.common.schema.HoodieSchema;
+import org.apache.hudi.common.schema.HoodieSchemaField;
+import org.apache.hudi.common.schema.HoodieSchemaType;
 import org.apache.hudi.common.testutils.HoodieTestUtils;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieNotSupportedException;
 import org.apache.hudi.io.memory.HoodieArrowAllocator;
 import org.apache.hudi.storage.HoodieStorage;
 import org.apache.hudi.storage.StoragePath;
@@ -57,6 +61,8 @@ import org.lance.file.LanceFileReader;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
 import static org.apache.hudi.common.bloom.BloomFilterTypeCode.SIMPLE;
@@ -567,4 +573,70 @@ public class TestHoodieSparkLanceWriter {
       return false;
     }
   }
+
+  // ----- VARIANT-on-Lance guard tests -----
+
+  @Test
+  public void testValidateNoVariantColumns_noVariant_succeeds() {
+    HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, 
Arrays.asList(
+        HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT)),
+        HoodieSchemaField.of("name", 
HoodieSchema.create(HoodieSchemaType.STRING))));
+    HoodieSparkLanceWriter.validateNoVariantColumns(record);
+  }
+
+  @Test
+  public void testValidateNoVariantColumns_topLevelVariant_throws() {
+    HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, 
Collections.singletonList(
+        HoodieSchemaField.of("payload", HoodieSchema.createVariant())));
+    HoodieNotSupportedException ex = assertThrows(
+        HoodieNotSupportedException.class,
+        () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+    assertTrue(ex.getMessage().contains("Lance"));
+    assertTrue(ex.getMessage().contains("VARIANT"));
+    assertTrue(ex.getMessage().contains("payload"), "Error should name the 
offending field: " + ex.getMessage());
+  }
+
+  @Test
+  public void testValidateNoVariantColumns_variantInNestedRecord_throws() {
+    HoodieSchema nested = HoodieSchema.createRecord("Nested", "ns", null, 
Collections.singletonList(
+        HoodieSchemaField.of("v", HoodieSchema.createVariant())));
+    HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, 
Collections.singletonList(
+        HoodieSchemaField.of("inner", nested)));
+    HoodieNotSupportedException ex = assertThrows(
+        HoodieNotSupportedException.class,
+        () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+    assertTrue(ex.getMessage().contains("inner.v"), "Error should point at 
nested path: " + ex.getMessage());
+  }
+
+  @Test
+  public void testValidateNoVariantColumns_variantInArray_throws() {
+    HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, 
Collections.singletonList(
+        HoodieSchemaField.of("items", 
HoodieSchema.createArray(HoodieSchema.createVariant()))));
+    HoodieNotSupportedException ex = assertThrows(
+        HoodieNotSupportedException.class,
+        () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+    assertTrue(ex.getMessage().contains("items[]"), "Error should point at 
array element path: " + ex.getMessage());
+  }
+
+  @Test
+  public void testValidateNoVariantColumns_variantInMap_throws() {
+    HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, 
Collections.singletonList(
+        HoodieSchemaField.of("attrs", 
HoodieSchema.createMap(HoodieSchema.createVariant()))));
+    HoodieNotSupportedException ex = assertThrows(
+        HoodieNotSupportedException.class,
+        () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+    assertTrue(ex.getMessage().contains("attrs.<value>"), "Error should point 
at map value path: " + ex.getMessage());
+  }
+
+  @Test
+  public void testValidateNoVariantColumns_variantInNullableUnion_throws() {
+    HoodieSchema nullableVariant = HoodieSchema.createUnion(
+        HoodieSchema.NULL_SCHEMA, HoodieSchema.createVariant());
+    HoodieSchema record = HoodieSchema.createRecord("R", "ns", null, 
Collections.singletonList(
+        HoodieSchemaField.of("payload", nullableVariant)));
+    HoodieNotSupportedException ex = assertThrows(
+        HoodieNotSupportedException.class,
+        () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+    assertTrue(ex.getMessage().contains("payload"), "Error should name the 
field: " + ex.getMessage());
+  }
 }

Reply via email to