voonhous commented on code in PR #17833:
URL: https://github.com/apache/hudi/pull/17833#discussion_r2683288544
########## hudi-spark-datasource/hudi-spark4.0.x/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowParquetWriteSupportVariant.java: ##########
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage.row;
+
+import org.apache.hudi.common.config.HoodieConfig;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.schema.HoodieSchema;
+import org.apache.hudi.common.schema.HoodieSchemaField;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.config.HoodieWriteConfig;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.Type;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.unsafe.types.VariantVal;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
+import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
+import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+
+/**
+ * Tests for Variant type support in {@link HoodieRowParquetWriteSupport}.
+ * Verifies that Spark VariantType data is correctly written to Parquet groups with 'metadata' and 'value' binary fields,
+ * respecting shredded vs unshredded schemas.
+ */
+public class TestHoodieRowParquetWriteSupportVariant {
+
+  @TempDir
+  public File tempDir;
+
+  /**
+   * Tests that an Unshredded Variant (defined by HoodieSchema) is written as:
+   * <pre>
+   * group {
+   *   required binary metadata;
+   *   required binary value;
+   * }
+   * </pre>
+   */
+  @Test
+  public void testWriteUnshreddedVariant() throws IOException {

Review Comment:
   These tests aren't really meaningful; they pass without the variant read + write code changes anyway.
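   For reference, a rough sketch of an end-to-end check that would only pass with both the variant write and read changes in place (not from the PR — the path, table options, and payload are illustrative):

   ```scala
   // Hypothetical round trip: write a variant through the datasource, read it back.
   // parse_json / to_json are the Spark 4.0 entry points for the variant type.
   val basePath = "/tmp/variant_round_trip" // illustrative location
   spark.sql("select 1 as id, parse_json('{\"a\": 1}') as v")
     .write.format("hudi")
     .option("hoodie.table.name", "variant_round_trip")
     .option("hoodie.datasource.write.recordkey.field", "id")
     .mode("overwrite")
     .save(basePath)

   val readBack = spark.read.format("hudi").load(basePath)
     .selectExpr("to_json(v)").head().getString(0)
   assert(readBack == """{"a":1}""") // fails if the variant bytes are not round-tripped
   ```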
########## hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/feature/index/TestExpressionIndex.scala: ##########
@@ -159,7 +159,7 @@ class TestExpressionIndex extends HoodieSparkSqlTestBase with SparkAdapterSuppor
   }
 
   test("Test Create Expression Index Syntax") {
-    withTempDir { tmp =>
+    withTempDir { tmp =>2

Review Comment:
   Typo that I added by mistake; will remove it.

########## hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala: ##########
@@ -46,7 +61,7 @@ object HoodieParquetFileFormatHelper {
       val requiredType = f.dataType
       if (fileStructMap.contains(f.name) && !isDataTypeEqual(requiredType, fileStructMap(f.name))) {
         val readerType = addMissingFields(requiredType, fileStructMap(f.name))
-        implicitTypeChangeInfo.put(new Integer(requiredSchema.fieldIndex(f.name)), org.apache.hudi.common.util.collection.Pair.of(requiredType, readerType))
+        implicitTypeChangeInfo.put(Integer.valueOf(requiredSchema.fieldIndex(f.name)), org.apache.hudi.common.util.collection.Pair.of(requiredType, readerType))

Review Comment:
   `new Integer` has been deprecated since Java 9 and is marked for removal. Using `Integer#valueOf` instead. This could be a separate PR.

########## hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/schema/TestVariantDataType.scala: ##########
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.hudi.dml.schema
+
+import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase
+
+class TestVariantDataType extends HoodieSparkSqlTestBase {
+
+  test("Test COW Table with Variant Data Type") {

Review Comment:
   Let's add a MOR test too, to see if anything is missing.
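   A rough sketch of what such a MOR test could look like, assuming the same `HoodieSparkSqlTestBase` helpers (`generateTableName`, `withTempDir`, `checkAnswer`) used by the COW test; exact table properties may need adjusting:

   ```scala
   test("Test MOR Table with Variant Data Type") {
     withTempDir { tmp =>
       val tableName = generateTableName
       spark.sql(
         s"""
            |create table $tableName (
            |  id int,
            |  v variant,
            |  ts long
            |) using hudi
            |tblproperties (type = 'mor', primaryKey = 'id', preCombineField = 'ts')
            |location '${tmp.getCanonicalPath}/$tableName'
            |""".stripMargin)
       spark.sql(s"insert into $tableName select 1, parse_json('{\"a\": 1}'), 1000")
       // an update creates log files, so the merged (MOR) read path gets exercised
       spark.sql(s"update $tableName set v = parse_json('{\"a\": 2}'), ts = 1001 where id = 1")
       checkAnswer(s"select id, to_json(v) from $tableName")(
         Seq(1, """{"a":2}"""))
     }
   }
   ```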
########## hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderOnSparkVariant.scala: ##########
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.common.table.read
+
+import org.apache.hudi.SparkAdapterSupport
+import org.apache.hudi.common.table.HoodieTableMetaClient
+import org.apache.hudi.common.testutils.{HoodieTestTable, HoodieTestUtils}
+import org.apache.hudi.common.util.{Option => HOption}
+import org.apache.hudi.storage.StorageConfiguration
+import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration
+import org.apache.hudi.util.CloseableInternalRowIterator
+import org.apache.hadoop.conf.Configuration
+import org.apache.spark.{HoodieSparkKryoRegistrar, SparkConf}
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.internal.SQLConf.LEGACY_RESPECT_NULLABILITY_IN_TEXT_DATASET_CONVERSION
+import org.apache.spark.sql.types.StructType
+import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
+import org.junit.jupiter.api.Assertions.{assertArrayEquals, assertEquals, assertTrue}
+
+import java.nio.file.{Files, Path}
+
+class TestHoodieFileGroupReaderOnSparkVariant extends SparkAdapterSupport {

Review Comment:
   This can be removed; I added it for debugging, to allow for finer-grained control. It is almost identical to `TestVariantDataType.scala`, except that it instantiates a `CloseableInternalRowIterator` for row reading.

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
