yihua commented on code in PR #13654:
URL: https://github.com/apache/hudi/pull/13654#discussion_r2271499399
##########
hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java:
##########
@@ -19,68 +19,307 @@
package org.apache.hudi.hadoop.utils;
-import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.avro.AvroSchemaUtils;
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.exception.HoodieAvroSchemaException;
+import org.apache.hudi.exception.SchemaCompatibilityException;
-import com.github.benmanes.caffeine.cache.Cache;
-import com.github.benmanes.caffeine.cache.Caffeine;
+import org.apache.avro.JsonProperties;
+import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
-import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.Deque;
+import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
+import java.util.Objects;
import java.util.function.UnaryOperator;
+import static org.apache.hudi.avro.AvroSchemaUtils.isNullable;
+import static org.apache.hudi.avro.HoodieAvroUtils.createFullName;
+import static org.apache.hudi.avro.HoodieAvroUtils.createNamePrefix;
+import static org.apache.hudi.avro.HoodieAvroUtils.getOldFieldNameWithRenaming;
+import static org.apache.hudi.avro.HoodieAvroUtils.toJavaDate;
+import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes;
+
public class HoodieArrayWritableAvroUtils {
- private static final Cache<Pair<Schema, Schema>, int[]>
- PROJECTION_CACHE = Caffeine.newBuilder().maximumSize(1000).build();
+ public static ArrayWritable rewriteRecordWithNewSchema(ArrayWritable
writable, Schema oldSchema, Schema newSchema, Map<String, String> renameCols) {
+ return (ArrayWritable) rewriteRecordWithNewSchema(writable, oldSchema,
newSchema, renameCols, new LinkedList<>());
+ }
- public static int[] getProjection(Schema from, Schema to) {
- return PROJECTION_CACHE.get(Pair.of(from, to), schemas -> {
- List<Schema.Field> toFields = to.getFields();
- int[] newProjection = new int[toFields.size()];
- for (int i = 0; i < newProjection.length; i++) {
- newProjection[i] = from.getField(toFields.get(i).name()).pos();
- }
- return newProjection;
- });
+ private static Writable rewriteRecordWithNewSchema(Writable writable, Schema
oldAvroSchema, Schema newAvroSchema, Map<String, String> renameCols,
Deque<String> fieldNames) {
+ if (writable == null) {
+ return null;
+ }
+ Schema oldSchema = AvroSchemaUtils.resolveNullableSchema(oldAvroSchema);
+ Schema newSchema = AvroSchemaUtils.resolveNullableSchema(newAvroSchema);
Review Comment:
OK — as long as there is no regression, we can tackle this separately.
Filed HUDI-9706 to track it.
##########
hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java:
##########
@@ -328,15 +328,24 @@ public void
testSchemaEvolutionWhenBaseFilesWithDifferentSchema(HoodieFileFormat
}
}
+ private static Stream<Arguments> testArgsForDifferentBaseAndLogFormats() {
+ boolean supportsORC = supportedFileFormats.contains(HoodieFileFormat.ORC);
+ return Stream.of(
+ arguments(supportsORC ? HoodieFileFormat.ORC :
HoodieFileFormat.PARQUET, "avro"),
+ arguments(HoodieFileFormat.PARQUET, "parquet")
Review Comment:
That is fine. My point is that `Parquet` as the base file format with `Avro` as
the log file format is the most common combination, so shouldn't that combination always be tested?
##########
hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java:
##########
@@ -1054,7 +1055,7 @@ private static Object rewriteRecordWithNewSchema(Object
oldRecord,
if (oldRecord == null) {
return null;
}
- if (oldAvroSchema.equals(newSchema)) {
+ if (areSchemasProjectionEquivalent(oldAvroSchema,newSchema)) {
Review Comment:
```suggestion
if (areSchemasProjectionEquivalent(oldAvroSchema, newSchema)) {
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]