yihua commented on code in PR #13654:
URL: https://github.com/apache/hudi/pull/13654#discussion_r2271499399
##########
hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieArrayWritableAvroUtils.java:
##########
@@ -19,68 +19,307 @@
package org.apache.hudi.hadoop.utils;
-import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.avro.AvroSchemaUtils;
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.exception.HoodieAvroSchemaException;
+import org.apache.hudi.exception.SchemaCompatibilityException;
-import com.github.benmanes.caffeine.cache.Cache;
-import com.github.benmanes.caffeine.cache.Caffeine;
+import org.apache.avro.JsonProperties;
+import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
-import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.Deque;
+import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
+import java.util.Objects;
import java.util.function.UnaryOperator;
+import static org.apache.hudi.avro.AvroSchemaUtils.isNullable;
+import static org.apache.hudi.avro.HoodieAvroUtils.createFullName;
+import static org.apache.hudi.avro.HoodieAvroUtils.createNamePrefix;
+import static org.apache.hudi.avro.HoodieAvroUtils.getOldFieldNameWithRenaming;
+import static org.apache.hudi.avro.HoodieAvroUtils.toJavaDate;
+import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes;
+
public class HoodieArrayWritableAvroUtils {
- private static final Cache<Pair<Schema, Schema>, int[]>
- PROJECTION_CACHE = Caffeine.newBuilder().maximumSize(1000).build();
+ public static ArrayWritable rewriteRecordWithNewSchema(ArrayWritable
writable, Schema oldSchema, Schema newSchema, Map<String, String> renameCols) {
+ return (ArrayWritable) rewriteRecordWithNewSchema(writable, oldSchema,
newSchema, renameCols, new LinkedList<>());
+ }
- public static int[] getProjection(Schema from, Schema to) {
- return PROJECTION_CACHE.get(Pair.of(from, to), schemas -> {
- List<Schema.Field> toFields = to.getFields();
- int[] newProjection = new int[toFields.size()];
- for (int i = 0; i < newProjection.length; i++) {
- newProjection[i] = from.getField(toFields.get(i).name()).pos();
- }
- return newProjection;
- });
+ private static Writable rewriteRecordWithNewSchema(Writable writable, Schema
oldAvroSchema, Schema newAvroSchema, Map<String, String> renameCols,
Deque<String> fieldNames) {
+ if (writable == null) {
+ return null;
+ }
+ Schema oldSchema = AvroSchemaUtils.resolveNullableSchema(oldAvroSchema);
+ Schema newSchema = AvroSchemaUtils.resolveNullableSchema(newAvroSchema);
Review Comment:
OK — as long as there is no regression, we can tackle this separately.
Filed HUDI-9706 to track it.
##########
hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java:
##########
@@ -328,15 +328,24 @@ public void
testSchemaEvolutionWhenBaseFilesWithDifferentSchema(HoodieFileFormat
}
}
+ private static Stream<Arguments> testArgsForDifferentBaseAndLogFormats() {
+ boolean supportsORC = supportedFileFormats.contains(HoodieFileFormat.ORC);
+ return Stream.of(
+ arguments(supportsORC ? HoodieFileFormat.ORC :
HoodieFileFormat.PARQUET, "avro"),
+ arguments(HoodieFileFormat.PARQUET, "parquet")
Review Comment:
That is fine. My point is that `Parquet` as the base file format with `Avro` as
the log file format is the most common combination, so shouldn't that combination always be tested?
##########
hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java:
##########
@@ -1054,7 +1055,7 @@ private static Object rewriteRecordWithNewSchema(Object
oldRecord,
if (oldRecord == null) {
return null;
}
- if (oldAvroSchema.equals(newSchema)) {
+ if (areSchemasProjectionEquivalent(oldAvroSchema,newSchema)) {
Review Comment:
```suggestion
if (areSchemasProjectionEquivalent(oldAvroSchema, newSchema)) {
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]