Davis Zhang created HUDI-9363:
---------------------------------
Summary: Cannot create secondary index if table has array[int] or
map column type
Key: HUDI-9363
URL: https://issues.apache.org/jira/browse/HUDI-9363
Project: Apache Hudi
Issue Type: Bug
Reporter: Davis Zhang
Environment: Hudi 1.x

Steps to reproduce:
1. Create a pristine Hudi table with a string record key, a String sec_key column, and
some data columns of array and map types.
2. Insert some values.
3. Create a record-level index (RLI) on the record key column.
4. Create a secondary index on the sec_key column; it will hit the following exception:
{code:java}
org.apache.avro.AvroRuntimeException: Not a record: ["null","int"]
at org.apache.avro.Schema.getFields(Schema.java:283)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1236)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1249)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1258)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1241)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIteratorInternal(HoodieAvroParquetReader.java:175)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIterator(HoodieAvroParquetReader.java:103)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getRecordIterator(HoodieAvroParquetReader.java:82)
at
org.apache.hudi.common.table.log.HoodieFileSliceReader.<init>(HoodieFileSliceReader.java:56)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.createSecondaryIndexGenerator(SecondaryIndexRecordGenerationUtils.java:312)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.lambda$readSecondaryKeysFromFileSlices$12f25254$1(SecondaryIndexRecordGenerationUtils.java:268)
at
org.apache.hudi.data.HoodieJavaRDD.lambda$flatMap$a6598fcb$1(HoodieJavaRDD.java:160)
at
org.apache.spark.api.java.JavaRDDLike.$anonfun$flatMap$1(JavaRDDLike.scala:125)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at org.apache.spark.util.Iterators$.size(Iterators.scala:29)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1787)
at org.apache.spark.rdd.RDD.$anonfun$count$1(RDD.scala:1296)
at org.apache.spark.rdd.RDD.$anonfun$count$1$adapted(RDD.scala:1296)
at
org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
at org.apache.spark.scheduler.Task.run(Task.scala:141)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
25/05/01 14:41:13 ERROR Executor: Exception in task 3.0 in stage 62.0 (TID 205)
org.apache.avro.AvroRuntimeException: Not a record: ["null","int"]
at org.apache.avro.Schema.getFields(Schema.java:283)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1236)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1249)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1258)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1241)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIteratorInternal(HoodieAvroParquetReader.java:175)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIterator(HoodieAvroParquetReader.java:103)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getRecordIterator(HoodieAvroParquetReader.java:82)
at
org.apache.hudi.common.table.log.HoodieFileSliceReader.<init>(HoodieFileSliceReader.java:56)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.createSecondaryIndexGenerator(SecondaryIndexRecordGenerationUtils.java:312)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.lambda$readSecondaryKeysFromFileSlices$12f25254$1(SecondaryIndexRecordGenerationUtils.java:268)
at
org.apache.hudi.data.HoodieJavaRDD.lambda$flatMap$a6598fcb$1(HoodieJavaRDD.java:160)
at
org.apache.spark.api.java.JavaRDDLike.$anonfun$flatMap$1(JavaRDDLike.scala:125)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at org.apache.spark.util.Iterators$.size(Iterators.scala:29)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1787)
at org.apache.spark.rdd.RDD.$anonfun$count$1(RDD.scala:1296)
at org.apache.spark.rdd.RDD.$anonfun$count$1$adapted(RDD.scala:1296)
at
org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
at org.apache.spark.scheduler.Task.run(Task.scala:141)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
25/05/01 14:41:13 ERROR Executor: Exception in task 1.0 in stage 62.0 (TID 203)
org.apache.avro.AvroRuntimeException: Not a record: ["null","int"]
at org.apache.avro.Schema.getFields(Schema.java:283)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1236)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1249)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1258)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1241)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIteratorInternal(HoodieAvroParquetReader.java:175)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIterator(HoodieAvroParquetReader.java:103)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getRecordIterator(HoodieAvroParquetReader.java:82)
at
org.apache.hudi.common.table.log.HoodieFileSliceReader.<init>(HoodieFileSliceReader.java:56)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.createSecondaryIndexGenerator(SecondaryIndexRecordGenerationUtils.java:312)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.lambda$readSecondaryKeysFromFileSlices$12f25254$1(SecondaryIndexRecordGenerationUtils.java:268)
at
org.apache.hudi.data.HoodieJavaRDD.lambda$flatMap$a6598fcb$1(HoodieJavaRDD.java:160)
at
org.apache.spark.api.java.JavaRDDLike.$anonfun$flatMap$1(JavaRDDLike.scala:125)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at org.apache.spark.util.Iterators$.size(Iterators.scala:29)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1787)
at org.apache.spark.rdd.RDD.$anonfun$count$1(RDD.scala:1296)
at org.apache.spark.rdd.RDD.$anonfun$count$1$adapted(RDD.scala:1296)
at
org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
at org.apache.spark.scheduler.Task.run(Task.scala:141)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
25/05/01 14:41:13 ERROR Executor: Exception in task 0.0 in stage 62.0 (TID 202)
org.apache.avro.AvroRuntimeException: Not a record: ["null","int"]
at org.apache.avro.Schema.getFields(Schema.java:283)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1236)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1249)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1258)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1241)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIteratorInternal(HoodieAvroParquetReader.java:175)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIterator(HoodieAvroParquetReader.java:103)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getRecordIterator(HoodieAvroParquetReader.java:82)
at
org.apache.hudi.common.table.log.HoodieFileSliceReader.<init>(HoodieFileSliceReader.java:56)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.createSecondaryIndexGenerator(SecondaryIndexRecordGenerationUtils.java:312)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.lambda$readSecondaryKeysFromFileSlices$12f25254$1(SecondaryIndexRecordGenerationUtils.java:268)
at
org.apache.hudi.data.HoodieJavaRDD.lambda$flatMap$a6598fcb$1(HoodieJavaRDD.java:160)
at
org.apache.spark.api.java.JavaRDDLike.$anonfun$flatMap$1(JavaRDDLike.scala:125)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at org.apache.spark.util.Iterators$.size(Iterators.scala:29)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1787)
at org.apache.spark.rdd.RDD.$anonfun$count$1(RDD.scala:1296)
at org.apache.spark.rdd.RDD.$anonfun$count$1$adapted(RDD.scala:1296)
at
org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
at org.apache.spark.scheduler.Task.run(Task.scala:141)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
25/05/01 14:41:13 ERROR Executor: Exception in task 6.0 in stage 62.0 (TID 208)
org.apache.avro.AvroRuntimeException: Not a record: ["null","int"]
at org.apache.avro.Schema.getFields(Schema.java:283)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1236)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1249)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1258)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1241)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIteratorInternal(HoodieAvroParquetReader.java:175)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIterator(HoodieAvroParquetReader.java:103)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getRecordIterator(HoodieAvroParquetReader.java:82)
at
org.apache.hudi.common.table.log.HoodieFileSliceReader.<init>(HoodieFileSliceReader.java:56)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.createSecondaryIndexGenerator(SecondaryIndexRecordGenerationUtils.java:312)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.lambda$readSecondaryKeysFromFileSlices$12f25254$1(SecondaryIndexRecordGenerationUtils.java:268)
at
org.apache.hudi.data.HoodieJavaRDD.lambda$flatMap$a6598fcb$1(HoodieJavaRDD.java:160)
at
org.apache.spark.api.java.JavaRDDLike.$anonfun$flatMap$1(JavaRDDLike.scala:125)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at org.apache.spark.util.Iterators$.size(Iterators.scala:29)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1787)
at org.apache.spark.rdd.RDD.$anonfun$count$1(RDD.scala:1296)
at org.apache.spark.rdd.RDD.$anonfun$count$1$adapted(RDD.scala:1296)
at
org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
at org.apache.spark.scheduler.Task.run(Task.scala:141)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
25/05/01 14:41:13 ERROR Executor: Exception in task 5.0 in stage 62.0 (TID 207)
org.apache.avro.AvroRuntimeException: Not a record: ["null","int"]
at org.apache.avro.Schema.getFields(Schema.java:283)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1236)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1249)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1258)
at
org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(HoodieAvroUtils.java:1241)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIteratorInternal(HoodieAvroParquetReader.java:175)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getIndexedRecordIterator(HoodieAvroParquetReader.java:103)
at
org.apache.hudi.io.hadoop.HoodieAvroParquetReader.getRecordIterator(HoodieAvroParquetReader.java:82)
at
org.apache.hudi.common.table.log.HoodieFileSliceReader.<init>(HoodieFileSliceReader.java:56)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.createSecondaryIndexGenerator(SecondaryIndexRecordGenerationUtils.java:312)
at
org.apache.hudi.metadata.SecondaryIndexRecordGenerationUtils.lambda$readSecondaryKeysFromFileSlices$12f25254$1(SecondaryIndexRecordGenerationUtils.java:268)
at
org.apache.hudi.data.HoodieJavaRDD.lambda$flatMap$a6598fcb$1(HoodieJavaRDD.java:160)
at
org.apache.spark.api.java.JavaRDDLike.$anonfun$flatMap$1(JavaRDDLike.scala:125)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at org.apache.spark.util.Iterators$.size(Iterators.scala:29)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1787)
at org.apache.spark.rdd.RDD.$anonfun$count$1(RDD.scala:1296)
at org.apache.spark.rdd.RDD.$anonfun$count$1$adapted(RDD.scala:1296)
at
org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
at org.apache.spark.scheduler.Task.run(Task.scala:141)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829) {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)