Wally Tang created HUDI-5204:
--------------------------------
Summary: Unable to write data when Schema evolution is enabled
Key: HUDI-5204
URL: https://issues.apache.org/jira/browse/HUDI-5204
Project: Apache Hudi
Issue Type: Bug
Reporter: Wally Tang
When the schema evolution is enabled, unable to write data to the table and
throw ArrayIndexOutOfBoundsException
Environment Description
Hudi version : master
Spark version : 3.2.2
Storage (HDFS/S3/GCS..) : localhost
Scene of reproduction:
{code:java}
CREATE TABLE default.hudi_mor_pt_002 (
`key` BIGINT,
`A1` STRING,
`A2` STRING,
`A3` STRING)
using hudi
TBLPROPERTIES(
'preCombineField' = 'key',
'primaryKey' = 'key',
'type' = 'mor');
set hoodie.schema.on.read.enable=true;
insert into default.hudi_mor_pt_002 select 1 as key, 'A' as A1, 'A' as A2, 'A'
as A3;
{code}
log
{code:java}
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure:
Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage
7.0 (TID 5) (172.20.10.4 executor driver): java.lang.RuntimeException: Error
while encoding: java.lang.ArrayIndexOutOfBoundsException: 4
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else
staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType,
fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true]), 0, _hoodie_commit_time), StringType), true,
false, true) AS _hoodie_commit_time#103
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else
staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType,
fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true]), 1, _hoodie_commit_seqno), StringType), true,
false, true) AS _hoodie_commit_seqno#104
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else
staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType,
fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true]), 2, _hoodie_record_key), StringType), true,
false, true) AS _hoodie_record_key#105
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else
staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType,
fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true]), 3, _hoodie_partition_path), StringType),
true, false, true) AS _hoodie_partition_path#106
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else
staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType,
fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true]), 4, _hoodie_file_name), StringType), true,
false, true) AS _hoodie_file_name#107
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else
validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true]), 5, key), LongType) AS key#108L
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else
staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType,
fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true]), 6, A1), StringType), true, false, true) AS
A1#109
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else
staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType,
fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true]), 7, A2), StringType), true, false, true) AS
A2#110
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else
staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType,
fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true]), 8, A3), StringType), true, false, true) AS
A3#111
at
org.apache.spark.sql.errors.QueryExecutionErrors$.expressionEncodingError(QueryExecutionErrors.scala:1052)
at
org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:210)
at
org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:193)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
Source)
at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
at scala.collection.Iterator.isEmpty(Iterator.scala:387)
at scala.collection.Iterator.isEmpty$(Iterator.scala:387)
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.isEmpty(WholeStageCodegenExec.scala:757)
at
org.apache.hudi.HoodieSparkUtils$.$anonfun$createRdd$2(HoodieSparkUtils.scala:103)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.sql.execution.SQLConfInjectingRDD.compute(SQLConfInjectingRDD.scala:58)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 4
at
org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:174)
at org.apache.spark.sql.Row.isNullAt(Row.scala:214)
at org.apache.spark.sql.Row.isNullAt$(Row.scala:214)
at
org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:166)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_2$(Unknown
Source)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown
Source)
at
org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:207)
... 33 more
Driver stacktrace:
at
org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
at
scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at
scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
at scala.Option.foreach(Option.scala:407)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at
org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
at org.apache.spark.api.java.JavaRDDLike.collect(JavaRDDLike.scala:362)
at org.apache.spark.api.java.JavaRDDLike.collect$(JavaRDDLike.scala:361)
at
org.apache.spark.api.java.AbstractJavaRDDLike.collect(JavaRDDLike.scala:45)
at
org.apache.hudi.data.HoodieJavaRDD.collectAsList(HoodieJavaRDD.java:155)
at
org.apache.hudi.index.simple.HoodieSimpleIndex.fetchRecordLocationsForAffectedPartitions(HoodieSimpleIndex.java:142)
at
org.apache.hudi.index.simple.HoodieSimpleIndex.tagLocationInternal(HoodieSimpleIndex.java:113)
at
org.apache.hudi.index.simple.HoodieSimpleIndex.tagLocation(HoodieSimpleIndex.java:91)
at
org.apache.hudi.table.action.commit.HoodieWriteHelper.tag(HoodieWriteHelper.java:51)
at
org.apache.hudi.table.action.commit.HoodieWriteHelper.tag(HoodieWriteHelper.java:34)
at
org.apache.hudi.table.action.commit.BaseWriteHelper.write(BaseWriteHelper.java:53)
... 99 more
Caused by: java.lang.RuntimeException: Error while encoding:
java.lang.ArrayIndexOutOfBoundsException: 4 {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)