Wally Tang created HUDI-5204:
--------------------------------

             Summary: Unable to write data when Schema evolution is enabled
                 Key: HUDI-5204
                 URL: https://issues.apache.org/jira/browse/HUDI-5204
             Project: Apache Hudi
          Issue Type: Bug
            Reporter: Wally Tang


When schema evolution is enabled, writing data to the table fails with an ArrayIndexOutOfBoundsException.

Environment Description

Hudi version : master

Spark version : 3.2.2

Storage (HDFS/S3/GCS..) : local filesystem

Steps to reproduce:
{code:sql}
CREATE TABLE default.hudi_mor_pt_002 (
  `key` BIGINT,
  `A1` STRING,
  `A2` STRING,
  `A3` STRING)
USING hudi
TBLPROPERTIES (
  'preCombineField' = 'key',
  'primaryKey' = 'key',
  'type' = 'mor');

SET hoodie.schema.on.read.enable=true;

INSERT INTO default.hudi_mor_pt_002 SELECT 1 AS key, 'A' AS A1, 'A' AS A2, 'A' AS A3;
{code}
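
For comparison, the failure appears tied to the schema-on-read flag alone. Below is a minimal spark-shell sketch of an equivalent insert with the flag left at its default (disabled); treating this as a workaround is an assumption, not a verified fix:
{code:scala}
// Hypothetical check, assuming the table above was created fresh:
// with hoodie.schema.on.read.enable left at its default (false),
// the same style of insert is expected to succeed.
spark.sql("SET hoodie.schema.on.read.enable=false")
spark.sql(
  "INSERT INTO default.hudi_mor_pt_002 " +
  "SELECT 2 AS key, 'B' AS A1, 'B' AS A2, 'B' AS A3")
{code}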
Stack trace:
{code:java}
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 5) (172.20.10.4 executor driver): java.lang.RuntimeException: Error while encoding: java.lang.ArrayIndexOutOfBoundsException: 4
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 0, _hoodie_commit_time), StringType), true, false, true) AS _hoodie_commit_time#103
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 1, _hoodie_commit_seqno), StringType), true, false, true) AS _hoodie_commit_seqno#104
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 2, _hoodie_record_key), StringType), true, false, true) AS _hoodie_record_key#105
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 3, _hoodie_partition_path), StringType), true, false, true) AS _hoodie_partition_path#106
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 4, _hoodie_file_name), StringType), true, false, true) AS _hoodie_file_name#107
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 5, key), LongType) AS key#108L
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 6, A1), StringType), true, false, true) AS A1#109
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 7, A2), StringType), true, false, true) AS A2#110
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 8, A3), StringType), true, false, true) AS A3#111
        at org.apache.spark.sql.errors.QueryExecutionErrors$.expressionEncodingError(QueryExecutionErrors.scala:1052)
        at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:210)
        at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:193)
        at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
        at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
        at scala.collection.Iterator.isEmpty(Iterator.scala:387)
        at scala.collection.Iterator.isEmpty$(Iterator.scala:387)
        at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.isEmpty(WholeStageCodegenExec.scala:757)
        at org.apache.hudi.HoodieSparkUtils$.$anonfun$createRdd$2(HoodieSparkUtils.scala:103)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
        at org.apache.spark.sql.execution.SQLConfInjectingRDD.compute(SQLConfInjectingRDD.scala:58)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
        at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
        at org.apache.spark.scheduler.Task.run(Task.scala:131)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 4
        at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:174)
        at org.apache.spark.sql.Row.isNullAt(Row.scala:214)
        at org.apache.spark.sql.Row.isNullAt$(Row.scala:214)
        at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:166)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_2$(Unknown Source)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
        at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:207)
        ... 33 more


Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
        at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
        at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
        at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
        at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
        at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
        at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
        at scala.Option.foreach(Option.scala:407)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
        at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
        at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
        at org.apache.spark.api.java.JavaRDDLike.collect(JavaRDDLike.scala:362)
        at org.apache.spark.api.java.JavaRDDLike.collect$(JavaRDDLike.scala:361)
        at org.apache.spark.api.java.AbstractJavaRDDLike.collect(JavaRDDLike.scala:45)
        at org.apache.hudi.data.HoodieJavaRDD.collectAsList(HoodieJavaRDD.java:155)
        at org.apache.hudi.index.simple.HoodieSimpleIndex.fetchRecordLocationsForAffectedPartitions(HoodieSimpleIndex.java:142)
        at org.apache.hudi.index.simple.HoodieSimpleIndex.tagLocationInternal(HoodieSimpleIndex.java:113)
        at org.apache.hudi.index.simple.HoodieSimpleIndex.tagLocation(HoodieSimpleIndex.java:91)
        at org.apache.hudi.table.action.commit.HoodieWriteHelper.tag(HoodieWriteHelper.java:51)
        at org.apache.hudi.table.action.commit.HoodieWriteHelper.tag(HoodieWriteHelper.java:34)
        at org.apache.hudi.table.action.commit.BaseWriteHelper.write(BaseWriteHelper.java:53)
        ... 99 more
Caused by: java.lang.RuntimeException: Error while encoding: java.lang.ArrayIndexOutOfBoundsException: 4
{code}
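
The inner Caused by fails inside {{GenericRow.isNullAt}} at index 4, which suggests the Row handed to the encoder carries fewer fields than the nine in the encoder schema (the five {{_hoodie_*}} metadata columns plus {{key}}, {{A1}}, {{A2}}, {{A3}}). For illustration only (this is not Hudi's actual row construction), a four-field row probed at index 4 raises the same exception:
{code:scala}
import org.apache.spark.sql.Row

// A row carrying only the four data columns (key, A1, A2, A3);
// valid indices are 0..3.
val row = Row(1L, "A", "A", "A")

// Probing index 4, as the generated encoder code does via isNullAt,
// throws java.lang.ArrayIndexOutOfBoundsException: 4, matching the log above.
row.isNullAt(4)
{code}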