[
https://issues.apache.org/jira/browse/HUDI-8834?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Davis Zhang updated HUDI-8834:
------------------------------
Description:
This issue occurs only for MOR tables; COW tables work fine.
Job aborted due to stage failure: Task 0 in stage 40.0 failed 1 times, most
recent failure: Lost task 0.0 in stage 40.0 (TID 52) (192.168.1.109 executor
driver): org.apache.spark.SparkException: Encountered error while reading file
[file:/private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet|file:///private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet].
Details:
at
org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:877)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:307)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at
org.apache.spark.util.random.SamplingUtils$.reservoirSampleAndCount(SamplingUtils.scala:41)
at org.apache.spark.RangePartitioner$.$anonfun$sketch$1(Partitioner.scala:322)
at
org.apache.spark.RangePartitioner$.$anonfun$sketch$1$adapted(Partitioner.scala:320)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:908)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:908)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
at org.apache.spark.scheduler.Task.run(Task.scala:139)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
at scala.collection.immutable.StringLike.split(StringLike.scala:260)
at scala.collection.immutable.StringLike.split$(StringLike.scala:259)
at scala.collection.immutable.StringOps.split(StringOps.scala:33)
at
org.apache.spark.sql.HoodieUnsafeRowUtils$.composeNestedFieldPath(HoodieUnsafeRowUtils.scala:97)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:105)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:103)
at
java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
at
org.apache.spark.sql.HoodieInternalRowUtils$.getCachedPosList(HoodieInternalRowUtils.scala:103)
at
org.apache.spark.sql.HoodieInternalRowUtils.getCachedPosList(HoodieInternalRowUtils.scala)
at
org.apache.hudi.common.model.HoodieSparkRecord.getOrderingValue(HoodieSparkRecord.java:319)
at
org.apache.hudi.DefaultSparkRecordMerger.partialMerge(DefaultSparkRecordMerger.java:113)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.merge(HoodieBaseFileGroupRecordBuffer.java:395)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNextBaseRecord(HoodieBaseFileGroupRecordBuffer.java:504)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.hasNextBaseRecord(HoodieKeyBasedFileGroupRecordBuffer.java:130)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.doHasNext(HoodieKeyBasedFileGroupRecordBuffer.java:139)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNext(HoodieBaseFileGroupRecordBuffer.java:154)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader.hasNext(HoodieFileGroupReader.java:248)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader$HoodieFileGroupReaderIterator.hasNext(HoodieFileGroupReader.java:319)
at
org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat$$anon$1.hasNext(HoodieFileGroupReaderBasedParquetFileFormat.scala:272)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:297)
... 21 more
Driver stacktrace:
at
org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2790)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2726)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2725)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2725)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1211)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1211)
at scala.Option.foreach(Option.scala:407)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1211)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2989)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2928)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2917)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:976)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1022)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1021)
at org.apache.spark.RangePartitioner$.sketch(Partitioner.scala:320)
at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:187)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.prepareShuffleDependency(ShuffleExchangeExec.scala:290)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:173)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:167)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture$lzycompute(ShuffleExchangeExec.scala:143)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture(ShuffleExchangeExec.scala:139)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.$anonfun$submitShuffleJob$1(ShuffleExchangeExec.scala:68)
at
org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob(ShuffleExchangeExec.scala:68)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob$(ShuffleExchangeExec.scala:67)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.submitShuffleJob(ShuffleExchangeExec.scala:115)
at
org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture$lzycompute(QueryStageExec.scala:181)
at
org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture(QueryStageExec.scala:181)
at
org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.doMaterialize(QueryStageExec.scala:183)
at
org.apache.spark.sql.execution.adaptive.QueryStageExec.materialize(QueryStageExec.scala:82)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$5(AdaptiveSparkPlanExec.scala:272)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$5$adapted(AdaptiveSparkPlanExec.scala:270)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.IterableLike.foreach(IterableLike.scala:74)
at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$1(AdaptiveSparkPlanExec.scala:270)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.getFinalPhysicalPlan(AdaptiveSparkPlanExec.scala:242)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:387)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:360)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4218)
at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:3459)
at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4208)
at
org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4206)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
at
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4206)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:3459)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.checkAnswer(HoodieSparkSqlTestBase.scala:126)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$62(TestMergeIntoTable.scala:1395)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$62$adapted(TestMergeIntoTable.scala:1353)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.withTempDir(HoodieSparkSqlTestBase.scala:92)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$61(TestMergeIntoTable.scala:1353)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$withRecordType$3(HoodieSparkSqlTestBase.scala:310)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.withSQLConf(HoodieSparkSqlTestBase.scala:282)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$withRecordType$1(HoodieSparkSqlTestBase.scala:309)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$withRecordType$1$adapted(HoodieSparkSqlTestBase.scala:301)
at scala.collection.immutable.List.foreach(List.scala:431)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.withRecordType(HoodieSparkSqlTestBase.scala:301)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$60(TestMergeIntoTable.scala:1353)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$60$adapted(TestMergeIntoTable.scala:1351)
at scala.collection.immutable.List.foreach(List.scala:431)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$59(TestMergeIntoTable.scala:1351)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$test$1(HoodieSparkSqlTestBase.scala:104)
at org.scalatest.OutcomeOf.outcomeOf(OutcomeOf.scala:85)
at org.scalatest.OutcomeOf.outcomeOf$(OutcomeOf.scala:83)
at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
at org.scalatest.Transformer.apply(Transformer.scala:22)
at org.scalatest.Transformer.apply(Transformer.scala:20)
at
org.scalatest.funsuite.AnyFunSuiteLike$$anon$1.apply(AnyFunSuiteLike.scala:189)
at org.scalatest.TestSuite.withFixture(TestSuite.scala:196)
at org.scalatest.TestSuite.withFixture$(TestSuite.scala:195)
at org.scalatest.funsuite.AnyFunSuite.withFixture(AnyFunSuite.scala:1562)
at
org.scalatest.funsuite.AnyFunSuiteLike.invokeWithFixture$1(AnyFunSuiteLike.scala:187)
at
org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTest$1(AnyFunSuiteLike.scala:199)
at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306)
at org.scalatest.funsuite.AnyFunSuiteLike.runTest(AnyFunSuiteLike.scala:199)
at org.scalatest.funsuite.AnyFunSuiteLike.runTest$(AnyFunSuiteLike.scala:181)
at org.scalatest.funsuite.AnyFunSuite.runTest(AnyFunSuite.scala:1562)
at
org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTests$1(AnyFunSuiteLike.scala:232)
at org.scalatest.SuperEngine.$anonfun$runTestsInBranch$1(Engine.scala:413)
at scala.collection.immutable.List.foreach(List.scala:431)
at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
at org.scalatest.SuperEngine.runTestsInBranch(Engine.scala:396)
at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:475)
at org.scalatest.funsuite.AnyFunSuiteLike.runTests(AnyFunSuiteLike.scala:232)
at org.scalatest.funsuite.AnyFunSuiteLike.runTests$(AnyFunSuiteLike.scala:231)
at org.scalatest.funsuite.AnyFunSuite.runTests(AnyFunSuite.scala:1562)
at org.scalatest.Suite.run(Suite.scala:1112)
at org.scalatest.Suite.run$(Suite.scala:1094)
at
org.scalatest.funsuite.AnyFunSuite.org$scalatest$funsuite$AnyFunSuiteLike$$super$run(AnyFunSuite.scala:1562)
at
org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$run$1(AnyFunSuiteLike.scala:236)
at org.scalatest.SuperEngine.runImpl(Engine.scala:535)
at org.scalatest.funsuite.AnyFunSuiteLike.run(AnyFunSuiteLike.scala:236)
at org.scalatest.funsuite.AnyFunSuiteLike.run$(AnyFunSuiteLike.scala:235)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.org$scalatest$BeforeAndAfterAll$$super$run(HoodieSparkSqlTestBase.scala:48)
at org.scalatest.BeforeAndAfterAll.liftedTree1$1(BeforeAndAfterAll.scala:213)
at org.scalatest.BeforeAndAfterAll.run(BeforeAndAfterAll.scala:210)
at org.scalatest.BeforeAndAfterAll.run$(BeforeAndAfterAll.scala:208)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.run(HoodieSparkSqlTestBase.scala:48)
at org.scalatest.tools.SuiteRunner.run(SuiteRunner.scala:45)
at
org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13(Runner.scala:1314)
at
org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13$adapted(Runner.scala:1308)
at scala.collection.immutable.List.foreach(List.scala:431)
at org.scalatest.tools.Runner$.doRunRunRunDaDoRunRun(Runner.scala:1308)
at
org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24(Runner.scala:993)
at
org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24$adapted(Runner.scala:971)
at
org.scalatest.tools.Runner$.withClassLoaderAndDispatchReporter(Runner.scala:1474)
at
org.scalatest.tools.Runner$.runOptionallyWithPassFailReporter(Runner.scala:971)
at org.scalatest.tools.Runner$.run(Runner.scala:798)
at org.scalatest.tools.Runner.run(Runner.scala)
at
org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.runScalaTest2or3(ScalaTestRunner.java:43)
at
org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.main(ScalaTestRunner.java:26)
Caused by: org.apache.spark.SparkException: Encountered error while reading
file
[file:/private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet|file:///private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet].
Details:
at
org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:877)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:307)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at
org.apache.spark.util.random.SamplingUtils$.reservoirSampleAndCount(SamplingUtils.scala:41)
at org.apache.spark.RangePartitioner$.$anonfun$sketch$1(Partitioner.scala:322)
at
org.apache.spark.RangePartitioner$.$anonfun$sketch$1$adapted(Partitioner.scala:320)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:908)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:908)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
at org.apache.spark.scheduler.Task.run(Task.scala:139)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
at scala.collection.immutable.StringLike.split(StringLike.scala:260)
at scala.collection.immutable.StringLike.split$(StringLike.scala:259)
at scala.collection.immutable.StringOps.split(StringOps.scala:33)
at
org.apache.spark.sql.HoodieUnsafeRowUtils$.composeNestedFieldPath(HoodieUnsafeRowUtils.scala:97)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:105)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:103)
at
java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
at
org.apache.spark.sql.HoodieInternalRowUtils$.getCachedPosList(HoodieInternalRowUtils.scala:103)
at
org.apache.spark.sql.HoodieInternalRowUtils.getCachedPosList(HoodieInternalRowUtils.scala)
at
org.apache.hudi.common.model.HoodieSparkRecord.getOrderingValue(HoodieSparkRecord.java:319)
at
org.apache.hudi.DefaultSparkRecordMerger.partialMerge(DefaultSparkRecordMerger.java:113)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.merge(HoodieBaseFileGroupRecordBuffer.java:395)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNextBaseRecord(HoodieBaseFileGroupRecordBuffer.java:504)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.hasNextBaseRecord(HoodieKeyBasedFileGroupRecordBuffer.java:130)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.doHasNext(HoodieKeyBasedFileGroupRecordBuffer.java:139)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNext(HoodieBaseFileGroupRecordBuffer.java:154)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader.hasNext(HoodieFileGroupReader.java:248)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader$HoodieFileGroupReaderIterator.hasNext(HoodieFileGroupReader.java:319)
at
org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat$$anon$1.hasNext(HoodieFileGroupReaderBasedParquetFileFormat.scala:272)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:297)
... 21 more
```
testConfigs.foreach { case (tableType, sparkSqlOptimizedWrites) =>
log.info(s"=== Testing MergeInto with partial insert: tableType=$tableType,
sparkSqlOptimizedWrites=$sparkSqlOptimizedWrites ===")
withRecordType()(withTempDir { tmp =>
spark.sql("set hoodie.payload.combined.schema.validate = true")
// Create a partitioned table
val tableName = generateTableName
spark.sql(
  s"""
     |create table $tableName (
     |  id bigint,
     |  name string,
     |  price double,
     |  ts bigint,
     |  dt string
     |) using hudi
     |tblproperties (
     |  type = '$tableType',
     |  primaryKey = 'id',
     |  precombineKey = 'ts'
     |)
     |partitioned by(dt)
     |location '${tmp.getCanonicalPath}'
     """.stripMargin)
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1L, '2021-03-21'")
// Set optimized sql merge setting
spark.sql(s"set ${SPARK_SQL_OPTIMIZED_WRITES.key()}=$sparkSqlOptimizedWrites")
spark.sql(
  s"""
     |merge into $tableName as t0
     |using (
     |  select 2 as id, 'a2' as name, 10 as price, 2L as ts, '2021-03-20' as dt
     |  union
     |  select 1 as id, 'a1_updated' as name, 11 as price, 3L as ts, '2021-03-21' as dt
     |) s0
     |on s0.id = t0.id
     |when matched then update set t0.name = s0.name, t0.price = s0.price
     |when not matched and s0.id % 2 = 0 then insert (id, name, dt)
     |values(s0.id, s0.name, s0.dt)
     """.stripMargin)
checkAnswer(s"select id, name, price, dt from $tableName order by id")( // <===== NPE is thrown here
Seq(1, "a1_updated", 11, "2021-03-21"),
Seq(2, "a2", null, "2021-03-20")
)
})
}
```
was:
Job aborted due to stage failure: Task 0 in stage 40.0 failed 1 times, most
recent failure: Lost task 0.0 in stage 40.0 (TID 52) (192.168.1.109 executor
driver): org.apache.spark.SparkException: Encountered error while reading file
file:/private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet.
Details:
at
org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:877)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:307)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at
org.apache.spark.util.random.SamplingUtils$.reservoirSampleAndCount(SamplingUtils.scala:41)
at org.apache.spark.RangePartitioner$.$anonfun$sketch$1(Partitioner.scala:322)
at
org.apache.spark.RangePartitioner$.$anonfun$sketch$1$adapted(Partitioner.scala:320)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:908)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:908)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
at org.apache.spark.scheduler.Task.run(Task.scala:139)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
at scala.collection.immutable.StringLike.split(StringLike.scala:260)
at scala.collection.immutable.StringLike.split$(StringLike.scala:259)
at scala.collection.immutable.StringOps.split(StringOps.scala:33)
at
org.apache.spark.sql.HoodieUnsafeRowUtils$.composeNestedFieldPath(HoodieUnsafeRowUtils.scala:97)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:105)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:103)
at
java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
at
org.apache.spark.sql.HoodieInternalRowUtils$.getCachedPosList(HoodieInternalRowUtils.scala:103)
at
org.apache.spark.sql.HoodieInternalRowUtils.getCachedPosList(HoodieInternalRowUtils.scala)
at
org.apache.hudi.common.model.HoodieSparkRecord.getOrderingValue(HoodieSparkRecord.java:319)
at
org.apache.hudi.DefaultSparkRecordMerger.partialMerge(DefaultSparkRecordMerger.java:113)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.merge(HoodieBaseFileGroupRecordBuffer.java:395)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNextBaseRecord(HoodieBaseFileGroupRecordBuffer.java:504)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.hasNextBaseRecord(HoodieKeyBasedFileGroupRecordBuffer.java:130)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.doHasNext(HoodieKeyBasedFileGroupRecordBuffer.java:139)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNext(HoodieBaseFileGroupRecordBuffer.java:154)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader.hasNext(HoodieFileGroupReader.java:248)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader$HoodieFileGroupReaderIterator.hasNext(HoodieFileGroupReader.java:319)
at
org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat$$anon$1.hasNext(HoodieFileGroupReaderBasedParquetFileFormat.scala:272)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:297)
... 21 more
Driver stacktrace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in
stage 40.0 failed 1 times, most recent failure: Lost task 0.0 in stage 40.0
(TID 52) (192.168.1.109 executor driver): org.apache.spark.SparkException:
Encountered error while reading file
file:/private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet.
Details:
at
org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:877)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:307)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at
org.apache.spark.util.random.SamplingUtils$.reservoirSampleAndCount(SamplingUtils.scala:41)
at org.apache.spark.RangePartitioner$.$anonfun$sketch$1(Partitioner.scala:322)
at
org.apache.spark.RangePartitioner$.$anonfun$sketch$1$adapted(Partitioner.scala:320)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:908)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:908)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
at org.apache.spark.scheduler.Task.run(Task.scala:139)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
at scala.collection.immutable.StringLike.split(StringLike.scala:260)
at scala.collection.immutable.StringLike.split$(StringLike.scala:259)
at scala.collection.immutable.StringOps.split(StringOps.scala:33)
at
org.apache.spark.sql.HoodieUnsafeRowUtils$.composeNestedFieldPath(HoodieUnsafeRowUtils.scala:97)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:105)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:103)
at
java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
at
org.apache.spark.sql.HoodieInternalRowUtils$.getCachedPosList(HoodieInternalRowUtils.scala:103)
at
org.apache.spark.sql.HoodieInternalRowUtils.getCachedPosList(HoodieInternalRowUtils.scala)
at
org.apache.hudi.common.model.HoodieSparkRecord.getOrderingValue(HoodieSparkRecord.java:319)
at
org.apache.hudi.DefaultSparkRecordMerger.partialMerge(DefaultSparkRecordMerger.java:113)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.merge(HoodieBaseFileGroupRecordBuffer.java:395)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNextBaseRecord(HoodieBaseFileGroupRecordBuffer.java:504)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.hasNextBaseRecord(HoodieKeyBasedFileGroupRecordBuffer.java:130)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.doHasNext(HoodieKeyBasedFileGroupRecordBuffer.java:139)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNext(HoodieBaseFileGroupRecordBuffer.java:154)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader.hasNext(HoodieFileGroupReader.java:248)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader$HoodieFileGroupReaderIterator.hasNext(HoodieFileGroupReader.java:319)
at
org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat$$anon$1.hasNext(HoodieFileGroupReaderBasedParquetFileFormat.scala:272)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:297)
... 21 more
Driver stacktrace:
at
org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2790)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2726)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2725)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2725)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1211)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1211)
at scala.Option.foreach(Option.scala:407)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1211)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2989)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2928)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2917)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:976)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1022)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1021)
at org.apache.spark.RangePartitioner$.sketch(Partitioner.scala:320)
at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:187)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.prepareShuffleDependency(ShuffleExchangeExec.scala:290)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:173)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:167)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture$lzycompute(ShuffleExchangeExec.scala:143)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture(ShuffleExchangeExec.scala:139)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.$anonfun$submitShuffleJob$1(ShuffleExchangeExec.scala:68)
at
org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob(ShuffleExchangeExec.scala:68)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob$(ShuffleExchangeExec.scala:67)
at
org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.submitShuffleJob(ShuffleExchangeExec.scala:115)
at
org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture$lzycompute(QueryStageExec.scala:181)
at
org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture(QueryStageExec.scala:181)
at
org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.doMaterialize(QueryStageExec.scala:183)
at
org.apache.spark.sql.execution.adaptive.QueryStageExec.materialize(QueryStageExec.scala:82)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$5(AdaptiveSparkPlanExec.scala:272)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$5$adapted(AdaptiveSparkPlanExec.scala:270)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.IterableLike.foreach(IterableLike.scala:74)
at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$1(AdaptiveSparkPlanExec.scala:270)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.getFinalPhysicalPlan(AdaptiveSparkPlanExec.scala:242)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:387)
at
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:360)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4218)
at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:3459)
at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4208)
at
org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4206)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
at
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4206)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:3459)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.checkAnswer(HoodieSparkSqlTestBase.scala:126)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$62(TestMergeIntoTable.scala:1395)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$62$adapted(TestMergeIntoTable.scala:1353)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.withTempDir(HoodieSparkSqlTestBase.scala:92)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$61(TestMergeIntoTable.scala:1353)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$withRecordType$3(HoodieSparkSqlTestBase.scala:310)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.withSQLConf(HoodieSparkSqlTestBase.scala:282)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$withRecordType$1(HoodieSparkSqlTestBase.scala:309)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$withRecordType$1$adapted(HoodieSparkSqlTestBase.scala:301)
at scala.collection.immutable.List.foreach(List.scala:431)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.withRecordType(HoodieSparkSqlTestBase.scala:301)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$60(TestMergeIntoTable.scala:1353)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$60$adapted(TestMergeIntoTable.scala:1351)
at scala.collection.immutable.List.foreach(List.scala:431)
at
org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$59(TestMergeIntoTable.scala:1351)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$test$1(HoodieSparkSqlTestBase.scala:104)
at org.scalatest.OutcomeOf.outcomeOf(OutcomeOf.scala:85)
at org.scalatest.OutcomeOf.outcomeOf$(OutcomeOf.scala:83)
at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
at org.scalatest.Transformer.apply(Transformer.scala:22)
at org.scalatest.Transformer.apply(Transformer.scala:20)
at
org.scalatest.funsuite.AnyFunSuiteLike$$anon$1.apply(AnyFunSuiteLike.scala:189)
at org.scalatest.TestSuite.withFixture(TestSuite.scala:196)
at org.scalatest.TestSuite.withFixture$(TestSuite.scala:195)
at org.scalatest.funsuite.AnyFunSuite.withFixture(AnyFunSuite.scala:1562)
at
org.scalatest.funsuite.AnyFunSuiteLike.invokeWithFixture$1(AnyFunSuiteLike.scala:187)
at
org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTest$1(AnyFunSuiteLike.scala:199)
at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306)
at org.scalatest.funsuite.AnyFunSuiteLike.runTest(AnyFunSuiteLike.scala:199)
at org.scalatest.funsuite.AnyFunSuiteLike.runTest$(AnyFunSuiteLike.scala:181)
at org.scalatest.funsuite.AnyFunSuite.runTest(AnyFunSuite.scala:1562)
at
org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTests$1(AnyFunSuiteLike.scala:232)
at org.scalatest.SuperEngine.$anonfun$runTestsInBranch$1(Engine.scala:413)
at scala.collection.immutable.List.foreach(List.scala:431)
at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
at org.scalatest.SuperEngine.runTestsInBranch(Engine.scala:396)
at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:475)
at org.scalatest.funsuite.AnyFunSuiteLike.runTests(AnyFunSuiteLike.scala:232)
at org.scalatest.funsuite.AnyFunSuiteLike.runTests$(AnyFunSuiteLike.scala:231)
at org.scalatest.funsuite.AnyFunSuite.runTests(AnyFunSuite.scala:1562)
at org.scalatest.Suite.run(Suite.scala:1112)
at org.scalatest.Suite.run$(Suite.scala:1094)
at
org.scalatest.funsuite.AnyFunSuite.org$scalatest$funsuite$AnyFunSuiteLike$$super$run(AnyFunSuite.scala:1562)
at
org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$run$1(AnyFunSuiteLike.scala:236)
at org.scalatest.SuperEngine.runImpl(Engine.scala:535)
at org.scalatest.funsuite.AnyFunSuiteLike.run(AnyFunSuiteLike.scala:236)
at org.scalatest.funsuite.AnyFunSuiteLike.run$(AnyFunSuiteLike.scala:235)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.org$scalatest$BeforeAndAfterAll$$super$run(HoodieSparkSqlTestBase.scala:48)
at org.scalatest.BeforeAndAfterAll.liftedTree1$1(BeforeAndAfterAll.scala:213)
at org.scalatest.BeforeAndAfterAll.run(BeforeAndAfterAll.scala:210)
at org.scalatest.BeforeAndAfterAll.run$(BeforeAndAfterAll.scala:208)
at
org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.run(HoodieSparkSqlTestBase.scala:48)
at org.scalatest.tools.SuiteRunner.run(SuiteRunner.scala:45)
at
org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13(Runner.scala:1314)
at
org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13$adapted(Runner.scala:1308)
at scala.collection.immutable.List.foreach(List.scala:431)
at org.scalatest.tools.Runner$.doRunRunRunDaDoRunRun(Runner.scala:1308)
at
org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24(Runner.scala:993)
at
org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24$adapted(Runner.scala:971)
at
org.scalatest.tools.Runner$.withClassLoaderAndDispatchReporter(Runner.scala:1474)
at
org.scalatest.tools.Runner$.runOptionallyWithPassFailReporter(Runner.scala:971)
at org.scalatest.tools.Runner$.run(Runner.scala:798)
at org.scalatest.tools.Runner.run(Runner.scala)
at
org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.runScalaTest2or3(ScalaTestRunner.java:43)
at
org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.main(ScalaTestRunner.java:26)
Caused by: org.apache.spark.SparkException: Encountered error while reading
file
file:/private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet.
Details:
at
org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:877)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:307)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at
org.apache.spark.util.random.SamplingUtils$.reservoirSampleAndCount(SamplingUtils.scala:41)
at org.apache.spark.RangePartitioner$.$anonfun$sketch$1(Partitioner.scala:322)
at
org.apache.spark.RangePartitioner$.$anonfun$sketch$1$adapted(Partitioner.scala:320)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:908)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:908)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
at org.apache.spark.scheduler.Task.run(Task.scala:139)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
at scala.collection.immutable.StringLike.split(StringLike.scala:260)
at scala.collection.immutable.StringLike.split$(StringLike.scala:259)
at scala.collection.immutable.StringOps.split(StringOps.scala:33)
at
org.apache.spark.sql.HoodieUnsafeRowUtils$.composeNestedFieldPath(HoodieUnsafeRowUtils.scala:97)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:105)
at
org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:103)
at
java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
at
org.apache.spark.sql.HoodieInternalRowUtils$.getCachedPosList(HoodieInternalRowUtils.scala:103)
at
org.apache.spark.sql.HoodieInternalRowUtils.getCachedPosList(HoodieInternalRowUtils.scala)
at
org.apache.hudi.common.model.HoodieSparkRecord.getOrderingValue(HoodieSparkRecord.java:319)
at
org.apache.hudi.DefaultSparkRecordMerger.partialMerge(DefaultSparkRecordMerger.java:113)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.merge(HoodieBaseFileGroupRecordBuffer.java:395)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNextBaseRecord(HoodieBaseFileGroupRecordBuffer.java:504)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.hasNextBaseRecord(HoodieKeyBasedFileGroupRecordBuffer.java:130)
at
org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.doHasNext(HoodieKeyBasedFileGroupRecordBuffer.java:139)
at
org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNext(HoodieBaseFileGroupRecordBuffer.java:154)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader.hasNext(HoodieFileGroupReader.java:248)
at
org.apache.hudi.common.table.read.HoodieFileGroupReader$HoodieFileGroupReaderIterator.hasNext(HoodieFileGroupReader.java:319)
at
org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat$$anon$1.hasNext(HoodieFileGroupReaderBasedParquetFileFormat.scala:272)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:297)
... 21 more
```
testConfigs.foreach { case (tableType, sparkSqlOptimizedWrites) =>
log.info(s"=== Testing MergeInto with partial insert: tableType=$tableType, sparkSqlOptimizedWrites=$sparkSqlOptimizedWrites ===")
withRecordType()(withTempDir { tmp =>
spark.sql("set hoodie.payload.combined.schema.validate = true")
// Create a partitioned table
val tableName = generateTableName
spark.sql(
s"""
| create table $tableName (
| id bigint,
| name string,
| price double,
| ts bigint,
| dt string
| ) using hudi
| tblproperties (
| type = '$tableType',
| primaryKey = 'id',
| precombineKey = 'ts'
| )
| partitioned by(dt)
| location '${tmp.getCanonicalPath}'
""".stripMargin)
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1L, '2021-03-21'")
// Set optimized sql merge setting
spark.sql(s"set ${SPARK_SQL_OPTIMIZED_WRITES.key()}=$sparkSqlOptimizedWrites")
spark.sql(
s"""
| merge into $tableName as t0
| using (
| select 2 as id, 'a2' as name, 10 as price, 2L as ts, '2021-03-20' as dt
| union
| select 1 as id, 'a1_updated' as name, 11 as price, 3L as ts, '2021-03-21' as dt
| ) s0
| on s0.id = t0.id
| when matched then update set t0.name = s0.name, t0.price = s0.price
| when not matched and s0.id % 2 = 0 then insert (id, name, dt)
| values(s0.id, s0.name, s0.dt)
""".stripMargin)
checkAnswer(s"select id, name, price, dt from $tableName order by id")( // <===== NPE is thrown here
Seq(1, "a1_updated", 11, "2021-03-21"),
Seq(2, "a2", null, "2021-03-20")
)
})
}
```
> MIT partial update NPE when not setting primary key column
> ----------------------------------------------------------
>
> Key: HUDI-8834
> URL: https://issues.apache.org/jira/browse/HUDI-8834
> Project: Apache Hudi
> Issue Type: Bug
> Reporter: Davis Zhang
> Priority: Major
>
> This occurs only for MOR tables; COW tables work fine.
> Job aborted due to stage failure: Task 0 in stage 40.0 failed 1 times, most
> recent failure: Lost task 0.0 in stage 40.0 (TID 52) (192.168.1.109 executor
> driver): org.apache.spark.SparkException: Encountered error while reading
> file
> [file:/private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet|file:///private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet].
> Details:
> at
> org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:877)
> at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:307)
> at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
> at
> org.apache.spark.util.random.SamplingUtils$.reservoirSampleAndCount(SamplingUtils.scala:41)
> at org.apache.spark.RangePartitioner$.$anonfun$sketch$1(Partitioner.scala:322)
> at
> org.apache.spark.RangePartitioner$.$anonfun$sketch$1$adapted(Partitioner.scala:320)
> at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:908)
> at
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:908)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
> at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
> at org.apache.spark.scheduler.Task.run(Task.scala:139)
> at
> org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:750)
> Caused by: java.lang.NullPointerException
> at scala.collection.immutable.StringLike.split(StringLike.scala:260)
> at scala.collection.immutable.StringLike.split$(StringLike.scala:259)
> at scala.collection.immutable.StringOps.split(StringOps.scala:33)
> at
> org.apache.spark.sql.HoodieUnsafeRowUtils$.composeNestedFieldPath(HoodieUnsafeRowUtils.scala:97)
> at
> org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:105)
> at
> org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:103)
> at
> java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
> at
> org.apache.spark.sql.HoodieInternalRowUtils$.getCachedPosList(HoodieInternalRowUtils.scala:103)
> at
> org.apache.spark.sql.HoodieInternalRowUtils.getCachedPosList(HoodieInternalRowUtils.scala)
> at
> org.apache.hudi.common.model.HoodieSparkRecord.getOrderingValue(HoodieSparkRecord.java:319)
> at
> org.apache.hudi.DefaultSparkRecordMerger.partialMerge(DefaultSparkRecordMerger.java:113)
> at
> org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.merge(HoodieBaseFileGroupRecordBuffer.java:395)
> at
> org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNextBaseRecord(HoodieBaseFileGroupRecordBuffer.java:504)
> at
> org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.hasNextBaseRecord(HoodieKeyBasedFileGroupRecordBuffer.java:130)
> at
> org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.doHasNext(HoodieKeyBasedFileGroupRecordBuffer.java:139)
> at
> org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNext(HoodieBaseFileGroupRecordBuffer.java:154)
> at
> org.apache.hudi.common.table.read.HoodieFileGroupReader.hasNext(HoodieFileGroupReader.java:248)
> at
> org.apache.hudi.common.table.read.HoodieFileGroupReader$HoodieFileGroupReaderIterator.hasNext(HoodieFileGroupReader.java:319)
> at
> org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat$$anon$1.hasNext(HoodieFileGroupReaderBasedParquetFileFormat.scala:272)
> at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
> at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:297)
> ... 21 more
>
> Driver stacktrace:
> at
> org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2790)
> at
> org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2726)
> at
> org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2725)
> at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
> at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2725)
> at
> org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1211)
> at
> org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1211)
> at scala.Option.foreach(Option.scala:407)
> at
> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1211)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2989)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2928)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2917)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:976)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
> at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1022)
> at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
> at org.apache.spark.rdd.RDD.collect(RDD.scala:1021)
> at org.apache.spark.RangePartitioner$.sketch(Partitioner.scala:320)
> at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:187)
> at
> org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.prepareShuffleDependency(ShuffleExchangeExec.scala:290)
> at
> org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:173)
> at
> org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:167)
> at
> org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture$lzycompute(ShuffleExchangeExec.scala:143)
> at
> org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture(ShuffleExchangeExec.scala:139)
> at
> org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.$anonfun$submitShuffleJob$1(ShuffleExchangeExec.scala:68)
> at
> org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
> at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
> at
> org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob(ShuffleExchangeExec.scala:68)
> at
> org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob$(ShuffleExchangeExec.scala:67)
> at
> org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.submitShuffleJob(ShuffleExchangeExec.scala:115)
> at
> org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture$lzycompute(QueryStageExec.scala:181)
> at
> org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture(QueryStageExec.scala:181)
> at
> org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.doMaterialize(QueryStageExec.scala:183)
> at
> org.apache.spark.sql.execution.adaptive.QueryStageExec.materialize(QueryStageExec.scala:82)
> at
> org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$5(AdaptiveSparkPlanExec.scala:272)
> at
> org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$5$adapted(AdaptiveSparkPlanExec.scala:270)
> at scala.collection.Iterator.foreach(Iterator.scala:943)
> at scala.collection.Iterator.foreach$(Iterator.scala:943)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
> at scala.collection.IterableLike.foreach(IterableLike.scala:74)
> at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
> at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
> at
> org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$1(AdaptiveSparkPlanExec.scala:270)
> at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
> at
> org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.getFinalPhysicalPlan(AdaptiveSparkPlanExec.scala:242)
> at
> org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:387)
> at
> org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:360)
> at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4218)
> at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:3459)
> at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4208)
> at
> org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
> at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4206)
> at
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
> at
> org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
> at
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
> at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
> at
> org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
> at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4206)
> at org.apache.spark.sql.Dataset.collect(Dataset.scala:3459)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.checkAnswer(HoodieSparkSqlTestBase.scala:126)
> at
> org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$62(TestMergeIntoTable.scala:1395)
> at
> org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$62$adapted(TestMergeIntoTable.scala:1353)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.withTempDir(HoodieSparkSqlTestBase.scala:92)
> at
> org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$61(TestMergeIntoTable.scala:1353)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$withRecordType$3(HoodieSparkSqlTestBase.scala:310)
> at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.withSQLConf(HoodieSparkSqlTestBase.scala:282)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$withRecordType$1(HoodieSparkSqlTestBase.scala:309)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$withRecordType$1$adapted(HoodieSparkSqlTestBase.scala:301)
> at scala.collection.immutable.List.foreach(List.scala:431)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.withRecordType(HoodieSparkSqlTestBase.scala:301)
> at
> org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$60(TestMergeIntoTable.scala:1353)
> at
> org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$60$adapted(TestMergeIntoTable.scala:1351)
> at scala.collection.immutable.List.foreach(List.scala:431)
> at
> org.apache.spark.sql.hudi.dml.TestMergeIntoTable.$anonfun$new$59(TestMergeIntoTable.scala:1351)
> at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.$anonfun$test$1(HoodieSparkSqlTestBase.scala:104)
> at org.scalatest.OutcomeOf.outcomeOf(OutcomeOf.scala:85)
> at org.scalatest.OutcomeOf.outcomeOf$(OutcomeOf.scala:83)
> at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
> at org.scalatest.Transformer.apply(Transformer.scala:22)
> at org.scalatest.Transformer.apply(Transformer.scala:20)
> at
> org.scalatest.funsuite.AnyFunSuiteLike$$anon$1.apply(AnyFunSuiteLike.scala:189)
> at org.scalatest.TestSuite.withFixture(TestSuite.scala:196)
> at org.scalatest.TestSuite.withFixture$(TestSuite.scala:195)
> at org.scalatest.funsuite.AnyFunSuite.withFixture(AnyFunSuite.scala:1562)
> at
> org.scalatest.funsuite.AnyFunSuiteLike.invokeWithFixture$1(AnyFunSuiteLike.scala:187)
> at
> org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTest$1(AnyFunSuiteLike.scala:199)
> at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306)
> at org.scalatest.funsuite.AnyFunSuiteLike.runTest(AnyFunSuiteLike.scala:199)
> at org.scalatest.funsuite.AnyFunSuiteLike.runTest$(AnyFunSuiteLike.scala:181)
> at org.scalatest.funsuite.AnyFunSuite.runTest(AnyFunSuite.scala:1562)
> at
> org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTests$1(AnyFunSuiteLike.scala:232)
> at org.scalatest.SuperEngine.$anonfun$runTestsInBranch$1(Engine.scala:413)
> at scala.collection.immutable.List.foreach(List.scala:431)
> at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
> at org.scalatest.SuperEngine.runTestsInBranch(Engine.scala:396)
> at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:475)
> at org.scalatest.funsuite.AnyFunSuiteLike.runTests(AnyFunSuiteLike.scala:232)
> at org.scalatest.funsuite.AnyFunSuiteLike.runTests$(AnyFunSuiteLike.scala:231)
> at org.scalatest.funsuite.AnyFunSuite.runTests(AnyFunSuite.scala:1562)
> at org.scalatest.Suite.run(Suite.scala:1112)
> at org.scalatest.Suite.run$(Suite.scala:1094)
> at
> org.scalatest.funsuite.AnyFunSuite.org$scalatest$funsuite$AnyFunSuiteLike$$super$run(AnyFunSuite.scala:1562)
> at
> org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$run$1(AnyFunSuiteLike.scala:236)
> at org.scalatest.SuperEngine.runImpl(Engine.scala:535)
> at org.scalatest.funsuite.AnyFunSuiteLike.run(AnyFunSuiteLike.scala:236)
> at org.scalatest.funsuite.AnyFunSuiteLike.run$(AnyFunSuiteLike.scala:235)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.org$scalatest$BeforeAndAfterAll$$super$run(HoodieSparkSqlTestBase.scala:48)
> at org.scalatest.BeforeAndAfterAll.liftedTree1$1(BeforeAndAfterAll.scala:213)
> at org.scalatest.BeforeAndAfterAll.run(BeforeAndAfterAll.scala:210)
> at org.scalatest.BeforeAndAfterAll.run$(BeforeAndAfterAll.scala:208)
> at
> org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.run(HoodieSparkSqlTestBase.scala:48)
> at org.scalatest.tools.SuiteRunner.run(SuiteRunner.scala:45)
> at
> org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13(Runner.scala:1314)
> at
> org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13$adapted(Runner.scala:1308)
> at scala.collection.immutable.List.foreach(List.scala:431)
> at org.scalatest.tools.Runner$.doRunRunRunDaDoRunRun(Runner.scala:1308)
> at
> org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24(Runner.scala:993)
> at
> org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24$adapted(Runner.scala:971)
> at
> org.scalatest.tools.Runner$.withClassLoaderAndDispatchReporter(Runner.scala:1474)
> at
> org.scalatest.tools.Runner$.runOptionallyWithPassFailReporter(Runner.scala:971)
> at org.scalatest.tools.Runner$.run(Runner.scala:798)
> at org.scalatest.tools.Runner.run(Runner.scala)
> at
> org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.runScalaTest2or3(ScalaTestRunner.java:43)
> at
> org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.main(ScalaTestRunner.java:26)
> Caused by: org.apache.spark.SparkException: Encountered error while reading
> file
> [file:/private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet|file:///private/var/folders/sl/gfxz9xjx57ddcttsjcthbnm40000gn/T/spark-8fa9230c-30d5-4d21-b7a7-b4bec8f63faf/dt=2021-03-21/97de7071-1bb7-43a3-b582-2ce8fe91e1a1-0_0-11-13_20250106205456348.parquet].
> Details:
> at
> org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:877)
> at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:307)
> at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
> at
> org.apache.spark.util.random.SamplingUtils$.reservoirSampleAndCount(SamplingUtils.scala:41)
> at org.apache.spark.RangePartitioner$.$anonfun$sketch$1(Partitioner.scala:322)
> at
> org.apache.spark.RangePartitioner$.$anonfun$sketch$1$adapted(Partitioner.scala:320)
> at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:908)
> at
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:908)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
> at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
> at org.apache.spark.scheduler.Task.run(Task.scala:139)
> at
> org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:750)
> Caused by: java.lang.NullPointerException
> at scala.collection.immutable.StringLike.split(StringLike.scala:260)
> at scala.collection.immutable.StringLike.split$(StringLike.scala:259)
> at scala.collection.immutable.StringOps.split(StringOps.scala:33)
> at
> org.apache.spark.sql.HoodieUnsafeRowUtils$.composeNestedFieldPath(HoodieUnsafeRowUtils.scala:97)
> at
> org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:105)
> at
> org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:103)
> at
> java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
> at
> org.apache.spark.sql.HoodieInternalRowUtils$.getCachedPosList(HoodieInternalRowUtils.scala:103)
> at
> org.apache.spark.sql.HoodieInternalRowUtils.getCachedPosList(HoodieInternalRowUtils.scala)
> at
> org.apache.hudi.common.model.HoodieSparkRecord.getOrderingValue(HoodieSparkRecord.java:319)
> at
> org.apache.hudi.DefaultSparkRecordMerger.partialMerge(DefaultSparkRecordMerger.java:113)
> at
> org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.merge(HoodieBaseFileGroupRecordBuffer.java:395)
> at
> org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNextBaseRecord(HoodieBaseFileGroupRecordBuffer.java:504)
> at
> org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.hasNextBaseRecord(HoodieKeyBasedFileGroupRecordBuffer.java:130)
> at
> org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.doHasNext(HoodieKeyBasedFileGroupRecordBuffer.java:139)
> at
> org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNext(HoodieBaseFileGroupRecordBuffer.java:154)
> at
> org.apache.hudi.common.table.read.HoodieFileGroupReader.hasNext(HoodieFileGroupReader.java:248)
> at
> org.apache.hudi.common.table.read.HoodieFileGroupReader$HoodieFileGroupReaderIterator.hasNext(HoodieFileGroupReader.java:319)
> at
> org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat$$anon$1.hasNext(HoodieFileGroupReaderBasedParquetFileFormat.scala:272)
> at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
> at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:297)
> ... 21 more
>
> ```
> testConfigs.foreach { case (tableType, sparkSqlOptimizedWrites) =>
> log.info(s"=== Testing MergeInto with partial insert: tableType=$tableType,
> sparkSqlOptimizedWrites=$sparkSqlOptimizedWrites ===")
> withRecordType()(withTempDir { tmp =>
> spark.sql("set hoodie.payload.combined.schema.validate = true")
> // Create a partitioned table
> val tableName = generateTableName
> spark.sql(
>   s"""
>      |create table $tableName (
>      |  id bigint,
>      |  name string,
>      |  price double,
>      |  ts bigint,
>      |  dt string
>      |) using hudi
>      |tblproperties (
>      |  type = '$tableType',
>      |  primaryKey = 'id',
>      |  precombineKey = 'ts'
>      |)
>      |partitioned by(dt)
>      |location '${tmp.getCanonicalPath}'
>      """.stripMargin)
> spark.sql(s"insert into $tableName select 1, 'a1', 10, 1L, '2021-03-21'")
> // Set optimized sql merge setting
> spark.sql(s"set ${SPARK_SQL_OPTIMIZED_WRITES.key()}=$sparkSqlOptimizedWrites")
> spark.sql(
>   s"""
>      |merge into $tableName as t0
>      |using (
>      |  select 2 as id, 'a2' as name, 10 as price, 2L as ts, '2021-03-20' as dt
>      |  union
>      |  select 1 as id, 'a1_updated' as name, 11 as price, 3L as ts, '2021-03-21' as dt
>      |) s0
>      |on s0.id = t0.id
>      |when matched then update set t0.name = s0.name, t0.price = s0.price
>      |when not matched and s0.id % 2 = 0 then insert (id, name, dt)
>      |values(s0.id, s0.name, s0.dt)
>      """.stripMargin)
> checkAnswer(s"select id, name, price, dt from $tableName order by id")(  <===== NPE is thrown
>   Seq(1, "a1_updated", 11, "2021-03-21"),
>   Seq(2, "a2", null, "2021-03-20")
> )
> })
> }
> ```
--
This message was sent by Atlassian Jira
(v8.20.10#820010)