[
https://issues.apache.org/jira/browse/HUDI-8405?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Y Ethan Guo updated HUDI-8405:
------------------------------
Fix Version/s: 1.1.0
> StreamWriteOperatorCoordinator receives an unexpected event for an older instant
> --------------------------------------------------------------------------------
>
> Key: HUDI-8405
> URL: https://issues.apache.org/jira/browse/HUDI-8405
> Project: Apache Hudi
> Issue Type: Bug
> Components: flink
> Affects Versions: 0.14.1
> Reporter: Zhenqiu Huang
> Priority: Major
> Fix For: 1.1.0
>
>
> 2024-10-14 18:32:06.983 [pool-42-thread-1] INFO
> org.apache.hudi.sink.StreamWriteOperatorCoordinator - Executor executes
> action [taking checkpoint 2761] success!
> 2024-10-14 18:32:07.244 [pool-42-thread-1] ERROR
> org.apache.hudi.sink.StreamWriteOperatorCoordinator - Executor executes
> action [handle write metadata event for instant 20241014183138281] error
> java.lang.IllegalStateException: Receive an unexpected event for instant
> 20241014183202321 from task 38
> at
> org.apache.hudi.common.util.ValidationUtils.checkState(ValidationUtils.java:76)
> at
> org.apache.hudi.sink.StreamWriteOperatorCoordinator.handleWriteMetaEvent(StreamWriteOperatorCoordinator.java:487)
> at
> org.apache.hudi.sink.StreamWriteOperatorCoordinator.lambda$handleEventFromOperator$4(StreamWriteOperatorCoordinator.java:294)
> at
> org.apache.hudi.sink.utils.NonThrownExecutor.lambda$wrapAction$0(NonThrownExecutor.java:130)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> 2024-10-14 18:32:07.253 [flink-pekko.actor.default-dispatcher-60] INFO
> o.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler - Restarting job.
> org.apache.flink.util.FlinkException: Global failure triggered by
> OperatorCoordinator for 'hoodie_append_write: default_database.testTable ->
> Sink: dummy' (operator 3b4fb0393e19db327344ee58eb55db3c).
> at
> org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder$LazyInitializedCoordinatorContext.failJob(OperatorCoordinatorHolder.java:624)
> at
> org.apache.hudi.sink.StreamWriteOperatorCoordinator.lambda$start$0(StreamWriteOperatorCoordinator.java:197)
> at
> org.apache.hudi.sink.utils.NonThrownExecutor.handleException(NonThrownExecutor.java:142)
> at
> org.apache.hudi.sink.utils.NonThrownExecutor.lambda$wrapAction$0(NonThrownExecutor.java:133)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.hudi.exception.HoodieException: Executor executes
> action [handle write metadata event for instant 20241014183138281] error
> ... 6 common frames omitted
> Caused by: java.lang.IllegalStateException: Receive an unexpected event for
> instant 20241014183202321 from task 38
> at
> org.apache.hudi.common.util.ValidationUtils.checkState(ValidationUtils.java:76)
> at
> org.apache.hudi.sink.StreamWriteOperatorCoordinator.handleWriteMetaEvent(StreamWriteOperatorCoordinator.java:487)
> at
> org.apache.hudi.sink.StreamWriteOperatorCoordinator.lambda$handleEventFromOperator$4(StreamWriteOperatorCoordinator.java:294)
> at
> org.apache.hudi.sink.utils.NonThrownExecutor.lambda$wrapAction$0(NonThrownExecutor.java:130)
> ... 3 common frames omitted
> 2024-10-14 18:32:07.254 [flink-pekko.actor.default-dispatcher-60] INFO
> org.apache.flink.runtime.executiongraph.ExecutionGraph - Job
> HooverJobMetadata (ff15c67583819159ab1ba9f1b3f6e13f) switched from state
> RUNNING to CANCELLING.
> 2024-10-14 18:32:07.256 [flink-pekko.actor.default-dispatcher-60] WARN
> org.apache.flink.runtime.checkpoint.CheckpointFailureManager - Failed to
> trigger or complete checkpoint 2761 for job ff15c67583819159ab1ba9f1b3f6e13f.
> (0 consecutive failed attempts so far)
> org.apache.flink.runtime.checkpoint.CheckpointException: Checkpoint
> Coordinator is suspending.
> at
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.stopCheckpointScheduler(CheckpointCoordinator.java:1976)
> at
> org.apache.flink.runtime.checkpoint.CheckpointCoordinatorDeActivator.jobStatusChanges(CheckpointCoordinatorDeActivator.java:46)
> at
> org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.notifyJobStatusChange(DefaultExecutionGraph.java:1589)
> at
> org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.transitionState(DefaultExecutionGraph.java:1155)
> at
> org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.transitionState(DefaultExecutionGraph.java:1127)
> at
> org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.cancel(DefaultExecutionGraph.java:955)
> at
> org.apache.flink.runtime.scheduler.adaptive.Restarting.<init>(Restarting.java:65)
> at
> org.apache.flink.runtime.scheduler.adaptive.Restarting$Factory.getState(Restarting.java:160)
> at
> org.apache.flink.runtime.scheduler.adaptive.Restarting$Factory.getState(Restarting.java:125)
> at
> org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.transitionToState(AdaptiveScheduler.java:1295)
> at
> org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.goToRestarting(AdaptiveScheduler.java:977)
> at
> org.apache.flink.runtime.scheduler.adaptive.FailureResultUtil.restartOrFail(FailureResultUtil.java:28)
> at
> org.apache.flink.runtime.scheduler.adaptive.Executing.onFailure(Executing.java:93)
> at
> org.apache.flink.runtime.scheduler.adaptive.StateWithExecutionGraph.handleGlobalFailure(StateWithExecutionGraph.java:357)
> at
> org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.handleGlobalFailure(AdaptiveScheduler.java:534)
> at
> org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder$LazyInitializedCoordinatorContext.lambda$failJob$0(OperatorCoordinatorHolder.java:642)
> at
> org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:451)
> at
> org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
> at
> org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:451)
> at
> org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:218)
> at
> org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85)
> at
> org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168)
> at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33)
> at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29)
> at scala.PartialFunction.applyOrElse(PartialFunction.scala:127)
> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126)
> at
> org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29)
> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175)
> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176)
> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176)
> at org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547)
> at org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545)
> at org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229)
> at org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590)
> at org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557)
> at org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280)
> at org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241)
> at org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253)
> at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
> at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
> at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
> at
> java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157)
--
This message was sent by Atlassian Jira
(v8.20.10#820010)