Zhenqiu Huang created HUDI-8405:
-----------------------------------

             Summary: StreamWriteOperatorCoordinator receives an unexpected 
event for an older instant
                 Key: HUDI-8405
                 URL: https://issues.apache.org/jira/browse/HUDI-8405
             Project: Apache Hudi
          Issue Type: Bug
          Components: flink
    Affects Versions: 0.14.1
            Reporter: Zhenqiu Huang


2024-10-14 18:32:06.983 [pool-42-thread-1] INFO  
org.apache.hudi.sink.StreamWriteOperatorCoordinator  - Executor executes action 
[taking checkpoint 2761] success!
2024-10-14 18:32:07.244 [pool-42-thread-1] ERROR 
org.apache.hudi.sink.StreamWriteOperatorCoordinator  - Executor executes action 
[handle write metadata event for instant 20241014183138281] error
java.lang.IllegalStateException: Receive an unexpected event for instant 
20241014183202321 from task 38
at 
org.apache.hudi.common.util.ValidationUtils.checkState(ValidationUtils.java:76)
at 
org.apache.hudi.sink.StreamWriteOperatorCoordinator.handleWriteMetaEvent(StreamWriteOperatorCoordinator.java:487)
at 
org.apache.hudi.sink.StreamWriteOperatorCoordinator.lambda$handleEventFromOperator$4(StreamWriteOperatorCoordinator.java:294)
at 
org.apache.hudi.sink.utils.NonThrownExecutor.lambda$wrapAction$0(NonThrownExecutor.java:130)
at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2024-10-14 18:32:07.253 [flink-pekko.actor.default-dispatcher-60] INFO  
o.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler  - Restarting job.
org.apache.flink.util.FlinkException: Global failure triggered by 
OperatorCoordinator for 'hoodie_append_write: default_database.testTable -> 
Sink: dummy' (operator 3b4fb0393e19db327344ee58eb55db3c).
at 
org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder$LazyInitializedCoordinatorContext.failJob(OperatorCoordinatorHolder.java:624)
at 
org.apache.hudi.sink.StreamWriteOperatorCoordinator.lambda$start$0(StreamWriteOperatorCoordinator.java:197)
at 
org.apache.hudi.sink.utils.NonThrownExecutor.handleException(NonThrownExecutor.java:142)
at 
org.apache.hudi.sink.utils.NonThrownExecutor.lambda$wrapAction$0(NonThrownExecutor.java:133)
at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hudi.exception.HoodieException: Executor executes action 
[handle write metadata event for instant 20241014183138281] error
... 6 common frames omitted
Caused by: java.lang.IllegalStateException: Receive an unexpected event for 
instant 20241014183202321 from task 38
at 
org.apache.hudi.common.util.ValidationUtils.checkState(ValidationUtils.java:76)
at 
org.apache.hudi.sink.StreamWriteOperatorCoordinator.handleWriteMetaEvent(StreamWriteOperatorCoordinator.java:487)
at 
org.apache.hudi.sink.StreamWriteOperatorCoordinator.lambda$handleEventFromOperator$4(StreamWriteOperatorCoordinator.java:294)
at 
org.apache.hudi.sink.utils.NonThrownExecutor.lambda$wrapAction$0(NonThrownExecutor.java:130)
... 3 common frames omitted
2024-10-14 18:32:07.254 [flink-pekko.actor.default-dispatcher-60] INFO  
org.apache.flink.runtime.executiongraph.ExecutionGraph  - Job HooverJobMetadata 
(ff15c67583819159ab1ba9f1b3f6e13f) switched from state RUNNING to CANCELLING.
2024-10-14 18:32:07.256 [flink-pekko.actor.default-dispatcher-60] WARN  
org.apache.flink.runtime.checkpoint.CheckpointFailureManager  - Failed to 
trigger or complete checkpoint 2761 for job ff15c67583819159ab1ba9f1b3f6e13f. 
(0 consecutive failed attempts so far)
org.apache.flink.runtime.checkpoint.CheckpointException: Checkpoint Coordinator 
is suspending.
at 
org.apache.flink.runtime.checkpoint.CheckpointCoordinator.stopCheckpointScheduler(CheckpointCoordinator.java:1976)
at 
org.apache.flink.runtime.checkpoint.CheckpointCoordinatorDeActivator.jobStatusChanges(CheckpointCoordinatorDeActivator.java:46)
at 
org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.notifyJobStatusChange(DefaultExecutionGraph.java:1589)
at 
org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.transitionState(DefaultExecutionGraph.java:1155)
at 
org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.transitionState(DefaultExecutionGraph.java:1127)
at 
org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.cancel(DefaultExecutionGraph.java:955)
at 
org.apache.flink.runtime.scheduler.adaptive.Restarting.<init>(Restarting.java:65)
at 
org.apache.flink.runtime.scheduler.adaptive.Restarting$Factory.getState(Restarting.java:160)
at 
org.apache.flink.runtime.scheduler.adaptive.Restarting$Factory.getState(Restarting.java:125)
at 
org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.transitionToState(AdaptiveScheduler.java:1295)
at 
org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.goToRestarting(AdaptiveScheduler.java:977)
at 
org.apache.flink.runtime.scheduler.adaptive.FailureResultUtil.restartOrFail(FailureResultUtil.java:28)
at 
org.apache.flink.runtime.scheduler.adaptive.Executing.onFailure(Executing.java:93)
at 
org.apache.flink.runtime.scheduler.adaptive.StateWithExecutionGraph.handleGlobalFailure(StateWithExecutionGraph.java:357)
at 
org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.handleGlobalFailure(AdaptiveScheduler.java:534)
at 
org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder$LazyInitializedCoordinatorContext.lambda$failJob$0(OperatorCoordinatorHolder.java:642)
at 
org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:451)
at 
org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
at 
org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:451)
at 
org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:218)
at 
org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85)
at 
org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168)
at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33)
at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29)
at scala.PartialFunction.applyOrElse(PartialFunction.scala:127)
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126)
at 
org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176)
at org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547)
at org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545)
at org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229)
at org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590)
at org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557)
at org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280)
at org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241)
at org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253)
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157)





--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to