This is an automated email from the ASF dual-hosted git repository. zhongjiajie pushed a commit to branch 3.0.4-prepare in repository https://gitbox.apache.org/repos/asf/dolphinscheduler.git
commit bc1cf25f4d705efd5fe500a9b27691b1b83fb771 Author: sssqhai <[email protected]> AuthorDate: Fri Dec 16 19:55:02 2022 +0800 Solve the deadlock problem caused by queuing (#13191) * Solve the deadlock problem caused by queuing * Solve the deadlock problem caused by queuing * Solve the deadlock problem caused by queuing * Solve the deadlock problem caused by queuing,move the event to the tail by throwing a exception Co-authored-by: wfs <[email protected]> (cherry picked from commit 7a0a2c2a46a0224d0c9fb1f649c1f901d11e814b) --- ...ateHandler.java => StateEventHandleFailure.java} | 21 +++++++++------------ .../server/master/event/StateEventHandler.java | 5 +++-- .../master/event/TaskWaitTaskGroupStateHandler.java | 15 +++++++++++++-- .../master/runner/WorkflowExecuteRunnable.java | 8 ++++++++ 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandleFailure.java similarity index 59% copy from dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java copy to dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandleFailure.java index 9a3c59a949..5e757c7858 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandleFailure.java @@ -17,20 +17,17 @@ package org.apache.dolphinscheduler.server.master.event; -import org.apache.dolphinscheduler.common.enums.StateEventType; -import org.apache.dolphinscheduler.server.master.runner.WorkflowExecuteRunnable; - -import com.google.auto.service.AutoService; +/** + * This exception represent the exception can be recovered, 
when we get this exception, + * we will move the event to the tail of the queue. + */ +public class StateEventHandleFailure extends Exception { -@AutoService(StateEventHandler.class) -public class TaskWaitTaskGroupStateHandler implements StateEventHandler { - @Override - public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent) { - return workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent); + public StateEventHandleFailure(String message) { + super(message); } - @Override - public StateEventType getEventType() { - return StateEventType.WAIT_TASK_GROUP; + public StateEventHandleFailure(String message, Throwable throwable) { + super(message, throwable); } } diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java index 00808b2e29..377ea71f62 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java @@ -28,9 +28,10 @@ public interface StateEventHandler { * @param stateEvent given state event. * @throws StateEventHandleException this exception means it can be recovered. * @throws StateEventHandleError this exception means it cannot be recovered, so the event need to drop. + * @throws StateEventHandleFailure this exception means it can be recovered, the event will be moved to the tail of the queue. 
*/ - boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent) - throws StateEventHandleException, StateEventHandleError; + boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, + StateEvent stateEvent) throws StateEventHandleException, StateEventHandleError, StateEventHandleFailure; StateEventType getEventType(); } diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java index 9a3c59a949..bb0c7b8b70 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java @@ -20,13 +20,24 @@ package org.apache.dolphinscheduler.server.master.event; import org.apache.dolphinscheduler.common.enums.StateEventType; import org.apache.dolphinscheduler.server.master.runner.WorkflowExecuteRunnable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.google.auto.service.AutoService; @AutoService(StateEventHandler.class) public class TaskWaitTaskGroupStateHandler implements StateEventHandler { + + private static final Logger logger = LoggerFactory.getLogger(TaskWaitTaskGroupStateHandler.class); + @Override - public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent) { - return workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent); + public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, + StateEvent stateEvent) throws StateEventHandleFailure { + logger.info("Handle task instance wait task group event, taskInstanceId: {}", stateEvent.getTaskInstanceId()); + if (!workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent)) { + throw new 
StateEventHandleFailure("Task state event handle failed due to robing taskGroup resource failed"); + } + return true; } @Override diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java index 5c0fec018d..fda109e1f4 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java @@ -70,6 +70,7 @@ import org.apache.dolphinscheduler.server.master.dispatch.executor.NettyExecutor import org.apache.dolphinscheduler.server.master.event.StateEvent; import org.apache.dolphinscheduler.server.master.event.StateEventHandleError; import org.apache.dolphinscheduler.server.master.event.StateEventHandleException; +import org.apache.dolphinscheduler.server.master.event.StateEventHandleFailure; import org.apache.dolphinscheduler.server.master.event.StateEventHandler; import org.apache.dolphinscheduler.server.master.event.StateEventHandlerManager; import org.apache.dolphinscheduler.server.master.metrics.TaskMetrics; @@ -279,6 +280,13 @@ public class WorkflowExecuteRunnable implements Callable<WorkflowSubmitStatue> { stateEvent, stateEventHandleException); ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS); + } catch (StateEventHandleFailure stateEventHandleFailure) { + logger.error("State event handle failed, will move event to the tail: {}", + stateEvent, + stateEventHandleFailure); + this.stateEvents.remove(stateEvent); + this.stateEvents.offer(stateEvent); + ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS); } catch (Exception e) { // we catch the exception here, since if the state event handle failed, the state event will still keep in the stateEvents queue. 
logger.error("State event handle error, get a unknown exception, will retry this event: {}",
