caishunfeng commented on a change in pull request #7540:
URL: https://github.com/apache/dolphinscheduler/pull/7540#discussion_r773619657
##########
File path:
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/worker/WorkerServer.java
##########
@@ -146,14 +149,14 @@ public void run() {
try {
this.workerRegistryClient.registry();
this.workerRegistryClient.setRegistryStoppable(this);
- Set<String> workerZkPaths =
this.workerRegistryClient.getWorkerZkPaths();
-
- this.workerRegistryClient.handleDeadServer(workerZkPaths,
NodeType.WORKER, Constants.DELETE_OP);
} catch (Exception e) {
- logger.error(e.getMessage(), e);
+ logger.error("worker registry error", e);
throw new RuntimeException(e);
}
+ // solve dead lock
+
logger.info(org.apache.dolphinscheduler.spi.utils.PropertyUtils.dumpProperties());
Review comment:
I don't understand how this `logger.info` call resolves the deadlock. Could you explain?
##########
File path:
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterRegistryClient.java
##########
@@ -150,37 +151,76 @@ public void closeRegistry() {
}
/**
- * remove zookeeper node path
+ * remove master node path
*
- * @param path zookeeper node path
- * @param nodeType zookeeper node type
+ * @param path node path
+ * @param nodeType node type
* @param failover is failover
*/
- public void removeNodePath(String path, NodeType nodeType, boolean
failover) {
+ public void removeMasterNodePath(String path, NodeType nodeType, boolean
failover) {
logger.info("{} node deleted : {}", nodeType, path);
- String failoverPath = getFailoverLockPath(nodeType);
+
+ if (StringUtils.isEmpty(path)) {
+ logger.error("server down error: empty path: {}, nodeType:{}",
path, nodeType);
+ return;
+ }
+
+ String serverHost = registryClient.getHostByEventDataPath(path);
+ if (StringUtils.isEmpty(serverHost)) {
+ logger.error("server down error: unknown path: {}, nodeType:{}",
path, nodeType);
+ return;
+ }
+
+ String failoverPath = getFailoverLockPath(nodeType, serverHost);
try {
registryClient.getLock(failoverPath);
+ if (!registryClient.exists(path)) {
+ logger.info("path: {} not exists", path);
+ // handle dead server
+ registryClient.handleDeadServer(Collections.singleton(path),
nodeType, Constants.ADD_OP);
+ }
+
+ //failover server
+ if (failover) {
Review comment:
The failover logic will be skipped if the master can't acquire the ZK lock.
##########
File path:
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterRegistryClient.java
##########
@@ -330,13 +435,21 @@ private void failoverMaster(String masterHost) {
continue;
}
+ if (serverStartupTime != null && processInstance.getRestartTime()
!= null
+ &&
processInstance.getRestartTime().after(serverStartupTime)) {
+ continue;
+ }
+
logger.info("failover process instance id: {}",
processInstance.getId());
List<TaskInstance> validTaskInstanceList =
processService.findValidTaskListByProcessId(processInstance.getId());
for (TaskInstance taskInstance : validTaskInstanceList) {
if (Constants.NULL.equals(taskInstance.getHost())) {
continue;
}
+ if (taskInstance.getState().typeIsFinished()) {
+ continue;
+ }
logger.info("failover task instance id: {}, process instance
id: {}", taskInstance.getId(), taskInstance.getProcessInstanceId());
failoverTaskInstance(processInstance, taskInstance);
Review comment:
We need to check the task instance with `checkTaskInstanceNeedFailover` before
calling `failoverTaskInstance`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]