[ https://issues.apache.org/jira/browse/TRAFODION-2668?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16064113#comment-16064113 ]
ASF GitHub Bot commented on TRAFODION-2668: ------------------------------------------- Github user kevinxu021 commented on a diff in the pull request: https://github.com/apache/incubator-trafodion/pull/1144#discussion_r124162405 --- Diff: dcs/src/main/java/org/trafodion/dcs/server/ServerManager.java --- @@ -414,32 +431,40 @@ public Boolean call() throws Exception { // finish if (f != null) { Integer result = f.get(); + LOG.debug("Server handler [" + instance + ":" + result + "] finished"); int childInstance = result.intValue(); // get the node id - boolean isRunning = serverHandlers[childInstance-1].serverMonitor.isPidRunning(); - String nid = serverHandlers[childInstance-1].serverMonitor.nid; - String pid = serverHandlers[childInstance-1].serverMonitor.pid; - serverHandlers[childInstance-1] = null; - LOG.debug("Server handler [" + instance + ":" + result - + "] finished, restarting"); - if (isRunning) - LOG.info("mxosrvr " + nid + "," + pid + " still running"); - else - LOG.info("mxosrvr " + nid + "," + pid + " exited, restarting"); + boolean isRunning = serverHandlers[childInstance - 1].serverMonitor.monitor(); + String nid = serverHandlers[childInstance - 1].serverMonitor.nid; + String pid = serverHandlers[childInstance - 1].serverMonitor.pid; + int restartAttempts = serverHandlers[childInstance - 1].getRestartAttempts(); + + serverHandlers[childInstance - 1] = null; retryCounter = retryCounterFactory.create(); while (!isTrafodionRunning(nid)) { - if (!retryCounter.shouldRetry()) { - throw new IOException("Node " + nid + " is not Up"); - } else { - retryCounter.sleepUntilNextRetry(); - retryCounter.useRetry(); - } - } - serverHandlers[childInstance-1] = new ServerHandler(childInstance); - completionService.submit(serverHandlers[childInstance-1]); + if (!retryCounter.shouldRetry()) { + throw new IOException("Node " + nid + " is not Up"); + } else { + retryCounter.sleepUntilNextRetry(); + retryCounter.useRetry(); + } + } + if (isRunning) { + restartAttempts = 
this.restartAttempts; + LOG.info("mxosrvr " + nid + "," + pid + " still running"); + } + else { + restartAttempts--; + LOG.info("mxosrvr " + nid + "," + pid + " exited, restarting, restart time : " + restartAttempts); + } + if (restartAttempts > 0) { --- End diff -- Merge the logic with line 444 > one mxosrvr can't startup, dcsServer down > ----------------------------------------- > > Key: TRAFODION-2668 > URL: https://issues.apache.org/jira/browse/TRAFODION-2668 > Project: Apache Trafodion > Issue Type: Bug > Reporter: mashengchen > Assignee: mashengchen > > If one of the mxosrvr processes hits an error while dcsserver is starting, dcsserver > will go down -- This message was sent by Atlassian JIRA (v6.4.14#64029)