[ https://issues.apache.org/jira/browse/TRAFODION-2668?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16063633#comment-16063633 ]
ASF GitHub Bot commented on TRAFODION-2668: ------------------------------------------- Github user selvaganesang commented on a diff in the pull request: https://github.com/apache/incubator-trafodion/pull/1144#discussion_r124095950 --- Diff: dcs/src/main/java/org/trafodion/dcs/server/ServerManager.java --- @@ -414,32 +431,40 @@ public Boolean call() throws Exception { // finish if (f != null) { Integer result = f.get(); + LOG.debug("Server handler [" + instance + ":" + result + "] finished"); int childInstance = result.intValue(); // get the node id - boolean isRunning = serverHandlers[childInstance-1].serverMonitor.isPidRunning(); - String nid = serverHandlers[childInstance-1].serverMonitor.nid; - String pid = serverHandlers[childInstance-1].serverMonitor.pid; - serverHandlers[childInstance-1] = null; - LOG.debug("Server handler [" + instance + ":" + result - + "] finished, restarting"); - if (isRunning) - LOG.info("mxosrvr " + nid + "," + pid + " still running"); - else - LOG.info("mxosrvr " + nid + "," + pid + " exited, restarting"); + boolean isRunning = serverHandlers[childInstance - 1].serverMonitor.monitor(); + String nid = serverHandlers[childInstance - 1].serverMonitor.nid; + String pid = serverHandlers[childInstance - 1].serverMonitor.pid; + int restartAttempts = serverHandlers[childInstance - 1].getRestartAttempts(); + + serverHandlers[childInstance - 1] = null; retryCounter = retryCounterFactory.create(); while (!isTrafodionRunning(nid)) { - if (!retryCounter.shouldRetry()) { - throw new IOException("Node " + nid + " is not Up"); - } else { - retryCounter.sleepUntilNextRetry(); - retryCounter.useRetry(); - } - } - serverHandlers[childInstance-1] = new ServerHandler(childInstance); - completionService.submit(serverHandlers[childInstance-1]); + if (!retryCounter.shouldRetry()) { --- End diff -- I think you need to either fix retryCounterFactory to do the looping for the retry count, or get rid of retryCounter completely and use the restartAttempts variable. 
For the latter case, line 445 shouldn't be using the retryCounter.shouldRetry() method. How was this change unit tested? > one mxosrvr can't startup, dcsServer down > ----------------------------------------- > > Key: TRAFODION-2668 > URL: https://issues.apache.org/jira/browse/TRAFODION-2668 > Project: Apache Trafodion > Issue Type: Bug > Reporter: mashengchen > Assignee: mashengchen > > if one of the mxosrvr processes has an error while dcsserver is starting, dcsserver > will go down -- This message was sent by Atlassian JIRA (v6.4.14#64029)