[ 
https://issues.apache.org/jira/browse/TRAFODION-2668?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16064113#comment-16064113
 ] 

ASF GitHub Bot commented on TRAFODION-2668:
-------------------------------------------

Github user kevinxu021 commented on a diff in the pull request:

    
https://github.com/apache/incubator-trafodion/pull/1144#discussion_r124162405
  
    --- Diff: dcs/src/main/java/org/trafodion/dcs/server/ServerManager.java ---
    @@ -414,32 +431,40 @@ public Boolean call() throws Exception {
                                                                  // finish
                     if (f != null) {
                         Integer result = f.get();
    +                    LOG.debug("Server handler [" + instance + ":" + result 
+ "] finished");
                         int childInstance = result.intValue();
                         // get the node id
    -                    boolean isRunning = 
serverHandlers[childInstance-1].serverMonitor.isPidRunning();
    -                    String nid = 
serverHandlers[childInstance-1].serverMonitor.nid;
    -                    String pid = 
serverHandlers[childInstance-1].serverMonitor.pid; 
    -                    serverHandlers[childInstance-1] = null;
    -                    LOG.debug("Server handler [" + instance + ":" + result
    -                            + "] finished, restarting");
    -                    if (isRunning)
    -                        LOG.info("mxosrvr " + nid + "," + pid + " still 
running");
    -                    else
    -                        LOG.info("mxosrvr " + nid + "," + pid + " exited, 
restarting");
    +                    boolean isRunning = serverHandlers[childInstance - 
1].serverMonitor.monitor();
    +                    String nid = serverHandlers[childInstance - 
1].serverMonitor.nid;
    +                    String pid = serverHandlers[childInstance - 
1].serverMonitor.pid;
    +                    int restartAttempts = serverHandlers[childInstance - 
1].getRestartAttempts();
    +
    +                    serverHandlers[childInstance - 1] = null;
                         retryCounter = retryCounterFactory.create();
                         while (!isTrafodionRunning(nid)) {
    -                       if (!retryCounter.shouldRetry()) {
    -                          throw new IOException("Node " + nid + " is not 
Up");
    -                       } else {
    -                           retryCounter.sleepUntilNextRetry();
    -                           retryCounter.useRetry();
    -                       }
    -                   }
    -                   serverHandlers[childInstance-1] = new 
ServerHandler(childInstance);
    -                   
completionService.submit(serverHandlers[childInstance-1]);
    +                        if (!retryCounter.shouldRetry()) {
    +                            throw new IOException("Node " + nid + " is not 
Up");
    +                        } else {
    +                            retryCounter.sleepUntilNextRetry();
    +                            retryCounter.useRetry();
    +                        }
    +                    }
    +                    if (isRunning) {
    +                        restartAttempts = this.restartAttempts;
    +                        LOG.info("mxosrvr " + nid + "," + pid + " still 
running");
    +                    }
    +                    else {
    +                        restartAttempts--;
    +                        LOG.info("mxosrvr " + nid + "," + pid + " exited, 
restarting, restart time : " + restartAttempts);
    +                    }
    +                    if (restartAttempts > 0) {
    --- End diff --
    
    Merge the logic with line 444


> one mxosrvr can't startup, dcsServer down
> -----------------------------------------
>
>                 Key: TRAFODION-2668
>                 URL: https://issues.apache.org/jira/browse/TRAFODION-2668
>             Project: Apache Trafodion
>          Issue Type: Bug
>            Reporter: mashengchen
>            Assignee: mashengchen
>
> If any one of the mxosrvr processes hits an error while the dcsserver is 
> starting, the dcsserver goes down.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to