[GitHub] incubator-trafodion pull request #1344: TRAFODION-2844 add strategy to dcsse...
Github user kevinxu021 commented on a diff in the pull request: https://github.com/apache/incubator-trafodion/pull/1344#discussion_r157153197 --- Diff: dcs/src/main/java/org/trafodion/dcs/Constants.java --- @@ -201,6 +201,12 @@ /** Default value for user program restart handler retry interval millis */ public static final int DEFAULT_DCS_SERVER_USER_PROGRAM_RESTART_HANDLER_RETRY_INTERVAL_MILLIS = 5000; +/** Configuration key for user program restart handler retry interval millis */ +public static final String DCS_SERVER_USER_PROGRAM_RESTART_HANDLER_RETRY_INTERVAL_MINUTES = "dcs.server.user.program.restart.handler.retry.interval.minutes"; --- End diff -- I would suggest to be "dcs.server.user.program.restart.handler.retry.timeout.minutes" Add this new setting into dcs-default.xml ---
[GitHub] incubator-trafodion pull request #1344: TRAFODION-2844 add strategy to dcsse...
Github user kevinxu021 commented on a diff in the pull request: https://github.com/apache/incubator-trafodion/pull/1344#discussion_r157155029 --- Diff: dcs/src/main/java/org/trafodion/dcs/server/ServerManager.java --- @@ -242,7 +246,7 @@ private void cleanupZk() { zkc.delete(registeredPath, -1); } catch (Exception e) { e.printStackTrace(); -LOG.debug(e); +LOG.error(e); --- End diff -- LOG.error(e.getMessage(), e); ---
[GitHub] incubator-trafodion pull request #1344: TRAFODION-2844 add strategy to dcsse...
Github user kevinxu021 commented on a diff in the pull request: https://github.com/apache/incubator-trafodion/pull/1344#discussion_r157166199 --- Diff: dcs/src/main/java/org/trafodion/dcs/util/RetryCounter.java --- @@ -20,54 +20,79 @@ */ package org.trafodion.dcs.util; +import java.util.LinkedList; +import java.util.Queue; import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; public class RetryCounter { - private static final Log LOG = LogFactory.getLog(RetryCounter.class); - private final int maxRetries; - private int retriesRemaining; - private final int retryIntervalMillis; - private final TimeUnit timeUnit; +private static final Log LOG = LogFactory.getLog(RetryCounter.class); +private final int maxRetries; +private int retriesRemaining; +private int retryInterval; +private Queue queue; - public RetryCounter(int maxRetries, - int retryIntervalMillis, TimeUnit timeUnit) { -this.maxRetries = maxRetries; -this.retriesRemaining = maxRetries; -this.retryIntervalMillis = retryIntervalMillis; -this.timeUnit = timeUnit; - } +public RetryCounter(int maxRetries, int retryInterval) { --- End diff -- Not very reasonable to remove last parameter. ---
[GitHub] incubator-trafodion pull request #1344: TRAFODION-2844 add strategy to dcsse...
Github user kevinxu021 commented on a diff in the pull request: https://github.com/apache/incubator-trafodion/pull/1344#discussion_r157156557 --- Diff: dcs/src/main/java/org/trafodion/dcs/server/ServerManager.java --- @@ -242,7 +246,7 @@ private void cleanupZk() { zkc.delete(registeredPath, -1); } catch (Exception e) { --- End diff -- Make the changes for the exception type which should be real exception instead of a general exception. ---
[GitHub] incubator-trafodion pull request #1344: TRAFODION-2844 add strategy to dcsse...
Github user hegdean commented on a diff in the pull request: https://github.com/apache/incubator-trafodion/pull/1344#discussion_r157074271 --- Diff: dcs/src/main/java/org/trafodion/dcs/server/ServerManager.java --- @@ -255,33 +259,17 @@ private void cleanupZk() { CountDownLatch startSignal = new CountDownLatch(1); RetryCounter retryCounter; -public void reset() { -startSignal.countDown(); -startSignal = new CountDownLatch(1); -boolean isRunning = this.serverMonitor.monitor(); -String nid = this.serverMonitor.nid; -String pid = this.serverMonitor.pid; - -if (isRunning) { -LOG.info("mxosrvr " + nid + "," + pid + " still running"); -this.retryCounter.resetAttemptTimes(); -} else { -LOG.info("mxosrvr " + nid + "," + pid + " exited, restarting, restart attempt time : " -+ this.retryCounter.getAttemptTimes()); -} -} - public ServerHandler(Configuration conf ,int childInstance) { int maxRestartAttempts = conf.getInt(Constants.DCS_SERVER_USER_PROGRAM_RESTART_HANDLER_ATTEMPTS, Constants.DEFAULT_DCS_SERVER_USER_PROGRAM_RESTART_HANDLER_ATTEMPTS); -int retryIntervalMillis = conf.getInt( - Constants.DCS_SERVER_USER_PROGRAM_RESTART_HANDLER_RETRY_INTERVAL_MILLIS, - Constants.DEFAULT_DCS_SERVER_USER_PROGRAM_RESTART_HANDLER_RETRY_INTERVAL_MILLIS); +int retryIntervalMinutes = conf.getInt( --- End diff -- Why did we change from millis to minutes. Millis is more granular and you can achieve mins from millis ---
[GitHub] incubator-trafodion pull request #1344: TRAFODION-2844 add strategy to dcsse...
GitHub user mashengchen opened a pull request: https://github.com/apache/incubator-trafodion/pull/1344 TRAFODION-2844 add strategy to dcsserver for restart moxsrvr when mxosrvr down ,dcsserver will restart it , and if mxosrvr down a lot of times in a period of time , there should reject the restart. if time between 6 times age(default setting) and this time are a very long time, dcsserver should allow the restart You can merge this pull request into a Git repository by running: $ git pull https://github.com/mashengchen/incubator-trafodion mxosrvrRestart Alternatively you can review and apply these changes as the patch at: https://github.com/apache/incubator-trafodion/pull/1344.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #1344 commit d41654fe70fd67740651da503e3c73e1b64e4168 Author: avenDate: 2017-12-14T10:35:54Z TRAFODION-2844 add strategy to dcsserver for restart moxsrvr ---