Repository: trafodion Updated Branches: refs/heads/master 4ec0da84b -> 9c59d7803
[TRAFODION-3164] Restart mxosrvrs on-demand This code adds a zookeeper value to signal to mxosrvr processes when it is time to exit. This allows processes to be phased out when things in the environment change. The process will wait until it is idle before exiting. Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/99da63e9 Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/99da63e9 Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/99da63e9 Branch: refs/heads/master Commit: 99da63e9cf156c7848a8613cb37fe768e368331f Parents: 7184f8f Author: Hans Zeller <[email protected]> Authored: Tue Jul 31 20:42:00 2018 +0000 Committer: Hans Zeller <[email protected]> Committed: Tue Jul 31 20:42:00 2018 +0000 ---------------------------------------------------------------------- core/conn/odbc/src/odbc/nsksrvr/SrvrMain.cpp | 40 +++++++++++++++++++- .../org/trafodion/dcs/master/DcsMaster.java | 6 ++- 2 files changed, 44 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/trafodion/blob/99da63e9/core/conn/odbc/src/odbc/nsksrvr/SrvrMain.cpp ---------------------------------------------------------------------- diff --git a/core/conn/odbc/src/odbc/nsksrvr/SrvrMain.cpp b/core/conn/odbc/src/odbc/nsksrvr/SrvrMain.cpp index 72350ec..c23406f 100644 --- a/core/conn/odbc/src/odbc/nsksrvr/SrvrMain.cpp +++ b/core/conn/odbc/src/odbc/nsksrvr/SrvrMain.cpp @@ -100,9 +100,11 @@ bool keepaliveStatus = false; int keepaliveIdletime; int keepaliveIntervaltime; int keepaliveRetrycount; +long epoch = -1; void watcher(zhandle_t *zzh, int type, int state, const char *path, void *watcherCtx); bool verifyPortAvailable(const char * idForPort, int portNumber); BOOL getInitParamSrvr(int argc, char *argv[], SRVR_INIT_PARAM_Def &initParam, char* strName, char* strValue); +long getEpoch(zhandle_t *zh); //only support positive number BOOL 
getNumberTemp( char* strValue, int& nValue ) @@ -651,6 +653,10 @@ catch(SB_Fatal_Excep sbfe) exit(1); } + // get the current epoch from zookeeper and also put a watch on it + // (to be even safer, take epoch as a command line arg) + epoch = getEpoch(zh); + //LCOV_EXCL_START // when a server dies, the MXOAS sends message to CFG. CFG creates the MXOSRVR process // and passess only one command line atribute: -SQL CLEANUP OBSOLETE VOLATILE TABLES @@ -988,6 +994,18 @@ void watcher(zhandle_t *zzh, int type, int state, const char *path, void *watche zh=0; } } + + if (type == ZOO_CHANGED_EVENT) { + string masterNode(zkRootNode); + + masterNode.append("/dcs/master"); + + if (masterNode.compare(path) == 0) { + if (getEpoch(zzh) != epoch) { + shutdownThisThing=1; + } + } + } } bool verifyPortAvailable(const char * idForPort, @@ -1536,4 +1554,24 @@ BOOL getInitParamSrvr(int argc, char *argv[], SRVR_INIT_PARAM_Def &initParam, ch } - +// The "epoch" is a time period between configuration changes in the +// system. When such a configuration change happens (e.g. the +// executable of the mxosrvr is replaced, or a system default is being +// changed), we want to stop all existing mxosrvrs once they become +// idle and replace them with new ones. Therefore, keep a watch on +// this value and exit when it changes and when our state is or +// becomes idle. 
+long getEpoch(zhandle_t *zh) { + char path[2000]; + char zkData[1000]; + int zkDataLen = sizeof(zkData); + int result = -1; + + snprintf(path, sizeof(path), "%s/dcs/master", zkRootNode); + int rc = zoo_get(zh, path, 1, zkData, &zkDataLen, NULL); + + if (rc == ZOK && zkDataLen > 0) + result = atol(zkData); + + return result; +} http://git-wip-us.apache.org/repos/asf/trafodion/blob/99da63e9/dcs/src/main/java/org/trafodion/dcs/master/DcsMaster.java ---------------------------------------------------------------------- diff --git a/dcs/src/main/java/org/trafodion/dcs/master/DcsMaster.java b/dcs/src/main/java/org/trafodion/dcs/master/DcsMaster.java index 719c3d3..4852fab 100644 --- a/dcs/src/main/java/org/trafodion/dcs/master/DcsMaster.java +++ b/dcs/src/main/java/org/trafodion/dcs/master/DcsMaster.java @@ -54,6 +54,7 @@ import org.apache.zookeeper.ZooKeeper.States; import org.apache.hadoop.util.StringUtils; import org.trafodion.dcs.Constants; +import org.trafodion.dcs.util.Bytes; import org.trafodion.dcs.util.DcsConfiguration; import org.trafodion.dcs.util.DcsNetworkConfiguration; import org.trafodion.dcs.util.InfoServer; @@ -85,6 +86,7 @@ public class DcsMaster implements Runnable { private JVMShutdownHook jvmShutdownHook; private static String trafodionHome; private CountDownLatch isLeader = new CountDownLatch(1); + private int epoch = 1; private MasterLeaderElection mle = null; @@ -162,9 +164,11 @@ public class DcsMaster implements Runnable { stat = zkc.exists(parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_MASTER, false); if (stat == null) { + byte[] data = Bytes.toBytes(Long.toString(epoch)); + zkc.create(parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_MASTER, - new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, + data, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } stat = zkc.exists(parentZnode
