Repository: oozie Updated Branches: refs/heads/master 1bbc2decf -> e7c642c1c
OOZIE-1847 HA - Oozie servers should shutdown (or go in safe mode) in case of ZK failure Project: http://git-wip-us.apache.org/repos/asf/oozie/repo Commit: http://git-wip-us.apache.org/repos/asf/oozie/commit/e7c642c1 Tree: http://git-wip-us.apache.org/repos/asf/oozie/tree/e7c642c1 Diff: http://git-wip-us.apache.org/repos/asf/oozie/diff/e7c642c1 Branch: refs/heads/master Commit: e7c642c1cf3b5ab4e7f20f99f78edcc69e7d3931 Parents: 1bbc2de Author: Purshotam Shah <[email protected]> Authored: Tue Sep 9 15:00:22 2014 -0700 Committer: Purshotam Shah <[email protected]> Committed: Tue Sep 9 15:00:22 2014 -0700 ---------------------------------------------------------------------- .../event/listener/ZKConnectionListener.java | 62 ++++++++++++++++++++ .../java/org/apache/oozie/util/ZKUtils.java | 26 +++++++- core/src/main/resources/oozie-default.xml | 9 +++ release-log.txt | 1 + 4 files changed, 97 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/oozie/blob/e7c642c1/core/src/main/java/org/apache/oozie/event/listener/ZKConnectionListener.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/oozie/event/listener/ZKConnectionListener.java b/core/src/main/java/org/apache/oozie/event/listener/ZKConnectionListener.java new file mode 100644 index 0000000..1406b6a --- /dev/null +++ b/core/src/main/java/org/apache/oozie/event/listener/ZKConnectionListener.java @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.oozie.event.listener; + +import org.apache.curator.framework.CuratorFramework; +import org.apache.curator.framework.state.ConnectionState; +import org.apache.curator.framework.state.ConnectionStateListener; +import org.apache.oozie.service.Services; +import org.apache.oozie.util.XLog; +import org.apache.oozie.util.ZKUtils; + +/** + * ZKConnectionListener listens on ZK connection status. + */ +public class ZKConnectionListener implements ConnectionStateListener { + + private XLog LOG = XLog.getLog(getClass()); + + public ZKConnectionListener() { + LOG.info("ZKConnectionListener started"); + } + + @Override + public void stateChanged(final CuratorFramework client, final ConnectionState newState) { + LOG.trace("ZK connection status = " + newState.toString()); +// if (newState == ConnectionState.CONNECTED) { +// ZK connected +// } + if (newState == ConnectionState.SUSPENDED) { + LOG.warn("ZK connection is suspended, waiting for reconnect. If connection doesn't reconnect before " + + ZKUtils.getZKConnectionTimeout() + " Oozie server will shutdown itself"); + } + + if (newState == ConnectionState.RECONNECTED) { + // ZK connected is reconnected. + LOG.warn("ZK connection is reestablished"); + } + + if (newState == ConnectionState.LOST) { + LOG.fatal("ZK is connection is not reconnected in " + ZKUtils.getZKConnectionTimeout() + + ", shutting down Oozie server"); + Services.get().destroy(); + System.exit(1); + } + } +} http://git-wip-us.apache.org/repos/asf/oozie/blob/e7c642c1/core/src/main/java/org/apache/oozie/util/ZKUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/oozie/util/ZKUtils.java b/core/src/main/java/org/apache/oozie/util/ZKUtils.java index b69e758..ec32092 100644 --- a/core/src/main/java/org/apache/oozie/util/ZKUtils.java +++ b/core/src/main/java/org/apache/oozie/util/ZKUtils.java @@ -19,6 +19,7 @@ package org.apache.oozie.util; import com.google.common.annotations.VisibleForTesting; + import java.io.IOException; import java.util.Collections; import java.util.HashMap; @@ -26,7 +27,9 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; + import javax.security.auth.login.Configuration; + import org.apache.curator.RetryPolicy; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.CuratorFrameworkFactory; @@ -40,8 +43,11 @@ import org.apache.curator.x.discovery.ServiceDiscoveryBuilder; import org.apache.curator.x.discovery.ServiceInstance; import org.apache.curator.x.discovery.details.InstanceSerializer; import org.apache.oozie.ErrorCode; + import static org.apache.oozie.service.HadoopAccessorService.KERBEROS_KEYTAB; import static org.apache.oozie.service.HadoopAccessorService.KERBEROS_PRINCIPAL; + +import org.apache.oozie.event.listener.ZKConnectionListener; import org.apache.oozie.service.ServiceException; import org.apache.oozie.service.Services; import org.apache.zookeeper.ZooDefs.Perms; @@ -50,7 +56,6 @@ import org.apache.zookeeper.data.ACL; import org.apache.zookeeper.data.Id; import org.apache.zookeeper.data.Stat; - /** * This class provides a singleton for interacting with ZooKeeper that other classes can use. It handles connecting to ZooKeeper, * service discovery, and publishing metadata about this server. @@ -72,6 +77,8 @@ import org.apache.zookeeper.data.Stat; * Oozie's existing security configuration parameters (b) use/convert every znode under the namespace (including the namespace * itself) to have ACLs such that only Oozie servers have access (i.e. if "service/host@REALM" is the Kerberos principal, then * "service" will be used for the ACLs). + * <p> + * Oozie server will shutdown itself if ZK connection is lost for ${ZK_CONNECTION_TIMEOUT}. */ public class ZKUtils { /** @@ -86,6 +93,11 @@ public class ZKUtils { public static final String ZK_NAMESPACE = "oozie.zookeeper.namespace"; /** + *Default ZK connection timeout ( in sec). If connection is lost for more than timeout, then Oozie server will shutdown itself. + */ + public static final String ZK_CONNECTION_TIMEOUT = "oozie.zookeeper.connection.timeout"; + + /** * oozie-env environment variable for specifying the Oozie instance ID */ public static final String OOZIE_INSTANCE_ID = "oozie.instance.id"; @@ -111,6 +123,7 @@ public class ZKUtils { private XLog log; private static ZKUtils zk = null; + private static int zkConnectionTimeout; /** * Private Constructor for the singleton; it connects to ZooKeeper and advertises this Oozie Server. @@ -160,6 +173,8 @@ public class ZKUtils { RetryPolicy retryPolicy = ZKUtils.getRetryPloicy(); String zkConnectionString = Services.get().getConf().get(ZK_CONNECTION_STRING, "localhost:2181"); String zkNamespace = Services.get().getConf().get(ZK_NAMESPACE, "oozie"); + zkConnectionTimeout = Services.get().getConf().getInt(ZK_CONNECTION_TIMEOUT, 180); + ACLProvider aclProvider; if (Services.get().getConf().getBoolean(ZK_SECURE, false)) { log.info("Connecting to ZooKeeper with SASL/Kerberos and using 'sasl' ACLs"); @@ -177,8 +192,10 @@ public class ZKUtils { .connectString(zkConnectionString) .retryPolicy(retryPolicy) .aclProvider(aclProvider) + .connectionTimeoutMs(zkConnectionTimeout * 1000) // in ms .build(); client.start(); + client.getConnectionStateListenable().addListener(new ZKConnectionListener()); } private void advertiseService() throws Exception { @@ -388,4 +405,11 @@ public class ZKUtils { public static RetryPolicy getRetryPloicy() { return new ExponentialBackoffRetry(1000, 3); } + /** + * Return ZK connection timeout + * @return + */ + public static int getZKConnectionTimeout(){ + return zkConnectionTimeout; + } } http://git-wip-us.apache.org/repos/asf/oozie/blob/e7c642c1/core/src/main/resources/oozie-default.xml ---------------------------------------------------------------------- diff --git a/core/src/main/resources/oozie-default.xml b/core/src/main/resources/oozie-default.xml index a5ccd3c..b931d52 100644 --- a/core/src/main/resources/oozie-default.xml +++ b/core/src/main/resources/oozie-default.xml @@ -2075,6 +2075,15 @@ </description> </property> + <property> + <name>oozie.zookeeper.connection.timeout</name> + <value>180</value> + <description> + Default ZK connection timeout (in sec). If connection is lost for more than timeout, then Oozie server will shutdown + itself. + </description> + </property> + <!-- <property> <name>oozie.instance.id</name> http://git-wip-us.apache.org/repos/asf/oozie/blob/e7c642c1/release-log.txt ---------------------------------------------------------------------- diff --git a/release-log.txt b/release-log.txt index e73eb75..b3bbb2f 100644 --- a/release-log.txt +++ b/release-log.txt @@ -1,5 +1,6 @@ -- Oozie 4.2.0 release (trunk - unreleased) +OOZIE-1847 HA - Oozie servers should shutdown (or go in safe mode) in case of ZK failure (puru) OOZIE-1957 Coord update command override group when oozie.service.AuthorizationService.default.group.as.acl is set and group/acl is not configured in job property (puru) OOZIE-1818 CoordMaterializeTransitionXCommand verifyPrecondition doesn't verify current time (puru) OOZIE-1653 Support ALL to allowed error code of the user retry (seoeun25 via rkanter)
