Repository: oozie
Updated Branches:
  refs/heads/master 1bbc2decf -> e7c642c1c


OOZIE-1847 HA - Oozie servers should shutdown (or go in safe mode) in case of 
ZK failure


Project: http://git-wip-us.apache.org/repos/asf/oozie/repo
Commit: http://git-wip-us.apache.org/repos/asf/oozie/commit/e7c642c1
Tree: http://git-wip-us.apache.org/repos/asf/oozie/tree/e7c642c1
Diff: http://git-wip-us.apache.org/repos/asf/oozie/diff/e7c642c1

Branch: refs/heads/master
Commit: e7c642c1cf3b5ab4e7f20f99f78edcc69e7d3931
Parents: 1bbc2de
Author: Purshotam Shah <[email protected]>
Authored: Tue Sep 9 15:00:22 2014 -0700
Committer: Purshotam Shah <[email protected]>
Committed: Tue Sep 9 15:00:22 2014 -0700

----------------------------------------------------------------------
 .../event/listener/ZKConnectionListener.java    | 62 ++++++++++++++++++++
 .../java/org/apache/oozie/util/ZKUtils.java     | 26 +++++++-
 core/src/main/resources/oozie-default.xml       |  9 +++
 release-log.txt                                 |  1 +
 4 files changed, 97 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/oozie/blob/e7c642c1/core/src/main/java/org/apache/oozie/event/listener/ZKConnectionListener.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/oozie/event/listener/ZKConnectionListener.java 
b/core/src/main/java/org/apache/oozie/event/listener/ZKConnectionListener.java
new file mode 100644
index 0000000..1406b6a
--- /dev/null
+++ 
b/core/src/main/java/org/apache/oozie/event/listener/ZKConnectionListener.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oozie.event.listener;
+
+import org.apache.curator.framework.CuratorFramework;
+import org.apache.curator.framework.state.ConnectionState;
+import org.apache.curator.framework.state.ConnectionStateListener;
+import org.apache.oozie.service.Services;
+import org.apache.oozie.util.XLog;
+import org.apache.oozie.util.ZKUtils;
+
+/**
+ * ZKConnectionListener listens on ZK connection status.
+ */
+public class ZKConnectionListener implements ConnectionStateListener {
+
+    private XLog LOG = XLog.getLog(getClass());
+
+    public ZKConnectionListener() {
+        LOG.info("ZKConnectionListener started");
+    }
+
+    @Override
+    public void stateChanged(final CuratorFramework client, final 
ConnectionState newState) {
+        LOG.trace("ZK connection status  = " + newState.toString());
+//        if (newState == ConnectionState.CONNECTED) {
+//             ZK connected
+//        }
+        if (newState == ConnectionState.SUSPENDED) {
+            LOG.warn("ZK connection is suspended, waiting for reconnect. If 
connection doesn't reconnect before "
+                    + ZKUtils.getZKConnectionTimeout() + " Oozie server will 
shutdown itself");
+        }
+
+        if (newState == ConnectionState.RECONNECTED) {
+            // ZK connected is reconnected.
+            LOG.warn("ZK connection is reestablished");
+        }
+
+        if (newState == ConnectionState.LOST) {
+            LOG.fatal("ZK is connection is not reconnected in " + 
ZKUtils.getZKConnectionTimeout()
+                    + ", shutting down Oozie server");
+            Services.get().destroy();
+            System.exit(1);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/oozie/blob/e7c642c1/core/src/main/java/org/apache/oozie/util/ZKUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/oozie/util/ZKUtils.java 
b/core/src/main/java/org/apache/oozie/util/ZKUtils.java
index b69e758..ec32092 100644
--- a/core/src/main/java/org/apache/oozie/util/ZKUtils.java
+++ b/core/src/main/java/org/apache/oozie/util/ZKUtils.java
@@ -19,6 +19,7 @@
 package org.apache.oozie.util;
 
 import com.google.common.annotations.VisibleForTesting;
+
 import java.io.IOException;
 import java.util.Collections;
 import java.util.HashMap;
@@ -26,7 +27,9 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+
 import javax.security.auth.login.Configuration;
+
 import org.apache.curator.RetryPolicy;
 import org.apache.curator.framework.CuratorFramework;
 import org.apache.curator.framework.CuratorFrameworkFactory;
@@ -40,8 +43,11 @@ import 
org.apache.curator.x.discovery.ServiceDiscoveryBuilder;
 import org.apache.curator.x.discovery.ServiceInstance;
 import org.apache.curator.x.discovery.details.InstanceSerializer;
 import org.apache.oozie.ErrorCode;
+
 import static org.apache.oozie.service.HadoopAccessorService.KERBEROS_KEYTAB;
 import static 
org.apache.oozie.service.HadoopAccessorService.KERBEROS_PRINCIPAL;
+
+import org.apache.oozie.event.listener.ZKConnectionListener;
 import org.apache.oozie.service.ServiceException;
 import org.apache.oozie.service.Services;
 import org.apache.zookeeper.ZooDefs.Perms;
@@ -50,7 +56,6 @@ import org.apache.zookeeper.data.ACL;
 import org.apache.zookeeper.data.Id;
 import org.apache.zookeeper.data.Stat;
 
-
 /**
  * This class provides a singleton for interacting with ZooKeeper that other 
classes can use.  It handles connecting to ZooKeeper,
  * service discovery, and publishing metadata about this server.
@@ -72,6 +77,8 @@ import org.apache.zookeeper.data.Stat;
  * Oozie's existing security configuration parameters (b) use/convert every 
znode under the namespace (including the namespace
  * itself) to have ACLs such that only Oozie servers have access (i.e. if 
"service/host@REALM" is the Kerberos principal, then
  * "service" will be used for the ACLs).
+ * <p>
+ * The Oozie server will shut itself down if the ZK connection is lost for
+ * longer than the {@link #ZK_CONNECTION_TIMEOUT} setting (in seconds).
  */
 public class ZKUtils {
     /**
@@ -86,6 +93,11 @@ public class ZKUtils {
     public static final String ZK_NAMESPACE = "oozie.zookeeper.namespace";
 
     /**
+     * Configuration key for the ZK connection timeout (in sec). If the
+     * connection is lost for longer than this timeout, the Oozie server will
+     * shut itself down.
+     */
+    public static final String ZK_CONNECTION_TIMEOUT = 
"oozie.zookeeper.connection.timeout";
+
+    /**
      * oozie-env environment variable for specifying the Oozie instance ID
      */
     public static final String OOZIE_INSTANCE_ID = "oozie.instance.id";
@@ -111,6 +123,7 @@ public class ZKUtils {
     private XLog log;
 
     private static ZKUtils zk = null;
+    private static int zkConnectionTimeout;
 
     /**
      * Private Constructor for the singleton; it connects to ZooKeeper and 
advertises this Oozie Server.
@@ -160,6 +173,8 @@ public class ZKUtils {
         RetryPolicy retryPolicy = ZKUtils.getRetryPloicy();
         String zkConnectionString = 
Services.get().getConf().get(ZK_CONNECTION_STRING, "localhost:2181");
         String zkNamespace = Services.get().getConf().get(ZK_NAMESPACE, 
"oozie");
+        zkConnectionTimeout = 
Services.get().getConf().getInt(ZK_CONNECTION_TIMEOUT, 180);
+
         ACLProvider aclProvider;
         if (Services.get().getConf().getBoolean(ZK_SECURE, false)) {
             log.info("Connecting to ZooKeeper with SASL/Kerberos and using 
'sasl' ACLs");
@@ -177,8 +192,10 @@ public class ZKUtils {
                                             .connectString(zkConnectionString)
                                             .retryPolicy(retryPolicy)
                                             .aclProvider(aclProvider)
+                                            
.connectionTimeoutMs(zkConnectionTimeout * 1000) // in ms
                                             .build();
         client.start();
+        client.getConnectionStateListenable().addListener(new 
ZKConnectionListener());
     }
 
     private void advertiseService() throws Exception {
@@ -388,4 +405,11 @@ public class ZKUtils {
     public static RetryPolicy getRetryPloicy() {
         return new ExponentialBackoffRetry(1000, 3);
     }
+    /**
+     * Returns the ZK connection timeout, in seconds. The value is read from
+     * the {@link #ZK_CONNECTION_TIMEOUT} property (default 180) when the
+     * ZooKeeper client is created.
+     *
+     * @return the ZK connection timeout in seconds
+     */
+    public static int getZKConnectionTimeout(){
+        return zkConnectionTimeout;
+    }
 }

http://git-wip-us.apache.org/repos/asf/oozie/blob/e7c642c1/core/src/main/resources/oozie-default.xml
----------------------------------------------------------------------
diff --git a/core/src/main/resources/oozie-default.xml 
b/core/src/main/resources/oozie-default.xml
index a5ccd3c..b931d52 100644
--- a/core/src/main/resources/oozie-default.xml
+++ b/core/src/main/resources/oozie-default.xml
@@ -2075,6 +2075,15 @@
         </description>
     </property>
 
+    <property>
+        <name>oozie.zookeeper.connection.timeout</name>
+        <value>180</value>
+        <description>
+        Default ZK connection timeout (in sec). If the connection is lost for
+        longer than this timeout, the Oozie server will shut itself down.
+        </description>
+    </property>
+
 <!--
     <property>
         <name>oozie.instance.id</name>

http://git-wip-us.apache.org/repos/asf/oozie/blob/e7c642c1/release-log.txt
----------------------------------------------------------------------
diff --git a/release-log.txt b/release-log.txt
index e73eb75..b3bbb2f 100644
--- a/release-log.txt
+++ b/release-log.txt
@@ -1,5 +1,6 @@
 -- Oozie 4.2.0 release (trunk - unreleased)
 
+OOZIE-1847 HA - Oozie servers should shutdown (or go in safe mode) in case of 
ZK failure (puru)
 OOZIE-1957 Coord update command override group when 
oozie.service.AuthorizationService.default.group.as.acl is set and group/acl is 
not configured in job property (puru)
 OOZIE-1818 CoordMaterializeTransitionXCommand verifyPrecondition doesn't 
verify current time (puru)
 OOZIE-1653 Support ALL to allowed error code of the user retry (seoeun25 via 
rkanter)

Reply via email to