Repository: ambari
Updated Branches:
  refs/heads/feature-branch-AMBARI-21307 9ad286791 -> 12c50a909 (forced update)
AMBARI-21593 : AMS stopped after RU [AMS distributed mode with 2 collectors] (avijayan) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/c7b350b6 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/c7b350b6 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/c7b350b6 Branch: refs/heads/feature-branch-AMBARI-21307 Commit: c7b350b678b82bae1c0834744249cb534fed18f1 Parents: 2bab215 Author: Aravindan Vijayan <avija...@hortonworks.com> Authored: Mon Jul 31 14:30:27 2017 -0700 Committer: Aravindan Vijayan <avija...@hortonworks.com> Committed: Mon Jul 31 14:30:27 2017 -0700 ---------------------------------------------------------------------- .../MetricCollectorHAController.java | 42 +++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/c7b350b6/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java ---------------------------------------------------------------------- diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java index 53e6304..addb14e 100644 --- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java +++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java @@ 
-26,6 +26,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.MetricsSystemInitializationException; import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.TimelineMetricConfiguration; import org.apache.helix.HelixAdmin; +import org.apache.helix.HelixException; import org.apache.helix.HelixManager; import org.apache.helix.HelixManagerFactory; import org.apache.helix.InstanceType; @@ -123,20 +124,41 @@ public class MetricCollectorHAController { admin = new ZKHelixAdmin(zkConnectUrl); // create cluster LOG.info("Creating zookeeper cluster node: " + clusterName); - admin.addCluster(clusterName, false); + boolean clusterAdded = admin.addCluster(clusterName, false); + LOG.info("Was cluster added successfully? " + clusterAdded); // Adding host to the cluster - List<String> nodes = Collections.EMPTY_LIST; - try { - nodes = admin.getInstancesInCluster(clusterName); - } catch (ZkNoNodeException ex) { - LOG.warn("Child znode under /" + CLUSTER_NAME + " not found.Recreating the cluster."); - admin.addCluster(clusterName, true); + boolean success = false; + int tries = 5; + int sleepTimeInSeconds = 5; + + for (int i = 0; i < tries && !success; i++) { + try { + List<String> nodes = admin.getInstancesInCluster(clusterName); + if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) { + LOG.info("Adding participant instance " + instanceConfig); + admin.addInstance(clusterName, instanceConfig); + success = true; + } + } catch (HelixException | ZkNoNodeException ex) { + LOG.warn("Helix Cluster not yet setup fully."); + if (i < tries - 1) { + LOG.info("Waiting for " + sleepTimeInSeconds + " seconds and retrying."); + TimeUnit.SECONDS.sleep(sleepTimeInSeconds); + } else { + LOG.error(ex); + } + } } - if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) { - LOG.info("Adding participant instance " + instanceConfig); - 
admin.addInstance(clusterName, instanceConfig); + if (!success) { + LOG.info("Trying to create " + clusterName + " again since waiting for the creation did not help."); + admin.addCluster(clusterName, true); + List<String> nodes = admin.getInstancesInCluster(clusterName); + if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) { + LOG.info("Adding participant instance " + instanceConfig); + admin.addInstance(clusterName, instanceConfig); + } } // Add a state model