This is an automated email from the ASF dual-hosted git repository.
kturner pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/fluo-muchos.git
The following commit(s) were added to refs/heads/master by this push:
new e14a10d Support (optional) HA capabilities for Hadoop & Accumulo
(#284)
e14a10d is described below
commit e14a10de69f9c011fecde91590be3e6d96886cbe
Author: Karthick Narendran <[email protected]>
AuthorDate: Tue Sep 24 14:10:21 2019 +0100
Support (optional) HA capabilities for Hadoop & Accumulo (#284)
---
README.md | 17 +++
ansible/accumulo.yml | 2 +-
ansible/hadoop-ha.yml | 44 ++++++++
ansible/roles/accumulo/templates/gc | 4 +-
ansible/roles/accumulo/templates/masters | 4 +-
ansible/roles/accumulo/templates/monitor | 4 +-
ansible/roles/accumulo/templates/tracers | 4 +-
ansible/roles/azure/tasks/create_vmss.yml | 31 ++++++
ansible/roles/hadoop-ha/tasks/format-nn.yml | 22 ++++
ansible/roles/hadoop-ha/tasks/format-zk.yml | 25 +++++
ansible/roles/hadoop-ha/tasks/main.yml | 49 +++++++++
ansible/roles/hadoop-ha/tasks/start-dn.yml | 25 +++++
ansible/roles/hadoop-ha/tasks/start-journal.yml | 26 +++++
ansible/roles/hadoop-ha/tasks/start-nn1.yml | 25 +++++
ansible/roles/hadoop-ha/tasks/start-nn2.yml | 28 +++++
ansible/roles/hadoop-ha/tasks/start-yarn.yml | 28 +++++
ansible/roles/hadoop-ha/tasks/start-zkfc.yml | 25 +++++
.../{hadoop => hadoop-ha}/templates/core-site.xml | 6 +-
ansible/roles/hadoop-ha/templates/hdfs-site.xml | 117 +++++++++++++++++++++
.../templates/mapred-site.xml | 4 +-
ansible/roles/hadoop-ha/templates/slaves | 3 +
ansible/roles/hadoop-ha/templates/workers | 3 +
.../{hadoop => hadoop-ha}/templates/yarn-site.xml | 15 ++-
ansible/roles/hadoop/templates/core-site.xml | 2 +
ansible/roles/hadoop/templates/hdfs-site.xml | 2 +
ansible/roles/hadoop/templates/mapred-site.xml | 2 +
ansible/roles/hadoop/templates/yarn-site.xml | 2 +
conf/muchos.props.example | 6 +-
lib/muchos/azure.py | 1 +
lib/muchos/config.py | 11 +-
lib/muchos/existing.py | 19 +++-
31 files changed, 539 insertions(+), 17 deletions(-)
diff --git a/README.md b/README.md
index d124b1f..57645f4 100644
--- a/README.md
+++ b/README.md
@@ -280,6 +280,21 @@ master, etc. It also has variables in the `[all:vars]`
section that contain sett
useful in user playbooks. It is recommended that any user-defined Ansible
playbooks should be
managed in their own git repository (see [mikewalch/muchos-custom][mc] for an
example).
+## High-Availability (optional)
+
+Additionally, Muchos can be configured to provide High-Availability for HDFS &
Accumulo components. By default,
+this feature is off; however, it can be turned on by editing the following
settings in [muchos.props]
+under the `general` section as shown below:
+
+```ini
+hdfs_ha = True # default is False
+nameservice_id = muchoshacluster # Logical name for the cluster, no
special characters
+```
+
+Before enabling HA, it is strongly recommended you read the Apache doc for
[HDFS HA] & [Accumulo HA]
+
+Also in the `[nodes]` section of [muchos.props], ensure the `journalnode` and
`zkfc` services are configured to run.
+
## Terminating your cluster
If you launched your cluster, run the following command to terminate your
cluster. WARNING - All
@@ -360,3 +375,5 @@ The following command runs the unit tests:
[Docker swarm]: https://docs.docker.com/engine/swarm/
[Portainer]: https://github.com/portainer/portainer
[checksums]: conf/checksums
+[HDFS HA]:
https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithQJM.html
+[Accumulo HA]:
https://accumulo.apache.org/1.9/accumulo_user_manual.html#_components
diff --git a/ansible/accumulo.yml b/ansible/accumulo.yml
index 82f27ec..2af9d67 100644
--- a/ansible/accumulo.yml
+++ b/ansible/accumulo.yml
@@ -22,7 +22,7 @@
- hosts: all:!{{ azure_proxy_host }}
roles:
- accumulo
-- hosts: accumulomaster
+- hosts: accumulomaster[0]
tasks:
- import_tasks: roles/accumulo/tasks/init-accumulo.yml
handlers:
diff --git a/ansible/hadoop-ha.yml b/ansible/hadoop-ha.yml
new file mode 100644
index 0000000..8fbd8d4
--- /dev/null
+++ b/ansible/hadoop-ha.yml
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+- hosts: all:!{{ azure_proxy_host }}
+ roles:
+ - hadoop-ha
+- hosts: journalnode
+ tasks:
+ - import_tasks: roles/hadoop-ha/tasks/start-journal.yml
+- hosts: namenode[0]
+ tasks:
+ - import_tasks: roles/hadoop-ha/tasks/format-nn.yml
+- hosts: namenode[0]
+ tasks:
+ - import_tasks: roles/hadoop-ha/tasks/format-zk.yml
+- hosts: namenode
+ tasks:
+ - import_tasks: roles/hadoop-ha/tasks/start-zkfc.yml
+- hosts: namenode[0]
+ tasks:
+ - import_tasks: roles/hadoop-ha/tasks/start-nn1.yml
+- hosts: namenode[1]
+ tasks:
+ - import_tasks: roles/hadoop-ha/tasks/start-nn2.yml
+- hosts: workers
+ tasks:
+ - import_tasks: roles/hadoop-ha/tasks/start-dn.yml
+- hosts: resourcemanager
+ tasks:
+ - import_tasks: roles/hadoop-ha/tasks/start-yarn.yml
diff --git a/ansible/roles/accumulo/templates/gc
b/ansible/roles/accumulo/templates/gc
index 3cd0bac..59f7b48 100644
--- a/ansible/roles/accumulo/templates/gc
+++ b/ansible/roles/accumulo/templates/gc
@@ -1 +1,3 @@
-{{ groups['accumulomaster'][0] }}
+{% for host in groups['accumulomaster'] %}
+{{ host }}
+{% endfor %}
diff --git a/ansible/roles/accumulo/templates/masters
b/ansible/roles/accumulo/templates/masters
index 3cd0bac..59f7b48 100644
--- a/ansible/roles/accumulo/templates/masters
+++ b/ansible/roles/accumulo/templates/masters
@@ -1 +1,3 @@
-{{ groups['accumulomaster'][0] }}
+{% for host in groups['accumulomaster'] %}
+{{ host }}
+{% endfor %}
diff --git a/ansible/roles/accumulo/templates/monitor
b/ansible/roles/accumulo/templates/monitor
index 3cd0bac..59f7b48 100644
--- a/ansible/roles/accumulo/templates/monitor
+++ b/ansible/roles/accumulo/templates/monitor
@@ -1 +1,3 @@
-{{ groups['accumulomaster'][0] }}
+{% for host in groups['accumulomaster'] %}
+{{ host }}
+{% endfor %}
diff --git a/ansible/roles/accumulo/templates/tracers
b/ansible/roles/accumulo/templates/tracers
index 3cd0bac..59f7b48 100644
--- a/ansible/roles/accumulo/templates/tracers
+++ b/ansible/roles/accumulo/templates/tracers
@@ -1 +1,3 @@
-{{ groups['accumulomaster'][0] }}
+{% for host in groups['accumulomaster'] %}
+{{ host }}
+{% endfor %}
diff --git a/ansible/roles/azure/tasks/create_vmss.yml
b/ansible/roles/azure/tasks/create_vmss.yml
index 1feb360..9a6eb21 100644
--- a/ansible/roles/azure/tasks/create_vmss.yml
+++ b/ansible/roles/azure/tasks/create_vmss.yml
@@ -214,18 +214,49 @@
path: "{{ deploy_path }}/conf/muchos.props"
line: "{{ item }} = namenode,resourcemanager,accumulomaster,zookeeper"
with_items: "{{ instances_dict | json_query('[0].value') }}"
+ when: not hdfs_ha|bool
- name: Assign metrics to the second node of the cluster
lineinfile:
path: "{{ deploy_path }}/conf/muchos.props"
line: "{{ item }} = metrics"
with_items: "{{ instances_dict | json_query('[1].value') }}"
+ when: not hdfs_ha|bool
- name: Add worker nodes to muchos.props
lineinfile:
path: "{{ deploy_path }}/conf/muchos.props"
line: "{{ item }} = worker"
with_items: "{{ instances_dict | json_query('[2:].value') }}"
+ when: not hdfs_ha|bool
+
+- name: Assign Accumulo master, HDFS HA components cluster roles to the first
node of the cluster
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ line: "{{ item }} =
namenode,resourcemanager,accumulomaster,zookeeper,journalnode,zkfc"
+ with_items: "{{ instances_dict | json_query('[0].value') }}"
+ when: hdfs_ha|bool
+
+- name: Assign Accumulo master, HDFS HA components cluster roles to the second
node of the cluster
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ line: "{{ item }} =
zookeeper,metrics,journalnode,namenode,zkfc,accumulomaster"
+ with_items: "{{ instances_dict | json_query('[1].value') }}"
+ when: hdfs_ha|bool
+
+- name: Assign HDFS HA components cluster roles to the third node of the
cluster
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ line: "{{ item }} = journalnode,zookeeper"
+ with_items: "{{ instances_dict | json_query('[2].value') }}"
+ when: hdfs_ha|bool
+
+- name: Add worker nodes to muchos.props
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ line: "{{ item }} = worker"
+ with_items: "{{ instances_dict | json_query('[3:].value') }}"
+ when: hdfs_ha|bool
- name: Change proxy hostname to azure proxy host in muchos.props
lineinfile:
diff --git a/ansible/roles/hadoop-ha/tasks/format-nn.yml
b/ansible/roles/hadoop-ha/tasks/format-nn.yml
new file mode 100644
index 0000000..1a28e02
--- /dev/null
+++ b/ansible/roles/hadoop-ha/tasks/format-nn.yml
@@ -0,0 +1,22 @@
+#
+### Licensed to the Apache Software Foundation (ASF) under one or more
+### contributor license agreements. See the NOTICE file distributed with
+### this work for additional information regarding copyright ownership.
+### The ASF licenses this file to You under the Apache License, Version 2.0
+### (the "License"); you may not use this file except in compliance with
+### the License. You may obtain a copy of the License at
+###
+### http://www.apache.org/licenses/LICENSE-2.0
+###
+### Unless required by applicable law or agreed to in writing, software
+### distributed under the License is distributed on an "AS IS" BASIS,
+### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+### See the License for the specific language governing permissions and
+### limitations under the License.
+###
+##
+##
+- name: "format namenode"
+ command: "nohup {{ hadoop_home }}/bin/hdfs namenode -format"
+ args:
+ creates: "{{ worker_data_dirs[0] }}/hadoop/name/current/fsimage_*"
diff --git a/ansible/roles/hadoop-ha/tasks/format-zk.yml
b/ansible/roles/hadoop-ha/tasks/format-zk.yml
new file mode 100644
index 0000000..f440a65
--- /dev/null
+++ b/ansible/roles/hadoop-ha/tasks/format-zk.yml
@@ -0,0 +1,25 @@
+#
+### Licensed to the Apache Software Foundation (ASF) under one or more
+### contributor license agreements. See the NOTICE file distributed with
+### this work for additional information regarding copyright ownership.
+### The ASF licenses this file to You under the Apache License, Version 2.0
+### (the "License"); you may not use this file except in compliance with
+### the License. You may obtain a copy of the License at
+###
+### http://www.apache.org/licenses/LICENSE-2.0
+###
+### Unless required by applicable law or agreed to in writing, software
+### distributed under the License is distributed on an "AS IS" BASIS,
+### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+### See the License for the specific language governing permissions and
+### limitations under the License.
+###
+#
+- name: Check if DFSZKFailoverController is running
+ shell: jps | grep "DFSZKFailoverController" | grep -v grep
+ ignore_errors: yes
+ changed_when: false
+ register: zkfc_status
+- name: "Initialize HA state in ZK"
+ command: "nohup {{ hadoop_home }}/bin/hdfs zkfc -formatZK -force"
+ when: zkfc_status.rc == 1
diff --git a/ansible/roles/hadoop-ha/tasks/main.yml
b/ansible/roles/hadoop-ha/tasks/main.yml
new file mode 100644
index 0000000..8bc03de
--- /dev/null
+++ b/ansible/roles/hadoop-ha/tasks/main.yml
@@ -0,0 +1,49 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+- name: "install hadoop tarball"
+ unarchive: src={{ tarballs_dir }}/{{ hadoop_tarball }} dest={{ install_dir
}} creates={{ hadoop_home }} copy=yes
+- name: "configure hadoop with templates"
+ template: src={{ item }} dest={{ hadoop_home }}/etc/hadoop/{{ item }}
+ with_items:
+ - core-site.xml
+ - hdfs-site.xml
+ - yarn-site.xml
+ - mapred-site.xml
+- name: "configure hadoop 2"
+ template: src={{ item }} dest={{ hadoop_home }}/etc/hadoop/{{ item }}
+ with_items:
+ - slaves
+ when: hadoop_major_version == '2'
+- name: "configure hadoop 3"
+ template: src={{ item }} dest={{ hadoop_home }}/etc/hadoop/{{ item }}
+ with_items:
+ - workers
+ when: hadoop_major_version == '3'
+- name: "copy spark yarn shuffle jar to hadoop lib"
+ command: cp {{ spark_home }}/yarn/spark-{{ spark_version }}-yarn-shuffle.jar
{{ hadoop_home }}/share/hadoop/yarn/lib/ creates={{ hadoop_home
}}/share/hadoop/yarn/lib/spark-{{ spark_version }}-yarn-shuffle.jar
+ when: "'spark' in groups"
+- name: "setup hadoop short circuit socket dir"
+ file: path=/var/lib/hadoop-hdfs state=directory owner={{ cluster_user }}
group={{ cluster_group }} mode=0755
+ become: yes
+- name: "Configure hadoop log dir"
+ replace:
+ path: "{{ hadoop_home }}/etc/hadoop/hadoop-env.sh"
+ regexp: '.*export\s+HADOOP_LOG_DIR.*'
+ replace: "export HADOOP_LOG_DIR={{ worker_data_dirs[0] }}/logs/hadoop"
+- name: "Create hadoop log dir"
+ file: path={{ worker_data_dirs[0] }}/logs/hadoop state=directory
diff --git a/ansible/roles/hadoop-ha/tasks/start-dn.yml
b/ansible/roles/hadoop-ha/tasks/start-dn.yml
new file mode 100644
index 0000000..0898199
--- /dev/null
+++ b/ansible/roles/hadoop-ha/tasks/start-dn.yml
@@ -0,0 +1,25 @@
+#
+### Licensed to the Apache Software Foundation (ASF) under one or more
+### contributor license agreements. See the NOTICE file distributed with
+### this work for additional information regarding copyright ownership.
+### The ASF licenses this file to You under the Apache License, Version 2.0
+### (the "License"); you may not use this file except in compliance with
+### the License. You may obtain a copy of the License at
+###
+### http://www.apache.org/licenses/LICENSE-2.0
+###
+### Unless required by applicable law or agreed to in writing, software
+### distributed under the License is distributed on an "AS IS" BASIS,
+### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+### See the License for the specific language governing permissions and
+### limitations under the License.
+###
+##
+- name: Check if DataNode is running
+ shell: jps | grep " DataNode" | grep -v grep
+ ignore_errors: yes
+ changed_when: false
+ register: datanode_status
+- name: "start datanodes"
+ command: "nohup {{ hadoop_home }}/sbin/hadoop-daemon.sh start datanode"
+ when: datanode_status.rc == 1
diff --git a/ansible/roles/hadoop-ha/tasks/start-journal.yml
b/ansible/roles/hadoop-ha/tasks/start-journal.yml
new file mode 100644
index 0000000..fcd8226
--- /dev/null
+++ b/ansible/roles/hadoop-ha/tasks/start-journal.yml
@@ -0,0 +1,26 @@
+#
+### Licensed to the Apache Software Foundation (ASF) under one or more
+### contributor license agreements. See the NOTICE file distributed with
+### this work for additional information regarding copyright ownership.
+### The ASF licenses this file to You under the Apache License, Version 2.0
+### (the "License"); you may not use this file except in compliance with
+### the License. You may obtain a copy of the License at
+###
+### http://www.apache.org/licenses/LICENSE-2.0
+###
+### Unless required by applicable law or agreed to in writing, software
+### distributed under the License is distributed on an "AS IS" BASIS,
+### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+### See the License for the specific language governing permissions and
+### limitations under the License.
+###
+
+- name: Check if JournalNode is running
+ shell: jps | grep "JournalNode" | grep -v grep
+ ignore_errors: yes
+ changed_when: false
+ register: journalnode_status
+
+- name: "start journalnode"
+ command: "nohup {{ hadoop_home }}/sbin/hadoop-daemon.sh start journalnode"
+ when: journalnode_status.rc == 1
diff --git a/ansible/roles/hadoop-ha/tasks/start-nn1.yml
b/ansible/roles/hadoop-ha/tasks/start-nn1.yml
new file mode 100644
index 0000000..e78e1ec
--- /dev/null
+++ b/ansible/roles/hadoop-ha/tasks/start-nn1.yml
@@ -0,0 +1,25 @@
+#
+### Licensed to the Apache Software Foundation (ASF) under one or more
+### contributor license agreements. See the NOTICE file distributed with
+### this work for additional information regarding copyright ownership.
+### The ASF licenses this file to You under the Apache License, Version 2.0
+### (the "License"); you may not use this file except in compliance with
+### the License. You may obtain a copy of the License at
+###
+### http://www.apache.org/licenses/LICENSE-2.0
+###
+### Unless required by applicable law or agreed to in writing, software
+### distributed under the License is distributed on an "AS IS" BASIS,
+### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+### See the License for the specific language governing permissions and
+### limitations under the License.
+###
+##
+- name: Check if NameNode is running
+ shell: jps | grep " NameNode" | grep -v grep
+ ignore_errors: yes
+ changed_when: false
+ register: namenode_status
+- name: "start namenode"
+ command: "nohup {{ hadoop_home }}/sbin/hadoop-daemon.sh start namenode"
+ when: namenode_status.rc == 1
diff --git a/ansible/roles/hadoop-ha/tasks/start-nn2.yml
b/ansible/roles/hadoop-ha/tasks/start-nn2.yml
new file mode 100644
index 0000000..98ba326
--- /dev/null
+++ b/ansible/roles/hadoop-ha/tasks/start-nn2.yml
@@ -0,0 +1,28 @@
+#
+### Licensed to the Apache Software Foundation (ASF) under one or more
+### contributor license agreements. See the NOTICE file distributed with
+### this work for additional information regarding copyright ownership.
+### The ASF licenses this file to You under the Apache License, Version 2.0
+### (the "License"); you may not use this file except in compliance with
+### the License. You may obtain a copy of the License at
+###
+### http://www.apache.org/licenses/LICENSE-2.0
+###
+### Unless required by applicable law or agreed to in writing, software
+### distributed under the License is distributed on an "AS IS" BASIS,
+### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+### See the License for the specific language governing permissions and
+### limitations under the License.
+###
+##
+- name: Check if NameNode is running
+ shell: jps | grep " NameNode" | grep -v grep
+ ignore_errors: yes
+ changed_when: false
+ register: namenode_status
+- name: "bootstrap standby"
+ command: "nohup {{ hadoop_home }}/bin/hdfs namenode -bootstrapStandby -force"
+ when: namenode_status.rc == 1
+- name: "start namenode"
+ command: "nohup {{ hadoop_home }}/sbin/hadoop-daemon.sh start namenode"
+ when: namenode_status.rc == 1
diff --git a/ansible/roles/hadoop-ha/tasks/start-yarn.yml
b/ansible/roles/hadoop-ha/tasks/start-yarn.yml
new file mode 100644
index 0000000..10c0c45
--- /dev/null
+++ b/ansible/roles/hadoop-ha/tasks/start-yarn.yml
@@ -0,0 +1,28 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+- name: "start hadoop yarn 2.x"
+ command: "{{ hadoop_home }}/sbin/start-yarn.sh"
+ register: start_yarn_result
+ changed_when: start_yarn_result.stdout | search("starting
(:?resource|node)manager")
+ when: hadoop_major_version == '2'
+- name: "start hadoop yarn 3.x"
+ command: "nohup {{ hadoop_home }}/sbin/start-yarn.sh"
+ register: start_yarn_result
+ changed_when: start_yarn_result.rc == 0
+ failed_when: start_yarn_result.rc >= 2
+ when: hadoop_major_version == '3'
diff --git a/ansible/roles/hadoop-ha/tasks/start-zkfc.yml
b/ansible/roles/hadoop-ha/tasks/start-zkfc.yml
new file mode 100644
index 0000000..f3d5957
--- /dev/null
+++ b/ansible/roles/hadoop-ha/tasks/start-zkfc.yml
@@ -0,0 +1,25 @@
+#
+### Licensed to the Apache Software Foundation (ASF) under one or more
+### contributor license agreements. See the NOTICE file distributed with
+### this work for additional information regarding copyright ownership.
+### The ASF licenses this file to You under the Apache License, Version 2.0
+### (the "License"); you may not use this file except in compliance with
+### the License. You may obtain a copy of the License at
+###
+### http://www.apache.org/licenses/LICENSE-2.0
+###
+### Unless required by applicable law or agreed to in writing, software
+### distributed under the License is distributed on an "AS IS" BASIS,
+### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+### See the License for the specific language governing permissions and
+### limitations under the License.
+###
+##
+- name: Check if DFSZKFailoverController is running
+ shell: jps | grep "DFSZKFailoverController" | grep -v grep
+ ignore_errors: yes
+ changed_when: false
+ register: zkfc_status
+- name: "start zkfc"
+ command: "nohup {{ hadoop_home }}/sbin/hadoop-daemon.sh start zkfc"
+ when: zkfc_status.rc == 1
diff --git a/ansible/roles/hadoop/templates/core-site.xml
b/ansible/roles/hadoop-ha/templates/core-site.xml
similarity index 90%
copy from ansible/roles/hadoop/templates/core-site.xml
copy to ansible/roles/hadoop-ha/templates/core-site.xml
index 409c103..dd54827 100644
--- a/ansible/roles/hadoop/templates/core-site.xml
+++ b/ansible/roles/hadoop-ha/templates/core-site.xml
@@ -23,7 +23,7 @@
<configuration>
<property>
- <name>fs.defaultFS</name>
+ <name>fs.defaultFS</name>
<value>{{ hdfs_root }}</value>
</property>
<property>
@@ -34,4 +34,8 @@
<name>dfs.domain.socket.path</name>
<value>/var/lib/hadoop-hdfs/dn_socket</value>
</property>
+ <property>
+ <name>ha.zookeeper.quorum</name>
+ <value>{{ zookeeper_connect }}</value>
+ </property>
</configuration>
diff --git a/ansible/roles/hadoop-ha/templates/hdfs-site.xml
b/ansible/roles/hadoop-ha/templates/hdfs-site.xml
new file mode 100644
index 0000000..b80b3d1
--- /dev/null
+++ b/ansible/roles/hadoop-ha/templates/hdfs-site.xml
@@ -0,0 +1,117 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<!--
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. See accompanying LICENSE file.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+ <property>
+ <name>dfs.datanode.synconclose</name>
+ <value>true</value>
+ </property>
+ <property>
+ <name>dfs.namenode.name.dir</name>
+ <value>{{ worker_data_dirs[0] }}/hadoop/name</value>
+ </property>
+ <property>
+ <name>dfs.datanode.data.dir</name>
+ <value>{% for dir in worker_data_dirs -%}
+ {{ dir }}/hadoop/data
+ {%- if not loop.last -%} , {%- endif -%}
+ {%- endfor %}</value>
+ </property>
+ <property>
+ <name>dfs.namenode.fs-limits.max-directory-items</name>
+ <value>6400000</value>
+ </property>
+ <property>
+ <name>dfs.client.read.shortcircuit</name>
+ <value>true</value>
+ </property>
+ <property>
+ <name>dfs.domain.socket.path</name>
+ <value>/var/lib/hadoop-hdfs/dn_socket</value>
+ </property>
+ <property>
+ <name>dfs.nameservices</name>
+ <value>{{ nameservice_id }}</value>
+ </property>
+ <property>
+ <name>dfs.ha.namenodes.{{ nameservice_id }}</name>
+ <value>nn1,nn2</value>
+ </property>
+ <property>
+ <name>dfs.namenode.rpc-address.{{ nameservice_id }}.nn1</name>
+ <value>{{ groups['namenode'][0] }}:8020</value>
+ </property>
+ <property>
+ <name>dfs.namenode.rpc-address.{{ nameservice_id }}.nn2</name>
+ <value>{{ groups['namenode'][1] }}:8020</value>
+ </property>
+ <property>
+ <name>dfs.namenode.http-address.{{ nameservice_id }}.nn1</name>
+ <value>{{ groups['namenode'][0] }}:50070</value>
+ </property>
+ <property>
+ <name>dfs.namenode.http-address.{{ nameservice_id }}.nn2</name>
+ <value>{{ groups['namenode'][1] }}:50070</value>
+ </property>
+ <property>
+ <name>dfs.namenode.https-address.{{ nameservice_id }}.nn1</name>
+ <value>{{ groups['namenode'][0] }}:50071</value>
+ </property>
+ <property>
+ <name>dfs.namenode.https-address.{{ nameservice_id }}.nn2</name>
+ <value>{{ groups['namenode'][1] }}:50071</value>
+ </property>
+ <property>
+ <name>dfs.namenode.shared.edits.dir</name>
+ <value>qjournal://{{ journal_quorum }}/{{ nameservice_id }}</value>
+ </property>
+ <property>
+ <name>dfs.client.failover.proxy.provider.{{ nameservice_id }}</name>
+
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
+ </property>
+ <property>
+ <name>dfs.ha.fencing.methods</name>
+ <value>shell(/usr/bin/true)</value>
+ </property>
+ <property>
+ <name>dfs.journalnode.edits.dir</name>
+ <value>{{ worker_data_dirs[0] }}/hadoop/journal</value>
+ </property>
+ <property>
+ <name>dfs.ha.automatic-failover.enabled</name>
+ <value>true</value>
+ </property>
+</configuration>
diff --git a/ansible/roles/hadoop/templates/mapred-site.xml
b/ansible/roles/hadoop-ha/templates/mapred-site.xml
similarity index 95%
copy from ansible/roles/hadoop/templates/mapred-site.xml
copy to ansible/roles/hadoop-ha/templates/mapred-site.xml
index 7dbb5f6..c6be0ce 100644
--- a/ansible/roles/hadoop/templates/mapred-site.xml
+++ b/ansible/roles/hadoop-ha/templates/mapred-site.xml
@@ -28,14 +28,14 @@
</property>
<property>
<name>mapreduce.cluster.temp.dir</name>
- <value>{% for dir in worker_data_dirs -%}
+ <value>{% for dir in worker_data_dirs -%}
{{ dir }}/hadoop/mapred/temp
{%- if not loop.last -%} , {%- endif -%}
{%- endfor %}</value>
</property>
<property>
<name>mapreduce.cluster.local.dir</name>
- <value>{% for dir in worker_data_dirs -%}
+ <value>{% for dir in worker_data_dirs -%}
{{ dir }}/hadoop/mapred/local
{%- if not loop.last -%} , {%- endif -%}
{%- endfor %}</value>
diff --git a/ansible/roles/hadoop-ha/templates/slaves
b/ansible/roles/hadoop-ha/templates/slaves
new file mode 100644
index 0000000..cd3348c
--- /dev/null
+++ b/ansible/roles/hadoop-ha/templates/slaves
@@ -0,0 +1,3 @@
+{% for host in groups['workers'] %}
+{{ host }}
+{% endfor %}
diff --git a/ansible/roles/hadoop-ha/templates/workers
b/ansible/roles/hadoop-ha/templates/workers
new file mode 100644
index 0000000..cd3348c
--- /dev/null
+++ b/ansible/roles/hadoop-ha/templates/workers
@@ -0,0 +1,3 @@
+{% for host in groups['workers'] %}
+{{ host }}
+{% endfor %}
diff --git a/ansible/roles/hadoop/templates/yarn-site.xml
b/ansible/roles/hadoop-ha/templates/yarn-site.xml
similarity index 81%
copy from ansible/roles/hadoop/templates/yarn-site.xml
copy to ansible/roles/hadoop-ha/templates/yarn-site.xml
index c0ba8d9..85033a6 100644
--- a/ansible/roles/hadoop/templates/yarn-site.xml
+++ b/ansible/roles/hadoop-ha/templates/yarn-site.xml
@@ -18,6 +18,19 @@
limitations under the License.
-->
+<!--
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. See accompanying LICENSE file.
+-->
<!-- Put site-specific property overrides in this file. -->
@@ -28,7 +41,7 @@
</property>
<property>
<name>yarn.nodemanager.local-dirs</name>
- <value>{% for dir in worker_data_dirs -%}
+ <value>{% for dir in worker_data_dirs -%}
{{ dir }}/hadoop/yarn/local
{%- if not loop.last -%} , {%- endif -%}
{%- endfor %}</value>
diff --git a/ansible/roles/hadoop/templates/core-site.xml
b/ansible/roles/hadoop/templates/core-site.xml
index 409c103..56232aa 100644
--- a/ansible/roles/hadoop/templates/core-site.xml
+++ b/ansible/roles/hadoop/templates/core-site.xml
@@ -21,6 +21,8 @@
<!-- Put site-specific property overrides in this file. -->
+<!-- when editing this file please consider if changes are also needed in
roles/hadoop-ha/templates -->
+
<configuration>
<property>
<name>fs.defaultFS</name>
diff --git a/ansible/roles/hadoop/templates/hdfs-site.xml
b/ansible/roles/hadoop/templates/hdfs-site.xml
index 684d4c4..557d7db 100644
--- a/ansible/roles/hadoop/templates/hdfs-site.xml
+++ b/ansible/roles/hadoop/templates/hdfs-site.xml
@@ -21,6 +21,8 @@
<!-- Put site-specific property overrides in this file. -->
+<!-- when editing this file please consider if changes are also needed in
roles/hadoop-ha/templates -->
+
<configuration>
<property>
<name>dfs.datanode.synconclose</name>
diff --git a/ansible/roles/hadoop/templates/mapred-site.xml
b/ansible/roles/hadoop/templates/mapred-site.xml
index 7dbb5f6..a95eb77 100644
--- a/ansible/roles/hadoop/templates/mapred-site.xml
+++ b/ansible/roles/hadoop/templates/mapred-site.xml
@@ -21,6 +21,8 @@
<!-- Put site-specific property overrides in this file. -->
+<!-- when editing this file please consider if changes are also needed in
roles/hadoop-ha/templates -->
+
<configuration>
<property>
<name>mapreduce.framework.name</name>
diff --git a/ansible/roles/hadoop/templates/yarn-site.xml
b/ansible/roles/hadoop/templates/yarn-site.xml
index c0ba8d9..ac62174 100644
--- a/ansible/roles/hadoop/templates/yarn-site.xml
+++ b/ansible/roles/hadoop/templates/yarn-site.xml
@@ -21,6 +21,8 @@
<!-- Put site-specific property overrides in this file. -->
+<!-- when editing this file please consider if changes are also needed in
roles/hadoop-ha/templates -->
+
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
diff --git a/conf/muchos.props.example b/conf/muchos.props.example
index 489c26b..9a9ca81 100644
--- a/conf/muchos.props.example
+++ b/conf/muchos.props.example
@@ -50,6 +50,10 @@ install_hub = True
java_package=java-1.8.0-openjdk-devel
# The package to use for java 11
# java_package=java-11-openjdk-devel
+# Please read the High-Availability section of the README before setting this
to 'True'
+hdfs_ha = False
+# Specify a logical name for the cluster: a single word with no special
characters. Required to support HDFS HA.
+nameservice_id = muchoshacluster
[ec2]
# AWS machine image to use. The default below is for a CentOS 7 image (in
us-east-1).
@@ -107,7 +111,7 @@ subnet = subnet1
subnet_cidr = 10.1.0.0/16
# Size of the cluster to provision.
# A virtual machine scale set (VMSS) with these many VMs will be created.
-# The minimum allowed size for this is 3 nodes.
+# The minimum allowed size for this is 3 nodes for a non-HA setup and 4 nodes
for an HA setup
numnodes = 8
# The size of each virtual machine. See the following link for other sizes:
# https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general
diff --git a/lib/muchos/azure.py b/lib/muchos/azure.py
index c9a16b8..c5d1cb6 100644
--- a/lib/muchos/azure.py
+++ b/lib/muchos/azure.py
@@ -35,6 +35,7 @@ class VmssCluster(ExistingCluster):
config = self.config
azure_config = dict(config.items("azure"))
azure_config["admin_username"] = config.get("general", "cluster_user")
+ azure_config["hdfs_ha"] = config.get("general", "hdfs_ha")
azure_config["vmss_name"] = config.cluster_name
azure_config["deploy_path"] = config.deploy_path
azure_config = {k: VmssCluster._parse_config_value(v)
diff --git a/lib/muchos/config.py b/lib/muchos/config.py
index 060a1bf..32bf4ad 100644
--- a/lib/muchos/config.py
+++ b/lib/muchos/config.py
@@ -22,9 +22,9 @@ import os
import json
import glob
-SERVICES = ['zookeeper', 'namenode', 'resourcemanager', 'accumulomaster',
'mesosmaster', 'worker', 'fluo', 'fluo_yarn', 'metrics', 'spark', 'client',
'swarmmanager']
+SERVICES = ['zookeeper', 'namenode', 'resourcemanager', 'accumulomaster',
'mesosmaster', 'worker', 'fluo', 'fluo_yarn', 'metrics', 'spark', 'client',
'swarmmanager', 'journalnode', 'zkfc']
-OPTIONAL_SERVICES = ['fluo', 'fluo_yarn', 'metrics', 'mesosmaster', 'spark',
'client', 'swarmmanager']
+OPTIONAL_SERVICES = ['fluo', 'fluo_yarn', 'metrics', 'mesosmaster', 'spark',
'client', 'swarmmanager', 'journalnode', 'zkfc']
class DeployConfig(ConfigParser):
@@ -449,11 +449,14 @@ HOST_VAR_DEFAULTS = {
'hadoop_tarball': 'hadoop-{{ hadoop_version }}.tar.gz',
'hadoop_version': None,
'hadoop_major_version': '"{{ hadoop_version.split(\'.\')[0] }}"',
- 'hdfs_root': 'hdfs://{{ groups[\'namenode\'][0] }}:8020',
+ 'hdfs_root': "{% if hdfs_ha %}hdfs://{{ nameservice_id }}{% else %}hdfs://{{
groups[\'namenode\'][0] }}:8020{% endif %}",
+ 'hdfs_ha': None,
+ 'nameservice_id': None,
'install_dir': None,
'install_hub': None,
'java_home': '"/usr/lib/jvm/java"',
'java_package': '"java-1.8.0-openjdk-devel"',
+ 'journal_quorum': "{% for host in groups['journalnode'] %}{{ host }}:8485{%
if not loop.last %};{% endif %}{% endfor %}",
'maven_home': '"{{ install_dir }}/apache-maven-{{ maven_version }}"',
'maven_tarball': 'apache-maven-{{ maven_version }}-bin.tar.gz',
'maven_version': '3.6.1',
@@ -463,7 +466,7 @@ HOST_VAR_DEFAULTS = {
'tarballs_dir': '"{{ user_home }}/tarballs"',
'user_home': None,
'worker_data_dirs': None,
- 'zookeeper_connect': '"{{ groups[\'zookeepers\']|join(\',\') }}"',
+ 'zookeeper_connect': "{% for host in groups['zookeepers'] %}{{ host
}}:2181{% if not loop.last %},{% endif %}{% endfor %}",
'zookeeper_client_port': '"2181"',
'zookeeper_home': '"{{ install_dir }}/zookeeper-{{ zookeeper_version }}"',
'zookeeper_tarball': 'zookeeper-{{ zookeeper_version }}.tar.gz',
diff --git a/lib/muchos/existing.py b/lib/muchos/existing.py
index 5253d6d..b0edfe3 100644
--- a/lib/muchos/existing.py
+++ b/lib/muchos/existing.py
@@ -72,7 +72,10 @@ class ExistingCluster:
print("- import_playbook: common.yml", file=site_file)
print("- import_playbook: zookeeper.yml", file=site_file)
- print("- import_playbook: hadoop.yml", file=site_file)
+ if config.get("general","hdfs_ha") == 'True':
+ print("- import_playbook: hadoop-ha.yml", file=site_file)
+ else:
+ print("- import_playbook: hadoop.yml", file=site_file)
if config.has_service("spark"):
print("- import_playbook: spark.yml", file=site_file)
@@ -92,8 +95,18 @@ class ExistingCluster:
ansible_conf = join(config.deploy_path, "ansible/conf")
with open(join(ansible_conf, "hosts"), 'w') as hosts_file:
print("[proxy]\n{0}".format(config.proxy_hostname()),
file=hosts_file)
-
print("\n[accumulomaster]\n{0}".format(config.get_service_hostnames("accumulomaster")[0]),
file=hosts_file)
-
print("\n[namenode]\n{0}".format(config.get_service_hostnames("namenode")[0]),
file=hosts_file)
+ print("\n[accumulomaster]", file=hosts_file)
+ for (index, accu_host) in
enumerate(config.get_service_hostnames("accumulomaster"), start=1):
+ print("{0}".format(accu_host,index), file=hosts_file)
+ print("\n[namenode]",file=hosts_file)
+ for (index, nn_host) in
enumerate(config.get_service_hostnames("namenode"), start=1):
+ print("{0}".format(nn_host,index), file=hosts_file)
+ print("\n[journalnode]",file=hosts_file)
+ for (index, jn_host) in
enumerate(config.get_service_hostnames("journalnode"), start=1):
+ print("{0}".format(jn_host,index), file=hosts_file)
+ print("\n[zkfc]",file=hosts_file)
+ for (index, zkfc_host) in
enumerate(config.get_service_hostnames("zkfc"), start=1):
+ print("{0}".format(zkfc_host,index), file=hosts_file)
print("\n[resourcemanager]\n{0}".format(config.get_service_hostnames("resourcemanager")[0]),
file=hosts_file)
if config.has_service("spark"):