This is an automated email from the ASF dual-hosted git repository.

arvindsh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/fluo-muchos.git


The following commit(s) were added to refs/heads/master by this push:
     new dd898d7  Add optional support for Azure ADLS Gen2 (#304)
dd898d7 is described below

commit dd898d79344a1df1f3990697717de6944b7a06a8
Author: Shan <[email protected]>
AuthorDate: Wed Jan 8 17:00:09 2020 -0500

    Add optional support for Azure ADLS Gen2 (#304)
    
    Add optional support for Azure ADLS Gen2
    - Muchos launch can now create Azure ADLS Gen2 accounts
    - Muchos setup can use Azure ADLS Gen2 storage accounts for Accumulo
    - Volume chooser is configured to use local HDFS for WALs and ADLS Gen2 for 
tables
---
 README.md                                          |   3 +
 ansible/accumulo.yml                               |  10 +
 .../handlers/init-adlsgen2.yml}                    |   6 +-
 .../main.yml => accumulo/tasks/add-adlsgen2.yml}   |  10 +-
 .../main.yml => accumulo/tasks/init-adlsgen2.yml}  |  11 +-
 ansible/roles/accumulo/templates/accumulo-env.sh   |   7 +
 .../roles/accumulo/templates/accumulo.properties   |   6 +
 ansible/roles/azure/tasks/create_adlsgen2.yml      | 235 +++++++++++++++++++++
 ansible/roles/azure/tasks/main.yml                 |   2 +
 ansible/roles/hadoop-ha/tasks/main.yml             |   8 +
 ansible/roles/hadoop-ha/templates/core-site.xml    |  30 +++
 ansible/roles/hadoop-ha/templates/mapred-site.xml  |   6 +
 ansible/roles/hadoop-ha/templates/yarn-site.xml    |   6 +
 ansible/roles/hadoop/tasks/main.yml                |   8 +
 ansible/roles/hadoop/templates/core-site.xml       |  30 +++
 ansible/roles/hadoop/templates/mapred-site.xml     |   6 +
 ansible/roles/hadoop/templates/yarn-site.xml       |   6 +
 conf/muchos.props.example                          |  25 +++
 lib/muchos/config/azure.py                         |  27 ++-
 19 files changed, 427 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 4339ae8..4dd1fcc 100644
--- a/README.md
+++ b/README.md
@@ -164,6 +164,9 @@ Under the `azure` section, edit following values as per 
your configuration
 * `numnodes` to change the cluster size in terms of number of nodes deployed
 * `vm_sku` to specify the VM size to use. You can choose from the
   [available VM 
sizes](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general).
+* `use_adlsg2` to use Azure Data Lake Storage (ADLS) Gen2 as the datastore for 
Accumulo
+  [ADLS Gen2 
Doc](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction).
+  [Setup ADLS Gen2 as datastore for 
Accumulo](https://accumulo.apache.org/blog/2019/10/15/accumulo-adlsgen2-notes.html).
 
 Within Azure the `nodes` section is auto populated with the hostnames and 
their default roles.
 
diff --git a/ansible/accumulo.yml b/ansible/accumulo.yml
index 2af9d67..2352c85 100644
--- a/ansible/accumulo.yml
+++ b/ansible/accumulo.yml
@@ -27,6 +27,16 @@
     - import_tasks: roles/accumulo/tasks/init-accumulo.yml
   handlers:
     - import_tasks: roles/accumulo/handlers/init-accumulo.yml
+- hosts: all:!{{ azure_proxy_host }}
+  tasks:
+    - import_tasks: roles/accumulo/tasks/add-adlsgen2.yml
+      when: accumulo_major_version == '2' and use_adlsg2 == True
+- hosts: accumulomaster[0]
+  tasks:
+    - import_tasks: roles/accumulo/tasks/init-adlsgen2.yml
+      when: accumulo_major_version == '2' and use_adlsg2 == True
+  handlers:
+    - import_tasks: roles/accumulo/handlers/init-adlsgen2.yml
 - hosts: accumulo
   tasks:
     - name: "start accumulo 1.0"
diff --git a/ansible/roles/azure/tasks/main.yml 
b/ansible/roles/accumulo/handlers/init-adlsgen2.yml
similarity index 86%
copy from ansible/roles/azure/tasks/main.yml
copy to ansible/roles/accumulo/handlers/init-adlsgen2.yml
index 6ec80d7..06f67b5 100644
--- a/ansible/roles/azure/tasks/main.yml
+++ b/ansible/roles/accumulo/handlers/init-adlsgen2.yml
@@ -1,5 +1,3 @@
----
-
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
@@ -17,5 +15,5 @@
 # limitations under the License.
 #
 
-# tasks file for azure
-- import_tasks: create_vmss.yml
+- name: "Initialize Apache Accumulo on ADLS Gen2 volume"
+  command: "{{ accumulo_home }}/bin/accumulo init --add-volumes"
diff --git a/ansible/roles/azure/tasks/main.yml 
b/ansible/roles/accumulo/tasks/add-adlsgen2.yml
similarity index 78%
copy from ansible/roles/azure/tasks/main.yml
copy to ansible/roles/accumulo/tasks/add-adlsgen2.yml
index 6ec80d7..8056f2d 100644
--- a/ansible/roles/azure/tasks/main.yml
+++ b/ansible/roles/accumulo/tasks/add-adlsgen2.yml
@@ -1,5 +1,3 @@
----
-
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-# tasks file for azure
-- import_tasks: create_vmss.yml
+- name: Add ADLS Gen2 volume
+  lineinfile:
+    path: "{{ accumulo_home }}/conf/accumulo.properties"
+    regexp: '^instance.volumes='
+    line: "instance.volumes={{ hdfs_root }}/accumulo,{{ 
instance_volumes_preferred }}"
diff --git a/ansible/roles/azure/tasks/main.yml 
b/ansible/roles/accumulo/tasks/init-adlsgen2.yml
similarity index 67%
copy from ansible/roles/azure/tasks/main.yml
copy to ansible/roles/accumulo/tasks/init-adlsgen2.yml
index 6ec80d7..505b23d 100644
--- a/ansible/roles/azure/tasks/main.yml
+++ b/ansible/roles/accumulo/tasks/init-adlsgen2.yml
@@ -1,5 +1,3 @@
----
-
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-# tasks file for azure
-- import_tasks: create_vmss.yml
+- name: "determine if accumulo needs to be initialized on adlsgen2"
+  command: "{{ hadoop_home }}/bin/hdfs dfs -stat {{ 
instance_volumes_preferred[0] }}"
+  register: adlsgen2_stat
+  changed_when: adlsgen2_stat.rc != 0
+  failed_when: adlsgen2_stat.rc != 0 and 'No such file or directory' not in 
adlsgen2_stat.stderr
+  notify: Initialize Apache Accumulo on ADLS Gen2 volume
diff --git a/ansible/roles/accumulo/templates/accumulo-env.sh 
b/ansible/roles/accumulo/templates/accumulo-env.sh
index a6a1bc6..083007b 100755
--- a/ansible/roles/accumulo/templates/accumulo-env.sh
+++ b/ansible/roles/accumulo/templates/accumulo-env.sh
@@ -41,6 +41,10 @@ export HADOOP_HOME={{ hadoop_home }}
 export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"
 
 
CLASSPATH="${conf}:${lib}/*:${HADOOP_CONF_DIR}:${ZOOKEEPER_HOME}/*:${HADOOP_HOME}/share/hadoop/client/*"
+{% if use_adlsg2 == True %}
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/*"
+{% endif %}
 export CLASSPATH
 
 JAVA_OPTS=("${ACCUMULO_JAVA_OPTS[@]}"
@@ -50,6 +54,9 @@ JAVA_OPTS=("${ACCUMULO_JAVA_OPTS[@]}"
   '-XX:OnOutOfMemoryError=kill -9 %p'
   '-XX:-OmitStackTraceInFastThrow'
   '-Djava.net.preferIPv4Stack=true'
+{% if use_adlsg2 == True %}
+  '-Dorg.wildfly.openssl.path=/usr/lib64'
+{% endif %}
   "-Daccumulo.native.lib.path=${lib}/native")
 
 case "$cmd" in
diff --git a/ansible/roles/accumulo/templates/accumulo.properties 
b/ansible/roles/accumulo/templates/accumulo.properties
index 895cc99..eac3ddf 100644
--- a/ansible/roles/accumulo/templates/accumulo.properties
+++ b/ansible/roles/accumulo/templates/accumulo.properties
@@ -42,3 +42,9 @@ tserver.server.threads.minimum=64
 
 ## The maximum size for each write-ahead log
 tserver.walog.max.size=512M
+
+{% if use_adlsg2 == True %}
+general.volume.chooser=org.apache.accumulo.server.fs.PreferredVolumeChooser
+general.custom.volume.preferred.default={{ instance_volumes_preferred }}
+general.custom.volume.preferred.logger={{ hdfs_root }}/accumulo
+{% endif %}
diff --git a/ansible/roles/azure/tasks/create_adlsgen2.yml 
b/ansible/roles/azure/tasks/create_adlsgen2.yml
new file mode 100644
index 0000000..cd674dd
--- /dev/null
+++ b/ansible/roles/azure/tasks/create_adlsgen2.yml
@@ -0,0 +1,235 @@
+---
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# These Ansible tasks only run on the client machine where Muchos runs
+# At a high level, the various sections in this file do the following:
+# 1. Create an Azure ADLS Gen2 storage account.
+# 2. Create User Assigned Identity.
+# 3. Assign roles to storage accounts.
+# 4. Create filesystem/container in storage accounts.
+# 5. Update azure_tenant_id, azure_client_id, principal_id and instance_volumes_preferred in 
muchos.props.
+# 6. Assign User Assigned Identity to VMSS.
+
+- name: Generate MD5 checksum based on resource_group name, vmss_name and 
cluster name
+  shell: echo -n {{ resource_group + vmss_name + location  }}|md5sum|tr -cd 
"[:alnum:]"|cut -c 1-16|tr '[:upper:]' '[:lower:]'
+  register: StorageAccountMD5
+
+- name: Generate random names for storage account names
+  set_fact:
+   StorageAccountName: "{{ StorageAccountMD5.stdout + 
99|random(seed=resource_group)|string + 99|random(seed=vmss_name)|string + 
9|random(seed=location)|string }}"
+
+- name: Initialize instance variables
+  set_fact:
+    InstanceVolumesAuto: []
+    InstanceVolumesManual: []
+
+- name: Validate instance_volumes_input 
+  fail: msg="Variable instance_volumes_input incorrectly specified, Both 
Manual and Auto cannot be specified at same time"
+  when: instance_volumes_input.split('|')[0].split(',') != [''] and 
instance_volumes_input.split('|')[1].split(',') != ['']
+
+- name: Assign manual or autogenerated volumes
+  set_fact:
+    InstanceVolumesTemp: "{{ 
instance_volumes_input.split('|')[0].split(',')|list if 
instance_volumes_input.split('|')[0].split(',') != [''] else 
instance_volumes_input.split('|')[1].split(',')|list }}" 
+
+- name: Retrieve sequence end number to get the number of storage accounts 
+  set_fact:
+     InstanceVolumesEndSequence: "{{ '1' if  
instance_volumes_input.split('|')[0].split(',') == ['']  else 
InstanceVolumesTemp[0]|int }}"
+
+- name: Generate names for Storage Accounts
+  set_fact:
+     InstanceVolumesAuto: "{{ InstanceVolumesAuto + 
['abfss://'+'accumulodata'+'@'+StorageAccountName+item+'.'+InstanceVolumesTemp[1]+'/accumulo']
 }}"
+  with_sequence: start=1 end={{ InstanceVolumesEndSequence|int }}
+  when: InstanceVolumesTemp[0]|int != 0
+
+- name: Retrieve ABFSS values when specified manually 
+  set_fact:
+     InstanceVolumesManual: "{{ InstanceVolumesManual +  [ item ] }}"
+  loop:
+    "{{ InstanceVolumesTemp }}"
+  when: item.split('://')[0] == 'abfss' and  
instance_volumes_input.split('|')[0].split(',') ==  ['']
+
+# This is the final list of instance volumes
+- name: Assign variables for autogeneration or manual for storage account 
creation
+  set_fact:
+     InstanceVolumes: "{{ InstanceVolumesManual if  
instance_volumes_input.split('|')[0].split(',') ==  [''] else 
InstanceVolumesAuto }}" 
+
+- name: Update instance_volumes_preferred  in muchos.props
+  lineinfile:
+    path: "{{ deploy_path }}/conf/muchos.props"
+    regexp: 
'^instance_volumes_preferred\s*=\s*|^[#]instance_volumes_preferred\s*=\s*'
+    line: "instance_volumes_preferred = {{ InstanceVolumes|join(',') }}"
+
+# Not registering variable because  storage values are not visible immediately
+- name: Create ADLS Gen2 storage acount using REST API
+  azure_rm_resource:
+    resource_group: "{{ resource_group }}"
+    provider: Storage
+    resource_type: storageAccounts
+    resource_name: "{{ item.split('@')[1].split('.')[0] }}"
+    api_version: '2019-04-01'
+    idempotency: yes
+    state: present
+    body:
+      sku:
+         name: "{{ adls_storage_type }}"
+      kind: StorageV2
+      properties:
+         isHnsEnabled: yes
+      location:  "{{ location }}"
+  loop:
+      "{{ InstanceVolumes }}"
+
+# Creating User Assigned identity with vmss_name suffixed by ua-msi if not 
specified in muchos.props
+# Not registering variable because user identity values are not visible 
immediately
+- name: Create User Assigned Identity 
+  azure_rm_resource:
+    resource_group: "{{ resource_group }}"
+    provider: ManagedIdentity
+    resource_type: userAssignedIdentities
+    resource_name: "{{ user_assigned_identity if user_assigned_identity !='' 
else vmss_name + '-ua-msi' }}"
+    api_version: '2018-11-30'
+    idempotency: yes
+    state: present
+    body:
+      location:  "{{ location }}"
+
+# Retrieving  facts about User Assigned Identity 
+- name: Get facts for User Assigned Identity
+  azure_rm_resource_facts:
+    resource_group: "{{ resource_group }}"
+    provider: ManagedIdentity
+    resource_type: userAssignedIdentities
+    resource_name: "{{ user_assigned_identity if user_assigned_identity !='' 
else vmss_name + '-ua-msi' }}"
+    api_version: '2018-11-30'
+  register: UserAssignedIdentityInfo
+  retries: 20
+  delay: 15
+  until:  
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('')
 is defined
+
+- name: Update principal_id in muchos.props
+  lineinfile:
+    path: "{{ deploy_path }}/conf/muchos.props"
+    regexp: '^principal_id\s*=\s*|^[#]principal_id\s*=\s*'
+    line: "principal_id = {{ 
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('')
 }}"
+
+# This will be used to assign the MSI for VMSS
+- name: Format User Assigned Identity for API
+  set_fact:
+    UserAssignedIdentityArr: "{{ 
UserAssignedIdentityInfo.response|default({})|map(attribute='id')|map('regex_replace','^(.*)$','{\"\\1\":{}}')|list}}"
+
+# Retrieve facts about role assignment
+- name: Get role definition id for "Storage Blob Data Contributor"
+  azure_rm_resource_facts:
+    resource_group: "{{ resource_group }}"
+    provider: Authorization
+    resource_type:  roleDefinitions
+    resource_name:  ba92f5b4-2d11-453d-a403-e96b0029c9fe
+    api_version: '2015-07-01'
+  register: RoleDefinitionInfo
+
+# Retrieve storage account information.
+- name: Check if the storage accounts is visible
+  azure_rm_storageaccount_facts:
+        resource_group: "{{ resource_group }}"
+        name:   "{{ item.split('@')[1].split('.')[0] }}"
+  register: StorageAccountsInfo
+  retries: 20
+  delay: 15
+  until:  
StorageAccountsInfo.storageaccounts|sum(start=[])|map(attribute='id')|join('') 
is defined
+  loop:
+     "{{ InstanceVolumes }}"
+
+# Retrieve the ids of the storage accounts created -- used for role assignments
+- name: Get the id of storage accounts created
+  set_fact:
+    StorageAccountsId: 
"{{StorageAccountsInfo.results|map(attribute='ansible_facts')|map(attribute='azure_storageaccounts')|sum(start=[])|map(attribute='id')|list|unique
 }}"
+  
+# Adding this module since role assignment fails if it already exists.
+- name:  Get facts about role assignment
+  azure_rm_roleassignment_facts:
+     scope: "{{ item }}"
+     assignee: "{{ 
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|list|join('')
 }}"
+     role_definition_id: "{{ 
RoleDefinitionInfo.response|map(attribute='id')|list|join('') }}"
+  register: RoleAssignmentResults
+  retries: 20
+  delay: 15
+  until:  
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('')
 is  defined and  RoleDefinitionInfo.response|map(attribute='id')|join('') is 
defined
+  loop:
+    "{{ StorageAccountsId }}"
+
+- name: Set fact for getting storage accounts that have assigned roles
+  set_fact:
+    StorageAccountRoles: "{{ item|map(attribute='scope')|list|unique }}"
+  no_log: True
+  loop:
+      "{{RoleAssignmentResults.results|map(attribute='roleassignments')|list 
}}"
+
+# This retry logic is needed due to race condition between storage account 
create complete and role assignment
+- name: Create a role assignment
+  azure_rm_roleassignment:
+     scope: "{{ item }}"
+     assignee_object_id: "{{ 
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|list|join('')
 }}"
+     role_definition_id: "{{ 
RoleDefinitionInfo.response|map(attribute='id')|list|join('') }}"
+     state: present
+  retries: 30 
+  delay: 15
+  register: roleassignresult
+  until: roleassignresult is succeeded
+  loop:
+     "{{ StorageAccountsId }}"
+  when: item  not in StorageAccountRoles
+
+# This retry logic is needed due to race condition between storage account 
creation and creating filesystem
+- name: Create container/Filesystem on ADLS Gen2 
+  azure_rm_storageblob:
+    resource_group: "{{ resource_group }}" 
+    storage_account_name:  "{{ item.split('@')[1].split('.')[0] }}"
+    container: "{{ item.split('@')[0].split('://')[1] }}"
+  retries: 30
+  delay: 15
+  register: createfsresult            
+  until: createfsresult is succeeded and (createfsresult.changed == False or 
(createfsresult.changed == True and createfsresult.container|length > 0))
+  loop:
+    "{{ InstanceVolumes }}"
+
+# Retrieve tenantId  for core-site.xml 
+- name: Update tenantId in muchos.props
+  lineinfile:
+    path: "{{ deploy_path }}/conf/muchos.props"
+    regexp: '^azure_tenant_id\s*=\s*|^[#]azure_tenant_id\s*=\s*'
+    line: "azure_tenant_id = {{ 
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='tenantId')|list|join('')
 }}"
+  
+# Retrieve clientId  for core-site.xml
+- name: Update clientid in muchos.props
+  lineinfile:
+    path: "{{ deploy_path }}/conf/muchos.props"
+    regexp: '^azure_client_id\s*=\s*|^[#]azure_client_id\s*=\s*'
+    line: "azure_client_id = {{ 
UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='clientId')|list|join('')
 }}"
+
+- name: Assign User Assigned Identity to VMSS
+  azure_rm_resource:
+    resource_group: "{{ resource_group }}"
+    provider: Compute
+    resource_type: virtualMachineScaleSets
+    resource_name: "{{ vmss_name }}"
+    api_version: '2019-03-01'
+    body:
+     location: "{{ location }}"
+     identity:
+       type: UserAssigned
+       userAssignedIdentities: "{{ UserAssignedIdentityArr|join('') }}"
diff --git a/ansible/roles/azure/tasks/main.yml 
b/ansible/roles/azure/tasks/main.yml
index 6ec80d7..a846779 100644
--- a/ansible/roles/azure/tasks/main.yml
+++ b/ansible/roles/azure/tasks/main.yml
@@ -19,3 +19,5 @@
 
 # tasks file for azure
 - import_tasks: create_vmss.yml
+- import_tasks: create_adlsgen2.yml
+  when: use_adlsg2 == True
diff --git a/ansible/roles/hadoop-ha/tasks/main.yml 
b/ansible/roles/hadoop-ha/tasks/main.yml
index 7f456c8..dd92ae1 100644
--- a/ansible/roles/hadoop-ha/tasks/main.yml
+++ b/ansible/roles/hadoop-ha/tasks/main.yml
@@ -54,3 +54,11 @@
     replace: "export HADOOP_LOG_DIR={{ worker_data_dirs[0] }}/logs/hadoop"
 - name: "Create hadoop log dir"
   file: path={{ worker_data_dirs[0] }}/logs/hadoop state=directory
+- name: Insert HADOOP_OPTIONAL_TOOLS & HADOOP_OPTS in hadoop-env.sh
+  blockinfile:
+    path: "{{ hadoop_home }}/etc/hadoop/hadoop-env.sh"
+    insertafter: EOF
+    block: |
+       export HADOOP_OPTIONAL_TOOLS=hadoop-azure
+       export HADOOP_OPTS="-Dorg.wildfly.openssl.path=/usr/lib64 
${HADOOP_OPTS}"
+  when: hadoop_major_version == '3' and use_adlsg2 == True
diff --git a/ansible/roles/hadoop-ha/templates/core-site.xml 
b/ansible/roles/hadoop-ha/templates/core-site.xml
index dd54827..d717c5c 100644
--- a/ansible/roles/hadoop-ha/templates/core-site.xml
+++ b/ansible/roles/hadoop-ha/templates/core-site.xml
@@ -38,4 +38,34 @@
     <name>ha.zookeeper.quorum</name>
     <value>{{ zookeeper_connect }}</value>
   </property>
+{% if use_adlsg2 == True %}
+  <property>
+    <name>fs.azure.account.auth.type</name>
+    <value>OAuth</value>
+  </property>
+  <property>
+    <name>fs.azure.account.oauth.provider.type</name>
+    <value>org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider</value>
+  </property>
+  <property>
+    <name>fs.azure.account.oauth2.msi.tenant</name>
+    <value>{{ azure_tenant_id}}</value>
+  </property>
+  <property>
+    <name>fs.azure.account.oauth2.client.id</name>
+    <value>{{ azure_client_id }}</value>
+  </property>
+  <property>
+    <name>fs.azure.use.upn</name>
+    <value>true</value>
+  </property>
+  <property>
+    
<name>fs.azure.identity.transformer.service.principal.substitution.list</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>fs.azure.identity.transformer.service.principal.id</name>
+    <value>{{ principal_id }}</value>
+  </property>
+{% endif %}
 </configuration>
diff --git a/ansible/roles/hadoop-ha/templates/mapred-site.xml 
b/ansible/roles/hadoop-ha/templates/mapred-site.xml
index c6be0ce..c3def16 100644
--- a/ansible/roles/hadoop-ha/templates/mapred-site.xml
+++ b/ansible/roles/hadoop-ha/templates/mapred-site.xml
@@ -54,4 +54,10 @@
     <value>HADOOP_MAPRED_HOME={{ hadoop_home }}</value>
   </property>
 {% endif %}
+{% if use_adlsg2 == True %}
+  <property>
+    <name>mapreduce.application.classpath</name>
+    
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/client/*</value>
+  </property>
+{% endif %}
 </configuration>
diff --git a/ansible/roles/hadoop-ha/templates/yarn-site.xml 
b/ansible/roles/hadoop-ha/templates/yarn-site.xml
index 85033a6..eb45896 100644
--- a/ansible/roles/hadoop-ha/templates/yarn-site.xml
+++ b/ansible/roles/hadoop-ha/templates/yarn-site.xml
@@ -93,4 +93,10 @@
     <name>twill.java.reserved.memory.mb</name>
     <value>{{ twill_reserve_mem_mb }}</value>
   </property>
+  {% if use_adlsg2 == True %}
+  <property>
+    <name>yarn.application.classpath</name>
+    
<value>${HADOOP_HOME}/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/lib/*,${HADOOP_HOME}/share/hadoop/common/lib/*,${HADOOP_HOME}/share/hadoop/yarn/*,${HADOOP_HOME}/share/hadoop/yarn/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/*,${HADOOP_HOME}/share/hadoop/common/*,${HADOOP_HOME}/share/hadoop/mapreduce/*,${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,${HADOOP_HOME}/share/hadoop/client/*</value>
+  </property>
+  {% endif %}
 </configuration>
diff --git a/ansible/roles/hadoop/tasks/main.yml 
b/ansible/roles/hadoop/tasks/main.yml
index d0219b3..a6733a9 100644
--- a/ansible/roles/hadoop/tasks/main.yml
+++ b/ansible/roles/hadoop/tasks/main.yml
@@ -55,3 +55,11 @@
 - name: "Create hadoop log dir"
   file: path={{ worker_data_dirs[0] }}/logs/hadoop state=directory
 
+- name: Insert HADOOP_OPTIONAL_TOOLS & HADOOP_OPTS in hadoop-env.sh
+  blockinfile:
+    path: "{{ hadoop_home }}/etc/hadoop/hadoop-env.sh"
+    insertafter: EOF
+    block: |
+       export HADOOP_OPTIONAL_TOOLS=hadoop-azure
+       export HADOOP_OPTS="-Dorg.wildfly.openssl.path=/usr/lib64 
${HADOOP_OPTS}"
+  when: hadoop_major_version == '3' and use_adlsg2 == True
diff --git a/ansible/roles/hadoop/templates/core-site.xml 
b/ansible/roles/hadoop/templates/core-site.xml
index 56232aa..c5f1597 100644
--- a/ansible/roles/hadoop/templates/core-site.xml
+++ b/ansible/roles/hadoop/templates/core-site.xml
@@ -36,4 +36,34 @@
     <name>dfs.domain.socket.path</name>
     <value>/var/lib/hadoop-hdfs/dn_socket</value>
   </property>
+{% if use_adlsg2 == True %}
+  <property>
+    <name>fs.azure.account.auth.type</name>
+    <value>OAuth</value>
+  </property>
+  <property>
+    <name>fs.azure.account.oauth.provider.type</name>
+    <value>org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider</value>
+  </property>
+  <property>
+    <name>fs.azure.account.oauth2.msi.tenant</name>
+    <value>{{ azure_tenant_id}}</value>
+  </property>
+  <property>
+    <name>fs.azure.account.oauth2.client.id</name>
+    <value>{{ azure_client_id }}</value>
+  </property>
+  <property>
+    <name>fs.azure.use.upn</name>
+    <value>true</value>
+  </property>
+  <property>
+    
<name>fs.azure.identity.transformer.service.principal.substitution.list</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>fs.azure.identity.transformer.service.principal.id</name>
+    <value>{{ principal_id }}</value>
+  </property>
+{% endif %}
 </configuration>
diff --git a/ansible/roles/hadoop/templates/mapred-site.xml 
b/ansible/roles/hadoop/templates/mapred-site.xml
index a95eb77..7ecf751 100644
--- a/ansible/roles/hadoop/templates/mapred-site.xml
+++ b/ansible/roles/hadoop/templates/mapred-site.xml
@@ -56,4 +56,10 @@
     <value>HADOOP_MAPRED_HOME={{ hadoop_home }}</value>
   </property>
 {% endif %}
+{% if use_adlsg2 == True %}
+  <property>
+    <name>mapreduce.application.classpath</name>
+    
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/client/*</value>
+  </property>
+{% endif %}
 </configuration>
diff --git a/ansible/roles/hadoop/templates/yarn-site.xml 
b/ansible/roles/hadoop/templates/yarn-site.xml
index ac62174..847f98b 100644
--- a/ansible/roles/hadoop/templates/yarn-site.xml
+++ b/ansible/roles/hadoop/templates/yarn-site.xml
@@ -82,4 +82,10 @@
     <name>twill.java.reserved.memory.mb</name>
     <value>{{ twill_reserve_mem_mb }}</value>
   </property>
+  {% if use_adlsg2 == True %}
+  <property>
+    <name>yarn.application.classpath</name>
+    
<value>${HADOOP_HOME}/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/lib/*,${HADOOP_HOME}/share/hadoop/common/lib/*,${HADOOP_HOME}/share/hadoop/yarn/*,${HADOOP_HOME}/share/hadoop/yarn/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/*,${HADOOP_HOME}/share/hadoop/common/*,${HADOOP_HOME}/share/hadoop/mapreduce/*,${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,${HADOOP_HOME}/share/hadoop/client/*</value>
+  </property>
+  {% endif %}
 </configuration>
diff --git a/conf/muchos.props.example b/conf/muchos.props.example
index b34a437..41cb5d5 100644
--- a/conf/muchos.props.example
+++ b/conf/muchos.props.example
@@ -129,6 +129,31 @@ metrics_drive_root = var-data
 # Optional proxy VM. If not set, the first node of the cluster will be 
selected as the proxy.
 azure_proxy_host =
 location = westus2
+# Enable ADLS Gen2 storage configuration. Muchos parameters 
instance_volumes_input, instance_volumes_preferred & adls_storage_type are not 
required if use_adlsg2 is false.
+use_adlsg2 = False
+# Storage accounts can be auto generated or manually specified. "|" is used as 
separator between manual and auto generated storage account names and must be 
specified
+# Manual and Auto generated names are mutually exclusive
+#
+# Specifying storage accounts manually:
+#    
|abfss://<container-name>@<storage-account-name>.<domain-name>/<folder-name>. 
Use comma to specify multiple entries
+#    
Example:|abfss://[email protected]/accumulo,abfss://[email protected]/accumulo
+#
+# Specifying auto-generated storage accounts:
+#   <Number-of-Storage-Accounts>,<domain-name>|
+#   Example: 3,dfs.core.windows.net|
+instance_volumes_input = 1,dfs.core.windows.net|
+# Do not update "instance_volumes_preferred", it will be populated dynamically 
during launch phase of muchos
+instance_volumes_preferred =
+# Type of storage for ADLS Gen2 storage accounts
+adls_storage_type = Standard_LRS
+# Specify user assigned identity name. "{{ vmss_name }}-ua-msi"  will be 
created if value is not provided
+user_assigned_identity =
+# Do not update "azure_tenant_id", it will be populated dynamically during 
launch phase of muchos
+azure_tenant_id =
+# Do not update "azure_client_id", it will be populated dynamically during 
launch phase of muchos
+azure_client_id =
+# Do not update "principal_id", it will be populated dynamically during launch 
phase of muchos when "use_adlsg2 = True"
+principal_id = 
 # Optional Azure fileshare to mount on all nodes.
 # Path and credentials must be updated to enable this.
 #azure_fileshare_mount = /mnt/azure-fileshare
diff --git a/lib/muchos/config/azure.py b/lib/muchos/config/azure.py
index fe93b55..86c584c 100644
--- a/lib/muchos/config/azure.py
+++ b/lib/muchos/config/azure.py
@@ -104,4 +104,29 @@ class AzureDeployConfig(BaseConfig):
     @ansible_host_var(name='az_logs_key')
     @default(None)
     def logs_key(self):
-        return self.get('azure', 'az_logs_key')
\ No newline at end of file
+        return self.get('azure', 'az_logs_key')
+
+    @ansible_host_var(name='use_adlsg2')
+    @default(None)
+    def use_adlsg2(self):
+        return self.get('azure', 'use_adlsg2')
+
+    @ansible_host_var(name='azure_tenant_id')
+    @default(None)
+    def azure_tenant_id(self):
+        return self.get('azure', 'azure_tenant_id')
+
+    @ansible_host_var(name='azure_client_id')
+    @default(None)
+    def azure_client_id(self):
+        return self.get('azure', 'azure_client_id')
+
+    @ansible_host_var(name='principal_id')
+    @default(None)
+    def principal_id(self):
+        return self.get('azure', 'principal_id')
+
+    @ansible_host_var(name='instance_volumes_preferred')
+    @default(None)
+    def instance_volumes_preferred(self):
+        return self.get('azure', 'instance_volumes_preferred')

Reply via email to