This is an automated email from the ASF dual-hosted git repository.

lfrolov pushed a commit to branch DATALAB-2998
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git
commit e6d0b171333cf7fd1e65b55a34543965b811cd5a
Author: leonidfrolov <[email protected]>
AuthorDate: Tue Sep 20 16:49:48 2022 +0300

    [DATALAB-2998]: added new files for zeppelin dataengine-service connection
---
 .../src/general/lib/os/fab.py                      |   6 +
 ...common_notebook_configure_dataengine-service.py | 129 +++++++++++++++++++++
 .../zeppelin_dataengine-service_create_configs.py  |  94 +++++++++++++++
 .../zeppelin_install_dataengine-service_kernels.py | 116 ++++++++++++++++++
 4 files changed, 345 insertions(+)

diff --git a/infrastructure-provisioning/src/general/lib/os/fab.py b/infrastructure-provisioning/src/general/lib/os/fab.py
index 6635258d2..928a6b5c4 100644
--- a/infrastructure-provisioning/src/general/lib/os/fab.py
+++ b/infrastructure-provisioning/src/general/lib/os/fab.py
@@ -1428,3 +1428,9 @@ def update_pyopenssl_lib(os_user):
             conn.sudo('touch /home/{}/.ensure_dir/pyopenssl_updated'.format(os_user))
         except:
             sys.exit(1)
+
+def get_hdinsight_headnode_private_ip(os_user, cluster_name, keyfile):
+    init_datalab_connection('{}-ssh.azurehdinsight.net'.format(cluster_name), os_user, keyfile)
+    headnode_private_ip = conn.sudo("cat /etc/hosts | grep headnode | awk '{print $1}'")
+    conn.close()
+    return headnode_private_ip
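Note: the new get_hdinsight_headnode_private_ip() helper relies on the module-level `conn` that init_datalab_connection() opens, and conn.sudo() hands back a Fabric Result object rather than a bare string. A minimal sketch of how a caller might reduce it to a plain IP (the user, cluster name and key path below are hypothetical placeholders):

    from datalab.fab import get_hdinsight_headnode_private_ip

    # hypothetical values; real ones come from notebook_config / os.environ
    result = get_hdinsight_headnode_private_ip('datalab-user', 'sbn-prj-ep-des-cluster', '/root/keys/key.pem')
    # Fabric's Result carries the command output in .stdout; strip the trailing newline
    headnode_ip = result.stdout.strip() if hasattr(result, 'stdout') else str(result).strip()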
diff --git a/infrastructure-provisioning/src/general/scripts/azure/common_notebook_configure_dataengine-service.py b/infrastructure-provisioning/src/general/scripts/azure/common_notebook_configure_dataengine-service.py
new file mode 100644
index 000000000..92484657f
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/common_notebook_configure_dataengine-service.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import datalab.fab
+import datalab.actions_lib
+import datalab.meta_lib
+import json
+from datalab.logger import logging
+import os
+import sys
+import traceback
+import subprocess
+from fabric import *
+
+
+def clear_resources():
+    AzureActions.terminate_hdinsight_cluster(notebook_config['resource_group_name'], notebook_config['cluster_name'])
+    for storage_account in AzureMeta.list_storage_accounts(notebook_config['resource_group_name']):
+        if notebook_config['storage_account_name_tag'] == storage_account.tags["Name"]:
+            AzureActions.remove_storage_account(notebook_config['resource_group_name'], storage_account.name)
+    # AzureActions.remove_kernels(notebook_config['notebook_name'], notebook_config['cluster_name'],
+    #                             os.environ['dataproc_version'], os.environ['conf_os_user'], notebook_config['key_path'])
+
+
+if __name__ == "__main__":
+    # generating variables dictionary
+    AzureMeta = datalab.meta_lib.AzureMeta()
+    AzureActions = datalab.actions_lib.AzureActions()
+    logging.info('Generating infrastructure names and tags')
+    notebook_config = dict()
+    notebook_config['resource_group_name'] = os.environ['azure_resource_group_name']
+    notebook_config['service_base_name'] = (os.environ['conf_service_base_name'])
+    notebook_config['notebook_name'] = os.environ['notebook_instance_name']
+    notebook_config['edge_user_name'] = (os.environ['edge_user_name'])
+    notebook_config['project_name'] = (os.environ['project_name']).replace('_', '-').lower()
+    notebook_config['project_tag'] = notebook_config['project_name']
+    notebook_config['endpoint_name'] = (os.environ['endpoint_name']).replace('_', '-').lower()
+    notebook_config['endpoint_tag'] = notebook_config['endpoint_name']
+    notebook_config['tag_name'] = notebook_config['service_base_name'] + '-tag'
+    notebook_config['bucket_name'] = '{0}-{1}-{2}-bucket'.format(notebook_config['service_base_name'],
+                                                                 notebook_config['project_name'],
+                                                                 notebook_config['endpoint_name'])
+    notebook_config['cluster_name'] = '{}-{}-{}-des-{}'.format(notebook_config['service_base_name'],
+                                                               notebook_config['project_name'],
+                                                               notebook_config['endpoint_name'],
+                                                               notebook_config['computational_name'])
+    notebook_config['storage_account_name_tag'] = ('{}-bucket'.format(notebook_config['cluster_name'])).lower()
+    notebook_config['notebook_ip'] = AzureMeta.get_private_ip_address(notebook_config['resource_group_name'],
+                                                                      notebook_config['notebook_name'])
+    notebook_config['key_path'] = '{0}{1}.pem'.format(os.environ['conf_key_dir'], os.environ['conf_key_name'])
+    edge_instance_name = '{0}-{1}-{2}-edge'.format(notebook_config['service_base_name'],
+                                                   notebook_config['project_name'], notebook_config['endpoint_tag'])
+    edge_instance_hostname = AzureMeta.get_private_ip_address(notebook_config['resource_group_name'],
+                                                              edge_instance_name)
+
+    if os.environ['application'] == 'deeplearning':
+        application = 'jupyter'
+    else:
+        application = os.environ['application']
+
+    try:
+        logging.info('[INSTALLING KERNELS INTO SPECIFIED NOTEBOOK]')
+        params = "--bucket {} --cluster_name {} --dataproc_version {} --keyfile {} --notebook_ip {} --region {} " \
+                 "--edge_user_name {} --project_name {} --os_user {} --edge_hostname {} --proxy_port {} " \
+                 "--scala_version {} --application {}" \
+            .format(notebook_config['storage_account_name_tag'], notebook_config['cluster_name'],
+                    os.environ['dataproc_version'], notebook_config['key_path'], notebook_config['notebook_ip'],
+                    os.environ['gcp_region'], notebook_config['edge_user_name'], notebook_config['project_name'],
+                    os.environ['conf_os_user'], edge_instance_hostname, '3128',
+                    os.environ['notebook_scala_version'], os.environ['application'])
+        try:
+            subprocess.run("~/scripts/{}_{}.py {}".format(application, 'install_dataengine-service_kernels', params),
+                           shell=True, check=True)
+        except:
+            traceback.print_exc()
+            raise Exception
+    except Exception as err:
+        clear_resources()
+        datalab.fab.append_result("Failed installing Dataproc kernels.", str(err))
+        sys.exit(1)
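Note: this Azure script still reads several GCP-flavoured settings (os.environ['dataproc_version'], os.environ['gcp_region'], and the "Failed installing Dataproc kernels." message), and notebook_config['computational_name'] is used to build cluster_name but is never assigned, which would raise KeyError at runtime. A hedged sketch of the presumably intended assignment, modelled on the GCP/AWS analogues (the exact environment variable names are assumptions, not part of this commit):

    # assumed analogue of the other cloud scripts; not part of this commit
    notebook_config['computational_name'] = os.environ['computational_name'].replace('_', '-').lower()
    # an HDInsight-specific version/region would presumably replace the Dataproc leftovers, e.g.:
    hdinsight_version = os.environ.get('hdinsight_version', '4.0')  # hypothetical key
    azure_region = os.environ['azure_region']                       # hedged: assumed existing DataLab setting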
+
+    try:
+        logging.info('[UPDATING SPARK CONFIGURATION FILES ON NOTEBOOK]')
+        params = "--hostname {0} " \
+                 "--keyfile {1} " \
+                 "--os_user {2} " \
+            .format(notebook_config['notebook_ip'],
+                    notebook_config['key_path'],
+                    os.environ['conf_os_user'])
+        try:
+            subprocess.run("~/scripts/{0}.py {1}".format('common_configure_spark', params), shell=True, check=True)
+        except:
+            traceback.print_exc()
+            raise Exception
+    except Exception as err:
+        datalab.fab.append_result("Failed to configure Spark.", str(err))
+        clear_resources()
+        sys.exit(1)
+
+    try:
+        with open("/root/result.json", 'w') as result:
+            res = {"notebook_name": notebook_config['notebook_name'],
+                   "Tag_name": notebook_config['tag_name'],
+                   "Action": "Configure notebook server"}
+            logging.info(json.dumps(res))
+            result.write(json.dumps(res))
+    except Exception as err:
+        datalab.fab.append_result("Error with writing results", str(err))
+        clear_resources()
+        sys.exit(1)
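Note: like the other DataLab provisioning scripts, this one is parameterized entirely through environment variables that the orchestrator injects before execution. A minimal sketch of a standalone invocation (all values are placeholders, and the computational_name entry reflects the assumed fix noted above):

    import os
    import subprocess

    # hypothetical standalone run; in DataLab these variables are set by the provisioning engine
    os.environ.update({'azure_resource_group_name': 'datalab-rg',
                       'conf_service_base_name': 'sbn',
                       'notebook_instance_name': 'sbn-prj-ep-nb-zeppelin',
                       'edge_user_name': 'user1',
                       'project_name': 'prj',
                       'endpoint_name': 'ep',
                       'computational_name': 'des1',
                       'conf_key_dir': '/root/keys/',
                       'conf_key_name': 'key',
                       'application': 'zeppelin'})
    subprocess.run('~/scripts/common_notebook_configure_dataengine-service.py', shell=True, check=True)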
diff --git a/infrastructure-provisioning/src/general/scripts/azure/zeppelin_dataengine-service_create_configs.py b/infrastructure-provisioning/src/general/scripts/azure/zeppelin_dataengine-service_create_configs.py
new file mode 100644
index 000000000..f645b64b4
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/zeppelin_dataengine-service_create_configs.py
@@ -0,0 +1,94 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import argparse
+import subprocess
+from datalab.actions_lib import jars, yarn, install_emr_spark, spark_defaults, installing_python, configure_zeppelin_emr_interpreter
+from datalab.common_lib import *
+from datalab.fab import configuring_notebook, update_zeppelin_interpreters
+from datalab.notebook_lib import *
+from fabric import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--bucket', type=str, default='')
+parser.add_argument('--cluster_name', type=str, default='')
+parser.add_argument('--dry_run', type=str, default='false')
+parser.add_argument('--emr_version', type=str, default='')
+parser.add_argument('--spark_version', type=str, default='')
+parser.add_argument('--scala_version', type=str, default='')
+parser.add_argument('--hadoop_version', type=str, default='')
+parser.add_argument('--matplotlib_version', type=str, default='')
+parser.add_argument('--region', type=str, default='')
+parser.add_argument('--excluded_lines', type=str, default='')
+parser.add_argument('--project_name', type=str, default='')
+parser.add_argument('--os_user', type=str, default='')
+parser.add_argument('--edge_hostname', type=str, default='')
+parser.add_argument('--proxy_port', type=str, default='')
+parser.add_argument('--livy_version', type=str, default='')
+parser.add_argument('--multiple_clusters', type=str, default='')
+parser.add_argument('--numpy_version', type=str, default='')
+parser.add_argument('--application', type=str, default='')
+parser.add_argument('--r_enabled', type=str, default='')
+args = parser.parse_args()
+
+emr_dir = '/opt/' + args.emr_version + '/jars/'
+kernels_dir = '/home/' + args.os_user + '/.local/share/jupyter/kernels/'
+spark_dir = '/opt/' + args.emr_version + '/' + args.cluster_name + '/spark/'
+yarn_dir = '/opt/' + args.emr_version + '/' + args.cluster_name + '/conf/'
+
+
+def install_remote_livy(args):
+    subprocess.run('sudo chown ' + args.os_user + ':' + args.os_user + ' -R /opt/zeppelin/', shell=True, check=True)
+    subprocess.run('sudo service zeppelin-notebook stop', shell=True, check=True)
+    subprocess.run('sudo -i wget http://archive.cloudera.com/beta/livy/livy-server-' + args.livy_version + '.zip -O /opt/'
+                   + args.emr_version + '/' + args.cluster_name + '/livy-server-' + args.livy_version + '.zip',
+                   shell=True, check=True)
+    subprocess.run('sudo unzip /opt/'
+                   + args.emr_version + '/' + args.cluster_name + '/livy-server-' + args.livy_version + '.zip -d /opt/'
+                   + args.emr_version + '/' + args.cluster_name + '/', shell=True, check=True)
+    subprocess.run('sudo mv /opt/' + args.emr_version + '/' + args.cluster_name + '/livy-server-' + args.livy_version +
+                   '/ /opt/' + args.emr_version + '/' + args.cluster_name + '/livy/', shell=True, check=True)
+    livy_path = '/opt/' + args.emr_version + '/' + args.cluster_name + '/livy/'
+    subprocess.run('sudo mkdir -p ' + livy_path + '/logs', shell=True, check=True)
+    subprocess.run('sudo mkdir -p /var/run/livy', shell=True, check=True)
+    subprocess.run('sudo chown ' + args.os_user + ':' + args.os_user + ' -R /var/run/livy', shell=True, check=True)
+    subprocess.run('sudo chown ' + args.os_user + ':' + args.os_user + ' -R ' + livy_path, shell=True, check=True)
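Note: install_remote_livy() still downloads Livy from the EMR-era archive.cloudera.com/beta URL, which may no longer resolve, and the install path keeps the emr_version naming even though this is the Azure/HDInsight variant. A sketch of exercising the function in isolation (the Namespace values are placeholders; a real run needs sudo, wget and unzip on the notebook VM):

    from argparse import Namespace

    # placeholder arguments for a dry exercise of the function
    livy_args = Namespace(os_user='datalab-user', emr_version='4.0',
                          cluster_name='sbn-prj-ep-des-cluster', livy_version='0.3.0')
    install_remote_livy(livy_args)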
+
+
+if __name__ == "__main__":
+    if args.dry_run == 'true':
+        parser.print_help()
+    else:
+        result = prepare(emr_dir, yarn_dir)
+        if result == False:
+            jars(args, emr_dir)
+            yarn(args, yarn_dir)
+            install_emr_spark(args)
+            spark_defaults(args)
+            configuring_notebook(args.emr_version)
+            if args.multiple_clusters == 'true':
+                install_remote_livy(args)
+            installing_python(args.region, args.bucket, args.project_name, args.cluster_name, args.application,
+                              args.numpy_version, args.matplotlib_version)
+            configure_zeppelin_emr_interpreter(args.emr_version, args.cluster_name, args.region, spark_dir, args.os_user,
+                                               yarn_dir, args.bucket, args.project_name, endpoint_url, args.multiple_clusters)
+        update_zeppelin_interpreters(args.multiple_clusters, args.r_enabled)
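Note: endpoint_url is passed to configure_zeppelin_emr_interpreter() but is not defined or parsed anywhere in this file, so this branch would raise NameError unless one of the star imports happens to provide the name. A hedged sketch of making the dependency explicit (the --endpoint_url flag is an assumption, not part of this commit):

    # hypothetical: declare the missing argument alongside the others
    parser.add_argument('--endpoint_url', type=str, default='')
    # ...and thread it through instead of relying on an implicit global:
    # configure_zeppelin_emr_interpreter(args.emr_version, args.cluster_name, args.region, spark_dir,
    #                                    args.os_user, yarn_dir, args.bucket, args.project_name,
    #                                    args.endpoint_url, args.multiple_clusters)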
diff --git a/infrastructure-provisioning/src/general/scripts/azure/zeppelin_install_dataengine-service_kernels.py b/infrastructure-provisioning/src/general/scripts/azure/zeppelin_install_dataengine-service_kernels.py
new file mode 100644
index 000000000..b8ea051d1
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/zeppelin_install_dataengine-service_kernels.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import argparse
+import os
+from datalab.meta_lib import *
+from fabric import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--bucket', type=str, default='')
+parser.add_argument('--cluster_name', type=str, default='')
+parser.add_argument('--dry_run', type=str, default='false')
+parser.add_argument('--emr_version', type=str, default='')
+parser.add_argument('--keyfile', type=str, default='')
+parser.add_argument('--region', type=str, default='')
+parser.add_argument('--notebook_ip', type=str, default='')
+parser.add_argument('--scala_version', type=str, default='')
+parser.add_argument('--emr_excluded_spark_properties', type=str, default='')
+parser.add_argument('--project_name', type=str, default='')
+parser.add_argument('--os_user', type=str, default='')
+parser.add_argument('--edge_hostname', type=str, default='')
+parser.add_argument('--proxy_port', type=str, default='')
+parser.add_argument('--application', type=str, default='')
+args = parser.parse_args()
+
+
+def configure_notebook(args):
+    templates_dir = '/root/templates/'
+    scripts_dir = '/root/scripts/'
+    if os.environ['notebook_multiple_clusters'] == 'true':
+        conn.put(templates_dir + 'dataengine-service_interpreter_livy.json', '/tmp/dataengine-service_interpreter.json')
+    else:
+        conn.put(templates_dir + 'dataengine-service_interpreter_spark.json', '/tmp/dataengine-service_interpreter.json')
+    conn.put('{}{}_dataengine-service_create_configs.py'.format(scripts_dir, args.application),
+             '/tmp/zeppelin_dataengine-service_create_configs.py')
+    conn.sudo('\cp /tmp/zeppelin_dataengine-service_create_configs.py '
+              '/usr/local/bin/zeppelin_dataengine-service_create_configs.py')
+    conn.sudo('chmod 755 /usr/local/bin/zeppelin_dataengine-service_create_configs.py')
+    conn.sudo('mkdir -p /usr/lib/python3.8/datalab/')
+    conn.run('mkdir -p /tmp/datalab_libs/')
+    conn.local('rsync -e "ssh -i {}" /usr/lib/python3.8/datalab/*.py {}@{}:/tmp/datalab_libs/'.format(args.keyfile, args.os_user, args.notebook_ip))
+    conn.run('chmod a+x /tmp/datalab_libs/*')
+    conn.sudo('mv /tmp/datalab_libs/* /usr/lib/python3.8/datalab/')
+    if exists(conn, '/usr/lib64'):
+        conn.sudo('mkdir -p /usr/lib64/python3.8')
+        conn.sudo('ln -fs /usr/lib/python3.8/datalab /usr/lib64/python3.8/datalab')
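Note: configure_notebook() mixes execution contexts: conn.local() runs rsync on the provisioning host to push the /usr/lib/python3.8/datalab libraries out, while conn.run()/conn.sudo() act on the notebook VM. Also, the datalab.fab name used in __main__ below and the exists() helper do not appear to be imported directly; they presumably arrive via the star imports. A hedged sketch of the imports this file seems to assume (the exact module layout may differ):

    # hedged sketch: imports this script appears to rely on indirectly
    import datalab.fab              # used below as datalab.fab.init_datalab_connection(...)
    from datalab.fab import exists  # used in configure_notebook() for the /usr/lib64 check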
+
+
+if __name__ == "__main__":
+    global conn
+    conn = datalab.fab.init_datalab_connection(args.notebook_ip, args.os_user, args.keyfile)
+    configure_notebook(args)
+    spark_version = "None"  # get_spark_version(args.cluster_name)
+    hadoop_version = "None"  # get_hadoop_version(args.cluster_name)
+    livy_version = os.environ['notebook_livy_version']
+    r_enabled = os.environ['notebook_r_enabled']
+    numpy_version = os.environ['notebook_numpy_version']
+    matplotlib_version = os.environ['notebook_matplotlib_version']
+    command = "/usr/bin/python3 /usr/local/bin/zeppelin_dataengine-service_create_configs.py " \
+              "--bucket {0} " \
+              "--cluster_name {1} " \
+              "--emr_version {2} " \
+              "--spark_version {3} " \
+              "--hadoop_version {4} " \
+              "--region {5} " \
+              "--excluded_lines '{6}' " \
+              "--project_name {7} " \
+              "--os_user {8} " \
+              "--edge_hostname {9} " \
+              "--proxy_port {10} " \
+              "--scala_version {11} " \
+              "--livy_version {12} " \
+              "--multiple_clusters {13} " \
+              "--numpy_version {14} " \
+              "--matplotlib_version {15} " \
+              "--application {16} " \
+              "--r_enabled {17}" \
+        .format(args.bucket,
+                args.cluster_name,
+                args.emr_version,
+                spark_version,
+                hadoop_version,
+                args.region,
+                args.emr_excluded_spark_properties,
+                args.project_name,
+                args.os_user,
+                args.edge_hostname,
+                args.proxy_port,
+                args.scala_version,
+                livy_version,
+                os.environ['notebook_multiple_clusters'],
+                numpy_version,
+                matplotlib_version,
+                args.application,
+                r_enabled)
+    conn.sudo(command)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
