This is an automated email from the ASF dual-hosted git repository. mykolabodnar pushed a commit to branch DATALAB-2449 in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git
commit 45599a2448d91b27cd8cd206e1a896447324e236 Author: bodnarmykola <[email protected]> AuthorDate: Thu Jul 1 14:58:56 2021 +0300 [DATALAB-2449] - [GCP] GPU type and count fixed for Jupyter and Spark cluster --- .../src/general/lib/gcp/actions_lib.py | 10 +++---- .../general/scripts/gcp/common_create_instance.py | 4 ++- .../general/scripts/gcp/common_prepare_notebook.py | 17 ++++++++---- .../general/scripts/gcp/dataengine_configure.py | 32 ++++++++++++++++++++++ .../src/general/scripts/gcp/dataengine_prepare.py | 31 ++++++++++++++------- .../src/general/scripts/gcp/jupyter_configure.py | 2 +- 6 files changed, 73 insertions(+), 23 deletions(-) diff --git a/infrastructure-provisioning/src/general/lib/gcp/actions_lib.py b/infrastructure-provisioning/src/general/lib/gcp/actions_lib.py index a2429bc..fa1d891 100644 --- a/infrastructure-provisioning/src/general/lib/gcp/actions_lib.py +++ b/infrastructure-provisioning/src/general/lib/gcp/actions_lib.py @@ -320,7 +320,7 @@ class GCPActions: initial_user, image_name, secondary_image_name, service_account_name, instance_class, network_tag, labels, static_ip='', primary_disk_size='12', secondary_disk_size='30', - gpu_accelerator_type='None'): + gpu_accelerator_type='None', gpu_accelerator_count='1'): key = RSA.importKey(open(ssh_key_path, 'rb').read()) ssh_key = key.publickey().exportKey("OpenSSH").decode('UTF-8') unique_index = datalab.meta_lib.GCPMeta().get_index_by_service_account_name(service_account_name) @@ -439,12 +439,12 @@ class GCPActions: if instance_class == 'notebook' or instance_class == 'dataengine': del instance_params['networkInterfaces'][0]['accessConfigs'] if gpu_accelerator_type != 'None': - request = self.service.acceleratorTypes().list(project=self.project, zone = zone) - result = request.execute().get('items') - gpu_accelerator_type = result[0].get('name') + #request = self.service.acceleratorTypes().list(project=self.project, zone = zone) + #result = request.execute().get('items') + #gpu_accelerator_type = result[0].get('name') instance_params['guestAccelerators'] = [ { - "acceleratorCount": 1, + "acceleratorCount": gpu_accelerator_count, "acceleratorType": "projects/{0}/zones/{1}/acceleratorTypes/{2}".format( self.project, zone, gpu_accelerator_type) } diff --git a/infrastructure-provisioning/src/general/scripts/gcp/common_create_instance.py b/infrastructure-provisioning/src/general/scripts/gcp/common_create_instance.py index 3ab863f..b62f882 100644 --- a/infrastructure-provisioning/src/general/scripts/gcp/common_create_instance.py +++ b/infrastructure-provisioning/src/general/scripts/gcp/common_create_instance.py @@ -45,6 +45,7 @@ parser.add_argument('--instance_class', type=str, default='') parser.add_argument('--static_ip', type=str, default='') parser.add_argument('--labels', type=str, default='{"empty":"string"}') parser.add_argument('--gpu_accelerator_type', type=str, default='None') +parser.add_argument('--gpu_accelerator_count', type=str, default='None') parser.add_argument('--network_tag', type=str, default='') parser.add_argument('--cluster_name', type=str, default='') parser.add_argument('--service_base_name', type=str, default='') @@ -62,7 +63,8 @@ if __name__ == "__main__": args.instance_size, args.ssh_key_path, args.initial_user, args.image_name, args.secondary_image_name, args.service_account_name, args.instance_class, args.network_tag, json.loads(args.labels), args.static_ip, - args.primary_disk_size, args.secondary_disk_size, args.gpu_accelerator_type) + args.primary_disk_size, args.secondary_disk_size, args.gpu_accelerator_type, + args.gpu_accelerator_count) else: parser.print_help() sys.exit(2) diff --git a/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py b/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py index 3ba882d..17ac8e0 100644 --- a/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py +++ b/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py @@ -97,7 +97,7 @@ if __name__ == "__main__": notebook_config['project_name'], notebook_config['endpoint_name'], notebook_config['exploratory_name']) - notebook_config['primary_disk_size'] = (lambda x: '50' if x == 'deeplearning' else '16')( + notebook_config['primary_disk_size'] = (lambda x: '60' if x == 'deeplearning' else '20')( os.environ['application']) notebook_config['secondary_disk_size'] = os.environ['notebook_disk_size'] @@ -155,9 +155,14 @@ if __name__ == "__main__": notebook_config['secondary_image_name'].get('name')) notebook_config['gpu_accelerator_type'] = 'None' + notebook_config['gpu_accelerator_count'] = 'None' if os.environ['application'] in ('tensor', 'tensor-rstudio', 'deeplearning') or os.environ['gpu_enabled'] == 'True': - notebook_config['gpu_accelerator_type'] = os.environ['gcp_gpu_accelerator_type'] + if os.environ['gpuType'] != '': + notebook_config['gpu_accelerator_type'] = os.environ['gpuType'] + notebook_config['gpu_accelerator_count'] = os.environ['gpuCount'] + else: + notebook_config['gpu_accelerator_type'] = os.environ['gcp_gpu_accelerator_type'] notebook_config['network_tag'] = '{0}-{1}-{2}-ps'.format(notebook_config['service_base_name'], notebook_config['project_name'], @@ -194,16 +199,16 @@ if __name__ == "__main__": params = "--instance_name {0} --region {1} --zone {2} --vpc_name {3} --subnet_name {4} --instance_size {5} " \ "--ssh_key_path {6} --initial_user {7} --service_account_name {8} --image_name {9} " \ "--secondary_image_name {10} --instance_class {11} --primary_disk_size {12} " \ - "--secondary_disk_size {13} --gpu_accelerator_type {14} --network_tag {15} --labels '{16}' " \ - "--service_base_name {17}".\ + "--secondary_disk_size {13} --gpu_accelerator_type {14} --gpu_accelerator_count {15} --network_tag {16} --labels '{17}' " \ + "--service_base_name {18}".\ format(notebook_config['instance_name'], notebook_config['region'], notebook_config['zone'], notebook_config['vpc_name'], notebook_config['subnet_name'], notebook_config['instance_size'], notebook_config['ssh_key_path'], notebook_config['initial_user'], notebook_config['notebook_service_account_name'], notebook_config['primary_image_name'], notebook_config['secondary_image_name'], 'notebook', notebook_config['primary_disk_size'], notebook_config['secondary_disk_size'], notebook_config['gpu_accelerator_type'], - notebook_config['network_tag'], json.dumps(notebook_config['labels']), - notebook_config['service_base_name']) + notebook_config['gpu_accelerator_count'], notebook_config['network_tag'], + json.dumps(notebook_config['labels']), notebook_config['service_base_name']) try: subprocess.run("~/scripts/{}.py {}".format('common_create_instance', params), shell=True, check=True) except: diff --git a/infrastructure-provisioning/src/general/scripts/gcp/dataengine_configure.py b/infrastructure-provisioning/src/general/scripts/gcp/dataengine_configure.py index 8507703..87e6bb2 100644 --- a/infrastructure-provisioning/src/general/scripts/gcp/dataengine_configure.py +++ b/infrastructure-provisioning/src/general/scripts/gcp/dataengine_configure.py @@ -124,6 +124,22 @@ def configure_slave(slave_number, data_engine): datalab.fab.append_result("Failed to configure slave node.", str(err)) sys.exit(1) + if 'slave_gpu_type' in os.environ: + try: + print('[INSTALLING GPU DRIVERS ON MASTER NODE]') + params = "--hostname {} --keyfile {} --os_user {}".format( + slave_hostname, keyfile_name, data_engine['datalab_ssh_user']) + try: + subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', params), shell=True, check=True) + except: + datalab.fab.append_result("Failed installing gpu drivers") + raise Exception + + except Exception as err: + datalab.fab.append_result("Failed to install GPU drivers.", str(err)) + GCPActions.remove_instance(notebook_config['instance_name'], notebook_config['zone']) + sys.exit(1) + def clear_resources(): for i in range(data_engine['instance_count'] - 1): @@ -298,6 +314,22 @@ if __name__ == "__main__": clear_resources() sys.exit(1) + if 'master_gpu_type' in os.environ: + try: + print('[INSTALLING GPU DRIVERS ON MASTER NODE]') + params = "--hostname {} --keyfile {} --os_user {}".format( + master_node_hostname, keyfile_name, data_engine['datalab_ssh_user']) + try: + subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', params), shell=True, check=True) + except: + datalab.fab.append_result("Failed installing gpu drivers") + raise Exception + + except Exception as err: + datalab.fab.append_result("Failed to install GPU drivers.", str(err)) + GCPActions.remove_instance(notebook_config['instance_name'], notebook_config['zone']) + sys.exit(1) + try: jobs = [] for slave in range(data_engine['instance_count'] - 1): diff --git a/infrastructure-provisioning/src/general/scripts/gcp/dataengine_prepare.py b/infrastructure-provisioning/src/general/scripts/gcp/dataengine_prepare.py index 64a27c0..2051b7c 100644 --- a/infrastructure-provisioning/src/general/scripts/gcp/dataengine_prepare.py +++ b/infrastructure-provisioning/src/general/scripts/gcp/dataengine_prepare.py @@ -150,7 +150,17 @@ if __name__ == "__main__": data_engine['gpu_accelerator_type'] = 'None' if os.environ['application'] in ('tensor', 'tensor-rstudio', 'deeplearning'): - data_engine['gpu_accelerator_type'] = os.environ['gcp_gpu_accelerator_type'] + if os.environ['gpu_type'] != '': + notebook_config['gpu_accelerator_type'] = os.environ['gpu_type'] + else: + notebook_config['gpu_accelerator_type'] = os.environ['gcp_gpu_accelerator_type'] + + if 'master_gpu_type' in os.environ: + data_engine['gpu_master_accelerator_type'] = os.environ['master_gpu_type'] + data_engine['gpu_master_accelerator_count'] = os.environ['master_gpu_count'] + data_engine['gpu_slave_accelerator_type'] = os.environ['slave_gpu_type'] + data_engine['gpu_slave_accelerator_count'] = os.environ['slave_gpu_count'] + data_engine['network_tag'] = '{0}-{1}-{2}-ps'.format(data_engine['service_base_name'], data_engine['project_name'], data_engine['endpoint_name']) additional_tags = os.environ['tags'].replace("': '", ":").replace("', '", ",").replace("{'", "").replace( @@ -185,14 +195,14 @@ if __name__ == "__main__": params = "--instance_name {0} --region {1} --zone {2} --vpc_name {3} --subnet_name {4} --instance_size {5} " \ "--ssh_key_path {6} --initial_user {7} --service_account_name {8} --image_name {9} " \ "--secondary_image_name {10} --instance_class {11} --primary_disk_size {12} " \ - "--secondary_disk_size {13} --gpu_accelerator_type {14} --network_tag {15} --cluster_name {16} " \ - "--labels '{17}' --service_base_name {18}". \ + "--secondary_disk_size {13} --gpu_accelerator_type {14} --gpu_accelerator_count {15} --network_tag {16} --cluster_name {17} " \ + "--labels '{18}' --service_base_name {19}". \ format(data_engine['master_node_name'], data_engine['region'], data_engine['zone'], data_engine['vpc_name'], data_engine['subnet_name'], data_engine['master_size'], data_engine['ssh_key_path'], initial_user, data_engine['dataengine_service_account_name'], data_engine['primary_image_name'], data_engine['secondary_image_name'], 'dataengine', data_engine['primary_disk_size'], - data_engine['secondary_disk_size'], data_engine['gpu_accelerator_type'], - data_engine['network_tag'], data_engine['cluster_name'], + data_engine['secondary_disk_size'], data_engine['gpu_master_accelerator_type'], + data_engine['gpu_master_accelerator_count'], data_engine['network_tag'], data_engine['cluster_name'], json.dumps(data_engine['master_labels']), data_engine['service_base_name']) try: subprocess.run("~/scripts/{}.py {}".format('common_create_instance', params), shell=True, check=True) @@ -212,16 +222,17 @@ if __name__ == "__main__": params = "--instance_name {0} --region {1} --zone {2} --vpc_name {3} --subnet_name {4} " \ "--instance_size {5} --ssh_key_path {6} --initial_user {7} --service_account_name {8} " \ "--image_name {9} --secondary_image_name {10} --instance_class {11} --primary_disk_size {12} " \ - "--secondary_disk_size {13} --gpu_accelerator_type {14} --network_tag {15} --cluster_name {16} " \ - "--labels '{17}' --service_base_name {18}". \ + "--secondary_disk_size {13} --gpu_accelerator_type {14} --gpu_accelerator_count {15} --network_tag {16} --cluster_name {17} " \ + "--labels '{18}' --service_base_name {19}". \ format(slave_name, data_engine['region'], data_engine['zone'], data_engine['vpc_name'], data_engine['subnet_name'], data_engine['slave_size'], data_engine['ssh_key_path'], initial_user, data_engine['dataengine_service_account_name'], data_engine['primary_image_name'], data_engine['secondary_image_name'], 'dataengine', data_engine['primary_disk_size'], - data_engine['secondary_disk_size'], data_engine['gpu_accelerator_type'], - data_engine['network_tag'], data_engine['cluster_name'], - json.dumps(data_engine['slave_labels']), data_engine['service_base_name']) + data_engine['secondary_disk_size'], data_engine['gpu_slave_accelerator_type'], + data_engine['gpu_slave_accelerator_count'], data_engine['network_tag'], + data_engine['cluster_name'], json.dumps(data_engine['slave_labels']), + data_engine['service_base_name']) try: subprocess.run("~/scripts/{}.py {}".format('common_create_instance', params), shell=True, check=True) except: diff --git a/infrastructure-provisioning/src/general/scripts/gcp/jupyter_configure.py b/infrastructure-provisioning/src/general/scripts/gcp/jupyter_configure.py index 14a48f6..0ede3eb 100644 --- a/infrastructure-provisioning/src/general/scripts/gcp/jupyter_configure.py +++ b/infrastructure-provisioning/src/general/scripts/gcp/jupyter_configure.py @@ -245,7 +245,7 @@ if __name__ == "__main__": try: subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', params), shell=True, check=True) except: - datalab.fab.append_result("Failed installing users key") + datalab.fab.append_result("Failed installing gpu drivers") raise Exception except Exception as err: --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
