This is an automated email from the ASF dual-hosted git repository. mykolabodnar pushed a commit to branch DATALAB-2398 in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git
commit 5b71bea8100d421a3bf1c69473bffa90d535688e Author: bodnarmykola <[email protected]> AuthorDate: Tue Jun 8 16:44:35 2021 +0300 [DATALAB-2398] - [Jupyter with TensorFlow][GCP] python specific version via vevn usage implemented --- .../src/general/lib/os/debian/notebook_lib.py | 62 +++++++++++----------- .../src/general/lib/os/fab.py | 12 +++++ .../src/general/scripts/gcp/tensor_configure.py | 16 ++++++ .../src/general/templates/os/tensorboard.service | 2 +- .../src/tensor/scripts/configure_tensor_node.py | 12 +++-- 5 files changed, 70 insertions(+), 34 deletions(-) diff --git a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py index d6d7d63..ec725c2 100644 --- a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py +++ b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py @@ -293,37 +293,37 @@ def install_tensor(os_user, cuda_version, cuda_file_name, if not exists(datalab.fab.conn,'/home/{}/.ensure_dir/tensor_ensured'.format(os_user)): try: # install nvidia drivers - datalab.fab.conn.sudo('''bash -c 'echo "blacklist nouveau" >> /etc/modprobe.d/blacklist-nouveau.conf' ''') - datalab.fab.conn.sudo('''bash -c 'echo "options nouveau modeset=0" >> /etc/modprobe.d/blacklist-nouveau.conf' ''') - datalab.fab.conn.sudo('update-initramfs -u') - datalab.fab.conn.sudo('reboot', warn=True) - time.sleep(60) - manage_pkg('-y install', 'remote', 'dkms libglvnd-dev') - kernel_version = datalab.fab.conn.run('uname -r | tr -d "[..0-9-]"').stdout.replace('\n','') - if kernel_version == 'azure': - manage_pkg('-y install', 'remote', 'linux-modules-`uname -r`') - else: + #datalab.fab.conn.sudo('''bash -c 'echo "blacklist nouveau" >> /etc/modprobe.d/blacklist-nouveau.conf' ''') + #datalab.fab.conn.sudo('''bash -c 'echo "options nouveau modeset=0" >> /etc/modprobe.d/blacklist-nouveau.conf' ''') + #datalab.fab.conn.sudo('update-initramfs -u') + #datalab.fab.conn.sudo('reboot', warn=True) + #time.sleep(60) + ##manage_pkg('-y install', 'remote', 'dkms libglvnd-dev') + #kernel_version = datalab.fab.conn.run('uname -r | tr -d "[..0-9-]"').stdout.replace('\n','') + #if kernel_version == 'azure': + # manage_pkg('-y install', 'remote', 'linux-modules-`uname -r`') + #else: # legacy support for old kernels - datalab.fab.conn.sudo(''' bash -c 'if [[ $(apt-cache search linux-image-`uname -r`) ]]; then apt-get -y ''' - '''install linux-image-`uname -r`; else apt-get -y install linux-modules-`uname -r`; fi;' ''') - datalab.fab.conn.sudo('wget https://us.download.nvidia.com/tesla/{0}/NVIDIA-Linux-x86_64-{0}.run -O ' - '/home/{1}/NVIDIA-Linux-x86_64-{0}.run'.format(nvidia_version, os_user)) - datalab.fab.conn.sudo('/bin/bash /home/{0}/NVIDIA-Linux-x86_64-{1}.run -s --dkms'.format(os_user, nvidia_version)) - datalab.fab.conn.sudo('rm -f /home/{0}/NVIDIA-Linux-x86_64-{1}.run'.format(os_user, nvidia_version)) + # datalab.fab.conn.sudo(''' bash -c 'if [[ $(apt-cache search linux-image-`uname -r`) ]]; then apt-get -y ''' + # '''install linux-image-`uname -r`; else apt-get -y install linux-modules-`uname -r`; fi;' ''') + #datalab.fab.conn.sudo('wget https://us.download.nvidia.com/tesla/{0}/NVIDIA-Linux-x86_64-{0}.run -O ' + # '/home/{1}/NVIDIA-Linux-x86_64-{0}.run'.format(nvidia_version, os_user)) + #datalab.fab.conn.sudo('/bin/bash /home/{0}/NVIDIA-Linux-x86_64-{1}.run -s --dkms'.format(os_user, nvidia_version)) + #datalab.fab.conn.sudo('rm -f /home/{0}/NVIDIA-Linux-x86_64-{1}.run'.format(os_user, nvidia_version)) # install cuda - datalab.fab.conn.sudo('python3 -m pip install --upgrade pip=={0} wheel numpy=={1} --no-cache-dir'.format( - os.environ['conf_pip_version'], os.environ['notebook_numpy_version'])) - datalab.fab.conn.sudo('wget -P /opt https://developer.download.nvidia.com/compute/cuda/{0}/Prod/local_installers/{1}'.format( - cuda_version, cuda_file_name)) - datalab.fab.conn.sudo('apt -y install gcc-8 g++-8') - datalab.fab.conn.sudo('update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 8') - datalab.fab.conn.sudo('update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 8') - datalab.fab.conn.sudo('sh /opt/{} --silent --toolkit'.format(cuda_file_name)) + #datalab.fab.conn.sudo('python3 -m pip install --upgrade pip=={0} wheel numpy=={1} --no-cache-dir'.format( + # os.environ['conf_pip_version'], os.environ['notebook_numpy_version'])) + #datalab.fab.conn.sudo('wget -P /opt https://developer.download.nvidia.com/compute/cuda/{0}/Prod/local_installers/{1}'.format( + # cuda_version, cuda_file_name)) + #datalab.fab.conn.sudo('apt -y install gcc-8 g++-8') + ##datalab.fab.conn.sudo('update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 8') + #datalab.fab.conn.sudo('update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 8') + #datalab.fab.conn.sudo('sh /opt/{} --silent --toolkit'.format(cuda_file_name)) #datalab.fab.conn.sudo('update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9') #datalab.fab.conn.sudo('update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9') - datalab.fab.conn.sudo('mv /usr/local/cuda-{} /opt/'.format(cuda_version)) - datalab.fab.conn.sudo('ln -s /opt/cuda-{0} /usr/local/cuda-{0}'.format(cuda_version)) - datalab.fab.conn.sudo('rm -f /opt/{}'.format(cuda_file_name)) + #datalab.fab.conn.sudo('mv /usr/local/cuda-{} /opt/'.format(cuda_version)) + #datalab.fab.conn.sudo('ln -s /opt/cuda-{0} /usr/local/cuda-{0}'.format(cuda_version)) + #datalab.fab.conn.sudo('rm -f /opt/{}'.format(cuda_file_name)) # install cuDNN datalab.fab.conn.run('wget https://developer.download.nvidia.com/compute/redist/cudnn/v{0}/{1} -O /tmp/{1}'.format( cudnn_version, cudnn_file_name)) @@ -336,11 +336,13 @@ def install_tensor(os_user, cuda_version, cuda_file_name, datalab.fab.conn.run('''bash -l -c 'echo "export LD_LIBRARY_PATH=\"$LD_LIBRARY_PATH:/opt/cudnn/lib64:/usr/local/cuda/lib64\"" >> ~/.bashrc' ''') # install TensorFlow and run TensorBoard # datalab.fab.conn.sudo('python2.7 -m pip install --upgrade https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-{}-cp27-none-linux_x86_64.whl --no-cache-dir'.format(tensorflow_version)) - datalab.fab.conn.sudo('python3 -m pip install --upgrade tensorflow-gpu=={} --no-cache-dir'.format(tensorflow_version)) + datalab.fab.install_venv_pip_pkg('tensorflow-gpu',tensorflow_version) datalab.fab.conn.sudo('mkdir /var/log/tensorboard') datalab.fab.conn.sudo('chown {0}:{0} -R /var/log/tensorboard'.format(os_user)) datalab.fab.conn.put('{}tensorboard.service'.format(templates_dir), '/tmp/tensorboard.service') datalab.fab.conn.sudo("sed -i 's|OS_USR|{}|' /tmp/tensorboard.service".format(os_user)) + venv_activation = 'source /opt/python/python{0}/bin/activate &&'.format(os.environ['notebook_python_venv_version'], os.environ['notebook_python_venv_version'][:3]) + datalab.fab.conn.sudo("sed -i 's|VENV_ACTIVATION|{}|' /tmp/tensorboard.service".format(venv_activation)) http_proxy = datalab.fab.conn.run('''bash -l -c 'echo $http_proxy' ''').stdout.replace('\n','') https_proxy = datalab.fab.conn.run('''bash -l -c 'echo $https_proxy' ''').stdout.replace('\n','') datalab.fab.conn.sudo('sed -i \'/\[Service\]/ a\Environment=\"HTTP_PROXY={}\"\' /tmp/tensorboard.service'.format( @@ -520,13 +522,13 @@ def install_cntk(os_user, cntk_version): def install_keras(os_user, keras_version): if not exists(datalab.fab.conn,'/home/{}/.ensure_dir/keras_ensured'.format(os_user)): - datalab.fab.conn.sudo('pip3 install keras=={} --no-cache-dir'.format(keras_version)) + datalab.fab.install_venv_pip_pkg('keras',keras_version) datalab.fab.conn.sudo('touch /home/{}/.ensure_dir/keras_ensured'.format(os_user)) def install_theano(os_user, theano_version): if not exists(datalab.fab.conn,'/home/{}/.ensure_dir/theano_ensured'.format(os_user)): - datalab.fab.conn.sudo('python3 -m pip install Theano=={} --no-cache-dir'.format(theano_version)) + datalab.fab.install_venv_pip_pkg('Theano',theano_version) datalab.fab.conn.sudo('touch /home/{}/.ensure_dir/theano_ensured'.format(os_user)) diff --git a/infrastructure-provisioning/src/general/lib/os/fab.py b/infrastructure-provisioning/src/general/lib/os/fab.py index 9447242..5f1d6b8 100644 --- a/infrastructure-provisioning/src/general/lib/os/fab.py +++ b/infrastructure-provisioning/src/general/lib/os/fab.py @@ -57,6 +57,18 @@ def ensure_python_venv(python_venv_version): print('Error:', str(err)) sys.exit(1) +def install_venv_pip_pkg(pkg_name, pkg_version = ''): + try: + venv_install_command = 'source /opt/python/python{0}/bin/activate && /opt/python/python{0}/bin/pip{1}'.format( + os.environ['notebook_python_venv_version'], os.environ['notebook_python_venv_version'][:3]) + if pkg_version: + pip_pkg = '{}=={}'.format(pkg_name,pkg_version) + else: + pip_pkg = pkg_name + conn.sudo('''bash -l -c '{0} install {1} --no-cache-dir' '''.format(venv_install_command, pip_pkg)) + except Exception as err: + print('Error:', str(err)) + sys.exit(1) def ensure_pip(requisites): try: diff --git a/infrastructure-provisioning/src/general/scripts/gcp/tensor_configure.py b/infrastructure-provisioning/src/general/scripts/gcp/tensor_configure.py index f26bb69..9708b3b 100644 --- a/infrastructure-provisioning/src/general/scripts/gcp/tensor_configure.py +++ b/infrastructure-provisioning/src/general/scripts/gcp/tensor_configure.py @@ -155,6 +155,22 @@ if __name__ == "__main__": GCPActions.remove_instance(notebook_config['instance_name'], notebook_config['zone']) sys.exit(1) + #Installing GPU drivers + try: + print('[INSTALLING GPU DRIVERS]') + params = "--hostname {} --keyfile {} --os_user {}".format( + instance_hostname, notebook_config['ssh_key_path'], notebook_config['datalab_ssh_user']) + try: + subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', params), shell=True, check=True) + except: + datalab.fab.append_result("Failed installing users key") + raise Exception + + except Exception as err: + datalab.fab.append_result("Failed to install GPU drivers.", str(err)) + GCPActions.remove_instance(notebook_config['instance_name'], notebook_config['zone']) + sys.exit(1) + # installing and configuring TensorFlow and all dependencies try: logging.info('[CONFIGURE TENSORFLOW NOTEBOOK INSTANCE]') diff --git a/infrastructure-provisioning/src/general/templates/os/tensorboard.service b/infrastructure-provisioning/src/general/templates/os/tensorboard.service index f7dee8b..bf7c949 100644 --- a/infrastructure-provisioning/src/general/templates/os/tensorboard.service +++ b/infrastructure-provisioning/src/general/templates/os/tensorboard.service @@ -25,7 +25,7 @@ Description=Tensorflow Tensorboard [Service] Type=simple PIDFile=/var/run/tensorboard.pid -ExecStart=/bin/bash -c "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/cudnn/lib64:/usr/local/cuda/lib64; tensorboard --logdir=/var/log/tensorboard --host 0.0.0.0 --port 6006" +ExecStart=/bin/bash -c "VENV_ACTIVATION export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/cudnn/lib64:/usr/local/cuda/lib64; tensorboard --logdir=/var/log/tensorboard --host 0.0.0.0 --port 6006" ExecStop=/bin/bash -c "for i in $(ps aux | grep 'tensorboard' | grep -v grep | awk '{print $2}'); do kill -9 $i; done" User=OS_USR Group=OS_USR diff --git a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py index 6953719..c9b5e3f 100644 --- a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py +++ b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py @@ -49,6 +49,8 @@ jupyter_version = os.environ['notebook_jupyter_version'] nvidia_version = os.environ['notebook_nvidia_version'] theano_version = os.environ['notebook_theano_version'] keras_version = os.environ['notebook_keras_version'] +python_venv_version = os.environ['notebook_python_venv_version'] +python_venv_path = '/opt/python/python{0}/bin/python{1}'.format(python_venv_version, python_venv_version[:3]) if args.region == 'cn-north-1': spark_link = "http://mirrors.hust.edu.cn/apache/spark/spark-" + spark_version + "/spark-" + spark_version + \ "-bin-hadoop" + hadoop_version + ".tgz" @@ -93,6 +95,10 @@ if __name__ == "__main__": print("Install Python 3 modules") ensure_python3_libraries(args.os_user) + # INSTALL PYTHON IN VIRTUALENV + print("Configure Python Virtualenv") + ensure_python_venv(python_venv_version) + # INSTALL TENSORFLOW AND OTHER DEEP LEARNING LIBRARIES print("Install TensorFlow") install_tensor(args.os_user, cuda_version, cuda_file_name, @@ -116,10 +122,10 @@ if __name__ == "__main__": configure_local_spark(jars_dir, templates_dir) # INSTALL JUPYTER KERNELS - print("Install pyspark local kernel for Jupyter") - ensure_pyspark_local_kernel(args.os_user, pyspark_local_path_dir, templates_dir, spark_version) + #print("Install pyspark local kernel for Jupyter") + #ensure_pyspark_local_kernel(args.os_user, pyspark_local_path_dir, templates_dir, spark_version) print("Install py3spark local kernel for Jupyter") - ensure_py3spark_local_kernel(args.os_user, py3spark_local_path_dir, templates_dir, spark_version) + ensure_py3spark_local_kernel(args.os_user, py3spark_local_path_dir, templates_dir, spark_version, python_venv_path, python_venv_version) # INSTALL UNGIT print("Install nodejs") --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
