This is an automated email from the ASF dual-hosted git repository.
jevans pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet-ci.git
The following commit(s) were added to refs/heads/master by this push:
new 89330e4 Update AMI generation tools. Add configs for Windows AMIs
with Cuda 11.4. (#37)
89330e4 is described below
commit 89330e47a581fc9062ed8f9482faf6bf579e47bc
Author: Joe Evans <[email protected]>
AuthorDate: Wed Oct 13 15:43:12 2021 -0700
Update AMI generation tools. Add configs for Windows AMIs with Cuda 11.4.
(#37)
* Add new config for mxnetlinux-gpu-g4 nodes.
* Add mxnetlinux_cpu for ubuntu 20.04 config.
* Remove sleep.
* Rename files
* Add delay before starting jenkins client so software updates can be
installed and services restarted.
* remove sleep
* Add new windows installer script for cuda11.4 and cudnn8.2.
* Add new userdata files for new cuda versions.
* Use variable for cuda root path.
* Disable strict filename checking when copying logfiles via scp.
* Add requirements.txt file for dependencies for create_ami.py.
* Update descriptions of nodes.
* Update Python version.
* Use Python 3.8.10; 3.8.12 is not available from Chocolatey.
Co-authored-by: Joe Evans <[email protected]>
---
.../lambda_mxnet_ci/autoscaling/handler.py | 17 +-
tools/ami-creator/create_ami.py | 4 +-
tools/ami-creator/requirements.txt | 2 +
.../scripts/win2019_cuda114_installer.py | 416 +++++++++++++++++++++
...inux_cpu.txt => mxnetlinux_cpu_ubuntu_1804.txt} | 1 -
...inux_cpu.txt => mxnetlinux_cpu_ubuntu_2004.txt} | 13 +-
...inux_cpu.txt => mxnetlinux_gpu_ubuntu_2004.txt} | 65 ++--
...-c5.txt => mxnetwindows_cpu_win2019_cuda11.txt} | 0
...c5.txt => mxnetwindows_cpu_win2019_cuda114.txt} | 4 +-
...txt => mxnetwindows_gpu_win2019_cuda114_g3.txt} | 4 +-
....txt => mxnetwindows_gpu_win2019_cuda11_g3.txt} | 0
11 files changed, 461 insertions(+), 65 deletions(-)
diff --git
a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
index e751e99..2f8cdc3 100755
--- a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
+++ b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
@@ -1260,7 +1260,7 @@ def _get_slave_configuration():
},
'mxnetlinux-cpu': {
'num_executors': _get_nb_executors_per_label()['mxnetlinux-cpu'],
# Number of executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu
16.04 on a c5.18xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'mxnetlinux-cpu', # Space separated labels string
'exclusive': True, # Only run jobs assigned to it
@@ -1287,7 +1287,7 @@ def _get_slave_configuration():
},
'restricted-mxnetlinux-gpu': {
'num_executors':
_get_nb_executors_per_label()['restricted-mxnetlinux-gpu'], # Number of
executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu
16.04 on a g3.8xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu on a
GPU instance',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'restricted-mxnetlinux-gpu', # Space separated labels
string
'exclusive': True, # Only run jobs assigned to it
@@ -1312,18 +1312,9 @@ def _get_slave_configuration():
'tunnel': _get_jenkins_private_tunnel_address(),
'job_name_restriction_regex': '^restricted-(.*)' # Only run jobs
which start with restricted-
},
- 'mxnetlinux-gpu-p3-8xlarge': {
- 'num_executors':
_get_nb_executors_per_label()['mxnetlinux-gpu-p3-8xlarge'], # Number of
executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu
16.04 on a p3.8xlarge',
- 'remote_fs': '/home/jenkins_slave', # Remote workspace location
- 'labels': 'mxnetlinux-gpu-p3-8xlarge', # Space separated labels
string
- 'exclusive': True, # Only run jobs assigned to it
- 'tunnel': _get_jenkins_private_tunnel_address(),
- 'job_name_restriction_regex': '^(?!restricted-).+' # Run only
unrestricted jobs
- },
'mxnetwindows-cpu': {
'num_executors':
_get_nb_executors_per_label()['mxnetwindows-cpu'], # Number of executors
- 'node_description': '[AUTOSCALING] MXNet slave running Windows
Datacenter 2016 on a c5.18xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Windows',
'remote_fs': 'C:/jenkins_slave', # Remote workspace location
'labels': 'mxnetwindows-cpu', # Space separated labels string
'exclusive': True, # Only run jobs assigned to it
@@ -1332,7 +1323,7 @@ def _get_slave_configuration():
},
'mxnetwindows-gpu': {
'num_executors':
_get_nb_executors_per_label()['mxnetwindows-gpu'], # Number of executors
- 'node_description': '[AUTOSCALING] MXNet slave running Windows
Datacenter 2016 on a g3.8xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Windows on
a GPU instance',
'remote_fs': 'C:/jenkins_slave', # Remote workspace location
'labels': 'mxnetwindows-gpu', # Space separated labels string
'exclusive': True, # Only run jobs assigned to it
diff --git a/tools/ami-creator/create_ami.py b/tools/ami-creator/create_ami.py
index 53c31a0..578af75 100755
--- a/tools/ami-creator/create_ami.py
+++ b/tools/ami-creator/create_ami.py
@@ -94,7 +94,7 @@ def wait_for_instance(instance, private_key):
logging.exception("Unable to get password data for
windows instance")
# attempt to save the latest userdata execute log
logfile = "log/userdata-{}.log".format(instance_id)
- ret =
subprocess.run(["scp","-q","-o","StrictHostKeyChecking=no","-o","ConnectTimeout=10","-i",private_key,"administrator@{}:\"C:\\ProgramData\Amazon\\EC2-Windows\\Launch\\Log\\UserdataExecution.log\"".format(i.public_ip_address),logfile])
+ ret =
subprocess.run(["scp","-q","-T","-o","StrictHostKeyChecking=no","-o","ConnectTimeout=10","-i",private_key,"administrator@{}:\"C:\\ProgramData\Amazon\\EC2-Windows\\Launch\\Log\\UserdataExecution.log\"".format(i.public_ip_address),logfile])
if ret.returncode == 0:
if os.stat(logfile).st_size != last_log_size:
last_log_size = os.stat(logfile).st_size
@@ -103,7 +103,7 @@ def wait_for_instance(instance, private_key):
logging.debug("Unable to retrieve userdata log via ssh,
does this windows system have sshd installed and running?")
continue
install_logfile = "log/install-{}.log".format(instance_id)
- ret =
subprocess.run(["scp","-q","-o","StrictHostKeyChecking=no","-i",private_key,"administrator@{}:\"C:\\install.log\"".format(i.public_ip_address),install_logfile])
+ ret =
subprocess.run(["scp","-q","-T","-o","StrictHostKeyChecking=no","-i",private_key,"administrator@{}:\"C:\\install.log\"".format(i.public_ip_address),install_logfile])
if ret.returncode == 0:
if os.stat(install_logfile).st_size !=
last_install_log_size:
last_install_log_size =
os.stat(install_logfile).st_size
diff --git a/tools/ami-creator/requirements.txt
b/tools/ami-creator/requirements.txt
new file mode 100644
index 0000000..42eeb8e
--- /dev/null
+++ b/tools/ami-creator/requirements.txt
@@ -0,0 +1,2 @@
+boto3
+pycryptodome
diff --git a/tools/ami-creator/scripts/win2019_cuda114_installer.py
b/tools/ami-creator/scripts/win2019_cuda114_installer.py
new file mode 100644
index 0000000..fcdd807
--- /dev/null
+++ b/tools/ami-creator/scripts/win2019_cuda114_installer.py
@@ -0,0 +1,416 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""Dependency installer for Windows"""
+
+__author__ = 'Pedro Larroy, Chance Bair, Joe Evans'
+__version__ = '0.4'
+
+import argparse
+import errno
+import logging
+import os
+import psutil
+import shutil
+import subprocess
+import urllib
+import stat
+import tempfile
+import zipfile
+from time import sleep
+from urllib.error import HTTPError
+import logging
+from subprocess import check_output, check_call, call
+import re
+import sys
+import urllib.request
+import contextlib
+import glob
+
+import ssl
+
+ssl._create_default_https_context = ssl._create_unverified_context
+
+log = logging.getLogger(__name__)
+
+
+DEPS = {
+ 'openblas':
'https://windows-post-install.s3-us-west-2.amazonaws.com/OpenBLAS-windows-v0_2_19.zip',
+ 'opencv':
'https://windows-post-install.s3-us-west-2.amazonaws.com/opencv-windows-4.1.2-vc14_vc15.zip',
+ 'cudnn8':
'https://windows-post-install.s3-us-west-2.amazonaws.com/cudnn-11.4-windows-x64-v8.2.4.15.zip',
+ 'perl':
'http://strawberryperl.com/download/5.30.1.1/strawberry-perl-5.30.1.1-64bit.msi',
+ 'clang':
'https://github.com/llvm/llvm-project/releases/download/llvmorg-9.0.1/LLVM-9.0.1-win64.exe',
+}
+
+DEFAULT_SUBPROCESS_TIMEOUT = 3600
+
+
[email protected]
+def remember_cwd():
+ '''
+ Restore current directory when exiting context
+ '''
+ curdir = os.getcwd()
+ try:
+ yield
+ finally:
+ os.chdir(curdir)
+
+
+def retry(target_exception, tries=4, delay_s=1, backoff=2):
+ """Retry calling the decorated function using an exponential backoff.
+
+ http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+ original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
+
+ :param target_exception: the exception to check. may be a tuple of
+ exceptions to check
+ :type target_exception: Exception or tuple
+ :param tries: number of times to try (not retry) before giving up
+ :type tries: int
+ :param delay_s: initial delay between retries in seconds
+ :type delay_s: int
+ :param backoff: backoff multiplier e.g. value of 2 will double the delay
+ each retry
+ :type backoff: int
+ """
+ import time
+ from functools import wraps
+
+ def decorated_retry(f):
+ @wraps(f)
+ def f_retry(*args, **kwargs):
+ mtries, mdelay = tries, delay_s
+ while mtries > 1:
+ try:
+ return f(*args, **kwargs)
+ except target_exception as e:
+ logging.warning("Exception: %s, Retrying in %d
seconds...", str(e), mdelay)
+ time.sleep(mdelay)
+ mtries -= 1
+ mdelay *= backoff
+ return f(*args, **kwargs)
+
+ return f_retry # true decorator
+
+ return decorated_retry
+
+
+@retry((ValueError, OSError, HTTPError), tries=5, delay_s=2, backoff=5)
+def download(url, dest=None, progress=False) -> str:
+ from urllib.request import urlopen
+ from urllib.parse import (urlparse, urlunparse)
+ import progressbar
+ import http.client
+
+ class ProgressCB():
+ def __init__(self):
+ self.pbar = None
+
+ def __call__(self, block_num, block_size, total_size):
+ if not self.pbar and total_size > 0:
+ self.pbar = progressbar.bar.ProgressBar(max_value=total_size)
+ downloaded = block_num * block_size
+ if self.pbar:
+ if downloaded < total_size:
+ self.pbar.update(downloaded)
+ else:
+ self.pbar.finish()
+ if dest and os.path.isdir(dest):
+ local_file = os.path.split(urlparse(url).path)[1]
+ local_path = os.path.normpath(os.path.join(dest, local_file))
+ else:
+ local_path = dest
+ with urlopen(url) as c:
+ content_length = c.getheader('content-length')
+ length = int(content_length) if content_length and isinstance(c,
http.client.HTTPResponse) else None
+ if length and local_path and os.path.exists(local_path) and
os.stat(local_path).st_size == length:
+ log.debug(f"download('{url}'): Already downloaded.")
+ return local_path
+ log.debug(f"download({url}, {local_path}): downloading {length} bytes")
+ if local_path:
+ with tempfile.NamedTemporaryFile(delete=False) as tmpfd:
+ urllib.request.urlretrieve(url, filename=tmpfd.name,
reporthook=ProgressCB() if progress else None)
+ shutil.move(tmpfd.name, local_path)
+ else:
+ (local_path, _) = urllib.request.urlretrieve(url,
reporthook=ProgressCB())
+ log.debug(f"download({url}, {local_path}'): done.")
+ return local_path
+
+
+# Takes arguments and runs command on host. Shell is disabled by default.
+# TODO: Move timeout to args
+def run_command(*args, shell=False, timeout=DEFAULT_SUBPROCESS_TIMEOUT,
**kwargs):
+ try:
+ logging.info("Issuing command: {}".format(args))
+ res = subprocess.check_output(*args, shell=shell,
timeout=timeout).decode("utf-8").replace("\r\n", "\n")
+ logging.info("Output: {}".format(res))
+ except subprocess.CalledProcessError as e:
+ raise RuntimeError("command '{}' return with error (code {}):
{}".format(e.cmd, e.returncode, e.output))
+ return res
+
+
+# Copies source directory recursively to destination.
+def copy(src, dest):
+ try:
+ shutil.copytree(src, dest)
+ logging.info("Moved {} to {}".format(src, dest))
+ except OSError as e:
+ # If the error was caused because the source wasn't a directory
+ if e.errno == errno.ENOTDIR:
+ shutil.copy(src, dest)
+ logging.info("Moved {} to {}".format(src, dest))
+ else:
+ raise RuntimeError("copy return with error: {}".format(e))
+
+
+# Workaround for windows readonly attribute error
+def on_rm_error(func, path, exc_info):
+ # path contains the path of the file that couldn't be removed
+ # let's just assume that it's read-only and unlink it.
+ os.chmod(path, stat.S_IWRITE)
+ os.unlink(path)
+
+
+def reboot_system():
+ logging.info("Rebooting system now...")
+ run_command("shutdown -r -t 5")
+ exit(0)
+
+
+def shutdown_system():
+ logging.info("Shutting down system now...")
+ # wait 20 sec so we can capture the install logs
+ run_command("shutdown -s -t 20")
+ exit(0)
+
+def install_vs():
+ if os.path.exists("C:\\Program Files (x86)\\Microsoft Visual
Studio\\2019"):
+ logging.info("MSVS already installed, skipping.")
+ return False
+ # Visual Studio 2019
+ # Components:
https://docs.microsoft.com/en-us/visualstudio/install/workload-component-id-vs-community?view=vs-2019#visual-studio-core-editor-included-with-visual-studio-community-2019
+ logging.info("Installing Visual Studio 2019...")
+ vs_file_path =
download('https://windows-post-install.s3-us-west-2.amazonaws.com/vs_community__1246179388.1585201415.exe')
+ run_command("PowerShell Rename-Item -Path {} -NewName
\"{}.exe\"".format(vs_file_path,
+
vs_file_path.split('\\')[-1]), shell=True)
+ vs_file_path = vs_file_path + '.exe'
+ logging.info("Installing VisualStudio 2019.....")
+ ret = call(vs_file_path +
+ ' --add Microsoft.VisualStudio.Workload.ManagedDesktop'
+ ' --add Microsoft.VisualStudio.Workload.NetCoreTools'
+ ' --add Microsoft.VisualStudio.Workload.NetWeb'
+ ' --add Microsoft.VisualStudio.Workload.Node'
+ ' --add Microsoft.VisualStudio.Workload.Office'
+ ' --add Microsoft.VisualStudio.Component.TypeScript.2.0'
+ ' --add Microsoft.VisualStudio.Component.TestTools.WebLoadTest'
+ ' --add Component.GitHub.VisualStudio'
+ ' --add
Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core'
+ ' --add Microsoft.VisualStudio.Component.Static.Analysis.Tools'
+ ' --add Microsoft.VisualStudio.Component.VC.CMake.Project'
+ ' --add Microsoft.VisualStudio.Component.VC.140'
+ ' --add
Microsoft.VisualStudio.Component.Windows10SDK.18362.Desktop'
+ ' --add Microsoft.VisualStudio.Component.Windows10SDK.18362.UWP'
+ ' --add
Microsoft.VisualStudio.Component.Windows10SDK.18362.UWP.Native'
+ ' --add
Microsoft.VisualStudio.ComponentGroup.Windows10SDK.18362'
+ ' --add Microsoft.VisualStudio.Component.Windows10SDK.16299'
+ ' --wait'
+ ' --passive'
+ ' --norestart'
+ )
+
+ if ret == 3010 or ret == 0:
+ # 3010 is restart required
+ logging.info("VS install successful.")
+ else:
+ raise RuntimeError("VS failed to install, exit status {}".format(ret))
+
+ # Workaround for --wait sometimes ignoring the subprocesses doing
component installs
+ def vs_still_installing():
+ return {'vs_installer.exe', 'vs_installershell.exe',
'vs_setup_bootstrapper.exe'} & set(map(lambda process: process.name(),
psutil.process_iter()))
+ timer = 0
+ while vs_still_installing() and timer < DEFAULT_SUBPROCESS_TIMEOUT:
+ logging.warning("VS installers still running for %d s", timer)
+ if timer % 60 == 0:
+ logging.info("Waiting for Visual Studio to install for the last {}
seconds".format(str(timer)))
+ sleep(1)
+ timer += 1
+ if vs_still_installing():
+ logging.warning("VS install still running after timeout (%d)",
DEFAULT_SUBPROCESS_TIMEOUT)
+ else:
+ logging.info("Visual studio install complete.")
+ return True
+
+
+def install_perl():
+ if os.path.exists("C:\\Strawberry\\perl\\bin\\perl.exe"):
+ logging.info("Perl already installed, skipping.")
+ return False
+ logging.info("Installing Perl")
+ with tempfile.TemporaryDirectory() as tmpdir:
+ perl_file_path = download(DEPS['perl'], tmpdir)
+ check_call(['msiexec ', '/n', '/passive', '/i', perl_file_path])
+ logging.info("Perl install complete")
+ return True
+
+
+def install_clang():
+ if os.path.exists("C:\\Program Files\\LLVM"):
+ logging.info("Clang already installed, skipping.")
+ return False
+ logging.info("Installing Clang")
+ with tempfile.TemporaryDirectory() as tmpdir:
+ clang_file_path = download(DEPS['clang'], tmpdir)
+ run_command(clang_file_path + " /S /D=C:\\Program Files\\LLVM")
+ logging.info("Clang install complete")
+ return True
+
+
+def install_openblas():
+ if os.path.exists("C:\\Program Files\\OpenBLAS-windows-v0_2_19"):
+ logging.info("OpenBLAS already installed, skipping.")
+ return False
+ logging.info("Installing OpenBLAS")
+ local_file = download(DEPS['openblas'])
+ with zipfile.ZipFile(local_file, 'r') as zip:
+ zip.extractall("C:\\Program Files")
+ run_command("PowerShell Set-ItemProperty -path
'hklm:\\system\\currentcontrolset\\control\\session manager\\environment' -Name
OpenBLAS_HOME -Value 'C:\\Program Files\\OpenBLAS-windows-v0_2_19'")
+ logging.info("Openblas Install complete")
+ return True
+
+
+def install_mkl():
+ if os.path.exists("C:\\Program Files (x86)\\IntelSWTools"):
+ logging.info("Intel MKL already installed, skipping.")
+ return False
+ logging.info("Installing MKL 2019.3.203...")
+ file_path =
download("http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/15247/w_mkl_2019.3.203.exe")
+ run_command("{} --silent --remove-extracted-files yes --a install
-output=C:\mkl-install-log.txt -eula=accept".format(file_path))
+ logging.info("MKL Install complete")
+ return True
+
+
+def install_opencv():
+ if os.path.exists("C:\\Program Files\\opencv"):
+ logging.info("OpenCV already installed, skipping.")
+ return False
+ logging.info("Installing OpenCV")
+ with tempfile.TemporaryDirectory() as tmpdir:
+ local_file = download(DEPS['opencv'])
+ with zipfile.ZipFile(local_file, 'r') as zip:
+ zip.extractall(tmpdir)
+ copy(f'{tmpdir}\\opencv\\build', r'c:\Program Files\opencv')
+
+ run_command("PowerShell Set-ItemProperty -path
'hklm:\\system\\currentcontrolset\\control\\session manager\\environment' -Name
OpenCV_DIR -Value 'C:\\Program Files\\opencv'")
+ logging.info("OpenCV install complete")
+ return True
+
+def install_cudnn8():
+ cuda_root_path = "C:\\Program Files\\NVIDIA GPU Computing
Toolkit\\CUDA\\v11.4"
+ if os.path.exists(cuda_root_path + "\\bin\\cudnn64_8.dll"):
+ logging.info("cuDNN8 already installed, skipping.")
+ return False
+ # cuDNN
+ logging.info("Installing cuDNN8")
+ with tempfile.TemporaryDirectory() as tmpdir:
+ local_file = download(DEPS['cudnn8'])
+ logging.info("Extracting cuDNN archive.")
+ with zipfile.ZipFile(local_file, 'r') as zip:
+ zip.extractall(tmpdir)
+ logging.info("Copying cuDNN distribution files.")
+ for f in glob.glob(tmpdir+"\\cuda\\bin\\*"):
+ copy(f, cuda_root_path + "\\bin")
+ for f in glob.glob(tmpdir+"\\cuda\\include\\*.h"):
+ copy(f, cuda_root_path + "\\include")
+ for f in glob.glob(tmpdir+"\\cuda\\lib\\x64\\*"):
+ copy(f, cuda_root_path + "\\lib\\x64")
+ logging.info("cuDNN8 install complete")
+ return True
+
+def instance_family():
+ return
urllib.request.urlopen('http://instance-data/latest/meta-data/instance-type').read().decode().split('.')[0]
+
+def install_cuda114():
+ if os.path.exists("C:\\Program Files\\NVIDIA GPU Computing
Toolkit\\CUDA\\v11.4\\bin"):
+ logging.info("CUDA 11.4 already installed, skipping.")
+ return False
+ logging.info("Downloadinng CUDA 11.4...")
+ cuda_file_path = download(
+
'https://windows-post-install.s3-us-west-2.amazonaws.com/cuda_11.4.2_win10_network.exe')
+ try:
+ check_call("PowerShell Rename-Item -Path {} -NewName
\"{}.exe\"".format(cuda_file_path,
+
cuda_file_path.split('\\')[-1]), shell=True)
+ except subprocess.CalledProcessError as e:
+ logging.exception("Rename file failed")
+ cuda_file_path = cuda_file_path + '.exe'
+ logging.info("Installing CUDA 11.4...")
+ check_call(cuda_file_path + ' -s')
+ logging.info("Done installing CUDA 11.4.")
+ return True
+
+def schedule_aws_userdata():
+ logging.info("Scheduling AWS init so userdata will run on next boot...")
+ run_command("PowerShell
C:\\ProgramData\\Amazon\\EC2-Windows\\Launch\\Scripts\\InitializeInstance.ps1
-Schedule")
+
+def add_paths():
+ # TODO: Add python paths (python -> C:\\Python37\\python.exe, python2 ->
C:\\Python27\\python.exe)
+ logging.info("Adding Windows Kits to PATH...")
+ current_path = run_command(
+ "PowerShell (Get-Itemproperty -path
'hklm:\\system\\currentcontrolset\\control\\session manager\\environment' -Name
Path).Path")
+ current_path = current_path.rstrip()
+ logging.debug("current_path: {}".format(current_path))
+ new_path = current_path + \
+ ";C:\\Program Files (x86)\\Windows
Kits\\10\\bin\\10.0.16299.0\\x86;C:\\Program
Files\\OpenBLAS-windows-v0_2_19\\bin;C:\\Program Files\\LLVM\\bin;C:\\Program
Files\\opencv\\bin;C:\\Program Files\\opencv\\x64\\vc15\\bin"
+ logging.debug("new_path: {}".format(new_path))
+ run_command("PowerShell Set-ItemProperty -path
'hklm:\\system\\currentcontrolset\\control\\session manager\\environment' -Name
Path -Value '" + new_path + "'")
+
+
+def script_name() -> str:
+ """:returns: script name with leading paths removed"""
+ return os.path.split(sys.argv[0])[1]
+
+def remove_install_task():
+ logging.info("Removing stage2 startup task...")
+ run_command("PowerShell Unregister-ScheduledTask -TaskName 'Stage2Install'
-Confirm:$false")
+
+
+def main():
+ logging.getLogger().setLevel(os.environ.get('LOGLEVEL', logging.DEBUG))
+ logging.basicConfig(filename="C:\\install.log", format='{}: %(asctime)sZ
%(levelname)s %(message)s'.format(script_name()))
+
+ # install all necessary software and reboot after some components
+
+ # for CUDA, the last version you install will be the default, based on
PATH variable
+ if install_cuda114():
+ reboot_system()
+ install_cudnn8()
+ if install_vs():
+ reboot_system()
+ install_openblas()
+ install_mkl()
+ install_opencv()
+ install_perl()
+ install_clang()
+ add_paths()
+ remove_install_task()
+ schedule_aws_userdata()
+ shutdown_system()
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/tools/ami-creator/userdata/mxnetlinux_cpu.txt
b/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_1804.txt
similarity index 99%
copy from tools/ami-creator/userdata/mxnetlinux_cpu.txt
copy to tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_1804.txt
index c89ace1..21703ba 100644
--- a/tools/ami-creator/userdata/mxnetlinux_cpu.txt
+++ b/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_1804.txt
@@ -44,7 +44,6 @@ write_files:
content: |
#!/bin/sh
set -ex
- sleep 180
python3 /home/jenkins_slave/scripts/slave-autoconnect.py
--slave-name-file=/home/jenkins_slave/jenkins_slave_name
--master-file=/home/jenkins_slave/jenkins_master_url
--master-private-file=/home/jenkins_slave/jenkins_master_private_url >
/home/jenkins_slave/auto-connect.log
- path: /etc/fstab
content: |
diff --git a/tools/ami-creator/userdata/mxnetlinux_cpu.txt
b/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
similarity index 88%
copy from tools/ami-creator/userdata/mxnetlinux_cpu.txt
copy to tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
index c89ace1..d802023 100644
--- a/tools/ami-creator/userdata/mxnetlinux_cpu.txt
+++ b/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
@@ -4,11 +4,6 @@ apt_reboot_if_required: false
package_update: true
package_upgrade: true
-apt:
- sources:
- focal.list:
- source: deb [arch=amd64] http://us-west-2.ec2.archive.ubuntu.com/ubuntu/
focal main restricted universe
-
packages:
- htop
- wget
@@ -21,17 +16,13 @@ packages:
- python3-joblib
- docker.io
- qemu
+ - qemu-user-static
- binfmt-support
- awscli
- nfs-common
- libattr1-dev
write_files:
- - path: /etc/apt/preferences
- content: |
- Package: *
- Pin: release a=focal
- Pin-Priority: 100
- path: /etc/docker/daemon.json
content: |
{
@@ -44,7 +35,6 @@ write_files:
content: |
#!/bin/sh
set -ex
- sleep 180
python3 /home/jenkins_slave/scripts/slave-autoconnect.py
--slave-name-file=/home/jenkins_slave/jenkins_slave_name
--master-file=/home/jenkins_slave/jenkins_master_url
--master-private-file=/home/jenkins_slave/jenkins_master_private_url >
/home/jenkins_slave/auto-connect.log
- path: /etc/fstab
content: |
@@ -71,7 +61,6 @@ runcmd:
- [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"/home/jenkins_slave/scripts/launch-autoconnect.sh" ]
- [ "curl", "-L",
"https://github.com/docker/compose/releases/download/1.25.5/docker-compose-Linux-x86_64",
"-o", "/usr/bin/docker-compose" ]
- [ "chmod", "+x", "/usr/bin/docker-compose" ]
- - [ "apt-get", "install", "-t", "focal", "-y", "qemu-user-static" ]
- [ "wget", "-O", "/tmp/qemu-binfmt-conf.sh",
"https://raw.githubusercontent.com/qemu/qemu/stable-4.1/scripts/qemu-binfmt-conf.sh"
]
- [ "chmod", "+x", "/tmp/qemu-binfmt-conf.sh" ]
- [ "/tmp/qemu-binfmt-conf.sh", "--persistent", "yes", "--qemu-suffix",
"-static", "--qemu-path", "/usr/bin", "--systemd", "ALL" ]
diff --git a/tools/ami-creator/userdata/mxnetlinux_cpu.txt
b/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
similarity index 67%
rename from tools/ami-creator/userdata/mxnetlinux_cpu.txt
rename to tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
index c89ace1..d00fcae 100644
--- a/tools/ami-creator/userdata/mxnetlinux_cpu.txt
+++ b/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
@@ -3,40 +3,40 @@
apt_reboot_if_required: false
package_update: true
package_upgrade: true
-
apt:
sources:
- focal.list:
- source: deb [arch=amd64] http://us-west-2.ec2.archive.ubuntu.com/ubuntu/
focal main restricted universe
+ nvidia-cuda-drivers.list:
+ source: |
+ deb
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /
+ keyid: 7FA2AF80
+ docker.list:
+ source: deb [arch=amd64] https://download.docker.com/linux/ubuntu
$RELEASE stable
+ keyid: 0EBFCD88
+ nvidia-container-runtime.list:
+ source: |
+ deb https://nvidia.github.io/libnvidia-container/ubuntu20.04/$(ARCH) /
+ deb
https://nvidia.github.io/nvidia-container-runtime/ubuntu20.04/$(ARCH) /
+ deb https://nvidia.github.io/nvidia-docker/ubuntu20.04/$(ARCH) /
+ keyid: F796ECB0
+
+apt_get_command: ["apt-get", "--option=Dpkg::Options::=--force-confold",
"--option=Dpkg::options::=--force-unsafe-io", "--assume-yes", "--quiet",
"--install-recommends"]
packages:
- - htop
- wget
+ - curl
- openjdk-8-jre
- git
- python3
- python3-pip
- - python3-yaml
- - python3-jenkins
- - python3-joblib
- - docker.io
- - qemu
- - binfmt-support
- - awscli
- - nfs-common
- - libattr1-dev
+ - cuda-drivers
+ - apt-transport-https
+ - ca-certificates
+ - software-properties-common
+ - docker-ce
+ - nvidia-container-toolkit
+ - nvidia-container-runtime
write_files:
- - path: /etc/apt/preferences
- content: |
- Package: *
- Pin: release a=focal
- Pin-Priority: 100
- - path: /etc/docker/daemon.json
- content: |
- {
- "live-restore": true
- }
- path: /etc/cron.d/jenkins-start-slave
content: |
@reboot jenkins_slave /home/jenkins_slave/scripts/launch-autoconnect.sh
@@ -44,13 +44,16 @@ write_files:
content: |
#!/bin/sh
set -ex
- sleep 180
python3 /home/jenkins_slave/scripts/slave-autoconnect.py
--slave-name-file=/home/jenkins_slave/jenkins_slave_name
--master-file=/home/jenkins_slave/jenkins_master_url
--master-private-file=/home/jenkins_slave/jenkins_master_private_url >
/home/jenkins_slave/auto-connect.log
- path: /etc/fstab
content: |
/swapfile none swap sw 0 0
append: true
-
+ - path: /etc/docker/daemon.json
+ content: |
+ {
+ "live-restore": true
+ }
runcmd:
- [ "fallocate", "-l", "10G", "/swapfile" ]
@@ -61,20 +64,16 @@ runcmd:
- [ "useradd", "jenkins_slave" ]
- [ "usermod", "-L", "jenkins_slave" ]
- [ "mkdir", "-p", "/home/jenkins_slave/remoting",
"/home/jenkins_slave/scripts" ]
- - [ "pip3", "install", "docker<4.0.0", "boto3" ]
- - [ "pip3", "install", "--upgrade", "awscli" ]
- - [ "usermod", "-aG", "docker", "jenkins_slave" ]
- - [ "systemctl", "enable", "docker" ]
+ - [ "pip3", "install", "boto3", "python-jenkins", "joblib", "docker",
"awscli" ]
- [ "wget", "-O", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"https://raw.githubusercontent.com/apache/incubator-mxnet-ci/master/tools/jenkins-slave-creation-unix/scripts/deploy/slave-autoconnect.py"
]
- [ "touch", "/home/jenkins_slave/auto-connect.log" ]
- [ "chown", "-R", "jenkins_slave:jenkins_slave", "/home/jenkins_slave" ]
- [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"/home/jenkins_slave/scripts/launch-autoconnect.sh" ]
+ - [ "usermod", "-aG", "docker", "jenkins_slave" ]
+ - [ "systemctl", "enable", "docker" ]
+ - [ "service", "docker", "restart" ]
- [ "curl", "-L",
"https://github.com/docker/compose/releases/download/1.25.5/docker-compose-Linux-x86_64",
"-o", "/usr/bin/docker-compose" ]
- [ "chmod", "+x", "/usr/bin/docker-compose" ]
- - [ "apt-get", "install", "-t", "focal", "-y", "qemu-user-static" ]
- - [ "wget", "-O", "/tmp/qemu-binfmt-conf.sh",
"https://raw.githubusercontent.com/qemu/qemu/stable-4.1/scripts/qemu-binfmt-conf.sh"
]
- - [ "chmod", "+x", "/tmp/qemu-binfmt-conf.sh" ]
- - [ "/tmp/qemu-binfmt-conf.sh", "--persistent", "yes", "--qemu-suffix",
"-static", "--qemu-path", "/usr/bin", "--systemd", "ALL" ]
- [ "rm", "-f", "/var/lib/cloud/instances/*/sem/config_scripts_user",
"/var/lib/cloud/instance/sem/config_scripts_user" ]
- [ "sleep", "10" ]
- [ "halt", "-p" ]
diff --git a/tools/ami-creator/userdata/win2019-cuda11-c5.txt
b/tools/ami-creator/userdata/mxnetwindows_cpu_win2019_cuda11.txt
similarity index 100%
copy from tools/ami-creator/userdata/win2019-cuda11-c5.txt
copy to tools/ami-creator/userdata/mxnetwindows_cpu_win2019_cuda11.txt
diff --git a/tools/ami-creator/userdata/win2019-cuda11-c5.txt
b/tools/ami-creator/userdata/mxnetwindows_cpu_win2019_cuda114.txt
similarity index 96%
rename from tools/ami-creator/userdata/win2019-cuda11-c5.txt
rename to tools/ami-creator/userdata/mxnetwindows_cpu_win2019_cuda114.txt
index 02bf240..2f95ea2 100644
--- a/tools/ami-creator/userdata/win2019-cuda11-c5.txt
+++ b/tools/ami-creator/userdata/mxnetwindows_cpu_win2019_cuda114.txt
@@ -35,7 +35,7 @@ refreshenv
Remove-Item -path ./install.ps1 -force
# install dependencies
-Check-Call { C:\ProgramData\chocolatey\choco install python --version=3.7.0
--force -y --no-progress }
+Check-Call { C:\ProgramData\chocolatey\choco install python --version=3.8.10
--force -y --no-progress }
refreshenv
Check-Call { C:\Python37\python -m pip install --upgrade pip }
Invoke-WebRequest -Uri
https://raw.githubusercontent.com/aiengines/ci/master/ami_generation/windows/requirements.txt
-OutFile requirements.txt
@@ -81,7 +81,7 @@ function Check-Call {
}
Set-ExecutionPolicy Bypass -Scope Process -Force
cd C:\Users\Administrator
-Invoke-WebRequest -Uri
https://windows-post-install.s3-us-west-2.amazonaws.com/win2019_cuda11_installer.py
-OutFile C:\Users\Administrator\windows_deps_headless_installer.py
+Invoke-WebRequest -Uri
https://windows-post-install.s3-us-west-2.amazonaws.com/win2019_cuda114_installer.py
-OutFile C:\Users\Administrator\windows_deps_headless_installer.py
Check-Call { C:\Python37\python windows_deps_headless_installer.py }
'@
diff --git a/tools/ami-creator/userdata/win2019-cuda11-g3.txt
b/tools/ami-creator/userdata/mxnetwindows_gpu_win2019_cuda114_g3.txt
similarity index 96%
copy from tools/ami-creator/userdata/win2019-cuda11-g3.txt
copy to tools/ami-creator/userdata/mxnetwindows_gpu_win2019_cuda114_g3.txt
index 77eb16d..563286c 100644
--- a/tools/ami-creator/userdata/win2019-cuda11-g3.txt
+++ b/tools/ami-creator/userdata/mxnetwindows_gpu_win2019_cuda114_g3.txt
@@ -35,7 +35,7 @@ refreshenv
Remove-Item -path ./install.ps1 -force
# install dependencies
-Check-Call { C:\ProgramData\chocolatey\choco install python --version=3.7.0
--force -y --no-progress }
+Check-Call { C:\ProgramData\chocolatey\choco install python --version=3.8.10
--force -y --no-progress }
refreshenv
Check-Call { C:\Python37\python -m pip install --upgrade pip }
Invoke-WebRequest -Uri
https://raw.githubusercontent.com/aiengines/ci/master/ami_generation/windows/requirements.txt
-OutFile requirements.txt
@@ -81,7 +81,7 @@ function Check-Call {
}
Set-ExecutionPolicy Bypass -Scope Process -Force
cd C:\Users\Administrator
-Invoke-WebRequest -Uri
https://windows-post-install.s3-us-west-2.amazonaws.com/win2019_cuda11_installer.py
-OutFile C:\Users\Administrator\windows_deps_headless_installer.py
+Invoke-WebRequest -Uri
https://windows-post-install.s3-us-west-2.amazonaws.com/win2019_cuda114_installer.py
-OutFile C:\Users\Administrator\windows_deps_headless_installer.py
Check-Call { C:\Python37\python windows_deps_headless_installer.py }
'@
diff --git a/tools/ami-creator/userdata/win2019-cuda11-g3.txt
b/tools/ami-creator/userdata/mxnetwindows_gpu_win2019_cuda11_g3.txt
similarity index 100%
rename from tools/ami-creator/userdata/win2019-cuda11-g3.txt
rename to tools/ami-creator/userdata/mxnetwindows_gpu_win2019_cuda11_g3.txt