This is an automated email from the ASF dual-hosted git repository.
jevans pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet-ci.git
The following commit(s) were added to refs/heads/master by this push:
new 1a1e89c Updates to AMI creation utility and autoscaling lambda (#42)
1a1e89c is described below
commit 1a1e89c3eec7fcbb392e14f9b92369801a29eb9c
Author: Joe Evans <[email protected]>
AuthorDate: Fri Jan 21 11:43:56 2022 -0800
Updates to AMI creation utility and autoscaling lambda (#42)
* Update lambda autoscaler to remove Windows hourly-billing workarounds,
add Jenkins' new name for the master node ('Built-In Node'), change launch
parameters to use the default launch template version instead of specifying
it via environment variables, and update descriptions of worker nodes.
* Add aarch64 userdata for building graviton jenkins workers.
* Update templates to perform apt upgrade on boot-up before running the
Jenkins worker script, to prevent auto-updates from killing running CI pipelines.
* Update Windows installer script to install the correct VC runtime for
compiling MXNet.
* Update the create-AMI utility with the ability to update launch templates
(and set their default version) if launch template IDs are passed.
---
.../lambda_mxnet_ci/autoscaling/handler.py | 54 ++++++-----------
tools/ami-creator/create_ami.py | 69 ++++++++++++++++++++--
tools/ami-creator/scripts/run-auto-connect.bat | 3 +
.../scripts/win2019_cuda114_installer.py | 1 +
....txt => mxnetlinux_cpu_aarch64_ubuntu_2004.txt} | 21 ++++---
.../userdata/mxnetlinux_cpu_ubuntu_2004.txt | 13 +++-
.../userdata/mxnetlinux_gpu_ubuntu_2004.txt | 13 +++-
7 files changed, 123 insertions(+), 51 deletions(-)
diff --git
a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
index 2f8cdc3..3cce28a 100755
--- a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
+++ b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
@@ -59,9 +59,6 @@ RE_NO_AVAILABLE_NODES = r"(^Waiting for next available
executor$)"
# Offline cause for nodes that have been taken offline by the jenkins
monitoring
NODE_MONITOR_OFFLINE_CAUSE = 'hudson.node_monitors'
-# Since windows got hourly billing, we only want to consider instances which
are running close to the full hour
-WINDOWS_MIN_PARTIAL_RUNTIME_SECONDS = 55 * 60
-
# EC2s API only allows a specific number of filters. This constant defines the
chunk size for these requests
EC2_FILTER_CHUNK_SIZE = 40
@@ -279,24 +276,6 @@ def determine_scale_down_nodes(nodes_data: List[Dict[str,
Any]], instance_uptime
display_name)
continue
- # Windows instances are getting billed hourly and need special
handling
- if 'Windows' in
node_data['monitorData']['hudson.node_monitors.ArchitectureMonitor']:
- if display_name not in instance_uptime:
- logging.error('Unable to find uptime for %s', display_name)
- continue
-
- running_duration_seconds = instance_uptime[display_name]
- running_duration_partial = running_duration_seconds % (60 * 60)
- # Don't shutdown instances below XXh50min uptime to make use
of hourly billing
- if running_duration_partial <
WINDOWS_MIN_PARTIAL_RUNTIME_SECONDS:
- considered_nodes[label].append(node_data)
- logging.debug(
- 'Ignoring %s because partial runtime %ds is below
limit of %ds (hourly billing). Total '
- 'runtime: %ds',
- display_name, running_duration_partial,
WINDOWS_MIN_PARTIAL_RUNTIME_SECONDS,
- running_duration_seconds)
- continue
-
# TODO: Check for how long an instance has been idling. There is
no built-in API for now and the
# only way is to go through the entire Jenkins build history. Save
this up for later.
@@ -349,7 +328,7 @@ def _determine_faulty_nodes(nodes: List[Dict[str, Any]],
unconnected_instances:
label2faulty_nodes[label].append(node)
for node in nodes:
- if node['displayName'] == 'master':
+ if node['displayName'] in ['master', 'Built-In Node']:
# Don't do anything for master
continue
@@ -716,7 +695,6 @@ def _launch_ec2_instances(scale_up_slots, ec2_resource):
launch_template = launch_templates[label]
launch_template_id = launch_template['id']
- launch_template_version = launch_template['version']
for target_instance_name in target_instance_names:
logging.debug('Launching instance %s of type %s',
target_instance_name, label)
@@ -730,7 +708,6 @@ def _launch_ec2_instances(scale_up_slots, ec2_resource):
'label': label,
'target_instance_name': target_instance_name,
'launch_template_id': launch_template_id,
- 'launch_template_version': launch_template_version,
'user_data_command': user_data_command
})
@@ -738,14 +715,12 @@ def _launch_ec2_instances(scale_up_slots, ec2_resource):
delayed(_launch_ec2_instance)(ec2_resource=ec2_resource,
label=job['label'],
target_instance_name=job['target_instance_name'],
launch_template_id=job['launch_template_id'],
-
launch_template_version=job['launch_template_version'],
user_data_command=job['user_data_command']) for job in jobs)
return [x for x in started_instance_names if x is not None]
-def _launch_ec2_instance(ec2_resource, label, target_instance_name,
launch_template_id, launch_template_version,
- user_data_command):
+def _launch_ec2_instance(ec2_resource, label, target_instance_name,
launch_template_id, user_data_command):
try:
ec2_resource.meta.client.run_instances(
DryRun=False,
@@ -753,7 +728,7 @@ def _launch_ec2_instance(ec2_resource, label,
target_instance_name, launch_templ
MinCount=1,
LaunchTemplate={
'LaunchTemplateId': launch_template_id,
- 'Version': launch_template_version
+ 'Version': '$Default'
},
TagSpecifications=[
{
@@ -1242,7 +1217,7 @@ def _get_slave_configuration():
return {
'ub18-c6g': {
'num_executors': _get_nb_executors_per_label()['ub18-c6g'], #
Number of executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu
18.04 on a c6g.16xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'ub18-c6g', # Space separated labels string
'exclusive': True, # Only run jobs assigned to it
@@ -1251,7 +1226,7 @@ def _get_slave_configuration():
},
'restricted-ub18-c6g': {
'num_executors':
_get_nb_executors_per_label()['restricted-ub18-c6g'], # Number of executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu
18.04 on a c6g.16xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'restricted-ub18-c6g', # Space separated labels string
'exclusive': True, # Only run jobs assigned to it
@@ -1269,7 +1244,7 @@ def _get_slave_configuration():
},
'restricted-mxnetlinux-cpu': {
'num_executors': _get_nb_executors_per_label()['mxnetlinux-cpu'],
# Number of executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu
16.04 on a c5.18xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'restricted-mxnetlinux-cpu', # Space separated labels
string
'exclusive': True, # Only run jobs assigned to it
@@ -1278,7 +1253,7 @@ def _get_slave_configuration():
},
'mxnetlinux-gpu': {
'num_executors': _get_nb_executors_per_label()['mxnetlinux-gpu'],
# Number of executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu
16.04 on a g3.8xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'mxnetlinux-gpu', # Space separated labels string
'exclusive': True, # Only run jobs assigned to it
@@ -1287,7 +1262,7 @@ def _get_slave_configuration():
},
'restricted-mxnetlinux-gpu': {
'num_executors':
_get_nb_executors_per_label()['restricted-mxnetlinux-gpu'], # Number of
executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu on a
GPU instance',
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'restricted-mxnetlinux-gpu', # Space separated labels
string
'exclusive': True, # Only run jobs assigned to it
@@ -1296,7 +1271,7 @@ def _get_slave_configuration():
},
'mxnetlinux-gpu-g4': {
'num_executors':
_get_nb_executors_per_label()['mxnetlinux-gpu-g4'], # Number of executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu
18.04 on a g4dn.4xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu on
g4',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'mxnetlinux-gpu-g4', # Space separated labels string
'exclusive': True, # Only run jobs assigned to it
@@ -1305,13 +1280,22 @@ def _get_slave_configuration():
},
'restricted-mxnetlinux-gpu-g4': {
'num_executors':
_get_nb_executors_per_label()['restricted-mxnetlinux-gpu-g4'], # Number of
executors
- 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu
18.04 on a g4dn.4xlarge',
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu on
g4',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'restricted-mxnetlinux-gpu-g4', # Space separated
labels string
'exclusive': True, # Only run jobs assigned to it
'tunnel': _get_jenkins_private_tunnel_address(),
'job_name_restriction_regex': '^restricted-(.*)' # Only run jobs
which start with restricted-
},
+ 'mxnetlinux-gpu-p3-8xlarge': {
+ 'num_executors':
_get_nb_executors_per_label()['mxnetlinux-gpu-p3-8xlarge'], # Number of
executors
+ 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu on
p3.8xlarge',
+ 'remote_fs': '/home/jenkins_slave', # Remote workspace location
+ 'labels': 'mxnetlinux-gpu-p3-8xlarge', # Space separated labels
string
+ 'exclusive': True, # Only run jobs assigned to it
+ 'tunnel': _get_jenkins_private_tunnel_address(),
+ 'job_name_restriction_regex': '^(?!restricted-).+' # Run only
unrestricted jobs
+ },
'mxnetwindows-cpu': {
'num_executors':
_get_nb_executors_per_label()['mxnetwindows-cpu'], # Number of executors
'node_description': '[AUTOSCALING] MXNet slave running Windows',
diff --git a/tools/ami-creator/create_ami.py b/tools/ami-creator/create_ami.py
index 578af75..71f8218 100755
--- a/tools/ami-creator/create_ami.py
+++ b/tools/ami-creator/create_ami.py
@@ -9,7 +9,8 @@ import base64, binascii, getpass, optparse, sys
from Crypto.PublicKey import RSA
from Crypto.Cipher import PKCS1_OAEP, PKCS1_v1_5
-ec2Client = boto3.resource('ec2')
+ec2Resource = boto3.resource('ec2')
+ec2Client = boto3.client('ec2')
def read_userdata(file):
logging.info("Reading userdata from file %s", file)
@@ -18,7 +19,7 @@ def read_userdata(file):
def create_instance(instance_type, disk_size, userdata_file, ami,
security_group, ssh_key):
logging.info("Creating instance type %s for image creation", instance_type)
- instances = ec2Client.create_instances(
+ instances = ec2Resource.create_instances(
BlockDeviceMappings=[
{
'DeviceName': '/dev/sda1',
@@ -74,7 +75,7 @@ def wait_for_instance(instance, private_key):
last_install_log_size = 0
while (current_state['Code'] != 80):
time.sleep(20)
- i = ec2Client.Instance(instance_id)
+ i = ec2Resource.Instance(instance_id)
if current_state['Code'] != i.state['Code']:
current_state = i.state
logging.info("Instance state changed to: %s",
current_state['Name'])
@@ -103,7 +104,7 @@ def wait_for_instance(instance, private_key):
logging.debug("Unable to retrieve userdata log via ssh,
does this windows system have sshd installed and running?")
continue
install_logfile = "log/install-{}.log".format(instance_id)
- ret =
subprocess.run(["scp","-q","-T","-o","StrictHostKeyChecking=no","-i",private_key,"administrator@{}:\"C:\\install.log\"".format(i.public_ip_address),install_logfile])
+ ret =
subprocess.run(["scp","-q","-T","-o","StrictHostKeyChecking=no","-o","ConnectTimeout=10","-i",private_key,"administrator@{}:\"C:\\install.log\"".format(i.public_ip_address),install_logfile])
if ret.returncode == 0:
if os.stat(install_logfile).st_size !=
last_install_log_size:
last_install_log_size =
os.stat(install_logfile).st_size
@@ -130,7 +131,7 @@ def wait_for_ami(image):
logging.info("Waiting for AMI to become available")
while (current_state != 'available'):
time.sleep(5)
- i = ec2Client.Image(ami_id)
+ i = ec2Resource.Image(ami_id)
if current_state != i.state:
current_state = i.state
logging.info("Image state changed to %s", current_state)
@@ -141,6 +142,51 @@ def terminate_instance(instance):
instance.terminate()
+def get_current_lt_version(lt_id):
+ logging.debug("Looking up current version for LT %s", lt_id)
+ response = ec2Client.describe_launch_template_versions(
+ LaunchTemplateId = lt_id,
+ Versions = [ '$Latest' ]
+ )
+ try:
+ latest_version = response['LaunchTemplateVersions'][0]['VersionNumber']
+ logging.debug("Found latest version %s of LT %s", latest_version,
lt_id)
+ return latest_version
+ except:
+ logging.error("Unable to get latest LT version for LT %s", lt_id)
+
+def set_default_lt_version(lt_id, version):
+ logging.debug("Setting the default version to %s for LT %s", version,
lt_id)
+ try:
+ response = ec2Client.modify_launch_template(
+ LaunchTemplateId = lt_id,
+ DefaultVersion = str(version)
+ )
+ except:
+ logging.error("Unable to set default LT version for LT %s", lt_id)
+ return False
+ return True
+
+
+def update_launch_template(lt_id, ami_id):
+ latest_version = get_current_lt_version(lt_id)
+ if not latest_version:
+ logging.error("Unable to get current LT version for LT %s, not
updating.", lt_id)
+ return None
+ logging.info("Updating Launch Template %s with new AMI %s", lt_id, ami_id)
+ response = ec2Client.create_launch_template_version(
+ LaunchTemplateId = lt_id,
+ SourceVersion = str(latest_version),
+ LaunchTemplateData = {
+ 'ImageId': ami_id
+ }
+ )
+ new_version = response['LaunchTemplateVersion']['VersionNumber']
+ logging.debug("Successfully created new LT %s version %s", lt_id,
new_version)
+ set_default_lt_version(lt_id, new_version)
+ return new_version
+
+
def main():
parser = OptionParser()
parser.add_option("-i", "--instance-type", dest="instance_type",
@@ -155,6 +201,8 @@ def main():
help="Security group ID for instance")
parser.add_option("-k", "--key-name", dest="ssh_key",
help="SSH key pair name to use")
+ parser.add_option("-l", "--launch-templates", dest="launch_templates",
+ help="Comma separated list of launch template IDs to update with newly
created AMI")
parser.add_option("-p", "--private-key", dest="private_key",
help="Private key used to SSH into instance or decrypt windows
password")
parser.add_option("-u", "--userdata", dest="userdata",
@@ -195,7 +243,6 @@ def main():
userdata = 'userdata/{}.txt'.format(options.name)
-
instance = create_instance(
instance_type=options.instance_type,
disk_size=options.disk_size,
@@ -209,6 +256,16 @@ def main():
wait_for_ami(image)
terminate_instance(instance)
+ lt_list = options.launch_templates.split(",")
+ if len(lt_list) > 0:
+ logging.info("Updating launch templates with new AMI")
+ for lt_id in lt_list:
+ new_version = update_launch_template(lt_id, image.id)
+ if new_version is None:
+ logging.error("Unable to update LT %s", lt_id)
+ else:
+ logging.info("Created new version %s of LT %s with new AMI",
new_version, lt_id)
+
main()
diff --git a/tools/ami-creator/scripts/run-auto-connect.bat
b/tools/ami-creator/scripts/run-auto-connect.bat
new file mode 100644
index 0000000..114757e
--- /dev/null
+++ b/tools/ami-creator/scripts/run-auto-connect.bat
@@ -0,0 +1,3 @@
+cd C:\
+
+C:\Python38\python.exe .\slave-autoconnect.py
--slave-name-file=C:\jenkins_slave\jenkins_slave_name.txt
--master-file=C:\jenkins_slave\jenkins_master_url.txt
--master-private-file=C:\jenkins_slave\jenkins_master_private_url.txt
diff --git a/tools/ami-creator/scripts/win2019_cuda114_installer.py
b/tools/ami-creator/scripts/win2019_cuda114_installer.py
index fcdd807..dc38823 100644
--- a/tools/ami-creator/scripts/win2019_cuda114_installer.py
+++ b/tools/ami-creator/scripts/win2019_cuda114_installer.py
@@ -224,6 +224,7 @@ def install_vs():
' --add Microsoft.VisualStudio.Component.Static.Analysis.Tools'
' --add Microsoft.VisualStudio.Component.VC.CMake.Project'
' --add Microsoft.VisualStudio.Component.VC.140'
+ ' --add Microsoft.VisualStudio.Component.VC.14.28.x86.x64'
' --add
Microsoft.VisualStudio.Component.Windows10SDK.18362.Desktop'
' --add Microsoft.VisualStudio.Component.Windows10SDK.18362.UWP'
' --add
Microsoft.VisualStudio.Component.Windows10SDK.18362.UWP.Native'
diff --git a/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
b/tools/ami-creator/userdata/mxnetlinux_cpu_aarch64_ubuntu_2004.txt
similarity index 80%
copy from tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
copy to tools/ami-creator/userdata/mxnetlinux_cpu_aarch64_ubuntu_2004.txt
index d802023..680839b 100644
--- a/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
+++ b/tools/ami-creator/userdata/mxnetlinux_cpu_aarch64_ubuntu_2004.txt
@@ -15,9 +15,6 @@ packages:
- python3-jenkins
- python3-joblib
- docker.io
- - qemu
- - qemu-user-static
- - binfmt-support
- awscli
- nfs-common
- libattr1-dev
@@ -31,10 +28,20 @@ write_files:
- path: /etc/cron.d/jenkins-start-slave
content: |
@reboot jenkins_slave /home/jenkins_slave/scripts/launch-autoconnect.sh
+ - path: /etc/cron.d/apt-update-on-startup
+ content: |
+ @reboot root /root/apt_update_startup.sh
+ - path: /root/apt_update_startup.sh
+ content: |
+ export DEBIAN_FRONTEND=noninteractive
+ apt update
+ apt upgrade -y
+ touch /tmp/apt.done
- path: /home/jenkins_slave/scripts/launch-autoconnect.sh
content: |
#!/bin/sh
set -ex
+ while [ ! -e /tmp/apt.done ]; do sleep 5; done
python3 /home/jenkins_slave/scripts/slave-autoconnect.py
--slave-name-file=/home/jenkins_slave/jenkins_slave_name
--master-file=/home/jenkins_slave/jenkins_master_url
--master-private-file=/home/jenkins_slave/jenkins_master_private_url >
/home/jenkins_slave/auto-connect.log
- path: /etc/fstab
content: |
@@ -58,13 +65,11 @@ runcmd:
- [ "wget", "-O", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"https://raw.githubusercontent.com/apache/incubator-mxnet-ci/master/tools/jenkins-slave-creation-unix/scripts/deploy/slave-autoconnect.py"
]
- [ "touch", "/home/jenkins_slave/auto-connect.log" ]
- [ "chown", "-R", "jenkins_slave:jenkins_slave", "/home/jenkins_slave" ]
- - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"/home/jenkins_slave/scripts/launch-autoconnect.sh" ]
- - [ "curl", "-L",
"https://github.com/docker/compose/releases/download/1.25.5/docker-compose-Linux-x86_64",
"-o", "/usr/bin/docker-compose" ]
+ - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"/home/jenkins_slave/scripts/launch-autoconnect.sh",
"/root/apt_update_startup.sh" ]
+ - [ "curl", "-L",
"https://github.com/docker/compose/releases/download/v2.2.2/docker-compose-linux-aarch64",
"-o", "/usr/bin/docker-compose" ]
- [ "chmod", "+x", "/usr/bin/docker-compose" ]
- - [ "wget", "-O", "/tmp/qemu-binfmt-conf.sh",
"https://raw.githubusercontent.com/qemu/qemu/stable-4.1/scripts/qemu-binfmt-conf.sh"
]
- - [ "chmod", "+x", "/tmp/qemu-binfmt-conf.sh" ]
- - [ "/tmp/qemu-binfmt-conf.sh", "--persistent", "yes", "--qemu-suffix",
"-static", "--qemu-path", "/usr/bin", "--systemd", "ALL" ]
- [ "rm", "-f", "/var/lib/cloud/instances/*/sem/config_scripts_user",
"/var/lib/cloud/instance/sem/config_scripts_user" ]
+ - [ "rm", "-f", "/tmp/apt.done" ]
- [ "sleep", "10" ]
- [ "halt", "-p" ]
diff --git a/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
b/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
index d802023..67a5954 100644
--- a/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
+++ b/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
@@ -31,10 +31,20 @@ write_files:
- path: /etc/cron.d/jenkins-start-slave
content: |
@reboot jenkins_slave /home/jenkins_slave/scripts/launch-autoconnect.sh
+ - path: /etc/cron.d/apt-update-on-startup
+ content: |
+ @reboot root /root/apt_update_startup.sh
+ - path: /root/apt_update_startup.sh
+ content: |
+ export DEBIAN_FRONTEND=noninteractive
+ apt update
+ apt upgrade -y
+ touch /tmp/apt.done
- path: /home/jenkins_slave/scripts/launch-autoconnect.sh
content: |
#!/bin/sh
set -ex
+ while [ ! -e /tmp/apt.done ]; do sleep 5; done
python3 /home/jenkins_slave/scripts/slave-autoconnect.py
--slave-name-file=/home/jenkins_slave/jenkins_slave_name
--master-file=/home/jenkins_slave/jenkins_master_url
--master-private-file=/home/jenkins_slave/jenkins_master_private_url >
/home/jenkins_slave/auto-connect.log
- path: /etc/fstab
content: |
@@ -58,13 +68,14 @@ runcmd:
- [ "wget", "-O", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"https://raw.githubusercontent.com/apache/incubator-mxnet-ci/master/tools/jenkins-slave-creation-unix/scripts/deploy/slave-autoconnect.py"
]
- [ "touch", "/home/jenkins_slave/auto-connect.log" ]
- [ "chown", "-R", "jenkins_slave:jenkins_slave", "/home/jenkins_slave" ]
- - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"/home/jenkins_slave/scripts/launch-autoconnect.sh" ]
+ - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"/home/jenkins_slave/scripts/launch-autoconnect.sh",
"/root/apt_update_startup.sh" ]
- [ "curl", "-L",
"https://github.com/docker/compose/releases/download/1.25.5/docker-compose-Linux-x86_64",
"-o", "/usr/bin/docker-compose" ]
- [ "chmod", "+x", "/usr/bin/docker-compose" ]
- [ "wget", "-O", "/tmp/qemu-binfmt-conf.sh",
"https://raw.githubusercontent.com/qemu/qemu/stable-4.1/scripts/qemu-binfmt-conf.sh"
]
- [ "chmod", "+x", "/tmp/qemu-binfmt-conf.sh" ]
- [ "/tmp/qemu-binfmt-conf.sh", "--persistent", "yes", "--qemu-suffix",
"-static", "--qemu-path", "/usr/bin", "--systemd", "ALL" ]
- [ "rm", "-f", "/var/lib/cloud/instances/*/sem/config_scripts_user",
"/var/lib/cloud/instance/sem/config_scripts_user" ]
+ - [ "rm", "-f", "/tmp/apt.done" ]
- [ "sleep", "10" ]
- [ "halt", "-p" ]
diff --git a/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
b/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
index d00fcae..a7560a3 100644
--- a/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
+++ b/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
@@ -40,10 +40,20 @@ write_files:
- path: /etc/cron.d/jenkins-start-slave
content: |
@reboot jenkins_slave /home/jenkins_slave/scripts/launch-autoconnect.sh
+ - path: /etc/cron.d/apt-update-on-startup
+ content: |
+ @reboot root /root/apt_update_startup.sh
+ - path: /root/apt_update_startup.sh
+ content: |
+ export DEBIAN_FRONTEND=noninteractive
+ apt update
+ apt upgrade -y
+ touch /tmp/apt.done
- path: /home/jenkins_slave/scripts/launch-autoconnect.sh
content: |
#!/bin/sh
set -ex
+ while [ ! -e /tmp/apt.done ]; do sleep 5; done
python3 /home/jenkins_slave/scripts/slave-autoconnect.py
--slave-name-file=/home/jenkins_slave/jenkins_slave_name
--master-file=/home/jenkins_slave/jenkins_master_url
--master-private-file=/home/jenkins_slave/jenkins_master_private_url >
/home/jenkins_slave/auto-connect.log
- path: /etc/fstab
content: |
@@ -68,13 +78,14 @@ runcmd:
- [ "wget", "-O", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"https://raw.githubusercontent.com/apache/incubator-mxnet-ci/master/tools/jenkins-slave-creation-unix/scripts/deploy/slave-autoconnect.py"
]
- [ "touch", "/home/jenkins_slave/auto-connect.log" ]
- [ "chown", "-R", "jenkins_slave:jenkins_slave", "/home/jenkins_slave" ]
- - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"/home/jenkins_slave/scripts/launch-autoconnect.sh" ]
+ - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py",
"/home/jenkins_slave/scripts/launch-autoconnect.sh",
"/root/apt_update_startup.sh" ]
- [ "usermod", "-aG", "docker", "jenkins_slave" ]
- [ "systemctl", "enable", "docker" ]
- [ "service", "docker", "restart" ]
- [ "curl", "-L",
"https://github.com/docker/compose/releases/download/1.25.5/docker-compose-Linux-x86_64",
"-o", "/usr/bin/docker-compose" ]
- [ "chmod", "+x", "/usr/bin/docker-compose" ]
- [ "rm", "-f", "/var/lib/cloud/instances/*/sem/config_scripts_user",
"/var/lib/cloud/instance/sem/config_scripts_user" ]
+ - [ "rm", "-f", "/tmp/apt.done" ]
- [ "sleep", "10" ]
- [ "halt", "-p" ]