This is an automated email from the ASF dual-hosted git repository.

jevans pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet-ci.git


The following commit(s) were added to refs/heads/master by this push:
     new 1a1e89c  Updates to AMI creation utility and autoscaling lambda (#42)
1a1e89c is described below

commit 1a1e89c3eec7fcbb392e14f9b92369801a29eb9c
Author: Joe Evans <[email protected]>
AuthorDate: Fri Jan 21 11:43:56 2022 -0800

    Updates to AMI creation utility and autoscaling lambda (#42)
    
    * Update lambda autoscaler to remove Windows hourly-billing workarounds, 
add Jenkins' new name for the master node, change launch parameters to use the 
default launch template version instead of specifying it via environment 
variables, and update descriptions of worker nodes.
    
    * Add aarch64 userdata for building graviton jenkins workers.
    
    * Update templates to perform apt upgrade on boot-up, before running 
jenkins worker script to prevent auto-updates from killing running CI pipelines.
    
    * Update windows installer script to install the correct VC runtime for 
compiling MXNet.
    
    * Update create AMI util with the ability to update launch templates (and 
set the default version), if passed.
---
 .../lambda_mxnet_ci/autoscaling/handler.py         | 54 ++++++-----------
 tools/ami-creator/create_ami.py                    | 69 ++++++++++++++++++++--
 tools/ami-creator/scripts/run-auto-connect.bat     |  3 +
 .../scripts/win2019_cuda114_installer.py           |  1 +
 ....txt => mxnetlinux_cpu_aarch64_ubuntu_2004.txt} | 21 ++++---
 .../userdata/mxnetlinux_cpu_ubuntu_2004.txt        | 13 +++-
 .../userdata/mxnetlinux_gpu_ubuntu_2004.txt        | 13 +++-
 7 files changed, 123 insertions(+), 51 deletions(-)

diff --git 
a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py 
b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
index 2f8cdc3..3cce28a 100755
--- a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
+++ b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
@@ -59,9 +59,6 @@ RE_NO_AVAILABLE_NODES = r"(^Waiting for next available 
executor$)"
 # Offline cause for nodes that have been taken offline by the jenkins 
monitoring
 NODE_MONITOR_OFFLINE_CAUSE = 'hudson.node_monitors'
 
-# Since windows got hourly billing, we only want to consider instances which 
are running close to the full hour
-WINDOWS_MIN_PARTIAL_RUNTIME_SECONDS = 55 * 60
-
 # EC2s API only allows a specific number of filters. This constant defines the 
chunk size for these requests
 EC2_FILTER_CHUNK_SIZE = 40
 
@@ -279,24 +276,6 @@ def determine_scale_down_nodes(nodes_data: List[Dict[str, 
Any]], instance_uptime
                              display_name)
                 continue
 
-            # Windows instances are getting billed hourly and need special 
handling
-            if 'Windows' in 
node_data['monitorData']['hudson.node_monitors.ArchitectureMonitor']:
-                if display_name not in instance_uptime:
-                    logging.error('Unable to find uptime for %s', display_name)
-                    continue
-
-                running_duration_seconds = instance_uptime[display_name]
-                running_duration_partial = running_duration_seconds % (60 * 60)
-                # Don't shutdown instances below XXh50min uptime to make use 
of hourly billing
-                if running_duration_partial < 
WINDOWS_MIN_PARTIAL_RUNTIME_SECONDS:
-                    considered_nodes[label].append(node_data)
-                    logging.debug(
-                        'Ignoring %s because partial runtime %ds is below 
limit of %ds (hourly billing). Total '
-                        'runtime: %ds',
-                        display_name, running_duration_partial, 
WINDOWS_MIN_PARTIAL_RUNTIME_SECONDS,
-                        running_duration_seconds)
-                    continue
-
             # TODO: Check for how long an instance has been idling. There is 
no built-in API for now and the
             # only way is to go through the entire Jenkins build history. Save 
this up for later.
 
@@ -349,7 +328,7 @@ def _determine_faulty_nodes(nodes: List[Dict[str, Any]], 
unconnected_instances:
                 label2faulty_nodes[label].append(node)
 
     for node in nodes:
-        if node['displayName'] == 'master':
+        if node['displayName'] in ['master', 'Built-In Node']:
             # Don't do anything for master
             continue
 
@@ -716,7 +695,6 @@ def _launch_ec2_instances(scale_up_slots, ec2_resource):
 
         launch_template = launch_templates[label]
         launch_template_id = launch_template['id']
-        launch_template_version = launch_template['version']
 
         for target_instance_name in target_instance_names:
             logging.debug('Launching instance %s of type %s', 
target_instance_name, label)
@@ -730,7 +708,6 @@ def _launch_ec2_instances(scale_up_slots, ec2_resource):
                     'label': label,
                     'target_instance_name': target_instance_name,
                     'launch_template_id': launch_template_id,
-                    'launch_template_version': launch_template_version,
                     'user_data_command': user_data_command
                 })
 
@@ -738,14 +715,12 @@ def _launch_ec2_instances(scale_up_slots, ec2_resource):
         delayed(_launch_ec2_instance)(ec2_resource=ec2_resource, 
label=job['label'],
                                       
target_instance_name=job['target_instance_name'],
                                       
launch_template_id=job['launch_template_id'],
-                                      
launch_template_version=job['launch_template_version'],
                                       
user_data_command=job['user_data_command']) for job in jobs)
 
     return [x for x in started_instance_names if x is not None]
 
 
-def _launch_ec2_instance(ec2_resource, label, target_instance_name, 
launch_template_id, launch_template_version,
-                         user_data_command):
+def _launch_ec2_instance(ec2_resource, label, target_instance_name, 
launch_template_id, user_data_command):
     try:
         ec2_resource.meta.client.run_instances(
             DryRun=False,
@@ -753,7 +728,7 @@ def _launch_ec2_instance(ec2_resource, label, 
target_instance_name, launch_templ
             MinCount=1,
             LaunchTemplate={
                 'LaunchTemplateId': launch_template_id,
-                'Version': launch_template_version
+                'Version': '$Default'
             },
             TagSpecifications=[
                 {
@@ -1242,7 +1217,7 @@ def _get_slave_configuration():
     return {
         'ub18-c6g': {
             'num_executors': _get_nb_executors_per_label()['ub18-c6g'],  # 
Number of executors
-            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 
18.04 on a c6g.16xlarge',
+            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
             'remote_fs': '/home/jenkins_slave',  # Remote workspace location
             'labels': 'ub18-c6g',  # Space separated labels string
             'exclusive': True,  # Only run jobs assigned to it
@@ -1251,7 +1226,7 @@ def _get_slave_configuration():
         },
         'restricted-ub18-c6g': {
             'num_executors': 
_get_nb_executors_per_label()['restricted-ub18-c6g'],  # Number of executors
-            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 
18.04 on a c6g.16xlarge',
+            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
             'remote_fs': '/home/jenkins_slave',  # Remote workspace location
             'labels': 'restricted-ub18-c6g',  # Space separated labels string
             'exclusive': True,  # Only run jobs assigned to it
@@ -1269,7 +1244,7 @@ def _get_slave_configuration():
         },
         'restricted-mxnetlinux-cpu': {
             'num_executors': _get_nb_executors_per_label()['mxnetlinux-cpu'],  
# Number of executors
-            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 
16.04 on a c5.18xlarge',
+            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
             'remote_fs': '/home/jenkins_slave',  # Remote workspace location
             'labels': 'restricted-mxnetlinux-cpu',  # Space separated labels 
string
             'exclusive': True,  # Only run jobs assigned to it
@@ -1278,7 +1253,7 @@ def _get_slave_configuration():
         },
         'mxnetlinux-gpu': {
             'num_executors': _get_nb_executors_per_label()['mxnetlinux-gpu'],  
# Number of executors
-            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 
16.04 on a g3.8xlarge',
+            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
             'remote_fs': '/home/jenkins_slave',  # Remote workspace location
             'labels': 'mxnetlinux-gpu',  # Space separated labels string
             'exclusive': True,  # Only run jobs assigned to it
@@ -1287,7 +1262,7 @@ def _get_slave_configuration():
         },
         'restricted-mxnetlinux-gpu': {
             'num_executors': 
_get_nb_executors_per_label()['restricted-mxnetlinux-gpu'],  # Number of 
executors
-            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu on a 
GPU instance',
+            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu',
             'remote_fs': '/home/jenkins_slave',  # Remote workspace location
             'labels': 'restricted-mxnetlinux-gpu',  # Space separated labels 
string
             'exclusive': True,  # Only run jobs assigned to it
@@ -1296,7 +1271,7 @@ def _get_slave_configuration():
         },
         'mxnetlinux-gpu-g4': {
             'num_executors': 
_get_nb_executors_per_label()['mxnetlinux-gpu-g4'],  # Number of executors
-            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 
18.04 on a g4dn.4xlarge',
+            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu on 
g4',
             'remote_fs': '/home/jenkins_slave',  # Remote workspace location
             'labels': 'mxnetlinux-gpu-g4',  # Space separated labels string
             'exclusive': True,  # Only run jobs assigned to it
@@ -1305,13 +1280,22 @@ def _get_slave_configuration():
         },
         'restricted-mxnetlinux-gpu-g4': {
             'num_executors': 
_get_nb_executors_per_label()['restricted-mxnetlinux-gpu-g4'],  # Number of 
executors
-            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 
18.04 on a g4dn.4xlarge',
+            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu on 
g4',
             'remote_fs': '/home/jenkins_slave',  # Remote workspace location
             'labels': 'restricted-mxnetlinux-gpu-g4',  # Space separated 
labels string
             'exclusive': True,  # Only run jobs assigned to it
             'tunnel': _get_jenkins_private_tunnel_address(),
             'job_name_restriction_regex': '^restricted-(.*)'  # Only run jobs 
which start with restricted-
         },
+        'mxnetlinux-gpu-p3-8xlarge': {
+            'num_executors': 
_get_nb_executors_per_label()['mxnetlinux-gpu-p3-8xlarge'],  # Number of 
executors
+            'node_description': '[AUTOSCALING] MXNet slave running Ubuntu on 
p3.8xlarge',
+            'remote_fs': '/home/jenkins_slave',  # Remote workspace location
+            'labels': 'mxnetlinux-gpu-p3-8xlarge',  # Space separated labels 
string
+            'exclusive': True,  # Only run jobs assigned to it
+            'tunnel': _get_jenkins_private_tunnel_address(),
+            'job_name_restriction_regex': '^(?!restricted-).+'  # Run only 
unrestricted jobs
+        },
         'mxnetwindows-cpu': {
             'num_executors': 
_get_nb_executors_per_label()['mxnetwindows-cpu'],  # Number of executors
             'node_description': '[AUTOSCALING] MXNet slave running Windows',
diff --git a/tools/ami-creator/create_ami.py b/tools/ami-creator/create_ami.py
index 578af75..71f8218 100755
--- a/tools/ami-creator/create_ami.py
+++ b/tools/ami-creator/create_ami.py
@@ -9,7 +9,8 @@ import base64, binascii, getpass, optparse, sys
 from Crypto.PublicKey import RSA
 from Crypto.Cipher import PKCS1_OAEP, PKCS1_v1_5
 
-ec2Client = boto3.resource('ec2')
+ec2Resource = boto3.resource('ec2')
+ec2Client = boto3.client('ec2')
 
 def read_userdata(file):
     logging.info("Reading userdata from file %s", file)
@@ -18,7 +19,7 @@ def read_userdata(file):
 
 def create_instance(instance_type, disk_size, userdata_file, ami, 
security_group, ssh_key):
     logging.info("Creating instance type %s for image creation", instance_type)
-    instances = ec2Client.create_instances(
+    instances = ec2Resource.create_instances(
         BlockDeviceMappings=[
             {
                 'DeviceName': '/dev/sda1',
@@ -74,7 +75,7 @@ def wait_for_instance(instance, private_key):
     last_install_log_size = 0
     while (current_state['Code'] != 80):
         time.sleep(20)
-        i = ec2Client.Instance(instance_id)
+        i = ec2Resource.Instance(instance_id)
         if current_state['Code'] != i.state['Code']:
             current_state = i.state
             logging.info("Instance state changed to: %s", 
current_state['Name'])
@@ -103,7 +104,7 @@ def wait_for_instance(instance, private_key):
                     logging.debug("Unable to retrieve userdata log via ssh, 
does this windows system have sshd installed and running?")
                     continue
                 install_logfile = "log/install-{}.log".format(instance_id)
-                ret = 
subprocess.run(["scp","-q","-T","-o","StrictHostKeyChecking=no","-i",private_key,"administrator@{}:\"C:\\install.log\"".format(i.public_ip_address),install_logfile])
+                ret = 
subprocess.run(["scp","-q","-T","-o","StrictHostKeyChecking=no","-o","ConnectTimeout=10","-i",private_key,"administrator@{}:\"C:\\install.log\"".format(i.public_ip_address),install_logfile])
                 if ret.returncode == 0:
                     if os.stat(install_logfile).st_size != 
last_install_log_size:
                         last_install_log_size = 
os.stat(install_logfile).st_size
@@ -130,7 +131,7 @@ def wait_for_ami(image):
     logging.info("Waiting for AMI to become available")
     while (current_state != 'available'):
         time.sleep(5)
-        i = ec2Client.Image(ami_id)
+        i = ec2Resource.Image(ami_id)
         if current_state != i.state:
             current_state = i.state
             logging.info("Image state changed to %s", current_state)
@@ -141,6 +142,51 @@ def terminate_instance(instance):
     instance.terminate()
 
 
+def get_current_lt_version(lt_id):
+    logging.debug("Looking up current version for LT %s", lt_id)
+    response = ec2Client.describe_launch_template_versions(
+        LaunchTemplateId = lt_id,
+        Versions = [ '$Latest' ]
+    )
+    try:
+        latest_version = response['LaunchTemplateVersions'][0]['VersionNumber']
+        logging.debug("Found latest version %s of LT %s", latest_version, 
lt_id)
+        return latest_version
+    except:
+        logging.error("Unable to get latest LT version for LT %s", lt_id)
+
+def set_default_lt_version(lt_id, version):
+    logging.debug("Setting the default version to %s for LT %s", version, 
lt_id)
+    try:
+        response = ec2Client.modify_launch_template(
+            LaunchTemplateId = lt_id,
+            DefaultVersion = str(version)
+        )
+    except:
+        logging.error("Unable to set default LT version for LT %s", lt_id)
+        return False
+    return True
+
+
+def update_launch_template(lt_id, ami_id):
+    latest_version = get_current_lt_version(lt_id)
+    if not latest_version:
+        logging.error("Unable to get current LT version for LT %s, not 
updating.", lt_id)
+        return None
+    logging.info("Updating Launch Template %s with new AMI %s", lt_id, ami_id)
+    response = ec2Client.create_launch_template_version(
+        LaunchTemplateId = lt_id,
+        SourceVersion = str(latest_version),
+        LaunchTemplateData = {
+            'ImageId': ami_id
+        }
+    )
+    new_version = response['LaunchTemplateVersion']['VersionNumber']
+    logging.debug("Successfully created new LT %s version %s", lt_id, 
new_version)
+    set_default_lt_version(lt_id, new_version)
+    return new_version
+
+
 def main():
     parser = OptionParser()
     parser.add_option("-i", "--instance-type", dest="instance_type",
@@ -155,6 +201,8 @@ def main():
         help="Security group ID for instance")
     parser.add_option("-k", "--key-name", dest="ssh_key",
         help="SSH key pair name to use")
+    parser.add_option("-l", "--launch-templates", dest="launch_templates",
+        help="Comma separated list of launch template IDs to update with newly 
created AMI")
     parser.add_option("-p", "--private-key", dest="private_key",
         help="Private key used to SSH into instance or decrypt windows 
password")
     parser.add_option("-u", "--userdata", dest="userdata",
@@ -195,7 +243,6 @@ def main():
         userdata = 'userdata/{}.txt'.format(options.name)
 
 
-
     instance = create_instance(
         instance_type=options.instance_type,
         disk_size=options.disk_size,
@@ -209,6 +256,16 @@ def main():
     wait_for_ami(image)
     terminate_instance(instance)
 
+    lt_list = options.launch_templates.split(",")
+    if len(lt_list) > 0:
+        logging.info("Updating launch templates with new AMI")
+        for lt_id in lt_list:
+            new_version = update_launch_template(lt_id, image.id)
+            if new_version is None:
+                logging.error("Unable to update LT %s", lt_id)
+            else:
+                logging.info("Created new version %s of LT %s with new AMI", 
new_version, lt_id)
+
 
 main()
 
diff --git a/tools/ami-creator/scripts/run-auto-connect.bat 
b/tools/ami-creator/scripts/run-auto-connect.bat
new file mode 100644
index 0000000..114757e
--- /dev/null
+++ b/tools/ami-creator/scripts/run-auto-connect.bat
@@ -0,0 +1,3 @@
+cd C:\
+
+C:\Python38\python.exe .\slave-autoconnect.py 
--slave-name-file=C:\jenkins_slave\jenkins_slave_name.txt 
--master-file=C:\jenkins_slave\jenkins_master_url.txt 
--master-private-file=C:\jenkins_slave\jenkins_master_private_url.txt
diff --git a/tools/ami-creator/scripts/win2019_cuda114_installer.py 
b/tools/ami-creator/scripts/win2019_cuda114_installer.py
index fcdd807..dc38823 100644
--- a/tools/ami-creator/scripts/win2019_cuda114_installer.py
+++ b/tools/ami-creator/scripts/win2019_cuda114_installer.py
@@ -224,6 +224,7 @@ def install_vs():
                ' --add Microsoft.VisualStudio.Component.Static.Analysis.Tools'
                ' --add Microsoft.VisualStudio.Component.VC.CMake.Project'
                ' --add Microsoft.VisualStudio.Component.VC.140'
+               ' --add Microsoft.VisualStudio.Component.VC.14.28.x86.x64'
                ' --add 
Microsoft.VisualStudio.Component.Windows10SDK.18362.Desktop'
                ' --add Microsoft.VisualStudio.Component.Windows10SDK.18362.UWP'
                ' --add 
Microsoft.VisualStudio.Component.Windows10SDK.18362.UWP.Native'
diff --git a/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt 
b/tools/ami-creator/userdata/mxnetlinux_cpu_aarch64_ubuntu_2004.txt
similarity index 80%
copy from tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
copy to tools/ami-creator/userdata/mxnetlinux_cpu_aarch64_ubuntu_2004.txt
index d802023..680839b 100644
--- a/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
+++ b/tools/ami-creator/userdata/mxnetlinux_cpu_aarch64_ubuntu_2004.txt
@@ -15,9 +15,6 @@ packages:
   - python3-jenkins
   - python3-joblib
   - docker.io
-  - qemu
-  - qemu-user-static
-  - binfmt-support
   - awscli
   - nfs-common
   - libattr1-dev
@@ -31,10 +28,20 @@ write_files:
   - path: /etc/cron.d/jenkins-start-slave
     content: |
       @reboot jenkins_slave /home/jenkins_slave/scripts/launch-autoconnect.sh
+  - path: /etc/cron.d/apt-update-on-startup
+    content: |
+      @reboot root /root/apt_update_startup.sh
+  - path: /root/apt_update_startup.sh
+    content: |
+      export DEBIAN_FRONTEND=noninteractive
+      apt update
+      apt upgrade -y
+      touch /tmp/apt.done
   - path: /home/jenkins_slave/scripts/launch-autoconnect.sh
     content: |
       #!/bin/sh
       set -ex
+      while [ ! -e /tmp/apt.done ]; do sleep 5; done
       python3 /home/jenkins_slave/scripts/slave-autoconnect.py 
--slave-name-file=/home/jenkins_slave/jenkins_slave_name 
--master-file=/home/jenkins_slave/jenkins_master_url 
--master-private-file=/home/jenkins_slave/jenkins_master_private_url > 
/home/jenkins_slave/auto-connect.log
   - path: /etc/fstab
     content: |
@@ -58,13 +65,11 @@ runcmd:
   - [ "wget", "-O", "/home/jenkins_slave/scripts/slave-autoconnect.py", 
"https://raw.githubusercontent.com/apache/incubator-mxnet-ci/master/tools/jenkins-slave-creation-unix/scripts/deploy/slave-autoconnect.py"
 ]
   - [ "touch", "/home/jenkins_slave/auto-connect.log" ]
   - [ "chown", "-R", "jenkins_slave:jenkins_slave", "/home/jenkins_slave" ]
-  - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py", 
"/home/jenkins_slave/scripts/launch-autoconnect.sh" ]
-  - [ "curl", "-L", 
"https://github.com/docker/compose/releases/download/1.25.5/docker-compose-Linux-x86_64",
 "-o", "/usr/bin/docker-compose" ]
+  - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py", 
"/home/jenkins_slave/scripts/launch-autoconnect.sh", 
"/root/apt_update_startup.sh" ]
+  - [ "curl", "-L", 
"https://github.com/docker/compose/releases/download/v2.2.2/docker-compose-linux-aarch64",
 "-o", "/usr/bin/docker-compose" ]
   - [ "chmod", "+x", "/usr/bin/docker-compose" ]
-  - [ "wget", "-O", "/tmp/qemu-binfmt-conf.sh", 
"https://raw.githubusercontent.com/qemu/qemu/stable-4.1/scripts/qemu-binfmt-conf.sh"
 ]
-  - [ "chmod", "+x", "/tmp/qemu-binfmt-conf.sh" ]
-  - [ "/tmp/qemu-binfmt-conf.sh", "--persistent", "yes", "--qemu-suffix", 
"-static", "--qemu-path", "/usr/bin", "--systemd", "ALL" ]
   - [ "rm", "-f", "/var/lib/cloud/instances/*/sem/config_scripts_user", 
"/var/lib/cloud/instance/sem/config_scripts_user" ]
+  - [ "rm", "-f", "/tmp/apt.done" ]
   - [ "sleep", "10" ]
   - [ "halt", "-p" ]
 
diff --git a/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt 
b/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
index d802023..67a5954 100644
--- a/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
+++ b/tools/ami-creator/userdata/mxnetlinux_cpu_ubuntu_2004.txt
@@ -31,10 +31,20 @@ write_files:
   - path: /etc/cron.d/jenkins-start-slave
     content: |
       @reboot jenkins_slave /home/jenkins_slave/scripts/launch-autoconnect.sh
+  - path: /etc/cron.d/apt-update-on-startup
+    content: |
+      @reboot root /root/apt_update_startup.sh
+  - path: /root/apt_update_startup.sh
+    content: |
+      export DEBIAN_FRONTEND=noninteractive
+      apt update
+      apt upgrade -y
+      touch /tmp/apt.done
   - path: /home/jenkins_slave/scripts/launch-autoconnect.sh
     content: |
       #!/bin/sh
       set -ex
+      while [ ! -e /tmp/apt.done ]; do sleep 5; done
       python3 /home/jenkins_slave/scripts/slave-autoconnect.py 
--slave-name-file=/home/jenkins_slave/jenkins_slave_name 
--master-file=/home/jenkins_slave/jenkins_master_url 
--master-private-file=/home/jenkins_slave/jenkins_master_private_url > 
/home/jenkins_slave/auto-connect.log
   - path: /etc/fstab
     content: |
@@ -58,13 +68,14 @@ runcmd:
   - [ "wget", "-O", "/home/jenkins_slave/scripts/slave-autoconnect.py", 
"https://raw.githubusercontent.com/apache/incubator-mxnet-ci/master/tools/jenkins-slave-creation-unix/scripts/deploy/slave-autoconnect.py"
 ]
   - [ "touch", "/home/jenkins_slave/auto-connect.log" ]
   - [ "chown", "-R", "jenkins_slave:jenkins_slave", "/home/jenkins_slave" ]
-  - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py", 
"/home/jenkins_slave/scripts/launch-autoconnect.sh" ]
+  - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py", 
"/home/jenkins_slave/scripts/launch-autoconnect.sh", 
"/root/apt_update_startup.sh" ]
   - [ "curl", "-L", 
"https://github.com/docker/compose/releases/download/1.25.5/docker-compose-Linux-x86_64",
 "-o", "/usr/bin/docker-compose" ]
   - [ "chmod", "+x", "/usr/bin/docker-compose" ]
   - [ "wget", "-O", "/tmp/qemu-binfmt-conf.sh", 
"https://raw.githubusercontent.com/qemu/qemu/stable-4.1/scripts/qemu-binfmt-conf.sh"
 ]
   - [ "chmod", "+x", "/tmp/qemu-binfmt-conf.sh" ]
   - [ "/tmp/qemu-binfmt-conf.sh", "--persistent", "yes", "--qemu-suffix", 
"-static", "--qemu-path", "/usr/bin", "--systemd", "ALL" ]
   - [ "rm", "-f", "/var/lib/cloud/instances/*/sem/config_scripts_user", 
"/var/lib/cloud/instance/sem/config_scripts_user" ]
+  - [ "rm", "-f", "/tmp/apt.done" ]
   - [ "sleep", "10" ]
   - [ "halt", "-p" ]
 
diff --git a/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt 
b/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
index d00fcae..a7560a3 100644
--- a/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
+++ b/tools/ami-creator/userdata/mxnetlinux_gpu_ubuntu_2004.txt
@@ -40,10 +40,20 @@ write_files:
   - path: /etc/cron.d/jenkins-start-slave
     content: |
       @reboot jenkins_slave /home/jenkins_slave/scripts/launch-autoconnect.sh
+  - path: /etc/cron.d/apt-update-on-startup
+    content: |
+      @reboot root /root/apt_update_startup.sh
+  - path: /root/apt_update_startup.sh
+    content: |
+      export DEBIAN_FRONTEND=noninteractive
+      apt update
+      apt upgrade -y
+      touch /tmp/apt.done
   - path: /home/jenkins_slave/scripts/launch-autoconnect.sh
     content: |
       #!/bin/sh
       set -ex
+      while [ ! -e /tmp/apt.done ]; do sleep 5; done
       python3 /home/jenkins_slave/scripts/slave-autoconnect.py 
--slave-name-file=/home/jenkins_slave/jenkins_slave_name 
--master-file=/home/jenkins_slave/jenkins_master_url 
--master-private-file=/home/jenkins_slave/jenkins_master_private_url > 
/home/jenkins_slave/auto-connect.log
   - path: /etc/fstab
     content: |
@@ -68,13 +78,14 @@ runcmd:
   - [ "wget", "-O", "/home/jenkins_slave/scripts/slave-autoconnect.py", 
"https://raw.githubusercontent.com/apache/incubator-mxnet-ci/master/tools/jenkins-slave-creation-unix/scripts/deploy/slave-autoconnect.py"
 ]
   - [ "touch", "/home/jenkins_slave/auto-connect.log" ]
   - [ "chown", "-R", "jenkins_slave:jenkins_slave", "/home/jenkins_slave" ]
-  - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py", 
"/home/jenkins_slave/scripts/launch-autoconnect.sh" ]
+  - [ "chmod", "+x", "/home/jenkins_slave/scripts/slave-autoconnect.py", 
"/home/jenkins_slave/scripts/launch-autoconnect.sh", 
"/root/apt_update_startup.sh" ]
   - [ "usermod", "-aG", "docker", "jenkins_slave" ]
   - [ "systemctl", "enable", "docker" ]
   - [ "service", "docker", "restart" ]
   - [ "curl", "-L", 
"https://github.com/docker/compose/releases/download/1.25.5/docker-compose-Linux-x86_64",
 "-o", "/usr/bin/docker-compose" ]
   - [ "chmod", "+x", "/usr/bin/docker-compose" ]
   - [ "rm", "-f", "/var/lib/cloud/instances/*/sem/config_scripts_user", 
"/var/lib/cloud/instance/sem/config_scripts_user" ]
+  - [ "rm", "-f", "/tmp/apt.done" ]
   - [ "sleep", "10" ]
   - [ "halt", "-p" ]
 

Reply via email to