KellenSunderland commented on a change in pull request #12276: [Don't merge][Review] A solution to prevent zombie containers locally and in CI
URL: https://github.com/apache/incubator-mxnet/pull/12276#discussion_r211866417
 
 

 ##########
 File path: ci/build.py
 ##########
 @@ -158,63 +246,142 @@ def default_ccache_dir() -> str:
         return ccache_dir
     return os.path.join(tempfile.gettempdir(), "ci_ccache")
 
+def trim_container_id(cid):
+    return cid[:12]
 
 def container_run(platform: str,
-                  docker_binary: str,
+                  nvidia_runtime: bool,
                   docker_registry: str,
                   shared_memory_size: str,
-                  local_ccache_dir: str,
                   command: List[str],
+                  local_ccache_dir: str,
+                  cleanup: Cleanup,
                   dry_run: bool = False,
-                  interactive: bool = False) -> str:
+                  interactive: bool = False) -> int:
+    CONTAINER_WAIT_S = 600
+    #
+    # Environment setup
+    #
+    environment = {
+        'CCACHE_MAXSIZE': '500G',
+        'CCACHE_TEMPDIR': '/tmp/ccache',  # temp dir should be local and not 
shared
+        'CCACHE_DIR': '/work/ccache',  # this path is inside the container as 
/work/ccache is mounted
+        'CCACHE_LOGFILE': '/tmp/ccache.log',  # a container-scoped log, useful 
for ccache verification.
+    }
+    # These variables are passed to the container to the process tree killer 
can find runaway process inside the container
+    # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller
+    # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393
+    #
+    JENKINS_ENV_VARS = ['BUILD_NUMBER', 'BUILD_ID', 'BUILD_TAG']
+    environment.update({k: os.environ[k] for k in JENKINS_ENV_VARS if k in 
os.environ})
+    environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in 
os.environ})
+
     tag = get_docker_tag(platform=platform, registry=docker_registry)
     mx_root = get_mxnet_root()
     local_build_folder = buildir()
     # We need to create it first, otherwise it will be created by the docker 
daemon with root only permissions
     os.makedirs(local_build_folder, exist_ok=True)
     os.makedirs(local_ccache_dir, exist_ok=True)
     logging.info("Using ccache directory: %s", local_ccache_dir)
-    runlist = [docker_binary, 'run', '--rm', '-t',
+    docker_client = docker.from_env()
+    # Equivalent command
+    docker_cmd_list = [get_docker_binary(nvidia_runtime), 'run',
+               '--rm',
                '--shm-size={}'.format(shared_memory_size),
                '-v', "{}:/work/mxnet".format(mx_root),  # mount mxnet root
                '-v', "{}:/work/build".format(local_build_folder),  # mount 
mxnet/build for storing build artifacts
                '-v', "{}:/work/ccache".format(local_ccache_dir),
                '-u', '{}:{}'.format(os.getuid(), os.getgid()),
-               '-e', 'CCACHE_MAXSIZE={}'.format(CCACHE_MAXSIZE),
                '-e', 'CCACHE_TEMPDIR=/tmp/ccache',  # temp dir should be local 
and not shared
                '-e', "CCACHE_DIR=/work/ccache",  # this path is inside the 
container as /work/ccache is mounted
                '-e', "CCACHE_LOGFILE=/tmp/ccache.log",  # a container-scoped 
log, useful for ccache verification.
+               '-ti',
                tag]
-    runlist.extend(command)
-    cmd = '\\\n\t'.join(runlist)
-    ret = 0
-    if not dry_run and not interactive:
-        logging.info("Running %s in container %s", command, tag)
-        logging.info("Executing:\n%s\n", cmd)
-        ret = call(runlist)
-
-    docker_run_cmd = ' '.join(runlist)
-    if not dry_run and interactive:
-        into_cmd = deepcopy(runlist)
-        # -ti can't be after the tag, as is interpreted as a command so hook 
it up after the -u argument
-        idx = into_cmd.index('-u') + 2
-        into_cmd[idx:idx] = ['-ti']
-        cmd = '\\\n\t'.join(into_cmd)
-        logging.info("Executing:\n%s\n", cmd)
-        docker_run_cmd = ' '.join(into_cmd)
-        ret = call(into_cmd)
-
-    if not dry_run and not interactive and ret != 0:
-        logging.error("Running of command in container failed (%s):\n%s\n", 
ret, cmd)
-        logging.error("You can get into the container by adding the -i option")
-        raise subprocess.CalledProcessError(ret, cmd)
-
-    return docker_run_cmd
+    docker_cmd_list.extend(command)
+    docker_cmd = ' \\\n\t'.join(docker_cmd_list)
+    logging.info("Running %s in container %s", command, tag)
+    logging.info("Executing the equivalent of:\n%s\n", docker_cmd)
+    ret = 0 # return code of the command inside docker
+    if not dry_run:
+
+
+        #############################
+        #
+        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, 
signal.SIGTERM})
+        if nvidia_runtime:
+            runtime='nvidia'
+        else:
+            # runc is default (docker info | grep -i runtime)
+            runtime=None
+
+        container = docker_client.containers.run(
+            tag,
+            runtime=runtime,
+            detach=True,
+            command=command,
+            #auto_remove=True,
+            shm_size=shared_memory_size,
+            user='{}:{}'.format(os.getuid(), os.getgid()),
+            volumes={
+                mx_root:
+                    {'bind': '/work/mxnet', 'mode': 'rw'},
+                local_build_folder:
+                    {'bind': '/work/build', 'mode': 'rw'},
+                local_ccache_dir:
+                    {'bind': '/work/ccache', 'mode': 'rw'},
+            },
+            environment=environment)
+        logging.info("Started container: %s", trim_container_id(container.id))
+        # Race condition:
+        # If the previous call is interrupted then it's possible that the 
container is not cleaned up
+        # We avoid by masking the signals temporarily
+        cleanup.add_container(container)
+        signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, 
signal.SIGTERM})
+        #
+        #############################
+
+        stream = container.logs(stream=True, stdout=True, stderr=True)
+        sys.stdout.flush()
+        for chunk in stream:
 
 Review comment:
   Would it be possible to put this in a helper, and describe why it's needed?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to