marcoabreu closed pull request #13092: [MXNET-1193] Fix Docker Cleanup Race Condition
URL: https://github.com/apache/incubator-mxnet/pull/13092
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/ci/build.py b/ci/build.py
index e2554d9b8ce..8f3fe2d1244 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -281,7 +281,6 @@ def container_run(platform: str,
             # noinspection PyShadowingNames
             # runc is default (docker info | grep -i runtime)
             runtime = 'nvidia'
-
         container = docker_client.containers.run(
             tag,
             runtime=runtime,
@@ -299,52 +298,55 @@ def container_run(platform: str,
                     {'bind': '/work/ccache', 'mode': 'rw'},
             },
             environment=environment)
-        logging.info("Started container: %s", trim_container_id(container.id))
-        # Race condition:
-        # If the previous call is interrupted then it's possible that the 
container is not cleaned up
-        # We avoid by masking the signals temporarily
-        cleanup.add_container(container)
-        signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, 
signal.SIGTERM})
-        #
-        #############################
-
-        stream = container.logs(stream=True, stdout=True, stderr=True)
-        sys.stdout.flush()
-        for chunk in stream:
-            sys.stdout.buffer.write(chunk)
-            sys.stdout.buffer.flush()
-        sys.stdout.flush()
-        stream.close()
-        try:
-            logging.info("Waiting for status of container %s for %d s.",
-                         trim_container_id(container.id),
-                         container_wait_s)
-            wait_result = container.wait(timeout=container_wait_s)
-            logging.info("Container exit status: %s", wait_result)
-            ret = wait_result.get('StatusCode', 200)
-        except Exception as e:
-            logging.exception(e)
-            ret = 150
-
-        # Stop
         try:
-            logging.info("Stopping container: %s", 
trim_container_id(container.id))
-            container.stop()
-        except Exception as e:
-            logging.exception(e)
-            ret = 151
+            logging.info("Started container: %s", 
trim_container_id(container.id))
+            # Race condition:
+            # If the previous call is interrupted then it's possible that the 
container is not cleaned up
+            # We avoid by masking the signals temporarily
+            cleanup.add_container(container)
+            signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, 
signal.SIGTERM})
+            #
+            #############################
+
+            stream = container.logs(stream=True, stdout=True, stderr=True)
+            sys.stdout.flush()
+            for chunk in stream:
+                sys.stdout.buffer.write(chunk)
+                sys.stdout.buffer.flush()
+            sys.stdout.flush()
+            stream.close()
+            try:
+                logging.info("Waiting for status of container %s for %d s.",
+                            trim_container_id(container.id),
+                            container_wait_s)
+                wait_result = container.wait(timeout=container_wait_s)
+                logging.info("Container exit status: %s", wait_result)
+                ret = wait_result.get('StatusCode', 200)
+            except Exception as e:
+                logging.exception(e)
+                ret = 150
 
-        # Remove
-        try:
-            logging.info("Removing container: %s", 
trim_container_id(container.id))
-            container.remove()
-        except Exception as e:
-            logging.exception(e)
-            ret = 152
-        cleanup.remove_container(container)
-        containers = docker_client.containers.list()
-        if containers:
-            logging.info("Other running containers: %s", 
[trim_container_id(x.id) for x in containers])
+            # Stop
+            try:
+                logging.info("Stopping container: %s", 
trim_container_id(container.id))
+                container.stop()
+            except Exception as e:
+                logging.exception(e)
+                ret = 151
+
+            # Remove
+            try:
+                logging.info("Removing container: %s", 
trim_container_id(container.id))
+                container.remove()
+            except Exception as e:
+                logging.exception(e)
+                ret = 152
+            cleanup.remove_container(container)
+            containers = docker_client.containers.list()
+            if containers:
+                logging.info("Other running containers: %s", 
[trim_container_id(x.id) for x in containers])
+        except docker.errors.NotFound as e:
+            logging.info("Container was stopped before cleanup started: %s", e)
     return ret
 
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Reply via email to