This is an automated email from the ASF dual-hosted git repository.

zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 6d927d1  [MXNET-681] Add retry mechanism for the docker build stage (#11779)
6d927d1 is described below

commit 6d927d168288ee69cb802f88622132ae5a1120f3
Author: Marco de Abreu <[email protected]>
AuthorDate: Wed Jul 18 21:21:22 2018 +0200

    [MXNET-681] Add retry mechanism for the docker build stage (#11779)
    
    * Add retry mechanism for the docker build stage
    
    * Only log exception at the end of the retry
    
    * Change from env variable to script param and update Jenkinsfile to latest state
---
 ci/Jenkinsfile_docker_cache                        |  2 +-
 ci/build.py                                        | 48 ++++++++++++++++------
 ci/docker_cache.py                                 |  3 +-
 docs/Jenkinsfile                                   |  2 +-
 tests/nightly/Jenkinsfile                          |  2 +-
 tests/nightly/JenkinsfileForBinaries               |  2 +-
 .../broken_link_checker_test/JenkinsfileForBLC     |  2 +-
 7 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/ci/Jenkinsfile_docker_cache b/ci/Jenkinsfile_docker_cache
index 60cccb3..550425b 100644
--- a/ci/Jenkinsfile_docker_cache
+++ b/ci/Jenkinsfile_docker_cache
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-total_timeout = 120
+total_timeout = 300
 git_timeout = 15
 // assign any caught errors here
 err = null
diff --git a/ci/build.py b/ci/build.py
index 1652505..09f2d47 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -67,12 +67,13 @@ def get_docker_binary(use_nvidia_docker: bool) -> str:
     return "nvidia-docker" if use_nvidia_docker else "docker"
 
 
-def build_docker(platform: str, docker_binary: str, registry: str) -> None:
+def build_docker(platform: str, docker_binary: str, registry: str, 
num_retries: int) -> None:
     """
     Build a container for the given platform
     :param platform: Platform
     :param docker_binary: docker binary to use (docker/nvidia-docker)
     :param registry: Dockerhub registry name
+    :param num_retries: Number of retries to build the docker image
     :return: Id of the top level image
     """
 
@@ -90,15 +91,32 @@ def build_docker(platform: str, docker_binary: str, 
registry: str) -> None:
     #
     # This doesn't work with multi head docker files.
     # 
-    cmd = [docker_binary, "build",
-           "-f", get_dockerfile(platform),
-           "--build-arg", "USER_ID={}".format(os.getuid()),
-           "--build-arg", "GROUP_ID={}".format(os.getgid()),
-           "--cache-from", tag,
-           "-t", tag,
-           "docker"]
-    logging.info("Running command: '%s'", ' '.join(cmd))
-    check_call(cmd)
+
+    for i in range(num_retries):
+        logging.info('%d out of %d tries to build the docker image.', i + 1, 
num_retries)
+
+        cmd = [docker_binary, "build",
+               "-f", get_dockerfile(platform),
+               "--build-arg", "USER_ID={}".format(os.getuid()),
+               "--build-arg", "GROUP_ID={}".format(os.getgid()),
+               "--cache-from", tag,
+               "-t", tag,
+               "docker"]
+        logging.info("Running command: '%s'", ' '.join(cmd))
+        try:
+            check_call(cmd)
+            # Docker build was successful. Call break to break out of the 
retry mechanism
+            break
+        except subprocess.CalledProcessError as e:
+            saved_exception = e
+            logging.error('Failed to build docker image')
+            # Building the docker image failed. Call continue to trigger the 
retry mechanism
+            continue
+    else:
+        # Num retries exceeded
+        logging.exception('Exception during build of docker image', 
saved_exception)
+        logging.fatal('Failed to build the docker image, aborting...')
+        sys.exit(1)
 
     # Get image id by reading the tag. It's guaranteed (except race condition) 
that the tag exists. Otherwise, the
     # check_call would have failed
@@ -275,6 +293,11 @@ def main() -> int:
                         default='mxnetci',
                         type=str)
 
+    parser.add_argument("-r", "--docker-build-retries",
+                        help="Number of times to retry building the docker 
image. Default is 1",
+                        default=1,
+                        type=int)
+
     parser.add_argument("-c", "--cache", action="store_true",
                         help="Enable docker registry cache")
 
@@ -294,6 +317,7 @@ def main() -> int:
     command = list(chain(*args.command))
     docker_binary = get_docker_binary(args.nvidiadocker)
     shared_memory_size = args.shared_memory_size
+    num_docker_build_retires = args.docker_build_retries
 
     if args.list:
         list_platforms()
@@ -302,7 +326,7 @@ def main() -> int:
         tag = get_docker_tag(platform=platform, registry=args.docker_registry)
         if use_cache():
             load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-        build_docker(platform, docker_binary, registry=args.docker_registry)
+        build_docker(platform, docker_binary, registry=args.docker_registry, 
num_retries=num_docker_build_retires)
         if args.build_only:
             logging.warning("Container was just built. Exiting due to 
build-only.")
             return 0
@@ -336,7 +360,7 @@ def main() -> int:
             tag = get_docker_tag(platform=platform, 
registry=args.docker_registry)
             if use_cache():
                 load_docker_cache(tag=tag, 
docker_registry=args.docker_registry)
-            build_docker(platform, docker_binary, args.docker_registry)
+            build_docker(platform, docker_binary, args.docker_registry, 
num_retries=num_docker_build_retires)
             if args.build_only:
                 continue
             build_platform = "build_{}".format(platform)
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
index 16abb9e..6637ec3 100755
--- a/ci/docker_cache.py
+++ b/ci/docker_cache.py
@@ -76,7 +76,8 @@ def _build_save_container(platform, registry, load_cache) -> 
str:
     # Start building
     logging.debug('Building %s as %s', platform, docker_tag)
     try:
-        image_id = build_util.build_docker(docker_binary='docker', 
platform=platform, registry=registry)
+        # Increase the number of retries for building the cache.
+        image_id = build_util.build_docker(docker_binary='docker', 
platform=platform, registry=registry, num_retries=10)
         logging.info('Built %s as %s', docker_tag, image_id)
 
         # Push cache to registry
diff --git a/docs/Jenkinsfile b/docs/Jenkinsfile
index 88f75e7..175f637 100644
--- a/docs/Jenkinsfile
+++ b/docs/Jenkinsfile
@@ -51,7 +51,7 @@ try {
       ws('workspace/docs') {
         init_git()
         timeout(time: max_time, unit: 'MINUTES') {
-            sh "ci/build.py -p ubuntu_cpu /work/runtime_functions.sh 
build_docs ${params.tags_to_build} ${params.tag_list} ${params.tag_default} 
${params.domain}"
+            sh "ci/build.py -p ubuntu_cpu --docker-registry 
${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --docker-build-retries 3 
/work/runtime_functions.sh build_docs ${params.tags_to_build} 
${params.tag_list} ${params.tag_default} ${params.domain}"
             archiveArtifacts 'docs/build_version_doc/artifacts.tgz'
             build 'restricted-website-publish'
         }
diff --git a/tests/nightly/Jenkinsfile b/tests/nightly/Jenkinsfile
index d869b4f..173a33a 100755
--- a/tests/nightly/Jenkinsfile
+++ b/tests/nightly/Jenkinsfile
@@ -58,7 +58,7 @@ def init_git() {
 }
 
 def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
-  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% 
/work/runtime_functions.sh %FUNCTION_NAME%"
+  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size 
%SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
   command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : 
'')
   command = command.replaceAll('%PLATFORM%', platform)
   command = command.replaceAll('%FUNCTION_NAME%', function_name)
diff --git a/tests/nightly/JenkinsfileForBinaries 
b/tests/nightly/JenkinsfileForBinaries
index c0c14b2..3d958b1 100755
--- a/tests/nightly/JenkinsfileForBinaries
+++ b/tests/nightly/JenkinsfileForBinaries
@@ -57,7 +57,7 @@ def init_git() {
 }
 
 def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
-  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% 
/work/runtime_functions.sh %FUNCTION_NAME%"
+  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size 
%SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
   command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : 
'')
   command = command.replaceAll('%PLATFORM%', platform)
   command = command.replaceAll('%FUNCTION_NAME%', function_name)
diff --git a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC 
b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
index 2bcedfe..912b65b 100755
--- a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
+++ b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
@@ -39,7 +39,7 @@ def init_git() {
 }
 
 def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
-  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% 
/work/runtime_functions.sh %FUNCTION_NAME%"
+  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size 
%SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
   command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : 
'')
   command = command.replaceAll('%PLATFORM%', platform)
   command = command.replaceAll('%FUNCTION_NAME%', function_name)

Reply via email to