szha closed pull request #11779: [MXNET-681] Add retry mechanism for the docker build stage
URL: https://github.com/apache/incubator-mxnet/pull/11779
 
 
   

This is a PR merged from a forked repository.
Because GitHub hides the original diff of a foreign pull request (one
opened from a fork) once it has been merged, the diff is reproduced
below for the sake of provenance:

diff --git a/ci/Jenkinsfile_docker_cache b/ci/Jenkinsfile_docker_cache
index 60cccb38e12..550425bb932 100644
--- a/ci/Jenkinsfile_docker_cache
+++ b/ci/Jenkinsfile_docker_cache
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-total_timeout = 120
+total_timeout = 300
 git_timeout = 15
 // assign any caught errors here
 err = null
diff --git a/ci/build.py b/ci/build.py
index 1652505cf5f..09f2d4709bd 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -67,12 +67,13 @@ def get_docker_binary(use_nvidia_docker: bool) -> str:
     return "nvidia-docker" if use_nvidia_docker else "docker"
 
 
-def build_docker(platform: str, docker_binary: str, registry: str) -> None:
+def build_docker(platform: str, docker_binary: str, registry: str, 
num_retries: int) -> None:
     """
     Build a container for the given platform
     :param platform: Platform
     :param docker_binary: docker binary to use (docker/nvidia-docker)
     :param registry: Dockerhub registry name
+    :param num_retries: Number of retries to build the docker image
     :return: Id of the top level image
     """
 
@@ -90,15 +91,32 @@ def build_docker(platform: str, docker_binary: str, 
registry: str) -> None:
     #
     # This doesn't work with multi head docker files.
     # 
-    cmd = [docker_binary, "build",
-           "-f", get_dockerfile(platform),
-           "--build-arg", "USER_ID={}".format(os.getuid()),
-           "--build-arg", "GROUP_ID={}".format(os.getgid()),
-           "--cache-from", tag,
-           "-t", tag,
-           "docker"]
-    logging.info("Running command: '%s'", ' '.join(cmd))
-    check_call(cmd)
+
+    for i in range(num_retries):
+        logging.info('%d out of %d tries to build the docker image.', i + 1, 
num_retries)
+
+        cmd = [docker_binary, "build",
+               "-f", get_dockerfile(platform),
+               "--build-arg", "USER_ID={}".format(os.getuid()),
+               "--build-arg", "GROUP_ID={}".format(os.getgid()),
+               "--cache-from", tag,
+               "-t", tag,
+               "docker"]
+        logging.info("Running command: '%s'", ' '.join(cmd))
+        try:
+            check_call(cmd)
+            # Docker build was successful. Call break to break out of the 
retry mechanism
+            break
+        except subprocess.CalledProcessError as e:
+            saved_exception = e
+            logging.error('Failed to build docker image')
+            # Building the docker image failed. Call continue to trigger the 
retry mechanism
+            continue
+    else:
+        # Num retries exceeded
+        logging.exception('Exception during build of docker image', 
saved_exception)
+        logging.fatal('Failed to build the docker image, aborting...')
+        sys.exit(1)
 
     # Get image id by reading the tag. It's guaranteed (except race condition) 
that the tag exists. Otherwise, the
     # check_call would have failed
@@ -275,6 +293,11 @@ def script_name() -> str:
                         default='mxnetci',
                         type=str)
 
+    parser.add_argument("-r", "--docker-build-retries",
+                        help="Number of times to retry building the docker 
image. Default is 1",
+                        default=1,
+                        type=int)
+
     parser.add_argument("-c", "--cache", action="store_true",
                         help="Enable docker registry cache")
 
@@ -294,6 +317,7 @@ def use_cache():
     command = list(chain(*args.command))
     docker_binary = get_docker_binary(args.nvidiadocker)
     shared_memory_size = args.shared_memory_size
+    num_docker_build_retires = args.docker_build_retries
 
     if args.list:
         list_platforms()
@@ -302,7 +326,7 @@ def use_cache():
         tag = get_docker_tag(platform=platform, registry=args.docker_registry)
         if use_cache():
             load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-        build_docker(platform, docker_binary, registry=args.docker_registry)
+        build_docker(platform, docker_binary, registry=args.docker_registry, 
num_retries=num_docker_build_retires)
         if args.build_only:
             logging.warning("Container was just built. Exiting due to 
build-only.")
             return 0
@@ -336,7 +360,7 @@ def use_cache():
             tag = get_docker_tag(platform=platform, 
registry=args.docker_registry)
             if use_cache():
                 load_docker_cache(tag=tag, 
docker_registry=args.docker_registry)
-            build_docker(platform, docker_binary, args.docker_registry)
+            build_docker(platform, docker_binary, args.docker_registry, 
num_retries=num_docker_build_retires)
             if args.build_only:
                 continue
             build_platform = "build_{}".format(platform)
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
index 16abb9e5419..6637ec37716 100755
--- a/ci/docker_cache.py
+++ b/ci/docker_cache.py
@@ -76,7 +76,8 @@ def _build_save_container(platform, registry, load_cache) -> 
str:
     # Start building
     logging.debug('Building %s as %s', platform, docker_tag)
     try:
-        image_id = build_util.build_docker(docker_binary='docker', 
platform=platform, registry=registry)
+        # Increase the number of retries for building the cache.
+        image_id = build_util.build_docker(docker_binary='docker', 
platform=platform, registry=registry, num_retries=10)
         logging.info('Built %s as %s', docker_tag, image_id)
 
         # Push cache to registry
diff --git a/docs/Jenkinsfile b/docs/Jenkinsfile
index 88f75e71a19..175f637b7d9 100644
--- a/docs/Jenkinsfile
+++ b/docs/Jenkinsfile
@@ -51,7 +51,7 @@ try {
       ws('workspace/docs') {
         init_git()
         timeout(time: max_time, unit: 'MINUTES') {
-            sh "ci/build.py -p ubuntu_cpu /work/runtime_functions.sh 
build_docs ${params.tags_to_build} ${params.tag_list} ${params.tag_default} 
${params.domain}"
+            sh "ci/build.py -p ubuntu_cpu --docker-registry 
${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --docker-build-retries 3 
/work/runtime_functions.sh build_docs ${params.tags_to_build} 
${params.tag_list} ${params.tag_default} ${params.domain}"
             archiveArtifacts 'docs/build_version_doc/artifacts.tgz'
             build 'restricted-website-publish'
         }
diff --git a/tests/nightly/Jenkinsfile b/tests/nightly/Jenkinsfile
index d869b4f99d1..173a33ab488 100755
--- a/tests/nightly/Jenkinsfile
+++ b/tests/nightly/Jenkinsfile
@@ -58,7 +58,7 @@ def init_git() {
 }
 
 def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
-  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% 
/work/runtime_functions.sh %FUNCTION_NAME%"
+  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size 
%SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
   command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : 
'')
   command = command.replaceAll('%PLATFORM%', platform)
   command = command.replaceAll('%FUNCTION_NAME%', function_name)
diff --git a/tests/nightly/JenkinsfileForBinaries 
b/tests/nightly/JenkinsfileForBinaries
index c0c14b26667..3d958b1de7e 100755
--- a/tests/nightly/JenkinsfileForBinaries
+++ b/tests/nightly/JenkinsfileForBinaries
@@ -57,7 +57,7 @@ def init_git() {
 }
 
 def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
-  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% 
/work/runtime_functions.sh %FUNCTION_NAME%"
+  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size 
%SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
   command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : 
'')
   command = command.replaceAll('%PLATFORM%', platform)
   command = command.replaceAll('%FUNCTION_NAME%', function_name)
diff --git a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC 
b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
index a87db838602..b7ac676af63 100755
--- a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
+++ b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
@@ -39,7 +39,7 @@ def init_git() {
 }
 
 def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
-  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% 
/work/runtime_functions.sh %FUNCTION_NAME%"
+  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} 
%USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size 
%SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
   command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : 
'')
   command = command.replaceAll('%PLATFORM%', platform)
   command = command.replaceAll('%FUNCTION_NAME%', function_name)


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Reply via email to