szha closed pull request #11779: [MXNET-681] Add retry mechanism for the docker build stage URL: https://github.com/apache/incubator-mxnet/pull/11779
This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/ci/Jenkinsfile_docker_cache b/ci/Jenkinsfile_docker_cache index 60cccb38e12..550425bb932 100644 --- a/ci/Jenkinsfile_docker_cache +++ b/ci/Jenkinsfile_docker_cache @@ -21,7 +21,7 @@ // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ // timeout in minutes -total_timeout = 120 +total_timeout = 300 git_timeout = 15 // assign any caught errors here err = null diff --git a/ci/build.py b/ci/build.py index 1652505cf5f..09f2d4709bd 100755 --- a/ci/build.py +++ b/ci/build.py @@ -67,12 +67,13 @@ def get_docker_binary(use_nvidia_docker: bool) -> str: return "nvidia-docker" if use_nvidia_docker else "docker" -def build_docker(platform: str, docker_binary: str, registry: str) -> None: +def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int) -> None: """ Build a container for the given platform :param platform: Platform :param docker_binary: docker binary to use (docker/nvidia-docker) :param registry: Dockerhub registry name + :param num_retries: Number of retries to build the docker image :return: Id of the top level image """ @@ -90,15 +91,32 @@ def build_docker(platform: str, docker_binary: str, registry: str) -> None: # # This doesn't work with multi head docker files. 
# - cmd = [docker_binary, "build", - "-f", get_dockerfile(platform), - "--build-arg", "USER_ID={}".format(os.getuid()), - "--build-arg", "GROUP_ID={}".format(os.getgid()), - "--cache-from", tag, - "-t", tag, - "docker"] - logging.info("Running command: '%s'", ' '.join(cmd)) - check_call(cmd) + + for i in range(num_retries): + logging.info('%d out of %d tries to build the docker image.', i + 1, num_retries) + + cmd = [docker_binary, "build", + "-f", get_dockerfile(platform), + "--build-arg", "USER_ID={}".format(os.getuid()), + "--build-arg", "GROUP_ID={}".format(os.getgid()), + "--cache-from", tag, + "-t", tag, + "docker"] + logging.info("Running command: '%s'", ' '.join(cmd)) + try: + check_call(cmd) + # Docker build was successful. Call break to break out of the retry mechanism + break + except subprocess.CalledProcessError as e: + saved_exception = e + logging.error('Failed to build docker image') + # Building the docker image failed. Call continue to trigger the retry mechanism + continue + else: + # Num retries exceeded + logging.exception('Exception during build of docker image', saved_exception) + logging.fatal('Failed to build the docker image, aborting...') + sys.exit(1) # Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the # check_call would have failed @@ -275,6 +293,11 @@ def script_name() -> str: default='mxnetci', type=str) + parser.add_argument("-r", "--docker-build-retries", + help="Number of times to retry building the docker image. 
Default is 1", + default=1, + type=int) + parser.add_argument("-c", "--cache", action="store_true", help="Enable docker registry cache") @@ -294,6 +317,7 @@ def use_cache(): command = list(chain(*args.command)) docker_binary = get_docker_binary(args.nvidiadocker) shared_memory_size = args.shared_memory_size + num_docker_build_retires = args.docker_build_retries if args.list: list_platforms() @@ -302,7 +326,7 @@ def use_cache(): tag = get_docker_tag(platform=platform, registry=args.docker_registry) if use_cache(): load_docker_cache(tag=tag, docker_registry=args.docker_registry) - build_docker(platform, docker_binary, registry=args.docker_registry) + build_docker(platform, docker_binary, registry=args.docker_registry, num_retries=num_docker_build_retires) if args.build_only: logging.warning("Container was just built. Exiting due to build-only.") return 0 @@ -336,7 +360,7 @@ def use_cache(): tag = get_docker_tag(platform=platform, registry=args.docker_registry) if use_cache(): load_docker_cache(tag=tag, docker_registry=args.docker_registry) - build_docker(platform, docker_binary, args.docker_registry) + build_docker(platform, docker_binary, args.docker_registry, num_retries=num_docker_build_retires) if args.build_only: continue build_platform = "build_{}".format(platform) diff --git a/ci/docker_cache.py b/ci/docker_cache.py index 16abb9e5419..6637ec37716 100755 --- a/ci/docker_cache.py +++ b/ci/docker_cache.py @@ -76,7 +76,8 @@ def _build_save_container(platform, registry, load_cache) -> str: # Start building logging.debug('Building %s as %s', platform, docker_tag) try: - image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry) + # Increase the number of retries for building the cache. 
+ image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10) logging.info('Built %s as %s', docker_tag, image_id) # Push cache to registry diff --git a/docs/Jenkinsfile b/docs/Jenkinsfile index 88f75e71a19..175f637b7d9 100644 --- a/docs/Jenkinsfile +++ b/docs/Jenkinsfile @@ -51,7 +51,7 @@ try { ws('workspace/docs') { init_git() timeout(time: max_time, unit: 'MINUTES') { - sh "ci/build.py -p ubuntu_cpu /work/runtime_functions.sh build_docs ${params.tags_to_build} ${params.tag_list} ${params.tag_default} ${params.domain}" + sh "ci/build.py -p ubuntu_cpu --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --docker-build-retries 3 /work/runtime_functions.sh build_docs ${params.tags_to_build} ${params.tag_list} ${params.tag_default} ${params.domain}" archiveArtifacts 'docs/build_version_doc/artifacts.tgz' build 'restricted-website-publish' } diff --git a/tests/nightly/Jenkinsfile b/tests/nightly/Jenkinsfile index d869b4f99d1..173a33ab488 100755 --- a/tests/nightly/Jenkinsfile +++ b/tests/nightly/Jenkinsfile @@ -58,7 +58,7 @@ def init_git() { } def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { - def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" + def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? 
'--nvidiadocker' : '') command = command.replaceAll('%PLATFORM%', platform) command = command.replaceAll('%FUNCTION_NAME%', function_name) diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries index c0c14b26667..3d958b1de7e 100755 --- a/tests/nightly/JenkinsfileForBinaries +++ b/tests/nightly/JenkinsfileForBinaries @@ -57,7 +57,7 @@ def init_git() { } def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { - def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" + def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '') command = command.replaceAll('%PLATFORM%', platform) command = command.replaceAll('%FUNCTION_NAME%', function_name) diff --git a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC index a87db838602..b7ac676af63 100755 --- a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC +++ b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC @@ -39,7 +39,7 @@ def init_git() { } def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { - def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" + def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? 
'--nvidiadocker' : '') command = command.replaceAll('%PLATFORM%', platform) command = command.replaceAll('%FUNCTION_NAME%', function_name) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
