This is an automated email from the ASF dual-hosted git repository.
zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 6d927d1 [MXNET-681] Add retry mechanism for the docker build stage (#11779)
6d927d1 is described below
commit 6d927d168288ee69cb802f88622132ae5a1120f3
Author: Marco de Abreu <[email protected]>
AuthorDate: Wed Jul 18 21:21:22 2018 +0200
[MXNET-681] Add retry mechanism for the docker build stage (#11779)
* Add retry mechanism for the docker build stage
* Only log the exception once all retries have been exhausted
* Change from env variable to script param and update Jenkinsfile to latest state
---
ci/Jenkinsfile_docker_cache | 2 +-
ci/build.py | 48 ++++++++++++++++------
ci/docker_cache.py | 3 +-
docs/Jenkinsfile | 2 +-
tests/nightly/Jenkinsfile | 2 +-
tests/nightly/JenkinsfileForBinaries | 2 +-
.../broken_link_checker_test/JenkinsfileForBLC | 2 +-
7 files changed, 43 insertions(+), 18 deletions(-)
diff --git a/ci/Jenkinsfile_docker_cache b/ci/Jenkinsfile_docker_cache
index 60cccb3..550425b 100644
--- a/ci/Jenkinsfile_docker_cache
+++ b/ci/Jenkinsfile_docker_cache
@@ -21,7 +21,7 @@
// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
// timeout in minutes
-total_timeout = 120
+total_timeout = 300
git_timeout = 15
// assign any caught errors here
err = null
diff --git a/ci/build.py b/ci/build.py
index 1652505..09f2d47 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -67,12 +67,13 @@ def get_docker_binary(use_nvidia_docker: bool) -> str:
return "nvidia-docker" if use_nvidia_docker else "docker"
-def build_docker(platform: str, docker_binary: str, registry: str) -> None:
+def build_docker(platform: str, docker_binary: str, registry: str,
num_retries: int) -> None:
"""
Build a container for the given platform
:param platform: Platform
:param docker_binary: docker binary to use (docker/nvidia-docker)
:param registry: Dockerhub registry name
+ :param num_retries: Number of retries to build the docker image
:return: Id of the top level image
"""
@@ -90,15 +91,32 @@ def build_docker(platform: str, docker_binary: str,
registry: str) -> None:
#
# This doesn't work with multi head docker files.
#
- cmd = [docker_binary, "build",
- "-f", get_dockerfile(platform),
- "--build-arg", "USER_ID={}".format(os.getuid()),
- "--build-arg", "GROUP_ID={}".format(os.getgid()),
- "--cache-from", tag,
- "-t", tag,
- "docker"]
- logging.info("Running command: '%s'", ' '.join(cmd))
- check_call(cmd)
+
+ for i in range(num_retries):
+ logging.info('%d out of %d tries to build the docker image.', i + 1,
num_retries)
+
+ cmd = [docker_binary, "build",
+ "-f", get_dockerfile(platform),
+ "--build-arg", "USER_ID={}".format(os.getuid()),
+ "--build-arg", "GROUP_ID={}".format(os.getgid()),
+ "--cache-from", tag,
+ "-t", tag,
+ "docker"]
+ logging.info("Running command: '%s'", ' '.join(cmd))
+ try:
+ check_call(cmd)
+ # Docker build was successful. Call break to break out of the
retry mechanism
+ break
+ except subprocess.CalledProcessError as e:
+ saved_exception = e
+ logging.error('Failed to build docker image')
+ # Building the docker image failed. Call continue to trigger the
retry mechanism
+ continue
+ else:
+ # Num retries exceeded
+ logging.exception('Exception during build of docker image',
saved_exception)
+ logging.fatal('Failed to build the docker image, aborting...')
+ sys.exit(1)
# Get image id by reading the tag. It's guaranteed (except race condition)
that the tag exists. Otherwise, the
# check_call would have failed
@@ -275,6 +293,11 @@ def main() -> int:
default='mxnetci',
type=str)
+ parser.add_argument("-r", "--docker-build-retries",
+ help="Number of times to retry building the docker
image. Default is 1",
+ default=1,
+ type=int)
+
parser.add_argument("-c", "--cache", action="store_true",
help="Enable docker registry cache")
@@ -294,6 +317,7 @@ def main() -> int:
command = list(chain(*args.command))
docker_binary = get_docker_binary(args.nvidiadocker)
shared_memory_size = args.shared_memory_size
+ num_docker_build_retires = args.docker_build_retries
if args.list:
list_platforms()
@@ -302,7 +326,7 @@ def main() -> int:
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
- build_docker(platform, docker_binary, registry=args.docker_registry)
+ build_docker(platform, docker_binary, registry=args.docker_registry,
num_retries=num_docker_build_retires)
if args.build_only:
logging.warning("Container was just built. Exiting due to
build-only.")
return 0
@@ -336,7 +360,7 @@ def main() -> int:
tag = get_docker_tag(platform=platform,
registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag,
docker_registry=args.docker_registry)
- build_docker(platform, docker_binary, args.docker_registry)
+ build_docker(platform, docker_binary, args.docker_registry,
num_retries=num_docker_build_retires)
if args.build_only:
continue
build_platform = "build_{}".format(platform)
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
index 16abb9e..6637ec3 100755
--- a/ci/docker_cache.py
+++ b/ci/docker_cache.py
@@ -76,7 +76,8 @@ def _build_save_container(platform, registry, load_cache) ->
str:
# Start building
logging.debug('Building %s as %s', platform, docker_tag)
try:
- image_id = build_util.build_docker(docker_binary='docker',
platform=platform, registry=registry)
+ # Increase the number of retries for building the cache.
+ image_id = build_util.build_docker(docker_binary='docker',
platform=platform, registry=registry, num_retries=10)
logging.info('Built %s as %s', docker_tag, image_id)
# Push cache to registry
diff --git a/docs/Jenkinsfile b/docs/Jenkinsfile
index 88f75e7..175f637 100644
--- a/docs/Jenkinsfile
+++ b/docs/Jenkinsfile
@@ -51,7 +51,7 @@ try {
ws('workspace/docs') {
init_git()
timeout(time: max_time, unit: 'MINUTES') {
- sh "ci/build.py -p ubuntu_cpu /work/runtime_functions.sh
build_docs ${params.tags_to_build} ${params.tag_list} ${params.tag_default}
${params.domain}"
+ sh "ci/build.py -p ubuntu_cpu --docker-registry
${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --docker-build-retries 3
/work/runtime_functions.sh build_docs ${params.tags_to_build}
${params.tag_list} ${params.tag_default} ${params.domain}"
archiveArtifacts 'docs/build_version_doc/artifacts.tgz'
build 'restricted-website-publish'
}
diff --git a/tests/nightly/Jenkinsfile b/tests/nightly/Jenkinsfile
index d869b4f..173a33a 100755
--- a/tests/nightly/Jenkinsfile
+++ b/tests/nightly/Jenkinsfile
@@ -58,7 +58,7 @@ def init_git() {
}
def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
- def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}
%USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM%
/work/runtime_functions.sh %FUNCTION_NAME%"
+ def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}
%USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size
%SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' :
'')
command = command.replaceAll('%PLATFORM%', platform)
command = command.replaceAll('%FUNCTION_NAME%', function_name)
diff --git a/tests/nightly/JenkinsfileForBinaries
b/tests/nightly/JenkinsfileForBinaries
index c0c14b2..3d958b1 100755
--- a/tests/nightly/JenkinsfileForBinaries
+++ b/tests/nightly/JenkinsfileForBinaries
@@ -57,7 +57,7 @@ def init_git() {
}
def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
- def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}
%USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM%
/work/runtime_functions.sh %FUNCTION_NAME%"
+ def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}
%USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size
%SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' :
'')
command = command.replaceAll('%PLATFORM%', platform)
command = command.replaceAll('%FUNCTION_NAME%', function_name)
diff --git a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
index 2bcedfe..912b65b 100755
--- a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
+++ b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC
@@ -39,7 +39,7 @@ def init_git() {
}
def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
- def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}
%USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM%
/work/runtime_functions.sh %FUNCTION_NAME%"
+ def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}
%USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size
%SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' :
'')
command = command.replaceAll('%PLATFORM%', platform)
command = command.replaceAll('%FUNCTION_NAME%', function_name)