This is an automated email from the ASF dual-hosted git repository.
marcoabreu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 6a7bfe9 Separate refactoring from #12276 in a prior PR (#12296)
6a7bfe9 is described below
commit 6a7bfe905bafe94a33eccd0cb8bc42f0d667f606
Author: Pedro Larroy <[email protected]>
AuthorDate: Mon Aug 27 15:00:00 2018 +0200
Separate refactoring from #12276 in a prior PR (#12296)
* Separate minor refactoring from #12276 in a prior PR
* Address CR comments
---
ci/build.py | 164 ++++++++++++++++++++++-------------------
ci/docker/runtime_functions.sh | 3 +
ci/docker_cache.py | 5 +-
ci/util.py | 71 ++++++++++++++++++
4 files changed, 165 insertions(+), 78 deletions(-)
diff --git a/ci/build.py b/ci/build.py
index a9d6a63..f1a5e99 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -23,31 +23,33 @@
"""
__author__ = 'Marco de Abreu, Kellen Sunderland, Anton Chernov, Pedro Larroy'
-__version__ = '0.1'
+__version__ = '0.2'
import argparse
import glob
import logging
-import os
import re
import shutil
import subprocess
import sys
import tempfile
-import platform
from copy import deepcopy
from itertools import chain
-from subprocess import call, check_call
+from subprocess import call, check_call, check_output
from typing import *
from util import *
+import pprint
+import requests
+
CCACHE_MAXSIZE = '500G'
-def under_ci() -> bool:
- """:return: True if we run in Jenkins."""
- return 'JOB_NAME' in os.environ
-def get_platforms(path: Optional[str] = "docker"):
+def get_dockerfiles_path():
+ return "docker"
+
+
+def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:
"""Get a list of architectures given our dockerfiles"""
dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*"))
dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles))
@@ -57,10 +59,11 @@ def get_platforms(path: Optional[str] = "docker"):
def get_docker_tag(platform: str, registry: str) -> str:
+ """:return: docker tag to be used for the container"""
return "{0}/build.{1}".format(registry, platform)
-def get_dockerfile(platform: str, path="docker") -> str:
+def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
return os.path.join(path, "Dockerfile.build.{0}".format(platform))
@@ -68,18 +71,18 @@ def get_docker_binary(use_nvidia_docker: bool) -> str:
return "nvidia-docker" if use_nvidia_docker else "docker"
-def build_docker(platform: str, docker_binary: str, registry: str,
num_retries: int) -> None:
+def build_docker(platform: str, docker_binary: str, registry: str,
num_retries: int, use_cache: bool) -> str:
"""
Build a container for the given platform
:param platform: Platform
:param docker_binary: docker binary to use (docker/nvidia-docker)
:param registry: Dockerhub registry name
:param num_retries: Number of retries to build the docker image
+ :param use_cache: will pass cache_from to docker to use the previously pulled tag
:return: Id of the top level image
"""
-
tag = get_docker_tag(platform=platform, registry=registry)
- logging.info("Building container tagged '%s' with %s", tag, docker_binary)
+ logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
#
# We add a user with the same group as the executing non-root user so
files created in the
# container match permissions of the local user. Same for the group.
@@ -90,41 +93,29 @@ def build_docker(platform: str, docker_binary: str,
registry: str, num_retries:
# cache-from is needed so we use the cached images tagged from the remote
via
# docker pull see: docker_cache.load_docker_cache
#
+ # This also prevents using local layers for caching:
https://github.com/moby/moby/issues/33002
+ # So to use local caching, we should omit the cache-from by using
--no-dockerhub-cache argument to this
+ # script.
+ #
# This doesn't work with multi head docker files.
- #
-
- for i in range(num_retries):
- logging.info('%d out of %d tries to build the docker image.', i + 1, num_retries)
-
- cmd = [docker_binary, "build",
- "-f", get_dockerfile(platform),
- "--build-arg", "USER_ID={}".format(os.getuid()),
- "--build-arg", "GROUP_ID={}".format(os.getgid()),
- "--cache-from", tag,
- "-t", tag,
- "docker"]
+ #
+ cmd = [docker_binary, "build",
+ "-f", get_dockerfile(platform),
+ "--build-arg", "USER_ID={}".format(os.getuid()),
+ "--build-arg", "GROUP_ID={}".format(os.getgid())]
+ if use_cache:
+ cmd.extend(["--cache-from", tag])
+ cmd.extend(["-t", tag, get_dockerfiles_path()])
+
+ @retry(subprocess.CalledProcessError, tries=num_retries)
+ def run_cmd():
logging.info("Running command: '%s'", ' '.join(cmd))
- try:
- check_call(cmd)
- # Docker build was successful. Call break to break out of the
retry mechanism
- break
- except subprocess.CalledProcessError as e:
- saved_exception = e
- logging.error('Failed to build docker image')
- # Building the docker image failed. Call continue to trigger the
retry mechanism
- continue
- else:
- # Num retries exceeded
- logging.exception('Exception during build of docker image',
saved_exception)
- logging.fatal('Failed to build the docker image, aborting...')
- sys.exit(1)
+ check_call(cmd)
+ run_cmd()
# Get image id by reading the tag. It's guaranteed (except race condition)
that the tag exists. Otherwise, the
# check_call would have failed
- image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
- if not image_id:
- raise FileNotFoundError('Unable to find docker image id matching with
{}'.format(tag))
- return image_id
+ return _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
def _get_local_image_id(docker_binary, docker_tag):
@@ -134,14 +125,17 @@ def _get_local_image_id(docker_binary, docker_tag):
:return: Image id as string or None if tag does not exist
"""
cmd = [docker_binary, "images", "-q", docker_tag]
- image_id_b = subprocess.check_output(cmd)
+ image_id_b = check_output(cmd)
image_id = image_id_b.decode('utf-8').strip()
+ if not image_id:
+ raise RuntimeError('Unable to find docker image id matching with tag {}'.format(docker_tag))
return image_id
def buildir() -> str:
return os.path.join(get_mxnet_root(), "build")
+
def default_ccache_dir() -> str:
# Share ccache across containers
if 'CCACHE_DIR' in os.environ:
@@ -152,6 +146,7 @@ def default_ccache_dir() -> str:
except PermissionError:
logging.info('Unable to make dirs at %s, falling back to local
temp dir', ccache_dir)
# In osx tmpdir is not mountable by default
+ import platform
if platform.system() == 'Darwin':
ccache_dir = "/tmp/_mxnet_ccache"
os.makedirs(ccache_dir, exist_ok=True)
@@ -166,7 +161,7 @@ def container_run(platform: str,
local_ccache_dir: str,
command: List[str],
dry_run: bool = False,
- interactive: bool = False) -> str:
+ interactive: bool = False) -> int:
tag = get_docker_tag(platform=platform, registry=docker_registry)
mx_root = get_mxnet_root()
local_build_folder = buildir()
@@ -193,15 +188,13 @@ def container_run(platform: str,
logging.info("Executing:\n%s\n", cmd)
ret = call(runlist)
- docker_run_cmd = ' '.join(runlist)
if not dry_run and interactive:
into_cmd = deepcopy(runlist)
# -ti can't be after the tag, as is interpreted as a command so hook
it up after the -u argument
idx = into_cmd.index('-u') + 2
into_cmd[idx:idx] = ['-ti']
- cmd = '\\\n\t'.join(into_cmd)
+ cmd = ' \\\n\t'.join(into_cmd)
logging.info("Executing:\n%s\n", cmd)
- docker_run_cmd = ' '.join(into_cmd)
ret = call(into_cmd)
if not dry_run and not interactive and ret != 0:
@@ -209,11 +202,12 @@ def container_run(platform: str,
logging.error("You can get into the container by adding the -i option")
raise subprocess.CalledProcessError(ret, cmd)
- return docker_run_cmd
+ return ret
def list_platforms() -> str:
- print("\nSupported platforms:\n{}".format('\n'.join(get_platforms())))
+ return "\nSupported platforms:\n{}".format('\n'.join(get_platforms()))
+
def load_docker_cache(tag, docker_registry) -> None:
if docker_registry:
@@ -221,24 +215,34 @@ def load_docker_cache(tag, docker_registry) -> None:
import docker_cache
logging.info('Docker cache download is enabled from registry %s',
docker_registry)
docker_cache.load_docker_cache(registry=docker_registry,
docker_tag=tag)
+ # noinspection PyBroadException
except Exception:
logging.exception('Unable to retrieve Docker cache. Continue
without...')
else:
logging.info('Distributed docker cache disabled')
-def main() -> int:
- # We need to be in the same directory than the script so the commands in
the dockerfiles work as
- # expected. But the script can be invoked from a different path
- base = os.path.split(os.path.realpath(__file__))[0]
- os.chdir(base)
- logging.getLogger().setLevel(logging.INFO)
+def log_environment():
+ instance_id = ec2_instance_id_hostname()
+ if instance_id:
+ logging.info("EC2 Instance id: %s", instance_id)
+ pp = pprint.PrettyPrinter(indent=4)
+ logging.debug("Build environment: %s", pp.pformat(dict(os.environ)))
+
- def script_name() -> str:
- return os.path.split(sys.argv[0])[1]
+def script_name() -> str:
+ return os.path.split(sys.argv[0])[1]
+
+def main() -> int:
+ logging.getLogger().setLevel(logging.INFO)
+ logging.getLogger("requests").setLevel(logging.WARNING)
logging.basicConfig(format='{}: %(asctime)-15s
%(message)s'.format(script_name()))
+ logging.info("MXNet container based build tool.")
+ log_environment()
+ chdir_to_script_directory()
+
parser = argparse.ArgumentParser(description="""Utility for building and
testing MXNet on docker
containers""", epilog="")
parser.add_argument("-p", "--platform",
@@ -284,8 +288,10 @@ def main() -> int:
default=1,
type=int)
- parser.add_argument("-c", "--cache", action="store_true",
- help="Enable docker registry cache")
+ parser.add_argument("-c", "--no-dockerhub-cache", action="store_true",
+ help="Disables use of --cache-from option on docker
build, allowing docker"
+ " to use local layers for caching. If absent, we use
the cache from dockerhub"
+ " which is the default.")
parser.add_argument("command",
help="command to run in the container",
@@ -297,35 +303,36 @@ def main() -> int:
type=str)
args = parser.parse_args()
+
def use_cache():
- return args.cache or under_ci()
+ return not args.no_dockerhub_cache or under_ci()
command = list(chain(*args.command))
docker_binary = get_docker_binary(args.nvidiadocker)
- shared_memory_size = args.shared_memory_size
- num_docker_build_retires = args.docker_build_retries
if args.list:
- list_platforms()
+ print(list_platforms())
elif args.platform:
platform = args.platform
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
- build_docker(platform, docker_binary, registry=args.docker_registry,
num_retries=num_docker_build_retires)
+ build_docker(platform=platform, docker_binary=docker_binary,
registry=args.docker_registry,
+ num_retries=args.docker_build_retries,
use_cache=use_cache())
if args.build_only:
logging.warning("Container was just built. Exiting due to
build-only.")
return 0
if command:
- container_run(platform=platform, docker_binary=docker_binary,
shared_memory_size=shared_memory_size,
+ container_run(platform=platform, docker_binary=docker_binary,
shared_memory_size=args.shared_memory_size,
command=command,
docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir,
interactive=args.interactive)
elif args.print_docker_run:
- print(container_run(platform=platform,
docker_binary=docker_binary, shared_memory_size=shared_memory_size,
- command=[], dry_run=True,
docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir))
+ print(container_run(platform=platform,
docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
+ command=[], dry_run=True,
docker_registry=args.docker_registry,
+ local_ccache_dir=args.ccache_dir))
elif args.interactive:
- container_run(platform=platform, docker_binary=docker_binary,
shared_memory_size=shared_memory_size,
+ container_run(platform=platform, docker_binary=docker_binary,
shared_memory_size=args.shared_memory_size,
command=command,
docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir,
interactive=args.interactive)
@@ -334,7 +341,7 @@ def main() -> int:
assert not args.interactive, "when running with -i must provide a
command"
cmd = ["/work/mxnet/ci/docker/runtime_functions.sh",
"build_{}".format(platform)]
logging.info("No command specified, trying default build: %s", '
'.join(cmd))
- container_run(platform=platform, docker_binary=docker_binary,
shared_memory_size=shared_memory_size,
+ container_run(platform=platform, docker_binary=docker_binary,
shared_memory_size=args.shared_memory_size,
command=cmd, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir)
@@ -346,15 +353,20 @@ def main() -> int:
tag = get_docker_tag(platform=platform,
registry=args.docker_registry)
if use_cache():
load_docker_cache(tag=tag,
docker_registry=args.docker_registry)
- build_docker(platform, docker_binary, args.docker_registry,
num_retries=num_docker_build_retires)
+ build_docker(platform, docker_binary, args.docker_registry,
num_retries=args.docker_build_retries,
+ use_cache=use_cache())
if args.build_only:
continue
- build_platform = "build_{}".format(platform)
- cmd = ["/work/mxnet/ci/docker/runtime_functions.sh",
build_platform]
shutil.rmtree(buildir(), ignore_errors=True)
- container_run(platform=platform, docker_binary=docker_binary,
shared_memory_size=shared_memory_size,
- command=cmd, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir)
- plat_buildir = os.path.join(get_mxnet_root(), build_platform)
+ build_platform = "build_{}".format(platform)
+ plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..',
+
"mxnet_{}".format(build_platform)))
+ if os.path.exists(plat_buildir):
+ logging.warning("{} already exists,
skipping".format(plat_buildir))
+ continue
+ command = ["/work/mxnet/ci/docker/runtime_functions.sh",
build_platform]
+ container_run(platform=platform, docker_binary=docker_binary,
shared_memory_size=args.shared_memory_size,
+ command=command,
docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
shutil.move(buildir(), plat_buildir)
logging.info("Built files left in: %s", plat_buildir)
@@ -383,7 +395,7 @@ Examples:
./build.py -a
- Builds for all platforms and leaves artifacts in build_<platform>
+ Builds for all platforms and leaves artifacts in build_<platform>.
""")
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index a124e10..0b6a42c 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -593,6 +593,9 @@ build_ubuntu_gpu_cmake() {
ninja -v
}
+build_ubuntu_blc() {
+ echo "pass"
+}
# Testing
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
index 7a6d110..bebcb25 100755
--- a/ci/docker_cache.py
+++ b/ci/docker_cache.py
@@ -30,6 +30,7 @@ import argparse
import sys
import subprocess
import json
+from typing import *
import build as build_util
@@ -59,7 +60,7 @@ def build_save_containers(platforms, registry, load_cache) ->
int:
return 1 if is_error else 0
-def _build_save_container(platform, registry, load_cache) -> str:
+def _build_save_container(platform, registry, load_cache) -> Optional[str]:
"""
Build image for passed platform and upload the cache to the specified S3
bucket
:param platform: Platform
@@ -77,7 +78,7 @@ def _build_save_container(platform, registry, load_cache) ->
str:
logging.debug('Building %s as %s', platform, docker_tag)
try:
# Increase the number of retries for building the cache.
- image_id = build_util.build_docker(docker_binary='docker',
platform=platform, registry=registry, num_retries=10)
+ image_id = build_util.build_docker(docker_binary='docker',
platform=platform, registry=registry, num_retries=10, use_cache=True)
logging.info('Built %s as %s', docker_tag, image_id)
# Push cache to registry
diff --git a/ci/util.py b/ci/util.py
index 22631f3..98605be 100644
--- a/ci/util.py
+++ b/ci/util.py
@@ -17,6 +17,7 @@
import os
import contextlib
+import requests
def get_mxnet_root() -> str:
curpath = os.path.abspath(os.path.dirname(__file__))
@@ -41,3 +42,73 @@ def remember_cwd():
finally: os.chdir(curdir)
+def retry(target_exception, tries=4, delay_s=1, backoff=2):
+ """Retry calling the decorated function using an exponential backoff.
+
+ http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+ original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
+
+ :param target_exception: the exception to check. may be a tuple of
+ exceptions to check
+ :type target_exception: Exception or tuple
+ :param tries: number of times to try (not retry) before giving up
+ :type tries: int
+ :param delay_s: initial delay between retries in seconds
+ :type delay_s: int
+ :param backoff: backoff multiplier e.g. value of 2 will double the delay
+ each retry
+ :type backoff: int
+ """
+ import time
+ from functools import wraps
+
+ def decorated_retry(f):
+ @wraps(f)
+ def f_retry(*args, **kwargs):
+ mtries, mdelay = tries, delay_s
+ while mtries > 1:
+ try:
+ return f(*args, **kwargs)
+ except target_exception as e:
+ logging.warning("Exception: %s, Retrying in %d seconds...", str(e), mdelay)
+ time.sleep(mdelay)
+ mtries -= 1
+ mdelay *= backoff
+ return f(*args, **kwargs)
+
+ return f_retry # true decorator
+
+ return decorated_retry
+
+
+# noinspection SyntaxError
+def under_ci() -> bool:
+ """:return: True if we run in Jenkins."""
+ return 'JOB_NAME' in os.environ
+
+
+def ec2_instance_id_hostname() -> str:
+ if under_ci():
+ result = []
+ try:
+ r = requests.get("http://instance-data/latest/meta-data/instance-id")
+ if r.status_code == 200:
+ result.append(r.content.decode())
+ r = requests.get("http://instance-data/latest/meta-data/public-hostname")
+ if r.status_code == 200:
+ result.append(r.content.decode())
+ return ' '.join(result)
+ except ConnectionError:
+ pass
+ return '?'
+ else:
+ return ''
+
+
+def chdir_to_script_directory():
+ # We need to be in the same directory than the script so the commands in the dockerfiles work as
+ # expected. But the script can be invoked from a different path
+ base = os.path.split(os.path.realpath(__file__))[0]
+ os.chdir(base)
+
+