This is an automated email from the ASF dual-hosted git repository.

marcoabreu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 6a7bfe9  Separate refactoring from #12276 in a prior PR (#12296)
6a7bfe9 is described below

commit 6a7bfe905bafe94a33eccd0cb8bc42f0d667f606
Author: Pedro Larroy <[email protected]>
AuthorDate: Mon Aug 27 15:00:00 2018 +0200

    Separate refactoring from #12276 in a prior PR (#12296)
    
    * Separate minor refactoring from #12276 in a prior PR
    
    * Address CR comments
---
 ci/build.py                    | 164 ++++++++++++++++++++++-------------------
 ci/docker/runtime_functions.sh |   3 +
 ci/docker_cache.py             |   5 +-
 ci/util.py                     |  71 ++++++++++++++++++
 4 files changed, 165 insertions(+), 78 deletions(-)

diff --git a/ci/build.py b/ci/build.py
index a9d6a63..f1a5e99 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -23,31 +23,33 @@
 """
 
 __author__ = 'Marco de Abreu, Kellen Sunderland, Anton Chernov, Pedro Larroy'
-__version__ = '0.1'
+__version__ = '0.2'
 
 import argparse
 import glob
 import logging
-import os
 import re
 import shutil
 import subprocess
 import sys
 import tempfile
-import platform
 from copy import deepcopy
 from itertools import chain
-from subprocess import call, check_call
+from subprocess import call, check_call, check_output
 from typing import *
 from util import *
+import pprint
+import requests
+
 
 CCACHE_MAXSIZE = '500G'
 
-def under_ci() -> bool:
-    """:return: True if we run in Jenkins."""
-    return 'JOB_NAME' in os.environ
 
-def get_platforms(path: Optional[str] = "docker"):
+def get_dockerfiles_path():
+    return "docker"
+
+
+def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:
     """Get a list of architectures given our dockerfiles"""
     dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*"))
     dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles))
@@ -57,10 +59,11 @@ def get_platforms(path: Optional[str] = "docker"):
 
 
 def get_docker_tag(platform: str, registry: str) -> str:
+    """:return: docker tag to be used for the container"""
     return "{0}/build.{1}".format(registry, platform)
 
 
-def get_dockerfile(platform: str, path="docker") -> str:
+def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
     return os.path.join(path, "Dockerfile.build.{0}".format(platform))
 
 
@@ -68,18 +71,18 @@ def get_docker_binary(use_nvidia_docker: bool) -> str:
     return "nvidia-docker" if use_nvidia_docker else "docker"
 
 
-def build_docker(platform: str, docker_binary: str, registry: str, 
num_retries: int) -> None:
+def build_docker(platform: str, docker_binary: str, registry: str, 
num_retries: int, use_cache: bool) -> str:
     """
     Build a container for the given platform
     :param platform: Platform
     :param docker_binary: docker binary to use (docker/nvidia-docker)
     :param registry: Dockerhub registry name
     :param num_retries: Number of retries to build the docker image
+    :param use_cache: will pass cache_from to docker to use the previously 
pulled tag
     :return: Id of the top level image
     """
-
     tag = get_docker_tag(platform=platform, registry=registry)
-    logging.info("Building container tagged '%s' with %s", tag, docker_binary)
+    logging.info("Building docker container tagged '%s' with %s", tag, 
docker_binary)
     #
     # We add a user with the same group as the executing non-root user so 
files created in the
     # container match permissions of the local user. Same for the group.
@@ -90,41 +93,29 @@ def build_docker(platform: str, docker_binary: str, 
registry: str, num_retries:
     # cache-from is needed so we use the cached images tagged from the remote 
via
     # docker pull see: docker_cache.load_docker_cache
     #
+    # This also prevents using local layers for caching: 
https://github.com/moby/moby/issues/33002
+    # So to use local caching, we should omit the cache-from by using 
--no-dockerhub-cache argument to this
+    # script.
+    #
     # This doesn't work with multi head docker files.
-    # 
-
-    for i in range(num_retries):
-        logging.info('%d out of %d tries to build the docker image.', i + 1, 
num_retries)
-
-        cmd = [docker_binary, "build",
-               "-f", get_dockerfile(platform),
-               "--build-arg", "USER_ID={}".format(os.getuid()),
-               "--build-arg", "GROUP_ID={}".format(os.getgid()),
-               "--cache-from", tag,
-               "-t", tag,
-               "docker"]
+    #
+    cmd = [docker_binary, "build",
+           "-f", get_dockerfile(platform),
+           "--build-arg", "USER_ID={}".format(os.getuid()),
+           "--build-arg", "GROUP_ID={}".format(os.getgid())]
+    if use_cache:
+        cmd.extend(["--cache-from", tag])
+    cmd.extend(["-t", tag, get_dockerfiles_path()])
+
+    @retry(subprocess.CalledProcessError, tries=num_retries)
+    def run_cmd():
         logging.info("Running command: '%s'", ' '.join(cmd))
-        try:
-            check_call(cmd)
-            # Docker build was successful. Call break to break out of the 
retry mechanism
-            break
-        except subprocess.CalledProcessError as e:
-            saved_exception = e
-            logging.error('Failed to build docker image')
-            # Building the docker image failed. Call continue to trigger the 
retry mechanism
-            continue
-    else:
-        # Num retries exceeded
-        logging.exception('Exception during build of docker image', 
saved_exception)
-        logging.fatal('Failed to build the docker image, aborting...')
-        sys.exit(1)
+        check_call(cmd)
 
+    run_cmd()
     # Get image id by reading the tag. It's guaranteed (except race condition) 
that the tag exists. Otherwise, the
     # check_call would have failed
-    image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
-    if not image_id:
-        raise FileNotFoundError('Unable to find docker image id matching with 
{}'.format(tag))
-    return image_id
+    return _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
 
 
 def _get_local_image_id(docker_binary, docker_tag):
@@ -134,14 +125,17 @@ def _get_local_image_id(docker_binary, docker_tag):
     :return: Image id as string or None if tag does not exist
     """
     cmd = [docker_binary, "images", "-q", docker_tag]
-    image_id_b = subprocess.check_output(cmd)
+    image_id_b = check_output(cmd)
     image_id = image_id_b.decode('utf-8').strip()
+    if not image_id:
+        raise RuntimeError('Unable to find docker image id matching with tag 
{}'.format(docker_tag))
     return image_id
 
 
 def buildir() -> str:
     return os.path.join(get_mxnet_root(), "build")
 
+
 def default_ccache_dir() -> str:
     # Share ccache across containers
     if 'CCACHE_DIR' in os.environ:
@@ -152,6 +146,7 @@ def default_ccache_dir() -> str:
         except PermissionError:
             logging.info('Unable to make dirs at %s, falling back to local 
temp dir', ccache_dir)
     # In osx tmpdir is not mountable by default
+    import platform
     if platform.system() == 'Darwin':
         ccache_dir = "/tmp/_mxnet_ccache"
         os.makedirs(ccache_dir, exist_ok=True)
@@ -166,7 +161,7 @@ def container_run(platform: str,
                   local_ccache_dir: str,
                   command: List[str],
                   dry_run: bool = False,
-                  interactive: bool = False) -> str:
+                  interactive: bool = False) -> int:
     tag = get_docker_tag(platform=platform, registry=docker_registry)
     mx_root = get_mxnet_root()
     local_build_folder = buildir()
@@ -193,15 +188,13 @@ def container_run(platform: str,
         logging.info("Executing:\n%s\n", cmd)
         ret = call(runlist)
 
-    docker_run_cmd = ' '.join(runlist)
     if not dry_run and interactive:
         into_cmd = deepcopy(runlist)
         # -ti can't be after the tag, as is interpreted as a command so hook 
it up after the -u argument
         idx = into_cmd.index('-u') + 2
         into_cmd[idx:idx] = ['-ti']
-        cmd = '\\\n\t'.join(into_cmd)
+        cmd = ' \\\n\t'.join(into_cmd)
         logging.info("Executing:\n%s\n", cmd)
-        docker_run_cmd = ' '.join(into_cmd)
         ret = call(into_cmd)
 
     if not dry_run and not interactive and ret != 0:
@@ -209,11 +202,12 @@ def container_run(platform: str,
         logging.error("You can get into the container by adding the -i option")
         raise subprocess.CalledProcessError(ret, cmd)
 
-    return docker_run_cmd
+    return ret
 
 
 def list_platforms() -> str:
-    print("\nSupported platforms:\n{}".format('\n'.join(get_platforms())))
+    return "\nSupported platforms:\n{}".format('\n'.join(get_platforms()))
+
 
 def load_docker_cache(tag, docker_registry) -> None:
     if docker_registry:
@@ -221,24 +215,34 @@ def load_docker_cache(tag, docker_registry) -> None:
             import docker_cache
             logging.info('Docker cache download is enabled from registry %s', 
docker_registry)
             docker_cache.load_docker_cache(registry=docker_registry, 
docker_tag=tag)
+        # noinspection PyBroadException
         except Exception:
             logging.exception('Unable to retrieve Docker cache. Continue 
without...')
     else:
         logging.info('Distributed docker cache disabled')
 
-def main() -> int:
-    # We need to be in the same directory than the script so the commands in 
the dockerfiles work as
-    # expected. But the script can be invoked from a different path
-    base = os.path.split(os.path.realpath(__file__))[0]
-    os.chdir(base)
 
-    logging.getLogger().setLevel(logging.INFO)
+def log_environment():
+    instance_id = ec2_instance_id_hostname()
+    if instance_id:
+        logging.info("EC2 Instance id: %s", instance_id)
+    pp = pprint.PrettyPrinter(indent=4)
+    logging.debug("Build environment: %s", pp.pformat(dict(os.environ)))
+
 
-    def script_name() -> str:
-        return os.path.split(sys.argv[0])[1]
+def script_name() -> str:
+    return os.path.split(sys.argv[0])[1]
 
+
+def main() -> int:
+    logging.getLogger().setLevel(logging.INFO)
+    logging.getLogger("requests").setLevel(logging.WARNING)
     logging.basicConfig(format='{}: %(asctime)-15s 
%(message)s'.format(script_name()))
 
+    logging.info("MXNet container based build tool.")
+    log_environment()
+    chdir_to_script_directory()
+
     parser = argparse.ArgumentParser(description="""Utility for building and 
testing MXNet on docker
     containers""", epilog="")
     parser.add_argument("-p", "--platform",
@@ -284,8 +288,10 @@ def main() -> int:
                         default=1,
                         type=int)
 
-    parser.add_argument("-c", "--cache", action="store_true",
-                        help="Enable docker registry cache")
+    parser.add_argument("-c", "--no-dockerhub-cache", action="store_true",
+                        help="Disables use of --cache-from option on docker 
build, allowing docker"
+                        " to use local layers for caching. If absent, we use 
the cache from dockerhub"
+                        " which is the default.")
 
     parser.add_argument("command",
                         help="command to run in the container",
@@ -297,35 +303,36 @@ def main() -> int:
                         type=str)
 
     args = parser.parse_args()
+
     def use_cache():
-        return args.cache or under_ci()
+        return not args.no_dockerhub_cache or under_ci()
 
     command = list(chain(*args.command))
     docker_binary = get_docker_binary(args.nvidiadocker)
-    shared_memory_size = args.shared_memory_size
-    num_docker_build_retires = args.docker_build_retries
 
     if args.list:
-        list_platforms()
+        print(list_platforms())
     elif args.platform:
         platform = args.platform
         tag = get_docker_tag(platform=platform, registry=args.docker_registry)
         if use_cache():
             load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-        build_docker(platform, docker_binary, registry=args.docker_registry, 
num_retries=num_docker_build_retires)
+        build_docker(platform=platform, docker_binary=docker_binary, 
registry=args.docker_registry,
+                     num_retries=args.docker_build_retries, 
use_cache=use_cache())
         if args.build_only:
             logging.warning("Container was just built. Exiting due to 
build-only.")
             return 0
 
         if command:
-            container_run(platform=platform, docker_binary=docker_binary, 
shared_memory_size=shared_memory_size,
+            container_run(platform=platform, docker_binary=docker_binary, 
shared_memory_size=args.shared_memory_size,
                           command=command, 
docker_registry=args.docker_registry,
                           local_ccache_dir=args.ccache_dir, 
interactive=args.interactive)
         elif args.print_docker_run:
-            print(container_run(platform=platform, 
docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                                command=[], dry_run=True, 
docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir))
+            print(container_run(platform=platform, 
docker_binary=docker_binary, shared_memory_size=args.shared_memory_size,
+                                command=[], dry_run=True, 
docker_registry=args.docker_registry,
+                                local_ccache_dir=args.ccache_dir))
         elif args.interactive:
-            container_run(platform=platform, docker_binary=docker_binary, 
shared_memory_size=shared_memory_size,
+            container_run(platform=platform, docker_binary=docker_binary, 
shared_memory_size=args.shared_memory_size,
                           command=command, 
docker_registry=args.docker_registry,
                           local_ccache_dir=args.ccache_dir, 
interactive=args.interactive)
 
@@ -334,7 +341,7 @@ def main() -> int:
             assert not args.interactive, "when running with -i must provide a 
command"
             cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", 
"build_{}".format(platform)]
             logging.info("No command specified, trying default build: %s", ' 
'.join(cmd))
-            container_run(platform=platform, docker_binary=docker_binary, 
shared_memory_size=shared_memory_size,
+            container_run(platform=platform, docker_binary=docker_binary, 
shared_memory_size=args.shared_memory_size,
                           command=cmd, docker_registry=args.docker_registry,
                           local_ccache_dir=args.ccache_dir)
 
@@ -346,15 +353,20 @@ def main() -> int:
             tag = get_docker_tag(platform=platform, 
registry=args.docker_registry)
             if use_cache():
                 load_docker_cache(tag=tag, 
docker_registry=args.docker_registry)
-            build_docker(platform, docker_binary, args.docker_registry, 
num_retries=num_docker_build_retires)
+            build_docker(platform, docker_binary, args.docker_registry, 
num_retries=args.docker_build_retries,
+                         use_cache=use_cache())
             if args.build_only:
                 continue
-            build_platform = "build_{}".format(platform)
-            cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", 
build_platform]
             shutil.rmtree(buildir(), ignore_errors=True)
-            container_run(platform=platform, docker_binary=docker_binary, 
shared_memory_size=shared_memory_size,
-                          command=cmd, docker_registry=args.docker_registry, 
local_ccache_dir=args.ccache_dir)
-            plat_buildir = os.path.join(get_mxnet_root(), build_platform)
+            build_platform = "build_{}".format(platform)
+            plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..',
+                                                        
"mxnet_{}".format(build_platform)))
+            if os.path.exists(plat_buildir):
+                logging.warning("{} already exists, 
skipping".format(plat_buildir))
+                continue
+            command = ["/work/mxnet/ci/docker/runtime_functions.sh", 
build_platform]
+            container_run(platform=platform, docker_binary=docker_binary, 
shared_memory_size=args.shared_memory_size,
+                          command=command, 
docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
             shutil.move(buildir(), plat_buildir)
             logging.info("Built files left in: %s", plat_buildir)
 
@@ -383,7 +395,7 @@ Examples:
 
 ./build.py -a
 
-    Builds for all platforms and leaves artifacts in build_<platform>
+    Builds for all platforms and leaves artifacts in build_<platform>.
 
     """)
 
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index a124e10..0b6a42c 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -593,6 +593,9 @@ build_ubuntu_gpu_cmake() {
     ninja -v
 }
 
+build_ubuntu_blc() {
+    echo "pass"
+}
 
 # Testing
 
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
index 7a6d110..bebcb25 100755
--- a/ci/docker_cache.py
+++ b/ci/docker_cache.py
@@ -30,6 +30,7 @@ import argparse
 import sys
 import subprocess
 import json
+from typing import *
 import build as build_util
 
 
@@ -59,7 +60,7 @@ def build_save_containers(platforms, registry, load_cache) -> 
int:
     return 1 if is_error else 0
 
 
-def _build_save_container(platform, registry, load_cache) -> str:
+def _build_save_container(platform, registry, load_cache) -> Optional[str]:
     """
     Build image for passed platform and upload the cache to the specified S3 
bucket
     :param platform: Platform
@@ -77,7 +78,7 @@ def _build_save_container(platform, registry, load_cache) -> 
str:
     logging.debug('Building %s as %s', platform, docker_tag)
     try:
         # Increase the number of retries for building the cache.
-        image_id = build_util.build_docker(docker_binary='docker', 
platform=platform, registry=registry, num_retries=10)
+        image_id = build_util.build_docker(docker_binary='docker', 
platform=platform, registry=registry, num_retries=10, use_cache=True)
         logging.info('Built %s as %s', docker_tag, image_id)
 
         # Push cache to registry
diff --git a/ci/util.py b/ci/util.py
index 22631f3..98605be 100644
--- a/ci/util.py
+++ b/ci/util.py
@@ -17,6 +17,7 @@
 
 import os
 import contextlib
+import requests
 
 def get_mxnet_root() -> str:
     curpath = os.path.abspath(os.path.dirname(__file__))
@@ -41,3 +42,73 @@ def remember_cwd():
     finally: os.chdir(curdir)
 
 
+def retry(target_exception, tries=4, delay_s=1, backoff=2):
+    """Retry calling the decorated function using an exponential backoff.
+
+    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
+
+    :param target_exception: the exception to check. may be a tuple of
+        exceptions to check
+    :type target_exception: Exception or tuple
+    :param tries: number of times to try (not retry) before giving up
+    :type tries: int
+    :param delay_s: initial delay between retries in seconds
+    :type delay_s: int
+    :param backoff: backoff multiplier e.g. value of 2 will double the delay
+        each retry
+    :type backoff: int
+    """
+    import time
+    from functools import wraps
+
+    def decorated_retry(f):
+        @wraps(f)
+        def f_retry(*args, **kwargs):
+            mtries, mdelay = tries, delay_s
+            while mtries > 1:
+                try:
+                    return f(*args, **kwargs)
+                except target_exception as e:
+                    logging.warning("Exception: %s, Retrying in %d 
seconds...", str(e), mdelay)
+                    time.sleep(mdelay)
+                    mtries -= 1
+                    mdelay *= backoff
+            return f(*args, **kwargs)
+
+        return f_retry  # true decorator
+
+    return decorated_retry
+
+
+# noinspection SyntaxError
+def under_ci() -> bool:
+    """:return: True if we run in Jenkins."""
+    return 'JOB_NAME' in os.environ
+
+
+def ec2_instance_id_hostname() -> str:
+    if under_ci():
+        result = []
+        try:
+            r = 
requests.get("http://instance-data/latest/meta-data/instance-id";)
+            if r.status_code == 200:
+                result.append(r.content.decode())
+            r = 
requests.get("http://instance-data/latest/meta-data/public-hostname";)
+            if r.status_code == 200:
+                result.append(r.content.decode())
+            return ' '.join(result)
+        except ConnectionError:
+            pass
+        return '?'
+    else:
+        return ''
+
+
+def chdir_to_script_directory():
+    # We need to be in the same directory than the script so the commands in 
the dockerfiles work as
+    # expected. But the script can be invoked from a different path
+    base = os.path.split(os.path.realpath(__file__))[0]
+    os.chdir(base)
+
+

Reply via email to