This is an automated email from the ASF dual-hosted git repository.
zhasheng pushed a commit to branch v1.x
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/v1.x by this push:
new 9981e84 [CI][1.x] Cherrypick: Upgrade unix gpu toolchain (#18186)
(#18785)
9981e84 is described below
commit 9981e847fff9270268385068c5b7d0c3929e46f9
Author: Chaitanya Prakash Bapat <[email protected]>
AuthorDate: Tue Aug 18 08:32:35 2020 -0700
[CI][1.x] Cherrypick: Upgrade unix gpu toolchain (#18186) (#18785)
* Update unix gpu toolchain (#18186)
* update nvidiadocker command & remove cuda compat
* replace cu101 with cuda since compat is no longer to be used
* skip flaky tests
* get rid of ubuntu_build_cuda and point ubuntu_cu101 to base gpu instead
of cuda compat
* Revert "skip flaky tests"
This reverts commit 1c720fad8791a4518b4012de2e3339a7cdff5d74.
* revert removal of ubuntu_build_cuda
* add linux gpu g4 node to all steps using g3 in unix-gpu pipeline
* remove docker compose files
* add back the caffe test since caffe is deprecated for mx2.0 and not 1.x
* drop nvidia-docker requirement since Docker 19.03 supports GPUs by default
* remove compat from dockerfile
* Cherry-pick #18635 to v1.7.x (#18935)
* Remove mention of nightly in pypi (#18635)
* update bert dev.tsv link
Co-authored-by: Sheng Zha <[email protected]>
* disable tvm in CI functions that rely on libcuda compat
* tvm off for ubuntu_gpu_cmake build
* drop tvm from all unix-gpu builds
Co-authored-by: Carin Meier <[email protected]>
Co-authored-by: Sheng Zha <[email protected]>
---
ci/Jenkinsfile_utils.groovy | 1 +
ci/build.py | 25 ++++------
ci/docker/Dockerfile.build.ubuntu_gpu_cu101 | 1 -
ci/docker/runtime_functions.sh | 57 +++-------------------
ci/jenkins/Jenkins_steps.groovy | 75 ++++++-----------------------
ci/jenkins/Jenkinsfile_unix_gpu | 5 +-
6 files changed, 35 insertions(+), 129 deletions(-)
diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy
index e7aeae9..8ecc7e1 100644
--- a/ci/Jenkinsfile_utils.groovy
+++ b/ci/Jenkinsfile_utils.groovy
@@ -255,6 +255,7 @@ def assign_node_labels(args) {
// knowing about the limitations.
NODE_LINUX_CPU = args.linux_cpu
NODE_LINUX_GPU = args.linux_gpu
+ NODE_LINUX_GPU_G4 = args.linux_gpu_g4
NODE_LINUX_GPU_P3 = args.linux_gpu_p3
NODE_WINDOWS_CPU = args.windows_cpu
NODE_WINDOWS_GPU = args.windows_gpu
diff --git a/ci/build.py b/ci/build.py
index cbc4121..8c2a6e9 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -66,23 +66,18 @@ def get_dockerfile(platform: str,
path=get_dockerfiles_path()) -> str:
return os.path.join(path, "Dockerfile.{0}".format(platform))
-def get_docker_binary(use_nvidia_docker: bool) -> str:
- return "nvidia-docker" if use_nvidia_docker else "docker"
-
-
-def build_docker(platform: str, docker_binary: str, registry: str,
num_retries: int, no_cache: bool,
+def build_docker(platform: str, registry: str, num_retries: int, no_cache:
bool,
cache_intermediate: bool) -> str:
"""
Build a container for the given platform
:param platform: Platform
- :param docker_binary: docker binary to use (docker/nvidia-docker)
:param registry: Dockerhub registry name
:param num_retries: Number of retries to build the docker image
:param no_cache: pass no-cache to docker to rebuild the images
:return: Id of the top level image
"""
tag = get_docker_tag(platform=platform, registry=registry)
- logging.info("Building docker container tagged '%s' with %s", tag,
docker_binary)
+ logging.info("Building docker container tagged '%s'", tag)
#
# We add a user with the same group as the executing non-root user so
files created in the
# container match permissions of the local user. Same for the group.
@@ -99,7 +94,7 @@ def build_docker(platform: str, docker_binary: str, registry:
str, num_retries:
#
# This doesn't work with multi head docker files.
#
- cmd = [docker_binary, "build",
+ cmd = ["docker", "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
@@ -119,19 +114,19 @@ def build_docker(platform: str, docker_binary: str,
registry: str, num_retries:
run_cmd()
# Get image id by reading the tag. It's guaranteed (except race condition)
that the tag exists. Otherwise, the
# check_call would have failed
- image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
+ image_id = _get_local_image_id(docker_tag=tag)
if not image_id:
raise FileNotFoundError('Unable to find docker image id matching with
{}'.format(tag))
return image_id
-def _get_local_image_id(docker_binary, docker_tag):
+def _get_local_image_id(docker_tag):
"""
Get the image id of the local docker layer with the passed tag
:param docker_tag: docker tag
:return: Image id as string or None if tag does not exist
"""
- cmd = [docker_binary, "images", "-q", docker_tag]
+ cmd = ["docker", "images", "-q", docker_tag]
image_id_b = check_output(cmd)
image_id = image_id_b.decode('utf-8').strip()
if not image_id:
@@ -196,8 +191,9 @@ def container_run(docker_client: SafeDockerClient,
# Equivalent command
docker_cmd_list = [
- get_docker_binary(nvidia_runtime),
+ "docker",
'run',
+ "--gpus all" if nvidia_runtime else "",
"--cap-add",
"SYS_PTRACE", # Required by ASAN
'--rm',
@@ -352,7 +348,6 @@ def main() -> int:
args = parser.parse_args()
command = list(chain(*args.command))
- docker_binary = get_docker_binary(args.nvidiadocker)
docker_client = SafeDockerClient()
environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
@@ -366,7 +361,7 @@ def main() -> int:
if args.docker_registry:
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
if not args.run_only:
- build_docker(platform=platform, docker_binary=docker_binary,
registry=args.docker_registry,
+ build_docker(platform=platform, registry=args.docker_registry,
num_retries=args.docker_build_retries,
no_cache=args.no_cache,
cache_intermediate=args.cache_intermediate)
else:
@@ -410,7 +405,7 @@ def main() -> int:
for platform in platforms:
tag = get_docker_tag(platform=platform,
registry=args.docker_registry)
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
- build_docker(platform, docker_binary=docker_binary,
registry=args.docker_registry,
+ build_docker(platform, registry=args.docker_registry,
num_retries=args.docker_build_retries,
no_cache=args.no_cache)
if args.build_only:
continue
diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu101
b/ci/docker/Dockerfile.build.ubuntu_gpu_cu101
index 717a5aa..a17261b 100644
--- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu101
+++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu101
@@ -79,4 +79,3 @@ RUN /work/ubuntu_adduser.sh
COPY runtime_functions.sh /work/
WORKDIR /work/mxnet
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/compat
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 4b544e4..4523e1f 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -767,7 +767,7 @@ build_ubuntu_gpu_mkldnn() {
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
- USE_TVM_OP=1 \
+ USE_TVM_OP=0 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
USE_SIGNAL_HANDLER=1 \
-j$(nproc)
@@ -784,7 +784,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=0 \
- USE_TVM_OP=1 \
+ USE_TVM_OP=0 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
USE_SIGNAL_HANDLER=1 \
-j$(nproc)
@@ -799,7 +799,7 @@ build_ubuntu_gpu_cuda101_cudnn7() {
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
- USE_TVM_OP=1 \
+ USE_TVM_OP=0 \
USE_CPP_PACKAGE=1 \
USE_DIST_KVSTORE=1 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
@@ -827,26 +827,6 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() {
make cython PYTHON=python3
}
-build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() {
- set -ex
- build_ccache_wrappers
- make \
- DEV=1 \
- USE_BLAS=openblas \
- USE_MKLDNN=0 \
- USE_CUDA=1 \
- USE_CUDA_PATH=/usr/local/cuda \
- USE_CUDNN=1 \
- USE_TVM_OP=0 \
- USE_CPP_PACKAGE=1 \
- USE_DIST_KVSTORE=1 \
- CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
- USE_SIGNAL_HANDLER=1 \
- -j$(nproc)
-
- make cython PYTHON=python3
-}
-
build_ubuntu_amalgamation() {
set -ex
# Amalgamation can not be run with -j nproc
@@ -874,7 +854,7 @@ build_ubuntu_gpu_cmake_mkldnn() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=1 \
-DUSE_CUDNN=1 \
- -DUSE_TVM_OP=1 \
+ -DUSE_TVM_OP=0 \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_MKLML_MKL=1 \
-DCMAKE_BUILD_TYPE=Release \
@@ -893,7 +873,7 @@ build_ubuntu_gpu_cmake() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
- -DUSE_TVM_OP=ON \
+ -DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
@@ -916,7 +896,7 @@ build_ubuntu_gpu_cmake_no_rtc() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
- -DUSE_TVM_OP=ON \
+ -DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
@@ -932,29 +912,6 @@ build_ubuntu_gpu_cmake_no_rtc() {
ninja
}
-build_ubuntu_gpu_cmake_no_tvm_op() {
- set -ex
- cd /work/build
- build_ccache_wrappers
- cmake \
- -DUSE_SIGNAL_HANDLER=ON \
- -DUSE_CUDA=ON \
- -DUSE_CUDNN=ON \
- -DUSE_TVM_OP=OFF \
- -DPython3_EXECUTABLE=/usr/bin/python3 \
- -DUSE_MKL_IF_AVAILABLE=OFF \
- -DUSE_MKLML_MKL=OFF \
- -DUSE_MKLDNN=OFF \
- -DUSE_DIST_KVSTORE=ON \
- -DCMAKE_BUILD_TYPE=Release \
- -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
- -DBUILD_CYTHON_MODULES=1 \
- -G Ninja \
- /work/mxnet
-
- ninja
-}
-
build_ubuntu_cpu_large_tensor() {
set -ex
cd /work/build
@@ -980,7 +937,7 @@ build_ubuntu_gpu_large_tensor() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
- -DUSE_TVM_OP=ON \
+ -DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 98c774b..c4fd96e 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -143,7 +143,7 @@ def compile_unix_int64_cpu() {
def compile_unix_int64_gpu() {
return ['GPU: USE_INT64_TENSOR_SIZE': {
- node(NODE_LINUX_GPU) {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/build-gpu-int64') {
timeout(time: max_time, unit: 'MINUTES') {
utils.init_git()
@@ -253,20 +253,6 @@ def compile_unix_full_gpu_mkldnn_cpp_test() {
}]
}
-def compile_unix_full_gpu_no_tvm_op() {
- return ['GPU: CUDA10.1+cuDNN7 TVM_OP OFF': {
- node(NODE_LINUX_CPU) {
- ws('workspace/build-gpu-no-tvm-op') {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- utils.docker_run('ubuntu_build_cuda',
'build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op', false)
- utils.pack_lib('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op)
- }
- }
- }
- }]
-}
-
def compile_unix_cmake_mkldnn_gpu() {
return ['GPU: CMake MKLDNN': {
node(NODE_LINUX_CPU) {
@@ -295,19 +281,6 @@ def compile_unix_cmake_gpu() {
}]
}
-def compile_unix_cmake_gpu_no_tvm_op() {
- return ['GPU: CMake TVM_OP OFF': {
- node(NODE_LINUX_CPU) {
- ws('workspace/build-cmake-gpu-no-tvm-op') {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- utils.docker_run('ubuntu_gpu_cu101',
'build_ubuntu_gpu_cmake_no_tvm_op', false)
- }
- }
- }
- }]
-}
-
def compile_unix_cmake_gpu_no_rtc() {
return ['GPU: CMake CUDA RTC OFF': {
node(NODE_LINUX_CPU) {
@@ -750,7 +723,7 @@ def test_unix_python3_mkl_cpu() {
def test_unix_python3_gpu() {
return ['Python3: GPU': {
- node(NODE_LINUX_GPU) {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/ut-python3-gpu') {
try {
utils.unpack_and_init('gpu', mx_lib_cython)
@@ -764,22 +737,6 @@ def test_unix_python3_gpu() {
}]
}
-def test_unix_python3_gpu_no_tvm_op() {
- return ['Python3: GPU TVM_OP OFF': {
- node(NODE_LINUX_GPU) {
- ws('workspace/ut-python3-gpu-no-tvm-op') {
- try {
- utils.unpack_and_init('gpu_no_tvm_op',
mx_lib_cpp_examples_no_tvm_op)
- python3_gpu_ut_cython('ubuntu_gpu_cu101')
- utils.publish_test_coverage()
- } finally {
- utils.collect_test_results_unix('nosetests_gpu.xml',
'nosetests_python3_gpu.xml')
- }
- }
- }
- }]
-}
-
def test_unix_python3_quantize_gpu() {
return ['Python3: Quantize GPU': {
node(NODE_LINUX_GPU_P3) {
@@ -866,7 +823,7 @@ def test_unix_python3_mkldnn_mkl_cpu() {
def test_unix_python3_mkldnn_gpu() {
return ['Python3: MKLDNN-GPU': {
- node(NODE_LINUX_GPU) {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/ut-python3-mkldnn-gpu') {
try {
utils.unpack_and_init('mkldnn_gpu', mx_mkldnn_lib)
@@ -882,7 +839,7 @@ def test_unix_python3_mkldnn_gpu() {
def test_unix_python3_mkldnn_nocudnn_gpu() {
return ['Python3: MKLDNN-GPU-NOCUDNN': {
- node(NODE_LINUX_GPU) {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/ut-python3-mkldnn-gpu-nocudnn') {
try {
utils.unpack_and_init('mkldnn_gpu_nocudnn', mx_mkldnn_lib)
@@ -916,7 +873,7 @@ def test_unix_python3_tensorrt_gpu() {
def test_unix_python3_integration_gpu() {
return ['Python Integration GPU': {
- node(NODE_LINUX_GPU) {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/it-python-gpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('gpu', mx_lib)
@@ -944,8 +901,8 @@ def test_unix_caffe_gpu() {
}
def test_unix_cpp_package_gpu() {
- return ['cpp-package GPU': {
- node(NODE_LINUX_GPU) {
+ return ['cpp-package GPU Makefile': {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/it-cpp-package') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('gpu', mx_lib_cpp_examples)
@@ -958,8 +915,8 @@ def test_unix_cpp_package_gpu() {
}
def test_unix_capi_cpp_package() {
- return ['capi-cpp-package GPU': {
- node(NODE_LINUX_GPU) {
+ return ['capi-cpp-package GPU Makefile': {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/it-capi-cpp-package') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('gpu_mkldnn_cpp_test', mx_lib_cpp_capi)
@@ -1000,8 +957,8 @@ def test_unix_scala_mkldnn_cpu(){
}
def test_unix_scala_gpu() {
- return ['Scala: GPU': {
- node(NODE_LINUX_GPU) {
+ return ['Scala: GPU Makefile': {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/ut-scala-gpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('gpu', mx_lib)
@@ -1084,7 +1041,7 @@ def test_unix_perl_cpu() {
def test_unix_cpp_gpu() {
return ['Cpp: GPU': {
- node(NODE_LINUX_GPU) {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/ut-cpp-gpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('cmake_gpu', mx_cmake_lib)
@@ -1125,8 +1082,8 @@ def test_unix_cpp_cpu() {
}
def test_unix_perl_gpu() {
- return ['Perl: GPU': {
- node(NODE_LINUX_GPU) {
+ return ['Perl: GPU Makefile': {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/ut-perl-gpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('gpu', mx_lib)
@@ -1140,7 +1097,7 @@ def test_unix_perl_gpu() {
def test_unix_r_gpu() {
return ['R: GPU': {
- node(NODE_LINUX_GPU) {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/ut-r-gpu') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('gpu', mx_lib)
@@ -1208,7 +1165,7 @@ def test_unix_distributed_kvstore_cpu() {
def test_unix_distributed_kvstore_gpu() {
return ['dist-kvstore tests GPU': {
- node(NODE_LINUX_GPU) {
+ node(NODE_LINUX_GPU_G4) {
ws('workspace/it-dist-kvstore') {
timeout(time: max_time, unit: 'MINUTES') {
utils.unpack_and_init('gpu', mx_lib)
diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu
index f8c28d5..5e26a9f 100644
--- a/ci/jenkins/Jenkinsfile_unix_gpu
+++ b/ci/jenkins/Jenkinsfile_unix_gpu
@@ -29,7 +29,7 @@ node('utility') {
utils = load('ci/Jenkinsfile_utils.groovy')
custom_steps = load('ci/jenkins/Jenkins_steps.groovy')
}
-utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu',
linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3')
+utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu',
linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4:
'mxnetlinux-gpu-g4')
utils.main_wrapper(
core_logic: {
@@ -41,8 +41,6 @@ core_logic: {
custom_steps.compile_unix_cmake_gpu(),
custom_steps.compile_unix_tensorrt_gpu(),
custom_steps.compile_unix_int64_gpu(),
- custom_steps.compile_unix_full_gpu_no_tvm_op(),
- custom_steps.compile_unix_cmake_gpu_no_tvm_op(),
custom_steps.compile_unix_cmake_gpu_no_rtc(),
custom_steps.compile_unix_full_gpu_mkldnn_cpp_test()
])
@@ -63,7 +61,6 @@ core_logic: {
custom_steps.test_unix_distributed_kvstore_gpu(),
custom_steps.test_static_python_gpu(),
custom_steps.test_static_python_gpu_cmake(),
- custom_steps.test_unix_python3_gpu_no_tvm_op(),
custom_steps.test_unix_capi_cpp_package(),
// Disabled due to: https://github.com/apache/incubator-mxnet/issues/11407