This is an automated email from the ASF dual-hosted git repository.

lausen pushed a change to branch leezu-patch-1
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git.


 discard 8a9599b  Disable MKL_USE_STATIC_LIBS by default
     add 2fff11d  [OpPerf] Fix axis_shape and function mismatch for LTS (#17894)
     add 4c0f763  Fix a typo (#17979)
     add 1b107a0  Remove redundant condition in np_matrix_op.cc (#17933)
     add 84b0ddd  Add USE_DIST_KVSTORE=ON to GPU build (#17911)
     add 03b8146  Skip test_kvstore_gpu.test_rsp_push_pull (#17983)
     add ff234db  Skip test_gluon_data.py on OSX (#17969)
     add c244f9f  [MXNET-#16795] Byteps-KVStore: Intergrate Byteps into mxnet as new type of kvstore backend (#17555)
     add 5adcbf8  GPU gemms true fp16 (#17466)
     add c3c76a8  Optimize AddTakeGrad Tensor Sum (#17906)
     add 002d4f1  * impl - FFi for linalg op (#17795)
     add 79c576b  [ONNX export] Fixing spatial export for batchnorm (#17711)
     add 892f982  * impl - linalg.lstsq for cpu (#17950)
     add a960f5a  ffi_array_split, v/h/dsplit (#17873)
     add f906a02  ffi_atleast_1/2/3d (#17897)
     add 16ddc6d  Custom Operator Random Number Generator Support (#17762)
     add b7f7525  dnnl v1.2.2 (#17991)
     add 13841dd  [mkldnn] optimize for mkldnn batchnorm backward (#17902)
     add da95add  Fix vector access out of bound in MKLDNNConvolutionBackward (#17997)
     add 6cc990c  Revert "[MXNET-#16795] Byteps-KVStore: Intergrate Byteps into mxnet as new type of kvstore backend (#17555)" (#17998)
     add d97dead  [Numpy] allow mix integer dtypes for power/add/multiply (#17921)
     add 178b98e  [Numpy] OP_interp (#17793)
     add d1616c9  [numpy] FFI binary bitwise ops  (#17812)
     add 664889a  [MKL-DNN] Integrate Conv3d and Pool3d/1d (#17884)
     add 5ff2994  [Website 2.0] General Version Dropdown (#17948)
     add 6a931c7  Add np.linalg.qr (#17851)
     add 0eeb337  Improve activation backward (#17973)
     add d8c7293  fix np flip when axis input contains negative number (#17880)
     add 0d87aa8  [numpy] FFI for insert \ delete \ matmul etc. (#17759)
     add 3dab617  add: numpy rollaxis (#17865)
     add af76466  add: numpy op tril_indices (#17904)
     add 8f82cd8  [MKLDNN] support using any format in pooling backward (#17900)
     add 57c785a  [Numpy][Bug Fix] Fix Wrong Result in Numpy Operator `where` (#17899)
     add e4afe50  * impl debug - FFI for linalg multioutput op (#17879)
     add 2be2027  Add instructions on distributed MXNet with Horovod on Kubernetes (#17974)
     add 249b9a1  Fix cudnn Dropout reproducibility (#17547)
     add a6fef3f  [Website 2.0] Nightly Build for v2.x (master) (#17957)
     add 1679ade  fixes #17918; update ruby & jekyll, remove incompatible plugins (#17927)
     add 07b8d7a  Fix ElemwiseSum for more than 4 inputs (#17995)
     add 0597f87  updating stash regex and copy command to fix nightly imagenet_inference (#18021)
     add 58911b5  fixing hyperlinks in python tutorial (#17929)
     add c9f8caa  Update 3rdparty/mkldnn remote URL and pin to v1.3 (#17972)
     add 8d065cc  [Numpy] FFI Invocation for Unary Ops (#17779)
     add 6692d2c  [Bug Fix] support multiple-dim input for unravel_index (#17748)
     add 37c9dd6  Fix for handling negative indices in the fusion of slice (#17937)
     add a1fa6a8  Workaround gnu_tls handshake error on Ubuntu 14.04 Nvidia Docker (#18018)
     add 7dd7e7e  Fix issue of zeros gradients w.r.t. RNN bias when num_layers > 1 (#17872)
     add 02ac75e  Fix typo in crash course website (#18010)
     add 0bff90d  Support projection feature of LSTM (#17996)
     add 8f6d116  [MKL-DNN] BatchNormRelu Fusion (#17679)
     add 7a59239  Remove unused files in Website doc (#16722)
     add e3d7866  [Numpy] FFI: random.choice, take and clip (#17854)
     add 8e3f0f3  * impl - linalg matrix_rank for cpu and gpu implemented (#18020)
     add a044744  [Numpy] FFI for linalg.qr and linalg.lstsq (#18040)
     add f3cfaf9  ffi random (#18051)
     add fb73a17  Switch to C++17 and modernize toolchain + CI (#17984)
     add e796ae9  Integrate Horovod training API as part of MXNet native distributed training API (#17531)
     add c7d2b3c  [NumPy] Add NumPy support for triu (#17614)
     add 2c4732b  Fix CI (#18056)
     add ce48a9d  Remove code owner (#17928)
     add 94f235d  [numpy] add new ffi for column_stack and hstack (#17831)
     add 37dbbd4  Fix CD (#18072)
     add 5c768f0  [Numpy Extension] Add stop_gradient to npx (#18076)
     add afae030  No tensor cores for fp32 interleaved attention, remove div by 8 restriction (#17994)
     add 9337137  For mxnet-validation pipeline, require sanity build to complete successfully before running other build pipelines. (#17999)
     add 7bef85e  [Numpy] Add ffi for np.sum, np.std, np.var, np.average and np.histogram (#17866)
     add bd0816e  Add np.linalg.qr backward (#18050)
     add 5155095  add zero grad for npi_unique (#18080)
     add b7d1c69  [Numpy] add new ffi for np.linalg.norm (#18066)
     add cf93bdc  [Numpy] New FFIs for Operator: tile, trace, transpose (#18017)
     add b01d1dc  Add gelu fuse ops (#18082)
     add 4a244dc  CI/CD: Remove cuda 9.0, 9.1 (#18087)
     add f882de0  [Numpy] add cross product op (#17637)
     add 5542d03  Fix and optimize handling of vectorized memory accesses (#17767)
     add dcada9b  [Numpy] FFI: random.shuffle, equal, not_equal, less_equal, greater_equal, less, maximum and minimum (#17896)
     add 586c8ab  CI: Simplify CentOS7 CI/CD config (#18093)
     add 1c0416c  Disable MKL_USE_STATIC_LIBS by default

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user force-pushes a change (git push --force) and generates a
repository containing something like this:

 * -- * -- B -- O -- O -- O   (8a9599b)
            \
             N -- N -- N   refs/heads/leezu-patch-1 (1c0416c)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

No new revisions were added by this update.

Summary of changes:
 .github/workflows/os_x_staticbuild.yml             |    3 +-
 .gitmodules                                        |    2 +-
 3rdparty/dmlc-core                                 |    2 +-
 3rdparty/mkldnn                                    |    2 +-
 3rdparty/mshadow/guide/Makefile                    |    2 +-
 3rdparty/mshadow/guide/mshadow-ps/Makefile         |    2 +-
 3rdparty/mshadow/make/mshadow.mk                   |    4 +-
 3rdparty/mshadow/mshadow/base.h                    |   72 -
 3rdparty/mshadow/mshadow/half2.h                   |  143 --
 3rdparty/mshadow/mshadow/logging.h                 |    5 +
 3rdparty/mshadow/mshadow/packet-inl.h              |    4 +
 3rdparty/mshadow/mshadow/random.h                  |  118 +-
 3rdparty/mshadow/mshadow/tensor_cpu-inl.h          |    9 +-
 3rdparty/mshadow/test/Makefile                     |    2 +-
 CMakeLists.txt                                     |   61 +-
 CODEOWNERS                                         |   26 +-
 Makefile                                           |   34 +-
 amalgamation/Makefile                              |    7 +-
 amalgamation/amalgamation.py                       |    2 +-
 benchmark/opperf/README.md                         |    4 +-
 .../nd_operations/array_manipulation_operators.py  |    8 +-
 benchmark/opperf/rules/default_params.py           |    1 -
 benchmark/python/ffi/benchmark_ffi.py              |   87 ++
 cd/Jenkinsfile_cd_pipeline                         |    2 +-
 cd/Jenkinsfile_release_job                         |    2 +-
 cd/README.md                                       |    1 -
 cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy       |    4 +-
 cd/mxnet_lib/mxnet_lib_pipeline.groovy             |   10 +-
 cd/mxnet_lib/static/Jenkins_pipeline.groovy        |    9 +-
 cd/python/docker/Jenkins_pipeline.groovy           |    5 +-
 cd/python/pypi/Jenkins_pipeline.groovy             |    5 +-
 cd/python/pypi/pypi_package.sh                     |    2 +-
 cd/utils/artifact_repository.md                    |    4 +-
 cd/utils/artifact_repository.py                    |    6 +-
 cd/utils/docker_tag.sh                             |    4 +-
 cd/utils/mxnet_base_image.sh                       |    6 -
 ci/README.md                                       |   95 +-
 ci/build.py                                        |   11 +-
 ci/dev_menu.py                                     |    2 +-
 ci/docker/Dockerfile.build.android_armv7           |   94 +-
 ci/docker/Dockerfile.build.android_armv8           |   92 +-
 ci/docker/Dockerfile.build.armv6                   |   45 +-
 ci/docker/Dockerfile.build.armv7                   |   54 +-
 ci/docker/Dockerfile.build.armv8                   |   56 +-
 ci/docker/Dockerfile.build.jetson                  |   96 +-
 ci/docker/Dockerfile.build.test.arm_qemu           |   47 -
 ....ubuntu1404_cpu => Dockerfile.build.test.armv7} |   24 +-
 ....ubuntu1404_cpu => Dockerfile.build.test.armv8} |   24 +-
 ci/docker/Dockerfile.build.ubuntu_build_cuda       |   11 +-
 ci/docker/Dockerfile.build.ubuntu_cpu              |    8 +-
 ci/docker/Dockerfile.build.ubuntu_cpu_jekyll       |   10 +-
 ci/docker/Dockerfile.build.ubuntu_cpu_julia        |    8 +-
 ci/docker/Dockerfile.build.ubuntu_cpu_r            |    3 +
 ci/docker/Dockerfile.build.ubuntu_cpu_scala        |    3 +
 ci/docker/Dockerfile.build.ubuntu_gpu_cu100        |   84 --
 ci/docker/Dockerfile.build.ubuntu_gpu_cu101        |   14 +-
 ci/docker/Dockerfile.build.ubuntu_gpu_cu102        |   85 --
 ci/docker/Dockerfile.build.ubuntu_gpu_cu80         |   79 --
 ci/docker/Dockerfile.build.ubuntu_gpu_cu90         |   85 --
 ci/docker/Dockerfile.build.ubuntu_gpu_cu92         |   84 --
 ci/docker/Dockerfile.build.ubuntu_nightly_cpu      |   11 +-
 ci/docker/Dockerfile.build.ubuntu_nightly_gpu      |   11 +-
 ....centos7_cpu => Dockerfile.publish.centos7_cpu} |    9 +-
 ...s7_gpu => Dockerfile.publish.centos7_gpu_cu100} |   17 +-
 ...s7_gpu => Dockerfile.publish.centos7_gpu_cu101} |   17 +-
 ...s7_gpu => Dockerfile.publish.centos7_gpu_cu102} |   17 +-
 ...os7_gpu => Dockerfile.publish.centos7_gpu_cu92} |   15 +-
 ci/docker/Dockerfile.publish.test.centos7_cpu      |   12 +-
 ci/docker/Dockerfile.publish.test.centos7_gpu      |   12 +-
 ci/docker/Dockerfile.publish.test.ubuntu1404_gpu   |   40 -
 ci/docker/Dockerfile.publish.ubuntu1404_gpu        |   39 -
 ci/docker/install/android_armv7_openblas.sh        |   31 -
 ci/docker/install/android_ndk.sh                   |   38 -
 ci/docker/install/arm64_openblas.sh                |   35 -
 ci/docker/install/centos7_base.sh                  |   46 -
 ci/docker/install/centos7_ccache.sh                |    8 +-
 ci/docker/install/centos7_core.sh                  |   10 +
 .../install/{arm_openblas.sh => centos7_nccl.sh}   |   19 +-
 ci/docker/install/deb_ubuntu_ccache.sh             |   26 +-
 ci/docker/install/{arm_openblas.sh => thrust.sh}   |   17 +-
 ci/docker/install/ubuntu_arm_qemu.sh               |   37 -
 ci/docker/install/ubuntu_arm_qemu_bin.sh           |   40 -
 ci/docker/install/ubuntu_gcc8.sh                   |    2 +-
 ci/docker/install/ubuntu_publish.sh                |   88 --
 ci/docker/install/ubuntu_r.sh                      |    5 +-
 ci/docker/install/ubuntu_scala.sh                  |   26 +-
 ci/docker/qemu/README.md                           |   18 -
 ci/docker/qemu/runtime_functions.py                |  134 --
 ci/docker/qemu/vmcontrol.py                        |  360 -----
 ci/docker/runtime_functions.sh                     |  277 ++--
 .../aarch64-linux-gnu-toolchain.cmake}             |   24 +-
 .../arm-linux-gnueabihf-toolchain.cmake}           |   19 +-
 ci/jenkins/Jenkins_steps.groovy                    |  121 +-
 ci/jenkins/Jenkinsfile_centos_cpu                  |    5 +-
 ci/jenkins/Jenkinsfile_centos_gpu                  |   10 +-
 ci/jenkins/Jenkinsfile_clang                       |    4 +-
 ci/jenkins/Jenkinsfile_edge                        |    7 +-
 .../{Jenkinsfile_centos_gpu => Jenkinsfile_full}   |   46 +-
 ci/jenkins/Jenkinsfile_unix_cpu                    |    3 -
 ci/jenkins/Jenkinsfile_unix_gpu                    |    2 -
 ...file_centos_cpu => Jenkinsfile_website_nightly} |   25 +-
 ci/publish/Jenkinsfile                             |    2 +-
 ci/publish/README.md                               |    7 +-
 ci/qemu/README.md                                  |   92 --
 ci/qemu/copy.sh                                    |   23 -
 ci/qemu/init.sh                                    |   23 -
 ci/qemu/initrd_modif/inittab                       |   38 -
 ci/qemu/install.sh                                 |   32 -
 ci/qemu/mxnet_requirements.txt                     |    7 -
 ci/qemu/preseed.cfg                                |   68 -
 ci/qemu/preseed.sh                                 |   29 -
 ci/qemu/run.sh                                     |   33 -
 ci/qemu/test_requirements.txt                      |    3 -
 ci/safe_docker_run.py                              |    2 +-
 cmake/Modules/FindNCCL.cmake                       |   10 +-
 cmake/upstream/FindCUDAToolkit.cmake               |  205 ++-
 config/distribution/linux_cu100.cmake              |    3 +-
 config/distribution/linux_cu101.cmake              |    3 +-
 config/distribution/linux_cu102.cmake              |    3 +-
 config/distribution/linux_cu75.cmake               |   35 -
 config/distribution/linux_cu80.cmake               |   35 -
 config/distribution/linux_cu90.cmake               |   35 -
 config/distribution/linux_cu91.cmake               |   35 -
 config/distribution/linux_cu92.cmake               |    3 +-
 cpp-package/example/Makefile                       |    2 +-
 cpp-package/example/example.mk                     |    4 +-
 cpp-package/example/feature_extract/Makefile       |    4 +-
 cpp-package/example/inference/Makefile             |    2 +-
 cpp-package/example/inference/inference.mk         |    4 +-
 .../docker-python/Dockerfile.mxnet.python.gpu.cu90 |   29 -
 .../Dockerfile.mxnet.python.gpu.cu90.mkl           |   29 -
 .../Dockerfile.mxnet.python3.gpu.cu90              |   29 -
 .../Dockerfile.mxnet.python3.gpu.cu90.mkl          |   29 -
 docker/docker-python/README.md                     |    8 -
 docker/docker-python/build_python_dockerfile.sh    |   21 +-
 .../python/tutorials/deploy/run-on-aws/use_ec2.rst |    6 +-
 .../getting-started/crash-course/4-train.md        |    2 +-
 .../getting-started/crash-course/5-predict.md      |    2 +-
 .../themes/mx-theme/mxtheme/header_top.html        |   17 +
 .../themes/mx-theme/mxtheme/layout.html            |   48 +
 docs/static_site/Makefile                          |    2 +-
 docs/static_site/src/Gemfile                       |    8 +-
 docs/static_site/src/Gemfile.lock                  |   73 +-
 docs/static_site/src/_config_beta.yml              |    4 +-
 docs/static_site/src/_config_prod.yml              |    4 +-
 docs/static_site/src/_includes/head.html           |   39 +
 docs/static_site/src/_includes/header.html         |   17 +
 docs/static_site/src/pages/api/faq/env_var.md      |    8 +
 .../src/pages/get_started/windows_setup.md         |    4 +-
 example/distributed_training-horovod/README.md     |    7 +-
 .../distributed_training/cifar10_kvstore_hvd.py    |  237 ++++
 example/extensions/lib_custom_op/relu_lib.cu       |   90 +-
 example/extensions/lib_custom_op/test_relu.py      |   43 +-
 example/image-classification/predict-cpp/Makefile  |    6 +-
 example/multi_threaded_inference/Makefile          |    2 +-
 .../multi_threaded_inference.cc                    |    5 +-
 example/rnn/large_word_lm/setup.py                 |    2 +-
 include/mxnet/base.h                               |   12 -
 include/mxnet/lib_api.h                            |   57 +-
 include/mxnet/random_generator.h                   |    8 +
 include/mxnet/resource.h                           |    2 +-
 include/mxnet/runtime/ffi_helper.h                 |   18 +
 include/mxnet/runtime/object.h                     |    1 +
 julia/deps/build.jl                                |    2 +-
 julia/src/base.jl                                  |    1 +
 make/crosscompile.jetson.mk                        |  216 ---
 make/staticbuild/linux_cu100.mk                    |    8 +
 make/staticbuild/linux_cu101.mk                    |    8 +
 make/staticbuild/linux_cu102.mk                    |    8 +
 make/staticbuild/linux_cu75.mk                     |  167 ---
 make/staticbuild/linux_cu80.mk                     |  170 ---
 make/staticbuild/linux_cu90.mk                     |  172 ---
 make/staticbuild/linux_cu91.mk                     |  172 ---
 make/staticbuild/linux_cu92.mk                     |    8 +
 perl-package/AI-MXNet/t/test_init.t                |    5 +-
 python/mxnet/_ffi/_cython/convert.pxi              |    6 +
 python/mxnet/_ffi/node_generic.py                  |    2 +
 python/mxnet/_numpy_op_doc.py                      |  289 +---
 .../mxnet/contrib/onnx/mx2onnx/_op_translations.py |    9 +-
 python/mxnet/gluon/contrib/nn/basic_layers.py      |    3 +-
 python/mxnet/gluon/nn/basic_layers.py              |  142 +-
 python/mxnet/gluon/rnn/rnn_layer.py                |    1 +
 python/mxnet/gluon/trainer.py                      |    1 +
 python/mxnet/kvstore/__init__.py                   |    1 +
 python/mxnet/kvstore/horovod.py                    |  161 +++
 python/mxnet/kvstore/kvstore.py                    |    3 +
 python/mxnet/ndarray/numpy/_op.py                  | 1081 ++++++++++++---
 python/mxnet/ndarray/numpy/linalg.py               |  226 +++-
 python/mxnet/ndarray/numpy/random.py               |  192 +--
 python/mxnet/ndarray/sparse.py                     |    2 +-
 python/mxnet/ndarray_doc.py                        |  103 +-
 python/mxnet/numpy/fallback.py                     |    4 -
 python/mxnet/numpy/fallback_linalg.py              |    8 +-
 python/mxnet/numpy/linalg.py                       |  173 ++-
 python/mxnet/numpy/multiarray.py                   |  741 ++++++++++-
 python/mxnet/numpy/random.py                       |   39 +-
 python/mxnet/numpy_dispatch_protocol.py            |    5 +
 python/mxnet/symbol/numpy/_symbol.py               |  544 +++++++-
 python/mxnet/symbol/numpy/linalg.py                |  173 ++-
 python/mxnet/symbol/numpy/random.py                |   11 +-
 python/mxnet/symbol_doc.py                         |  215 +--
 python/setup.py                                    |    4 +-
 .../assembly/src/main/assembly/assembly.xml        |    2 +-
 .../apache/mxnet/util/NativeLibraryLoader.scala    |    2 +-
 src/api/_api_internal/_api_internal.cc             |   10 +
 .../np_det.cc}                                     |   25 +-
 .../np_eig.cc}                                     |   44 +-
 .../np_eigvals.cc}                                 |   41 +-
 .../np_inv.cc}                                     |   25 +-
 .../numpy/{np_cumsum.cc => linalg/np_lstsq.cc}     |   60 +-
 .../np_matrix_rank.cc}                             |   63 +-
 .../numpy/{np_cumsum.cc => linalg/np_norm.cc}      |   46 +-
 .../{np_tensordot_op.cc => linalg/np_pinv.cc}      |   60 +-
 .../np_potrf.cc}                                   |   30 +-
 .../np_qr.cc}                                      |   28 +-
 .../np_slogdet.cc}                                 |   26 +-
 .../np_solve.cc}                                   |   27 +-
 .../np_tensorinv.cc}                               |   30 +-
 .../np_tensorsolve.cc}                             |   36 +-
 src/api/operator/numpy/np_bincount_op.cc           |    4 +-
 .../operator/numpy/np_broadcast_reduce_op_value.cc |  105 +-
 ...np_broadcast_reduce_op_value.cc => np_cross.cc} |   32 +-
 src/api/operator/numpy/np_cumsum.cc                |    4 +-
 src/api/operator/numpy/np_delete_op.cc             |  101 ++
 ...ast_op.cc => np_elemwise_broadcast_logic_op.cc} |   57 +-
 src/api/operator/numpy/np_elemwise_broadcast_op.cc |   59 +
 .../operator/numpy/np_elemwise_unary_op_basic.cc   |   93 ++
 src/api/operator/numpy/np_histogram_op.cc          |   81 ++
 src/api/operator/numpy/np_init_op.cc               |  207 ++-
 src/api/operator/numpy/np_insert_op.cc             |  156 +++
 .../numpy/{np_bincount_op.cc => np_interp_op.cc}   |   61 +-
 ...roadcast_reduce_op_value.cc => np_matmul_op.cc} |   35 +-
 src/api/operator/numpy/np_matrix_op.cc             |  256 +++-
 src/api/operator/numpy/np_moments_op.cc            |  209 +++
 src/api/operator/numpy/np_tensordot_op.cc          |    4 +-
 ...broadcast_reduce_op_value.cc => np_trace_op.cc} |   42 +-
 ..._broadcast_reduce_op_value.cc => np_triu_op.cc} |   32 +-
 src/api/operator/numpy/np_window_op.cc             |   79 ++
 .../random/{np_laplace_op.cc => np_choice_op.cc}   |   57 +-
 .../{np_cumsum.cc => random/np_exponential_op.cc}  |   58 +-
 src/api/operator/numpy/random/np_laplace_op.cc     |    2 +-
 .../operator/numpy/random/np_location_scale_op.cc  |  150 +++
 .../numpy/{np_cumsum.cc => random/np_pareto_op.cc} |   57 +-
 .../numpy/{np_cumsum.cc => random/np_power_op.cc}  |   57 +-
 .../{np_cumsum.cc => random/np_rayleigh_op.cc}     |   57 +-
 .../{np_cumsum.cc => random/np_weibull_op.cc}      |   57 +-
 src/api/operator/random/np_gamma_op.cc             |  108 ++
 src/api/operator/random/np_normal_op.cc            |   96 ++
 src/api/operator/random/np_uniform_op.cc           |   96 ++
 .../shuffle_op.cc}                                 |   39 +-
 .../elemwise_binary_broadcast_op_extended.cc}      |   35 +-
 .../{numpy/np_cumsum.cc => tensor/indexing_op.cc}  |   67 +-
 src/api/operator/tensor/matrix_op.cc               |   92 ++
 src/api/operator/ufunc_helper.cc                   |   24 +-
 src/api/operator/ufunc_helper.h                    |   15 +-
 src/api/operator/utils.cc                          |    5 +
 src/api/operator/utils.h                           |   15 +-
 src/c_api/c_api.cc                                 |   41 +-
 src/c_api/c_api_executor.cc                        |    2 +-
 src/common/cuda_vectorization.cuh                  |  283 ++++
 src/common/random_generator.cu                     |    5 +
 src/executor/pointwise_fusion_pass.cc              |   14 +
 src/operator/c_lapack_api.cc                       |   32 +
 src/operator/c_lapack_api.h                        |  136 +-
 src/operator/contrib/batch_norm_relu.cc            |  298 +++++
 src/operator/contrib/dgl_graph.cc                  |   73 +-
 src/operator/contrib/transformer.cu                |   83 +-
 src/operator/fusion/fused_op-inl.h                 |   31 +-
 src/operator/fusion/fused_op.cu                    |   45 +-
 src/operator/linalg_impl.h                         |   53 +-
 src/operator/mshadow_op.h                          |   67 -
 src/operator/nn/batch_norm.cc                      |    8 +-
 src/operator/nn/dropout-inl.h                      |   11 +-
 src/operator/nn/dropout.cc                         |    1 +
 src/operator/nn/mkldnn/mkldnn_act.cc               |   41 +-
 src/operator/nn/mkldnn/mkldnn_base-inl.h           |   41 +-
 src/operator/nn/mkldnn/mkldnn_base.cc              |   47 +-
 src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h     |   99 +-
 src/operator/nn/mkldnn/mkldnn_convolution.cc       |   65 +-
 src/operator/nn/mkldnn/mkldnn_fully_connected.cc   |    4 +-
 src/operator/nn/mkldnn/mkldnn_pooling-inl.h        |   59 +-
 src/operator/nn/mkldnn/mkldnn_pooling.cc           |  305 +++--
 src/operator/nn/mkldnn/mkldnn_rnn-inl.h            |   73 +-
 src/operator/nn/mkldnn/mkldnn_rnn.cc               |  241 ++--
 src/operator/nn/pooling.cc                         |    6 +-
 src/operator/nn/softmax-inl.h                      |    2 +-
 src/operator/numpy/linalg/np_eig-inl.h             |    6 +
 src/operator/numpy/linalg/np_eigvals-inl.h         |    6 +
 src/operator/numpy/linalg/np_lstsq-inl.h           |  614 +++++++++
 src/operator/numpy/linalg/np_lstsq.cc              |   97 ++
 .../numpy/{np_trace_op.cu => linalg/np_lstsq.cu}   |   15 +-
 src/operator/numpy/linalg/np_matrix_rank-inl.h     |  449 +++++++
 src/operator/numpy/linalg/np_matrix_rank.cc        |  165 +++
 .../{np_trace_op.cu => linalg/np_matrix_rank.cu}   |   16 +-
 src/operator/numpy/linalg/np_norm-inl.h            |   18 +
 src/operator/numpy/linalg/np_pinv-inl.h            |   14 +
 src/operator/numpy/linalg/np_potrf.cc              |    3 +-
 src/operator/numpy/linalg/np_qr-inl.h              |  613 +++++++++
 src/operator/numpy/linalg/np_qr.cc                 |  113 ++
 .../numpy/{np_trace_op.cu => linalg/np_qr.cu}      |   21 +-
 src/operator/numpy/linalg/np_tensorinv-inl.h       |    6 +
 src/operator/numpy/linalg/np_tensorsolve-inl.h     |    6 +
 src/operator/numpy/np_boolean_mask_assign.cc       |    2 +-
 src/operator/numpy/np_broadcast_reduce_op.h        |   48 +-
 src/operator/numpy/np_broadcast_reduce_op_value.cc |   22 +-
 src/operator/numpy/np_broadcast_reduce_op_value.cu |    4 +-
 src/operator/numpy/np_cross-inl.h                  | 1387 ++++++++++++++++++++
 src/operator/numpy/np_cross.cc                     |  121 ++
 src/operator/numpy/{np_trace_op.cu => np_cross.cu} |   16 +-
 src/operator/numpy/np_delete_op-inl.h              |   14 +
 src/operator/numpy/np_elemwise_broadcast_op.cc     |    4 +
 src/operator/numpy/np_elemwise_broadcast_op.h      |   68 +-
 src/operator/numpy/np_init_op.cc                   |    2 +-
 src/operator/numpy/np_init_op.cu                   |    6 +-
 src/operator/numpy/np_init_op.h                    |   31 +
 src/operator/numpy/np_insert_op-inl.h              |   16 +
 src/operator/numpy/np_interp_op-inl.h              |  286 ++++
 src/operator/numpy/np_interp_op.cc                 |   92 ++
 .../numpy/{np_trace_op.cu => np_interp_op.cu}      |   18 +-
 src/operator/numpy/np_matrix_op-inl.h              |  218 +++
 src/operator/numpy/np_matrix_op.cc                 |  132 +-
 src/operator/numpy/np_matrix_op.cu                 |   11 +-
 src/operator/numpy/np_trace_op-inl.h               |   10 +
 src/operator/numpy/np_trace_op.cc                  |    6 +-
 src/operator/numpy/np_trace_op.cu                  |    4 +-
 src/operator/numpy/np_triu_op-inl.h                |  241 ++++
 src/operator/numpy/np_triu_op.cc                   |   61 +
 .../numpy/{np_trace_op.cu => np_triu_op.cu}        |   16 +-
 src/operator/numpy/np_unique_op.cc                 |    1 +
 src/operator/numpy/np_where_op-inl.h               |    8 +-
 src/operator/numpy/np_window_op.h                  |    8 +
 src/operator/numpy/random/np_choice_op.h           |   11 +
 src/operator/numpy/random/np_exponential_op.h      |    8 +
 src/operator/numpy/random/np_gamma_op.h            |   12 +
 src/operator/numpy/random/np_location_scale_op.h   |   10 +
 src/operator/numpy/random/np_normal_op.h           |   12 +
 src/operator/numpy/random/np_pareto_op.h           |    9 +
 src/operator/numpy/random/np_power_op.h            |   13 +
 src/operator/numpy/random/np_rayleigh_op.h         |    9 +
 src/operator/numpy/random/np_uniform_op.h          |   12 +
 src/operator/numpy/random/np_weibull_op.h          |    9 +
 .../mkldnn/mkldnn_quantized_pooling.cc             |    4 +-
 src/operator/quantization/quantized_conv.cc        |   97 +-
 src/operator/quantization/quantized_pooling.cc     |  100 +-
 src/operator/random/shuffle_op.cc                  |    2 +-
 src/operator/rnn-inl.h                             |   29 +-
 src/operator/rnn.cc                                |    8 +-
 src/operator/rnn_impl.h                            |   29 +-
 src/operator/subgraph/mkldnn/mkldnn_conv.cc        |    9 +-
 .../subgraph/mkldnn/mkldnn_conv_property.h         |    3 +-
 .../subgraph/mkldnn/mkldnn_subgraph_base-inl.h     |    2 +-
 src/operator/tensor/elemwise_binary_op.cuh         |  322 +++++
 src/operator/tensor/elemwise_binary_op.h           |  206 ++-
 src/operator/tensor/elemwise_binary_op_basic.cu    |   23 +-
 src/operator/tensor/elemwise_binary_scalar_op.cuh  |  207 +++
 src/operator/tensor/elemwise_binary_scalar_op.h    |   75 +-
 .../tensor/elemwise_binary_scalar_op_basic.cu      |    9 +-
 .../tensor/elemwise_binary_scalar_op_extended.cu   |   15 +-
 src/operator/tensor/elemwise_sum.cu                |  112 +-
 src/operator/tensor/elemwise_sum.h                 |   14 +-
 src/operator/tensor/elemwise_unary_op.cuh          |  127 ++
 src/operator/tensor/elemwise_unary_op.h            |   56 +-
 src/operator/tensor/elemwise_unary_op_basic.cc     |    1 +
 src/operator/tensor/elemwise_unary_op_basic.cu     |    1 +
 src/operator/tensor/elemwise_unary_op_pow.cu       |    1 +
 src/operator/tensor/elemwise_unary_op_trig.cu      |    1 +
 src/operator/tensor/histogram-inl.h                |   42 +-
 src/operator/tensor/indexing_op.h                  |   21 +
 src/operator/tensor/init_op.h                      |   29 +
 src/operator/tensor/la_op.h                        |    6 +
 src/operator/tensor/matrix_op-inl.h                |   13 +
 src/operator/tensor/matrix_op.cc                   |    1 +
 src/operator/tensor/ravel.cc                       |   12 +-
 src/operator/tensor/ravel.h                        |   19 +-
 src/resource.cc                                    |    4 +-
 tests/cpp/engine/threaded_engine_test.cc           |   14 +-
 tests/cpp/operator/mkldnn_test.cc                  |    2 +-
 tests/cpp/thread_safety/thread_safety_test.cc      |   16 +-
 tests/cpp/unittest.mk                              |   31 +-
 tests/jenkins/run_test_pip_installations.sh        |   12 +-
 tests/nightly/dist_device_sync_kvstore_horovod.py  |   80 ++
 tests/nightly/test_distributed_training-gpu.sh     |   53 +
 tests/python/gpu/test_extensions_gpu.py            |   18 +-
 tests/python/gpu/test_fusion.py                    |   29 +-
 tests/python/gpu/test_gluon_gpu.py                 |   21 +
 tests/python/gpu/test_kvstore_gpu.py               |    1 +
 tests/python/mkl/test_mkldnn.py                    |   71 +-
 tests/python/quantization/test_quantization.py     |   58 +-
 tests/python/unittest/test_gluon_data.py           |    5 -
 tests/python/unittest/test_gluon_rnn.py            |  194 ++-
 tests/python/unittest/test_init.py                 |    6 +-
 .../python/unittest/test_numpy_interoperability.py |  170 ++-
 tests/python/unittest/test_numpy_ndarray.py        |    1 +
 tests/python/unittest/test_numpy_op.py             |  864 +++++++++++-
 tests/python/unittest/test_operator.py             |  140 ++
 tests/python/unittest/test_optimizer.py            |    3 +-
 tools/dependencies/README.md                       |    1 -
 tools/dependencies/make_shared_dependencies.sh     |    3 +-
 tools/dependencies/zmq.sh                          |    6 +
 tools/launch.py                                    |   63 +-
 tools/pip/doc/CPU_ADDITIONAL.md                    |    6 +-
 tools/pip/doc/CU100_ADDITIONAL.md                  |    8 +-
 tools/pip/doc/CU101_ADDITIONAL.md                  |    8 +-
 tools/pip/doc/CU102_ADDITIONAL.md                  |    7 +-
 tools/pip/doc/CU75_ADDITIONAL.md                   |   38 -
 tools/pip/doc/CU80_ADDITIONAL.md                   |   38 -
 tools/pip/doc/CU90_ADDITIONAL.md                   |   45 -
 tools/pip/doc/CU92_ADDITIONAL.md                   |    6 +-
 tools/pip/doc/NATIVE_ADDITIONAL.md                 |    7 +-
 tools/pip/doc/PYPI_README.md                       |    2 +-
 tools/pip/setup.py                                 |   13 +-
 tools/setup_gpu_build_tools.sh                     |  120 +-
 tools/staticbuild/README.md                        |    2 +-
 tools/staticbuild/build.sh                         |    2 +-
 tools/staticbuild/build_lib.sh                     |   10 +-
 tools/staticbuild/build_lib_cmake.sh               |   10 +-
 416 files changed, 16978 insertions(+), 7495 deletions(-)
 delete mode 100755 3rdparty/mshadow/mshadow/half2.h
 delete mode 100644 ci/docker/Dockerfile.build.test.arm_qemu
 rename ci/docker/{Dockerfile.publish.ubuntu1404_cpu => 
Dockerfile.build.test.armv7} (72%)
 rename ci/docker/{Dockerfile.publish.test.ubuntu1404_cpu => 
Dockerfile.build.test.armv8} (72%)
 delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu100
 delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu102
 delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu80
 delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu90
 delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu92
 copy ci/docker/{Dockerfile.publish.test.centos7_cpu => 
Dockerfile.publish.centos7_cpu} (85%)
 copy ci/docker/{Dockerfile.publish.test.centos7_gpu => 
Dockerfile.publish.centos7_gpu_cu100} (74%)
 copy ci/docker/{Dockerfile.publish.test.centos7_gpu => 
Dockerfile.publish.centos7_gpu_cu101} (74%)
 copy ci/docker/{Dockerfile.publish.test.centos7_gpu => 
Dockerfile.publish.centos7_gpu_cu102} (74%)
 copy ci/docker/{Dockerfile.publish.test.centos7_gpu => 
Dockerfile.publish.centos7_gpu_cu92} (77%)
 delete mode 100644 ci/docker/Dockerfile.publish.test.ubuntu1404_gpu
 delete mode 100644 ci/docker/Dockerfile.publish.ubuntu1404_gpu
 delete mode 100755 ci/docker/install/android_armv7_openblas.sh
 delete mode 100755 ci/docker/install/android_ndk.sh
 delete mode 100755 ci/docker/install/arm64_openblas.sh
 delete mode 100755 ci/docker/install/centos7_base.sh
 copy ci/docker/install/{arm_openblas.sh => centos7_nccl.sh} (53%)
 rename ci/docker/install/{arm_openblas.sh => thrust.sh} (75%)
 delete mode 100755 ci/docker/install/ubuntu_arm_qemu.sh
 delete mode 100755 ci/docker/install/ubuntu_arm_qemu_bin.sh
 delete mode 100755 ci/docker/install/ubuntu_publish.sh
 delete mode 100644 ci/docker/qemu/README.md
 delete mode 100755 ci/docker/qemu/runtime_functions.py
 delete mode 100644 ci/docker/qemu/vmcontrol.py
 rename ci/docker/{install/android_arm64_openblas.sh => 
toolchains/aarch64-linux-gnu-toolchain.cmake} (64%)
 mode change 100755 => 100644
 rename ci/docker/{install/ubuntu_arm.sh => 
toolchains/arm-linux-gnueabihf-toolchain.cmake} (65%)
 mode change 100755 => 100644
 copy ci/jenkins/{Jenkinsfile_centos_gpu => Jenkinsfile_full} (50%)
 copy ci/jenkins/{Jenkinsfile_centos_cpu => Jenkinsfile_website_nightly} (75%)
 delete mode 100644 ci/qemu/README.md
 delete mode 100755 ci/qemu/copy.sh
 delete mode 100755 ci/qemu/init.sh
 delete mode 100644 ci/qemu/initrd_modif/inittab
 delete mode 100755 ci/qemu/install.sh
 delete mode 100644 ci/qemu/mxnet_requirements.txt
 delete mode 100644 ci/qemu/preseed.cfg
 delete mode 100755 ci/qemu/preseed.sh
 delete mode 100755 ci/qemu/run.sh
 delete mode 100644 ci/qemu/test_requirements.txt
 delete mode 100644 config/distribution/linux_cu75.cmake
 delete mode 100644 config/distribution/linux_cu80.cmake
 delete mode 100644 config/distribution/linux_cu90.cmake
 delete mode 100644 config/distribution/linux_cu91.cmake
 delete mode 100644 docker/docker-python/Dockerfile.mxnet.python.gpu.cu90
 delete mode 100644 docker/docker-python/Dockerfile.mxnet.python.gpu.cu90.mkl
 delete mode 100644 docker/docker-python/Dockerfile.mxnet.python3.gpu.cu90
 delete mode 100644 docker/docker-python/Dockerfile.mxnet.python3.gpu.cu90.mkl
 create mode 100644 example/distributed_training/cifar10_kvstore_hvd.py
 delete mode 100644 make/crosscompile.jetson.mk
 delete mode 100644 make/staticbuild/linux_cu75.mk
 delete mode 100644 make/staticbuild/linux_cu80.mk
 delete mode 100644 make/staticbuild/linux_cu90.mk
 delete mode 100644 make/staticbuild/linux_cu91.mk
 create mode 100644 python/mxnet/kvstore/horovod.py
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_det.cc} (67%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_eig.cc} (55%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_eigvals.cc} (56%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_inv.cc} (67%)
 copy src/api/operator/numpy/{np_cumsum.cc => linalg/np_lstsq.cc} (51%)
 copy src/api/operator/numpy/{np_tensordot_op.cc => linalg/np_matrix_rank.cc} 
(51%)
 copy src/api/operator/numpy/{np_cumsum.cc => linalg/np_norm.cc} (63%)
 copy src/api/operator/numpy/{np_tensordot_op.cc => linalg/np_pinv.cc} (51%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_potrf.cc} (67%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_qr.cc} (67%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_slogdet.cc} (67%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_solve.cc} (62%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_tensorinv.cc} (67%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
linalg/np_tensorsolve.cc} (58%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => np_cross.cc} 
(63%)
 create mode 100644 src/api/operator/numpy/np_delete_op.cc
 copy src/api/operator/numpy/{np_elemwise_broadcast_op.cc => 
np_elemwise_broadcast_logic_op.cc} (52%)
 create mode 100644 src/api/operator/numpy/np_elemwise_unary_op_basic.cc
 create mode 100644 src/api/operator/numpy/np_histogram_op.cc
 create mode 100644 src/api/operator/numpy/np_insert_op.cc
 copy src/api/operator/numpy/{np_bincount_op.cc => np_interp_op.cc} (55%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
np_matmul_op.cc} (62%)
 create mode 100644 src/api/operator/numpy/np_moments_op.cc
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => 
np_trace_op.cc} (64%)
 copy src/api/operator/numpy/{np_broadcast_reduce_op_value.cc => np_triu_op.cc} 
(59%)
 create mode 100644 src/api/operator/numpy/np_window_op.cc
 copy src/api/operator/numpy/random/{np_laplace_op.cc => np_choice_op.cc} (62%)
 copy src/api/operator/numpy/{np_cumsum.cc => random/np_exponential_op.cc} (50%)
 create mode 100644 src/api/operator/numpy/random/np_location_scale_op.cc
 copy src/api/operator/numpy/{np_cumsum.cc => random/np_pareto_op.cc} (51%)
 copy src/api/operator/numpy/{np_cumsum.cc => random/np_power_op.cc} (51%)
 copy src/api/operator/numpy/{np_cumsum.cc => random/np_rayleigh_op.cc} (50%)
 copy src/api/operator/numpy/{np_cumsum.cc => random/np_weibull_op.cc} (51%)
 create mode 100644 src/api/operator/random/np_gamma_op.cc
 create mode 100644 src/api/operator/random/np_normal_op.cc
 create mode 100644 src/api/operator/random/np_uniform_op.cc
 copy src/api/operator/{numpy/np_broadcast_reduce_op_value.cc => 
random/shuffle_op.cc} (63%)
 copy src/api/operator/{numpy/np_broadcast_reduce_op_value.cc => 
tensor/elemwise_binary_broadcast_op_extended.cc} (56%)
 copy src/api/operator/{numpy/np_cumsum.cc => tensor/indexing_op.cc} (50%)
 create mode 100644 src/api/operator/tensor/matrix_op.cc
 create mode 100644 src/common/cuda_vectorization.cuh
 create mode 100644 src/operator/contrib/batch_norm_relu.cc
 create mode 100644 src/operator/numpy/linalg/np_lstsq-inl.h
 create mode 100644 src/operator/numpy/linalg/np_lstsq.cc
 copy src/operator/numpy/{np_trace_op.cu => linalg/np_lstsq.cu} (74%)
 create mode 100644 src/operator/numpy/linalg/np_matrix_rank-inl.h
 create mode 100644 src/operator/numpy/linalg/np_matrix_rank.cc
 copy src/operator/numpy/{np_trace_op.cu => linalg/np_matrix_rank.cu} (69%)
 create mode 100644 src/operator/numpy/linalg/np_qr-inl.h
 create mode 100644 src/operator/numpy/linalg/np_qr.cc
 copy src/operator/numpy/{np_trace_op.cu => linalg/np_qr.cu} (71%)
 create mode 100644 src/operator/numpy/np_cross-inl.h
 create mode 100644 src/operator/numpy/np_cross.cc
 copy src/operator/numpy/{np_trace_op.cu => np_cross.cu} (72%)
 create mode 100644 src/operator/numpy/np_interp_op-inl.h
 create mode 100644 src/operator/numpy/np_interp_op.cc
 copy src/operator/numpy/{np_trace_op.cu => np_interp_op.cu} (73%)
 create mode 100644 src/operator/numpy/np_triu_op-inl.h
 create mode 100644 src/operator/numpy/np_triu_op.cc
 copy src/operator/numpy/{np_trace_op.cu => np_triu_op.cu} (73%)
 create mode 100644 src/operator/tensor/elemwise_binary_op.cuh
 create mode 100644 src/operator/tensor/elemwise_binary_scalar_op.cuh
 create mode 100644 src/operator/tensor/elemwise_unary_op.cuh
 create mode 100644 tests/nightly/dist_device_sync_kvstore_horovod.py
 create mode 100755 tests/nightly/test_distributed_training-gpu.sh
 delete mode 100644 tools/pip/doc/CU75_ADDITIONAL.md
 delete mode 100644 tools/pip/doc/CU80_ADDITIONAL.md
 delete mode 100644 tools/pip/doc/CU90_ADDITIONAL.md

Reply via email to