This is an automated email from the ASF dual-hosted git repository.
zhasheng pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git.
from bbc39fa add signal handler for fpe, bus error (#18956)
add 29d6f27 Use RTC for elementwise and broadcast ops (#18622)
No new revisions were added by this update.
Summary of changes:
3rdparty/mshadow/mshadow/base.h | 48 --
3rdparty/mshadow/mshadow/half2.h | 162 ----
CMakeLists.txt | 16 +-
ci/build_windows.py | 6 -
ci/docker/runtime_functions.sh | 22 -
ci/jenkins/Jenkins_steps.groovy | 14 -
ci/jenkins/Jenkinsfile_unix_gpu | 1 -
config/darwin.cmake | 1 -
config/linux.cmake | 1 -
config/linux_gpu.cmake | 1 -
docs/python_docs/python/tutorials/extend/index.rst | 7 +
.../src/pages/api/faq/add_op_in_backend.md | 1 +
docs/static_site/src/pages/api/faq/env_var.md | 6 +-
docs/static_site/src/pages/api/faq/using_rtc.md | 465 +++++++++++
include/mxnet/libinfo.h | 5 -
include/mxnet/rtc.h | 4 +-
python/mxnet/contrib/amp/lists/symbol_fp16.py | 3 -
python/mxnet/runtime.py | 2 +-
src/c_api/c_api.cc | 20 +-
src/common/cuda/rtc.cc | 244 ++++++
src/common/cuda/rtc.h | 92 +++
src/common/cuda/rtc/backward_functions-inl.h | 480 +++++++++++
src/common/cuda/rtc/forward_functions-inl.h | 917 +++++++++++++++++++++
src/common/cuda/rtc/half-inl.h | 84 ++
src/common/cuda/rtc/reducer-inl.h | 109 +++
src/{operator => common/cuda/rtc}/special_functions-inl.h | 112 ++-
src/common/cuda/rtc/util-inl.h | 389 +++++++++
src/common/cuda/rtc/vectorization-inl.h | 463 +++++++++++
src/common/{cuda_utils.cc => cuda/utils.cc} | 2 +-
src/common/{cuda_utils.h => cuda/utils.h} | 6 +-
src/common/rtc.cc | 6 +-
src/common/utils.cc | 25 +
src/common/utils.h | 14 +
src/engine/stream_manager.h | 2 +-
src/engine/threaded_engine.cc | 2 +-
src/engine/threaded_engine_pooled.cc | 2 +-
src/imperative/cached_op.h | 4 +-
src/imperative/pointwise_fusion_pass.cc | 6 +-
src/kvstore/kvstore_nccl.h | 2 +-
src/libinfo.cc | 2 -
src/ndarray/ndarray_function.cu | 2 +-
src/operator/bilinear_sampler.cu | 2 +-
src/operator/contrib/deformable_psroi_pooling.cu | 2 +-
src/operator/contrib/gradient_multiplier_op.cu | 4 +-
src/operator/contrib/nn/deformable_im2col.cuh | 2 +-
.../contrib/nn/modulated_deformable_im2col.cuh | 2 +-
src/operator/contrib/psroi_pooling.cu | 2 +-
src/operator/contrib/stes_op.cu | 8 +-
src/operator/contrib/transformer.cu | 2 +-
src/operator/fusion/fused_op-inl.h | 872 +-------------------
src/operator/fusion/fused_op.cc | 4 +-
src/operator/fusion/fused_op.cu | 148 +---
src/operator/fusion/fused_op.h | 5 +-
src/operator/leaky_relu-inl.h | 18 +
src/operator/linalg_impl.h | 2 +-
src/operator/mshadow_op.h | 69 --
src/operator/mxnet_op.h | 2 +-
src/operator/nn/batch_norm.cu | 3 +-
src/operator/nn/cudnn/cudnn_activation-inl.h | 2 +-
src/operator/nn/cudnn/cudnn_algoreg-inl.h | 2 +-
src/operator/nn/cudnn/cudnn_convolution-inl.h | 2 +-
src/operator/nn/cudnn/cudnn_deconvolution-inl.h | 2 +-
src/operator/nn/depthwise_convolution-inl.h | 2 +-
src/operator/nn/depthwise_convolution_tf.cuh | 2 +-
src/operator/nn/group_norm-inl.h | 93 ++-
src/operator/nn/layer_norm-inl.h | 87 +-
src/operator/nn/pool.cuh | 2 +-
src/operator/nn/softmax-inl.h | 2 +-
.../linalg/broadcast_reduce_customized-inl.cuh | 21 +-
.../numpy/linalg/broadcast_reduce_customized-inl.h | 4 +
.../numpy/linalg/broadcast_reduce_op_customized.h | 4 +-
src/operator/numpy/linalg/np_matrix_rank-inl.h | 16 +-
src/operator/numpy/linalg/np_pinv-inl.h | 30 +-
src/operator/numpy/np_broadcast_reduce_op.h | 25 +-
src/operator/numpy/np_cross-inl.h | 44 +-
src/operator/numpy/np_diff-inl.h | 4 +-
.../numpy/np_elemwise_broadcast_logic_op.cu | 4 +-
src/operator/numpy/np_elemwise_broadcast_op.cu | 56 +-
src/operator/numpy/np_elemwise_broadcast_op.h | 10 +-
.../numpy/np_elemwise_broadcast_op_extended.cc | 6 +-
.../numpy/np_elemwise_broadcast_op_extended.cu | 67 +-
.../numpy/np_elemwise_broadcast_op_extended_sec.cu | 33 +-
src/operator/numpy/np_elemwise_unary_op_basic.cc | 2 +-
src/operator/numpy/np_elemwise_unary_op_basic.cu | 171 ++--
src/operator/numpy/np_polynomial_op.cu | 2 +-
src/operator/numpy/np_true_divide.cu | 3 +-
src/operator/numpy/np_where_op-inl.h | 12 +-
src/operator/numpy/random/np_exponential_op.h | 2 +-
src/operator/numpy/random/np_gamma_op.h | 2 +-
src/operator/numpy/random/np_location_scale_op.h | 10 +-
src/operator/numpy/random/np_normal_op.h | 10 +-
src/operator/numpy/random/np_pareto_op.h | 2 +-
src/operator/numpy/random/np_rayleigh_op.h | 2 +-
src/operator/numpy/random/np_weibull_op.h | 2 +-
src/operator/operator_common.h | 2 +-
src/operator/operator_tune.cc | 2 -
src/operator/pad.cu | 2 +-
src/operator/quantization/quantization_utils.h | 2 +-
src/operator/random/pdf_op.h | 4 +-
src/operator/tensor/broadcast_reduce-inl.cuh | 338 +-------
src/operator/tensor/broadcast_reduce-inl.h | 580 ++++++++++---
src/operator/tensor/broadcast_reduce_op.h | 12 +-
src/operator/tensor/cast_storage-inl.h | 2 +-
src/operator/tensor/elemwise_binary_broadcast_op-inl.cuh | 82 --
src/operator/tensor/elemwise_binary_broadcast_op.cc | 452 ++++++++++
src/operator/tensor/elemwise_binary_broadcast_op.h | 231 +-----
.../tensor/elemwise_binary_broadcast_op_basic.cu | 25 +-
.../elemwise_binary_broadcast_op_extended.cu | 21 +-
.../tensor/elemwise_binary_broadcast_op_logic.cu | 18 +-
src/operator/tensor/elemwise_binary_op.cc | 351 +++++++-
src/operator/tensor/elemwise_binary_op.h | 192 +++--
src/operator/tensor/elemwise_binary_op_basic.cu | 28 +-
src/operator/tensor/elemwise_binary_op_extended.cu | 22 +-
src/operator/tensor/elemwise_binary_op_logic.cu | 19 +-
src/operator/tensor/elemwise_binary_scalar_op.cc | 257 ++++++
src/operator/tensor/elemwise_binary_scalar_op.h | 91 +-
.../tensor/elemwise_binary_scalar_op_basic.cu | 37 +-
.../tensor/elemwise_binary_scalar_op_extended.cu | 29 +-
.../tensor/elemwise_binary_scalar_op_logic.cu | 30 +-
src/operator/tensor/elemwise_scatter_op.cc | 142 ----
src/operator/tensor/elemwise_scatter_op.cu | 47 --
src/operator/tensor/elemwise_scatter_op.h | 318 -------
src/operator/tensor/elemwise_sum.cu | 130 ++-
src/operator/tensor/elemwise_sum.h | 12 -
src/operator/tensor/elemwise_unary_op.cc | 163 ++++
src/operator/tensor/elemwise_unary_op.h | 279 ++++---
src/operator/tensor/elemwise_unary_op_basic.cc | 4 +-
src/operator/tensor/elemwise_unary_op_basic.cu | 85 +-
src/operator/tensor/elemwise_unary_op_logexp.cu | 31 +-
src/operator/tensor/elemwise_unary_op_pow.cu | 35 +-
src/operator/tensor/elemwise_unary_op_trig.cu | 91 +-
src/operator/tensor/pseudo2DTranspose_op-inl.cuh | 2 +-
src/operator/tensor/reduce_rtc.cc | 524 ++++++++++++
src/profiler/profiler.cc | 2 +-
src/profiler/storage_profiler.cc | 2 +-
src/resource.cc | 2 +-
src/storage/storage_manager_helpers.h | 2 +-
tests/python/gpu/test_fusion.py | 1 +
tests/python/unittest/test_numpy_op.py | 81 +-
tests/python/unittest/test_operator.py | 72 ++
tests/python/unittest/test_sparse_operator.py | 146 ----
141 files changed, 7027 insertions(+), 3610 deletions(-)
delete mode 100644 3rdparty/mshadow/mshadow/half2.h
create mode 100644 docs/static_site/src/pages/api/faq/using_rtc.md
create mode 100644 src/common/cuda/rtc.cc
create mode 100644 src/common/cuda/rtc.h
create mode 100644 src/common/cuda/rtc/backward_functions-inl.h
create mode 100644 src/common/cuda/rtc/forward_functions-inl.h
create mode 100644 src/common/cuda/rtc/half-inl.h
create mode 100644 src/common/cuda/rtc/reducer-inl.h
copy src/{operator => common/cuda/rtc}/special_functions-inl.h (83%)
create mode 100644 src/common/cuda/rtc/util-inl.h
create mode 100644 src/common/cuda/rtc/vectorization-inl.h
rename src/common/{cuda_utils.cc => cuda/utils.cc} (99%)
rename src/common/{cuda_utils.h => cuda/utils.h} (99%)
delete mode 100644 src/operator/tensor/elemwise_binary_broadcast_op-inl.cuh
create mode 100644 src/operator/tensor/elemwise_binary_broadcast_op.cc
create mode 100644 src/operator/tensor/elemwise_binary_scalar_op.cc
delete mode 100644 src/operator/tensor/elemwise_scatter_op.cc
delete mode 100644 src/operator/tensor/elemwise_scatter_op.cu
delete mode 100644 src/operator/tensor/elemwise_scatter_op.h
create mode 100644 src/operator/tensor/elemwise_unary_op.cc
create mode 100644 src/operator/tensor/reduce_rtc.cc