eric-haibin-lin closed pull request #12804: CudnnFind() usage improvements
URL: https://github.com/apache/incubator-mxnet/pull/12804
This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 5e5e76d52aa..404f135cd91 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -187,3 +187,4 @@ List of Contributors * [LuckyPigeon](https://github.com/LuckyPigeon) * [Anton Chernov](https://github.com/lebeg) * [Denisa Roberts](https://github.com/D-Roberts) +* [Dick Carter](https://github.com/DickJC123) diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 0464c73fadc..92cf4931b04 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -67,6 +67,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 * MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF - Values: Int ```(default=24)``` - The cutoff threshold that decides the rounding strategy. Let's denote the threshold as T. If the memory size is smaller than `2 ** T` (by default, it's 2 ** 24 = 16MB), it rounds to the smallest `2 ** n` that is larger than the requested memory size; if the memory size is larger than `2 ** T`, it rounds to the next k * 2 ** T. +* MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE + - Values: Int ```(default=2097152)``` + - When using the naive pool type, memory allocations larger than this threshold are rounded up to a multiple of this value. + - The default was chosen to minimize global memory fragmentation within the GPU driver. Set this to 1 to disable. ## Engine Type diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 783f74ab447..5f16eb44186 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -225,11 +225,11 @@ struct Context { /*! * \brief get the free and total available memory on a GPU * \param dev the GPU number to query - * \param free_mem pointer to the integer holding free GPU memory - * \param total_mem pointer to the integer holding total GPU memory + * \param free_mem pointer to the uint64_t holding free GPU memory + * \param total_mem pointer to the uint64_t holding total GPU memory * \return No return value */ - inline static void GetGPUMemoryInformation(int dev, int *free, int *total); + inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total); /*! * Create a pinned CPU context. * \param dev_id the device id for corresponding GPU. @@ -334,8 +334,8 @@ inline int32_t Context::GetGPUCount() { #endif } -inline void Context::GetGPUMemoryInformation(int dev, int *free_mem, - int *total_mem) { +inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem, + uint64_t *total_mem) { #if MXNET_USE_CUDA size_t memF, memT; @@ -354,8 +354,8 @@ inline void Context::GetGPUMemoryInformation(int dev, int *free_mem, e = cudaSetDevice(curDevice); CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); - *free_mem = static_cast<int>(memF); - *total_mem = static_cast<int>(memT); + *free_mem = static_cast<uint64_t>(memF); + *total_mem = static_cast<uint64_t>(memT); #else LOG(FATAL) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index dc33c95437f..e9f1e2d6ccc 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -441,6 +441,7 @@ MXNET_DLL int MXGetGPUCount(int* out); /*! * \brief get the free and total available memory on a GPU + * Note: Deprecated, use MXGetGPUMemoryInformation64 instead. 
* \param dev the GPU number to query * \param free_mem pointer to the integer holding free GPU memory * \param total_mem pointer to the integer holding total GPU memory @@ -448,6 +449,15 @@ MXNET_DLL int MXGetGPUCount(int* out); */ MXNET_DLL int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem); +/*! + * \brief get the free and total available memory on a GPU + * \param dev the GPU number to query + * \param free_mem pointer to the uint64_t holding free GPU memory + * \param total_mem pointer to the uint64_t holding total GPU memory + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem); + /*! * \brief get the MXNet library version as an integer * \param pointer to the integer holding the version number diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i index 38665748a0b..b1907f5cd7e 100644 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ b/perl-package/AI-MXNetCAPI/mxnet.i @@ -344,6 +344,7 @@ int MXGetGPUCount(int* out); /*! * \brief get the free and total available memory on a GPU + * Note: deprecated, use MXGetGPUMemoryInformation64(). * \param dev the GPU number to query * \param free_mem pointer to the integer holding free GPU memory * \param total_mem pointer to the integer holding total GPU memory @@ -351,6 +352,15 @@ int MXGetGPUCount(int* out); */ int MXGetGPUMemoryInformation(int dev, int *out, int *out); +/*! + * \brief get the free and total available memory on a GPU + * \param dev the GPU number to query + * \param free_mem pointer to the uint64_t holding free GPU memory + * \param total_mem pointer to the uint64_t holding total GPU memory + * \return 0 when success, -1 when failure happens + */ +int MXGetGPUMemoryInformation64(int dev, uint64_t *out, uint64_t *out); + //------------------------------------- // Part 1: NDArray creation and deletion diff --git a/python/mxnet/context.py b/python/mxnet/context.py index 61b70532dd7..15ea9905de0 100644 --- a/python/mxnet/context.py +++ b/python/mxnet/context.py @@ -258,6 +258,30 @@ def num_gpus(): check_call(_LIB.MXGetGPUCount(ctypes.byref(count))) return count.value +def gpu_memory_info(device_id=0): + """Query CUDA for the free and total bytes of GPU global memory. + + Parameters + ---------- + device_id : int, optional + The device id of the GPU device. + + Raises + ------ + Will raise an exception on any CUDA error. + + Returns + ------- + (free, total) : (int, int) + The free and total GPU global memory, in bytes. + + """ + free = ctypes.c_uint64() + total = ctypes.c_uint64() + dev_id = ctypes.c_int(device_id) + check_call(_LIB.MXGetGPUMemoryInformation64(dev_id, ctypes.byref(free), ctypes.byref(total))) + return (free.value, total.value) + def current_context(): """Returns the current context. diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 56e318097a3..80bd60538ff 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -122,7 +122,18 @@ int MXGetGPUCount(int* out) { API_END(); } +// Deprecated: use MXGetGPUMemoryInformation64() instead. 
int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem) { + API_BEGIN(); + uint64_t free_mem64 = 0UL; + uint64_t total_mem64 = 0UL; + Context::GetGPUMemoryInformation(dev, &free_mem64, &total_mem64); + *free_mem = static_cast<int>(free_mem64); + *total_mem = static_cast<int>(total_mem64); + API_END(); +} + +int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem) { API_BEGIN(); Context::GetGPUMemoryInformation(dev, free_mem, total_mem); API_END(); diff --git a/src/operator/nn/cudnn/cudnn_algoreg-inl.h b/src/operator/nn/cudnn/cudnn_algoreg-inl.h index 3b59fd1c3ce..21d3a30ba7c 100644 --- a/src/operator/nn/cudnn/cudnn_algoreg-inl.h +++ b/src/operator/nn/cudnn/cudnn_algoreg-inl.h @@ -30,6 +30,8 @@ #include <mutex> #include <string> #include <vector> +#include <functional> +#include <utility> #include "../../../common/cuda_utils.h" #include "../convolution-inl.h" #include "../deconvolution-inl.h" @@ -65,7 +67,11 @@ class CuDNNAlgo { template<typename ParamType> class CuDNNAlgoReg { public: - bool Find(const ParamType ¶m, + using AlgoSetter_t = std::function<void(CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *, + CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *, + CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *)>; + + void FindOrElseRegister(const ParamType ¶m, const std::vector<TShape> &in_shape, const std::vector<TShape> &out_shape, cudnnDataType_t cudnn_data_type, @@ -75,7 +81,8 @@ class CuDNNAlgoReg { bool add_to_weight, CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd, CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd, - CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) { + CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt, + const AlgoSetter_t &algo_setter) { CHECK(in_shape.size() == 2 || in_shape.size() == 3); ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight}; @@ -85,45 +92,28 @@ class CuDNNAlgoReg { *fwd = i->second.fwd; *bwd = i->second.bwd; *flt = i->second.flt; - return true; - } - return false; - } - - void Register(const ParamType ¶m, - const std::vector<TShape> &in_shape, - const std::vector<TShape> &out_shape, - cudnnDataType_t cudnn_data_type, - cudnnDataType_t cudnn_forward_compute_type, - cudnnDataType_t cudnn_backward_compute_type, - int sm_arch, - bool add_to_weight, - const CuDNNAlgo<cudnnConvolutionFwdAlgo_t> &fwd, - const CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> &bwd, - const CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> &flt) { - CHECK(in_shape.size() == 2 || in_shape.size() == 3); - ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, - cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight}; - std::lock_guard<std::mutex> guard(lock_); - if (param.cudnn_tune.value() && reg_.size() % 50 == 0) { - LOG(INFO) << "Running performance tests to find the best convolution " - "algorithm, " - "this can take a while... (setting env variable " - "MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)"; - if (reg_.size() >= 1000) { - // Many people are very concerned about this warning, so change the warning once. - if (!is_warning_autotune_) { - LOG(INFO) - << "If you see this message in the middle of training, you are " - "probably using bucketing. 
Consider setting env variable " - "MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning."; - is_warning_autotune_ = true; + } else { + if (param.cudnn_tune.value() && reg_.size() % 50 == 0) { + LOG(INFO) << "Running performance tests to find the best convolution " + "algorithm, " + "this can take a while... (setting env variable " + "MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)"; + if (reg_.size() >= 1000) { + // Many people are very concerned about this warning, so change the warning once. + if (!is_warning_autotune_) { + LOG(INFO) + << "If you see this message in the middle of training, you are " + "probably using bucketing. Consider setting env variable " + "MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning."; + is_warning_autotune_ = true; + } } } + // Call provided function to determine the algos- likely uses cudnnFind() or cudnnGet() + algo_setter(fwd, bwd, flt); + // Save result so future lookups hit in this registry + reg_.insert(std::pair<ParamKey, CudnnAlgorithms>(key, CudnnAlgorithms{*fwd, *bwd, *flt})); } - reg_[key].fwd = fwd; - reg_[key].bwd = bwd; - reg_[key].flt = flt; } static CuDNNAlgoReg *Get(); diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 53bd76c9c3e..d63d46821ed 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -26,6 +26,7 @@ #ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_CONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_CUDNN_CUDNN_CONVOLUTION_INL_H_ +#include <mxnet/storage.h> #include <algorithm> #include <vector> #include <mutex> @@ -611,236 +612,265 @@ class CuDNNConvolutionOp { } } - void SelectAlgo(const RunContext& rctx, + void CuDNNAlgoSetter(const RunContext& rctx, const std::vector<TShape>& in_shape, const std::vector<TShape>& out_shape, cudnnDataType_t cudnn_forward_compute_type, - cudnnDataType_t cudnn_backward_compute_type) { - if (!CuDNNConvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_, - cudnn_forward_compute_type, cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), add_to_weight_, - &forward_algo_, &back_algo_, &back_algo_w_)) { - mshadow::Stream<gpu> *s = rctx.get_stream<gpu>(); - CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle); - size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType)); - #if CUDNN_MAJOR >= 7 - // Starting with cuDNNv7, the algo number returned by *Get*() is not the entire - // story: the notion of whether the algo ran in Tensor Core mode is not known. - // Since we want to report the Tensor Core mode in the verbose output, we switch - // to using the new *Get*_v7() call. Since the function signature of *Get*_v7() matches - // that of *Find*(), we can unify the find-vs-get logic by using function pointers. - - // Forward Algorithm Find/Get() v7 - std::vector<cudnnConvolutionFwdAlgoPerf_t> fwd_results(MaxForwardAlgos(s->dnn_handle_)); - int actual_fwd_algos = 0; - auto fwd_algo_discoverer = - param_.cudnn_tune.value() == conv::kOff ? 
cudnnGetConvolutionForwardAlgorithm_v7 - : cudnnFindConvolutionForwardAlgorithm; - CUDNN_CALL((*fwd_algo_discoverer)(s->dnn_handle_, - in_desc_, - filter_desc_, - forward_conv_desc_, - out_desc_, - fwd_results.size(), - &actual_fwd_algos, - fwd_results.data())); - fwd_results.resize(actual_fwd_algos); - AlgoFinalSelect<cudnnConvolutionFwdAlgoPerf_t, - cudnnConvolutionFwdAlgo_t>(fwd_results, "forward", - workspace_byte, &forward_algo_); - - // Backprop-to-Filter Algorithm Find/Get() v7 - auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_); - std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos); - int actual_bwd_filter_algos = 0; - // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we - // were summing into the output (i.e. beta != 0). Get() returned OK algos though. - auto bwd_filter_algo_discoverer = - param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 - : cudnnFindConvolutionBackwardFilterAlgorithm; - CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_, + cudnnDataType_t cudnn_backward_compute_type, + CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd, + CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd, + CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) { + // Not in algo registry, must determine via *Get*() or *Find*() + mshadow::Stream<gpu> *s = rctx.get_stream<gpu>(); + CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle); + size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType)); +#if CUDNN_MAJOR >= 7 + // Starting with cuDNNv7, the algo number returned by *Get*() is not the entire + // story: the notion of whether the algo ran in Tensor Core mode is not known. + // Since we want to report the Tensor Core mode in the verbose output, we switch + // to using the new *Get*_v7() call. Since the function signature of *Get*_v7() matches + // that of *Find*(), we can unify the find-vs-get logic by using function pointers. + + // Forward Algorithm Find/Get() v7 + std::vector<cudnnConvolutionFwdAlgoPerf_t> fwd_results(MaxForwardAlgos(s->dnn_handle_)); + int actual_fwd_algos = 0; + auto fwd_algo_discoverer = + param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionForwardAlgorithm_v7 + : cudnnFindConvolutionForwardAlgorithm; + CUDNN_CALL((*fwd_algo_discoverer)(s->dnn_handle_, + in_desc_, + filter_desc_, + forward_conv_desc_, + out_desc_, + fwd_results.size(), + &actual_fwd_algos, + fwd_results.data())); + fwd_results.resize(actual_fwd_algos); + AlgoFinalSelect<cudnnConvolutionFwdAlgoPerf_t, + cudnnConvolutionFwdAlgo_t>(fwd_results, "forward", + workspace_byte, fwd); + + // Backprop-to-Filter Algorithm Find/Get() v7 + auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_); + std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos); + int actual_bwd_filter_algos = 0; + // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we + // were summing into the output (i.e. beta != 0). Get() returned OK algos though. + auto bwd_filter_algo_discoverer = + param_.cudnn_tune.value() == conv::kOff ? 
cudnnGetConvolutionBackwardFilterAlgorithm_v7 + : cudnnFindConvolutionBackwardFilterAlgorithm; + CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_, + in_desc_, + out_desc_, + back_conv_desc_w_, + filter_desc_, + bwd_filt_results.size(), + &actual_bwd_filter_algos, + bwd_filt_results.data())); + bwd_filt_results.resize(actual_bwd_filter_algos); + AlgoFinalSelect<cudnnConvolutionBwdFilterAlgoPerf_t, + cudnnConvolutionBwdFilterAlgo_t>(bwd_filt_results, "backprop-to-filter", + workspace_byte, flt); + + // Backprop-to-Data Algorithm Find/Get() v7 + auto max_bwd_data_algos = MaxBackwardDataAlgos(s->dnn_handle_); + std::vector<cudnnConvolutionBwdDataAlgoPerf_t> bwd_data_results(max_bwd_data_algos); + int actual_bwd_data_algos = 0; + auto bwd_data_algo_discoverer = + param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardDataAlgorithm_v7 + : cudnnFindConvolutionBackwardDataAlgorithm; + CUDNN_CALL((*bwd_data_algo_discoverer)(s->dnn_handle_, + filter_desc_, + out_desc_, + back_conv_desc_, + in_desc_, + bwd_data_results.size(), + &actual_bwd_data_algos, + bwd_data_results.data())); + bwd_data_results.resize(actual_bwd_data_algos); + AlgoFinalSelect<cudnnConvolutionBwdDataAlgoPerf_t, + cudnnConvolutionBwdDataAlgo_t>(bwd_data_results, "backprop-to-data", + workspace_byte, bwd); +#else + // CUDNN_MAJOR < 7 + const int kMaxAlgos = 10; + int nalgo = kMaxAlgos; + int i = 0; + size_t min_memory_needs = 0; + // Forward Algorithm Find/Get, v6 and earlier + if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) { + // In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is + // supported. Hard-coded this since the algo find() or get() throws an FPE. + fwd->Set(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, false); + } else if (!param_.cudnn_tune.value()) { + cudnnConvolutionFwdAlgo_t fastest_fwd_algo; + CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_, in_desc_, - out_desc_, - back_conv_desc_w_, filter_desc_, - bwd_filt_results.size(), - &actual_bwd_filter_algos, - bwd_filt_results.data())); - bwd_filt_results.resize(actual_bwd_filter_algos); - AlgoFinalSelect<cudnnConvolutionBwdFilterAlgoPerf_t, - cudnnConvolutionBwdFilterAlgo_t>(bwd_filt_results, "backprop-to-filter", - workspace_byte, &back_algo_w_); - - // Backprop-to-Data Algorithm Find/Get() v7 - auto max_bwd_data_algos = MaxBackwardDataAlgos(s->dnn_handle_); - std::vector<cudnnConvolutionBwdDataAlgoPerf_t> bwd_data_results(max_bwd_data_algos); - int actual_bwd_data_algos = 0; - auto bwd_data_algo_discoverer = - param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardDataAlgorithm_v7 - : cudnnFindConvolutionBackwardDataAlgorithm; - CUDNN_CALL((*bwd_data_algo_discoverer)(s->dnn_handle_, - filter_desc_, - out_desc_, - back_conv_desc_, - in_desc_, - bwd_data_results.size(), - &actual_bwd_data_algos, - bwd_data_results.data())); - bwd_data_results.resize(actual_bwd_data_algos); - AlgoFinalSelect<cudnnConvolutionBwdDataAlgoPerf_t, - cudnnConvolutionBwdDataAlgo_t>(bwd_data_results, "backprop-to-data", - workspace_byte, &back_algo_); - #else - // CUDNN_MAJOR < 7 - const int kMaxAlgos = 10; - int nalgo = kMaxAlgos; - int i = 0; - size_t min_memory_needs = 0; - // Forward Algorithm Find/Get, v6 and earlier - if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) { - // In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is - // supported. Hard-coded this since the algo find() or get() throws an FPE. 
- forward_algo_.Set(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, false); - } else if (!param_.cudnn_tune.value()) { - cudnnConvolutionFwdAlgo_t fastest_fwd_algo; - CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_, - in_desc_, - filter_desc_, - forward_conv_desc_, - out_desc_, - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_byte, - &fastest_fwd_algo)); - forward_algo_.Set(fastest_fwd_algo, false); - } else { - cudnnConvolutionFwdAlgoPerf_t fwd_algo[kMaxAlgos]; - CUDNN_CALL(cudnnFindConvolutionForwardAlgorithm(s->dnn_handle_, - in_desc_, - filter_desc_, - forward_conv_desc_, - out_desc_, - kMaxAlgos, - &nalgo, - fwd_algo)); - i = 0; - while (i < nalgo - && (fwd_algo[i].status != CUDNN_STATUS_SUCCESS - || (param_.cudnn_tune.value() == conv::kLimited - && fwd_algo[i].memory > workspace_byte))) { - ++i; - min_memory_needs = - (i == 0) ? fwd_algo[i].memory : std::min(min_memory_needs, fwd_algo[i].memory); - } - if (i == nalgo) { - LOG(FATAL) << nalgo << " forward algorithms with minimum memory requirement " - << min_memory_needs << " bytes have been tried. Workspace size is set to " - << workspace_byte << " bytes, please consider reducing the batch/model size, " - << "or increasing workspace size."; - } else { - forward_algo_.Set(fwd_algo[i].algo, false); - } + forward_conv_desc_, + out_desc_, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_byte, + &fastest_fwd_algo)); + fwd->Set(fastest_fwd_algo, false); + } else { + cudnnConvolutionFwdAlgoPerf_t fwd_algo[kMaxAlgos]; + CUDNN_CALL(cudnnFindConvolutionForwardAlgorithm(s->dnn_handle_, + in_desc_, + filter_desc_, + forward_conv_desc_, + out_desc_, + kMaxAlgos, + &nalgo, + fwd_algo)); + i = 0; + while (i < nalgo + && (fwd_algo[i].status != CUDNN_STATUS_SUCCESS + || (param_.cudnn_tune.value() == conv::kLimited + && fwd_algo[i].memory > workspace_byte))) { + ++i; + min_memory_needs = + (i == 0) ? fwd_algo[i].memory : std::min(min_memory_needs, fwd_algo[i].memory); } - // Backprop-to-Filter Algorithm Find/Get, v6 and earlier - if (!param_.cudnn_tune.value()) { - cudnnConvolutionBwdFilterAlgo_t fastest_bwd_filt_algo; - CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_, - in_desc_, - out_desc_, - back_conv_desc_w_, - filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_byte, - &fastest_bwd_filt_algo)); - back_algo_w_.Set(fastest_bwd_filt_algo, false); + if (i == nalgo) { + LogNoSuitableAlgoAndExit(nalgo, min_memory_needs, workspace_byte, "forward"); } else { - cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_algo[kMaxAlgos]; - CUDNN_CALL(cudnnFindConvolutionBackwardFilterAlgorithm(s->dnn_handle_, - in_desc_, - out_desc_, - back_conv_desc_w_, - filter_desc_, - kMaxAlgos, - &nalgo, - bwd_filter_algo)); - i = 0; - while (i < nalgo - && (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS - || (param_.cudnn_tune.value() == conv::kLimited - && bwd_filter_algo[i].memory > workspace_byte))) { - ++i; - min_memory_needs = (i == 0) ? - bwd_filter_algo[i].memory : - std::min(min_memory_needs, bwd_filter_algo[i].memory); - } - if (i == nalgo) { - LOG(FATAL) << nalgo << " backward filter algorithms with minimum memory requirement " - << min_memory_needs << " bytes have been tried. 
Workspace size is set to " - << workspace_byte << " bytes, please consider reducing the batch/model size, " - << "or increasing workspace size."; - } else { - back_algo_w_.Set(bwd_filter_algo[i].algo, false); - } + fwd->Set(fwd_algo[i].algo, false); } - // Backprop-to-Data Algorithm Get(), v6 and earlier - if (!param_.cudnn_tune.value()) { - cudnnConvolutionBwdDataAlgo_t fastest_bwd_data_algo; - CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_, - filter_desc_, - out_desc_, - back_conv_desc_, - in_desc_, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_byte, - &fastest_bwd_data_algo)); - back_algo_.Set(fastest_bwd_data_algo, false); - } else { - cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo[kMaxAlgos]; - CUDNN_CALL(cudnnFindConvolutionBackwardDataAlgorithm(s->dnn_handle_, - filter_desc_, - out_desc_, - back_conv_desc_, + } + // Backprop-to-Filter Algorithm Find/Get, v6 and earlier + if (!param_.cudnn_tune.value()) { + cudnnConvolutionBwdFilterAlgo_t fastest_bwd_filt_algo; + CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_, + in_desc_, + out_desc_, + back_conv_desc_w_, + filter_desc_, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_byte, + &fastest_bwd_filt_algo)); + flt->Set(fastest_bwd_filt_algo, false); + } else { + cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_algo[kMaxAlgos]; + CUDNN_CALL(cudnnFindConvolutionBackwardFilterAlgorithm(s->dnn_handle_, in_desc_, + out_desc_, + back_conv_desc_w_, + filter_desc_, kMaxAlgos, &nalgo, - bwd_data_algo)); - i = 0; - while (i < nalgo - && (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS - || (param_.cudnn_tune.value() == conv::kLimited - && bwd_data_algo[i].memory > workspace_byte))) { - ++i; - min_memory_needs = (i == 0) ? - bwd_data_algo[i].memory : - std::min(min_memory_needs, bwd_data_algo[i].memory); - } - if (i == nalgo) { - LOG(FATAL) << nalgo << " backward data algorithms with minimum memory requirement " - << min_memory_needs << " bytes have been tried. Workspace size is set to " - << workspace_byte << " bytes, please consider reducing the batch/model size, " - << "or increasing workspace size."; - } else { - back_algo_.Set(bwd_data_algo[i].algo, false); - } + bwd_filter_algo)); + i = 0; + while (i < nalgo + && (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS + || (param_.cudnn_tune.value() == conv::kLimited + && bwd_filter_algo[i].memory > workspace_byte))) { + ++i; + min_memory_needs = (i == 0) ? 
+ bwd_filter_algo[i].memory : + std::min(min_memory_needs, bwd_filter_algo[i].memory); } - #endif // CUDNN_MAJOR < 7 - - // Fix for issue #11241 - int cudnn_find_issue_max_features = 64 * 1024; - if (add_to_weight_ && Features(in_shape[conv::kData]) >= cudnn_find_issue_max_features) { - this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + if (i == nalgo) { + LogNoSuitableAlgoAndExit(nalgo, min_memory_needs, workspace_byte, "backward filter"); + } else { + flt->Set(bwd_filter_algo[i].algo, false); } + } + // Backprop-to-Data Algorithm Get(), v6 and earlier + if (!param_.cudnn_tune.value()) { + cudnnConvolutionBwdDataAlgo_t fastest_bwd_data_algo; + CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_, + filter_desc_, + out_desc_, + back_conv_desc_, + in_desc_, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_byte, + &fastest_bwd_data_algo)); + bwd->Set(fastest_bwd_data_algo, false); + } else { + cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo[kMaxAlgos]; + CUDNN_CALL(cudnnFindConvolutionBackwardDataAlgorithm(s->dnn_handle_, + filter_desc_, + out_desc_, + back_conv_desc_, + in_desc_, + kMaxAlgos, + &nalgo, + bwd_data_algo)); + i = 0; + while (i < nalgo + && (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS + || (param_.cudnn_tune.value() == conv::kLimited + && bwd_data_algo[i].memory > workspace_byte))) { + ++i; + min_memory_needs = (i == 0) ? + bwd_data_algo[i].memory : + std::min(min_memory_needs, bwd_data_algo[i].memory); + } + if (i == nalgo) { + LogNoSuitableAlgoAndExit(nalgo, min_memory_needs, workspace_byte, "backward data"); + } else { + bwd->Set(bwd_data_algo[i].algo, false); + } + } +#endif // CUDNN_MAJOR < 7 - // An algo specification by the user may be cached here, but another - // convolution will match only if identically specified. - // We're caching results of *Get* as well as *Find*, but these records - // will be held distinctly because param_.cudnn_tune is part of the key. - CuDNNConvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_, - cudnn_forward_compute_type, - cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), this->add_to_weight_, - this->forward_algo_, - this->back_algo_, this->back_algo_w_); + // Fix for issue #11241 + int cudnn_find_issue_max_features = 64 * 1024; + if (add_to_weight_ && Features(in_shape[conv::kData]) >= cudnn_find_issue_max_features) { + flt->Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); } + } + + void SelectAlgo(const RunContext& rctx, + const std::vector<TShape>& in_shape, + const std::vector<TShape>& out_shape, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type) { + auto algo_setter = [&](CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd, + CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd, + CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) { + if (param_.cudnn_tune.value() == conv::kOff) { + // The routine will only be calling cudnnGet, so no need to grab the Storage lock. + this->CuDNNAlgoSetter(rctx, in_shape, out_shape, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + fwd, bwd, flt); + } else { + // One potential problem is that cudnnFind() uses cudaMalloc() to directly allocate + // I/O and workspace areas, and these allocations may result in an out-of-memory + // error even though the StorageMangager free pool is not empty. Ideally, cudnnFind + // would use MXNet's storage allocator for its I/O and workspace areas, instead of using + // the area carved out by MXNET_GPU_MEM_POOL_RESERVE. 
+ // To get somewhat the same effect as this, we can pre-allocate the areas needed for the + // I/Os (possibly triggering a desirable StorageManager::ReleaseAll()), followed by a + // DirectFree(), which makes these areas available for cudnn's subsequent cudaMalloc(). + + // Allocate for x (or dx), w (or dw) and y (or dy). + ReserveElements({in_shape[conv::kData].Size(), + in_shape[conv::kWeight].Size(), + out_shape[conv::kOut].Size()}); + + // We're about to call cudnnFind so we need to quiet the system by grabbing + // the Storage lock. Concurrent cudaMalloc's can disrupt the accurate timing + // measurements of the algos, and can prevent the cuda driver's proper freeing + // of cudnnFind's internal temporary allocations. Grabbing the lock might also + // impede other threads from launching work on the GPU. + std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU)); + this->CuDNNAlgoSetter(rctx, in_shape, out_shape, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + fwd, bwd, flt); + } + }; + + CuDNNConvAlgoReg::Get()->FindOrElseRegister(param_, in_shape, out_shape, dtype_, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + SMArch(rctx.ctx.dev_id), add_to_weight_, + &forward_algo_, &back_algo_, &back_algo_w_, algo_setter); + // If we're allowing Tensor Core variants of the algos to be considered in // *Find*() or *Get*(), but a non-Tensor-Core algo variant is the fastest, // we must change the descriptor to preclude Tensor Core. Simplest is to @@ -877,6 +907,7 @@ class CuDNNConvolutionOp { << " please consider reducing batch/model size or increasing the workspace size"; } + void GetTempSize(const OpContext& ctx) { mshadow::Stream<gpu> *s = ctx.get_stream<gpu>(); size_t back_size = 0, back_size_w = 0; @@ -975,6 +1006,25 @@ class CuDNNConvolutionOp { return c; } + // Make a number of allocations and directly free them, ensuring room for an equivalent set of + // cudaMalloc() calls by (say) cudnnFind(). `elements` spec the alloc size in DTypes, not bytes. + void ReserveElements(const std::vector<size_t> &elements) { + std::vector<Storage::Handle> handles; + for (size_t alloc_element : elements) + handles.push_back(Storage::Get()->Alloc(alloc_element * sizeof(DType), Context::GPU())); + for (auto &handle : handles) + Storage::Get()->DirectFree(handle); + } + + // Log that no suitable algo was found that met the workspace constraints, then exit. + void LogNoSuitableAlgoAndExit(int num_algos_tried, size_t min_memory_needs, + size_t workspace_byte, std::string algo_kind) { + LOG(FATAL) << num_algos_tried << " " << algo_kind << " with minimum memory requirement " + << min_memory_needs << " bytes have been tried. 
Workspace size is set to " + << workspace_byte << " bytes, please consider reducing the batch/model size, " + << "or increasing workspace size."; + } + std::vector<int> param_stride_; std::vector<int> param_dilate_; std::vector<int> param_pad_; diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index 041bea66f7b..c0c56507bbf 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -26,6 +26,7 @@ #ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_DECONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_CUDNN_CUDNN_DECONVOLUTION_INL_H_ +#include <mxnet/storage.h> #include <algorithm> #include <vector> #include <mutex> @@ -538,245 +539,273 @@ class CuDNNDeconvolutionOp { } } - void SelectAlgo(const RunContext& rctx, - const std::vector<TShape>& in_shape, - const std::vector<TShape>& out_shape, - cudnnDataType_t cudnn_forward_compute_type, - cudnnDataType_t cudnn_backward_compute_type) { - if (!CuDNNDeconvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_, - cudnn_forward_compute_type, - cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), add_to_weight_, - &forward_algo_, &back_algo_, &back_algo_w_)) { - mshadow::Stream <gpu> *s = rctx.get_stream<gpu>(); - CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle); - size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType)); - #if CUDNN_MAJOR >= 7 - // Starting with cuDNNv7, the algo number returned by *Get*() is not the entire - // story: the notion of whether the algo ran in Tensor Core mode is not known. - // Since we want to report the Tensor Core mode in the verbose output, we switch - // to using the new *Get*_v7() call. Since the function signature of *Get*_v7() matches - // that of *Find*(), we can unify the find-vs-get logic by using function pointers. - - // Forward Algorithm Find/Get() v7 - std::vector<cudnnConvolutionFwdAlgoPerf_t> fwd_results(MaxForwardAlgos(s->dnn_handle_)); - int actual_fwd_algos = 0; - auto fwd_algo_discoverer = - param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionForwardAlgorithm_v7 + void CuDNNAlgoSetter(const RunContext& rctx, + const std::vector<TShape>& in_shape, + const std::vector<TShape>& out_shape, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type, + CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd, + CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd, + CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) { + // Not in algo registry, must determine via *Get*() or *Find*() + mshadow::Stream <gpu> *s = rctx.get_stream<gpu>(); + CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle); + size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType)); +#if CUDNN_MAJOR >= 7 + // Starting with cuDNNv7, the algo number returned by *Get*() is not the entire + // story: the notion of whether the algo ran in Tensor Core mode is not known. + // Since we want to report the Tensor Core mode in the verbose output, we switch + // to using the new *Get*_v7() call. Since the function signature of *Get*_v7() matches + // that of *Find*(), we can unify the find-vs-get logic by using function pointers. + + // Forward Algorithm Find/Get() v7 + std::vector<cudnnConvolutionFwdAlgoPerf_t> fwd_results(MaxForwardAlgos(s->dnn_handle_)); + int actual_fwd_algos = 0; + auto fwd_algo_discoverer = + param_.cudnn_tune.value() == deconv::kOff ? 
cudnnGetConvolutionForwardAlgorithm_v7 : cudnnFindConvolutionForwardAlgorithm; - CUDNN_CALL((*fwd_algo_discoverer)(s->dnn_handle_, - out_desc_, - filter_desc_, - back_conv_desc_, // fwd algo used to backprop-to-data - in_desc_, - fwd_results.size(), - &actual_fwd_algos, - fwd_results.data())); - fwd_results.resize(actual_fwd_algos); - AlgoFinalSelect<cudnnConvolutionFwdAlgoPerf_t, - cudnnConvolutionFwdAlgo_t>(fwd_results, "forward", - workspace_byte, &forward_algo_); - - // Backprop-to-Filter Algorithm Find/Get() v7 - auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_); - std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos); - int actual_bwd_filter_algos = 0; - // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we - // were summing into the output (i.e. beta != 0). Get() returned OK algos though. - auto bwd_filter_algo_discoverer = - param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 - : cudnnFindConvolutionBackwardFilterAlgorithm; - CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_, - out_desc_, - in_desc_, - back_conv_desc_, - filter_desc_, - bwd_filt_results.size(), - &actual_bwd_filter_algos, - bwd_filt_results.data())); - bwd_filt_results.resize(actual_bwd_filter_algos); - AlgoFinalSelect<cudnnConvolutionBwdFilterAlgoPerf_t, - cudnnConvolutionBwdFilterAlgo_t>(bwd_filt_results, "backprop-to-filter", - workspace_byte, &back_algo_w_); - - // Backprop-to-Data Algorithm Find/Get() v7 - auto max_bwd_data_algos = MaxBackwardDataAlgos(s->dnn_handle_); - std::vector<cudnnConvolutionBwdDataAlgoPerf_t> bwd_data_results(max_bwd_data_algos); - int actual_bwd_data_algos = 0; - auto bwd_data_algo_discoverer = - param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardDataAlgorithm_v7 + CUDNN_CALL((*fwd_algo_discoverer)(s->dnn_handle_, + out_desc_, + filter_desc_, + back_conv_desc_, // fwd algo used to backprop-to-data + in_desc_, + fwd_results.size(), + &actual_fwd_algos, + fwd_results.data())); + fwd_results.resize(actual_fwd_algos); + AlgoFinalSelect<cudnnConvolutionFwdAlgoPerf_t, + cudnnConvolutionFwdAlgo_t>(fwd_results, "forward", + workspace_byte, fwd); + + // Backprop-to-Filter Algorithm Find/Get() v7 + auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_); + std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos); + int actual_bwd_filter_algos = 0; + // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we + // were summing into the output (i.e. beta != 0). Get() returned OK algos though. + auto bwd_filter_algo_discoverer = + param_.cudnn_tune.value() == deconv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 + : cudnnFindConvolutionBackwardFilterAlgorithm; + CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_, + out_desc_, + in_desc_, + back_conv_desc_, + filter_desc_, + bwd_filt_results.size(), + &actual_bwd_filter_algos, + bwd_filt_results.data())); + bwd_filt_results.resize(actual_bwd_filter_algos); + AlgoFinalSelect<cudnnConvolutionBwdFilterAlgoPerf_t, + cudnnConvolutionBwdFilterAlgo_t>(bwd_filt_results, "backprop-to-filter", + workspace_byte, flt); + // Backprop-to-Data Algorithm Find/Get() v7 + auto max_bwd_data_algos = MaxBackwardDataAlgos(s->dnn_handle_); + std::vector<cudnnConvolutionBwdDataAlgoPerf_t> bwd_data_results(max_bwd_data_algos); + int actual_bwd_data_algos = 0; + auto bwd_data_algo_discoverer = + param_.cudnn_tune.value() == deconv::kOff ? 
cudnnGetConvolutionBackwardDataAlgorithm_v7 : cudnnFindConvolutionBackwardDataAlgorithm; - CUDNN_CALL((*bwd_data_algo_discoverer)(s->dnn_handle_, + CUDNN_CALL((*bwd_data_algo_discoverer)(s->dnn_handle_, + filter_desc_, + in_desc_, + forward_conv_desc_, // bwd algo used in inference + out_desc_, + bwd_data_results.size(), + &actual_bwd_data_algos, + bwd_data_results.data())); + bwd_data_results.resize(actual_bwd_data_algos); + AlgoFinalSelect<cudnnConvolutionBwdDataAlgoPerf_t, + cudnnConvolutionBwdDataAlgo_t>(bwd_data_results, "backprop-to-data", + workspace_byte, bwd); +#else + // CUDNN_MAJOR < 7 + const int kMaxAlgos = 10; + int nalgo = kMaxAlgos; + int i = 0; + size_t min_memory_needs = 0; + // Forward Algorithm Find/Get, v6 and earlier + if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) { + // In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is + // supported. Hard-coded this since the algo find() or get() throws an FPE. + fwd->Set(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, false); + } else if (!param_.cudnn_tune.value()) { + cudnnConvolutionFwdAlgo_t fastest_fwd_algo; + CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_, + out_desc_, + filter_desc_, + back_conv_desc_, // fwd algo used in dgrad + in_desc_, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_byte, + &fastest_fwd_algo)); + fwd->Set(fastest_fwd_algo, false); + } else { + cudnnConvolutionFwdAlgoPerf_t fwd_algo[kMaxAlgos]; + CUDNN_CALL(cudnnFindConvolutionForwardAlgorithm(s->dnn_handle_, + out_desc_, + filter_desc_, + back_conv_desc_, // fwd algo used in dgrad + in_desc_, + kMaxAlgos, + &nalgo, + fwd_algo)); + i = 0; + while (i < nalgo + && (fwd_algo[i].status != CUDNN_STATUS_SUCCESS + || (param_.cudnn_tune.value() == deconv::kLimited + && fwd_algo[i].memory > workspace_byte))) { + ++i; + min_memory_needs = (i == 0) ? + fwd_algo[i].memory : + std::min(min_memory_needs, fwd_algo[i].memory); + } + if (i == nalgo) { + LogNoSuitableAlgoAndExit(nalgo, min_memory_needs, workspace_byte, + "forward algos (for use in deconv op backprop-to-data)"); + } else { + fwd->Set(fwd_algo[i].algo, false); + } + } + // Backprop-to-Filter Algorithm Find/Get, v6 and earlier + if (!param_.cudnn_tune.value()) { + cudnnConvolutionBwdFilterAlgo_t fastest_bwd_filt_algo; + CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_, + out_desc_, + in_desc_, + back_conv_desc_, + filter_desc_, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_byte, + &fastest_bwd_filt_algo)); + flt->Set(fastest_bwd_filt_algo, false); + } else { + cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_algo[kMaxAlgos]; + CUDNN_CALL(cudnnFindConvolutionBackwardFilterAlgorithm(s->dnn_handle_, + out_desc_, + in_desc_, + back_conv_desc_, + filter_desc_, + kMaxAlgos, + &nalgo, + bwd_filter_algo)); + i = 0; + while (i < nalgo + && (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS + || (param_.cudnn_tune.value() == deconv::kLimited + && bwd_filter_algo[i].memory > workspace_byte))) { + ++i; + min_memory_needs = (i == 0) ? 
+ bwd_filter_algo[i].memory : + std::min(min_memory_needs, bwd_filter_algo[i].memory); + } + if (i == nalgo) { + LogNoSuitableAlgoAndExit(nalgo, min_memory_needs, workspace_byte, + "backward filter algos (for use in deconv op backprop-to-filter)"); + } else { + flt->Set(bwd_filter_algo[i].algo, false); + } + } + // Backprop-to-Data Algorithm Get(), v6 and earlier + if (!param_.cudnn_tune.value()) { + cudnnConvolutionBwdDataAlgo_t fastest_bwd_data_algo; + CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_, + filter_desc_, + in_desc_, + forward_conv_desc_, // bwd algo used for inference + out_desc_, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_byte, + &fastest_bwd_data_algo)); + bwd->Set(fastest_bwd_data_algo, false); + } else { + cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo[kMaxAlgos]; + CUDNN_CALL(cudnnFindConvolutionBackwardDataAlgorithm(s->dnn_handle_, filter_desc_, in_desc_, forward_conv_desc_, // bwd algo used in inference out_desc_, - bwd_data_results.size(), - &actual_bwd_data_algos, - bwd_data_results.data())); - bwd_data_results.resize(actual_bwd_data_algos); - AlgoFinalSelect<cudnnConvolutionBwdDataAlgoPerf_t, - cudnnConvolutionBwdDataAlgo_t>(bwd_data_results, "backprop-to-data", - workspace_byte, &back_algo_); - #else - // CUDNN_MAJOR < 7 - const int kMaxAlgos = 10; - int nalgo = kMaxAlgos; - int i = 0; - size_t min_memory_needs = 0; - // Forward Algorithm Find/Get, v6 and earlier - if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) { - // In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is - // supported. Hard-coded this since the algo find() or get() throws an FPE. - forward_algo_.Set(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, false); - } else if (!param_.cudnn_tune.value()) { - cudnnConvolutionFwdAlgo_t fastest_fwd_algo; - CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_, - out_desc_, - filter_desc_, - back_conv_desc_, // fwd algo used in dgrad - in_desc_, - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_byte, - &fastest_fwd_algo)); - forward_algo_.Set(fastest_fwd_algo, false); - } else { - cudnnConvolutionFwdAlgoPerf_t fwd_algo[kMaxAlgos]; - CUDNN_CALL(cudnnFindConvolutionForwardAlgorithm(s->dnn_handle_, - out_desc_, - filter_desc_, - back_conv_desc_, // fwd algo used in dgrad - in_desc_, - kMaxAlgos, - &nalgo, - fwd_algo)); - i = 0; - while (i < nalgo - && (fwd_algo[i].status != CUDNN_STATUS_SUCCESS - || (param_.cudnn_tune.value() == deconv::kLimited - && fwd_algo[i].memory > workspace_byte))) { - ++i; - min_memory_needs = (i == 0) ? - fwd_algo[i].memory : - std::min(min_memory_needs, fwd_algo[i].memory); - } - if (i == nalgo) { - LOG(FATAL) << nalgo << " forward algorithms" - << " (for use in deconvolution operator backprop-to-data)" - << " with minimum memory requirement " << min_memory_needs - << " bytes have been tried. Workspace size is set to " << workspace_byte - << " bytes, please consider reducing the batch/model size," - << " or increasing workspace size."; - } else { - forward_algo_.Set(fwd_algo[i].algo, false); - } + kMaxAlgos, + &nalgo, + bwd_data_algo)); + i = 0; + while (i < nalgo + && (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS + || (param_.cudnn_tune.value() == deconv::kLimited + && bwd_data_algo[i].memory > workspace_byte))) { + ++i; + min_memory_needs = (i == 0) ? 
+ bwd_data_algo[i].memory : + std::min(min_memory_needs, bwd_data_algo[i].memory); } - // Backprop-to-Filter Algorithm Find/Get, v6 and earlier - if (!param_.cudnn_tune.value()) { - cudnnConvolutionBwdFilterAlgo_t fastest_bwd_filt_algo; - CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_, - out_desc_, - in_desc_, - back_conv_desc_, - filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_byte, - &fastest_bwd_filt_algo)); - back_algo_w_.Set(fastest_bwd_filt_algo, false); + if (i == nalgo) { + LogNoSuitableAlgoAndExit(nalgo, min_memory_needs, workspace_byte, + "backward data algos (for use in deconv op forward inference)"); } else { - cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_algo[kMaxAlgos]; - CUDNN_CALL(cudnnFindConvolutionBackwardFilterAlgorithm(s->dnn_handle_, - out_desc_, - in_desc_, - back_conv_desc_, - filter_desc_, - kMaxAlgos, - &nalgo, - bwd_filter_algo)); - i = 0; - while (i < nalgo - && (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS - || (param_.cudnn_tune.value() == deconv::kLimited - && bwd_filter_algo[i].memory > workspace_byte))) { - ++i; - min_memory_needs = (i == 0) ? - bwd_filter_algo[i].memory : - std::min(min_memory_needs, bwd_filter_algo[i].memory); - } - if (i == nalgo) { - LOG(FATAL) << nalgo << " backward filter algorithms" - << " (for use in deconvolution operator backprop-to-filter)" - << " with minimum memory requirement " << min_memory_needs - << " bytes have been tried. Workspace size is set to " << workspace_byte - << " bytes, please consider reducing the batch/model size," - << " or increasing workspace size."; - } else { - back_algo_w_.Set(bwd_filter_algo[i].algo, false); - } + bwd->Set(bwd_data_algo[i].algo, false); } - // Backprop-to-Data Algorithm Get(), v6 and earlier - if (!param_.cudnn_tune.value()) { - cudnnConvolutionBwdDataAlgo_t fastest_bwd_data_algo; - CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_, - filter_desc_, - in_desc_, - forward_conv_desc_, // bwd algo used for inference - out_desc_, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_byte, - &fastest_bwd_data_algo)); - back_algo_.Set(fastest_bwd_data_algo, false); + } +#endif // CUDNN_MAJOR < 7 + + // Fix for issue #11241 + int cudnn_find_issue_max_features = 64 * 1024; + // With deconvolution, the algo sensitivity is to a large number of output features + if (add_to_weight_ && Features(out_shape[deconv::kOut]) >= cudnn_find_issue_max_features) { + flt->Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + } + } + + void SelectAlgo(const RunContext& rctx, + const std::vector<TShape>& in_shape, + const std::vector<TShape>& out_shape, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type) { + auto algo_setter = [&](CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd, + CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd, + CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) { + if (param_.cudnn_tune.value() == deconv::kOff) { + // The routine will only be calling cudnnGet, so no need to grab the Storage lock. 
+ this->CuDNNAlgoSetter(rctx, in_shape, out_shape, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + fwd, bwd, flt); } else { - cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo[kMaxAlgos]; - CUDNN_CALL(cudnnFindConvolutionBackwardDataAlgorithm(s->dnn_handle_, - filter_desc_, - in_desc_, - forward_conv_desc_, // bwd algo used in inference - out_desc_, - kMaxAlgos, - &nalgo, - bwd_data_algo)); - i = 0; - while (i < nalgo - && (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS - || (param_.cudnn_tune.value() == deconv::kLimited - && bwd_data_algo[i].memory > workspace_byte))) { - ++i; - min_memory_needs = (i == 0) ? - bwd_data_algo[i].memory : - std::min(min_memory_needs, bwd_data_algo[i].memory); - } - if (i == nalgo) { - LOG(FATAL) << nalgo << " backward data algorithms" - << " (for use in deconvolution operator forward inference) with" - << " minimum memory requirement " << min_memory_needs - << " bytes have been tried. Workspace size is set to " << workspace_byte - << " bytes, please consider reducing the batch/model size," - << " or increasing workspace size."; - } else { - back_algo_.Set(bwd_data_algo[i].algo, false); - } - } - #endif // CUDNN_MAJOR < 7 + // One potential problem is that cudnnFind() uses cudaMalloc() to directly allocate + // I/O and workspace areas, and these allocations may result in an out-of-memory + // error even though the StorageMangager free pool is not empty. Ideally, cudnnFind + // would use MXNet's storage allocator for its I/O and workspace areas, instead of using + // the area carved out by MXNET_GPU_MEM_POOL_RESERVE. + // To get somewhat the same effect as this, we can pre-allocate the areas needed for the + // I/Os (possibly triggering a desirable StorageManager::ReleaseAll()), followed by a + // DirectFree(), which makes these areas available for cudnn's subsequent cudaMalloc(). + + // Allocate for x (or dx), w (or dw) and y (or dy). + ReserveElements({in_shape[conv::kData].Size(), + in_shape[conv::kWeight].Size(), + out_shape[conv::kOut].Size()}); - // Fix for issue #11241 - int cudnn_find_issue_max_features = 64 * 1024; - // With deconvolution, the algo sensitivity is to a large number of output features - if (add_to_weight_ && Features(out_shape[deconv::kOut]) >= cudnn_find_issue_max_features) { - this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + // We're about to call cudnnFind so we need to quiet the system by grabbing + // the Storage lock. Concurrent cudaMalloc's can disrupt the accurate timing + // measurements of the algos, and can prevent the cuda driver's proper freeing + // of cudnnFind's internal temporary allocations. Grabbing the lock might also + // impede other threads from launching work on the GPU. + std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU)); + this->CuDNNAlgoSetter(rctx, in_shape, out_shape, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + fwd, bwd, flt); } + }; + + // An algo specification by the user may be cached here, but another + // convolution will match only if identically specified. + // We're caching results of *Get* as well as *Find*, but these records + // will be held distinctly because param_.cudnn_tune is part of the key. 
+ CuDNNDeconvAlgoReg::Get()->FindOrElseRegister(param_, in_shape, out_shape, dtype_, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + SMArch(rctx.ctx.dev_id), add_to_weight_, + &forward_algo_, &back_algo_, &back_algo_w_, algo_setter); - // An algo specification by the user may be cached here, but another - // convolution will match only if identically specified. - // We're caching results of *Get* as well as *Find*, but these records - // will be held distinctly because param_.cudnn_tune is part of the key. - CuDNNDeconvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_, - cudnn_forward_compute_type, - cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), this->add_to_weight_, - this->forward_algo_, - this->back_algo_, this->back_algo_w_); - } // If we're allowing Tensor Core variants of the algos to be considered in // *Find*() or *Get*(), but a non-Tensor-Core algo variant is the fastest, // we must change the descriptor to preclude Tensor Core. Simplest is to @@ -818,6 +847,7 @@ class CuDNNDeconvolutionOp { << " please consider reducing batch/model size or increasing the workspace size"; } + void GetTempSize(const OpContext& ctx) { mshadow::Stream<gpu> *s = ctx.get_stream<gpu>(); size_t back_data_algo_workspace_size = 0; @@ -921,6 +951,26 @@ class CuDNNDeconvolutionOp { return c; } + // Make a number of allocations and directly free them, ensuring room for an equivalent set of + // cudaMalloc() calls by (say) cudnnFind(). `elements` spec the alloc size in DTypes, not bytes. + void ReserveElements(const std::vector<size_t> &elements) { + std::vector<Storage::Handle> handles; + for (size_t alloc_element : elements) + handles.push_back(Storage::Get()->Alloc(alloc_element * sizeof(DType), Context::GPU())); + for (auto &handle : handles) + Storage::Get()->DirectFree(handle); + } + + + // Log that no suitable algo was found that met the workspace constraints, then exit. + void LogNoSuitableAlgoAndExit(int num_algos_tried, size_t min_memory_needs, + size_t workspace_byte, std::string algo_kind) { + LOG(FATAL) << num_algos_tried << " " << algo_kind << " with minimum memory requirement " + << min_memory_needs << " bytes have been tried. Workspace size is set to " + << workspace_byte << " bytes, please consider reducing the batch/model size, " + << "or increasing workspace size."; + } + std::vector<int> param_stride_; std::vector<int> param_dilate_; diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index f3a9b16cdd8..cade8d9495f 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -57,6 +57,11 @@ class GPUPooledStorageManager final : public StorageManager { GPUPooledStorageManager() { reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); + large_alloc_round_size_ = dmlc::GetEnv("MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE", 2 * 1024 * 1024); + if (large_alloc_round_size_ <= 0) { + LOG(FATAL) << "MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE cannot be set to a value <= 0, found: " + << large_alloc_round_size_; + } if (page_size_ < NDEV) { LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than " << NDEV \ << ". 
Got " << page_size_ << "."; @@ -80,7 +85,7 @@ class GPUPooledStorageManager final : public StorageManager { private: void DirectFreeNoLock(Storage::Handle handle) { cudaError_t err = cudaFree(handle.dptr); - size_t size = std::max(handle.size, page_size_); + size_t size = RoundAllocSize(handle.size); // ignore unloading error, as memory has already been recycled if (err != cudaSuccess && err != cudaErrorCudartUnloading) { LOG(FATAL) << "CUDA: " << cudaGetErrorString(err); @@ -88,12 +93,31 @@ class GPUPooledStorageManager final : public StorageManager { used_memory_ -= size; } + // Round a value 'x' up to the next multiple of 'multiple' + size_t RoundToMultiple(size_t x, size_t multiple) { + size_t retVal = ((x + multiple - 1) / multiple) * multiple; + return retVal; + } + + size_t RoundAllocSize(size_t size) { + // Round up small allocs to the page_size_ to consolidate the pool lookups + size = std::max(size, page_size_); + // To ensure proper freeing under some driver variants, make sure + // large allocs entirely occupy their slabs, which cannot then be + // locked by smaller permanent allocations sharing the slab. + if (size > large_alloc_round_size_) + size = RoundToMultiple(size, large_alloc_round_size_); + return size; + } + private: void ReleaseAll(); // used memory size_t used_memory_ = 0; // page size size_t page_size_; + // size that large allocations should be rounded to, for proper freeing. + size_t large_alloc_round_size_; // percentage of reserved memory int reserve_; // number of devices @@ -105,7 +129,7 @@ class GPUPooledStorageManager final : public StorageManager { void GPUPooledStorageManager::Alloc(Storage::Handle* handle) { std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU)); - size_t size = std::max(handle->size, page_size_); + size_t size = RoundAllocSize(handle->size); auto&& reuse_it = memory_pool_.find(size); if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) { size_t free, total; @@ -130,7 +154,7 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) { void GPUPooledStorageManager::Free(Storage::Handle handle) { std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU)); - size_t size = std::max(handle.size, page_size_); + size_t size = RoundAllocSize(handle.size); auto&& reuse_pool = memory_pool_[size]; reuse_pool.push_back(handle.dptr); } diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 8394276c8ef..8ada95b3bf2 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -25,12 +25,14 @@ import mxnet as mx import numpy as np import unittest +import math from nose.tools import assert_raises from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal from mxnet.base import MXNetError from mxnet import autograd from numpy.testing import assert_allclose + curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import setup_module, with_seed, teardown, assert_raises_cudnn_disabled @@ -57,7 +59,7 @@ def check_rnn_layer(layer): for g, c in zip(gs, cs): assert_almost_equal(g.asnumpy(), c.asnumpy(), rtol=1e-2, atol=1e-6) - +@with_seed() def check_rnn_layer_w_rand_inputs(layer): layer.collect_params().initialize(ctx=[mx.cpu(0), mx.gpu(0)]) x = mx.nd.uniform(shape=(10, 16, 30)) @@ -186,7 +188,7 @@ def _syncParameters(bn1, bn2, ctx): input2grad = mx.nd.concat(*[output.grad.as_in_context(input.context) for output in 
inputs2], dim=0) assert_almost_equal(input1.grad.asnumpy(), input2grad.asnumpy(), atol=1e-3, rtol=1e-3) - +@with_seed() def test_sync_batchnorm(): def get_num_devices(): for i in range(100): @@ -203,6 +205,7 @@ def get_num_devices(): _check_batchnorm_result(mx.nd.random.uniform(shape=(4, 1, 4, 4)), num_devices=ndev, cuda=True) + @with_seed() def test_symbol_block_fp16(): # Test case to verify if initializing the SymbolBlock from a model with params @@ -233,6 +236,45 @@ def test_symbol_block_fp16(): break assert np.dtype(net_fp16.params[param_name].dtype) == np.dtype(np.float16) + +@with_seed() +def test_large_models(): + ctx = default_context() + # Create model + net = gluon.nn.HybridSequential() + + largest_num_features = 256 + with net.name_scope(): + net.add(nn.Conv2D(largest_num_features, 3)) + + net.hybridize() + net.initialize(mx.init.Normal(sigma=0.01), ctx=ctx) + + # Compute the height (=width) of the square tensor of the given size in bytes + def tensor_size(big_tensor_bytes): + bytes_per_float = 4 + sz = int(math.sqrt(big_tensor_bytes / largest_num_features / bytes_per_float)) + return (sz // 100) * 100 + + # The idea is to create models with large tensors of (say) 20% of the total memory. + # This in the past has given cudnnFind() trouble when it needed to allocate similar I/O's + # from the area carved out by the MXNET_GPU_MEM_POOL_RESERVE setting (by default 5%). + (free_mem_bytes, total_mem_bytes) = mx.context.gpu_memory_info(ctx.device_id) + start_size = tensor_size(0.20 * total_mem_bytes) + num_trials = 10 + sys.stderr.write(' testing global memory of size {} ... '.format(total_mem_bytes)) + sys.stderr.flush() + for i in range(num_trials): + sz = start_size - 10 * i + (height, width) = (sz,sz) + sys.stderr.write(" {}x{} ".format(height,width)) + sys.stderr.flush() + data_in = nd.random_uniform(low=0, high=255, shape=(1, 3, height, width), + ctx=ctx, dtype="float32") + # Evaluate model + net(data_in).asnumpy() + + if __name__ == '__main__': import nose nose.runmodule()
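
For anyone wanting to exercise the new memory query from Python, here is a minimal usage sketch; it assumes a build that includes this PR and at least one visible CUDA GPU:

    import mxnet as mx

    # gpu_memory_info() wraps the new MXGetGPUMemoryInformation64() C API and
    # returns the free and total GPU global memory in bytes for the given device.
    free_bytes, total_bytes = mx.context.gpu_memory_info(device_id=0)
    print('GPU 0: {} bytes free of {} bytes total'.format(free_bytes, total_bytes))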

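The large-allocation rounding added to GPUPooledStorageManager can be summarized with a small Python model of the RoundAllocSize()/RoundToMultiple() logic in the diff above. This is a sketch for illustration only (not MXNet API), using the default page size of 4096 and large-alloc round size of 2097152:

    PAGE_SIZE = 4096                       # MXNET_GPU_MEM_POOL_PAGE_SIZE default
    LARGE_ALLOC_ROUND_SIZE = 2 * 1024**2   # MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE default

    def round_to_multiple(x, multiple):
        # Round x up to the next multiple of 'multiple'.
        return ((x + multiple - 1) // multiple) * multiple

    def round_alloc_size(size):
        # Small allocs are rounded up to the page size to consolidate pool lookups.
        size = max(size, PAGE_SIZE)
        # Large allocs are rounded so they entirely occupy their slabs, which then
        # cannot be pinned by smaller allocations sharing the slab.
        if size > LARGE_ALLOC_ROUND_SIZE:
            size = round_to_multiple(size, LARGE_ALLOC_ROUND_SIZE)
        return size

    assert round_alloc_size(1000) == 4096                  # small alloc -> one page
    assert round_alloc_size(3 * 1024**2) == 4 * 1024**2    # 3 MiB -> next 2 MiB multiple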