[GitHub] lebeg commented on a change in pull request #11041: gpu mem pool strategy
lebeg commented on a change in pull request #11041: gpu mem pool strategy URL: https://github.com/apache/incubator-mxnet/pull/11041#discussion_r194660923 ## File path: src/storage/pooled_storage_manager.h ## @@ -129,13 +139,172 @@ void GPUPooledStorageManager::ReleaseAll() { for (auto&& j : i.second) { Storage::Handle handle; handle.dptr = j; - handle.size = i.first - NDEV; + handle.size = i.first; DirectFreeNoLock(handle); } } memory_pool_.clear(); } +/*! + * \brief Storage manager with a memory pool, with rounded size, on gpu. + * + * This GPU mem pool uses a mixture of nearest pow2 (exponential) rounding and + * nearest multiple (linear) rounding to help alleviate the memory allocation stress + * in which the default naive exact-size-match pool falls short, such as in variable-length + * input/output cases like RNN workloads. + * + * \param cutoff the cutoff at which rounding is switched from exponential to linear. It's set + * through MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF environment variable. Must be between 20 (1 MB) + * and 34 (16 GB). + * Suppose the cutoff is X, the memory size buckets look like this: + * exp2(0), exp2(1), ..., exp2(X), 2*exp2(X), 3*exp2(X), ... + */ +class GPUPooledRoundedStorageManager final : public StorageManager { + public: + /*! + * \brief Default constructor. + */ + GPUPooledRoundedStorageManager() { +reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); +page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); +cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24); +if (page_size_ < 32) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \ + << "Got: " << page_size_ << "."; +} +if (page_size_ != 1ul << log2_round_up(page_size_)) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE must be a power of 2. 
Got: " << page_size_ << "."; +} +page_size_ = log2_round_up(page_size_); +if (cut_off_ < 20 || cut_off_ > LOG2_MAX_MEM) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \ + << "smaller than 20 or greater than " << LOG2_MAX_MEM << ". Got: " \ + << cut_off_ << "."; +} +if (cut_off_ < page_size_) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \ + << "smaller than log2 of MXNET_GPU_MEM_POOL_PAGE_SIZE. Got: " \ + << cut_off_ << " vs " << page_size_ << "."; +} +memory_pool_ = std::vector>((1ul << (LOG2_MAX_MEM - cut_off_)) + cut_off_); + } + /*! + * \brief Default destructor. + */ + ~GPUPooledRoundedStorageManager() { +ReleaseAll(); + } + + void Alloc(Storage::Handle* handle) override; + void Free(Storage::Handle handle) override; + + void DirectFree(Storage::Handle handle) override { +std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); +DirectFreeNoLock(handle); + } + + private: + inline int log2_round_up(size_t s) { +return static_cast(std::ceil(std::log2(s))); + } + inline int div_pow2_round_up(size_t s, int divisor_log2) { +// (1025, 10) -> 2 +// (2048, 10) -> 2 +// (2049, 10) -> 3 +size_t result = s >> divisor_log2; +return static_cast(result + (s > (result << divisor_log2) ? 
1 : 0)); + } + inline int get_bucket(size_t s) { +int log_size = log2_round_up(s); +if (log_size > static_cast(cut_off_)) + return div_pow2_round_up(s, cut_off_) - 1 + cut_off_; +else + return std::max(log_size, static_cast(page_size_)); + } + inline size_t get_size(int bucket) { +if (bucket <= static_cast(cut_off_)) + return 1ul << bucket; +else + return (bucket - cut_off_ + 1) * (1ul << cut_off_); + } + + void DirectFreeNoLock(Storage::Handle handle) { +cudaError_t err = cudaFree(handle.dptr); +size_t size = get_size(get_bucket(handle.size)); +// ignore unloading error, as memory has already been recycled +if (err != cudaSuccess && err != cudaErrorCudartUnloading) { + LOG(FATAL) << "CUDA: " << cudaGetErrorString(err); +} +used_memory_ -= size; + } + + private: + void ReleaseAll(); + // number of devices + const int NDEV = 32; + // log2 of maximum page size. 16GB + const size_t LOG2_MAX_MEM = 34; + // address width in bits + static const int addr_width = sizeof(size_t) * 8; + // used memory + size_t used_memory_ = 0; + // page size + size_t page_size_; + // log2 of the memory size at which rounding switches from exponential mode to linear mode + size_t cut_off_; + // percentage of reserved memory + int reserve_; + // memory pool + std::vector> memory_pool_; + DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager); +}; // class GPUPooledRoundedStorageManager + +void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) { + std::lock_guard lock(Storage::Get()->GetMutex(Context::kG
[GitHub] lebeg commented on a change in pull request #11041: gpu mem pool strategy
lebeg commented on a change in pull request #11041: gpu mem pool strategy URL: https://github.com/apache/incubator-mxnet/pull/11041#discussion_r194661450 ## File path: src/storage/pooled_storage_manager.h ## @@ -129,13 +139,172 @@ void GPUPooledStorageManager::ReleaseAll() { for (auto&& j : i.second) { Storage::Handle handle; handle.dptr = j; - handle.size = i.first - NDEV; + handle.size = i.first; DirectFreeNoLock(handle); } } memory_pool_.clear(); } +/*! + * \brief Storage manager with a memory pool, with rounded size, on gpu. + * + * This GPU mem pool uses a mixture of nearest pow2 (exponential) rounding and + * nearest multiple (linear) rounding to help alleviate the memory allocation stress + * in which the default naive exact-size-match pool falls short, such as in variable-length + * input/output cases like RNN workloads. + * + * \param cutoff the cutoff at which rounding is switched from exponential to linear. It's set + * through MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF environment variable. Must be between 20 (1 MB) + * and 34 (16 GB). + * Suppose the cutoff is X, the memory size buckets look like this: + * exp2(0), exp2(1), ..., exp2(X), 2*exp2(X), 3*exp2(X), ... + */ +class GPUPooledRoundedStorageManager final : public StorageManager { + public: + /*! + * \brief Default constructor. + */ + GPUPooledRoundedStorageManager() { +reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); +page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); +cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24); +if (page_size_ < 32) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \ + << "Got: " << page_size_ << "."; +} +if (page_size_ != 1ul << log2_round_up(page_size_)) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE must be a power of 2. 
Got: " << page_size_ << "."; +} +page_size_ = log2_round_up(page_size_); +if (cut_off_ < 20 || cut_off_ > LOG2_MAX_MEM) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \ + << "smaller than 20 or greater than " << LOG2_MAX_MEM << ". Got: " \ + << cut_off_ << "."; +} +if (cut_off_ < page_size_) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \ + << "smaller than log2 of MXNET_GPU_MEM_POOL_PAGE_SIZE. Got: " \ + << cut_off_ << " vs " << page_size_ << "."; +} +memory_pool_ = std::vector>((1ul << (LOG2_MAX_MEM - cut_off_)) + cut_off_); + } + /*! + * \brief Default destructor. + */ + ~GPUPooledRoundedStorageManager() { +ReleaseAll(); + } + + void Alloc(Storage::Handle* handle) override; + void Free(Storage::Handle handle) override; + + void DirectFree(Storage::Handle handle) override { +std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); +DirectFreeNoLock(handle); + } + + private: + inline int log2_round_up(size_t s) { +return static_cast(std::ceil(std::log2(s))); + } + inline int div_pow2_round_up(size_t s, int divisor_log2) { +// (1025, 10) -> 2 +// (2048, 10) -> 2 +// (2049, 10) -> 3 +size_t result = s >> divisor_log2; +return static_cast(result + (s > (result << divisor_log2) ? 
1 : 0)); + } + inline int get_bucket(size_t s) { +int log_size = log2_round_up(s); +if (log_size > static_cast(cut_off_)) + return div_pow2_round_up(s, cut_off_) - 1 + cut_off_; +else + return std::max(log_size, static_cast(page_size_)); + } + inline size_t get_size(int bucket) { +if (bucket <= static_cast(cut_off_)) + return 1ul << bucket; +else + return (bucket - cut_off_ + 1) * (1ul << cut_off_); + } + + void DirectFreeNoLock(Storage::Handle handle) { +cudaError_t err = cudaFree(handle.dptr); +size_t size = get_size(get_bucket(handle.size)); +// ignore unloading error, as memory has already been recycled +if (err != cudaSuccess && err != cudaErrorCudartUnloading) { + LOG(FATAL) << "CUDA: " << cudaGetErrorString(err); +} +used_memory_ -= size; + } + + private: + void ReleaseAll(); + // number of devices + const int NDEV = 32; + // log2 of maximum page size. 16GB + const size_t LOG2_MAX_MEM = 34; + // address width in bits + static const int addr_width = sizeof(size_t) * 8; + // used memory + size_t used_memory_ = 0; + // page size + size_t page_size_; + // log2 of the memory size at which rounding switches from exponential mode to linear mode + size_t cut_off_; + // percentage of reserved memory + int reserve_; + // memory pool + std::vector> memory_pool_; + DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager); +}; // class GPUPooledRoundedStorageManager + +void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) { + std::lock_guard lock(Storage::Get()->GetMutex(Context::kG