[GitHub] lebeg commented on a change in pull request #11041: gpu mem pool strategy

2018-06-12 Thread GitBox
lebeg commented on a change in pull request #11041: gpu mem pool strategy
URL: https://github.com/apache/incubator-mxnet/pull/11041#discussion_r194660923
 
 

 ##
 File path: src/storage/pooled_storage_manager.h
 ##
 @@ -129,13 +139,172 @@ void GPUPooledStorageManager::ReleaseAll() {
     for (auto&& j : i.second) {
       Storage::Handle handle;
       handle.dptr = j;
-      handle.size = i.first - NDEV;
+      handle.size = i.first;
       DirectFreeNoLock(handle);
     }
   }
   memory_pool_.clear();
 }
 
+/*!
+ * \brief Storage manager with a memory pool, with rounded size, on gpu.
+ *
+ * This GPU mem pool uses a mixture of nearest pow2 (exponential) rounding and
+ * nearest multiple (linear) rounding to help alleviate the memory allocation stress
+ * in which the default naive exact-size-match pool falls short, such as in
+ * variable-length input/output cases like RNN workloads.
+ *
+ * \param cutoff the cutoff at which rounding switches from exponential to linear.
+ * It is set through the MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF environment
+ * variable and must be between 20 (1 MB) and 34 (16 GB).
+ * Suppose the cutoff is X; the memory size buckets look like this:
+ * exp2(0), exp2(1), ..., exp2(X), 2*exp2(X), 3*exp2(X), ...
+ */
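// An editor-added illustration, not part of the patch: with the default
// X = 24 the bucket sizes run 1, 2, 4, ..., 8 MB, 16 MB, then 32 MB, 48 MB,
// 64 MB, ...; that is, powers of two up to exp2(24) = 16 MB, followed by
// steps of 16 MB.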
+class GPUPooledRoundedStorageManager final : public StorageManager {
+ public:
+  /*!
+   * \brief Default constructor.
+   */
+  GPUPooledRoundedStorageManager() {
+    reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5);
+    page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096);
+    cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24);
+    if (page_size_ < 32) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. "
+                 << "Got: " << page_size_ << ".";
+    }
+    if (page_size_ != 1ul << log2_round_up(page_size_)) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE must be a power of 2. Got: " << page_size_ << ".";
+    }
+    page_size_ = log2_round_up(page_size_);
+    if (cut_off_ < 20 || cut_off_ > LOG2_MAX_MEM) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value "
+                 << "smaller than 20 or greater than " << LOG2_MAX_MEM << ". Got: " << cut_off_ << ".";
+    }
+    if (cut_off_ < page_size_) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value "
+                 << "smaller than log2 of MXNET_GPU_MEM_POOL_PAGE_SIZE. Got: "
+                 << cut_off_ << " vs " << page_size_ << ".";
+    }
+    memory_pool_ = std::vector<std::vector<void*>>((1ul << (LOG2_MAX_MEM - cut_off_)) + cut_off_);
+  }
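  // Worked example (editorial, assuming the defaults above): cut_off_ = 24 and
  // LOG2_MAX_MEM = 34 give (1ul << (34 - 24)) + 24 = 1024 + 24 = 1048 buckets,
  // one per power of two up to exp2(24) and one per 16 MB multiple up to 16 GB.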
+  /*!
+   * \brief Default destructor.
+   */
+  ~GPUPooledRoundedStorageManager() {
+    ReleaseAll();
+  }
+
+  void Alloc(Storage::Handle* handle) override;
+  void Free(Storage::Handle handle) override;
+
+  void DirectFree(Storage::Handle handle) override {
+    std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+    DirectFreeNoLock(handle);
+  }
+
+ private:
+  inline int log2_round_up(size_t s) {
+    return static_cast<int>(std::ceil(std::log2(s)));
+  }
+  inline int div_pow2_round_up(size_t s, int divisor_log2) {
+    // (1025, 10) -> 2
+    // (2048, 10) -> 2
+    // (2049, 10) -> 3
+    size_t result = s >> divisor_log2;
+    return static_cast<int>(result + (s > (result << divisor_log2) ? 1 : 0));
+  }
+  inline int get_bucket(size_t s) {
+    int log_size = log2_round_up(s);
+    if (log_size > static_cast<int>(cut_off_))
+      return div_pow2_round_up(s, cut_off_) - 1 + cut_off_;
+    else
+      return std::max(log_size, static_cast<int>(page_size_));
+  }
+  inline size_t get_size(int bucket) {
+    if (bucket <= static_cast<int>(cut_off_))
+      return 1ul << bucket;
+    else
+      return (bucket - cut_off_ + 1) * (1ul << cut_off_);
+  }
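  // A hedged round-trip illustration (editorial; assumes the default
  // cut_off_ = 24 and page_size_ = 12 after the log2 conversion above):
  //   get_size(get_bucket(3000))     -> bucket 12, size 4096 (pow2 range)
  //   get_size(get_bucket(40 << 20)) -> bucket 26, size 48 MB = 3 * exp2(24)
  //                                     (linear range above the cutoff)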
+
+  void DirectFreeNoLock(Storage::Handle handle) {
+    cudaError_t err = cudaFree(handle.dptr);
+    size_t size = get_size(get_bucket(handle.size));
+    // ignore unloading error, as memory has already been recycled
+    if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
+      LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
+    }
+    used_memory_ -= size;
+  }
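  // Editorial note, an assumption since Alloc is truncated below: freeing with
  // get_size(get_bucket(handle.size)) mirrors the rounded size Alloc would
  // have added to used_memory_, keeping the counter symmetric.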
+
+ private:
+  void ReleaseAll();
+  // number of devices
+  const int NDEV = 32;
+  // log2 of maximum memory size. 16GB
+  const size_t LOG2_MAX_MEM = 34;
+  // address width in bits
+  static const int addr_width = sizeof(size_t) * 8;
+  // used memory
+  size_t used_memory_ = 0;
+  // log2 of page size
+  size_t page_size_;
+  // log2 of memory size at which rounding switches from exponential to linear
+  size_t cut_off_;
+  // percentage of reserved memory
+  int reserve_;
+  // memory pool
+  std::vector<std::vector<void*>> memory_pool_;
+  DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager);
+};  // class GPUPooledRoundedStorageManager
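// A minimal selection sketch (an assumption by the editor, not code from this
// patch; the actual dispatch lives elsewhere in the PR). It shows how a caller
// might pick the rounded pool via an environment variable:
//
//   std::string strategy = dmlc::GetEnv("MXNET_GPU_MEM_POOL_TYPE", std::string("Naive"));
//   StorageManager* mgr = (strategy == "Round")
//       ? static_cast<StorageManager*>(new GPUPooledRoundedStorageManager())
//       : static_cast<StorageManager*>(new GPUPooledStorageManager());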
+
+void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
+  std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));

[GitHub] lebeg commented on a change in pull request #11041: gpu mem pool strategy

2018-06-12 Thread GitBox
lebeg commented on a change in pull request #11041: gpu mem pool strategy
URL: https://github.com/apache/incubator-mxnet/pull/11041#discussion_r194661450
 
 
