Repository: incubator-singa Updated Branches: refs/heads/master 4dfee5208 -> 458b0f6a8
SINGA-287 - Add memory size check for cudnn convolution. Add a warning if the cudnn conv workspace size is too large (>512MB); add a size check when allocating memory for a tensor to make sure the size is >= 0. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/458b0f6a Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/458b0f6a Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/458b0f6a Branch: refs/heads/master Commit: 458b0f6a8cb7a05081e1908465c552ba10d2b23f Parents: 4dfee52 Author: Wei Wang <[email protected]> Authored: Tue Dec 20 17:55:50 2016 +0800 Committer: Wei Wang <[email protected]> Committed: Wed Dec 21 18:14:15 2016 +0800 ---------------------------------------------------------------------- python/singa/data.py | 6 ++++-- src/core/device/device.cc | 2 ++ src/core/tensor/tensor.cc | 2 +- src/model/layer/cudnn_convolution.cc | 5 +++++ 4 files changed, 12 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/458b0f6a/python/singa/data.py ---------------------------------------------------------------------- diff --git a/python/singa/data.py b/python/singa/data.py index 4fffd92..3a99ad3 100644 --- a/python/singa/data.py +++ b/python/singa/data.py @@ -112,9 +112,9 @@ class ImageBatchIter: img_label = int(item[1]) img_list.append((img_label, img_path)) index = 0 # index for the image + if self.shuffle: + random.shuffle(img_list) while not self.stop: - if index == 0 and self.shuffle: - random.shuffle(img_list) if not self.queue.full(): x = [] y = np.empty(self.batch_size, dtype=np.int32) @@ -134,6 +134,8 @@ class ImageBatchIter: index += 1 if index == self.num_samples: index = 0 # reset to the first image + if self.shuffle: + random.shuffle(img_list) # enqueue one mini-batch self.queue.put((np.asarray(x), y)) else: 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/458b0f6a/src/core/device/device.cc ---------------------------------------------------------------------- diff --git a/src/core/device/device.cc b/src/core/device/device.cc index 0220df0..cda1b9f 100644 --- a/src/core/device/device.cc +++ b/src/core/device/device.cc @@ -33,6 +33,8 @@ void Device::Exec(function<void(Context*)>&& fn, const vector<Block*> read_block // TODO(wangwei) get Block from the memory manager Block* Device::NewBlock(int size) { + CHECK_GE(size, 0) << "size is negative, could be caused by the type cast " + << "from size_t to int. In that case, the size is too large."; if (size > 0) { void* ptr = Malloc(size); return new Block(ptr, size); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/458b0f6a/src/core/tensor/tensor.cc ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc index 4898594..d40fd88 100644 --- a/src/core/tensor/tensor.cc +++ b/src/core/tensor/tensor.cc @@ -122,7 +122,7 @@ void Tensor::AsType(const DataType type) { } void Tensor::ToDevice(std::shared_ptr<Device> dst) { - // TODO(wangwei) the comparison is very strict. May compare against device ID? + // TODO(wangwei) the comparison is restricted. May compare against device ID? 
if (device_ != dst) { Tensor tmp(shape_, dst, data_type_); if (block_ != nullptr && Size() && block_->initialized()) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/458b0f6a/src/model/layer/cudnn_convolution.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc index 54bba06..196d137 100644 --- a/src/model/layer/cudnn_convolution.cc +++ b/src/model/layer/cudnn_convolution.cc @@ -151,6 +151,11 @@ void CudnnConvolution::InitCudnn(const Tensor &input) { workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) / sizeof(float) + 1; + if (workspace_count_ > workspace_byte_limit_) + LOG(WARNING) << "The required memory for workspace (" + << workspace_count_ * sizeof(float) + << ") is larger than the expected Bytes (" + << workspace_byte_limit_ << ")"; workspace_ = Tensor(Shape{workspace_count_}, dev, dtype); has_init_cudnn_ = true; }
