SINGA-113 Model/Hybrid Partition Support

Update the memory copy in SliceLayer and ConcateLayer to handle both CPU (memcpy) and GPU (cudaMemcpy) blobs. Test hybrid partitioning on GPUs (cuDNN) using cudnn_hybrid.conf.
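The core of this change is a device-aware copy in the connection layers: each layer asks the thread-local Context for its device id, then moves one step-sized block per iteration with memcpy on the CPU path or cudaMemcpy on the GPU path. Below is a minimal standalone sketch of that dispatch; copy_block is a hypothetical helper name, not part of the SINGA API, and the GPU branch assumes the build defines USE_GPU.

// Minimal sketch of the CPU/GPU copy dispatch used by ConcateLayer/SliceLayer.
// copy_block() is an illustrative helper, not a SINGA function.
#include <cstring>           // std::memcpy
#ifdef USE_GPU
#include <cuda_runtime.h>    // cudaMemcpy, cudaMemcpyDefault
#endif

// device == -1 means the worker runs on the CPU; otherwise it holds a GPU id.
void copy_block(float* dst, const float* src, int step, int device) {
  if (device == -1) {
    std::memcpy(dst, src, step * sizeof(float));
  } else {
#ifdef USE_GPU
    // cudaMemcpyDefault lets the runtime infer the copy direction,
    // matching the calls added by this patch.
    cudaMemcpy(dst, src, step * sizeof(float), cudaMemcpyDefault);
#else
    // The patch calls LOG(FATAL) here (GPU requested but not compiled in);
    // omitted to keep this sketch dependency-free.
#endif
  }
}

int main() {
  float a[4] = {1, 2, 3, 4}, b[4] = {0, 0, 0, 0};
  copy_block(b, a, 4, /*device=*/-1);  // CPU path, as when Context reports -1
  return b[3] == 4.0f ? 0 : 1;
}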
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/bf4cd3bc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/bf4cd3bc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/bf4cd3bc

Branch: refs/heads/master
Commit: bf4cd3bc9473d802d97327226c636ddc442cda0f
Parents: 82563f6
Author: Wei Wang <[email protected]>
Authored: Fri Dec 25 17:47:13 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Fri Dec 25 21:27:42 2015 +0800

----------------------------------------------------------------------
 examples/cifar10/cudnn_hybrid.conf         | 306 ++++++++++++++++++++++++
 include/singa/neuralnet/connection_layer.h |  10 +-
 src/neuralnet/connection_layer/concate.cc  |  67 ++++--
 src/neuralnet/connection_layer/slice.cc    |  76 ++++--
 src/neuralnet/connection_layer/split.cc    |  16 +-
 src/utils/graph.cc                         |   5 +-
 6 files changed, 422 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bf4cd3bc/examples/cifar10/cudnn_hybrid.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/cudnn_hybrid.conf b/examples/cifar10/cudnn_hybrid.conf
new file mode 100644
index 0000000..a11145c
--- /dev/null
+++ b/examples/cifar10/cudnn_hybrid.conf
@@ -0,0 +1,306 @@
+name: "cifar10-convnet"
+train_steps: 10000
+test_steps: 0
+test_freq: 200
+#validate_steps: 100
+#validate_freq: 300
+disp_freq: 200
+gpu: 0
+gpu: 1
+#debug: true
+#checkpoint_path: "examples/cifar10/checkpoint/step1000-worker0"
+train_one_batch {
+  alg: kBP
+}
+updater{
+  type: kSGD
+  weight_decay:0.004
+  momentum:0.9
+  learning_rate {
+    type: kFixedStep
+    fixedstep_conf:{
+      step:0
+      step:60000
+      step:65000
+      step_lr:0.001
+      step_lr:0.0001
+      step_lr:0.00001
+    }
+  }
+}
+neuralnet {
+  layer{
+    name: "data"
+    type: kRecordInput
+    store_conf {
+      backend: "kvfile"
+      path: "examples/cifar10/train_data.bin"
+      mean_file: "examples/cifar10/image_mean.bin"
+      batchsize: 100
+      #random_skip: 5000
+      shape: 3
+      shape: 32
+      shape: 32
+    }
+    include: kTrain
+    partition_dim: 0
+  }
+# layer{
+#   name: "data"
+#   type: kRecordInput
+#   store_conf {
+#     backend: "kvfile"
+#     path: "examples/cifar10/val_data.bin"
+#     mean_file: "examples/cifar10/image_mean.bin"
+#     batchsize: 64
+#     random_skip: 5000
+#     shape: 3
+#     shape: 32
+#     shape: 32
+#   }
+#   include: kVal
+# }
+  layer{
+    name: "data"
+    type: kRecordInput
+    store_conf {
+      backend: "kvfile"
+      path: "examples/cifar10/test_data.bin"
+      mean_file: "examples/cifar10/image_mean.bin"
+      batchsize: 100
+      shape: 3
+      shape: 32
+      shape: 32
+    }
+    include: kTest
+    partition_dim: 0
+  }
+
+  layer {
+    partition_dim: 0
+    name: "conv1"
+    type: kCudnnConv
+    srclayers: "data"
+    convolution_conf {
+      num_filters: 32
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w1"
+      init {
+        type:kGaussian
+        std:0.0001
+      }
+    }
+    param {
+      name: "b1"
+      lr_scale:2.0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+
+  layer {
+    partition_dim: 0
+    name: "pool1"
+    type: kCudnnPool
+    srclayers: "conv1"
+    pooling_conf {
+      pool: MAX
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "relu1"
+    type: kCudnnActivation
+    activation_conf {
+      type: RELU
+    }
+    srclayers:"pool1"
+  }
+  layer {
+    partition_dim: 0
+    name: "norm1"
+    type: kCudnnLRN
+    lrn_conf {
+      local_size: 3
+      alpha: 5e-05
+      beta: 0.75
+    }
+    srclayers:"relu1"
+  }
+  layer {
+    partition_dim: 0
+    name: "conv2"
+    type: kCudnnConv
+    srclayers: "norm1"
+    convolution_conf {
+      num_filters: 32
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w2"
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b2"
+      lr_scale:2.0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "relu2"
+    type: kCudnnActivation
+    activation_conf {
+      type: RELU
+    }
+    srclayers:"conv2"
+  }
+  layer {
+    partition_dim: 0
+    name: "pool2"
+    type: kCudnnPool
+    srclayers: "relu2"
+    pooling_conf {
+      pool: AVG
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "norm2"
+    type: kCudnnLRN
+    lrn_conf {
+      local_size: 3
+      alpha: 5e-05
+      beta: 0.75
+    }
+    srclayers:"pool2"
+  }
+  layer {
+    partition_dim: 0
+    name: "conv3"
+    type: kCudnnConv
+    srclayers: "norm2"
+    convolution_conf {
+      num_filters: 64
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w3"
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b3"
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "relu3"
+    type: kCudnnActivation
+    activation_conf {
+      type: RELU
+    }
+    srclayers:"conv3"
+  }
+  layer {
+    partition_dim: 0
+    name: "pool3"
+    type: kCudnnPool
+    srclayers: "relu3"
+    pooling_conf {
+      pool: AVG
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    partition_dim: 1
+    name: "ip1"
+    type: kInnerProduct
+    srclayers:"pool3"
+    innerproduct_conf {
+      num_output: 10
+    }
+    param {
+      name: "w4"
+      wd_scale:250
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b4"
+      lr_scale:2.0
+      wd_scale:0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+# layer {
+#   name : "softmax"
+#   type: kSoftmax
+#   srclayers: "ip1"
+# }
+#
+# layer {
+#   name : "argsort"
+#   type: kArgSort
+#   srclayers: "softmax"
+# }
+  layer{
+    partition_dim: 0
+    name: "loss"
+    type: kSoftmaxLoss
+    softmaxloss_conf{
+      topk:1
+    }
+    srclayers:"ip1"
+    srclayers: "data"
+  }
+# uncomment "softmax", "argsort", "output" layer and comment "loss" layer
+# to extract features from argsort
+# layer {
+#   name : "output"
+#   type: kCSVOutput
+#   srclayers: "argsort"
+#   store_conf {
+#     path: "examples/cifar10/out.csv"
+#   }
+# }
+}
+cluster {
+  nworker_groups: 1
+  nserver_groups: 1
+  nworkers_per_group: 2
+  nworkers_per_procs: 2
+  workspace: "examples/cifar10"
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bf4cd3bc/include/singa/neuralnet/connection_layer.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/connection_layer.h b/include/singa/neuralnet/connection_layer.h
index 14e5092..a18f458 100644
--- a/include/singa/neuralnet/connection_layer.h
+++ b/include/singa/neuralnet/connection_layer.h
@@ -103,8 +103,8 @@ class ConcateLayer : public ConnectionLayer {
   void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 
  private:
-  int num_concates = 0;
-  int concate_dim = 0;
+  int num_concates_ = 0;
+  int concate_dim_ = 0;
 };
 
 /**
@@ -126,8 +126,8 @@ class SliceLayer : public ConnectionLayer {
   Blob<float>* mutable_grad(const Layer* from) override;
 
  private:
-  int num_slices = 0;
-  int slice_dim = 0;
+  int num_slices_ = 0;
+  int slice_dim_ = 0;
   Layer2Index layer_idx_;
 };
 
@@ -149,7 +149,7 @@ class SplitLayer : public ConnectionLayer {
   Blob<float>* mutable_grad(const Layer* from) override;
 
  private:
-  int num_splits = 0;
+  int num_splits_ = 0;
   Layer2Index layer_idx_;
 };

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bf4cd3bc/src/neuralnet/connection_layer/concate.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/concate.cc b/src/neuralnet/connection_layer/concate.cc
index adf701e..0cdd812 100644
--- a/src/neuralnet/connection_layer/concate.cc
+++ b/src/neuralnet/connection_layer/concate.cc
@@ -20,6 +20,8 @@
 *************************************************************/
 
 #include "singa/neuralnet/connection_layer.h"
+#include "singa/utils/singleton.h"
+#include "singa/utils/context.h"
 
 namespace singa {
 
@@ -28,15 +30,15 @@ void ConcateLayer::Setup(const LayerProto& conf,
   CHECK_GT(srclayers.size(), 1);
   Layer::Setup(conf, srclayers);
   vector<int> shape = srclayers[0]->data(this).shape();
-  concate_dim = conf.concate_conf().concate_dim();
-  num_concates = conf.concate_conf().num_concates();
-  CHECK_GE(concate_dim, 0);
-  CHECK_LT(concate_dim, shape.size());
-  CHECK_EQ(num_concates, srclayers.size());
+  concate_dim_ = conf.concate_conf().concate_dim();
+  num_concates_ = conf.concate_conf().num_concates();
+  CHECK_GE(concate_dim_, 0);
+  CHECK_LT(concate_dim_, shape.size());
+  CHECK_EQ(num_concates_, srclayers.size());
   for (size_t i = 1; i < srclayers.size(); i++) {
     const vector<int>& src_shape = srclayers[i]->data(this).shape();
     for (size_t j = 0; j < shape.size(); j++)
-      if (static_cast<int>(j) == concate_dim)
+      if (static_cast<int>(j) == concate_dim_)
         shape[j] += src_shape[j];
       else
         CHECK_EQ(shape[j], src_shape[j]);
@@ -47,18 +49,32 @@ void ConcateLayer::Setup(const LayerProto& conf,
 
 void ConcateLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   CHECK_GT(srclayers.size(), 1);
-  CHECK_EQ(num_concates, srclayers.size());
+  CHECK_EQ(num_concates_, srclayers.size());
   // calculate step for each memcpy
-  int step = srclayers[0]->data(this).shape()[concate_dim];
-  for (unsigned i = concate_dim + 1; i < data_.shape().size(); ++i)
+  int step = srclayers[0]->data(this).shape()[concate_dim_];
+  for (unsigned i = concate_dim_ + 1; i < data_.shape().size(); ++i)
     step *= data_.shape()[i];
   int srclayer_offset = 0;
   int concate_offset = 0;
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
   while (concate_offset < data_.count()) {
     for (size_t i = 0; i < srclayers.size(); ++i) {
-      const float* src = srclayers[i]->data(this).cpu_data() + srclayer_offset;
-      float* dst = data_.mutable_cpu_data() + concate_offset;
-      memcpy(dst, src, step * sizeof(float));
+      if (device == -1) {
+        const float* src = srclayers[i]->data(this).cpu_data()
+            + srclayer_offset;
+        float* dst = data_.mutable_cpu_data() + concate_offset;
+        memcpy(dst, src, step * sizeof(float));
+      } else {
+#ifdef USE_GPU
+        const float* src = srclayers[i]->data(this).gpu_data()
+            + srclayer_offset;
+        float* dst = data_.mutable_gpu_data() + concate_offset;
+        cudaMemcpy(dst, src, step * sizeof(float), cudaMemcpyDefault);
+#else
+        LOG(FATAL) << "GPU is supported";
+#endif
+      }
       concate_offset += step;
     }
     srclayer_offset += step;
@@ -67,19 +83,32 @@ void ConcateLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
 
 void ConcateLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   CHECK_GT(srclayers.size(), 1);
-  CHECK_EQ(num_concates, srclayers.size());
+  CHECK_EQ(num_concates_, srclayers.size());
   // calculate step for each memcpy
-  int step = srclayers[0]->grad(this).shape()[concate_dim];
-  for (unsigned i = concate_dim + 1; i < grad_.shape().size(); ++i)
+  int step = srclayers[0]->grad(this).shape()[concate_dim_];
+  for (unsigned i = concate_dim_ + 1; i < grad_.shape().size(); ++i)
     step *= grad_.shape()[i];
   int srclayer_offset = 0;
   int concate_offset = 0;
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
   while (concate_offset < grad_.count()) {
     for (size_t i = 0; i < srclayers.size(); ++i) {
-      const float* src = grad_.cpu_data() + concate_offset;
-      float* dst = srclayers[i]->mutable_grad(this)->mutable_cpu_data()
-          + srclayer_offset;
-      memcpy(dst, src, step * sizeof(float));
+      if (device == -1) {
+        const float* src = grad_.cpu_data() + concate_offset;
+        float* dst = srclayers[i]->mutable_grad(this)->mutable_cpu_data()
+            + srclayer_offset;
+        memcpy(dst, src, step * sizeof(float));
+      } else {
+#ifdef USE_GPU
+        const float* src = grad_.gpu_data() + concate_offset;
+        float* dst = srclayers[i]->mutable_grad(this)->mutable_gpu_data()
+            + srclayer_offset;
+        cudaMemcpy(dst, src, step * sizeof(float), cudaMemcpyDefault);
+#else
+        LOG(FATAL) << "GPU is supported";
+#endif
+      }
       concate_offset += step;
     }
     srclayer_offset += step;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bf4cd3bc/src/neuralnet/connection_layer/slice.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/slice.cc b/src/neuralnet/connection_layer/slice.cc
index b625c66..15fdc48 100644
--- a/src/neuralnet/connection_layer/slice.cc
+++ b/src/neuralnet/connection_layer/slice.cc
@@ -21,6 +21,8 @@
 
 #include "singa/neuralnet/connection_layer.h"
 #include "singa/utils/math_blob.h"
+#include "singa/utils/singleton.h"
+#include "singa/utils/context.h"
 
 namespace singa {
 
@@ -38,20 +40,20 @@ void SliceLayer::Setup(const LayerProto& conf,
   CHECK_EQ(srclayers.size(), 1);
   Layer::Setup(conf, srclayers);
   vector<int> shape = srclayers[0]->data(this).shape();
-  slice_dim = conf.slice_conf().slice_dim();
-  num_slices = conf.slice_conf().num_slices();
-  CHECK_GE(slice_dim, 0);
-  CHECK_LT(slice_dim, shape.size());
-  CHECK_GT(num_slices, 0);
+  slice_dim_ = conf.slice_conf().slice_dim();
+  num_slices_ = conf.slice_conf().num_slices();
+  CHECK_GE(slice_dim_, 0);
+  CHECK_LT(slice_dim_, shape.size());
+  CHECK_GT(num_slices_, 0);
   // add num_slices-1 more blobs
-  for (int i = 1; i < num_slices; ++i) {
+  for (int i = 1; i < num_slices_; ++i) {
     datavec_.push_back(new Blob<float>());
     gradvec_.push_back(new Blob<float>());
   }
   // TODO(wangsh): remove equal-size restrict later
-  CHECK_EQ(shape[slice_dim] % num_slices, 0);
-  shape[slice_dim] /= num_slices;
-  for (int i = 0; i < num_slices; ++i) {
+  CHECK_EQ(shape[slice_dim_] % num_slices_, 0);
+  shape[slice_dim_] /= num_slices_;
+  for (int i = 0; i < num_slices_; ++i) {
     // if (i == slice_num - 1) shape[slice_dim] += remain;
     datavec_[i]->Reshape(shape);
     gradvec_[i]->Reshape(shape);
@@ -62,16 +64,28 @@ void SliceLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   CHECK_EQ(srclayers.size(), 1);
   const Blob<float>& blob = srclayers[0]->data(this);
   // calculate step for each memcpy
-  int step = datavec_[0]->shape()[slice_dim];
-  for (unsigned i = slice_dim + 1; i < datavec_[0]->shape().size(); ++i)
+  int step = datavec_[0]->shape()[slice_dim_];
+  for (unsigned i = slice_dim_ + 1; i < datavec_[0]->shape().size(); ++i)
     step *= datavec_[0]->shape()[i];
   int srclayer_offset = 0;
   int slice_offset = 0;
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
   while (srclayer_offset < blob.count()) {
-    for (int i = 0; i < num_slices; ++i) {
-      const float* src = blob.cpu_data() + srclayer_offset;
-      float* dst = datavec_[i]->mutable_cpu_data() + slice_offset;
-      memcpy(dst, src, step * sizeof(float));
+    for (int i = 0; i < num_slices_; ++i) {
+      if (device == -1) {
+        const float* src = blob.cpu_data() + srclayer_offset;
+        float* dst = datavec_[i]->mutable_cpu_data() + slice_offset;
+        memcpy(dst, src, step * sizeof(float));
+      } else {
+#ifdef USE_GPU
+        const float* src = blob.gpu_data() + srclayer_offset;
+        float* dst = datavec_[i]->mutable_gpu_data() + slice_offset;
+        cudaMemcpy(dst, src, step * sizeof(float), cudaMemcpyDefault);
+#else
+        LOG(FATAL) << "GPU is supported";
+#endif
+      }
       srclayer_offset += step;
     }
     slice_offset += step;
@@ -82,16 +96,28 @@ void SliceLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   CHECK_EQ(srclayers.size(), 1);
   Blob<float>* blob = srclayers[0]->mutable_grad(this);
   // calculate step for each memcpy
-  int step = gradvec_[0]->shape()[slice_dim];
-  for (size_t i = slice_dim + 1; i < gradvec_[0]->shape().size(); ++i)
+  int step = gradvec_[0]->shape()[slice_dim_];
+  for (size_t i = slice_dim_ + 1; i < gradvec_[0]->shape().size(); ++i)
     step *= gradvec_[0]->shape()[i];
   int srclayer_offset = 0;
   int slice_offset = 0;
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
   while (srclayer_offset < blob->count()) {
-    for (int i = 0; i < num_slices; ++i) {
-      const float* src = gradvec_[i]->cpu_data() + slice_offset;
-      float* dst = blob->mutable_cpu_data() + srclayer_offset;
-      memcpy(dst, src, step * sizeof(float));
+    for (int i = 0; i < num_slices_; ++i) {
+      if (device == -1) {
+        const float* src = gradvec_[i]->cpu_data() + slice_offset;
+        float* dst = blob->mutable_cpu_data() + srclayer_offset;
+        memcpy(dst, src, step * sizeof(float));
+      } else {
+#ifdef USE_GPU
+        const float* src = gradvec_[i]->gpu_data() + slice_offset;
+        float* dst = blob->mutable_gpu_data() + srclayer_offset;
+        cudaMemcpy(dst, src, step * sizeof(float), cudaMemcpyDefault);
+#else
+        LOG(FATAL) << "GPU is supported";
+#endif
+      }
       srclayer_offset += step;
     }
     slice_offset += step;
@@ -100,27 +126,27 @@ void SliceLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
 
 const Blob<float>& SliceLayer::data(const Layer* from) {
   int idx = from ? layer_idx_.Get(from) : 0;
-  CHECK_LT(idx, num_slices);
+  CHECK_LT(idx, num_slices_);
   return *datavec_[idx];
 }
 
 const Blob<float>& SliceLayer::grad(const Layer* from) {
   int idx = from ? layer_idx_.Get(from) : 0;
-  CHECK_LT(idx, num_slices);
+  CHECK_LT(idx, num_slices_);
   return *gradvec_[idx];
 }
 
 Blob<float>* SliceLayer::mutable_data(const Layer* from) {
   CHECK(from);
   int idx = layer_idx_.Get(from);
-  CHECK_LT(idx, num_slices);
+  CHECK_LT(idx, num_slices_);
   return datavec_[idx];
 }
 
 Blob<float>* SliceLayer::mutable_grad(const Layer* from) {
   CHECK(from);
   int idx = layer_idx_.Get(from);
-  CHECK_LT(idx, num_slices);
+  CHECK_LT(idx, num_slices_);
   return gradvec_[idx];
 }
 
 const std::string SliceLayer::ToString(bool debug, int flag) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bf4cd3bc/src/neuralnet/connection_layer/split.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/split.cc b/src/neuralnet/connection_layer/split.cc
index a9270fb..e46b902 100644
--- a/src/neuralnet/connection_layer/split.cc
+++ b/src/neuralnet/connection_layer/split.cc
@@ -37,14 +37,14 @@ void SplitLayer::Setup(const LayerProto& conf,
   CHECK_EQ(srclayers.size(), 1);
   Layer::Setup(conf, srclayers);
   data_.Reshape(srclayers[0]->data(this).shape());
-  data_.ShareData(srclayers[0]->data(this), false);
-  int num_splits = conf.split_conf().num_splits();
-  CHECK_GT(num_splits, 0);
+  data_.ShareData(srclayers[0]->mutable_data(this), false);
+  num_splits_ = conf.split_conf().num_splits();
+  CHECK_GT(num_splits_, 0);
   // add num_splits-1 more grad blobs
-  for (int i = 1; i < num_splits; ++i) {
+  for (int i = 1; i < num_splits_; ++i) {
     gradvec_.push_back(new Blob<float>());
   }
-  for (int i = 0; i < num_splits; ++i)
+  for (int i = 0; i < num_splits_; ++i)
     gradvec_[i]->Reshape(srclayers[0]->data(this).shape());
 }
 
@@ -56,7 +56,7 @@ void SplitLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
 void SplitLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   CHECK_EQ(srclayers.size(), 1);
   // aggregate all gradients to grad_[0]
-  for (int i = 1; i < num_splits; ++i)
+  for (int i = 1; i < num_splits_; ++i)
     AXPY<float>(1.0, *gradvec_[i], gradvec_[0]);
   // copy grad_[0] to srclayer's grad
   Copy(*gradvec_[0], srclayers[0]->mutable_grad(this));
@@ -65,14 +65,14 @@ void SplitLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
 const Blob<float>& SplitLayer::grad(const Layer* from) {
   CHECK(from);
   int idx = layer_idx_.Get(from);
-  CHECK_LT(idx, num_splits);
+  CHECK_LT(idx, num_splits_);
   return *gradvec_[idx];
 }
 
 Blob<float>* SplitLayer::mutable_grad(const Layer* from) {
   CHECK(from);
   int idx = layer_idx_.Get(from);
-  CHECK_LT(idx, num_splits);
+  CHECK_LT(idx, num_splits_);
   return gradvec_[idx];
 }
 
 const std::string SplitLayer::ToString(bool debug, int flag) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/bf4cd3bc/src/utils/graph.cc
----------------------------------------------------------------------
diff --git a/src/utils/graph.cc b/src/utils/graph.cc
index 0211e5a..4f59635 100644
--- a/src/utils/graph.cc
+++ b/src/utils/graph.cc
@@ -97,7 +97,10 @@ Node* Graph::AddNode(const string& name, const string& origin, int id,
 Node* Graph::AddNode(const string& name,
                      const std::map<string, string>& attrs) {
   Node* node = new Node(name, attrs);
-  AddNode(node);
+  nodes_.push_back(node);
+  CHECK(name2node_.find(node->name) == name2node_.end())
+      << "node " << node->name << " already exists";
+  name2node_[node->name] = node;
   return node;
 }
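A note on the copy arithmetic shared by SliceLayer and ConcateLayer above: the step is the extent of the slice/concate dimension of one piece multiplied by all trailing dimensions, and the two offsets walk the source and destination blobs in an interleaved fashion. The snippet below replays that offset walk for a CPU-only slice; the shape, slice_dim, and num_slices values are made-up example numbers, and plain std::vector stands in for SINGA's Blob, so it only illustrates the arithmetic, not the actual layer.

// Standalone illustration of the offset walk in SliceLayer::ComputeFeature.
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  // A (4, 6) "blob" sliced into num_slices = 2 pieces along slice_dim = 1.
  std::vector<int> shape = {4, 6};
  const int slice_dim = 1, num_slices = 2;
  std::vector<float> src(4 * 6);
  for (size_t i = 0; i < src.size(); ++i) src[i] = static_cast<float>(i);

  // step = (shape[slice_dim] / num_slices) * product of trailing dims.
  int step = shape[slice_dim] / num_slices;
  for (size_t i = slice_dim + 1; i < shape.size(); ++i) step *= shape[i];

  std::vector<std::vector<float>> slices(
      num_slices, std::vector<float>(src.size() / num_slices));
  int src_offset = 0, slice_offset = 0;
  while (src_offset < static_cast<int>(src.size())) {
    for (int i = 0; i < num_slices; ++i) {
      // One contiguous block of `step` floats goes to slice i.
      std::memcpy(slices[i].data() + slice_offset, src.data() + src_offset,
                  step * sizeof(float));
      src_offset += step;
    }
    slice_offset += step;
  }
  // Expect step = 3 and slice 0 holding the left half of every row.
  std::printf("step = %d, slice0[0..2] = %.0f %.0f %.0f\n",
              step, slices[0][0], slices[0][1], slices[0][2]);
  return 0;
}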
