SINGA-113 Model/Hybrid Partition Support

Fix a bug caused by the mutable_grad(Layer*) function of SplitLayer and SliceLayer, which returned the grad blob based on the dest layer's partition_id. The bug we encountered: a SliceLayer is connected to a BridgeSrcLayer and a ConcateLayer, both with partition id = 0 (the partition id of the BridgeDstLayer connected from the BridgeSrcLayer is 1); during backward propagation, both BridgeSrcLayer and ConcateLayer get the same grad blob from the SliceLayer, so that blob is overwritten and the other grad blob stays empty.
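For illustration only (not part of the commit): a minimal, self-contained C++ sketch of the idea behind the fix. The dest layer's pointer, rather than its partition_id, is mapped to an out-going connection index through the new Layer2Index helper, so two dest layers that share partition id 0 resolve to different grad blobs. The Layer and Blob structs below are simplified stand-ins for the real SINGA classes.

#include <cassert>
#include <unordered_map>
#include <vector>

// Simplified placeholders for the real SINGA types (sketch only).
struct Layer { int partition_id = 0; };
struct Blob { float value = 0.f; };

// Mirrors the Layer2Index helper added by this commit: the first time a dest
// layer asks for its blob, it is assigned the next free index; later requests
// from the same pointer return the same index.
class Layer2Index {
 public:
  int Get(const Layer* layer) {
    if (layer2idx_.find(layer) == layer2idx_.end()) {
      int idx = layer2idx_.size();
      layer2idx_[layer] = idx;
    }
    return layer2idx_[layer];
  }
 private:
  std::unordered_map<const Layer*, int> layer2idx_;
};

int main() {
  // Two dest layers of a SliceLayer, both with partition_id = 0,
  // as in the reported bug (BridgeSrcLayer and ConcateLayer).
  Layer bridge_src, concate;
  std::vector<Blob*> gradvec = {new Blob(), new Blob()};

  // Old behavior (buggy): both dest layers index gradvec by partition_id,
  // so both would receive gradvec[0] and overwrite each other's gradient.
  assert(gradvec[bridge_src.partition_id] == gradvec[concate.partition_id]);

  // New behavior: the index is derived from the layer pointer, so each
  // dest layer gets its own grad blob.
  Layer2Index layer_idx;
  Blob* g1 = gradvec[layer_idx.Get(&bridge_src)];
  Blob* g2 = gradvec[layer_idx.Get(&concate)];
  assert(g1 != g2);

  delete gradvec[0];
  delete gradvec[1];
  return 0;
}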
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/82563f69
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/82563f69
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/82563f69

Branch: refs/heads/master
Commit: 82563f6905fa1efc45a2a5dfbd85d10c96693643
Parents: 8af565c
Author: Wei Wang <[email protected]>
Authored: Fri Dec 25 13:40:18 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Fri Dec 25 20:27:51 2015 +0800

----------------------------------------------------------------------
 examples/cifar10/hybrid.conf               | 292 ++++++++++++++++++++++++
 include/singa/neuralnet/connection_layer.h |  28 ++-
 include/singa/neuralnet/layer.h            |  29 ++-
 src/neuralnet/connection_layer/bridge.cc   |   4 +-
 src/neuralnet/connection_layer/slice.cc    |  29 +--
 src/neuralnet/connection_layer/split.cc    |  12 +-
 6 files changed, 357 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/examples/cifar10/hybrid.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/hybrid.conf b/examples/cifar10/hybrid.conf
new file mode 100644
index 0000000..ec3da0c
--- /dev/null
+++ b/examples/cifar10/hybrid.conf
@@ -0,0 +1,292 @@
+name: "cifar10-convnet"
+train_steps: 1000
+test_steps: 0
+test_freq: 200
+#validate_steps: 100
+#validate_freq: 300
+disp_freq: 30
+#debug: true
+#checkpoint_path: "examples/cifar10/checkpoint/step1000-worker0"
+train_one_batch {
+  alg: kBP
+}
+updater{
+  type: kSGD
+  weight_decay:0.004
+  momentum:0.9
+  learning_rate {
+    type: kFixedStep
+    fixedstep_conf:{
+      step:0
+      step:60000
+      step:65000
+      step_lr:0.001
+      step_lr:0.0001
+      step_lr:0.00001
+    }
+  }
+}
+neuralnet {
+  layer{
+    name: "data"
+    type: kRecordInput
+    store_conf {
+      backend: "kvfile"
+      path: "examples/cifar10/train_data.bin"
+      mean_file: "examples/cifar10/image_mean.bin"
+      batchsize: 100
+      #random_skip: 5000
+      shape: 3
+      shape: 32
+      shape: 32
+    }
+    include: kTrain
+  }
+#  layer{
+#    name: "data"
+#    type: kRecordInput
+#    store_conf {
+#      backend: "kvfile"
+#      path: "examples/cifar10/val_data.bin"
+#      mean_file: "examples/cifar10/image_mean.bin"
+#      batchsize: 64
+#      random_skip: 5000
+#      shape: 3
+#      shape: 32
+#      shape: 32
+#    }
+#    include: kVal
+#  }
+  layer{
+    name: "data"
+    type: kRecordInput
+    store_conf {
+      backend: "kvfile"
+      path: "examples/cifar10/test_data.bin"
+      mean_file: "examples/cifar10/image_mean.bin"
+      batchsize: 100
+      shape: 3
+      shape: 32
+      shape: 32
+    }
+    include: kTest
+  }
+
+  layer {
+    partition_dim: 0
+    name: "conv1"
+    type: kCConvolution
+    srclayers: "data"
+    convolution_conf {
+      num_filters: 32
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w1"
+      init {
+        type:kGaussian
+        std:0.0001
+      }
+    }
+    param {
+      name: "b1"
+      lr_scale:2.0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+
+  layer {
+    partition_dim: 0
+    name: "pool1"
+    type: kCPooling
+    srclayers: "conv1"
+    pooling_conf {
+      pool: MAX
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "relu1"
+    type: kReLU
+    srclayers:"pool1"
+  }
+  layer {
+    partition_dim: 0
+    name: "norm1"
+    type: kLRN
+    lrn_conf {
+      local_size: 3
+      alpha: 5e-05
+      beta: 0.75
+    }
+    srclayers:"relu1"
+  }
+  layer {
+    partition_dim: 0
+    name: "conv2"
+    type: kCConvolution
+    srclayers: "norm1"
+    convolution_conf {
+      num_filters: 32
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w2"
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b2"
+      lr_scale:2.0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "relu2"
+    type: kReLU
+    srclayers:"conv2"
+  }
+  layer {
+    partition_dim: 0
+    name: "pool2"
+    type: kCPooling
+    srclayers: "relu2"
+    pooling_conf {
+      pool: AVG
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "norm2"
+    type: kLRN
+    lrn_conf {
+      local_size: 3
+      alpha: 5e-05
+      beta: 0.75
+    }
+    srclayers:"pool2"
+  }
+  layer {
+    partition_dim: 0
+    name: "conv3"
+    type: kCConvolution
+    srclayers: "norm2"
+    convolution_conf {
+      num_filters: 64
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w3"
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b3"
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "relu3"
+    type: kReLU
+    srclayers:"conv3"
+  }
+  layer {
+    partition_dim: 0
+    name: "pool3"
+    type: kCPooling
+    srclayers: "relu3"
+    pooling_conf {
+      pool: AVG
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    partition_dim: 1
+    name: "ip1"
+    type: kInnerProduct
+    srclayers:"pool3"
+    innerproduct_conf {
+      num_output: 10
+    }
+    param {
+      name: "w4"
+      wd_scale:250
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b4"
+      lr_scale:2.0
+      wd_scale:0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+#  layer {
+#    name : "softmax"
+#    type: kSoftmax
+#    srclayers: "ip1"
+#  }
+#
+#  layer {
+#    name : "argsort"
+#    type: kArgSort
+#    srclayers: "softmax"
+#  }
+  layer{
+    name: "loss"
+    type: kSoftmaxLoss
+    softmaxloss_conf{
+      topk:1
+    }
+    srclayers:"ip1"
+    srclayers: "data"
+  }
+# uncomment "softmax", "argsort", "output" layer and comment "loss" layer
+# to extract features from argsort
+#  layer {
+#    name : "output"
+#    type: kCSVOutput
+#    srclayers: "argsort"
+#    store_conf {
+#      path: "examples/cifar10/out.csv"
+#    }
+#  }
+}
+cluster {
+  nworker_groups: 1
+  nserver_groups: 1
+  nworkers_per_group: 2
+  nworkers_per_procs: 2
+  workspace: "examples/cifar10"
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/include/singa/neuralnet/connection_layer.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/connection_layer.h b/include/singa/neuralnet/connection_layer.h
index ae918ce..14e5092 100644
--- a/include/singa/neuralnet/connection_layer.h
+++ b/include/singa/neuralnet/connection_layer.h
@@ -29,6 +29,24 @@
 #include "singa/neuralnet/layer.h"
 
 namespace singa {
 
+/**
+ * Used inside SplitLayer and SliceLayer to locate the out-going connection
+ * index given the Layer pointer.
+ */
+class Layer2Index {
+ public:
+  int Get(const Layer* layer) {
+    if (layer2idx_.find(layer) == layer2idx_.end()) {
+      int idx = layer2idx_.size();
+      layer2idx_[layer] = idx;
+    }
+    return layer2idx_[layer];
+  }
+
+ private:
+  std::unordered_map<const Layer*, int> layer2idx_;
+};
+
 class BridgeLayer : public ConnectionLayer {
  public:
@@ -102,14 +120,15 @@ class SliceLayer : public ConnectionLayer {
   void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
   void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
   const std::string ToString(bool debug, int flag) override;
-  const Blob<float>& data(const Layer* from) const override;
-  const Blob<float>& grad(const Layer* from) const override;
+  const Blob<float>& data(const Layer* from) override;
+  const Blob<float>& grad(const Layer* from) override;
   Blob<float>* mutable_data(const Layer* from) override;
   Blob<float>* mutable_grad(const Layer* from) override;
-
+
  private:
   int num_slices = 0;
   int slice_dim = 0;
+  Layer2Index layer_idx_;
 };
 
 /**
@@ -126,11 +145,12 @@ class SplitLayer : public ConnectionLayer {
   void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
   void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
   const std::string ToString(bool debug, int flag) override;
-  const Blob<float>& grad(const Layer* from) const override;
+  const Blob<float>& grad(const Layer* from) override;
   Blob<float>* mutable_grad(const Layer* from) override;
 
  private:
   int num_splits = 0;
+  Layer2Index layer_idx_;
 };

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/include/singa/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/layer.h b/include/singa/neuralnet/layer.h
index 3dbb3fc..28ab92e 100644
--- a/include/singa/neuralnet/layer.h
+++ b/include/singa/neuralnet/layer.h
@@ -174,6 +174,13 @@ class Layer {
    */
   inline const std::string& name() const { return layer_conf_.name(); }
   /**
+   * @return a const ref for Blob vector storing feature values of this layer.
+   */
+  virtual const vector<Blob<float>*>& data() const {
+    return datavec_;
+  }
+
+  /**
    * @param[in] from pointer to one of the dst layer. For some layers, they have
    * more than one data Blob. In this case, this argument identifies the layer
    * that is requesting the data Blob.
@@ -182,21 +189,19 @@
    * virtual const vector<Blob<float>>& data() const or
    * virtual const Blob<float>& data(int k) const instead}.
    */
-  virtual const Blob<float>& data(const Layer* from) const {
+  virtual const Blob<float>& data(const Layer* from) {
     return data_;
   }
   /**
-   * @return a const ref for Blob vector storing feature values of this layer.
-   */
-  virtual const vector<Blob<float>*>& data() const {
-    return datavec_;
-  }
-  /**
    * @return a const ref for the kth Blob.
+   * TODO(wangwei) if make this function const, there will be a warning
+   * indicating that data(const Layer*) and this function are ambiguous for
+   * data(0).
    */
-  virtual const Blob<float>& data(int k) const {
+  virtual const Blob<float>& data(int k) {
     return *datavec_.at(k);
   }
+
   /**
    * @see data().
    * @return the pointer to the Blob storing feature values of this layer.
@@ -226,7 +231,7 @@
    * virtual const vector<Blob<float>>& grad() const or
    * virtual const Blob<float>& grad(int k) const instead}.
    */
-  virtual const Blob<float>& grad(const Layer* from) const {
+  virtual const Blob<float>& grad(const Layer* from) {
     return grad_;
   }
   /**
@@ -286,7 +291,7 @@ class InputLayer : virtual public Layer {
     return nullptr;
     // LOG(FATAL) << "Input layer has no gradient blob";
   }
-  const Blob<float>& grad(const Layer* from) const override {
+  const Blob<float>& grad(const Layer* from) override {
     return grad_;
     // LOG(FATAL) << "Input layer has no gradient blob";
   }
@@ -312,7 +317,7 @@ class LossLayer : virtual public Layer {
     return nullptr;
     // LOG(FATAL) << "Loss layer has no gradient blob";
   }
-  const Blob<float>& grad(const Layer* from) const override {
+  const Blob<float>& grad(const Layer* from) override {
     return grad_;
     // LOG(FATAL) << "Loss layer has no gradient blob";
   }
@@ -328,7 +333,7 @@ class OutputLayer : virtual public Layer {
     return nullptr;
     // LOG(FATAL) << "Output layer has no gradient blob";
   }
-  const Blob<float>& grad(const Layer* from) const override {
+  const Blob<float>& grad(const Layer* from) override {
     return grad_;
     // LOG(FATAL) << "Output layer has no gradient blob";
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/src/neuralnet/connection_layer/bridge.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/bridge.cc b/src/neuralnet/connection_layer/bridge.cc
index a2302ab..2cfd55a 100644
--- a/src/neuralnet/connection_layer/bridge.cc
+++ b/src/neuralnet/connection_layer/bridge.cc
@@ -69,8 +69,8 @@ void BridgeSrcLayer::Setup(const LayerProto& conf,
   Layer::Setup(conf, srclayers);
   data_.Reshape(srclayers[0]->data(this).shape());
   grad_.ReshapeLike(data_);
-  data_.ShareData(srclayers[0]->mutable_data(this));
-  grad_.ShareData(srclayers[0]->mutable_grad(this));
+  data_.ShareData(srclayers[0]->mutable_data(this), false);
+  grad_.ShareData(srclayers[0]->mutable_grad(this), false);
 }
 
 void BridgeSrcLayer::ComputeFeature(int flag, const vector<Layer*>& srcs) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/src/neuralnet/connection_layer/slice.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/slice.cc b/src/neuralnet/connection_layer/slice.cc
index db83f1b..b625c66 100644
--- a/src/neuralnet/connection_layer/slice.cc
+++ b/src/neuralnet/connection_layer/slice.cc
@@ -98,29 +98,30 @@ void SliceLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   }
 }
 
-const Blob<float>& SliceLayer::data(const Layer* from) const {
-  int id = from ? from->partition_id() : 0;
-  CHECK_LT(id, num_slices);
-  return *datavec_[id];
+const Blob<float>& SliceLayer::data(const Layer* from) {
+  int idx = from ? layer_idx_.Get(from) : 0;
+  CHECK_LT(idx, num_slices);
+  return *datavec_[idx];
 }
 
-const Blob<float>& SliceLayer::grad(const Layer* from) const {
-  int id = from ? from->partition_id() : 0;
-  CHECK_LT(id, num_slices);
-  return *gradvec_[id];
+const Blob<float>& SliceLayer::grad(const Layer* from) {
+  int idx = from ? layer_idx_.Get(from) : 0;
+  CHECK_LT(idx, num_slices);
+  return *gradvec_[idx];
 }
 
 Blob<float>* SliceLayer::mutable_data(const Layer* from) {
-  int id = from ? from->partition_id() : 0;
   CHECK(from);
-  CHECK_LT(id, num_slices);
-  return datavec_[id];
+  int idx = layer_idx_.Get(from);
+  CHECK_LT(idx, num_slices);
+  return datavec_[idx];
 }
 
 Blob<float>* SliceLayer::mutable_grad(const Layer* from) {
-  int id = from ? from->partition_id() : 0;
-  CHECK_LT(id, num_slices);
-  return gradvec_[id];
+  CHECK(from);
+  int idx = layer_idx_.Get(from);
+  CHECK_LT(idx, num_slices);
+  return gradvec_[idx];
 }
 
 const std::string SliceLayer::ToString(bool debug, int flag) {
   if (!debug)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/src/neuralnet/connection_layer/split.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/split.cc b/src/neuralnet/connection_layer/split.cc
index 9bbe59e..a9270fb 100644
--- a/src/neuralnet/connection_layer/split.cc
+++ b/src/neuralnet/connection_layer/split.cc
@@ -62,16 +62,18 @@ void SplitLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   Copy(*gradvec_[0], srclayers[0]->mutable_grad(this));
 }
 
-const Blob<float>& SplitLayer::grad(const Layer* from) const {
+const Blob<float>& SplitLayer::grad(const Layer* from) {
   CHECK(from);
-  CHECK_LT(from->partition_id(), num_splits);
-  return *gradvec_[from->partition_id()];
+  int idx = layer_idx_.Get(from);
+  CHECK_LT(idx, num_splits);
+  return *gradvec_[idx];
 }
 
 Blob<float>* SplitLayer::mutable_grad(const Layer* from) {
   CHECK(from);
-  CHECK_LT(from->partition_id(), num_splits);
-  return gradvec_[from->partition_id()];
+  int idx = layer_idx_.Get(from);
+  CHECK_LT(idx, num_splits);
+  return gradvec_[idx];
 }
 
 const std::string SplitLayer::ToString(bool debug, int flag) {
   if (!debug)
