SINGA-113 Model/Hybrid Partition Support

Fix a bug caused by the mutable_grad(Layer*) function of SplitLayer and SliceLayer, which returned the grad blob based on the dest layer's partition_id. The bug we encountered: a SliceLayer is connected to a BridgeSrcLayer and a ConcateLayer, both with partition id = 0 (the partition id of the BridgeDstLayer connected from the BridgeSrcLayer is 1); during backward propagation, both BridgeSrcLayer and ConcateLayer get the same grad blob from the SliceLayer, so that blob is overwritten and the other grad blob stays empty.
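For illustration only (not part of the commit): a minimal, self-contained C++ sketch of the idea behind the fix. The dest layer's pointer, rather than its partition_id, is mapped to an out-going connection index through the new Layer2Index helper, so two dest layers that share partition id 0 resolve to different grad blobs. The Layer and Blob structs below are simplified stand-ins for the real SINGA classes.

#include <cassert>
#include <unordered_map>
#include <vector>

// Simplified placeholders for the real SINGA types (sketch only).
struct Layer { int partition_id = 0; };
struct Blob { float value = 0.f; };

// Mirrors the Layer2Index helper added by this commit: the first time a dest
// layer asks for its blob, it is assigned the next free index; later requests
// from the same pointer return the same index.
class Layer2Index {
 public:
  int Get(const Layer* layer) {
    if (layer2idx_.find(layer) == layer2idx_.end()) {
      int idx = layer2idx_.size();
      layer2idx_[layer] = idx;
    }
    return layer2idx_[layer];
  }
 private:
  std::unordered_map<const Layer*, int> layer2idx_;
};

int main() {
  // Two dest layers of a SliceLayer, both with partition_id = 0,
  // as in the reported bug (BridgeSrcLayer and ConcateLayer).
  Layer bridge_src, concate;
  std::vector<Blob*> gradvec = {new Blob(), new Blob()};

  // Old behavior (buggy): both dest layers index gradvec by partition_id,
  // so both would receive gradvec[0] and overwrite each other's gradient.
  assert(gradvec[bridge_src.partition_id] == gradvec[concate.partition_id]);

  // New behavior: the index is derived from the layer pointer, so each
  // dest layer gets its own grad blob.
  Layer2Index layer_idx;
  Blob* g1 = gradvec[layer_idx.Get(&bridge_src)];
  Blob* g2 = gradvec[layer_idx.Get(&concate)];
  assert(g1 != g2);

  delete gradvec[0];
  delete gradvec[1];
  return 0;
}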
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/82563f69
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/82563f69
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/82563f69

Branch: refs/heads/master
Commit: 82563f6905fa1efc45a2a5dfbd85d10c96693643
Parents: 8af565c
Author: Wei Wang <[email protected]>
Authored: Fri Dec 25 13:40:18 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Fri Dec 25 20:27:51 2015 +0800

----------------------------------------------------------------------
 examples/cifar10/hybrid.conf               | 292 ++++++++++++++++++++++++
 include/singa/neuralnet/connection_layer.h |  28 ++-
 include/singa/neuralnet/layer.h            |  29 ++-
 src/neuralnet/connection_layer/bridge.cc   |   4 +-
 src/neuralnet/connection_layer/slice.cc    |  29 +--
 src/neuralnet/connection_layer/split.cc    |  12 +-
 6 files changed, 357 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/examples/cifar10/hybrid.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/hybrid.conf b/examples/cifar10/hybrid.conf
new file mode 100644
index 0000000..ec3da0c
--- /dev/null
+++ b/examples/cifar10/hybrid.conf
@@ -0,0 +1,292 @@
+name: "cifar10-convnet"
+train_steps: 1000
+test_steps: 0
+test_freq: 200
+#validate_steps: 100
+#validate_freq: 300
+disp_freq: 30
+#debug: true
+#checkpoint_path: "examples/cifar10/checkpoint/step1000-worker0"
+train_one_batch {
+  alg: kBP
+}
+updater{
+  type: kSGD
+  weight_decay:0.004
+  momentum:0.9
+  learning_rate {
+    type: kFixedStep
+    fixedstep_conf:{
+      step:0
+      step:60000
+      step:65000
+      step_lr:0.001
+      step_lr:0.0001
+      step_lr:0.00001
+    }
+  }
+}
+neuralnet {
+  layer{
+    name: "data"
+    type: kRecordInput
+    store_conf {
+      backend: "kvfile"
+      path: "examples/cifar10/train_data.bin"
+      mean_file: "examples/cifar10/image_mean.bin"
+      batchsize: 100
+      #random_skip: 5000
+      shape: 3
+      shape: 32
+      shape: 32
+    }
+    include: kTrain
+  }
+#  layer{
+#    name: "data"
+#    type: kRecordInput
+#    store_conf {
+#      backend: "kvfile"
+#      path: "examples/cifar10/val_data.bin"
+#      mean_file: "examples/cifar10/image_mean.bin"
+#      batchsize: 64
+#      random_skip: 5000
+#      shape: 3
+#      shape: 32
+#      shape: 32
+#    }
+#    include: kVal
+#  }
+  layer{
+    name: "data"
+    type: kRecordInput
+    store_conf {
+      backend: "kvfile"
+      path: "examples/cifar10/test_data.bin"
+      mean_file: "examples/cifar10/image_mean.bin"
+      batchsize: 100
+      shape: 3
+      shape: 32
+      shape: 32
+    }
+    include: kTest
+  }
+
+  layer {
+    partition_dim: 0
+    name: "conv1"
+    type: kCConvolution
+    srclayers: "data"
+    convolution_conf {
+      num_filters: 32
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w1"
+      init {
+        type:kGaussian
+        std:0.0001
+      }
+    }
+    param {
+      name: "b1"
+      lr_scale:2.0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+
+  layer {
+    partition_dim: 0
+    name: "pool1"
+    type: kCPooling
+    srclayers: "conv1"
+    pooling_conf {
+      pool: MAX
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "relu1"
+    type: kReLU
+    srclayers:"pool1"
+  }
+  layer {
+    partition_dim: 0
+    name: "norm1"
+    type: kLRN
+    lrn_conf {
+      local_size: 3
+      alpha: 5e-05
+      beta: 0.75
+    }
+    srclayers:"relu1"
+  }
+  layer {
+    partition_dim: 0
+    name: "conv2"
+    type: kCConvolution
+    srclayers: "norm1"
+    convolution_conf {
+      num_filters: 32
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w2"
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b2"
+      lr_scale:2.0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "relu2"
+    type: kReLU
+    srclayers:"conv2"
+  }
+  layer {
+    partition_dim: 0
+    name: "pool2"
+    type: kCPooling
+    srclayers: "relu2"
+    pooling_conf {
+      pool: AVG
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "norm2"
+    type: kLRN
+    lrn_conf {
+      local_size: 3
+      alpha: 5e-05
+      beta: 0.75
+    }
+    srclayers:"pool2"
+  }
+  layer {
+    partition_dim: 0
+    name: "conv3"
+    type: kCConvolution
+    srclayers: "norm2"
+    convolution_conf {
+      num_filters: 64
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w3"
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b3"
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+  layer {
+    partition_dim: 0
+    name: "relu3"
+    type: kReLU
+    srclayers:"conv3"
+  }
+  layer {
+    partition_dim: 0
+    name: "pool3"
+    type: kCPooling
+    srclayers: "relu3"
+    pooling_conf {
+      pool: AVG
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    partition_dim: 1
+    name: "ip1"
+    type: kInnerProduct
+    srclayers:"pool3"
+    innerproduct_conf {
+      num_output: 10
+    }
+    param {
+      name: "w4"
+      wd_scale:250
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b4"
+      lr_scale:2.0
+      wd_scale:0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+#  layer {
+#    name : "softmax"
+#    type: kSoftmax
+#    srclayers: "ip1"
+#  }
+#
+#  layer {
+#    name : "argsort"
+#    type: kArgSort
+#    srclayers: "softmax"
+#  }
+  layer{
+    name: "loss"
+    type: kSoftmaxLoss
+    softmaxloss_conf{
+      topk:1
+    }
+    srclayers:"ip1"
+    srclayers: "data"
+  }
+# uncomment "softmax", "argsort", "output" layer and comment "loss" layer
+# to extract features from argsort
+#  layer {
+#    name : "output"
+#    type: kCSVOutput
+#    srclayers: "argsort"
+#    store_conf {
+#      path: "examples/cifar10/out.csv"
+#    }
+#  }
+}
+cluster {
+  nworker_groups: 1
+  nserver_groups: 1
+  nworkers_per_group: 2
+  nworkers_per_procs: 2
+  workspace: "examples/cifar10"
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/include/singa/neuralnet/connection_layer.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/connection_layer.h b/include/singa/neuralnet/connection_layer.h
index ae918ce..14e5092 100644
--- a/include/singa/neuralnet/connection_layer.h
+++ b/include/singa/neuralnet/connection_layer.h
@@ -29,6 +29,24 @@
 #include "singa/neuralnet/layer.h"
 
 namespace singa {
 
+/**
+ * Used inside SplitLayer and SliceLayer to locate the out-going connection
+ * index given the Layer pointer.
+ */
+class Layer2Index {
+ public:
+  int Get(const Layer* layer) {
+    if (layer2idx_.find(layer) == layer2idx_.end()) {
+      int idx = layer2idx_.size();
+      layer2idx_[layer] = idx;
+    }
+    return layer2idx_[layer];
+  }
+
+ private:
+  std::unordered_map<const Layer*, int> layer2idx_;
+};
+
 class BridgeLayer : public ConnectionLayer {
  public:
@@ -102,14 +120,15 @@ class SliceLayer : public ConnectionLayer {
   void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
   void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
   const std::string ToString(bool debug, int flag) override;
-  const Blob<float>& data(const Layer* from) const override;
-  const Blob<float>& grad(const Layer* from) const override;
+  const Blob<float>& data(const Layer* from) override;
+  const Blob<float>& grad(const Layer* from) override;
   Blob<float>* mutable_data(const Layer* from) override;
   Blob<float>* mutable_grad(const Layer* from) override;
-
+
  private:
   int num_slices = 0;
   int slice_dim = 0;
+  Layer2Index layer_idx_;
 };
 
 /**
@@ -126,11 +145,12 @@ class SplitLayer : public ConnectionLayer {
   void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
   void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
   const std::string ToString(bool debug, int flag) override;
-  const Blob<float>& grad(const Layer* from) const override;
+  const Blob<float>& grad(const Layer* from) override;
   Blob<float>* mutable_grad(const Layer* from) override;
 
  private:
   int num_splits = 0;
+  Layer2Index layer_idx_;
 };

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/include/singa/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/layer.h b/include/singa/neuralnet/layer.h
index 3dbb3fc..28ab92e 100644
--- a/include/singa/neuralnet/layer.h
+++ b/include/singa/neuralnet/layer.h
@@ -174,6 +174,13 @@ class Layer {
    */
   inline const std::string& name() const { return layer_conf_.name(); }
   /**
+   * @return a const ref for Blob vector storing feature values of this layer.
+   */
+  virtual const vector<Blob<float>*>& data() const {
+    return datavec_;
+  }
+
+  /**
    * @param[in] from pointer to one of the dst layer. For some layers, they have
    * more than one data Blob. In this case, this argument identifies the layer
    * that is requesting the data Blob.
@@ -182,21 +189,19 @@
    * virtual const vector<Blob<float>>& data() const or
    * virtual const Blob<float>& data(int k) const instead}.
    */
-  virtual const Blob<float>& data(const Layer* from) const {
+  virtual const Blob<float>& data(const Layer* from) {
     return data_;
   }
   /**
-   * @return a const ref for Blob vector storing feature values of this layer.
-   */
-  virtual const vector<Blob<float>*>& data() const {
-    return datavec_;
-  }
-  /**
    * @return a const ref for the kth Blob.
+   * TODO(wangwei) if make this function const, there will be a warning
+   * indicating that data(const Layer*) and this function are ambiguous for
+   * data(0).
    */
-  virtual const Blob<float>& data(int k) const {
+  virtual const Blob<float>& data(int k) {
     return *datavec_.at(k);
   }
+
   /**
    * @see data().
    * @return the pointer to the Blob storing feature values of this layer.
@@ -226,7 +231,7 @@
    * virtual const vector<Blob<float>>& grad() const or
    * virtual const Blob<float>& grad(int k) const instead}.
    */
-  virtual const Blob<float>& grad(const Layer* from) const {
+  virtual const Blob<float>& grad(const Layer* from) {
     return grad_;
   }
   /**
@@ -286,7 +291,7 @@ class InputLayer : virtual public Layer {
     return nullptr;
     // LOG(FATAL) << "Input layer has no gradient blob";
   }
-  const Blob<float>& grad(const Layer* from) const override {
+  const Blob<float>& grad(const Layer* from) override {
     return grad_;
     // LOG(FATAL) << "Input layer has no gradient blob";
   }
@@ -312,7 +317,7 @@ class LossLayer : virtual public Layer {
     return nullptr;
     // LOG(FATAL) << "Loss layer has no gradient blob";
   }
-  const Blob<float>& grad(const Layer* from) const override {
+  const Blob<float>& grad(const Layer* from) override {
     return grad_;
     // LOG(FATAL) << "Loss layer has no gradient blob";
   }
@@ -328,7 +333,7 @@ class OutputLayer : virtual public Layer {
     return nullptr;
     // LOG(FATAL) << "Output layer has no gradient blob";
   }
-  const Blob<float>& grad(const Layer* from) const override {
+  const Blob<float>& grad(const Layer* from) override {
     return grad_;
     // LOG(FATAL) << "Output layer has no gradient blob";
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/src/neuralnet/connection_layer/bridge.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/bridge.cc b/src/neuralnet/connection_layer/bridge.cc
index a2302ab..2cfd55a 100644
--- a/src/neuralnet/connection_layer/bridge.cc
+++ b/src/neuralnet/connection_layer/bridge.cc
@@ -69,8 +69,8 @@ void BridgeSrcLayer::Setup(const LayerProto& conf,
   Layer::Setup(conf, srclayers);
   data_.Reshape(srclayers[0]->data(this).shape());
   grad_.ReshapeLike(data_);
-  data_.ShareData(srclayers[0]->mutable_data(this));
-  grad_.ShareData(srclayers[0]->mutable_grad(this));
+  data_.ShareData(srclayers[0]->mutable_data(this), false);
+  grad_.ShareData(srclayers[0]->mutable_grad(this), false);
 }
 
 void BridgeSrcLayer::ComputeFeature(int flag, const vector<Layer*>& srcs) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/src/neuralnet/connection_layer/slice.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/slice.cc b/src/neuralnet/connection_layer/slice.cc
index db83f1b..b625c66 100644
--- a/src/neuralnet/connection_layer/slice.cc
+++ b/src/neuralnet/connection_layer/slice.cc
@@ -98,29 +98,30 @@ void SliceLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   }
 }
 
-const Blob<float>& SliceLayer::data(const Layer* from) const {
-  int id = from ? from->partition_id() : 0;
-  CHECK_LT(id, num_slices);
-  return *datavec_[id];
+const Blob<float>& SliceLayer::data(const Layer* from) {
+  int idx = from ? layer_idx_.Get(from) : 0;
+  CHECK_LT(idx, num_slices);
+  return *datavec_[idx];
 }
 
-const Blob<float>& SliceLayer::grad(const Layer* from) const {
-  int id = from ? from->partition_id() : 0;
-  CHECK_LT(id, num_slices);
-  return *gradvec_[id];
+const Blob<float>& SliceLayer::grad(const Layer* from) {
+  int idx = from ? layer_idx_.Get(from) : 0;
+  CHECK_LT(idx, num_slices);
+  return *gradvec_[idx];
 }
 
 Blob<float>* SliceLayer::mutable_data(const Layer* from) {
-  int id = from ? from->partition_id() : 0;
   CHECK(from);
-  CHECK_LT(id, num_slices);
-  return datavec_[id];
+  int idx = layer_idx_.Get(from);
+  CHECK_LT(idx, num_slices);
+  return datavec_[idx];
 }
 
 Blob<float>* SliceLayer::mutable_grad(const Layer* from) {
-  int id = from ? from->partition_id() : 0;
-  CHECK_LT(id, num_slices);
-  return gradvec_[id];
+  CHECK(from);
+  int idx = layer_idx_.Get(from);
+  CHECK_LT(idx, num_slices);
+  return gradvec_[idx];
 }
 
 const std::string SliceLayer::ToString(bool debug, int flag) {
   if (!debug)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/82563f69/src/neuralnet/connection_layer/split.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/split.cc b/src/neuralnet/connection_layer/split.cc
index 9bbe59e..a9270fb 100644
--- a/src/neuralnet/connection_layer/split.cc
+++ b/src/neuralnet/connection_layer/split.cc
@@ -62,16 +62,18 @@ void SplitLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   Copy(*gradvec_[0], srclayers[0]->mutable_grad(this));
 }
 
-const Blob<float>& SplitLayer::grad(const Layer* from) const {
+const Blob<float>& SplitLayer::grad(const Layer* from) {
   CHECK(from);
-  CHECK_LT(from->partition_id(), num_splits);
-  return *gradvec_[from->partition_id()];
+  int idx = layer_idx_.Get(from);
+  CHECK_LT(idx, num_splits);
+  return *gradvec_[idx];
 }
 
 Blob<float>* SplitLayer::mutable_grad(const Layer* from) {
   CHECK(from);
-  CHECK_LT(from->partition_id(), num_splits);
-  return gradvec_[from->partition_id()];
+  int idx = layer_idx_.Get(from);
+  CHECK_LT(idx, num_splits);
+  return gradvec_[idx];
 }
 
 const std::string SplitLayer::ToString(bool debug, int flag) {
   if (!debug)
