SINGA-192 Implement optimization algorithms for v1

Change the interface of Apply() in the Optimizer class: Apply() now receives a ``const Tensor& grad'' argument rather than ``Tensor* grad''; that is to say, the method no longer has side effects on the grad tensor.
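In short, sub-classes now take the gradient by const reference and must do any scaling on a private copy. Below is a minimal sketch of the new contract, assuming only the header touched by this commit; `PlainSGD` is a hypothetical sub-class used purely for illustration (the real implementations are in the .cc diffs further down):

```cpp
// Sketch only: illustrates the signature change, not part of the commit.
#include "singa/model/optimizer.h"

namespace singa {

// Old pure-virtual signature (removed by this commit):
//   virtual void Apply(int step, float lr, const string& name, Tensor* grad,
//                      Tensor* value) = 0;
// New pure-virtual signature (grad is read-only):
//   virtual void Apply(int step, float lr, const string& name,
//                      const Tensor& grad, Tensor* value) = 0;

// Hypothetical sub-class showing how the new contract is honoured:
// clone the gradient, scale the copy, and update only the parameter tensor.
class PlainSGD : public Optimizer {
 public:
  void Apply(int step, float lr, const string& name, const Tensor& grad,
             Tensor* value) override {
    Tensor tmp = grad.Clone();  // work on a copy, leave the caller's grad untouched
    tmp *= lr;                  // same scaling SGD::Apply now performs on its copy
    (*value) -= tmp;            // the parameter tensor is the only side effect
  }
};

}  // namespace singa
```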
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/0cd96639
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/0cd96639
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/0cd96639

Branch: refs/heads/dev
Commit: 0cd96639e232b4a83a2b33e51b05132085b7d31b
Parents: 272100a
Author: WANG Ji <[email protected]>
Authored: Mon Jun 13 11:23:30 2016 +0800
Committer: Wei Wang <[email protected]>
Committed: Mon Jun 13 20:28:52 2016 +0800

----------------------------------------------------------------------
 include/singa/model/optimizer.h  | 22 ++++++++++++++++------
 src/model/optimizer/adagrad.cc   |  9 +++++----
 src/model/optimizer/nesterov.cc  | 10 ++++++----
 src/model/optimizer/optimizer.cc |  2 +-
 src/model/optimizer/rmsprop.cc   |  9 +++++----
 src/model/optimizer/sgd.cc       |  9 +++++----
 src/proto/model.proto            |  2 +-
 test/singa/test_adagrad.cc       |  8 ++++----
 test/singa/test_nesterov.cc      |  8 ++++----
 test/singa/test_rmsprop.cc       |  8 ++++----
 test/singa/test_sgd.cc           | 16 ++++++++--------
 11 files changed, 59 insertions(+), 44 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/include/singa/model/optimizer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/optimizer.h b/include/singa/model/optimizer.h
index 7da1db8..f912668 100644
--- a/include/singa/model/optimizer.h
+++ b/include/singa/model/optimizer.h
@@ -62,7 +62,7 @@ class Optimizer {
   /// conducted. It assumes all these operations are done either by users or
   /// by Apply(int, const string&, Tensor*, Tensor*).
   /// All sub-classes should override this function.
-  virtual void Apply(int step, float lr, const string& name, Tensor* grad,
+  virtual void Apply(int step, float lr, const string& name, const Tensor& grad,
                      Tensor* value) = 0;
 
   /// Apply the updating algorithm.
@@ -76,6 +76,9 @@ class Optimizer {
   void SetLearningRateGenerator(function<float(int)> func) {
     learning_rate_generator_ = func;
   }
+  /// Since Optimizer base layer has pure virtual function, a virtual
+  /// deconstructor is needed.
+  virtual ~Optimizer() = default;
 
  protected:
   function<float(int)> learning_rate_generator_;
@@ -109,6 +112,7 @@ class Constraint {
   /// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
   void Apply(int step, const vector<Tensor*>& grads,
              const vector<Tensor*>& values);
+
  private:
   /// currently only support "L2" norm constraint, i.e., the norm should be less
   /// than the configured threshold_, otherwise, the parameters would be clipped
@@ -142,6 +146,7 @@ class Regularizer {
   /// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
   void Apply(int step, const vector<Tensor*>& grads,
              const vector<Tensor*>& values);
+
  private:
   /// currently only support "L2" regularizer. type_ is case insensitive.
   /// TODO(wangwei) add more regularizer, e.g., L1.
@@ -154,7 +159,7 @@ class SGD : Optimizer {
  public:
   void Setup(const OptimizerConf& conf);
   /// Apply the updating algorithm.
-  void Apply(int step, float lr, const string& name, Tensor* grad,
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
              Tensor* value) override;
 
   /// The argument function returns the momentum value given the current running
@@ -162,6 +167,7 @@ class SGD : Optimizer {
   void SetMomentumGenerator(std::function<float(int)> func) {
     momentum_generator_ = func;
   }
+  virtual ~SGD() = default;
 
  private:
   std::unordered_map<string, Tensor> history_gradient_;
@@ -173,7 +179,7 @@ class Nesterov : Optimizer {
  public:
   void Setup(const OptimizerConf& conf);
   /// Apply the updating algorithm.
-  void Apply(int step, float lr, const string& name, Tensor* grad,
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
              Tensor* value) override;
 
   /// The argument function returns the momentum value given the current running
@@ -181,6 +187,7 @@ class Nesterov : Optimizer {
   void SetMomentumGenerator(std::function<float(int)> func) {
     momentum_generator_ = func;
   }
+  virtual ~Nesterov() = default;
 
  private:
   std::unordered_map<string, Tensor> history_gradient_;
@@ -192,8 +199,9 @@ class Adagrad : Optimizer {
  public:
   void Setup(const OptimizerConf& conf);
   /// Apply the updating algorithm.
-  void Apply(int step, float lr, const string& name, Tensor* grad,
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
              Tensor* value) override;
+  virtual ~Adagrad() = default;
 
  private:
   std::unordered_map<string, Tensor> history_gradient_;
@@ -204,8 +212,9 @@ class RMSProp : Optimizer {
  public:
   void Setup(const OptimizerConf& conf);
   /// Apply the updating algorithm.
-  void Apply(int step, float lr, const string& name, Tensor* grad,
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
              Tensor* value) override;
+  virtual ~RMSProp() = default;
 
  private:
   std::unordered_map<string, Tensor> history_gradient_;
@@ -236,7 +245,8 @@ class LocalAllReduce : public Optimizer{
   /// 2. Partition parameters onto worker devices. For example, model parameter
   /// set is {A, B, C}, nb_workers = 3, then worker 0/1/2 would be in charge of
   /// updating A/B/C respectively. A gradient Tensor for A/B/C would be created
-  /// on device 0/1/2, dentoed as GA/GB/GC. 0/1/2 would call the internal opt to register the specs
+  /// on device 0/1/2, dentoed as GA/GB/GC. 0/1/2 would call the internal opt to
+  /// register the specs
   /// for A/B/C.
   void Register(const vector<string>& names, const vector<Tensor>& values,


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/adagrad.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/adagrad.cc b/src/model/optimizer/adagrad.cc
index 0b8ec88..6910320 100644
--- a/src/model/optimizer/adagrad.cc
+++ b/src/model/optimizer/adagrad.cc
@@ -23,14 +23,15 @@ namespace singa {
 
 void Adagrad::Setup(const OptimizerConf& conf) { delta_ = conf.delta(); }
 
-void Adagrad::Apply(int step, float lr, const string& name, Tensor* grad,
+void Adagrad::Apply(int step, float lr, const string& name, const Tensor& grad,
                     Tensor* value) {
   if (history_gradient_.find(name) == history_gradient_.end())
     history_gradient_[name].ResetLike(*value);
   Tensor& history = history_gradient_[name];
-  history += Square(*grad);
-  (*grad) /= Sqrt(history + delta_);
-  Axpy(-lr, *grad, value);
+  Tensor tmp = grad.Clone();
+  history += Square(tmp);
+  tmp /= Sqrt(history + delta_);
+  Axpy(-lr, tmp, value);
 }
 }  // namespace singa
 #endif  // SRC_MODEL_OPTIMIZER_ADAGRAD_H_


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/nesterov.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/nesterov.cc b/src/model/optimizer/nesterov.cc
index 95c5531..2f16c06 100644
--- a/src/model/optimizer/nesterov.cc
+++ b/src/model/optimizer/nesterov.cc
@@ -26,16 +26,18 @@ void Nesterov::Setup(const OptimizerConf& conf) {
   SetMomentumGenerator([m](int step) { return m; });
 }
 
-void Nesterov::Apply(int step, float lr, const string& name, Tensor* grad,
+void Nesterov::Apply(int step, float lr, const string& name, const Tensor& grad,
                      Tensor* value) {
   if (momentum_generator_) {
     float mom = momentum_generator_(step);
     if (history_gradient_.find(name) == history_gradient_.end())
       history_gradient_[name].ResetLike(*value);
     Tensor& history = history_gradient_[name];
-    Tensor tmp = history;
-    history = history * mom + (*grad) * lr;
-    tmp = history * (1 + mom) - tmp * mom;
+    Tensor tmp = history.Clone();
+    history *= mom;
+    Axpy(lr, grad, &history);
+    tmp *= -mom;
+    Axpy(1 + mom, history, &tmp);
     (*value) -= tmp;
   }
 }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/optimizer.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/optimizer.cc b/src/model/optimizer/optimizer.cc
index 92b6b3d..c9e7a72 100644
--- a/src/model/optimizer/optimizer.cc
+++ b/src/model/optimizer/optimizer.cc
@@ -54,7 +54,7 @@ void Optimizer::Apply(int step, const string& name, Tensor* grad,
   float lr = learning_rate_generator_(step);
   if (learning_rate_multplier_.find(name) != learning_rate_multplier_.end())
     lr *= learning_rate_multplier_.at(name);
-  Apply(step, lr, name, grad, param);
+  Apply(step, lr, name, *grad, param);
 }
 
 void Regularizer::Setup(const RegularizerConf& conf) {


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/rmsprop.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/rmsprop.cc b/src/model/optimizer/rmsprop.cc
index 7b9934c..9f13e11 100644
--- a/src/model/optimizer/rmsprop.cc
+++ b/src/model/optimizer/rmsprop.cc
@@ -26,16 +26,17 @@ void RMSProp::Setup(const OptimizerConf& conf) {
   rho_ = conf.rho();
 }
 
-void RMSProp::Apply(int step, float lr, const string& name, Tensor* grad,
+void RMSProp::Apply(int step, float lr, const string& name, const Tensor& grad,
                     Tensor* value) {
   if (history_gradient_.find(name) == history_gradient_.end()) {
     history_gradient_[name].ResetLike(*value);
   }
   Tensor& history = history_gradient_[name];
   history *= rho_;
-  Axpy(1 - rho_, Square(*grad), &history);
-  (*grad) /= Sqrt(history + delta_);
-  Axpy(-lr, *grad, value);
+  Tensor tmp = grad.Clone();
+  Axpy(1 - rho_, Square(tmp), &history);
+  tmp /= Sqrt(history + delta_);
+  Axpy(-lr, tmp, value);
 }
 }  // namespace singa
 #endif  // SRC_MODEL_OPTIMIZER_ADAGRAD_H_


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/sgd.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/sgd.cc b/src/model/optimizer/sgd.cc
index 49c17c9..ae8b7ac 100644
--- a/src/model/optimizer/sgd.cc
+++ b/src/model/optimizer/sgd.cc
@@ -28,9 +28,10 @@ void SGD::Setup(const OptimizerConf& conf) {
   }
 }
 
-void SGD::Apply(int step, float lr, const string& name, Tensor* grad,
+void SGD::Apply(int step, float lr, const string& name, const Tensor& grad,
                 Tensor* value) {
-  (*grad) *= lr;
+  Tensor tmp = grad.Clone();
+  tmp *= lr;
   if (momentum_generator_) {
     float mom = momentum_generator_(step);
     if (mom != 0) {
@@ -38,12 +39,12 @@ void SGD::Apply(int step, float lr, const string& name, Tensor* grad,
         history_gradient_[name].ResetLike(*value);
       Tensor& history = history_gradient_[name];
       history *= mom;
-      history += *grad;
+      history += tmp;
       (*value) -= history;
       return;
     }
   }
-  (*value) -= *grad;
+  (*value) -= tmp;
 }
 }  // namespace singa
 #endif  // SRC_MODEL_OPTIMIZER_SGD_H_


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 590fdd6..c49f767 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -78,7 +78,7 @@ message OptimizerConf {
   optional string type = 1 [default = "sgd"];
 
   // used by RMSprop and Adadelta
-  optional float rho = 2 [default = 0.001];
+  optional float rho = 2 [default = 0.95];
 
   // used by Adam and AdamMax
   optional float beta_1 = 3 [default = 0.9];


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/test/singa/test_adagrad.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_adagrad.cc b/test/singa/test_adagrad.cc
index 80240b1..ef930d5 100644
--- a/test/singa/test_adagrad.cc
+++ b/test/singa/test_adagrad.cc
@@ -36,7 +36,7 @@ TEST(Adagrad, ApplyCPU) {
 
   singa::OptimizerConf conf;
   adagrad.Setup(conf);
-  adagrad.Apply(0, lr, "xx", &grad, &value);
+  adagrad.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -47,7 +47,7 @@ TEST(Adagrad, ApplyCPU) {
                 1e-5);
 
   grad.CopyDataFromHostPtr(g, 4);
-  adagrad.Apply(1, lr, "xx", &grad, &value);
+  adagrad.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
@@ -71,7 +71,7 @@ TEST(Adagrad, ApplyCUDA) {
 
   singa::OptimizerConf conf;
   adagrad.Setup(conf);
-  adagrad.Apply(0, lr, "xx", &grad, &value);
+  adagrad.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -83,7 +83,7 @@ TEST(Adagrad, ApplyCUDA) {
                 1e-5);
 
   grad.CopyDataFromHostPtr(g, 4);
-  adagrad.Apply(1, lr, "xx", &grad, &value);
+  adagrad.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/test/singa/test_nesterov.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_nesterov.cc b/test/singa/test_nesterov.cc
index e7083c8..a9b264c 100644
--- a/test/singa/test_nesterov.cc
+++ b/test/singa/test_nesterov.cc
@@ -35,7 +35,7 @@ TEST(Nesterov, ApplyCPU) {
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 
-  nesterov.Apply(0, lr, "xx", &grad, &value);
+  nesterov.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -47,7 +47,7 @@ TEST(Nesterov, ApplyCPU) {
   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
 
   grad.CopyDataFromHostPtr(g, 4);
-  nesterov.Apply(1, lr, "xx", &grad, &value);
+  nesterov.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; ++i) {
@@ -73,7 +73,7 @@ TEST(Nesterov, ApplyCUDA) {
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 
-  nesterov.Apply(0, lr, "xx", &grad, &value);
+  nesterov.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -86,7 +86,7 @@ TEST(Nesterov, ApplyCUDA) {
   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
 
   grad.CopyDataFromHostPtr(g, 4);
-  nesterov.Apply(1, lr, "xx", &grad, &value);
+  nesterov.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/test/singa/test_rmsprop.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_rmsprop.cc b/test/singa/test_rmsprop.cc
index 8104f50..ddfdefe 100644
--- a/test/singa/test_rmsprop.cc
+++ b/test/singa/test_rmsprop.cc
@@ -40,7 +40,7 @@ TEST(RMSProp, ApplyCPU) {
   grad.CopyDataFromHostPtr(g, 4);
 
   rmsprop.Setup(conf);
-  rmsprop.Apply(0, lr, "xx", &grad, &value);
+  rmsprop.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -51,7 +51,7 @@ TEST(RMSProp, ApplyCPU) {
                 1e-5);
 
   grad.CopyDataFromHostPtr(g, 4);
-  rmsprop.Apply(1, lr, "xx", &grad, &value);
+  rmsprop.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; ++i)
@@ -80,7 +80,7 @@ TEST(RMSProp, ApplyCUDA) {
   grad.CopyDataFromHostPtr(g, 4);
 
   rmsprop.Setup(conf);
-  rmsprop.Apply(0, lr, "xx", &grad, &value);
+  rmsprop.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -92,7 +92,7 @@ TEST(RMSProp, ApplyCUDA) {
                 1e-5);
 
   grad.CopyDataFromHostPtr(g, 4);
-  rmsprop.Apply(1, lr, "xx", &grad, &value);
+  rmsprop.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/test/singa/test_sgd.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc
index c0b6e2b..5417b04 100644
--- a/test/singa/test_sgd.cc
+++ b/test/singa/test_sgd.cc
@@ -33,7 +33,7 @@ TEST(SGD, ApplyWithoutMomentum) {
   grad.CopyDataFromHostPtr(g, 4);
 
   float lr = 0.1f;
-  sgd.Apply(0, lr, "xx", &grad, &value);
+  sgd.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -44,7 +44,7 @@ TEST(SGD, ApplyWithoutMomentum) {
 
   lr /= 2;
   grad.CopyDataFromHostPtr(g, 4);
-  sgd.Apply(1, lr, "xx", &grad, &value);
+  sgd.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; i++) {
@@ -65,7 +65,7 @@ TEST(SGD, ApplyWithMomentum) {
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 
-  sgd.Apply(0, lr, "xx", &grad, &value);
+  sgd.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -74,7 +74,7 @@ TEST(SGD, ApplyWithMomentum) {
   }
 
   grad.CopyDataFromHostPtr(g, 4);
-  sgd.Apply(1, lr, "xx", &grad, &value);
+  sgd.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; i++) {
@@ -94,7 +94,7 @@ TEST(SGD, ApplyWithoutMomentumCuda) {
   grad.CopyDataFromHostPtr(g, 4);
 
   float lr = 0.1f;
-  sgd.Apply(0, lr, "xx", &grad, &value);
+  sgd.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -106,7 +106,7 @@ TEST(SGD, ApplyWithoutMomentumCuda) {
 
   lr /= 2;
   grad.CopyDataFromHostPtr(g, 4);
-  sgd.Apply(1, lr, "xx", &grad, &value);
+  sgd.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();
@@ -129,7 +129,7 @@ TEST(SGD, ApplyWithMomentumCuda) {
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 
-  sgd.Apply(0, lr, "xx", &grad, &value);
+  sgd.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -139,7 +139,7 @@ TEST(SGD, ApplyWithMomentumCuda) {
   }
 
   grad.CopyDataFromHostPtr(g, 4);
-  sgd.Apply(1, lr, "xx", &grad, &value);
+  sgd.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();
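Correspondingly, callers now pass the gradient by const reference, as the updated tests above show. A usage sketch along the lines of test_sgd.cc; the Tensor(Shape) constructor and the header paths are assumptions, since the unchanged parts of the tests are not visible in this diff:

```cpp
#include <cstdio>

#include "singa/core/tensor.h"      // assumed header path for singa::Tensor
#include "singa/model/optimizer.h"

int main() {
  const float v[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  const float g[4] = {0.01f, 0.02f, 0.03f, 0.04f};

  // Tensor construction is assumed; only CopyDataFromHostPtr/Clone/data<>()
  // appear in the diff above.
  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
  value.CopyDataFromHostPtr(v, 4);
  grad.CopyDataFromHostPtr(g, 4);

  singa::SGD sgd;
  singa::OptimizerConf conf;
  sgd.Setup(conf);

  float lr = 0.1f;
  // grad is taken by const reference and is no longer scaled in place,
  // so it can simply be refilled and reused for the next step.
  sgd.Apply(0, lr, "xx", grad, &value);

  grad.CopyDataFromHostPtr(g, 4);
  sgd.Apply(1, lr / 2, "xx", grad, &value);

  // Read back the updated parameter values.
  singa::Tensor out = value.Clone();
  const float* newv = out.data<const float*>();
  for (int i = 0; i < 4; i++)
    std::printf("param[%d] = %f\n", i, newv[i]);
  return 0;
}
```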
