Repository: incubator-singa
Updated Branches:
  refs/heads/master a8c8211f4 -> 538736c4a
SINGA-46 Fix a bug in updater.cc to scale the gradients

Scale gradients in Updater::Update() before updating parameters.
Format code in updater.h and updater.cc.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6b34ff4e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6b34ff4e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6b34ff4e

Branch: refs/heads/master
Commit: 6b34ff4e539ade046d916fa2af52af425a304f2d
Parents: d5b6a30
Author: Wei Wang <[email protected]>
Authored: Wed Aug 12 16:30:00 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Wed Aug 12 16:32:37 2015 +0800

----------------------------------------------------------------------
 include/utils/updater.h |  41 ++++++++------
 src/utils/updater.cc    | 130 ++++++++++++++++++++++---------------------
 2 files changed, 89 insertions(+), 82 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6b34ff4e/include/utils/updater.h
----------------------------------------------------------------------
diff --git a/include/utils/updater.h b/include/utils/updater.h
index d2f4dc1..99629cf 100644
--- a/include/utils/updater.h
+++ b/include/utils/updater.h
@@ -1,55 +1,61 @@
-#ifndef INCLUDE_UTILS_UPDATER_H_
-#define INCLUDE_UTILS_UPDATER_H_
+#ifndef SINGA_UTILS_UPDATER_H_
+#define SINGA_UTILS_UPDATER_H_
+
 #include "proto/job.pb.h"
 #include "utils/param.h"
 
-namespace singa{
+namespace singa {
 /**
  * Updater for Param.
  */
 class Updater{
  public:
   virtual ~Updater() {}
-  virtual void Init(const UpdaterProto &proto){
-    proto_=proto;
+  virtual void Init(const UpdaterProto &proto) {
+    proto_ = proto;
   }
-  virtual void Update(int step, Param* param, float grad_scale=1.0f)=0;
+  virtual void Update(int step, Param* param, float grad_scale = 1.0f) = 0;
   float GetLearningRate(int step);
+
  protected:
   UpdaterProto proto_;
 };
-class SGDUpdater : public Updater{
+
+class SGDUpdater : public Updater {
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, Param* param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale = 1.0f);
 
  protected:
   float base_lr_;
   float momentum_;
   float weight_decay_;
 };
-class NesterovUpdater : public Updater{
+
+class AdaGradUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, Param* param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale = 1.0f);
 
  protected:
   float base_lr_;
-  float momentum_;
+  float delta_;
   float weight_decay_;
 };
-class AdaGradUpdater : public Updater{
+
+
+class NesterovUpdater : public Updater {
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, Param* param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale = 1.0f);
 
  protected:
   float base_lr_;
-  float delta_;
+  float momentum_;
   float weight_decay_;
 };
 
-
+/*
 class RMSPropUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
@@ -62,7 +68,6 @@ class RMSPropUpdater : public Updater{
   float weight_decay_;
 };
 
-/*
 class AdaDeltaUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
@@ -74,6 +79,6 @@ class AdaDeltaUpdater : public Updater{
   float weight_decay_;
 };
 */
-}
+}  // namespace singa
 
-#endif  // INCLUDE_UTILS_UPDATER_H_
+#endif  // SINGA_UTILS_UPDATER_H_
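
For context, a minimal sketch (not part of this commit) of how a caller might
drive the Updater interface declared above. The helper name ApplyUpdates, the
params vector, and the 1.0f / batch_size example are illustrative assumptions;
only Updater::Update() and Param come from the header.

    #include <vector>
    #include "utils/updater.h"

    // Hypothetical driver: apply one optimization step to every parameter.
    // After this commit, grad_scale (e.g. 1.0f / batch_size) is applied
    // inside Update() itself, before weight decay and the history update.
    void ApplyUpdates(singa::Updater* updater, int step,
                      const std::vector<singa::Param*>& params,
                      float grad_scale) {
      for (singa::Param* param : params)
        updater->Update(step, param, grad_scale);
    }
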
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6b34ff4e/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index 18e53ce..b85982e 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -3,27 +3,28 @@
 #include "mshadow/tensor.h"
 #include "mshadow/cxxnet_op.h"
 #include "proto/job.pb.h"
+namespace singa {
+
 using namespace mshadow;
 using namespace mshadow::expr;
 
-namespace singa {
-float Updater::GetLearningRate(int step){
-  float ret = 0., r = 0., base=proto_.base_lr();
-  int freq=0;
+float Updater::GetLearningRate(int step) {
+  float ret = 0., r = 0., base = proto_.base_lr();
+  int freq = 0;
   switch (proto_.lr_change()) {
     case UpdaterProto_ChangeMethod_kFixed:
       ret = base;
       break;
     case UpdaterProto_ChangeMethod_kLinear:
       // a is init, b is the final
-      freq=proto_.linear_conf().change_freq();
+      freq = proto_.linear_conf().change_freq();
       r = step * 1.0 / freq;
       ret = (1.0 - r) * base + r * proto_.linear_conf().final_lr();
       break;
     case UpdaterProto_ChangeMethod_kExponential:
       // a is init, b is the final, from convnet
-      freq=proto_.exponential_conf().change_freq();
+      freq = proto_.exponential_conf().change_freq();
       ret = base / pow(2, step * 1. / freq);
       break;
     case UpdaterProto_ChangeMethod_kInverseT:
@@ -34,19 +35,19 @@ float Updater::GetLearningRate(int step){
       break;
     case UpdaterProto_ChangeMethod_kInverse:
       // a is init, b is gamma, c is pow
-      ret=base*pow(1.f+proto_.inverse_conf().gamma()*step,
-               -proto_.inverse_conf().pow());
+      ret = base * pow(1.f + proto_.inverse_conf().gamma() * step,
+               - proto_.inverse_conf().pow());
       break;
     case UpdaterProto_ChangeMethod_kStep:
       // a is the base learning rate, b is gamma, from caffe
       // notice it is step/change_steps, not step*1.0/change_steps
-      freq=proto_.step_conf().change_freq();
+      freq = proto_.step_conf().change_freq();
       ret = base * pow(proto_.step_conf().gamma(), step / freq);
       break;
     case UpdaterProto_ChangeMethod_kFixedStep:
-      for(int i=0;i<proto_.fixedstep_conf().step_size();i++){
-        if(step>proto_.fixedstep_conf().step(i))
-          ret=proto_.fixedstep_conf().step_lr(i);
+      for (int i = 0; i < proto_.fixedstep_conf().step_size(); i++) {
+        if (step > proto_.fixedstep_conf().step(i))
+          ret = proto_.fixedstep_conf().step_lr(i);
       }
       break;
     default:
@@ -56,91 +57,93 @@ float Updater::GetLearningRate(int step){
 }
 
 /***********************SGD with momentum******************************/
-void SGDUpdater::Init(const UpdaterProto& proto){
+void SGDUpdater::Init(const UpdaterProto& proto) {
   Updater::Init(proto);
-  base_lr_=proto.base_lr();
-  //CHECK_GT(base_lr_, 0);
-  momentum_=proto.momentum();
-  weight_decay_=proto.weight_decay();
+  base_lr_ = proto.base_lr();
+  momentum_ = proto.momentum();
+  weight_decay_ = proto.weight_decay();
 }
 
-void SGDUpdater::Update(int step, Param* param, float grad_scale){
-  Shape<1> s=Shape1(param->size());
+void SGDUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s = Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd=weight_decay_*param->weight_decay_multiplier();
-  if(wd>0){ // L2 regularization
-    grad+=data*wd;
+  float lr = GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd = weight_decay_*param->weight_decay_multiplier();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0) {  // L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
   }
-  if(momentum_>0){
+  if (momentum_ > 0) {
     Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-    if(step==0) history=0;
-    history=history*momentum_-lr*grad;
-    data+=history;
-  }else{
-    grad*=-lr;
-    data+=grad;
+    history = history * momentum_ - lr * grad;
+    data += history;
+  } else {
+    grad *= -lr;
+    data += grad;
   }
 }
 
 /***********************Nesterov******************************/
-void NesterovUpdater::Init(const UpdaterProto& proto){
+void NesterovUpdater::Init(const UpdaterProto& proto) {
   Updater::Init(proto);
-  base_lr_=proto.base_lr();
+  base_lr_ = proto.base_lr();
   CHECK_GT(base_lr_, 0);
-  weight_decay_=proto.weight_decay();
+  weight_decay_ = proto.weight_decay();
 }
 
-void NesterovUpdater::Update(int step, Param* param, float grad_scale){
-  Shape<1> s=Shape1(param->size());
+void NesterovUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s = Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
   TensorContainer<cpu, 1> tmp(s);
-  if(step==0) history=0;
-  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd=weight_decay_*param->weight_decay_multiplier();
-  if(wd>0){ // L2 regularization
-    grad+=data*wd;
+  float lr = GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd = weight_decay_*param->weight_decay_multiplier();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0) {  // L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
   }
   Copy(tmp, history);
-  history=history*momentum_+lr*grad;
-  tmp=history*(1+momentum_)-tmp*momentum_;
-  data-=tmp;
+  history = history * momentum_ + lr * grad;
+  tmp = history * (1 + momentum_) - tmp * momentum_;
+  data -= tmp;
 }
 
 /***********************AdaGrad******************************/
-void AdaGradUpdater::Init(const UpdaterProto& proto){
+void AdaGradUpdater::Init(const UpdaterProto& proto) {
   Updater::Init(proto);
-  base_lr_=proto.base_lr();
+  base_lr_ = proto.base_lr();
   CHECK_GT(base_lr_, 0);
-  delta_=proto.delta();
-  weight_decay_=proto.weight_decay();
+  delta_ = proto.delta();
+  weight_decay_ = proto.weight_decay();
 }
 
-void AdaGradUpdater::Update(int step, Param* param, float grad_scale){
-  Shape<1> s=Shape1(param->size());
+void AdaGradUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s = Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  if(step==0) history=0;
-  history+=F<op::square>(grad*grad_scale);
-  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd=weight_decay_*param->weight_decay_multiplier();
-  if(wd>0){ // L2 regularization
-    grad+=data*wd;
+  float lr = GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd = weight_decay_*param->weight_decay_multiplier();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0) {  // L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
  }
-  data-=lr*grad/(F<op::sqrtop>(history,delta_));
+  history += F<op::square>(grad);
+  data -= lr * grad / (F<op::sqrtop>(history, delta_));
 }
 
-/***********************RMSProp******************************/
+/***********************RMSProp******************************
 void RMSPropUpdater::Init(const UpdaterProto& proto){
   Updater::Init(proto);
-  base_lr_=proto.base_lr();
+  base_lr_ = proto.base_lr();
   CHECK_GT(base_lr_, 0);
-  delta_=proto.delta();
-  rho_=proto.rmsprop_conf().rho();
-  weight_decay_=proto.weight_decay();
+  delta_ = proto.delta();
+  rho_ = proto.rmsprop_conf().rho();
+  weight_decay_ = proto.weight_decay();
 }
 
 void RMSPropUpdater::Update(int step, Param* param, float grad_scale){
@@ -148,7 +151,6 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale){
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  if(step==0) history=0;
   history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale);
   float lr=GetLearningRate(step)*param->learning_rate_multiplier();
   float wd=weight_decay_*param->weight_decay_multiplier();
@@ -158,7 +160,7 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale){
   data-=lr*grad/(F<op::sqrtop>(history,delta_));
 }
 
-/***********************AdaDelta******************************
+***********************AdaDelta******************************
 void AdaDeltaUpdater::Init(const UpdaterProto& proto){
   Updater::Init(proto);
   delta_=proto.delta();
@@ -188,4 +190,4 @@ void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
 }
 */
 
-} /* singa */
+}  // namespace singa
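
To summarize the fix, here is a simplified sketch of the corrected update
order (plain floats instead of mshadow tensors, so not the actual SINGA code):
scale the raw gradient first, then add the L2 term, then fold the result into
the momentum history. The function name SgdStep and its flat-array signature
are illustrative assumptions.

    // Hypothetical scalar re-statement of SGDUpdater::Update() (momentum > 0
    // branch) after this commit.
    void SgdStep(float* data, float* grad, float* history, int n,
                 float lr, float momentum, float wd, float grad_scale) {
      for (int i = 0; i < n; ++i) {
        float g = grad[i] * grad_scale;   // 1. scale the gradient (the bug fix)
        if (wd > 0) g += wd * data[i];    // 2. L2 regularization, after scaling
        history[i] = history[i] * momentum - lr * g;  // 3. momentum history
        data[i] += history[i];            // 4. apply the update
      }
    }
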
