Repository: incubator-singa
Updated Branches:
  refs/heads/master a8c8211f4 -> 538736c4a
SINGA-46 Fix a bug in updater.cc to scale the gradients

Scale gradients in Updater::Update() before updating parameters.
Format code in updater.h and updater.cc.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6b34ff4e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6b34ff4e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6b34ff4e

Branch: refs/heads/master
Commit: 6b34ff4e539ade046d916fa2af52af425a304f2d
Parents: d5b6a30
Author: Wei Wang <[email protected]>
Authored: Wed Aug 12 16:30:00 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Wed Aug 12 16:32:37 2015 +0800

----------------------------------------------------------------------
 include/utils/updater.h |  41 ++++++++------
 src/utils/updater.cc    | 130 ++++++++++++++++++++++---------------------
 2 files changed, 89 insertions(+), 82 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6b34ff4e/include/utils/updater.h
----------------------------------------------------------------------
diff --git a/include/utils/updater.h b/include/utils/updater.h
index d2f4dc1..99629cf 100644
--- a/include/utils/updater.h
+++ b/include/utils/updater.h
@@ -1,55 +1,61 @@
-#ifndef INCLUDE_UTILS_UPDATER_H_
-#define INCLUDE_UTILS_UPDATER_H_
+#ifndef SINGA_UTILS_UPDATER_H_
+#define SINGA_UTILS_UPDATER_H_
+
 #include "proto/job.pb.h"
 #include "utils/param.h"
 
-namespace singa{
+namespace singa {
 /**
  * Updater for Param.
  */
 class Updater{
  public:
   virtual ~Updater() {}
-  virtual void Init(const UpdaterProto &proto){
-    proto_=proto;
+  virtual void Init(const UpdaterProto &proto) {
+    proto_ = proto;
   }
-  virtual void Update(int step, Param* param, float grad_scale=1.0f)=0;
+  virtual void Update(int step, Param* param, float grad_scale = 1.0f) = 0;
   float GetLearningRate(int step);
+
  protected:
   UpdaterProto proto_;
 };
-class SGDUpdater : public Updater{
+
+class SGDUpdater : public Updater {
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, Param* param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale = 1.0f);
 
  protected:
   float base_lr_;
   float momentum_;
   float weight_decay_;
 };
-class NesterovUpdater : public Updater{
+
+class AdaGradUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, Param* param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale = 1.0f);
 
  protected:
   float base_lr_;
-  float momentum_;
+  float delta_;
   float weight_decay_;
 };
-class AdaGradUpdater : public Updater{
+
+
+class NesterovUpdater : public Updater {
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, Param* param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale = 1.0f);
 
  protected:
   float base_lr_;
-  float delta_;
+  float momentum_;
   float weight_decay_;
 };
 
-
+/*
 class RMSPropUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
@@ -62,7 +68,6 @@ class RMSPropUpdater : public Updater{
   float weight_decay_;
 };
 
-/*
 class AdaDeltaUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
@@ -74,6 +79,6 @@ class AdaDeltaUpdater : public Updater{
   float weight_decay_;
 };
 */
-}
+}  // namespace singa
 
-#endif  // INCLUDE_UTILS_UPDATER_H_
+#endif  // SINGA_UTILS_UPDATER_H_
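
For context, a minimal sketch (not part of this commit) of how a caller might
drive the Updater interface declared above. The helper name ApplyUpdates, the
params vector, and the 1.0f / batch_size example are illustrative assumptions;
only Updater::Update() and Param come from the header.

    #include <vector>
    #include "utils/updater.h"

    // Hypothetical driver: apply one optimization step to every parameter.
    // After this commit, grad_scale (e.g. 1.0f / batch_size) is applied
    // inside Update() itself, before weight decay and the history update.
    void ApplyUpdates(singa::Updater* updater, int step,
                      const std::vector<singa::Param*>& params,
                      float grad_scale) {
      for (singa::Param* param : params)
        updater->Update(step, param, grad_scale);
    }
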
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6b34ff4e/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index 18e53ce..b85982e 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -3,27 +3,28 @@
 #include "mshadow/tensor.h"
 #include "mshadow/cxxnet_op.h"
 #include "proto/job.pb.h"
+namespace singa {
+
 using namespace mshadow;
 using namespace mshadow::expr;
 
-namespace singa {
-float Updater::GetLearningRate(int step){
-  float ret = 0., r = 0., base=proto_.base_lr();
-  int freq=0;
+float Updater::GetLearningRate(int step) {
+  float ret = 0., r = 0., base = proto_.base_lr();
+  int freq = 0;
   switch (proto_.lr_change()) {
     case UpdaterProto_ChangeMethod_kFixed:
       ret = base;
       break;
     case UpdaterProto_ChangeMethod_kLinear:
       // a is init, b is the final
-      freq=proto_.linear_conf().change_freq();
+      freq = proto_.linear_conf().change_freq();
       r = step * 1.0 / freq;
       ret = (1.0 - r) * base + r * proto_.linear_conf().final_lr();
       break;
     case UpdaterProto_ChangeMethod_kExponential:
       // a is init, b is the final, from convnet
-      freq=proto_.exponential_conf().change_freq();
+      freq = proto_.exponential_conf().change_freq();
       ret = base / pow(2, step * 1. / freq);
       break;
     case UpdaterProto_ChangeMethod_kInverseT:
@@ -34,19 +35,19 @@ float Updater::GetLearningRate(int step){
       break;
     case UpdaterProto_ChangeMethod_kInverse:
       // a is init, b is gamma, c is pow
-      ret=base*pow(1.f+proto_.inverse_conf().gamma()*step,
-               -proto_.inverse_conf().pow());
+      ret = base * pow(1.f + proto_.inverse_conf().gamma() * step,
+               - proto_.inverse_conf().pow());
       break;
     case UpdaterProto_ChangeMethod_kStep:
       // a is the base learning rate, b is gamma, from caffe
       // notice it is step/change_steps, not step*1.0/change_steps
-      freq=proto_.step_conf().change_freq();
+      freq = proto_.step_conf().change_freq();
       ret = base * pow(proto_.step_conf().gamma(), step / freq);
       break;
     case UpdaterProto_ChangeMethod_kFixedStep:
-      for(int i=0;i<proto_.fixedstep_conf().step_size();i++){
-        if(step>proto_.fixedstep_conf().step(i))
-          ret=proto_.fixedstep_conf().step_lr(i);
+      for (int i = 0; i < proto_.fixedstep_conf().step_size(); i++) {
+        if (step > proto_.fixedstep_conf().step(i))
+          ret = proto_.fixedstep_conf().step_lr(i);
       }
       break;
     default:
@@ -56,91 +57,93 @@ float Updater::GetLearningRate(int step){
 }
 
 /***********************SGD with momentum******************************/
-void SGDUpdater::Init(const UpdaterProto& proto){
+void SGDUpdater::Init(const UpdaterProto& proto) {
   Updater::Init(proto);
-  base_lr_=proto.base_lr();
-  //CHECK_GT(base_lr_, 0);
-  momentum_=proto.momentum();
-  weight_decay_=proto.weight_decay();
+  base_lr_ = proto.base_lr();
+  momentum_ = proto.momentum();
+  weight_decay_ = proto.weight_decay();
 }
 
-void SGDUpdater::Update(int step, Param* param, float grad_scale){
-  Shape<1> s=Shape1(param->size());
+void SGDUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s = Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd=weight_decay_*param->weight_decay_multiplier();
-  if(wd>0){ // L2 regularization
-    grad+=data*wd;
+  float lr = GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd = weight_decay_*param->weight_decay_multiplier();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0) {  // L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
   }
-  if(momentum_>0){
+  if (momentum_ > 0) {
     Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-    if(step==0) history=0;
-    history=history*momentum_-lr*grad;
-    data+=history;
-  }else{
-    grad*=-lr;
-    data+=grad;
+    history = history * momentum_ - lr * grad;
+    data += history;
+  } else {
+    grad *= -lr;
+    data += grad;
   }
 }
 
 /***********************Nesterov******************************/
-void NesterovUpdater::Init(const UpdaterProto& proto){
+void NesterovUpdater::Init(const UpdaterProto& proto) {
   Updater::Init(proto);
-  base_lr_=proto.base_lr();
+  base_lr_ = proto.base_lr();
   CHECK_GT(base_lr_, 0);
-  weight_decay_=proto.weight_decay();
+  weight_decay_ = proto.weight_decay();
 }
 
-void NesterovUpdater::Update(int step, Param* param, float grad_scale){
-  Shape<1> s=Shape1(param->size());
+void NesterovUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s = Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
   TensorContainer<cpu, 1> tmp(s);
-  if(step==0) history=0;
-  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd=weight_decay_*param->weight_decay_multiplier();
-  if(wd>0){ // L2 regularization
-    grad+=data*wd;
+  float lr = GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd = weight_decay_*param->weight_decay_multiplier();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0) {  // L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
   }
   Copy(tmp, history);
-  history=history*momentum_+lr*grad;
-  tmp=history*(1+momentum_)-tmp*momentum_;
-  data-=tmp;
+  history = history * momentum_ + lr * grad;
+  tmp = history * (1 + momentum_) - tmp * momentum_;
+  data -= tmp;
 }
 
 /***********************AdaGrad******************************/
-void AdaGradUpdater::Init(const UpdaterProto& proto){
+void AdaGradUpdater::Init(const UpdaterProto& proto) {
   Updater::Init(proto);
-  base_lr_=proto.base_lr();
+  base_lr_ = proto.base_lr();
   CHECK_GT(base_lr_, 0);
-  delta_=proto.delta();
-  weight_decay_=proto.weight_decay();
+  delta_ = proto.delta();
+  weight_decay_ = proto.weight_decay();
 }
 
-void AdaGradUpdater::Update(int step, Param* param, float grad_scale){
-  Shape<1> s=Shape1(param->size());
+void AdaGradUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s = Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  if(step==0) history=0;
-  history+=F<op::square>(grad*grad_scale);
-  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd=weight_decay_*param->weight_decay_multiplier();
-  if(wd>0){ // L2 regularization
-    grad+=data*wd;
+  float lr = GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd = weight_decay_*param->weight_decay_multiplier();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0) {  // L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
  }
-  data-=lr*grad/(F<op::sqrtop>(history,delta_));
+  history += F<op::square>(grad);
+  data -= lr * grad / (F<op::sqrtop>(history, delta_));
 }
 
-/***********************RMSProp******************************/
+/***********************RMSProp******************************
 void RMSPropUpdater::Init(const UpdaterProto& proto){
   Updater::Init(proto);
-  base_lr_=proto.base_lr();
+  base_lr_ = proto.base_lr();
   CHECK_GT(base_lr_, 0);
-  delta_=proto.delta();
-  rho_=proto.rmsprop_conf().rho();
-  weight_decay_=proto.weight_decay();
+  delta_ = proto.delta();
+  rho_ = proto.rmsprop_conf().rho();
+  weight_decay_ = proto.weight_decay();
 }
 
 void RMSPropUpdater::Update(int step, Param* param, float grad_scale){
@@ -148,7 +151,6 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale){
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  if(step==0) history=0;
   history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale);
   float lr=GetLearningRate(step)*param->learning_rate_multiplier();
   float wd=weight_decay_*param->weight_decay_multiplier();
@@ -158,7 +160,7 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale){
   data-=lr*grad/(F<op::sqrtop>(history,delta_));
 }
 
-/***********************AdaDelta******************************
+***********************AdaDelta******************************
 void AdaDeltaUpdater::Init(const UpdaterProto& proto){
   Updater::Init(proto);
   delta_=proto.delta();
@@ -188,4 +190,4 @@ void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
 }
 */
 
-} /* singa */
+}  // namespace singa
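
To summarize the fix, here is a simplified sketch of the corrected update
order (plain floats instead of mshadow tensors, so not the actual SINGA code):
scale the raw gradient first, then add the L2 term, then fold the result into
the momentum history. The function name SgdStep and its flat-array signature
are illustrative assumptions.

    // Hypothetical scalar re-statement of SGDUpdater::Update() (momentum > 0
    // branch) after this commit.
    void SgdStep(float* data, float* grad, float* history, int n,
                 float lr, float momentum, float wd, float grad_scale) {
      for (int i = 0; i < n; ++i) {
        float g = grad[i] * grad_scale;   // 1. scale the gradient (the bug fix)
        if (wd > 0) g += wd * data[i];    // 2. L2 regularization, after scaling
        history[i] = history[i] * momentum - lr * g;  // 3. momentum history
        data[i] += history[i];            // 4. apply the update
      }
    }
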
