SINGA-145 New SGD based optimization Updaters: AdaDelta, Adam, AdamMax

New Updaters: AdaDelta, Adam, AdamMax. To implement AdamMax, two new element-wise operators for Tensor, op::abs and op::max, are added in cxxnet_op.h.
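For reference, here is a minimal standalone C++ sketch of the per-element update rule that the new AdamMaxUpdater::Update applies in the diff below. The scalar loop with std::fabs and std::max stands in for the element-wise op::abs and op::max tensor operators; the function and variable names are illustrative only and are not part of the patch.

// Illustrative sketch only (not SINGA code): scalar form of the AdamMax update.
// history accumulates the first moment; update tracks the exponentially
// decayed infinity norm of the gradient, mirroring AdamMaxUpdater::Update.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void AdamMaxStep(std::vector<float>* data, const std::vector<float>& grad,
                 std::vector<float>* history, std::vector<float>* update,
                 float lr, float beta1, float beta2, float delta) {
  for (size_t i = 0; i < data->size(); ++i) {
    // first-moment estimate
    (*history)[i] = beta1 * (*history)[i] + (1 - beta1) * grad[i];
    // update = max(beta2 * update, |grad|) + delta, as in the committed code
    (*update)[i] = std::max(beta2 * (*update)[i], std::fabs(grad[i])) + delta;
    (*data)[i] -= lr * (*history)[i] / (*update)[i];
  }
}

int main() {
  std::vector<float> data{1.0f, -2.0f}, grad{0.5f, 0.1f};
  std::vector<float> history(2, 0.0f), update(2, 0.0f);
  AdamMaxStep(&data, grad, &history, &update, 0.002f, 0.9f, 0.999f, 1e-8f);
  std::printf("%f %f\n", data[0], data[1]);
  return 0;
}

Like the committed updaters, the sketch uses the raw moment estimates directly, without the bias-correction terms of the original Adam/AdaMax formulation.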
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/e32e70cc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/e32e70cc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/e32e70cc

Branch: refs/heads/master
Commit: e32e70ccdd16582c633f12e3e6702636139a6078
Parents: 8d4953a
Author: ijingo <[email protected]>
Authored: Fri Feb 19 16:10:21 2016 +0800
Committer: jinyangturbo <[email protected]>
Committed: Wed Feb 24 01:21:23 2016 -0800

----------------------------------------------------------------------
 examples/cifar10/job.conf     | 57 +++++++++++++++++-------
 include/mshadow/cxxnet_op.h   | 11 +++++
 include/singa/utils/param.h   |  3 +-
 include/singa/utils/updater.h | 29 +++++++++++--
 src/driver.cc                 |  4 ++
 src/proto/job.proto           | 23 ++++++++++
 src/utils/param.cc            |  1 +
 src/utils/updater.cc          | 88 ++++++++++++++++++++++++++++--------
 8 files changed, 178 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index d20b452..22b2bb2 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -1,7 +1,7 @@
 name: "cifar10-convnet"
-train_steps: 1000
+train_steps: 5000
 test_steps: 100
-test_freq: 200
+test_freq: 500
 #validate_steps: 100
 #validate_freq: 300
 disp_freq: 50
@@ -9,20 +9,47 @@ disp_freq: 50
 train_one_batch {
   alg: kBP
 }
+#updater{
+#  type: kSGD
+#  weight_decay:0.004
+#  momentum:0.9
+#  learning_rate {
+#    type: kFixedStep
+#    fixedstep_conf:{
+#      step:0
+#      step:60000
+#      step:65000
+#      step_lr:0.001
+#      step_lr:0.0001
+#      step_lr:0.00001
+#    }
+#  }
+#}
+#updater{
+#  type: kAdaDelta
+#  weight_decay:0.004
+#  delta: 0.000001
+#  learning_rate {
+#    type: kFixed
+#    base_lr:1
+#  }
+#}
+#updater{
+#  type: kAdamMax
+#  weight_decay:0.004
+#  delta: 0.00000001
+#  learning_rate {
+#    type: kFixed
+#    base_lr:0.0001
+#  }
+#}
 updater{
-  type: kSGD
+  type: kAdamMax
   weight_decay:0.004
-  momentum:0.9
+  delta: 0.00000001
   learning_rate {
-    type: kFixedStep
-    fixedstep_conf:{
-      step:0
-      step:60000
-      step:65000
-      step_lr:0.001
-      step_lr:0.0001
-      step_lr:0.00001
-    }
+    type: kFixed
+    base_lr:0.002
   }
 }
 neuralnet {
@@ -273,7 +300,7 @@ neuralnet {
 cluster {
   nworker_groups: 1
   nserver_groups: 1
-  nworkers_per_group: 1
-  nworkers_per_procs: 1
+  nworkers_per_group: 4
+  nworkers_per_procs: 4
   workspace: "examples/cifar10"
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/include/mshadow/cxxnet_op.h
----------------------------------------------------------------------
diff --git a/include/mshadow/cxxnet_op.h b/include/mshadow/cxxnet_op.h
index 930caf2..1422070 100644
--- a/include/mshadow/cxxnet_op.h
+++ b/include/mshadow/cxxnet_op.h
@@ -86,6 +86,12 @@ namespace mshadow {
         }
     };

+    struct abs{
+        MSHADOW_XINLINE static real_t Map(real_t a) {
+            return a < 0 ? -a : a;
+        }
+    };
+
 }; //namespace op

 }; //namespace mshadow
@@ -110,6 +116,11 @@ namespace mshadow {
                 return sqrt(a+b);
             }
         };
+        struct max {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a > b ? a : b;
+            }
+        };
     }; // namespace op
 }; // namespace mshadow

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/include/singa/utils/param.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/param.h b/include/singa/utils/param.h
index 9930710..fcaaeb7 100644
--- a/include/singa/utils/param.h
+++ b/include/singa/utils/param.h
@@ -219,6 +219,7 @@ class Param {
   inline float* mutable_cpu_data() { return data_.mutable_cpu_data(); }
   inline float* mutable_cpu_grad() { return grad_.mutable_cpu_data(); }
   inline float* mutable_cpu_history() { return history_.mutable_cpu_data(); }
+  inline float* mutable_cpu_update() { return update_.mutable_cpu_data(); }
   /**
    * @return slice start ID
    */
@@ -355,7 +356,7 @@ class Param {
   std::vector<bool> pending_update_;
   int num_pending_requests_ = 0;
   // data, gradient, history gradient of this parameter
-  Blob<float> data_, grad_, history_;
+  Blob<float> data_, grad_, history_, update_;
   ParamProto proto_;
 };

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/include/singa/utils/updater.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/updater.h b/include/singa/utils/updater.h
index 7fec78c..b14f72b 100644
--- a/include/singa/utils/updater.h
+++ b/include/singa/utils/updater.h
@@ -125,19 +125,40 @@ class RMSPropUpdater : public Updater {

  protected:
   float rho_;
+  float delta_;
 };

-/*
 class AdaDeltaUpdater : public Updater {
  public:
-  virtual void Update(int step, Param* param, float grad_scale);
+  void Init(const UpdaterProto &proto) override;
+  void Update(int step, Param* param, float grad_scale) override;

  protected:
   float rho_;
   float delta_;
-  float weight_decay_;
 };
-*/
+
+class AdamUpdater : public Updater {
+ public:
+  void Init(const UpdaterProto &proto) override;
+  void Update(int step, Param* param, float grad_scale) override;
+
+ protected:
+  float beta1_;
+  float beta2_;
+  float delta_;
+};
+
+class AdamMaxUpdater : public Updater {
+ public:
+  void Init(const UpdaterProto &proto) override;
+  void Update(int step, Param* param, float grad_scale) override;
+
+ protected:
+  float beta1_;
+  float beta2_;
+  float delta_;
+};

 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index 1e4929f..6163865 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -128,6 +128,10 @@ void Driver::Init(int argc, char **argv) {
   RegisterUpdater<AdaGradUpdater>(kAdaGrad);
   RegisterUpdater<NesterovUpdater>(kNesterov);
   RegisterUpdater<RMSPropUpdater>(kRMSProp);
+  RegisterUpdater<AdaDeltaUpdater>(kAdaDelta);
+  RegisterUpdater<AdamUpdater>(kAdam);
+  RegisterUpdater<AdamMaxUpdater>(kAdamMax);
+
   RegisterUpdater<SGDUpdater>(kSGD);

   // register learning rate change methods

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index da52ea9..7bc0ea3 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -138,6 +138,12 @@ message UpdaterProto {
   // configuration for RMSProp algorithm
   optional RMSPropProto rmsprop_conf = 3;

+  // configuration for AdaDelta algorithm
+  optional AdaDeltaProto adadelta_conf = 4;
+  // configuration for Adam algorithm
+  optional AdamProto adam_conf = 5;
+  // configuration for AdamMax algorithm
+  optional AdamMaxProto adammax_conf = 6;

   // learning rate generator
   optional LRGenProto learning_rate = 11;
@@ -561,6 +567,17 @@ message RMSPropProto {
   // history=history*rho_+(1-rho_)*(grad*grad_scale);
   required float rho = 1;
 }
+message AdaDeltaProto {
+  required float rho = 1 [default = 0.9];
+}
+message AdamProto {
+  required float beta1 = 1 [default = 0.9];
+  required float beta2 = 2 [default = 0.999];
+}
+message AdamMaxProto {
+  required float beta1 = 1 [default = 0.9];
+  required float beta2 = 2 [default = 0.999];
+}

 message FixedStepProto {
   repeated int32 step = 28;
@@ -713,6 +730,12 @@ enum UpdaterType {
   kRMSProp = 3;
   // Nesterov first optimal gradient method
   kNesterov = 4;
+  // AdaDelta
+  kAdaDelta = 5;
+  // Adam
+  kAdam = 6;
+  // AdamMax
+  kAdamMax = 7;
   // For user defined updater
   kUserUpdater = 105;
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 95396bc..158c777 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -154,6 +154,7 @@ void Param::Setup(const vector<int>& shape) {
   data_.Reshape(shape);
   grad_.Reshape(shape);
   history_.Reshape(shape);
+  update_.Reshape(shape);
 }

 void Param::InitValues() {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index 1b3e26c..3f45d9e 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -181,6 +181,7 @@ void AdaGradUpdater::Update(int step, Param* param, float grad_scale) {
 void RMSPropUpdater::Init(const UpdaterProto& proto) {
   Updater::Init(proto);
   rho_ = proto.rmsprop_conf().rho();
+  delta_ = proto.delta();
 }

 void RMSPropUpdater::Update(int step, Param* param, float grad_scale) {
@@ -198,14 +199,13 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale) {
   if (wd > 0)  // L2 regularization, should be done after timing grad_scale
     grad += data * wd;
   history = history * rho_ + (1 - rho_) * F<square>(grad);
-  data -= lr * grad / (F<sqrtop>(history, proto_.delta()));
+  data -= lr * grad / F<sqrtop>(history, delta_);
 }
-/***********************AdaDelta******************************
+/***********************AdaDelta******************************/
 void AdaDeltaUpdater::Init(const UpdaterProto& proto){
   Updater::Init(proto);
-  delta_=proto.delta();
-  rho_=proto.rho();
-  weight_decay_=proto.weight_decay();
+  delta_ = proto.delta();
+  rho_=proto.adadelta_conf().rho();
 }

 void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
@@ -215,19 +215,71 @@ void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
   Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
   TensorContainer<cpu, 1> tmp(s);
-  float wd=weight_decay_*param->wd_scale();
-  if(wd>0){ // L2 regularization
-    grad+=data*wd;
-  }
-  if(step==0){
-    history=0;
-    update=0;
-  }
-  history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale);
-  tmp=grad*F<op::sqrtop>(update, delta_)/F<op::sqrtop>(history, delta_);
-  update=rho_*update+(1-rho_)*F<op::square>(tmp);
-  data-=tmp;
+  float wd = weight_decay_*param->wd_scale();
+  float lr = lr_gen_->Get(step) * param->lr_scale();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0)  // L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
+  history = history * rho_ + (1 - rho_) * F<op::square>(grad);
+  tmp = grad * F<op::sqrtop>(update, delta_) / F<op::sqrtop>(history, delta_);
+  update = rho_ * update + (1 - rho_) * F<op::square>(tmp);
+  if (lr != 1.f)
+    data -= lr * tmp;
+  else
+    data -= tmp;
+}
+
+/***********************Adam******************************/
+void AdamUpdater::Init(const UpdaterProto &proto) {
+  Updater::Init(proto);
+  beta1_=proto.adam_conf().beta1();
+  beta2_=proto.adam_conf().beta2();
+  delta_ = proto.delta();
+}
+
+void AdamUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s=Shape1(param->size());
+  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
+  float wd = weight_decay_*param->wd_scale();
+  float lr = lr_gen_->Get(step) * param->lr_scale();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0)  // L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
+  history = history * beta1_ + (1 - beta1_) * grad;
+  update = update * beta2_ + (1 - beta2_) * F<op::square>(grad);
+  data -= lr * history / F<op::sqrtop>(update, delta_);
+}
+
+/***********************AdamMax******************************/
+void AdamMaxUpdater::Init(const UpdaterProto &proto) {
+  Updater::Init(proto);
+  beta1_=proto.adammax_conf().beta1();
+  beta2_=proto.adammax_conf().beta2();
+  delta_=proto.delta();
+}
+
+void AdamMaxUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s=Shape1(param->size());
+  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
+  float wd = weight_decay_*param->wd_scale();
+  float lr = lr_gen_->Get(step) * param->lr_scale();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0)  // L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
+  history = history * beta1_ + (1 - beta1_) * grad;
+  update = update * beta2_;
+  grad = F<op::abs>(grad);
+  update = F<op::max>(update, grad) + delta_;
+  data -= lr * history / update;
 }
-*/

 }  // namespace singa
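For illustration, here is a hypothetical job.conf updater block that exercises the new adam_conf fields introduced in job.proto above. The beta values shown simply repeat the proto defaults and the learning rate is arbitrary; the committed job.conf instead selects kAdamMax and leaves its adammax_conf unset.

updater {
  type: kAdam
  weight_decay: 0.004
  delta: 0.00000001
  adam_conf {
    beta1: 0.9
    beta2: 0.999
  }
  learning_rate {
    type: kFixed
    base_lr: 0.001
  }
}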
