SINGA-192 Implement optimization algorithms for v1

Change the interface of Apply() in the Optimizer class: Apply() now receives a ``const Tensor& grad'' argument rather than ``Tensor* grad''; that is to say, the method no longer has side effects on the grad tensor.
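In short, sub-classes now take the gradient by const reference and must do any scaling on a private copy. Below is a minimal sketch of the new contract, assuming only the header touched by this commit; `PlainSGD` is a hypothetical sub-class used purely for illustration (the real implementations are in the .cc diffs further down):

```cpp
// Sketch only: illustrates the signature change, not part of the commit.
#include "singa/model/optimizer.h"

namespace singa {

// Old pure-virtual signature (removed by this commit):
//   virtual void Apply(int step, float lr, const string& name, Tensor* grad,
//                      Tensor* value) = 0;
// New pure-virtual signature (grad is read-only):
//   virtual void Apply(int step, float lr, const string& name,
//                      const Tensor& grad, Tensor* value) = 0;

// Hypothetical sub-class showing how the new contract is honoured:
// clone the gradient, scale the copy, and update only the parameter tensor.
class PlainSGD : public Optimizer {
 public:
  void Apply(int step, float lr, const string& name, const Tensor& grad,
             Tensor* value) override {
    Tensor tmp = grad.Clone();  // work on a copy, leave the caller's grad untouched
    tmp *= lr;                  // same scaling SGD::Apply now performs on its copy
    (*value) -= tmp;            // the parameter tensor is the only side effect
  }
};

}  // namespace singa
```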
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/0cd96639
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/0cd96639
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/0cd96639

Branch: refs/heads/dev
Commit: 0cd96639e232b4a83a2b33e51b05132085b7d31b
Parents: 272100a
Author: WANG Ji <[email protected]>
Authored: Mon Jun 13 11:23:30 2016 +0800
Committer: Wei Wang <[email protected]>
Committed: Mon Jun 13 20:28:52 2016 +0800

----------------------------------------------------------------------
 include/singa/model/optimizer.h  | 22 ++++++++++++++++------
 src/model/optimizer/adagrad.cc   |  9 +++++----
 src/model/optimizer/nesterov.cc  | 10 ++++++----
 src/model/optimizer/optimizer.cc |  2 +-
 src/model/optimizer/rmsprop.cc   |  9 +++++----
 src/model/optimizer/sgd.cc       |  9 +++++----
 src/proto/model.proto            |  2 +-
 test/singa/test_adagrad.cc       |  8 ++++----
 test/singa/test_nesterov.cc      |  8 ++++----
 test/singa/test_rmsprop.cc       |  8 ++++----
 test/singa/test_sgd.cc           | 16 ++++++++--------
 11 files changed, 59 insertions(+), 44 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/include/singa/model/optimizer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/optimizer.h b/include/singa/model/optimizer.h
index 7da1db8..f912668 100644
--- a/include/singa/model/optimizer.h
+++ b/include/singa/model/optimizer.h
@@ -62,7 +62,7 @@ class Optimizer {
   /// conducted. It assumes all these operations are done either by users or
   /// by Apply(int, const string&, Tensor*, Tensor*).
   /// All sub-classes should override this function.
-  virtual void Apply(int step, float lr, const string& name, Tensor* grad,
+  virtual void Apply(int step, float lr, const string& name, const Tensor& grad,
                      Tensor* value) = 0;
 
   /// Apply the updating algorithm.
@@ -76,6 +76,9 @@ class Optimizer {
   void SetLearningRateGenerator(function<float(int)> func) {
     learning_rate_generator_ = func;
   }
+  /// Since Optimizer base layer has pure virtual function, a virtual
+  /// deconstructor is needed.
+  virtual ~Optimizer() = default;
 
  protected:
   function<float(int)> learning_rate_generator_;
@@ -109,6 +112,7 @@ class Constraint {
   /// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
   void Apply(int step, const vector<Tensor*>& grads,
              const vector<Tensor*>& values);
+
  private:
   /// currently only support "L2" norm constraint, i.e., the norm should be less
   /// than the configured threshold_, otherwise, the parameters would be clipped
@@ -142,6 +146,7 @@ class Regularizer {
   /// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
   void Apply(int step, const vector<Tensor*>& grads,
              const vector<Tensor*>& values);
+
  private:
   /// currently only support "L2" regularizer. type_ is case insensitive.
   /// TODO(wangwei) add more regularizer, e.g., L1.
@@ -154,7 +159,7 @@ class SGD : Optimizer {
  public:
   void Setup(const OptimizerConf& conf);
   /// Apply the updating algorithm.
-  void Apply(int step, float lr, const string& name, Tensor* grad,
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
              Tensor* value) override;
 
   /// The argument function returns the momentum value given the current running
@@ -162,6 +167,7 @@ class SGD : Optimizer {
   void SetMomentumGenerator(std::function<float(int)> func) {
     momentum_generator_ = func;
   }
+  virtual ~SGD() = default;
 
  private:
   std::unordered_map<string, Tensor> history_gradient_;
@@ -173,7 +179,7 @@ class Nesterov : Optimizer {
  public:
   void Setup(const OptimizerConf& conf);
   /// Apply the updating algorithm.
-  void Apply(int step, float lr, const string& name, Tensor* grad,
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
              Tensor* value) override;
 
   /// The argument function returns the momentum value given the current running
@@ -181,6 +187,7 @@ class Nesterov : Optimizer {
   void SetMomentumGenerator(std::function<float(int)> func) {
     momentum_generator_ = func;
   }
+  virtual ~Nesterov() = default;
 
  private:
   std::unordered_map<string, Tensor> history_gradient_;
@@ -192,8 +199,9 @@ class Adagrad : Optimizer {
  public:
   void Setup(const OptimizerConf& conf);
   /// Apply the updating algorithm.
-  void Apply(int step, float lr, const string& name, Tensor* grad,
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
              Tensor* value) override;
+  virtual ~Adagrad() = default;
 
  private:
   std::unordered_map<string, Tensor> history_gradient_;
@@ -204,8 +212,9 @@ class RMSProp : Optimizer {
  public:
   void Setup(const OptimizerConf& conf);
   /// Apply the updating algorithm.
-  void Apply(int step, float lr, const string& name, Tensor* grad,
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
              Tensor* value) override;
+  virtual ~RMSProp() = default;
 
  private:
   std::unordered_map<string, Tensor> history_gradient_;
@@ -236,7 +245,8 @@ class LocalAllReduce : public Optimizer{
   /// 2. Partition parameters onto worker devices. For example, model parameter
   /// set is {A, B, C}, nb_workers = 3, then worker 0/1/2 would be in charge of
   /// updating A/B/C respectively. A gradient Tensor for A/B/C would be created
-  /// on device 0/1/2, dentoed as GA/GB/GC. 0/1/2 would call the internal opt to register the specs
+  /// on device 0/1/2, dentoed as GA/GB/GC. 0/1/2 would call the internal opt to
+  /// register the specs
   /// for A/B/C.
   void Register(const vector<string>& names, const vector<Tensor>& values,


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/adagrad.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/adagrad.cc b/src/model/optimizer/adagrad.cc
index 0b8ec88..6910320 100644
--- a/src/model/optimizer/adagrad.cc
+++ b/src/model/optimizer/adagrad.cc
@@ -23,14 +23,15 @@ namespace singa {
 
 void Adagrad::Setup(const OptimizerConf& conf) { delta_ = conf.delta(); }
 
-void Adagrad::Apply(int step, float lr, const string& name, Tensor* grad,
+void Adagrad::Apply(int step, float lr, const string& name, const Tensor& grad,
                     Tensor* value) {
   if (history_gradient_.find(name) == history_gradient_.end())
     history_gradient_[name].ResetLike(*value);
   Tensor& history = history_gradient_[name];
-  history += Square(*grad);
-  (*grad) /= Sqrt(history + delta_);
-  Axpy(-lr, *grad, value);
+  Tensor tmp = grad.Clone();
+  history += Square(tmp);
+  tmp /= Sqrt(history + delta_);
+  Axpy(-lr, tmp, value);
 }
 }  // namespace singa
 #endif  // SRC_MODEL_OPTIMIZER_ADAGRAD_H_


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/nesterov.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/nesterov.cc b/src/model/optimizer/nesterov.cc
index 95c5531..2f16c06 100644
--- a/src/model/optimizer/nesterov.cc
+++ b/src/model/optimizer/nesterov.cc
@@ -26,16 +26,18 @@ void Nesterov::Setup(const OptimizerConf& conf) {
   SetMomentumGenerator([m](int step) { return m; });
 }
 
-void Nesterov::Apply(int step, float lr, const string& name, Tensor* grad,
+void Nesterov::Apply(int step, float lr, const string& name, const Tensor& grad,
                      Tensor* value) {
   if (momentum_generator_) {
     float mom = momentum_generator_(step);
     if (history_gradient_.find(name) == history_gradient_.end())
       history_gradient_[name].ResetLike(*value);
     Tensor& history = history_gradient_[name];
-    Tensor tmp = history;
-    history = history * mom + (*grad) * lr;
-    tmp = history * (1 + mom) - tmp * mom;
+    Tensor tmp = history.Clone();
+    history *= mom;
+    Axpy(lr, grad, &history);
+    tmp *= -mom;
+    Axpy(1 + mom, history, &tmp);
     (*value) -= tmp;
   }
 }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/optimizer.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/optimizer.cc b/src/model/optimizer/optimizer.cc
index 92b6b3d..c9e7a72 100644
--- a/src/model/optimizer/optimizer.cc
+++ b/src/model/optimizer/optimizer.cc
@@ -54,7 +54,7 @@ void Optimizer::Apply(int step, const string& name, Tensor* grad,
   float lr = learning_rate_generator_(step);
   if (learning_rate_multplier_.find(name) != learning_rate_multplier_.end())
     lr *= learning_rate_multplier_.at(name);
-  Apply(step, lr, name, grad, param);
+  Apply(step, lr, name, *grad, param);
 }
 
 void Regularizer::Setup(const RegularizerConf& conf) {


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/rmsprop.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/rmsprop.cc b/src/model/optimizer/rmsprop.cc
index 7b9934c..9f13e11 100644
--- a/src/model/optimizer/rmsprop.cc
+++ b/src/model/optimizer/rmsprop.cc
@@ -26,16 +26,17 @@ void RMSProp::Setup(const OptimizerConf& conf) {
   rho_ = conf.rho();
 }
 
-void RMSProp::Apply(int step, float lr, const string& name, Tensor* grad,
+void RMSProp::Apply(int step, float lr, const string& name, const Tensor& grad,
                     Tensor* value) {
   if (history_gradient_.find(name) == history_gradient_.end()) {
     history_gradient_[name].ResetLike(*value);
   }
   Tensor& history = history_gradient_[name];
   history *= rho_;
-  Axpy(1 - rho_, Square(*grad), &history);
-  (*grad) /= Sqrt(history + delta_);
-  Axpy(-lr, *grad, value);
+  Tensor tmp = grad.Clone();
+  Axpy(1 - rho_, Square(tmp), &history);
+  tmp /= Sqrt(history + delta_);
+  Axpy(-lr, tmp, value);
 }
 }  // namespace singa
 #endif  // SRC_MODEL_OPTIMIZER_ADAGRAD_H_


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/model/optimizer/sgd.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/sgd.cc b/src/model/optimizer/sgd.cc
index 49c17c9..ae8b7ac 100644
--- a/src/model/optimizer/sgd.cc
+++ b/src/model/optimizer/sgd.cc
@@ -28,9 +28,10 @@ void SGD::Setup(const OptimizerConf& conf) {
   }
 }
 
-void SGD::Apply(int step, float lr, const string& name, Tensor* grad,
+void SGD::Apply(int step, float lr, const string& name, const Tensor& grad,
                 Tensor* value) {
-  (*grad) *= lr;
+  Tensor tmp = grad.Clone();
+  tmp *= lr;
   if (momentum_generator_) {
     float mom = momentum_generator_(step);
     if (mom != 0) {
@@ -38,12 +39,12 @@ void SGD::Apply(int step, float lr, const string& name, Tensor* grad,
         history_gradient_[name].ResetLike(*value);
       Tensor& history = history_gradient_[name];
       history *= mom;
-      history += *grad;
+      history += tmp;
       (*value) -= history;
       return;
     }
   }
-  (*value) -= *grad;
+  (*value) -= tmp;
 }
 }  // namespace singa
 #endif  // SRC_MODEL_OPTIMIZER_SGD_H_


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 590fdd6..c49f767 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -78,7 +78,7 @@ message OptimizerConf {
   optional string type = 1 [default = "sgd"];
 
   // used by RMSprop and Adadelta
-  optional float rho = 2 [default = 0.001];
+  optional float rho = 2 [default = 0.95];
 
   // used by Adam and AdamMax
   optional float beta_1 = 3 [default = 0.9];


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/test/singa/test_adagrad.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_adagrad.cc b/test/singa/test_adagrad.cc
index 80240b1..ef930d5 100644
--- a/test/singa/test_adagrad.cc
+++ b/test/singa/test_adagrad.cc
@@ -36,7 +36,7 @@ TEST(Adagrad, ApplyCPU) {
 
   singa::OptimizerConf conf;
   adagrad.Setup(conf);
-  adagrad.Apply(0, lr, "xx", &grad, &value);
+  adagrad.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -47,7 +47,7 @@ TEST(Adagrad, ApplyCPU) {
                 1e-5);
 
   grad.CopyDataFromHostPtr(g, 4);
-  adagrad.Apply(1, lr, "xx", &grad, &value);
+  adagrad.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
@@ -71,7 +71,7 @@ TEST(Adagrad, ApplyCUDA) {
 
   singa::OptimizerConf conf;
   adagrad.Setup(conf);
-  adagrad.Apply(0, lr, "xx", &grad, &value);
+  adagrad.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -83,7 +83,7 @@ TEST(Adagrad, ApplyCUDA) {
                 1e-5);
 
   grad.CopyDataFromHostPtr(g, 4);
-  adagrad.Apply(1, lr, "xx", &grad, &value);
+  adagrad.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/test/singa/test_nesterov.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_nesterov.cc b/test/singa/test_nesterov.cc
index e7083c8..a9b264c 100644
--- a/test/singa/test_nesterov.cc
+++ b/test/singa/test_nesterov.cc
@@ -35,7 +35,7 @@ TEST(Nesterov, ApplyCPU) {
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 
-  nesterov.Apply(0, lr, "xx", &grad, &value);
+  nesterov.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -47,7 +47,7 @@ TEST(Nesterov, ApplyCPU) {
   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
 
   grad.CopyDataFromHostPtr(g, 4);
-  nesterov.Apply(1, lr, "xx", &grad, &value);
+  nesterov.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; ++i) {
@@ -73,7 +73,7 @@ TEST(Nesterov, ApplyCUDA) {
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 
-  nesterov.Apply(0, lr, "xx", &grad, &value);
+  nesterov.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -86,7 +86,7 @@ TEST(Nesterov, ApplyCUDA) {
   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
 
   grad.CopyDataFromHostPtr(g, 4);
-  nesterov.Apply(1, lr, "xx", &grad, &value);
+  nesterov.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/test/singa/test_rmsprop.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_rmsprop.cc b/test/singa/test_rmsprop.cc
index 8104f50..ddfdefe 100644
--- a/test/singa/test_rmsprop.cc
+++ b/test/singa/test_rmsprop.cc
@@ -40,7 +40,7 @@ TEST(RMSProp, ApplyCPU) {
   grad.CopyDataFromHostPtr(g, 4);
 
   rmsprop.Setup(conf);
-  rmsprop.Apply(0, lr, "xx", &grad, &value);
+  rmsprop.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -51,7 +51,7 @@ TEST(RMSProp, ApplyCPU) {
                 1e-5);
 
   grad.CopyDataFromHostPtr(g, 4);
-  rmsprop.Apply(1, lr, "xx", &grad, &value);
+  rmsprop.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; ++i)
@@ -80,7 +80,7 @@ TEST(RMSProp, ApplyCUDA) {
   grad.CopyDataFromHostPtr(g, 4);
 
   rmsprop.Setup(conf);
-  rmsprop.Apply(0, lr, "xx", &grad, &value);
+  rmsprop.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -92,7 +92,7 @@ TEST(RMSProp, ApplyCUDA) {
                 1e-5);
 
   grad.CopyDataFromHostPtr(g, 4);
-  rmsprop.Apply(1, lr, "xx", &grad, &value);
+  rmsprop.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0cd96639/test/singa/test_sgd.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc
index c0b6e2b..5417b04 100644
--- a/test/singa/test_sgd.cc
+++ b/test/singa/test_sgd.cc
@@ -33,7 +33,7 @@ TEST(SGD, ApplyWithoutMomentum) {
   grad.CopyDataFromHostPtr(g, 4);
 
   float lr = 0.1f;
-  sgd.Apply(0, lr, "xx", &grad, &value);
+  sgd.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -44,7 +44,7 @@ TEST(SGD, ApplyWithoutMomentum) {
 
   lr /= 2;
   grad.CopyDataFromHostPtr(g, 4);
-  sgd.Apply(1, lr, "xx", &grad, &value);
+  sgd.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; i++) {
@@ -65,7 +65,7 @@ TEST(SGD, ApplyWithMomentum) {
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 
-  sgd.Apply(0, lr, "xx", &grad, &value);
+  sgd.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   const float* newv1 = v1.data<const float*>();
@@ -74,7 +74,7 @@ TEST(SGD, ApplyWithMomentum) {
   }
 
   grad.CopyDataFromHostPtr(g, 4);
-  sgd.Apply(1, lr, "xx", &grad, &value);
+  sgd.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   const float* newv2 = v2.data<const float*>();
   for (int i = 0; i < 4; i++) {
@@ -94,7 +94,7 @@ TEST(SGD, ApplyWithoutMomentumCuda) {
   grad.CopyDataFromHostPtr(g, 4);
 
   float lr = 0.1f;
-  sgd.Apply(0, lr, "xx", &grad, &value);
+  sgd.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -106,7 +106,7 @@ TEST(SGD, ApplyWithoutMomentumCuda) {
 
   lr /= 2;
   grad.CopyDataFromHostPtr(g, 4);
-  sgd.Apply(1, lr, "xx", &grad, &value);
+  sgd.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();
@@ -129,7 +129,7 @@ TEST(SGD, ApplyWithMomentumCuda) {
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 
-  sgd.Apply(0, lr, "xx", &grad, &value);
+  sgd.Apply(0, lr, "xx", grad, &value);
 
   singa::Tensor v1 = value.Clone();
   v1.ToHost();
@@ -139,7 +139,7 @@ TEST(SGD, ApplyWithMomentumCuda) {
   }
 
   grad.CopyDataFromHostPtr(g, 4);
-  sgd.Apply(1, lr, "xx", &grad, &value);
+  sgd.Apply(1, lr, "xx", grad, &value);
   singa::Tensor v2 = value.Clone();
   v2.ToHost();
   const float* newv2 = v2.data<const float*>();
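Correspondingly, callers now pass the gradient by const reference, as the updated tests above show. A usage sketch along the lines of test_sgd.cc; the Tensor(Shape) constructor and the header paths are assumptions, since the unchanged parts of the tests are not visible in this diff:

```cpp
#include <cstdio>

#include "singa/core/tensor.h"      // assumed header path for singa::Tensor
#include "singa/model/optimizer.h"

int main() {
  const float v[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  const float g[4] = {0.01f, 0.02f, 0.03f, 0.04f};

  // Tensor construction is assumed; only CopyDataFromHostPtr/Clone/data<>()
  // appear in the diff above.
  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
  value.CopyDataFromHostPtr(v, 4);
  grad.CopyDataFromHostPtr(g, 4);

  singa::SGD sgd;
  singa::OptimizerConf conf;
  sgd.Setup(conf);

  float lr = 0.1f;
  // grad is taken by const reference and is no longer scaled in place,
  // so it can simply be refilled and reused for the next step.
  sgd.Apply(0, lr, "xx", grad, &value);

  grad.CopyDataFromHostPtr(g, 4);
  sgd.Apply(1, lr / 2, "xx", grad, &value);

  // Read back the updated parameter values.
  singa::Tensor out = value.Clone();
  const float* newv = out.data<const float*>();
  for (int i = 0; i < 4; i++)
    std::printf("param[%d] = %f\n", i, newv[i]);
  return 0;
}
```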
