Repository: incubator-singa Updated Branches: refs/heads/dev b167dfa5b -> 272100a3f
SINGA-192 Implement optimization algorithms for v1 implement optimization algorithms for Singa v1 including nesterov, adagrad, rmsprop. Add unit test cases for these algorithms. However, only nesterov passed the test case, adagrad and rmsprop need Sqrt() operation for tensor which has not been implemented yet. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/178db014 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/178db014 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/178db014 Branch: refs/heads/dev Commit: 178db0144208fd5d5e7de58a575d0ea6300fdfdf Parents: 01aaf49 Author: WANG Ji <[email protected]> Authored: Sat Jun 11 15:00:18 2016 +0800 Committer: WANG Ji <[email protected]> Committed: Sat Jun 11 16:38:27 2016 +0800 ---------------------------------------------------------------------- include/singa/model/optimizer.h | 43 +++++++++++++++ src/model/optimizer/adagrad.cc | 35 ++++++++++++ src/model/optimizer/nesterov.cc | 43 +++++++++++++++ src/model/optimizer/rmsprop.cc | 38 +++++++++++++ src/proto/model.proto | 3 + test/singa/test_adagrad.cc | 92 +++++++++++++++++++++++++++++++ test/singa/test_nesterov.cc | 101 ++++++++++++++++++++++++++++++++++ test/singa/test_rmsprop.cc | 103 +++++++++++++++++++++++++++++++++++ 8 files changed, 458 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/include/singa/model/optimizer.h ---------------------------------------------------------------------- diff --git a/include/singa/model/optimizer.h b/include/singa/model/optimizer.h index 7ca9f53..7da1db8 100644 --- a/include/singa/model/optimizer.h +++ b/include/singa/model/optimizer.h @@ -168,6 +168,49 @@ class SGD : Optimizer { std::function<float(int)> momentum_generator_; }; +// 
=============Nesterov====================================================== +class Nesterov : Optimizer { + public: + void Setup(const OptimizerConf& conf); + /// Apply the updating algorithm. + void Apply(int step, float lr, const string& name, Tensor* grad, + Tensor* value) override; + + /// The argument function returns the momentum value given the current running + /// step (i.e., iterations/mini-batches). + void SetMomentumGenerator(std::function<float(int)> func) { + momentum_generator_ = func; + } + + private: + std::unordered_map<string, Tensor> history_gradient_; + std::function<float(int)> momentum_generator_; +}; + +// =============Adagrad======================================================= +class Adagrad : Optimizer { + public: + void Setup(const OptimizerConf& conf); + /// Apply the updating algorithm. + void Apply(int step, float lr, const string& name, Tensor* grad, + Tensor* value) override; + + private: + std::unordered_map<string, Tensor> history_gradient_; + float delta_; +}; +// =============RMSProp======================================================= +class RMSProp : Optimizer { + public: + void Setup(const OptimizerConf& conf); + /// Apply the updating algorithm. 
+ void Apply(int step, float lr, const string& name, Tensor* grad, + Tensor* value) override; + + private: + std::unordered_map<string, Tensor> history_gradient_; + float delta_, rho_; +}; // ============LocalAllReduce for single node multiple workers ============== /// Updater for training models on a single node with multiple devices (workers) /// All model parameters are partitioned such that each parameter is updated on http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/src/model/optimizer/adagrad.cc ---------------------------------------------------------------------- diff --git a/src/model/optimizer/adagrad.cc b/src/model/optimizer/adagrad.cc new file mode 100644 index 0000000..8bdb07c --- /dev/null +++ b/src/model/optimizer/adagrad.cc @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef SRC_MODEL_OPTIMIZER_ADAGRAD_H_ +#define SRC_MODEL_OPTIMIZER_ADAGRAD_H_ +#include "singa/model/optimizer.h" +#include <functional> +namespace singa { + +void Adagrad::Setup(const OptimizerConf& conf) { delta_ = conf.delta(); } + +void Adagrad::Apply(int step, float lr, const string& name, Tensor* grad, + Tensor* value) { + if (history_gradient_.find(name) == history_gradient_.end()) + history_gradient_[name].ResetLike(*value); + Tensor& history = history_gradient_[name]; + history += (*grad) * (*grad); + (*value) -= (*grad) * lr / Sqrt(history + delta_); +} +} // namespace singa +#endif // SRC_MODEL_OPTIMIZER_ADAGRAD_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/src/model/optimizer/nesterov.cc ---------------------------------------------------------------------- diff --git a/src/model/optimizer/nesterov.cc b/src/model/optimizer/nesterov.cc new file mode 100644 index 0000000..95c5531 --- /dev/null +++ b/src/model/optimizer/nesterov.cc @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef SRC_MODEL_OPTIMIZER_NESTEROV_H_ +#define SRC_MODEL_OPTIMIZER_NESTEROV_H_ +#include "singa/model/optimizer.h" +#include <functional> +namespace singa { + +void Nesterov::Setup(const OptimizerConf& conf) { + float m = conf.momentum(); + SetMomentumGenerator([m](int step) { return m; }); +} + +void Nesterov::Apply(int step, float lr, const string& name, Tensor* grad, + Tensor* value) { + if (momentum_generator_) { + float mom = momentum_generator_(step); + if (history_gradient_.find(name) == history_gradient_.end()) + history_gradient_[name].ResetLike(*value); + Tensor& history = history_gradient_[name]; + Tensor tmp = history; + history = history * mom + (*grad) * lr; + tmp = history * (1 + mom) - tmp * mom; + (*value) -= tmp; + } +} +} // namespace singa +#endif // SRC_MODEL_OPTIMIZER_NESTEROV_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/src/model/optimizer/rmsprop.cc ---------------------------------------------------------------------- diff --git a/src/model/optimizer/rmsprop.cc b/src/model/optimizer/rmsprop.cc new file mode 100644 index 0000000..cad333c --- /dev/null +++ b/src/model/optimizer/rmsprop.cc @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+#ifndef SRC_MODEL_OPTIMIZER_RMSPROP_H_ +#define SRC_MODEL_OPTIMIZER_RMSPROP_H_ +#include "singa/model/optimizer.h" +#include <functional> +namespace singa { + +void RMSProp::Setup(const OptimizerConf& conf) { + delta_ = conf.delta(); + rho_ = conf.rho(); +} + +void RMSProp::Apply(int step, float lr, const string& name, Tensor* grad, + Tensor* value) { + if (history_gradient_.find(name) == history_gradient_.end()) + history_gradient_[name].ResetLike(*value); + Tensor& history = history_gradient_[name]; + history = history * rho_ + (*grad) * (*grad) * (1 - rho_); + (*value) -= (*grad) * lr / Sqrt(history + delta_); +} +} // namespace singa +#endif // SRC_MODEL_OPTIMIZER_RMSPROP_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/src/proto/model.proto ---------------------------------------------------------------------- diff --git a/src/proto/model.proto b/src/proto/model.proto index d368296..c26aa35 100644 --- a/src/proto/model.proto +++ b/src/proto/model.proto @@ -86,6 +86,12 @@ message OptimizerConf { // used by vanilla sgd and nesterov optional float momentum = 5 [default = 0.9]; + + // delta is used to avoid dividing zero + optional float delta = 6 [default = 0.0000001]; + + // rho is the history decay factor used by rmsprop + optional float rho = 7 [default = 0.95];
The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#include "gtest/gtest.h" +#include "singa/model/optimizer.h" +#include "singa_config.h" +#include <cmath> + +TEST(Adagrad, ApplyCPU) { + singa::Adagrad adagrad; + float lr = 0.1f; + const float v[4] = {0.1, 0.2, 0.3, 0.4}; + const float g[4] = {0.01, 0.02, 0.03, 0.04}; + + singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4}); + value.CopyDataFromHostPtr(v, 4); + grad.CopyDataFromHostPtr(g, 4); + + adagrad.Apply(0, lr, "xx", &grad, &value); + + singa::Tensor v1 = value.Clone(); + const float* newv1 = v1.data<const float*>(); + float history[4]; + for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i]; + for (int i = 0; i < 4; ++i) + EXPECT_FLOAT_EQ(newv1[i], + v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8)); + + grad.CopyDataFromHostPtr(g, 4); + adagrad.Apply(1, lr, "xx", &grad, &value); + singa::Tensor v2 = value.Clone(); + const float* newv2 = v2.data<const float*>(); + for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i]; + + for (int i = 0; i < 4; ++i) + EXPECT_FLOAT_EQ(newv2[i], + newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8)); +} + +#ifdef USE_CUDA +TEST(Adagrad, ApplyCUDA) { + singa::Adagrad adagrad; + float lr = 0.1f; + const float v[4] = {0.1, 0.2, 0.3, 0.4}; + const float g[4] = {0.01, 0.02, 0.03, 0.04}; + + singa::CudaGPU dev; + singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev); + 
value.CopyDataFromHostPtr(v, 4); + grad.CopyDataFromHostPtr(g, 4); + + singa::OptimizerConf conf; + conf.set_delta(1e-8f); + adagrad.Setup(conf); + adagrad.Apply(0, lr, "xx", &grad, &value);
+* +*************************************************************/ + +#include "gtest/gtest.h" +#include "singa/model/optimizer.h" +#include "singa_config.h" + +TEST(Nesterov, ApplyCPU) { + singa::Nesterov nesterov; + float lr = 0.1f; + auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; }; + nesterov.SetMomentumGenerator(func); + const float v[4] = {0.1, 0.2, 0.3, 0.4}; + const float g[4] = {0.01, 0.02, 0.03, 0.04}; + + singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4}); + value.CopyDataFromHostPtr(v, 4); + grad.CopyDataFromHostPtr(g, 4); + + nesterov.Apply(0, lr, "xx", &grad, &value); + + singa::Tensor v1 = value.Clone(); + const float* newv1 = v1.data<const float*>(); + float history[4], tmp[4]; + for (int i = 0; i < 4; ++i) { + history[i] = g[i] * lr; + tmp[i] = history[i] * (1 + func(0)); + } + for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]); + + grad.CopyDataFromHostPtr(g, 4); + nesterov.Apply(1, lr, "xx", &grad, &value); + singa::Tensor v2 = value.Clone(); + const float* newv2 = v2.data<const float*>(); + for (int i = 0; i < 4; ++i) { + tmp[i] = history[i]; + history[i] = history[i] * func(1) + g[i] * lr; + tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1); + } + + for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]); +} + +#ifdef USE_CUDA +TEST(Nesterov, ApplyCUDA) { + singa::Nesterov nesterov; + float lr = 0.1f; + auto func = [](int step) { return step <= 5 ? 
0.5f : 0.9f; }; + nesterov.SetMomentumGenerator(func); + const float v[4] = {0.1, 0.2, 0.3, 0.4}; + const float g[4] = {0.01, 0.02, 0.03, 0.04}; + + singa::CudaGPU dev; + singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev); + value.CopyDataFromHostPtr(v, 4); + grad.CopyDataFromHostPtr(g, 4); + + nesterov.Apply(0, lr, "xx", &grad, &value); + + singa::Tensor v1 = value.Clone(); + v1.ToHost(); + const float* newv1 = v1.data<const float*>(); + float history[4], tmp[4]; + for (int i = 0; i < 4; ++i) { + history[i] = g[i] * lr; + tmp[i] = history[i] * (1 + func(0)); + } + for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]); + + grad.CopyDataFromHostPtr(g, 4); + nesterov.Apply(1, lr, "xx", &grad, &value); + singa::Tensor v2 = value.Clone(); + v2.ToHost(); + const float* newv2 = v2.data<const float*>(); + for (int i = 0; i < 4; ++i) { + tmp[i] = history[i]; + history[i] = history[i] * func(1) + g[i] * lr; + tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1); + } + + for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]); +} +#endif http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/178db014/test/singa/test_rmsprop.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_rmsprop.cc b/test/singa/test_rmsprop.cc new file mode 100644 index 0000000..62101f7 --- /dev/null +++ b/test/singa/test_rmsprop.cc @@ -0,0 +1,103 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#include "gtest/gtest.h" +#include "singa/model/optimizer.h" +#include "singa_config.h" +#include <cmath> + +TEST(RMSProp, ApplyCPU) { + singa::RMSProp rmsprop; + float lr = 0.1f; + float rho = 0.002f; + const float v[4] = {0.1, 0.2, 0.3, 0.4}; + const float g[4] = {0.01, 0.02, 0.03, 0.04}; + + singa::OptimizerConf conf; + conf.set_rho(rho); + + singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4}); + value.CopyDataFromHostPtr(v, 4); + grad.CopyDataFromHostPtr(g, 4); + + rmsprop.Setup(conf); + rmsprop.Apply(0, lr, "xx", &grad, &value); + + singa::Tensor v1 = value.Clone(); + const float* newv1 = v1.data<const float*>(); + float history[4]; + for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho); + for (int i = 0; i < 4; ++i) + EXPECT_FLOAT_EQ(newv1[i], + v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8)); + + grad.CopyDataFromHostPtr(g, 4); + rmsprop.Apply(1, lr, "xx", &grad, &value); + singa::Tensor v2 = value.Clone(); + const float* newv2 = v2.data<const float*>(); + for (int i = 0; i < 4; ++i) + history[i] += history[i] * rho + g[i] * g[i] * (1 - rho); + + for (int i = 0; i < 4; ++i) + EXPECT_FLOAT_EQ(newv2[i], + newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8)); +} + +#ifdef USE_CUDA +TEST(RMSProp, ApplyCUDA) { + singa::RMSProp rmsprop; + float lr = 0.1f; + float rho = 0.002f; + const float v[4] = {0.1, 0.2, 0.3, 0.4}; + const float g[4] = {0.01, 0.02, 0.03, 0.04}; + + singa::OptimizerConf conf; + conf.set_rho(rho); + + singa::CudaGPU dev; + singa::Tensor 
value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev); + value.CopyDataFromHostPtr(v, 4); + grad.CopyDataFromHostPtr(g, 4); + + rmsprop.Apply(0, lr, "xx", &grad, &value); + + singa::Tensor v1 = value.Clone(); + v1.ToHost(); + const float* newv1 = v1.data<const float*>(); + float history[4]; + for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho); + for (int i = 0; i < 4; ++i) + EXPECT_FLOAT_EQ(newv1[i], + v[i] - lr * g[i] / sqrt(history[i] + (float)1E-8)); + + grad.CopyDataFromHostPtr(g, 4); + rmsprop.Apply(1, lr, "xx", &grad, &value); + singa::Tensor v2 = value.Clone(); + v2.ToHost(); + const float* newv2 = v2.data<const float*>(); + for (int i = 0; i < 4; ++i) + history[i] += history[i] * rho + g[i] * g[i] * (1 - rho); + + for (int i = 0; i < 4; ++i) + EXPECT_FLOAT_EQ(newv2[i], + newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8)); +} +#endif
