SINGA-58 Fix fan-in dimension of weight matrix. Use the number of columns of a weight matrix as its fan-in. Layers that have a weight matrix should follow this convention; otherwise, errors would occur when the weight matrix is initialized based on fan-in.
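As a rough illustration of the convention this commit adopts (a sketch, not code from the patch): a weight matrix stored as {hdim, vdim} has its fan-in equal to its column count, shape[1], which is what the kUniformSqrtFanIn scaling in Param::InitValues relies on. The names hdim/vdim and the helper below are illustrative only.

    // Sketch: "fan-in = number of columns" for a 2-D weight matrix {hdim, vdim}.
    // Mirrors the scaling data *= value / sqrt(shape[1] / 3.0f) in the patch,
    // but is not code from the repository.
    #include <cmath>
    #include <vector>

    float FanInScale(const std::vector<int>& shape, float value) {
      int fan_in = shape.at(1);  // columns = inputs feeding each output unit
      return value / std::sqrt(fan_in / 3.0f);
    }

    int main() {
      std::vector<int> weight_shape{10, 784};        // {hdim, vdim}
      float scale = FanInScale(weight_shape, 1.0f);  // multiply uniform samples by this
      (void)scale;
      return 0;
    }
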
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/fcd377ae Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/fcd377ae Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/fcd377ae Branch: refs/heads/master Commit: fcd377aed543f5a44deeb3145551b107e6cc2324 Parents: a584da6 Author: Wei Wang <[email protected]> Authored: Sat Aug 15 11:37:59 2015 +0800 Committer: Wei Wang <[email protected]> Committed: Sat Aug 15 14:59:11 2015 +0800 ---------------------------------------------------------------------- examples/cifar10/job.conf | 16 ++++++++-------- examples/mnist/conv.conf | 16 ++++++++-------- include/utils/param.h | 8 ++++---- src/neuralnet/layer.cc | 8 ++++---- src/proto/job.proto | 8 ++++---- src/utils/param.cc | 10 ++++------ src/utils/updater.cc | 18 +++++++++--------- 7 files changed, 41 insertions(+), 43 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/examples/cifar10/job.conf ---------------------------------------------------------------------- diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf index 89afca9..fdf6167 100644 --- a/examples/cifar10/job.conf +++ b/examples/cifar10/job.conf @@ -65,12 +65,12 @@ neuralnet { name: "w1" init_method:kGaussian std:0.0001 - learning_rate_multiplier:1.0 + lr_scale:1.0 } param { name: "b1" init_method: kConstant - learning_rate_multiplier:2.0 + lr_scale:2.0 value:0 } } @@ -115,12 +115,12 @@ neuralnet { name: "w2" init_method:kGaussian std:0.01 - learning_rate_multiplier:1.0 + lr_scale:1.0 } param { name: "b2" init_method: kConstant - learning_rate_multiplier:2.0 + lr_scale:2.0 value:0 } } @@ -197,14 +197,14 @@ neuralnet { name: "w4" init_method:kGaussian std:0.01 - learning_rate_multiplier:1.0 - weight_decay_multiplier:250 + lr_scale:1.0 + wd_scale:250 } param { name: "b4" init_method: kConstant - learning_rate_multiplier:2.0 - weight_decay_multiplier:0 + lr_scale:2.0 + wd_scale:0 value:0 } } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/examples/mnist/conv.conf ---------------------------------------------------------------------- diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf index ba6f6a7..3509a36 100644 --- a/examples/mnist/conv.conf +++ b/examples/mnist/conv.conf @@ -63,12 +63,12 @@ neuralnet { param{ name: "w1" init_method:kUniformSqrtFanIn - learning_rate_multiplier:1.0 + lr_scale:1.0 } param{ name: "b1" init_method: kConstant - learning_rate_multiplier:2.0 + lr_scale:2.0 value:0 } } @@ -94,12 +94,12 @@ neuralnet { param{ name: "w2" init_method:kUniformSqrtFanIn - learning_rate_multiplier:1.0 + lr_scale:1.0 } param{ name: "b2" init_method: kConstant - learning_rate_multiplier:2.0 + lr_scale:2.0 value:0 } } @@ -123,12 +123,12 @@ neuralnet { param{ name: "w3" init_method:kUniformSqrtFanIn - learning_rate_multiplier:1.0 + lr_scale:1.0 } param{ name: "b3" init_method: kConstant - learning_rate_multiplier:2.0 + lr_scale:2.0 value:0 } @@ -150,12 +150,12 @@ neuralnet { param { name: "w4" init_method:kUniformSqrtFanIn - learning_rate_multiplier:1 + lr_scale:1 } param { name: "b4" init_method: kConstant - learning_rate_multiplier:2 + lr_scale:2 value:0 } } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/include/utils/param.h ---------------------------------------------------------------------- diff --git a/include/utils/param.h b/include/utils/param.h 
index 8fabe71..2eb66db 100644 --- a/include/utils/param.h +++ b/include/utils/param.h @@ -52,14 +52,14 @@ class Param { /** * Scale the learning rate when updating parameters in the Param object */ - float learning_rate_multiplier() { - return proto_.learning_rate_multiplier(); + float lr_scale() { + return proto_.lr_scale(); } /** * Scale the weight decay when updating parameters in the Param object */ - float weight_decay_multiplier() { - return proto_.weight_decay_multiplier(); + float wd_scale() { + return proto_.wd_scale(); } /** * Parameter name used for Param re-use in other model or sharing between http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/neuralnet/layer.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc index c1fce00..810d0b4 100644 --- a/src/neuralnet/layer.cc +++ b/src/neuralnet/layer.cc @@ -341,7 +341,7 @@ void InnerProductLayer::Setup(const LayerProto& proto, int npartitions) { Factory<Param>* factory=Singleton<Factory<Param>>::Instance(); weight_ = factory->Create("Param"); bias_ = factory->Create("Param"); - weight_->Setup(proto.param(0), vector<int>{vdim_, hdim_}); + weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_}); bias_->Setup(proto.param(1), vector<int>{hdim_}); } @@ -350,7 +350,7 @@ void InnerProductLayer::ComputeFeature(Phase phase, Metric* perf) { auto src = Tensor2(srclayers_[0]->mutable_data(this)); auto weight = Tensor2(weight_->mutable_data()); auto bias = Tensor1(bias_->mutable_data()); - data=dot(src, weight); + data=dot(src, weight.T()); // repmat: repeat bias vector into batchsize rows data+=repmat(bias, batchsize_); } @@ -363,10 +363,10 @@ void InnerProductLayer::ComputeGradient(Phase phas) { auto gbias = Tensor1(bias_->mutable_grad()); gbias=sum_rows(grad); - gweight=dot(src.T(), grad); + gweight=dot(grad.T(), src); if(srclayers_[0]->mutable_grad(this)!=nullptr){ auto gsrc = Tensor2(srclayers_[0]->mutable_grad(this)); - gsrc=dot(grad, weight.T()); + gsrc=dot(grad, weight); } } /***************************************************************************** http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/proto/job.proto ---------------------------------------------------------------------- diff --git a/src/proto/job.proto b/src/proto/job.proto index dbbfc61..fe8dc21 100644 --- a/src/proto/job.proto +++ b/src/proto/job.proto @@ -213,10 +213,10 @@ message ParamProto { // for gaussian sampling optional float mean = 8 [default = 0]; optional float std = 9 [default = 1]; - // multiplied on the global learning rate. - optional float learning_rate_multiplier = 15 [default = 1]; - // multiplied on the global weight decay. - optional float weight_decay_multiplier = 16 [default = 1]; + // scale factor, multiplied on the global learning rate. + optional float lr_scale = 15 [default = 1]; + // scale factor, multiplied on the global weight decay. 
+ optional float wd_scale = 16 [default = 1]; // name of the owner param from which this param shares the values optional string share_from = 60; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/utils/param.cc ---------------------------------------------------------------------- diff --git a/src/utils/param.cc b/src/utils/param.cc index 2655877..7adea7c 100644 --- a/src/utils/param.cc +++ b/src/utils/param.cc @@ -53,11 +53,9 @@ void Param::InitValues(int version) { break; case InitMethod::kUniformSqrtFanIn: random->SampleUniform(data, proto_.low(), proto_.high()); - // only valid for param matrix with dim 1 for fan in - LOG(ERROR) << "init fan in"; + // only valid for param matrix with num of cols as fan in CHECK_EQ(data_->shape().size(), 2); data *= proto_.value() / sqrt(data_->shape().at(1) / 3.0f); - LOG(ERROR) << "end fan in"; break; case InitMethod::kUniformSqrtFanInOut: random->SampleUniform(data, proto_.low(), proto_.high()); @@ -96,7 +94,7 @@ Msg* Param::GenPutMsg(bool copy, int idx) { void *p = ptr; if (copy) p = nullptr; msg->AddFormatFrame("iffp", slice_size_[idx], - learning_rate_multiplier(), weight_decay_multiplier(), p); + lr_scale(), wd_scale(), p); if (copy) { msg->AddFrame(ptr, slice_size_[idx] * sizeof(float)); } @@ -146,8 +144,8 @@ Msg* Param::HandlePutMsg(Msg** msg, bool reserve) { float* ptr; (*msg)->ParseFormatFrame("iffp", &size, &lr, &wc, &ptr); ParamProto proto; - proto.set_learning_rate_multiplier(lr); - proto.set_weight_decay_multiplier(wc); + proto.set_lr_scale(lr); + proto.set_wd_scale(wc); vector<int> shape{size}; Setup(proto, shape); if (ptr == nullptr) { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/utils/updater.cc ---------------------------------------------------------------------- diff --git a/src/utils/updater.cc b/src/utils/updater.cc index c038ca7..7bca6dc 100644 --- a/src/utils/updater.cc +++ b/src/utils/updater.cc @@ -68,8 +68,8 @@ void SGDUpdater::Update(int step, Param* param, float grad_scale) { Shape<1> s = Shape1(param->size()); Tensor<cpu, 1> data(param->mutable_cpu_data(), s); Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s); - float lr = GetLearningRate(step)*param->learning_rate_multiplier(); - float wd = weight_decay_*param->weight_decay_multiplier(); + float lr = GetLearningRate(step)*param->lr_scale(); + float wd = weight_decay_*param->wd_scale(); if (grad_scale != 1.f) grad *= grad_scale; if (wd > 0) { // L2 regularization, should be done after timing grad_scale @@ -99,8 +99,8 @@ void NesterovUpdater::Update(int step, Param* param, float grad_scale) { Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s); Tensor<cpu, 1> history(param->mutable_cpu_history(), s); TensorContainer<cpu, 1> tmp(s); - float lr = GetLearningRate(step)*param->learning_rate_multiplier(); - float wd = weight_decay_*param->weight_decay_multiplier(); + float lr = GetLearningRate(step)*param->lr_scale(); + float wd = weight_decay_*param->wd_scale(); if (grad_scale != 1.f) grad *= grad_scale; if (wd > 0) { // L2 regularization, should be done after timing grad_scale @@ -125,8 +125,8 @@ void AdaGradUpdater::Update(int step, Param* param, float grad_scale) { Tensor<cpu, 1> data(param->mutable_cpu_data(), s); Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s); Tensor<cpu, 1> history(param->mutable_cpu_history(), s); - float lr = GetLearningRate(step)*param->learning_rate_multiplier(); - float wd = weight_decay_*param->weight_decay_multiplier(); + float lr = GetLearningRate(step)*param->lr_scale(); + float wd = 
weight_decay_*param->wd_scale(); if (grad_scale != 1.f) grad *= grad_scale; if (wd > 0) { // L2 regularization, should be done after timing grad_scale @@ -152,8 +152,8 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale){ Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s); Tensor<cpu, 1> history(param->mutable_cpu_history(), s); history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale); - float lr=GetLearningRate(step)*param->learning_rate_multiplier(); - float wd=weight_decay_*param->weight_decay_multiplier(); + float lr=GetLearningRate(step)*param->lr_scale(); + float wd=weight_decay_*param->wd_scale(); if(wd>0){ // L2 regularization grad+=data*wd; } @@ -175,7 +175,7 @@ void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){ Tensor<cpu, 1> history(param->mutable_cpu_history(), s); Tensor<cpu, 1> update(param->mutable_cpu_update(), s); TensorContainer<cpu, 1> tmp(s); - float wd=weight_decay_*param->weight_decay_multiplier(); + float wd=weight_decay_*param->wd_scale(); if(wd>0){ // L2 regularization grad+=data*wd; }
