SINGA-32 Implement synchronous training framework Fix a bug in InitLocalParams() of the Worker class. A worker owns a Param if the Param's data blob is not shared from other workers. Previously, a Worker would not send a Get request for a Param that it owns. However, the worker may not have initialized that Param locally, because its group is not the first group in the group set that subscribes to the same server group.
To fix the bug, all workers now send Get requests for the Params in their local layers. There would be no extra cost for getting Params owned by the worker (from the first group), because the Get request would not be sent (the param version is already the latest). Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/96bedb22 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/96bedb22 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/96bedb22 Branch: refs/heads/master Commit: 96bedb2264f7d4ebd8a2a0cad67dc9a91f5419c9 Parents: 585e275 Author: wang wei <[email protected]> Authored: Fri Jul 17 16:04:21 2015 +0800 Committer: wang wei <[email protected]> Committed: Fri Jul 17 16:04:21 2015 +0800 ---------------------------------------------------------------------- src/trainer/trainer.cc | 2 +- src/trainer/worker.cc | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96bedb22/src/trainer/trainer.cc ---------------------------------------------------------------------- diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc index 3ecaad0..44c37ea 100644 --- a/src/trainer/trainer.cc +++ b/src/trainer/trainer.cc @@ -345,7 +345,7 @@ void Trainer::DisplayMetric(Msg** msg) { char prefix[128]; msgg->ParseFormatFrame("s", prefix); CHECK(msgg->NextFrame()); - const string perf(static_cast<char*>(msgg->FrameData()), msgg->FrameSize());; + const string perf(static_cast<char*>(msgg->FrameData()), msgg->FrameSize()); Metric cur(perf); LOG(ERROR) << prefix << " step-" << step <<", " << cur.ToLogString(); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96bedb22/src/trainer/worker.cc ---------------------------------------------------------------------- diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc index 
bf98f0b..e1f2a41 100644 --- a/src/trainer/worker.cc +++ b/src/trainer/worker.cc @@ -68,8 +68,7 @@ void Worker::InitLocalParams() { for (auto layer : train_net_->layers()) { if (layer->partition_id() == id_) for (auto param : layer->GetParams()) - if (param->owner() != param->id()) - Get(param, modelproto_.warmup_steps()); + Get(param, modelproto_.warmup_steps()); } } @@ -114,7 +113,7 @@ void Worker::Run() { Test(modelproto_.test_steps(), kTest, test_net_); } TrainOneBatch(step_, &perf); - //LOG(ERROR)<<"Train "<<step; + // LOG(ERROR) << "Train " << step_; if (DisplayNow(step_)) { Report("Train", perf); perf.Reset();
