SINGA-32 Implement synchronous training framework

Fix a bug in InitLocalParams() of the Worker class.
One worker owns one Param if the param's data blob is not shared from other 
workers.
Previously, a Worker would not send a Get request for a Param if it owns this 
Param.
But it may not init the Param locally because it is not the first group in a 
group
set which subscribes to the same server group.

To fix the bug, all workers would send Get requests for Params in their local 
layers.
There would be no extra cost for getting Params owned by the worker (from the 
first group),
because the get request would not be sent (the param version is already the 
latest).


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/96bedb22
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/96bedb22
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/96bedb22

Branch: refs/heads/master
Commit: 96bedb2264f7d4ebd8a2a0cad67dc9a91f5419c9
Parents: 585e275
Author: wang wei <[email protected]>
Authored: Fri Jul 17 16:04:21 2015 +0800
Committer: wang wei <[email protected]>
Committed: Fri Jul 17 16:04:21 2015 +0800

----------------------------------------------------------------------
 src/trainer/trainer.cc | 2 +-
 src/trainer/worker.cc  | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96bedb22/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index 3ecaad0..44c37ea 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -345,7 +345,7 @@ void Trainer::DisplayMetric(Msg** msg) {
     char prefix[128];
     msgg->ParseFormatFrame("s", prefix);
     CHECK(msgg->NextFrame());
-    const string perf(static_cast<char*>(msgg->FrameData()), 
msgg->FrameSize());;
+    const string perf(static_cast<char*>(msgg->FrameData()), 
msgg->FrameSize());
     Metric cur(perf);
     LOG(ERROR) << prefix << " step-" << step <<", " << cur.ToLogString();
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96bedb22/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index bf98f0b..e1f2a41 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -68,8 +68,7 @@ void Worker::InitLocalParams() {
   for (auto layer : train_net_->layers()) {
     if (layer->partition_id() == id_)
       for (auto param : layer->GetParams())
-        if (param->owner() != param->id())
-          Get(param, modelproto_.warmup_steps());
+        Get(param, modelproto_.warmup_steps());
   }
 }
 
@@ -114,7 +113,7 @@ void Worker::Run() {
       Test(modelproto_.test_steps(), kTest, test_net_);
     }
     TrainOneBatch(step_, &perf);
-    //LOG(ERROR)<<"Train "<<step;
+    // LOG(ERROR) << "Train " << step_;
     if (DisplayNow(step_)) {
       Report("Train", perf);
       perf.Reset();

Reply via email to