Repository: incubator-singa Updated Branches: refs/heads/master 9d43056af -> 039de8b0a
SINGA-48 Fix a bug in trainer.cc that assigns the same NeuralNet instance to workers from different groups. Cleaned the code for SetupWorkerServer and fixed the bug. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/db440127 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/db440127 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/db440127 Branch: refs/heads/master Commit: db440127bc35c626a4e8407c6e3bfd9331870a37 Parents: da844af Author: Wei Wang <[email protected]> Authored: Wed Aug 12 20:06:30 2015 +0800 Committer: Wei Wang <[email protected]> Committed: Thu Aug 13 14:13:44 2015 +0800 ---------------------------------------------------------------------- src/neuralnet/neuralnet.cc | 1 + src/neuralnet/optional_layer.cc | 2 +- src/trainer/trainer.cc | 63 ++++++++++++++++++------------------ 3 files changed, 34 insertions(+), 32 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/db440127/src/neuralnet/neuralnet.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc index 83bd4fd..23154c5 100644 --- a/src/neuralnet/neuralnet.cc +++ b/src/neuralnet/neuralnet.cc @@ -288,6 +288,7 @@ Graph* NeuralNet::CreateGraph(const NetProto& netproto, int npartitions) { // differentiate partitions string nodename = layer.name() + "@" + string(suffix); proto->set_partition_id(i); + proto->set_name(nodename); auto node = new Node(nodename, layer.name(), i, proto); graph->AddNode(node); nodes.push_back(node); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/db440127/src/neuralnet/optional_layer.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/optional_layer.cc b/src/neuralnet/optional_layer.cc index 06f413f..97595cf 100644 --- 
a/src/neuralnet/optional_layer.cc +++ b/src/neuralnet/optional_layer.cc @@ -95,7 +95,7 @@ void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) { SingleLabelImageRecord* record=sample_.mutable_image(); ConvertCaffeDatumToRecord(datum, record); - batchsize_=batchsize(); + batchsize_ = proto.lmdbdata_conf().batchsize(); if(partition_dim() == 0) batchsize_ /= npartitions; records_.resize(batchsize_); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/db440127/src/trainer/trainer.cc ---------------------------------------------------------------------- diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc index 7a20e53..f348ff6 100644 --- a/src/trainer/trainer.cc +++ b/src/trainer/trainer.cc @@ -86,51 +86,52 @@ void Trainer::SetupWorkerServer( auto net = NeuralNet::Create(net_conf, kTrain, grp_size); // MUST do SliceParam before share param/net with others auto slices = SliceParams(net->params()); - shared_ptr<NeuralNet> train_net, test_net, valid_net; - int grp = workers.size() ? workers.at(0)->grp_id() : -1; - if (grp == 0 && model_conf.test_steps()) { - // test are performed only by the first group - test_net = NeuralNet::Create(net_conf, kTest, grp_size); - test_net->ShareParamsFrom(net); - } - if (grp == 0 && model_conf.validation_steps()) { - // validation are performed only by the first group - valid_net = NeuralNet::Create(net_conf, kValidation, grp_size); - valid_net->ShareParamsFrom(net); - } - bool prepare_param = true; + + std::unordered_map<int, shared_ptr<NeuralNet>> grp_net; + int first_grp = workers.size() ? 
workers.at(0)->grp_id() : -1; for (auto worker : workers) { - if (worker->grp_id() != grp) { - train_net = NeuralNet::Create(net_conf, kTrain, grp_size); - if(cluster->share_memory()) - train_net->ShareParamsFrom(net); - valid_net = test_net = nullptr; - grp = worker->grp_id(); - prepare_param = true; - } else { - train_net = net; - } - worker->Setup(model_conf, train_net, valid_net, test_net); - // Prepare ParamEntry - if (prepare_param) { - for (auto layer : train_net->layers()) { + int grp_id = worker->grp_id(); + int worker_id = worker->id(); + shared_ptr<NeuralNet> test_net = nullptr, valid_net = nullptr; + if (grp_net.find(grp_id) == grp_net.end()) { + if (grp_id == first_grp) { + // test are performed only by the first group now. TODO update. + if (first_grp == 0 && model_conf.test_steps() && worker_id == 0) { + test_net = NeuralNet::Create(net_conf, kTest, 1); // hard code for exp + test_net->ShareParamsFrom(net); + } + // validation are performed only by the first group. TODO update. 
+ if (first_grp == 0 && model_conf.validation_steps() && worker_id == 0) { + valid_net = NeuralNet::Create(net_conf, kValidation, 1); + valid_net->ShareParamsFrom(net); + } + grp_net[grp_id] = net; + } else { + grp_net[grp_id] = NeuralNet::Create(net_conf, kTrain, grp_size); + if(cluster->share_memory()) + grp_net[grp_id]->ShareParamsFrom(net); + } + for (auto layer : grp_net[grp_id]->layers()) { bool local = layer->partition_id() >= workers.front()->id() && layer->partition_id() <= workers.back()->id(); for (auto param : layer->GetParams()) { - int hash = Hash(grp, param->owner()); + int hash = Hash(grp_id, param->owner()); if (worker_shard_.find(hash) == worker_shard_.end()) worker_shard_[hash] = new ParamEntry(); worker_shard_[hash]->AddParam(local, param); } } - prepare_param = false; } + LOG(INFO) << "grp " << worker->grp_id() << ", worker " + << worker->id() << " net " << grp_net[grp_id].get(); + worker->Setup(model_conf, grp_net[grp_id], valid_net, test_net); } - // partition among server groups, each group maintains one sub-set for sync + + // partition among server groups, each group maintains one sub-set for sync auto slice2group = PartitionSlices(cluster->nserver_groups(), slices); for (auto server : servers) server->Setup(model_conf.updater(), &server_shard_, slice2group); - // partition within one server group, each server updates for one sub-set + // partition within one server group, each server updates for one sub-set slice2server_ = PartitionSlices(cluster->nservers_per_group(), slices); }
