Repository: incubator-singa Updated Branches: refs/heads/master 72e73cc18 -> 7954a87d2
SINGA-34 Support external zookeeper service: move global configurations from ClusterProto to a new GlobalProto; add a global conf file as conf/singa.conf; change tool/plot to tool/pb2. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/3819e590 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/3819e590 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/3819e590 Branch: refs/heads/master Commit: 3819e59089bd3f441b88e4ebfc37848f0983c3cc Parents: 72e73cc Author: wang sheng <[email protected]> Authored: Thu Jul 16 03:41:25 2015 +0800 Committer: wang sheng <[email protected]> Committed: Thu Jul 16 04:47:08 2015 +0800 ---------------------------------------------------------------------- .gitignore | 3 +-- Makefile.example | 4 ++++ conf/singa.conf | 5 +++++ include/trainer/trainer.h | 5 +++-- include/utils/cluster.h | 14 +++++--------- src/main.cc | 9 ++++++--- src/proto/cluster.proto | 4 ++-- src/proto/global.proto | 8 ++++++++ src/trainer/trainer.cc | 5 +++-- src/utils/cluster.cc | 11 +++++++---- tool/gen_hosts.py | 9 ++------- tool/plot/__init__.py | 0 tool/plot/plot.py | 0 13 files changed, 46 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index a419725..527972b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,12 +14,11 @@ *.cproject *.log *.nfs* -*_pb2.py -*.pyc *.pb.h *.pb.cc *.hosts *.out +tool/pb2/* src/test/data/* tmp log* http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/Makefile.example ---------------------------------------------------------------------- diff --git a/Makefile.example b/Makefile.example index 6d8d83a..f2c58fe 100644 --- a/Makefile.example +++ b/Makefile.example @@ -82,11 
+82,15 @@ $(PROTO_SRCS): $(PROTOS) protoc --proto_path=src/proto --cpp_out=src/proto $(PROTOS) mkdir -p include/proto/ cp src/proto/*.pb.h include/proto/ + mkdir -p tool/pb2/ + touch tool/pb2/__init__.py + protoc --proto_path=src/proto --python_out=tool/pb2/ $(PROTOS) @echo clean: rm -rf *.a *.so rm -rf include/proto/* rm -rf src/proto/*.pb.h src/proto/*.pb.cc + rm -rf tool/pb2/* rm -rf $(BUILD_DIR) @echo http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/conf/singa.conf ---------------------------------------------------------------------- diff --git a/conf/singa.conf b/conf/singa.conf new file mode 100644 index 0000000..f6c351b --- /dev/null +++ b/conf/singa.conf @@ -0,0 +1,5 @@ +# point to your active zookeeper service +zookeeper_host: "localhost:2181" + +# set if you want to change log directory +# log_dir: "/tmp/singa-log/" http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/include/trainer/trainer.h ---------------------------------------------------------------------- diff --git a/include/trainer/trainer.h b/include/trainer/trainer.h index 2419dc4..0ee01d4 100644 --- a/include/trainer/trainer.h +++ b/include/trainer/trainer.h @@ -2,6 +2,7 @@ #define INCLUDE_TRAINER_TRAINER_H_ #include <unordered_map> #include "proto/cluster.pb.h" +#include "proto/global.pb.h" #include "proto/model.pb.h" #include "utils/updater.h" #include "utils/param.h" @@ -88,8 +89,8 @@ class Trainer{ * @param modelproto * @param clusterproto */ - void Start(const ModelProto& modelproto, const ClusterProto& clusterproto, - const int procs_id); + void Start(const ModelProto& modelproto, const GlobalProto& globalproto, + const ClusterProto& clusterproto, const int procs_id); // TODO add Resume() function to continue training from a previously stopped // point. 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/include/utils/cluster.h ---------------------------------------------------------------------- diff --git a/include/utils/cluster.h b/include/utils/cluster.h index e5980ca..68ae937 100644 --- a/include/utils/cluster.h +++ b/include/utils/cluster.h @@ -9,6 +9,7 @@ #include "utils/common.h" #include "proto/cluster.pb.h" #include "utils/cluster_rt.h" +#include "proto/global.pb.h" using std::shared_ptr; using std::string; @@ -24,7 +25,8 @@ namespace singa { class Cluster { public: static shared_ptr<Cluster> Get(); - static shared_ptr<Cluster> Get(const ClusterProto& cluster, int procs_id=0); + static shared_ptr<Cluster> Get(const GlobalProto& global, + const ClusterProto& cluster, int procs_id=0); const int nserver_groups()const{ return cluster_.nserver_groups(); } const int nworker_groups()const { return cluster_.nworker_groups(); } @@ -83,13 +85,6 @@ class Cluster { const string vis_folder(){ return cluster_.workspace()+"/visualization"; } - const string log_folder(){ - if(cluster_.has_log_dir()){ - return cluster_.workspace()+"log"; - }else - return ""; - } - const int stub_timeout() const { return cluster_.stub_timeout(); } @@ -130,7 +125,7 @@ class Cluster { void Register(const string& endpoint); private: - Cluster(const ClusterProto &cluster, int procs_id) ; + Cluster(const GlobalProto& global, const ClusterProto &cluster, int procs_id) ; void SetupFolders(const ClusterProto &cluster); int Hash(int gid, int id, int flag); @@ -141,6 +136,7 @@ class Cluster { std::vector<std::string> endpoints_; // cluster config proto ClusterProto cluster_; + GlobalProto global_; shared_ptr<ClusterRuntime> cluster_rt_; // make this class a singlton static shared_ptr<Cluster> instance_; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/src/main.cc ---------------------------------------------------------------------- diff --git a/src/main.cc b/src/main.cc index 4c2bb03..e6b7368 100644 --- 
a/src/main.cc +++ b/src/main.cc @@ -23,6 +23,7 @@ DEFINE_int32(procsID, -1, "Global process ID"); DEFINE_string(cluster, "examples/mnist/cluster.conf", "Cluster config file"); DEFINE_string(model, "examples/mnist/conv.conf", "Model config file"); +DEFINE_string(global, "conf/singa.conf", "Global config file"); /** * Register layers, and other customizable classes. @@ -42,14 +43,16 @@ int main(int argc, char **argv) { singa::ReadProtoFromTextFile(FLAGS_cluster.c_str(), &cluster); singa::ModelProto model; singa::ReadProtoFromTextFile(FLAGS_model.c_str(), &model); - if(cluster.has_log_dir()) - singa::SetupLog(cluster.log_dir(), model.name()); + singa::GlobalProto global; + singa::ReadProtoFromTextFile(FLAGS_global.c_str(), &global); + singa::SetupLog(global.log_dir(), model.name()); LOG(INFO) << "The cluster config is\n" << cluster.DebugString(); LOG(INFO) << "The model config is\n" << model.DebugString(); + LOG(INFO) << "The global config is\n" << global.DebugString(); RegisterClasses(model); singa::Trainer trainer; - trainer.Start(model, cluster, FLAGS_procsID); + trainer.Start(model, global, cluster, FLAGS_procsID); return 0; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/src/proto/cluster.proto ---------------------------------------------------------------------- diff --git a/src/proto/cluster.proto b/src/proto/cluster.proto index 1480cc1..54ce300 100644 --- a/src/proto/cluster.proto +++ b/src/proto/cluster.proto @@ -21,9 +21,9 @@ message ClusterProto { // local workspace, train/val/test shards, checkpoint files required string workspace = 14; // relative path to workspace. 
if not set, use the default dir of glog - optional string log_dir = 15; + //optional string log_dir = 15; // ip/hostname : port [, ip/hostname : port] - optional string zookeeper_host = 16 [default = "localhost:2181"]; + //optional string zookeeper_host = 16 [default = "localhost:2181"]; // message size limit, default 1MB // optional int32 largest_message = 20 [default = 1048576]; // optional float bandwidth = 21 [default = 100]; // MB/s http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/src/proto/global.proto ---------------------------------------------------------------------- diff --git a/src/proto/global.proto b/src/proto/global.proto new file mode 100644 index 0000000..84eb7be --- /dev/null +++ b/src/proto/global.proto @@ -0,0 +1,8 @@ +package singa; + +message GlobalProto { + // ip/hostname:port[,ip/hostname:port] + required string zookeeper_host = 1; + // if not set, use the default dir of glog + optional string log_dir = 2 [default = "/tmp/singa-log/"]; +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/src/trainer/trainer.cc ---------------------------------------------------------------------- diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc index f4e52a6..a6a5dbf 100644 --- a/src/trainer/trainer.cc +++ b/src/trainer/trainer.cc @@ -248,13 +248,14 @@ vector<Worker*> Trainer::CreateWorkers(int nthreads, return workers; } -void Trainer::Start(const ModelProto& mproto, const ClusterProto& cproto, +void Trainer::Start(const ModelProto& mproto, const GlobalProto& gproto, + const ClusterProto& cproto, int procs_id){ // procs_id is only used for resume training CHECK_EQ(procs_id, -1); RegisterDefaultClasses(mproto); - auto cluster=Cluster::Get(cproto, procs_id); + auto cluster=Cluster::Get(gproto, cproto, procs_id); router_=make_shared<Router>(); router_->Bind(kInprocRouterEndpoint); if(cluster->nprocs()>1){ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/src/utils/cluster.cc 
---------------------------------------------------------------------- diff --git a/src/utils/cluster.cc b/src/utils/cluster.cc index fdbde69..0c4eefa 100644 --- a/src/utils/cluster.cc +++ b/src/utils/cluster.cc @@ -10,9 +10,11 @@ namespace singa { std::shared_ptr<Cluster> Cluster::instance_; -Cluster::Cluster(const ClusterProto &cluster, int procs_id) { +Cluster::Cluster(const GlobalProto & global, const ClusterProto &cluster, + int procs_id) { procs_id_=procs_id; cluster_ = cluster; + global_ = global; SetupFolders(cluster); if(server_worker_separate()) nprocs_=nworker_procs()+nserver_procs(); @@ -47,7 +49,7 @@ Cluster::Cluster(const ClusterProto &cluster, int procs_id) { } } - auto rt=new ZKClusterRT(cluster_.zookeeper_host()); + auto rt=new ZKClusterRT(global_.zookeeper_host()); rt->Init(); cluster_rt_=shared_ptr<ClusterRuntime>(static_cast<ClusterRuntime*>(rt)); @@ -73,8 +75,9 @@ void Cluster::SetupFolders(const ClusterProto &cluster){ mkdir(vis_folder().c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); } -shared_ptr<Cluster> Cluster::Get(const ClusterProto& cluster, int procs_id){ - instance_.reset(new Cluster(cluster, procs_id)); +shared_ptr<Cluster> Cluster::Get(const GlobalProto& global, const ClusterProto& cluster, + int procs_id){ + instance_.reset(new Cluster(global, cluster, procs_id)); return instance_; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/tool/gen_hosts.py ---------------------------------------------------------------------- diff --git a/tool/gen_hosts.py b/tool/gen_hosts.py old mode 100755 new mode 100644 index e2ed29d..570eff9 --- a/tool/gen_hosts.py +++ b/tool/gen_hosts.py @@ -4,7 +4,7 @@ import argparse import os import sys from google.protobuf import text_format -from plot.cluster_pb2 import ClusterProto +from pb2.cluster_pb2 import ClusterProto # parse command line parser = argparse.ArgumentParser(description='Generate host list from host file for a SINGA job') @@ -13,11 +13,6 @@ parser.add_argument('-src', 
dest='src', metavar='SRC_FILE', required=True, help= parser.add_argument('-dst', dest='dst', metavar='DST_FILE', required=True, help='generated list') args = parser.parse_args(); -# change to SINGA_HOME -abspath = os.path.abspath(__file__) -dname = os.path.dirname(abspath) -os.chdir(dname+'/..') - # read from .conf file fd_conf = open(args.conf, 'r') cluster = ClusterProto() @@ -44,7 +39,7 @@ fd_src.close() # write to dst file num_hosts = len(hosts) if (num_hosts == 0): - print 'source host file is empty' + print 'ERROR: source host file is empty' sys.exit() fd_dst = open(args.dst, 'w') for i in range(nprocs): http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/tool/plot/__init__.py ---------------------------------------------------------------------- diff --git a/tool/plot/__init__.py b/tool/plot/__init__.py deleted file mode 100644 index e69de29..0000000 http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3819e590/tool/plot/plot.py ---------------------------------------------------------------------- diff --git a/tool/plot/plot.py b/tool/plot/plot.py deleted file mode 100644 index e69de29..0000000
