[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-25 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r176945375
 
 

 ##
 File path: src/kvstore/kvstore_dist_server.h
 ##
 @@ -153,23 +157,82 @@ class KVStoreDistServer {
 
   void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) {
 CommandType recved_type = static_cast(recved.head);
-if (recved_type == CommandType::kStopServer) {
-  exec_.Stop();
-} else if (recved_type == CommandType::kSyncMode) {
-  sync_mode_ = true;
-} else if (recved_type == CommandType::kSetGradientCompression) {
-  gradient_compression_->DecodeParams(recved.body);
-} else {
-  // this uses value 0 for message id from frontend
-  // let the main thread to execute ctrl, which is necessary for python
-  exec_.Exec([this, recved]() {
-  CHECK(controller_);
-  controller_(recved.head, recved.body);
-});
+switch (recved_type) {
+  case CommandType::kStopServer:
+exec_.Stop();
+break;
+  case CommandType::kSyncMode:
+sync_mode_ = true;
+break;
+  case CommandType::kSetGradientCompression:
+gradient_compression_->DecodeParams(recved.body);
+break;
+  case CommandType::kSetProfilerParams:
+// last char is the type of profiler command
+ProcessServerProfilerCommands(static_cast
+  (recved.body.back() - '0'),
+  recved.body);
+break;
+  case CommandType::kController:
+// this uses value 0 for message id from frontend
+// let the main thread to execute ctrl, which is necessary for python
+exec_.Exec([this, recved]() {
+CHECK(controller_);
+controller_(recved.head, recved.body);
+  });
+break;
 }
 app->Response(recved);
   }
 
+  void ProcessServerProfilerCommands(KVStoreServerProfilerCommand type, const 
std::string& body) {
+switch (type) {
+  case KVStoreServerProfilerCommand::kSetConfig:
+SetProfilerConfig(body.substr(0, body.size() - 1));
+break;
+  case KVStoreServerProfilerCommand::kState:
+MXSetProfilerState(static_cast(body.front() - '0'));
+break;
+  case KVStoreServerProfilerCommand::kPause:
+MXProfilePause(static_cast(body.front() - '0'));
+break;
+  case KVStoreServerProfilerCommand::kDump:
+MXDumpProfile(static_cast(body.front() - '0'));
+break;
+}
+  }
+
+  void SetProfilerConfig(std::string params_str) {
+std::vector elems;
+mxnet::kvstore::split(params_str, ',', std::back_inserter(elems));
+std::vector ckeys;
+std::vector cvals;
+ckeys.reserve(elems.size());
+cvals.reserve(elems.size());
+
+for (int i=0; i < elems.size(); i++) {
+  std::vector parts;
+  mxnet::kvstore::split(elems[i], ':', std::back_inserter(parts));
+  CHECK(!parts[0].empty()) << "ProfilerConfig parameter is empty";
+  CHECK(!parts[1].empty()) << "ProfilerConfig value is empty for parameter 
"<< parts[0];
+  if (parts[0] == "filename") {
+parts[1] = "rank" + std::to_string(ps::MyRank()) + "_" + parts[1];
+  }
+  char* ckey = new char[parts[0].length() + 1];
 
 Review comment:
   ok can you have another vector then with unique_ptr in order to
   guarantee they’re freed?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-25 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r176945245
 
 

 ##
 File path: include/mxnet/c_api.h
 ##
 @@ -218,25 +218,35 @@ MXNET_DLL int MXNotifyShutdown();
  * \param num_params Number of parameters
  * \param keys array of parameter keys
  * \param vals array of parameter values
+ * \param kvstoreHandle handle to kvstore
  * \return 0 when success, -1 when failure happens.
  */
-MXNET_DLL int MXSetProfilerConfig(int num_params, const char* const* keys, 
const char* const* vals);
+MXNET_DLL int MXSetProfilerConfig(int num_params, const char* const* keys, 
const char* const* vals,
+  KVStoreHandle kvstoreHandle = nullptr);
 
 /*!
  * \brief Set up state of profiler
  * \param state indicate the working state of profiler,
  *  profiler not running when state == 0,
  *  profiler running when state == 1
+ * \param profile_process an int,
+ * when 0 command is for worker process,
+ * when 1 command is for server process
+ * \param kvstoreHandle handle to kvstore
  * \return 0 when success, -1 when failure happens.
  */
-MXNET_DLL int MXSetProfilerState(int state);
+MXNET_DLL int MXSetProfilerState(int state, int profile_process, KVStoreHandle 
kvStoreHandle = nullptr);
 
 Review comment:
   default


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-25 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r176945462
 
 

 ##
 File path: src/kvstore/kvstore_dist_server.h
 ##
 @@ -153,23 +156,82 @@ class KVStoreDistServer {
 
   void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) {
 CommandType recved_type = static_cast(recved.head);
-if (recved_type == CommandType::kStopServer) {
-  exec_.Stop();
-} else if (recved_type == CommandType::kSyncMode) {
-  sync_mode_ = true;
-} else if (recved_type == CommandType::kSetGradientCompression) {
-  gradient_compression_->DecodeParams(recved.body);
-} else {
-  // this uses value 0 for message id from frontend
-  // let the main thread to execute ctrl, which is necessary for python
-  exec_.Exec([this, recved]() {
-  CHECK(controller_);
-  controller_(recved.head, recved.body);
-});
+switch (recved_type) {
+  case CommandType::kStopServer:
+exec_.Stop();
+break;
+  case CommandType::kSyncMode:
+sync_mode_ = true;
+break;
+  case CommandType::kSetGradientCompression:
+gradient_compression_->DecodeParams(recved.body);
+break;
+  case CommandType::kSetProfilerParams:
+// last char is the type of profiler command
+ProcessServerProfilerCommands(static_cast
+  (recved.body.back() - '0'),
+  recved.body);
+break;
+  case CommandType::kController:
+// this uses value 0 for message id from frontend
+// let the main thread to execute ctrl, which is necessary for python
+exec_.Exec([this, recved]() {
+CHECK(controller_);
+controller_(recved.head, recved.body);
+  });
+break;
 }
 app->Response(recved);
   }
 
+  void ProcessServerProfilerCommands(KVStoreServerProfilerCommand type, const 
std::string& body) {
+switch (type) {
+  case KVStoreServerProfilerCommand::kSetConfig:
+SetProfilerConfig(body.substr(0, body.size() - 1));
+break;
+  case KVStoreServerProfilerCommand::kState:
+MXSetProfilerState(static_cast(body.front() - '0'), 0, nullptr);
+break;
+  case KVStoreServerProfilerCommand::kPause:
+MXProfilePause(static_cast(body.front() - '0'), 0, nullptr);
+break;
+  case KVStoreServerProfilerCommand::kDump:
+MXDumpProfile(static_cast(body.front() - '0'), 0, nullptr);
+break;
+}
+  }
+
+  void SetProfilerConfig(std::string params_str) {
+std::vector elems;
+mxnet::kvstore::split(params_str, ',', std::back_inserter(elems));
+std::vector ckeys;
+std::vector cvals;
+ckeys.reserve(elems.size());
+cvals.reserve(elems.size());
+
+for (size_t i=0; i < elems.size(); i++) {
+  std::vector parts;
+  mxnet::kvstore::split(elems[i], ':', std::back_inserter(parts));
+  CHECK(!parts[0].empty()) << "ProfilerConfig parameter is empty";
 
 Review comment:
   should you check the length of parts before indexing into it?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-25 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r176944941
 
 

 ##
 File path: include/mxnet/kvstore.h
 ##
 @@ -361,6 +373,17 @@ class KVStore {
*/
   virtual void SendCommandToServers(int cmd_id, const std::string& cmd_body) { 
}
 
+  /**
+   * \brief Sends server profiler commands to all server nodes
+   * Only the worker with rank=0 sends the command which will be received by 
all servers
+   * \param type ProfilerCommand type
+   * \param params parameters for that command in the form of a string
+   */
+  virtual void SetServerProfilerCommand(const KVStoreServerProfilerCommand 
type,
+const std::string& params) {
+LOG(FATAL) << "compile with USE_DIST_KVSTORE=1 to use distributed kvstore";
 
 Review comment:
   yeah but will the local still run?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-25 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r176944971
 
 

 ##
 File path: include/mxnet/kvstore.h
 ##
 @@ -361,6 +373,17 @@ class KVStore {
*/
   virtual void SendCommandToServers(int cmd_id, const std::string& cmd_body) { 
}
 
+  /**
+   * \brief Sends server profiler commands to all server nodes
+   * Only the worker with rank=0 sends the command which will be received by 
all servers
+   * \param type ProfilerCommand type
+   * \param params parameters for that command in the form of a string
+   */
+  virtual void SetServerProfilerCommand(const KVStoreServerProfilerCommand 
type,
+const std::string& params) {
+LOG(FATAL) << "compile with USE_DIST_KVSTORE=1 to use distributed kvstore";
 
 Review comment:
   what I mean is, for builds that may not have kvstore enabled, you might 
still want code to work, but only locally and not just blow up


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-25 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r176944879
 
 

 ##
 File path: example/image-classification/common/fit.py
 ##
 @@ -150,6 +156,17 @@ def fit(args, network, data_loader, **kwargs):
 if args.gc_type != 'none':
 kv.set_gradient_compression({'type': args.gc_type,
  'threshold': args.gc_threshold})
+if args.profile_server_suffix:
+mx.profiler.set_config(filename=args.profile_server_suffix, 
profile_all=True, profile_process='server')
+mx.profiler.set_state(state='run', profile_process='server')
+
+if args.profile_worker_suffix:
+if kv.num_workers > 1:
+filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix
+else:
+filename = args.profile_worker_suffix
+mx.profiler.set_config(filename=filename, profile_all=True, 
profile_process='worker')
 
 Review comment:
   why necessary to pass profile_process to both with the same value? any way 
not to? seems tedious


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-25 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r176945010
 
 

 ##
 File path: include/mxnet/c_api.h
 ##
 @@ -218,25 +218,35 @@ MXNET_DLL int MXNotifyShutdown();
  * \param num_params Number of parameters
  * \param keys array of parameter keys
  * \param vals array of parameter values
+ * \param kvstoreHandle handle to kvstore
  * \return 0 when success, -1 when failure happens.
  */
-MXNET_DLL int MXSetProfilerConfig(int num_params, const char* const* keys, 
const char* const* vals);
+MXNET_DLL int MXSetProfilerConfig(int num_params, const char* const* keys, 
const char* const* vals,
 
 Review comment:
   if you do, it should be with some macro that will be nothing in C. because 
if you include this header from a C file, it will choke on that default 
argument 


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-25 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r176944903
 
 

 ##
 File path: include/mxnet/c_api.h
 ##
 @@ -218,25 +218,35 @@ MXNET_DLL int MXNotifyShutdown();
  * \param num_params Number of parameters
  * \param keys array of parameter keys
  * \param vals array of parameter values
+ * \param kvstoreHandle handle to kvstore
  * \return 0 when success, -1 when failure happens.
  */
-MXNET_DLL int MXSetProfilerConfig(int num_params, const char* const* keys, 
const char* const* vals);
+MXNET_DLL int MXSetProfilerConfig(int num_params, const char* const* keys, 
const char* const* vals,
 
 Review comment:
   C API, so shouldn’t pass default arguments


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-12 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r174010079
 
 

 ##
 File path: include/mxnet/kvstore.h
 ##
 @@ -361,6 +373,17 @@ class KVStore {
*/
   virtual void SendCommandToServers(int cmd_id, const std::string& cmd_body) { 
}
 
+  /**
+   * \brief Sends server profiler commands to all server nodes
+   * Only the worker with rank=0 sends the command which will be received by 
all servers
+   * \param type ProfilerCommand type
+   * \param params parameters for that command in the form of a string
+   */
+  virtual void SetServerProfilerCommand(const KVStoreServerProfilerCommand 
type,
+const std::string& params) {
+LOG(FATAL) << "compile with USE_DIST_KVSTORE=1 to use distributed kvstore";
 
 Review comment:
   do you really need to die here? will a warning suffice?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-12 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r174010410
 
 

 ##
 File path: src/kvstore/kvstore_dist_server.h
 ##
 @@ -32,6 +32,8 @@
 #include 
 #include 
 #include 
+#include "mxnet/c_api.h"
+#include "profiler/profiler.h"
 #include "ps/ps.h"
 
 Review comment:
   <>


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-12 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r174010585
 
 

 ##
 File path: src/kvstore/kvstore_dist_server.h
 ##
 @@ -170,6 +187,33 @@ class KVStoreDistServer {
 app->Response(recved);
   }
 
+  void SetProfilerConfig(std::string params_str) {
+std::vector elems;
+mxnet::kvstore::split(params_str, ',', std::back_inserter(elems));
+std::vector ckeys;
+std::vector cvals;
+ckeys.reserve(elems.size());
+cvals.reserve(elems.size());
+
+for (int i=0; i < elems.size(); i++) {
+  std::vector parts;
 
 Review comment:
   oh ok missed that my mistake


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-12 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r174009887
 
 

 ##
 File path: example/image-classification/common/fit.py
 ##
 @@ -305,3 +321,8 @@ def fit(args, network, data_loader, **kwargs):
   epoch_end_callback=checkpoint,
   allow_missing=True,
   monitor=monitor)
+
+if args.profile_server_file:
+kv.set_server_profiler_state(state='stop')
+if args.profile_worker_file:
 
 Review comment:
   just curious if you had any special reason to stop the server before the 
worker.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-12 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r174010221
 
 

 ##
 File path: include/mxnet/kvstore.h
 ##
 @@ -361,6 +373,17 @@ class KVStore {
*/
   virtual void SendCommandToServers(int cmd_id, const std::string& cmd_body) { 
}
 
+  /**
+   * \brief Sends server profiler commands to all server nodes
+   * Only the worker with rank=0 sends the command which will be received by 
all servers
+   * \param type ProfilerCommand type
+   * \param params parameters for that command in the form of a string
+   */
+  virtual void SetServerProfilerCommand(const KVStoreServerProfilerCommand 
type,
+const std::string& params) {
+LOG(FATAL) << "compile with USE_DIST_KVSTORE=1 to use distributed kvstore";
 
 Review comment:
   or a CHECK may be a caught error, right? is FATAL catchable? python user 
might not know whether kvstore was turned on. i mean, is this something where 
execution just can?t continue?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-12 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r174010394
 
 

 ##
 File path: src/kvstore/kvstore_dist_server.h
 ##
 @@ -32,6 +32,8 @@
 #include 
 #include 
 #include 
+#include "mxnet/c_api.h"
+#include "profiler/profiler.h"
 
 Review comment:
   ../profiler/


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-12 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r174010534
 
 

 ##
 File path: src/kvstore/kvstore_dist_server.h
 ##
 @@ -125,6 +127,8 @@ class KVStoreDistServer {
   }
 
   ~KVStoreDistServer() {
+profiler::Profiler::Get()->SetState(
+  profiler::Profiler::ProfilerState(profiler::Profiler::kNotRunning));
 delete ps_server_;
 
 Review comment:
   not sure what the context is when this is called. is this an independent 
process? normally profiler is shut down by static shutdown, after engine


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-12 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r174010819
 
 

 ##
 File path: src/kvstore/kvstore_dist_server.h
 ##
 @@ -153,23 +157,82 @@ class KVStoreDistServer {
 
   void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) {
 CommandType recved_type = static_cast(recved.head);
-if (recved_type == CommandType::kStopServer) {
-  exec_.Stop();
-} else if (recved_type == CommandType::kSyncMode) {
-  sync_mode_ = true;
-} else if (recved_type == CommandType::kSetGradientCompression) {
-  gradient_compression_->DecodeParams(recved.body);
-} else {
-  // this uses value 0 for message id from frontend
-  // let the main thread to execute ctrl, which is necessary for python
-  exec_.Exec([this, recved]() {
-  CHECK(controller_);
-  controller_(recved.head, recved.body);
-});
+switch (recved_type) {
+  case CommandType::kStopServer:
+exec_.Stop();
+break;
+  case CommandType::kSyncMode:
+sync_mode_ = true;
+break;
+  case CommandType::kSetGradientCompression:
+gradient_compression_->DecodeParams(recved.body);
+break;
+  case CommandType::kSetProfilerParams:
+// last char is the type of profiler command
+ProcessServerProfilerCommands(static_cast
+  (recved.body.back() - '0'),
+  recved.body);
+break;
+  case CommandType::kController:
+// this uses value 0 for message id from frontend
+// let the main thread to execute ctrl, which is necessary for python
+exec_.Exec([this, recved]() {
+CHECK(controller_);
+controller_(recved.head, recved.body);
+  });
+break;
 }
 app->Response(recved);
   }
 
+  void ProcessServerProfilerCommands(KVStoreServerProfilerCommand type, const 
std::string& body) {
+switch (type) {
+  case KVStoreServerProfilerCommand::kSetConfig:
+SetProfilerConfig(body.substr(0, body.size() - 1));
+break;
+  case KVStoreServerProfilerCommand::kState:
+MXSetProfilerState(static_cast(body.front() - '0'));
+break;
+  case KVStoreServerProfilerCommand::kPause:
+MXProfilePause(static_cast(body.front() - '0'));
+break;
+  case KVStoreServerProfilerCommand::kDump:
+MXDumpProfile(static_cast(body.front() - '0'));
+break;
+}
+  }
+
+  void SetProfilerConfig(std::string params_str) {
+std::vector elems;
+mxnet::kvstore::split(params_str, ',', std::back_inserter(elems));
+std::vector ckeys;
+std::vector cvals;
+ckeys.reserve(elems.size());
+cvals.reserve(elems.size());
+
+for (int i=0; i < elems.size(); i++) {
+  std::vector parts;
+  mxnet::kvstore::split(elems[i], ':', std::back_inserter(parts));
+  CHECK(!parts[0].empty()) << "ProfilerConfig parameter is empty";
+  CHECK(!parts[1].empty()) << "ProfilerConfig value is empty for parameter 
"<< parts[0];
+  if (parts[0] == "filename") {
+parts[1] = "rank" + std::to_string(ps::MyRank()) + "_" + parts[1];
+  }
+  char* ckey = new char[parts[0].length() + 1];
 
 Review comment:
   why can?t you just pass the c_str() pointers in the array passed to the api 
function so that you don?t have to allocate and free?
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training

2018-03-12 Thread GitBox
cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding 
support to profile kvstore server during distributed training
URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r174010353
 
 

 ##
 File path: src/kvstore/kvstore_dist_server.h
 ##
 @@ -32,6 +32,8 @@
 #include 
 #include 
 #include 
+#include "mxnet/c_api.h"
 
 Review comment:
   i think stuff in include/mxnet/xxx would tend to be in <>


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services