lebeg commented on a change in pull request #10911: Fix engine stop/start URL: https://github.com/apache/incubator-mxnet/pull/10911#discussion_r195025583
########## File path: src/engine/threaded_engine_pooled.cc ########## @@ -42,14 +43,38 @@ namespace engine { */ class ThreadedEnginePooled : public ThreadedEngine { public: - ThreadedEnginePooled() : - thread_pool_(kNumWorkingThreads, [this]() { ThreadWorker(&task_queue_); }), - io_thread_pool_(1, [this]() { ThreadWorker(&io_task_queue_); }) {} + ThreadedEnginePooled() { + this->Start(); + } ~ThreadedEnginePooled() noexcept(false) { - streams_.Finalize(); - task_queue_.SignalForKill(); - io_task_queue_.SignalForKill(); + StopNoWait(); + } + + void StopNoWait() { + streams_->Finalize(); + task_queue_->SignalForKill(); + io_task_queue_->SignalForKill(); + task_queue_ = nullptr; + io_task_queue_ = nullptr; + thread_pool_ = nullptr; + io_thread_pool_ = nullptr; + streams_ = nullptr; + } + + void Stop() override { + WaitForAll(); + StopNoWait(); + } + + void Start() override { + streams_.reset(new StreamManager<kMaxNumGpus, kNumStreamsPerGpu>()); + task_queue_.reset(new dmlc::ConcurrentBlockingQueue<OprBlock*>()); + io_task_queue_.reset(new dmlc::ConcurrentBlockingQueue<OprBlock*>()); + thread_pool_.reset(new ThreadPool(kNumWorkingThreads, [this]() { Review comment: Shouldn't this contain a change to [another constructor of ThreadPool](https://github.com/apache/incubator-mxnet/blob/master/src/engine/thread_pool.h#L68-L70)? Only the one accepting `std::shared_ptr<dmlc::ManualEvent> ready_event` will wait until ready, which might be the issue fixed in https://github.com/apache/incubator-mxnet/pull/8995. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services