szha closed pull request #8804: Fix weird hang bug due to cuInit sometimes 
calls fork
URL: https://github.com/apache/incubator-mxnet/pull/8804
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

Because this pull request comes from a fork, the diff is reproduced
below; GitHub would otherwise not show it once the PR is merged:

diff --git a/src/engine/threaded_engine_perdevice.cc 
b/src/engine/threaded_engine_perdevice.cc
index c01de75384..28bc92f7b2 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -55,7 +55,6 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
 #ifndef _WIN32
     pthread_atfork(
       []() {
-        Engine::Get()->WaitForAll();
         Engine::Get()->Stop();
       },
       []() {
@@ -71,10 +70,10 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
 #endif
   }
   ~ThreadedEnginePerDevice() noexcept(false) {
-    this->Stop();
+    this->StopNoWait();
   }
 
-  void Stop() override {
+  void StopNoWait() {
     SignalQueuesForKill();
     gpu_normal_workers_.Clear();
     gpu_copy_workers_.Clear();
@@ -82,7 +81,14 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
     cpu_priority_worker_.reset(nullptr);
   }
 
+  void Stop() override {
+    if (is_worker_) return;
+    WaitForAll();
+    StopNoWait();
+  }
+
   void Start() override {
+    if (is_worker_) return;
     gpu_worker_nthreads_ = common::GetNumThreadPerGPU();
     cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
     // create CPU task
@@ -196,6 +202,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
     ~ThreadWorkerBlock() noexcept(false) {}
   };
 
+  /*! \brief whether this is a worker thread. */
+  static MX_THREAD_LOCAL bool is_worker_;
   /*! \brief number of concurrent thread cpu worker uses */
   int cpu_worker_nthreads_;
   /*! \brief number of concurrent thread each gpu worker uses */
@@ -219,6 +227,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
                         bool is_copy_worker,
                         ThreadWorkerBlock<type> *block,
                         std::shared_ptr<ThreadPool::SimpleEvent> ready_event) {
+    this->is_worker_ = true;
 #if MXNET_USE_CUDA
     mshadow::Stream<gpu> *stream;
     do {
@@ -251,6 +260,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
   template<dmlc::ConcurrentQueueType type>
   inline void CPUWorker(Context ctx,
                         ThreadWorkerBlock<type> *block) {
+    this->is_worker_ = true;
     auto* task_queue = &(block->task_queue);
     RunContext run_ctx{ctx, nullptr};
     // execute task
@@ -303,5 +313,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
 Engine *CreateThreadedEnginePerDevice() {
   return new ThreadedEnginePerDevice();
 }
+
+MX_THREAD_LOCAL bool ThreadedEnginePerDevice::is_worker_ = false;
+
 }  // namespace engine
 }  // namespace mxnet


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to