This is an automated email from the ASF dual-hosted git repository.
jevans pushed a commit to branch v1.9.x
in repository https://gitbox.apache.org/repos/asf/mxnet.git
The following commit(s) were added to refs/heads/v1.9.x by this push:
new 76d73dbd77 Don't explicitly release CUDA resources at main Python process exit (#21182)
76d73dbd77 is described below
commit 76d73dbd778c9ab4a4584dbc857c8c0694faa162
Author: Dick Carter <[email protected]>
AuthorDate: Tue Feb 28 10:34:39 2023 -0800
Don't explicitly release CUDA resources at main Python process exit (#21182)
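
By the time Python interpreter teardown reaches the engine destructors, the CUDA driver itself may already be shutting down; the old code anticipated this by wrapping stream deletion in MSHADOW_CATCH_ERROR. Since the OS reclaims all device resources at process exit anyway, the engine now skips the explicit driver calls entirely once its shutdown flag is set. A minimal standalone sketch of that guard pattern (hypothetical names, not mxnet's actual classes):

    #include <atomic>
    #include <cstdio>

    class EngineSketch {
     public:
      // Called once the main process is known to be exiting,
      // e.g. from a Python atexit hook.
      void NotifyShutdown() { shutdown_phase_.store(true); }

      ~EngineSketch() {
        if (!shutdown_phase_.load()) {
          // Normal teardown: the CUDA driver is still alive, so
          // release streams explicitly (stand-in for DeleteStream).
          std::printf("releasing CUDA streams\n");
        }
        // During shutdown: leak deliberately; process exit reclaims
        // the resources without touching the driver.
      }

     private:
      std::atomic<bool> shutdown_phase_{false};
    };

    int main() {
      EngineSketch engine;
      engine.NotifyShutdown();
      return 0;  // destructor now makes no CUDA driver calls
    }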
---
 src/engine/naive_engine.cc              | 12 +++++++++---
 src/engine/stream_manager.h             | 20 ++++++++++++--------
 src/engine/threaded_engine.h            |  5 +++--
 src/engine/threaded_engine_perdevice.cc | 13 +++++++++----
 src/engine/threaded_engine_pooled.cc    |  2 +-
 5 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc
index e7412fa6f4..1146053537 100644
--- a/src/engine/naive_engine.cc
+++ b/src/engine/naive_engine.cc
@@ -73,14 +73,20 @@ class NaiveEngine final : public Engine {
     LOG(INFO) << "Engine shutdown";
     for (size_t i = 0; i < streams_.size(); ++i) {
       if (streams_[i] != nullptr) {
-        // Catch exception for CUDA driver shutdown
-        MSHADOW_CATCH_ERROR(mshadow::DeleteStream(streams_[i]));
+        // If the main Python process is exiting (shutdown_phase_ == true),
+        // there's no need to explicitly release CUDA resources.
+        if (!shutdown_phase_) {
+          // Catch exception for CUDA driver shutdown
+          MSHADOW_CATCH_ERROR(mshadow::DeleteStream(streams_[i]));
+        }
         streams_[i] = nullptr;
       }
     }
     for (size_t i = 0; i < aux_streams_.size(); ++i) {
       if (aux_streams_[i] != nullptr) {
-        delete aux_streams_[i];
+        if (!shutdown_phase_) {
+          delete aux_streams_[i];
+        }
         aux_streams_[i] = nullptr;
       }
     }
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index e3a61a39d6..d753c76ed2 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -46,7 +46,7 @@ class StreamManager {
   }
   RunContext GetRunContext(Context const& ctx);
   RunContext GetIORunContext(Context const& ctx);
-  void Finalize();
+  void Finalize(bool shutdown_phase = false);
  private:
   std::mutex mutex_;
 #if MXNET_USE_CUDA
@@ -146,16 +146,20 @@ StreamManager<kNumGpus, kStreams>::StreamManager() {
 }
 
 template <std::size_t kNumGpus, std::size_t kStreams>
-void StreamManager<kNumGpus, kStreams>::Finalize() {
+void StreamManager<kNumGpus, kStreams>::Finalize(bool shutdown_phase) {
 #if MXNET_USE_CUDA
   for (std::size_t i = 0; i < kNumGpus; ++i) {
     if (gpu_cnt_.at(i) != -1) {
-      for (auto&& primary_stream : gpu_streams_.at(i)) {
-        // Catch exception for CUDA driver shutdown
-        MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(primary_stream));
-      }
-      for (auto&& aux_stream : gpu_aux_streams_.at(i)) {
-        delete aux_stream;
+      // If the main Python process is exiting (shutdown_phase == true),
+      // there's no need to explicitly release CUDA resources.
+      if (!shutdown_phase) {
+        for (auto&& primary_stream : gpu_streams_.at(i)) {
+          // Catch exception for CUDA driver shutdown
+          MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(primary_stream));
+        }
+        for (auto&& aux_stream : gpu_aux_streams_.at(i)) {
+          delete aux_stream;
+        }
       }
       gpu_cnt_.at(i) = -1;
     }
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index 682816d03b..de21870fde 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -426,6 +426,9 @@ class ThreadedEngine : public Engine {
     return bulk_size;
   }
 
+  /*! \brief whether it is during shutdown phase*/
+  std::atomic<bool> shutdown_phase_{false};
+
  private:
   /*! \brief structure for holding bulk execution status */
   struct BulkStatus {
@@ -555,8 +558,6 @@ class ThreadedEngine : public Engine {
   std::atomic<int> pending_{0};
   /*! \brief whether we want to kill the waiters */
   std::atomic<bool> kill_{false};
-  /*! \brief whether it is during shutdown phase*/
-  std::atomic<bool> shutdown_phase_{false};
   /*!\brief show more information from engine actions */
   bool engine_info_{false};
   /*! \brief debug information about wait for var. */
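
Note that beyond adding the flag, the two hunks above move shutdown_phase_ out of ThreadedEngine's private member block: derived engines test it themselves (see the !shutdown_phase_ check in the next file), which a private member would forbid. A minimal sketch of that visibility change (hypothetical names; the member is sketched here as protected, the enclosing access specifier is not shown in the hunk):

    #include <atomic>

    class ThreadedEngineSketch {
     protected:
      // Accessible to derived engines, unlike a private member.
      std::atomic<bool> shutdown_phase_{false};
    };

    class PerDeviceEngineSketch : public ThreadedEngineSketch {
     public:
      bool ShouldReleaseStreams() const { return !shutdown_phase_.load(); }
    };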
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
index c59a06b568..bbe17b1ce7 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -270,10 +270,15 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
       while (task_queue->Pop(&opr_block)) {
         this->ExecuteOprBlock(run_ctx, opr_block);
       }
-      // Catch exception for CUDA driver shutdown
-      MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(stream));
-      if (aux_stream != nullptr)
-        delete aux_stream;
+      // If the main Python process is exiting (shutdown_phase_ == true),
+      // there's no need to explicitly release CUDA resources.
+      if (!shutdown_phase_) {
+        // Catch exception for CUDA driver shutdown
+        MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(stream));
+        if (aux_stream != nullptr) {
+          delete aux_stream;
+        }
+      }
 #else
       ready_event->signal();
 #endif
diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc
index 43f72253c1..4ecf9e2f03 100644
--- a/src/engine/threaded_engine_pooled.cc
+++ b/src/engine/threaded_engine_pooled.cc
@@ -54,7 +54,7 @@ class ThreadedEnginePooled : public ThreadedEngine {
   }
 
   void StopNoWait() {
-    streams_->Finalize();
+    streams_->Finalize(shutdown_phase_);
     task_queue_->SignalForKill();
     io_task_queue_->SignalForKill();
     task_queue_ = nullptr;
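
Finalize's new parameter defaults to false, so call sites other than StopNoWait are unchanged and keep doing full cleanup; only the shutdown path opts into skipping the driver calls. A minimal sketch of that default-argument compatibility pattern (hypothetical names):

    #include <cstdio>

    struct StreamManagerSketch {
      void Finalize(bool shutdown_phase = false) {
        if (!shutdown_phase) {
          std::printf("deleting CUDA streams\n");  // normal teardown
        }
        // shutdown_phase == true: skip the CUDA driver entirely.
      }
    };

    int main() {
      StreamManagerSketch mgr;
      mgr.Finalize();      // existing callers: behavior unchanged
      mgr.Finalize(true);  // StopNoWait during process exit
      return 0;
    }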