This is an automated email from the ASF dual-hosted git repository.
jevans pushed a commit to branch v1.9.x
in repository https://gitbox.apache.org/repos/asf/mxnet.git
The following commit(s) were added to refs/heads/v1.9.x by this push:
new 76d73dbd77 Don't explicitly release CUDA resources at main Python process exit (#21182)
76d73dbd77 is described below
commit 76d73dbd778c9ab4a4584dbc857c8c0694faa162
Author: Dick Carter <[email protected]>
AuthorDate: Tue Feb 28 10:34:39 2023 -0800
Don't explicitly release CUDA resources at main Python process exit (#21182)
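
By the time Python interpreter teardown reaches the engine destructors, the CUDA driver itself may already be shutting down; the old code anticipated this by wrapping stream deletion in MSHADOW_CATCH_ERROR. Since the OS reclaims all device resources at process exit anyway, the engine now skips the explicit driver calls entirely once its shutdown flag is set. A minimal standalone sketch of that guard pattern (hypothetical names, not mxnet's actual classes):

    #include <atomic>
    #include <cstdio>

    class EngineSketch {
     public:
      // Called once the main process is known to be exiting,
      // e.g. from a Python atexit hook.
      void NotifyShutdown() { shutdown_phase_.store(true); }

      ~EngineSketch() {
        if (!shutdown_phase_.load()) {
          // Normal teardown: the CUDA driver is still alive, so
          // release streams explicitly (stand-in for DeleteStream).
          std::printf("releasing CUDA streams\n");
        }
        // During shutdown: leak deliberately; process exit reclaims
        // the resources without touching the driver.
      }

     private:
      std::atomic<bool> shutdown_phase_{false};
    };

    int main() {
      EngineSketch engine;
      engine.NotifyShutdown();
      return 0;  // destructor now makes no CUDA driver calls
    }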
---
 src/engine/naive_engine.cc              | 12 +++++++++---
 src/engine/stream_manager.h             | 20 ++++++++++++--------
 src/engine/threaded_engine.h            |  5 +++--
 src/engine/threaded_engine_perdevice.cc | 13 +++++++++----
 src/engine/threaded_engine_pooled.cc    |  2 +-
 5 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc
index e7412fa6f4..1146053537 100644
--- a/src/engine/naive_engine.cc
+++ b/src/engine/naive_engine.cc
@@ -73,14 +73,20 @@ class NaiveEngine final : public Engine {
     LOG(INFO) << "Engine shutdown";
     for (size_t i = 0; i < streams_.size(); ++i) {
       if (streams_[i] != nullptr) {
-        // Catch exception for CUDA driver shutdown
-        MSHADOW_CATCH_ERROR(mshadow::DeleteStream(streams_[i]));
+        // If the main Python process is exiting (shutdown_phase_ == true),
+        // there's no need to explicitly release CUDA resources.
+        if (!shutdown_phase_) {
+          // Catch exception for CUDA driver shutdown
+          MSHADOW_CATCH_ERROR(mshadow::DeleteStream(streams_[i]));
+        }
         streams_[i] = nullptr;
       }
     }
     for (size_t i = 0; i < aux_streams_.size(); ++i) {
       if (aux_streams_[i] != nullptr) {
-        delete aux_streams_[i];
+        if (!shutdown_phase_) {
+          delete aux_streams_[i];
+        }
         aux_streams_[i] = nullptr;
       }
     }
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index e3a61a39d6..d753c76ed2 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -46,7 +46,7 @@ class StreamManager {
   }
   RunContext GetRunContext(Context const& ctx);
   RunContext GetIORunContext(Context const& ctx);
-  void Finalize();
+  void Finalize(bool shutdown_phase = false);
  private:
   std::mutex mutex_;
 #if MXNET_USE_CUDA
@@ -146,16 +146,20 @@ StreamManager<kNumGpus, kStreams>::StreamManager() {
 }
 
 template <std::size_t kNumGpus, std::size_t kStreams>
-void StreamManager<kNumGpus, kStreams>::Finalize() {
+void StreamManager<kNumGpus, kStreams>::Finalize(bool shutdown_phase) {
 #if MXNET_USE_CUDA
   for (std::size_t i = 0; i < kNumGpus; ++i) {
     if (gpu_cnt_.at(i) != -1) {
-      for (auto&& primary_stream : gpu_streams_.at(i)) {
-        // Catch exception for CUDA driver shutdown
-        MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(primary_stream));
-      }
-      for (auto&& aux_stream : gpu_aux_streams_.at(i)) {
-        delete aux_stream;
+      // If the main Python process is exiting (shutdown_phase == true),
+      // there's no need to explicitly release CUDA resources.
+      if (!shutdown_phase) {
+        for (auto&& primary_stream : gpu_streams_.at(i)) {
+          // Catch exception for CUDA driver shutdown
+          MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(primary_stream));
+        }
+        for (auto&& aux_stream : gpu_aux_streams_.at(i)) {
+          delete aux_stream;
+        }
       }
       gpu_cnt_.at(i) = -1;
     }
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index 682816d03b..de21870fde 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -426,6 +426,9 @@ class ThreadedEngine : public Engine {
     return bulk_size;
   }
 
+  /*! \brief whether it is during shutdown phase*/
+  std::atomic<bool> shutdown_phase_{false};
+
  private:
   /*! \brief structure for holding bulk execution status */
   struct BulkStatus {
@@ -555,8 +558,6 @@ class ThreadedEngine : public Engine {
   std::atomic<int> pending_{0};
   /*! \brief whether we want to kill the waiters */
   std::atomic<bool> kill_{false};
-  /*! \brief whether it is during shutdown phase*/
-  std::atomic<bool> shutdown_phase_{false};
   /*!\brief show more information from engine actions */
   bool engine_info_{false};
   /*! \brief debug information about wait for var. */
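
Note that beyond adding the flag, the two hunks above move shutdown_phase_ out of ThreadedEngine's private member block: derived engines test it themselves (see the !shutdown_phase_ check in the next file), which a private member would forbid. A minimal sketch of that visibility change (hypothetical names; the member is sketched here as protected, the enclosing access specifier is not shown in the hunk):

    #include <atomic>

    class ThreadedEngineSketch {
     protected:
      // Accessible to derived engines, unlike a private member.
      std::atomic<bool> shutdown_phase_{false};
    };

    class PerDeviceEngineSketch : public ThreadedEngineSketch {
     public:
      bool ShouldReleaseStreams() const { return !shutdown_phase_.load(); }
    };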
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
index c59a06b568..bbe17b1ce7 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -270,10 +270,15 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
       while (task_queue->Pop(&opr_block)) {
         this->ExecuteOprBlock(run_ctx, opr_block);
       }
-      // Catch exception for CUDA driver shutdown
-      MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(stream));
-      if (aux_stream != nullptr)
-        delete aux_stream;
+      // If the main Python process is exiting (shutdown_phase_ == true),
+      // there's no need to explicitly release CUDA resources.
+      if (!shutdown_phase_) {
+        // Catch exception for CUDA driver shutdown
+        MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(stream));
+        if (aux_stream != nullptr) {
+          delete aux_stream;
+        }
+      }
 #else
       ready_event->signal();
 #endif
diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc
index 43f72253c1..4ecf9e2f03 100644
--- a/src/engine/threaded_engine_pooled.cc
+++ b/src/engine/threaded_engine_pooled.cc
@@ -54,7 +54,7 @@ class ThreadedEnginePooled : public ThreadedEngine {
   }
 
   void StopNoWait() {
-    streams_->Finalize();
+    streams_->Finalize(shutdown_phase_);
     task_queue_->SignalForKill();
     io_task_queue_->SignalForKill();
     task_queue_ = nullptr;
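
Finalize's new parameter defaults to false, so call sites other than StopNoWait are unchanged and keep doing full cleanup; only the shutdown path opts into skipping the driver calls. A minimal sketch of that default-argument compatibility pattern (hypothetical names):

    #include <cstdio>

    struct StreamManagerSketch {
      void Finalize(bool shutdown_phase = false) {
        if (!shutdown_phase) {
          std::printf("deleting CUDA streams\n");  // normal teardown
        }
        // shutdown_phase == true: skip the CUDA driver entirely.
      }
    };

    int main() {
      StreamManagerSketch mgr;
      mgr.Finalize();      // existing callers: behavior unchanged
      mgr.Finalize(true);  // StopNoWait during process exit
      return 0;
    }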