This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new bed66d20f1 [Disco] Disable splitting nccl communicator in single-group (#17264)
bed66d20f1 is described below

commit bed66d20f1640f814b9f27bcc439f8761e3070cf
Author: Wuwei Lin <[email protected]>
AuthorDate: Sat Aug 10 10:06:17 2024 -0700

    [Disco] Disable splitting nccl communicator in single-group (#17264)
---
 src/runtime/disco/nccl/nccl.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/runtime/disco/nccl/nccl.cc b/src/runtime/disco/nccl/nccl.cc
index d35fc911c6..a5240aa2b2 100644
--- a/src/runtime/disco/nccl/nccl.cc
+++ b/src/runtime/disco/nccl/nccl.cc
@@ -101,8 +101,12 @@ void InitCCLPerWorker(IntTuple device_ids, std::string unique_id_bytes) {
   ncclUniqueId id;
   std::memcpy(id.internal, unique_id_bytes.data(), NCCL_UNIQUE_ID_BYTES);
   NCCL_CALL(ncclCommInitRank(&ctx->global_comm, worker->num_workers, id, worker->worker_id));
-  NCCL_CALL(ncclCommSplit(ctx->global_comm, worker->worker_id / group_size,
-                          worker->worker_id % group_size, &ctx->group_comm, NULL));
+  if (worker->num_groups == 1) {
+    ctx->group_comm = ctx->global_comm;
+  } else {
+    NCCL_CALL(ncclCommSplit(ctx->global_comm, worker->worker_id / group_size,
+                            worker->worker_id % group_size, &ctx->group_comm, NULL));
+  }
 }
 
 void AllReduce(NDArray send, ReduceKind reduce_kind, bool in_group, NDArray recv) {

Reply via email to