This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new bed66d20f1 [Disco] Disable splitting nccl communicator in single-group (#17264)
bed66d20f1 is described below
commit bed66d20f1640f814b9f27bcc439f8761e3070cf
Author: Wuwei Lin <[email protected]>
AuthorDate: Sat Aug 10 10:06:17 2024 -0700
[Disco] Disable splitting nccl communicator in single-group (#17264)
---
src/runtime/disco/nccl/nccl.cc | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/runtime/disco/nccl/nccl.cc b/src/runtime/disco/nccl/nccl.cc
index d35fc911c6..a5240aa2b2 100644
--- a/src/runtime/disco/nccl/nccl.cc
+++ b/src/runtime/disco/nccl/nccl.cc
@@ -101,8 +101,12 @@ void InitCCLPerWorker(IntTuple device_ids, std::string unique_id_bytes) {
ncclUniqueId id;
std::memcpy(id.internal, unique_id_bytes.data(), NCCL_UNIQUE_ID_BYTES);
NCCL_CALL(ncclCommInitRank(&ctx->global_comm, worker->num_workers, id, worker->worker_id));
- NCCL_CALL(ncclCommSplit(ctx->global_comm, worker->worker_id / group_size,
- worker->worker_id % group_size, &ctx->group_comm, NULL));
+ if (worker->num_groups == 1) {
+ ctx->group_comm = ctx->global_comm;
+ } else {
+ NCCL_CALL(ncclCommSplit(ctx->global_comm, worker->worker_id / group_size,
+ worker->worker_id % group_size, &ctx->group_comm, NULL));
+ }
}
void AllReduce(NDArray send, ReduceKind reduce_kind, bool in_group, NDArray recv) {