eric-haibin-lin commented on a change in pull request #11591: [MXNET-331] 
Single machine All Reduce Topology-aware Communication (Updated)
URL: https://github.com/apache/incubator-mxnet/pull/11591#discussion_r201218586
 
 

 ##########
 File path: src/kvstore/gpu_topology.h
 ##########
 @@ -0,0 +1,1067 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef MXNET_KVSTORE_GPU_TOPOLOGY_H_
+#define MXNET_KVSTORE_GPU_TOPOLOGY_H_
+#if MXNET_USE_CUDA
+  #include <cuda_runtime_api.h>
+  #include <cuda.h>
+#endif
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <utility>
+#include <limits>
+#include <random>
+#include <stack>
+#include <queue>
+#include <string>
+#include <unordered_set>
+#include <unordered_map>
+
+#define MXNET_KVSTORE_MAXDEPTH 16
+
+namespace mxnet {
+namespace kvstore {
+
// Prints a labelled vector to stdout: the label, a newline, then all
// elements separated by single spaces, terminated by std::endl.
template <typename T>
inline void PrintVector(const std::string& str, const std::vector<T>& vec) {
  std::cout << str << ":\n";
  for (const T& elem : vec)
    std::cout << elem << " ";
  std::cout << std::endl;
}
+
// Prints a labelled dense matrix to stdout. The matrix is stored row-major
// in a flat vector; each row is printed space-separated on its own line.
template <typename T>
inline void PrintMatrix(const std::string& str, const std::vector<T>& matrix,
    int num_rows, int num_cols) {
  std::cout << str << ":\n";
  for (int r = 0; r < num_rows; ++r) {
    for (int c = 0; c < num_cols; ++c)
      std::cout << matrix[r*num_cols + c] << " ";
    std::cout << std::endl;
  }
}
+
+inline void PrintTopo(const std::string& str, const std::vector<size_t>& 
topo_row,
+    std::vector<size_t> scan_row) {
+  PrintVector("Topo vector", topo_row);
+  PrintVector("Scan vector", scan_row);
+  std::cout << str << ":\n";
+  int depth = scan_row.size()-1;
+  for (int row = 0; row < depth; ++row) {
+    int start = scan_row[row];
+    int end   = scan_row[row+1];
+    for (; start < end; start++) {
+      for (int i = 0; i < (2 << (depth-row-2))+1; ++i) {
+        std::cout << " ";
+      }
+      std::cout << topo_row[start];
+    }
+    std::cout << std::endl;
+  }
+}
+
// Uses BFS to find whether undirected graph is connected or not given its
// adjacency matrix
// Note: only consider matrix values > 1, because we care about whether it is
// connected using only NVLink connections
template <typename T>
inline bool IsConnected(const std::vector<T>& matrix,
                        int                   num_gpus) {
  std::vector<bool> reached(num_gpus, false);
  std::queue<int> frontier;

  // Start the traversal from GPU 0; the graph is undirected, so any start
  // vertex yields the same connectivity answer.
  frontier.push(0);
  reached[0] = true;
  while (!frontier.empty()) {
    const int node = frontier.front();
    frontier.pop();

    for (int next = 0; next < num_gpus; ++next) {
      // Edge weights <= 1 mean self-connection or PCI-E only; an NVLink
      // edge has weight > 1 and is the only kind we traverse.
      const bool nvlink_edge = matrix[node*num_gpus + next] > 1;
      if (next != node && nvlink_edge && !reached[next]) {
        reached[next] = true;
        frontier.push(next);
      }
    }
  }

  // Connected iff the BFS over NVLink edges touched every GPU.
  return std::find(reached.begin(), reached.end(), false) == reached.end();
}
+
// Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
// @input:  devs is a vector of GPU contexts
// @output: matrix is adjacency matrix of link topology graph
//          where edge weight represents relative performance of NVIDIA GPUs
//            0: Self-connection
//            1: PCI-E
//            2: 1 NVLink connection
//            3: 2 NVLink connections
// matrix must be pre-sized to num_gpus*num_gpus by the caller; without CUDA
// it is left untouched and only a warning is logged.
template <typename T>
inline void GetP2PWeight(const std::vector<Context>& devs,
                         std::vector<T>*             matrix) {
  int num_gpus = devs.size();
  int count    = 0;
  // Map matrix row/col index (0..num_gpus-1) onto the actual CUDA device id,
  // since the contexts in devs need not be numbered contiguously from 0.
  std::vector<int> zero_dev_id(num_gpus, -1);
  for (auto d : devs) {
    zero_dev_id[count] = d.dev_id;
    count++;
  }

#if MXNET_USE_CUDA
  cudaDeviceP2PAttr attr;
  attr = cudaDevP2PAttrPerformanceRank;
  // Per-GPU maximum P2P performance rank observed across all peers.
  std::vector<int> max(num_gpus, 0);

  for (int row = 0; row < num_gpus; ++row) {
    for (int col = 0; col < num_gpus; ++col) {
      if (row == col) {
        (*matrix)[row*num_gpus+col] = 0;  // self-connection
      } else {
        int value;
        int row_gpu = zero_dev_id[row];
        int col_gpu = zero_dev_id[col];
        // NOTE(review): the return status of cudaDeviceGetP2PAttribute is
        // not checked; on failure `value` would be read uninitialized --
        // confirm the call cannot fail for valid peer device ids.
        cudaDeviceGetP2PAttribute(&value, attr, row_gpu, col_gpu);
        if (value > max[row])
          max[row] = value;
        // Shift the performance rank up by 1 so that 0 stays reserved for
        // self-connection and 1 means PCI-E only.
        (*matrix)[row*num_gpus+col] = static_cast<T>(value)+1;
      }
    }
  }

  // Check that all GPUs have at least 1 NVLink connection
  int max_value = 0;
  for (unsigned int i = 0; i < max.size(); ++i) {
    if (max[i] > max_value)
      max_value = max[i];
  }
  // NOTE(review): max_value is computed above but never read afterwards --
  // either the "at least 1 NVLink" check is missing or this loop is dead
  // code; confirm the intent.

  // If all GPUs are connected by NVLink, then we can use NVLink only
  // to communicate instead of going over PCI-E
  bool connected = IsConnected(*matrix, num_gpus);

  if (connected) {
    // Drop PCI-E-only edges (weight 1) to 0 so later tree construction uses
    // NVLink links exclusively.
    for (auto& matrix_value : *matrix) {
      matrix_value = (matrix_value == 1) ? 0 : matrix_value;
    }
  }
  PrintMatrix("Weight W", *matrix, num_gpus, num_gpus);
#else
  LOG(WARNING) << "GPU required for link topology";
#endif
}
+
// Dense matrix-vector multiplication
// Assume: matrix is square, stored row-major, with dimension x.size()
//   y = A*x (no accumulate); y must already be sized to x.size()
template <typename T>
inline void gemv(const std::vector<T>&   A,
                 const std::vector<int>& x,
                 std::vector<T>*         y) {
  const int n = x.size();
  for (int i = 0; i < n; ++i) {
    const int row_offset = i*n;   // start of row i in the flat matrix
    T acc = 0;
    for (int j = 0; j < n; ++j)
      acc += A[row_offset + j] * static_cast<T>(x[j]);
    (*y)[i] = acc;
  }
}
+
// Element-wise multiplication between 2 dense vectors
//   w = w * alpha*u
// Exactly u.size() elements are updated; w must be at least that long.
template <typename T>
inline void ewisemult(const std::vector<int>& u,
                      T                       alpha,
                      std::vector<T>*         w) {
  std::transform(u.begin(), u.end(), w->begin(), w->begin(),
                 [alpha](int u_i, const T& w_i) {
                   return w_i * (alpha * static_cast<T>(u_i));
                 });
}
+
+// Computes best 2 nodes a,b to swap given objective function:
+//   g = max_{a \in A, b \in B} D(a) + D(b) - 2*W(a,b)
+//
+// Optimization: Only need to look at upper triangular since weight matrix is
+//   symmetric
+template <typename T>
+inline void FindBestMove(const std::vector<T>&          W,
+                         const std::vector<int>&        P_temp,
+                         const std::vector<T>&          D,
+                  const std::unordered_set<int>& used,
 
 Review comment:
  We don't use multiple spaces / tabs for alignment in MXNet. Please use only one space, 
i.e.
   `const std::vector<T>&<SPACE>W,`
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to