stefanhenneking commented on a change in pull request #7226: Extending the GPU 
dot operator
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r131979993
 
 

 ##########
 File path: src/operator/tensor/dot-inl.cuh
 ##########
 @@ -9,66 +9,163 @@
 #include <mxnet/base.h>
 #include <mxnet/operator.h>
 
+#include <cub/cub.cuh>
+
 namespace mxnet {
 namespace op {
-using mshadow::cuda::kBaseThreadNum;
 
 /*!
- * \brief Scalar kernel of dot(csr, dns1) = dns2
+ * \brief GPU auxiliary kernel to flag non-zero rows of an rsp matrix with 
indices.
+ * Parallelized by matrix rows: 1 thread/row
+ */
+struct SetRspRowFlgKernel {
+  /*!
+   * \brief For each non-zero row of an rsp matrix, write its 1-based position
+   * in the compact row storage into a dense per-row flag array.
+   * Stores tid+1 rather than tid — presumably so a value of 0 in a
+   * zero-initialized row_flg array marks rows absent from row_idx (TODO confirm
+   * against the callers that allocate row_flg).
+   * \param tid      global thread id (one thread per non-zero row)
+   * \param row_flg  array to flag storage indices of non-zero rows
+   * \param row_idx  rsp matrix row index array storing indices of non-zero rows
+   * \param nnr      rsp matrix number of non-zero rows (storage shape)
+   */
+  template<typename RType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             RType* row_flg,
+                                             const RType* row_idx,
+                                             const nnvm::dim_t nnr) {
+    // Guard clause: the launch grid may contain more threads than rows.
+    if (tid >= nnr) return;
+    const RType row = row_idx[tid];
+    row_flg[row] = static_cast<RType>(tid + 1);
+  }
+};
+
+/*!
+ * \brief GPU auxiliary kernel for marking non-zero columns of a csr matrix.
+ * Parallelized by matrix rows: 1 warp/row
+ */
+struct MarkCsrZeroColsWarpKernel {
+  /*!
+   * \brief
+   * \param tid       global thread id
+   * \param col_idx   csr matrix column indices
+   * \param indptr    csr matrix row index pointer
+   * \param num_rows  csr matrix number of rows
+   * \param num_cols  csr matrix number of columns
+   */
+  template<typename CType, typename IType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             nnvm::dim_t* flg,
+                                             const CType* col_idx,
+                                             const IType* indptr,
+                                             const nnvm::dim_t num_rows,
+                                             const nnvm::dim_t num_cols) {
+    typedef unsigned long long int uint64_cu;
+    static_assert(sizeof(uint64_cu) == sizeof(nnvm::dim_t), "unexpected sizeof 
dim_t");
+
+    const nnvm::dim_t warp_id = tid / 32;      // global warp   id
+    const nnvm::dim_t lane    = tid & (32-1);  // local  thread id within warp
+
+    if (warp_id < num_rows) {
+      uint64_cu zero = 0;
+      uint64_cu one = 1;
+      for (IType j = indptr[warp_id]+lane; j < indptr[warp_id+1]; j+=32) {
 
 Review comment:
   I can try out the implementation - just for fun - but nonetheless race 
conditions should be avoided. Having said that, looking at the more detailed 
kernel runtimes (that I didn't publish above), you'll find that this Kernel 
accounts for less than 1% of the total runtime of the transpose csr dot 
operator. Further optimization would have to happen in the matrix multiply 
itself as it uses up >99% of the total runtime.
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to