haojin2 commented on a change in pull request #10371: [MXNET-263] Support for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10371#discussion_r181177291
 
 

 ##########
 File path: src/operator/tensor/dot-inl.cuh
 ##########
 @@ -895,6 +988,131 @@ inline void DotCsrRspDnsImpl(const OpContext& ctx,
   });
 }
 
+/*
+ * \brief GPU Impl of dot(dns, csr) = csr
+ *
+ * Placeholder only: the body consists of a single LOG(FATAL) reporting that
+ * dot(dense, csr) = csr is not implemented on GPU. None of the arguments
+ * are read and nothing is written to the output.
+ *
+ * \param ctx operator context (unused)
+ * \param lhs left-hand-side operand (unused)
+ * \param rhs right-hand-side operand (unused)
+ * \param req output request type (unused)
+ * \param ret output array (unused, never written)
+ */
+template<typename gpu>
+inline void DotDnsCsrCsrImpl(const OpContext& ctx,
+                             const TBlob& lhs, const NDArray& rhs,
+                             const OpReqType req, NDArray* ret) {
+  LOG(FATAL) << "dot(dense, csr) = csr is not implemented on GPU";
+}
+
+/*
+ * \brief GPU Impl of dot(dns, csr) = dns and dot(dns, csr.T) = dns
+ */
+template<typename gpu>
+inline void DotDnsCsrDnsImpl(const OpContext& ctx,
+                             const TBlob& dns, const NDArray& rhs,
+                             const OpReqType req, NDArray* ret,
+                             const bool transpose_b) {
+  CHECK_EQ(req, kWriteTo);
+  CHECK_EQ(rhs.storage_type(), kCSRStorage);
+
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using nnvm::dim_t;
+
+  /* Initialize data structures */
+  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+  TBlob csr_data = rhs.data();
+  TBlob csr_indices = rhs.aux_data(csr::kIdx);
+  TBlob csr_indptr = rhs.aux_data(csr::kIndPtr);
+  if (!rhs.storage_initialized()) {
+    FillZerosCsrImpl(s, *ret);
+    return;
+  }
+
+  // if dot(dense, csr) = dns, transform to csc first
+  if (!transpose_b) {
+    // LOG(FATAL) << "dot(dns, csr) = dns not implemented yet";
+    const nnvm::dim_t csr_rows = rhs.shape()[0];
+    const nnvm::dim_t csr_cols = rhs.shape()[1];
+    const nnvm::dim_t dns_rows = dns.shape_[0];
+    const nnvm::dim_t nnz = rhs.storage_shape().Size();
+
+    MSHADOW_SGL_DBL_TYPE_SWITCH(csr_data.type_flag_, DType, {
+      MSHADOW_IDX_TYPE_SWITCH(csr_indices.type_flag_, IType, {
+        MSHADOW_IDX_TYPE_SWITCH(csr_indptr.type_flag_, CType, {
+          DType* csc_data_ptr = NULL;
+          unsigned long long* csc_indices_ptr = NULL;
+          unsigned long long* csc_indptr_ptr = NULL;
+          unsigned long long* col_counters = NULL;
+          size_t ull_mem_size = sizeof(unsigned long long);
+          void* temp_storage = NULL;
+          size_t temp_storage_bytes = 0;
+          CType out_num_rows = ret->shape()[0];
+          CType out_num_cols = ret->shape()[1];
+          // Get necessary temporary storage amount
+          cub::DeviceScan::ExclusiveSum(NULL,
+                                        temp_storage_bytes,
+                                        csc_indices_ptr,
+                                        csc_indices_ptr,
+                                        csr_cols+1,
+                                        Stream<gpu>::GetStream(s));
+          temp_storage_bytes += (ull_mem_size - (temp_storage_bytes % ull_mem_size));
+          Tensor<gpu, 1, char> workspace =
+            ctx.requested[0].get_space_typed<gpu, 1, char>(
+              Shape1(nnz*sizeof(DType) + nnz*ull_mem_size +
+                     2*(csr_cols + 1)*ull_mem_size +
+                     temp_storage_bytes),
+              s);
+          csc_indices_ptr = reinterpret_cast<unsigned long long*>(workspace.dptr_);
+          csc_indptr_ptr = reinterpret_cast<unsigned long long*>(
+                             workspace.dptr_ + nnz*ull_mem_size);
+          col_counters = reinterpret_cast<unsigned long long*>(
+                           workspace.dptr_ + nnz*ull_mem_size + (csr_cols+1)*ull_mem_size);
+          csc_data_ptr = reinterpret_cast<DType*>(workspace.dptr_ + nnz*ull_mem_size +
+                                                  2*(csr_cols+1)*ull_mem_size);
+          temp_storage = reinterpret_cast<void*>(workspace.dptr_ + nnz*sizeof(DType) +
+                                                 nnz*ull_mem_size + 2*(csr_cols+1)*ull_mem_size);
+          mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(
+            s, dns_rows*csr_cols, ret->data().dptr<DType>());
+          // Reset values for indptr, ready for histogramming
+          mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(
+            s, csr_cols + 1, csc_indptr_ptr);
+          // Histogramming on col id
+          mxnet_op::Kernel<CsrTransHistogramKernel, gpu>::Launch(
+            s, nnz, csr_indices.dptr<IType>(), csc_indptr_ptr, nnz);
+          cub::DeviceScan::ExclusiveSum(temp_storage,
+                                        temp_storage_bytes,
+                                        csc_indptr_ptr,
+                                        csc_indptr_ptr,
+                                        csr_cols+1,
+                                        Stream<gpu>::GetStream(s));
+          // Reset values for col_counter, ready for the final transform
+          mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(
+            s, csr_cols+1, col_counters);
 
 Review comment:
   Done

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Reply via email to