[GitHub] haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU

2018-04-19 Thread GitBox
haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support 
for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10371#discussion_r182918193
 
 

 ##
 File path: src/operator/tensor/dot.cc
 ##
 @@ -51,13 +150,19 @@ NNVM_REGISTER_OP(dot)
 dot(x,y)[0,0,1,1] = 0
 sum(x[0,0,:]*y[:,1,1]) = 0
 
-The storage type of ``dot`` output depends on storage types of inputs and transpose options:
+The storage type of ``dot`` output depends on storage types of inputs, transpose options and the given
+hint for the output storage type:
 
+Implemented sparse operations include:
 - dot(csr, default) = default
-- dot(csr.T, default) = row_sparse
+- dot(csr, default, transpose_a=True) = row_sparse
 - dot(csr, row_sparse) = default
-- dot(default, csr) = csr
-- otherwise, ``dot`` generates output with default storage
+- dot(default, csr) = csr on CPU only
+- dot(default, csr) = dense on GPU only
 
 Review comment:
   Done
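For readers scanning the rules above: the two new GPU cases both produce dense outputs. A minimal NumPy/SciPy sketch of the math being dispatched (illustrative only, not MXNet code):

```python
import numpy as np
import scipy.sparse as sp

dns = np.random.rand(4, 6).astype(np.float32)   # lhs: default (dense) storage

# dot(dns, csr) -> dense, shape (4, 5)
csr = sp.random(6, 5, density=0.3, format='csr', dtype=np.float32)
out = np.asarray(dns @ csr)

# dot(dns, csr.T) -> dense, shape (4, 5); rhs stores the transposed operand
csr_b = sp.random(5, 6, density=0.3, format='csr', dtype=np.float32)
out_t = np.asarray(dns @ csr_b.T)
```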




[GitHub] haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU

2018-04-19 Thread GitBox
haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support 
for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10371#discussion_r182918204
 
 

 ##
 File path: src/operator/tensor/dot-inl.cuh
 ##
 @@ -442,6 +445,105 @@ struct DotCsrRspDnsScalarKernel {
   }
 };
 
+/*!
+ * \brief GPU Kernel to scatter row ids to the corresponding entries
+ * \param tid         global thread id
+ * \param csr_indptr  indptr array of csr
+ * \param csr_rows    array of row ids of csr elements
+ * \param num_rows    total number of rows in csr matrix
+ * Parallelization by output elements: 1 thread/row
+ */
+struct CsrRowScatterKernel {
+  template<typename CType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             const CType* csr_indptr,
+                                             CType* csr_rows,
+                                             const nnvm::dim_t num_rows) {
+    if (tid < num_rows) {
+      for (CType i = csr_indptr[tid]; i < csr_indptr[tid+1]; ++i) {
+        csr_rows[i] = tid;
+      }
+    }
+  }
+};
+
+/*!
+ * \brief GPU Kernel for generating the transposed csr matrix
+ * \param tid   global thread id
 
 Review comment:
   Done
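The scatter above is the standard expansion of a CSR indptr into one explicit row id per nonzero (the row half of a COO view). A minimal NumPy sketch of the same computation, one loop iteration playing the role of one GPU thread:

```python
import numpy as np

def csr_row_scatter(indptr):
    """Expand a CSR indptr into explicit row ids, one per nonzero,
    mirroring CsrRowScatterKernel (one thread per row)."""
    num_rows = len(indptr) - 1
    rows = np.empty(indptr[-1], dtype=indptr.dtype)
    for tid in range(num_rows):               # = the kernel's tid
        rows[indptr[tid]:indptr[tid + 1]] = tid
    return rows

# 3-row CSR with nonzeros in rows 0, 0, 2, 2:
print(csr_row_scatter(np.array([0, 2, 2, 4])))   # -> [0 0 2 2]
```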




[GitHub] haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU

2018-04-19 Thread GitBox
haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support 
for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10371#discussion_r182918157
 
 

 ##
 File path: src/operator/tensor/dot-inl.cuh
 ##
 @@ -895,6 +997,150 @@ inline void DotCsrRspDnsImpl(const OpContext& ctx,
   });
 }
 
+/*
+ * \brief GPU Impl of dot(dns, csr) = csr
+ */
+template<typename xpu>
+inline void DotDnsCsrCsrImpl(const OpContext& ctx,
+                             const TBlob& lhs, const NDArray& rhs,
+                             const OpReqType req, NDArray* ret) {
+  LOG(FATAL) << "dot(dense, csr) = csr is not implemented on GPU";
+}
+
+/*
+ * \brief GPU Impl of dot(dns, csr) = dns and dot(dns, csr.T) = dns
+ */
+template<typename xpu>
+inline void DotDnsCsrDnsImpl(const OpContext& ctx,
+                             const TBlob& dns, const NDArray& rhs,
+                             const OpReqType req, NDArray* ret,
+                             const bool transpose_b) {
+  CHECK_EQ(req, kWriteTo);
+  CHECK_EQ(rhs.storage_type(), kCSRStorage);
+
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using nnvm::dim_t;
+
+  /* Initialize data structures */
+  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+  TBlob csr_data = rhs.data();
+  TBlob csr_indices = rhs.aux_data(csr::kIdx);
+  TBlob csr_indptr = rhs.aux_data(csr::kIndPtr);
+  if (!rhs.storage_initialized()) {
+    FillZerosCsrImpl(s, *ret);
+    return;
+  }
+
+  MSHADOW_SGL_DBL_TYPE_SWITCH(csr_data.type_flag_, DType, {    // data type
+    MSHADOW_IDX_TYPE_SWITCH(csr_indices.type_flag_, IType, {   // col idx type
+      MSHADOW_IDX_TYPE_SWITCH(csr_indptr.type_flag_, CType, {  // indptr type
+        const CType out_num_rows = ret->shape()[0];
 
 Review comment:
   Done
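As a rough reference for what this wrapper computes, here is a NumPy/SciPy sketch (assuming kWriteTo simply overwrites the output; the zero-nnz early-out is modeled as a dense zero matrix, which appears to be the intent of the FillZeros call in the diff; names are illustrative):

```python
import numpy as np
import scipy.sparse as sp

def dot_dns_csr_dns_ref(dns, rhs_csr, transpose_b=False):
    """Reference semantics for DotDnsCsrDnsImpl: the output is always dense."""
    if rhs_csr.nnz == 0:   # mirrors !rhs.storage_initialized()
        out_cols = rhs_csr.shape[0] if transpose_b else rhs_csr.shape[1]
        return np.zeros((dns.shape[0], out_cols), dtype=dns.dtype)
    rhs = rhs_csr.T if transpose_b else rhs_csr
    return np.asarray(dns @ rhs)
```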




[GitHub] haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU

2018-04-13 Thread GitBox
haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support 
for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10371#discussion_r181446729
 
 

 ##
 File path: tests/python/unittest/test_sparse_operator.py
 ##
 @@ -1286,10 +1309,18 @@ def test_sparse_dot_zero_output(lhs_shape, trans_lhs, rhs_num_cols):
         test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(5, 10)), 'default', False, lhs_d, rhs_d)  # test gpu SpMM
         test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(5, 10)), 'default', True, lhs_d, rhs_d)  # (scalar kernel)
         test_dot_dns_csr(lhs_shape, (lhs_shape[1], rnd.randint(50, 200)), lhs_d, lhs_d)
+        test_dot_dns_csr(lhs_shape, (rnd.randint(50, 200), lhs_shape[1]), lhs_d, lhs_d, trans_rhs=True)
         for rhs_d in density:
             test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(1, 10)), 'row_sparse', False, lhs_d, rhs_d)
             test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(1, 10)), 'row_sparse', True, lhs_d, rhs_d)
-
+            test_infer_forward_stype(lhs_shape, (lhs_shape[1], rnd.randint(10, 20)),
+                                     lhs_d, rhs_d, False, False)
+            test_infer_forward_stype(lhs_shape, (rnd.randint(10, 20), lhs_shape[1]),
+                                     lhs_d, rhs_d, False, True)
+            test_infer_forward_stype(lhs_shape, (lhs_shape[0], rnd.randint(10, 20)),
+                                     lhs_d, rhs_d, True, False)
+            test_infer_forward_stype(lhs_shape, (rnd.randint(10, 20), lhs_shape[0]),
+                                     lhs_d, rhs_d, True, True)
 
 Review comment:
   Sure
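A quick shape sanity check for the new trans_rhs cases (standalone NumPy, not the test helpers themselves): for dot(dns, csr.T) the rhs is created with lhs_shape[1] as its second dimension, so its transpose lines up with the lhs.

```python
import numpy as np

lhs = np.zeros((3, 7))

rhs = np.zeros((7, 4))          # dot(dns, csr):   inner dims match directly
assert (lhs @ rhs).shape == (3, 4)

rhs_t = np.zeros((4, 7))        # dot(dns, csr.T): rhs has lhs cols as its 2nd dim
assert (lhs @ rhs_t.T).shape == (3, 4)
```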




[GitHub] haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU

2018-04-05 Thread GitBox
haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support 
for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10371#discussion_r179593860
 
 

 ##
 File path: src/operator/tensor/dot-inl.h
 ##
 @@ -235,13 +235,21 @@ inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                      DispatchMode::kFComputeEx);
   }
   if (!dispatched && lhs_stype == kDefaultStorage && rhs_stype == kCSRStorage &&
-      !param.transpose_a && !param.transpose_b) {
 
 Review comment:
   Working on that now.
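For context, the guard being relaxed here restricted the dense-lhs/CSR-rhs branch to the non-transposed case. A hypothetical Python paraphrase of the storage-type rules from this PR's operator docs (not the actual DotForwardInferStorageType logic):

```python
def infer_dot_forward_stype(lhs_stype, rhs_stype, transpose_a, on_gpu):
    """Paraphrase of the rules listed in the dot docstring of this PR."""
    if lhs_stype == 'csr' and rhs_stype == 'default':
        return 'row_sparse' if transpose_a else 'default'
    if lhs_stype == 'csr' and rhs_stype == 'row_sparse':
        return 'default'
    if lhs_stype == 'default' and rhs_stype == 'csr':
        return 'default' if on_gpu else 'csr'   # GPU computes a dense output
    return 'default'   # everything else falls back to default storage
```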




[GitHub] haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU

2018-04-04 Thread GitBox
haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support 
for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10371#discussion_r179237403
 
 

 ##
 File path: src/operator/tensor/dot.cu
 ##
 @@ -23,10 +23,142 @@
  */
 
 #include "./dot-inl.h"
+#include <cub/cub.cuh>
 
 namespace mxnet {
 namespace op {
 
+template<typename xpu>
+inline void DotDnsCsrCsrImpl(const OpContext& ctx,
+                             const TBlob& lhs, const NDArray& rhs,
+                             const OpReqType req, NDArray* ret) {
+  LOG(FATAL) << "dot(dense, csr) = csr is not implemented on GPU";
+}
+
+/*
+ * \brief GPU Impl of dot(dns, csr) = dns and dot(dns, csr.T) = dns
+ */
+template<typename xpu>
+inline void DotDnsCsrDnsImpl(const OpContext& ctx,
+                             const TBlob& dns, const NDArray& rhs,
+                             const OpReqType req, NDArray* ret,
+                             const bool transpose_b) {
+  CHECK_EQ(req, kWriteTo);
+  CHECK_EQ(rhs.storage_type(), kCSRStorage);
+
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using nnvm::dim_t;
+
+  /* Initialize data structures */
+  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+  TBlob csr_data = rhs.data();
+  TBlob csr_indices = rhs.aux_data(csr::kIdx);
+  TBlob csr_indptr = rhs.aux_data(csr::kIndPtr);
+  if (!rhs.storage_initialized()) {
+    FillZerosCsrImpl(s, *ret);
+    return;
+  }
+
+  // if dot(dense, csr) = dns, transform to csc first
+  if (!transpose_b) {
+    LOG(FATAL) << "dot(dns, csr) = dns not implemented yet";
+    const nnvm::dim_t csr_rows = rhs.shape()[0];
+    const nnvm::dim_t csr_cols = rhs.shape()[1];
+    const nnvm::dim_t nnz = rhs.storage_shape().Size();
+
+    MSHADOW_SGL_DBL_TYPE_SWITCH(csr_data.type_flag_, DType, {
+      MSHADOW_IDX_TYPE_SWITCH(csr_indices.type_flag_, IType, {
+        MSHADOW_IDX_TYPE_SWITCH(csr_indptr.type_flag_, CType, {
+          mshadow::Stream<cpu>* cpu_s = ctx.get_stream<cpu>();
+          Tensor<gpu, 1, DType> csc_data =
+            ctx.requested[0].get_space_typed<gpu, 1, DType>(
+              Shape1(nnz), s);
+          Tensor<gpu, 1, IType> csc_indices =
+            ctx.requested[0].get_space_typed<gpu, 1, IType>(
+              Shape1(nnz), s);
+          Tensor<gpu, 1, CType> csc_indptr =
+            ctx.requested[0].get_space_typed<gpu, 1, CType>(
+              Shape1(csr_cols + 1), s);
+          DType data_buf[nnz] = {0};
+          Tensor<cpu, 1, DType> csc_data_c =
+            ctx.requested[0].get_space_typed<cpu, 1, DType>(
+              Shape1(nnz), cpu_s);
+          /* Tensor<cpu, 1, IType> csc_indices_c = */
+          /*   ctx.requested[0].get_space_typed<cpu, 1, IType>( */
+          /*     Shape1(nnz), cpu_s); */
+          /* Tensor<cpu, 1, CType> csc_indptr_c = */
+          /*   ctx.requested[0].get_space_typed<cpu, 1, CType>( */
+          /*     Shape1(csr_cols + 1), cpu_s); */
+          // reset values for indptr, ready for histogramming
+          mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(
+            s, csr_cols + 1, csc_indptr.dptr_);
+          // histogramming on col id
+          mxnet_op::Kernel<CsrTransHistogramKernel, gpu>::Launch(
+            s, nnz, csr_indices.dptr<IType>(),
+            csc_indptr.dptr_, nnz);
+          size_t temp_storage_bytes = 0;
+          // Get necessary temporary storage amount
+          cub::DeviceScan::ExclusiveSum(nullptr,
+                                        temp_storage_bytes,
+                                        csc_indptr.dptr_,
+                                        csc_indptr.dptr_,
+                                        csr_cols+1,
+                                        Stream<gpu>::GetStream(s));
+          LOG(INFO) << "temp storage bytes: " << temp_storage_bytes;
+          Tensor<gpu, 1, char> workspace =
+            ctx.requested[0].get_space_typed<gpu, 1, char>(
+              Shape1(temp_storage_bytes), s);
+          cub::DeviceScan::ExclusiveSum(workspace.dptr_,
+                                        temp_storage_bytes,
+                                        csc_indptr.dptr_,
+                                        csc_indptr.dptr_,
+                                        csr_cols+1,
+                                        Stream<gpu>::GetStream(s));
+          Tensor<gpu, 1, int> col_counters =
+            ctx.requested[0].get_space_typed<gpu, 1, int>(Shape1(csr_cols+1), s);
+          // reset values for col counters, ready for the scatter pass
+          mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(
+            s, csr_cols+1, col_counters.dptr_);
+          mxnet_op::Kernel<CscDataIndicesKernel, gpu>::Launch(
+            s, csr_rows, csr_data.dptr<DType>(), csr_indices.dptr<IType>(),
+            csr_indptr.dptr<CType>(), csc_data.dptr_, csc_indices.dptr_,
+            csc_indptr.dptr_, col_counters.dptr_, csr_rows, csr_cols);
+
+          cudaMemcpy(data_buf, csc_data.dptr_, nnz * sizeof(DType), cudaMemcpyDeviceToHost);
+          for 
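The !transpose_b branch above is the classic three-pass CSR→CSC transpose: a per-column histogram, a prefix sum, and a counter-based scatter. A CPU-side NumPy sketch of that algorithm with the standard indexing conventions (the WIP diff may still be converging on the exact exclusive-scan off-by-one details):

```python
import numpy as np
import scipy.sparse as sp

def csr_to_csc(data, indices, indptr, num_rows, num_cols):
    """CPU sketch of the GPU pipeline above: histogram the column ids,
    prefix-sum the counts into csc_indptr, then scatter each nonzero."""
    nnz = len(data)
    # 1) histogram on column ids (CsrTransHistogramKernel, minus the atomics)
    counts = np.zeros(num_cols + 1, dtype=indptr.dtype)
    for j in indices:
        counts[j + 1] += 1
    # 2) prefix sum turns the shifted counts into the CSC indptr
    #    (the diff uses cub::DeviceScan for this step)
    csc_indptr = np.cumsum(counts)
    # 3) scatter every nonzero to its slot, bumping a per-column counter
    #    (CscDataIndicesKernel's atomicAdd on col_counters)
    csc_data = np.empty(nnz, dtype=data.dtype)
    csc_indices = np.empty(nnz, dtype=indices.dtype)
    col_counters = np.zeros(num_cols, dtype=np.int64)
    for r in range(num_rows):
        for p in range(indptr[r], indptr[r + 1]):
            j = indices[p]
            dst = csc_indptr[j] + col_counters[j]
            col_counters[j] += 1
            csc_data[dst] = data[p]
            csc_indices[dst] = r          # the row id becomes the CSC index
    return csc_data, csc_indices, csc_indptr

# quick check against scipy's own conversion
m = sp.random(5, 4, density=0.4, format='csr')
d, i, p = csr_to_csc(m.data, m.indices, m.indptr, 5, 4)
ref = m.tocsc()
assert np.allclose(d, ref.data) and (i == ref.indices).all() and (p == ref.indptr).all()
```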

[GitHub] haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU

2018-04-04 Thread GitBox
haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support 
for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10371#discussion_r179237417
 
 

 ##
 File path: src/operator/tensor/dot.cu
 ##
 @@ -23,10 +23,142 @@
  */
 
 #include "./dot-inl.h"
+#include <cub/cub.cuh>
 
 namespace mxnet {
 namespace op {
 
+template<typename xpu>
 
 Review comment:
   Done




[GitHub] haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU

2018-04-04 Thread GitBox
haojin2 commented on a change in pull request #10371: [MXNET-263] [WIP] Support 
for dot(dns, csr) = dns and dot(dns, csr.T) = dns on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10371#discussion_r179234938
 
 

 ##
 File path: src/operator/tensor/dot-inl.cuh
 ##
 @@ -442,6 +442,99 @@ struct DotCsrRspDnsScalarKernel {
   }
 };
 
+/*!
+ * \brief GPU Kernel to re-arrange nnz elements to csc order
+ * Parallelization by output elements: 1 thread/row of csr
+ */
+struct CscDataIndicesKernel {
+  template<typename DType, typename IType, typename CType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             const DType* csr_data,
+                                             const IType* csr_indices,
+                                             const CType* csr_indptr,
+                                             DType* csc_data,
+                                             IType* csc_indices,
+                                             CType* csc_indptr,
+                                             int* workspace,
+                                             const nnvm::dim_t num_rows,
+                                             const nnvm::dim_t num_cols) {
+    if (tid < num_rows) {
+      for (CType i = csr_indptr[tid]; i < csr_indptr[tid + 1]; ++i) {
+        // target column
+        IType target_col = csr_indices[i];
+        int target_offset = atomicAdd(&workspace[target_col], 1);
+        CType new_pos = csc_indptr[target_col] + target_offset;
+        csc_data[new_pos] = csr_data[i];
+        csc_indices[new_pos] = tid;
+      }
+    }
+  }
+};
+
+/*!
+ * \brief GPU Kernel for counting the number of nonzeros in every column
+ * Parallelization by output elements: 1 thread/element
+ */
+struct CsrTransHistogramKernel {
+  /*!
+   * \brief
+   * \param tid          global thread id
+   * \param in_indices   csr matrix column indices
+   * \param out_indptr   indptr of the transposed (csc) matrix, built up here
+   * \param nnz          number of non-zero elements in csr
+   */
+  template<typename IType, typename CType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             const IType* in_indices,
+                                             CType* out_indptr,
+                                             const nnvm::dim_t nnz) {
+    if (tid < nnz) {
+      atomicAdd(&out_indptr[in_indices[tid] + 1], 1);
+    }
+  }
+};
+
+/*!
+ * \brief GPU Kernel of dot(dns, csr.T) = dns
+ * Parallelization by output elements: 1 thread/element
+ */
+struct DotDnsCsrTransDnsKernel {
+  /*!
+   * \brief
+   * \param tid          global thread id
+   * \param lhs_data     lhs dense matrix data
+   * \param rhs_data     csr matrix data
+   * \param rhs_indices  csr matrix column indices
+   * \param rhs_indptr   csr matrix row pointer
+   * \param out          output matrix data
+   * \param lhs_num_cols lhs dns matrix number of columns
+   * \param out_num_rows output dns matrix number of rows
+   * \param out_num_cols output dns matrix number of columns
+   */
+  template<typename DType, typename IType, typename CType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             const DType* lhs_data,
+                                             const DType* rhs_data,
+                                             const IType* rhs_indices,
+                                             const CType* rhs_indptr,
+                                             DType* out,
+                                             const nnvm::dim_t lhs_num_cols,
+                                             const nnvm::dim_t out_num_rows,
+                                             const nnvm::dim_t out_num_cols) {
+    using nnvm::dim_t;
+    if (tid < out_num_rows*out_num_cols) {
+      const dim_t i = static_cast<dim_t>(tid) / out_num_cols;  // i = row this thread computes
+      const dim_t k = static_cast<dim_t>(tid) % out_num_cols;  // k = col this thread computes
+      // Compute inner product of i-th row and k-th col
+      DType sum = 0;
+      for (CType col_id = rhs_indptr[k]; col_id < rhs_indptr[k + 1]; ++col_id) {
+        sum += lhs_data[i * lhs_num_cols + rhs_indices[col_id]] * rhs_data[col_id];
+      }
+      out[i*out_num_cols+k] = sum;
 
 Review comment:
   Good catch, done.
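Element-wise, the kernel above computes out[i, k] as the inner product of lhs row i with rhs row k (i.e. column k of rhs.T). A NumPy sketch of the same loop structure, with one (i, k) pair standing in for one GPU thread:

```python
import numpy as np
import scipy.sparse as sp

def dot_dns_csr_t_ref(lhs, rhs_csr):
    """What DotDnsCsrTransDnsKernel computes: out = lhs @ rhs_csr.T."""
    m = lhs.shape[0]
    n = rhs_csr.shape[0]                      # rows of rhs = cols of out
    indptr, indices, data = rhs_csr.indptr, rhs_csr.indices, rhs_csr.data
    out = np.zeros((m, n), dtype=lhs.dtype)
    for i in range(m):                        # one GPU thread per (i, k)
        for k in range(n):
            s = 0.0
            for p in range(indptr[k], indptr[k + 1]):
                s += lhs[i, indices[p]] * data[p]
            out[i, k] = s
    return out

lhs = np.random.rand(3, 5)
rhs = sp.random(4, 5, density=0.5, format='csr', dtype=np.float64)
assert np.allclose(dot_dns_csr_t_ref(lhs, rhs), lhs @ rhs.T.toarray())
```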

