[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator

2017-08-09
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132261720
 
 

 ##
 File path: src/operator/tensor/dot-inl.h
 ##
 @@ -187,8 +187,8 @@ inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(out_attrs->size(), 1U);
   const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
   // csr has many zero columns, so the result of dot(csr.T, matrix) should be rsp
-  // dot(csr.T,dns)=rsp not yet implemented on gpu
-  if (param.transpose_a && kCSRStorage == (*in_attrs)[0] && ctx.dev_type != Context::kGPU) {
+  // TODO(stefan/haibin): don't enforce kRowSparseStorage if out_attrs has already been set
+  if (param.transpose_a && kCSRStorage == (*in_attrs)[0]) {
     STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage);
 
 Review comment:
   I discussed this with @eric-haibin-lin. We should change all `STORAGE_TYPE_ASSIGN_CHECK` calls to `type_assign` to support the fallback mechanism. In the long run, we should also save the `type_assign` return value to determine whether to fall back to `FCompute` inside `FComputeEx`.
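   A minimal sketch of the direction (illustrative only, not code from the PR; it assumes `type_assign` from src/operator/operator_common.h, which returns false on a conflicting assignment instead of aborting):

      // sketch: infer the output storage type without aborting, so that
      // unsupported stype combinations can fall back later
      bool assigned = true;
      if (param.transpose_a && kCSRStorage == (*in_attrs)[0]) {
        // unlike STORAGE_TYPE_ASSIGN_CHECK, a failed assignment is not fatal here
        assigned = type_assign(&(*out_attrs)[0], kRowSparseStorage);
      }
      // 'assigned' could later tell dispatch whether to fall back from
      // FComputeEx to the dense FCompute path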
 

This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator

2017-08-09
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132054842
 
 

 ##
 File path: benchmark/python/dot.py
 ##
 @@ -0,0 +1,265 @@
+import ctypes
+
+from mxnet.test_utils import *
+import scipy.sparse as sp
+import os
+import time
+import argparse
+
+from mxnet.base import check_call, _LIB
+from util import get_data, estimate_density
+
+parser = argparse.ArgumentParser(description="Benchmark sparse operators",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--num-omp-threads', type=int, default=1,
+                    help='number of omp threads to set in MXNet')
+args = parser.parse_args()
+
+# some data information
+kdda = {
+    'data_mini': 'kdda.t.mini',
+    'data_name': 'kdda.t',
+    'data_origin_name': 'kdda.t.bz2',
+    'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2",
+    'feature_dim': 20216830,
+    'm': 200,
+    'batch_size': [64]
+}
+
+avazu = {
+    'data_mini': 'avazu-app.t.mini',
+    'data_name': 'avazu-app.t',
+    'data_origin_name': 'avazu-app.t.bz2',
+    'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2",
+    'feature_dim': 100,
+    'm': 500,
+    'batch_size': [64, 128]
+}
+
+
+def measure_cost(wait, repeat, f, *args, **kwargs):
+    start = time.time()
+    if wait:
+        for i in range(repeat):
+            (f(*args, **kwargs)).wait_to_read()
+    else:
+        for i in range(repeat):
+            f(*args, **kwargs)
+    end = time.time()
+    diff = end - start
+    return diff / repeat
+
+
+def test_dot_real(data_dict):
+    def get_iter(path, data_shape, batch_size):
+        data_train = mx.io.LibSVMIter(data_libsvm=path,
+                                      data_shape=data_shape,
+                                      batch_size=batch_size)
+        data_iter = iter(data_train)
+        return data_iter
+
+    data_dir = os.path.join(os.getcwd(), 'data')
+
+    path = os.path.join(data_dir, data_dict['data_name'])
+    if not os.path.exists(path):
+        get_data(
+            data_dir,
+            data_dict['data_name'],
+            data_dict['url'],
+            data_dict['data_origin_name']
+        )
+    assert os.path.exists(path)
+
+    k = data_dict['feature_dim']
+    m = data_dict['m']
+    density = estimate_density(path, data_dict['feature_dim'])
+
+    mini_path = os.path.join(data_dir, data_dict['data_mini'])
+    if not os.path.exists(mini_path):
+        os.system("head -n 2000 %r > %r" % (path, mini_path))
+    assert os.path.exists(mini_path)
+
+    print "Running Benchmarking on %r data" % data_dict['data_mini']
+    for batch_size in data_dict['batch_size']:  # iterate through the batch sizes of choice
+        print "batch_size is %d" % batch_size
+        # model
+        data_shape = (k, )
+        train_iter = get_iter(mini_path, data_shape, batch_size)
+        weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m))
+
+        csr_data = []
+        dns_data = []
+        num_batch = 0
+        for batch in train_iter:
+            data = train_iter.getdata()
+            csr_data.append(data)
+            dns_data.append(data.todense())
+            num_batch += 1
+        bag_of_data = [csr_data, dns_data]
+        num_repeat = 5
+        costs = []
+        for d in bag_of_data:
+            weight.wait_to_read()
+            cost = 0.
+            count = 0
+            for d_batch in d:
+                d_batch.wait_to_read()
+                cost += measure_cost(True, num_repeat, mx.nd.dot, d_batch, weight)
+                count += 1
+            costs.append(cost / count)
+        t_sparse = costs[0]
+        t_dense = costs[1]
+        ratio = t_dense / t_sparse
+        print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse')
+        fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f"
+        print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse))
+
+
+def test_dot_synthetic():
+    """benchmark sparse mxnet dot and scipy dot operator with matrices of given density.
+    `t_sparse` is the runtime of the invoked sparse dot operator in ms, while `t_dense` is the
+    runtime of dot(dns, dns), with the same matrices except that they are in default storage type.
+    """
+    # Benchmark MXNet's sparse dot operator
+    def bench_mx_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, repeat):
+        set_default_context(ctx)
+        # Create matrix instances
+        lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den)
+        rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den)
+        lhs_dns = lhs_nd if lhs_stype == 'default' else lhs_nd.todense()
+        rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.todense()
+        # One warm up run, verify correctness
+        out = 

[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator

2017-08-08
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132054939
 
 

 ##
 File path: benchmark/python/dot.py
 ##

[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator

2017-08-08
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132100222
 
 

 ##
 File path: src/operator/tensor/dot-inl.h
 ##
 @@ -484,27 +534,26 @@ inline void DotCsrDnsRspImpl(mshadow::Stream<cpu>* s,
   MSHADOW_TYPE_SWITCH(data_l.type_flag_, DType, {  // data type
     MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, {  // indptr type
       MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, {  // col idx type
-        MSHADOW_IDX_TYPE_SWITCH(row_idx_out.type_flag_, RType, {  // col idx type
-          if (kWriteTo == req) {
-            mxnet_op::Kernel<set_zero, cpu>::Launch(
-                s, data_out.Size(), data_out.dptr<DType>());
-          }
+        MSHADOW_IDX_TYPE_SWITCH(row_idx_out.type_flag_, RType, {  // row idx type
+          dim_t num_threads, seg_len;
 
 Review comment:
   It's not recommended to leave variables uninitialized. It's also good practice to minimize the lifetime of a variable. Could you change this to `dim_t num_threads = data_out.Size()` and move the definition of `seg_len` down to line 545?
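   Concretely, the suggestion amounts to something like this (an illustrative sketch; `set_zero`, `get_num_threads`, and the surrounding names are taken from the quoted diff, not from any final code):

      dim_t num_threads = data_out.Size();  // initialized at its declaration
      if (kWriteTo == req) {
        mxnet_op::Kernel<set_zero, cpu>::Launch(s, num_threads, data_out.dptr<DType>());
      }
      // ... and seg_len declared only where it is first needed:
      num_threads = mxnet_op::get_num_threads<cpu>(data_out.shape_[0]);
      const dim_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads;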
 



[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator

2017-08-08
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132098902
 
 

 ##
 File path: src/operator/tensor/dot-inl.cuh
 ##
 @@ -199,37 +316,203 @@ struct DotCsrTransDnsDnsThreadBlockKernel {
 };
 
 /*!
- * \brief Warp block kernel of dot(csr.T(), dns1) = dns2
+ * \brief GPU warp block kernel of dot(csr.T, dns1) = dns2
  * Parallelization by columns: 1 warp computes one lhs column for all rhs columns
  */
-template
 struct DotCsrTransDnsDnsWarpBlockKernel {
+  /*!
+   * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation.
+   */
   template<typename DType, typename IType, typename CType>
-  __device__ __forceinline__ static void Map(int tid, DType* out, const DType* data_l, const IType* indptr_l,
-                                             const CType* col_idx_l, const DType* data_r,
-                                             const int num_cols_r) {
-    const int warp_id = tid / 32;   // global warp id
-    const int lane = tid & (32-1);  // local thread id within warp
-    const int icol = warp_id;       // lhs column that this warp computes
+  __device__ __forceinline__ static void Map(int tid,
+                                             DType* out,
+                                             const DType* data_l,
+                                             const IType* indptr_l,
+                                             const CType* col_idx_l,
+                                             const DType* data_r,
+                                             const nnvm::dim_t num_cols_r) {
+    using nnvm::dim_t;
+    const dim_t warp_id = tid / 32;   // global warp id
+    const dim_t lane = tid & (32-1);  // local thread id within warp
+    const dim_t icol = warp_id;       // lhs column that this warp computes
 
     // Compute range of nnz elements in this column
-    const int low  = static_cast<int>(indptr_l[icol]);
-    const int high = static_cast<int>(indptr_l[icol+1]);
+    const dim_t low  = static_cast<dim_t>(indptr_l[icol]);
+    const dim_t high = static_cast<dim_t>(indptr_l[icol+1]);
 
     // Iterate through the nnz elements in lhs column
-    for (int j = low+lane; j < high; j+=32) {
-      const int irow = static_cast<int>(col_idx_l[j]);
+    for (dim_t j = low+lane; j < high; j+=32) {
+      const dim_t irow = static_cast<dim_t>(col_idx_l[j]);
       const DType datum_l = data_l[j];
       // Iterate over all rhs columns
-      for (int k = 0; k < num_cols_r; k++) {
+      for (dim_t k = 0; k < num_cols_r; k++) {
         const DType val = datum_l*data_r[icol*num_cols_r+k];
         atomicAdd(static_cast<DType*>(&(out[irow*num_cols_r+k])), val);
       }
     }
   }
 };
 
-inline void DotCsrDnsDnsImpl(mshadow::Stream<gpu>* s,
+/*!
+ * \brief GPU warp kernel of dot(csr.T, dns) = rsp
+ * Parallelization by columns: 1 warp computes one lhs column for one rhs column
+ */
+struct DotCsrTransDnsRspWarpKernel {
+  /*!
+   * \brief
+   * \param tid              global thread id
+   * \param out              output rsp matrix data
+   * \param row_flg_sum_out  inclusive prefix sum array over 0/1 marked row flag array
+   * \param data_l           csr matrix data
+   * \param indptr_l         csr matrix row index pointer
+   * \param col_idx_l        csr matrix column indices
+   * \param data_r           dns matrix data
+   * \param num_cols_r       dns matrix number of columns
+   */
+  template<typename DType, typename IType, typename CType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             DType* out,
+                                             const nnvm::dim_t* row_flg_sum_out,
+                                             const DType* data_l,
+                                             const IType* indptr_l,
+                                             const CType* col_idx_l,
+                                             const DType* data_r,
+                                             const nnvm::dim_t num_cols_r) {
+    using nnvm::dim_t;
+    const dim_t warp_id = tid / 32;           // global warp id
+    const dim_t lane = tid & (32-1);          // local thread id within warp
+    const dim_t icol = warp_id / num_cols_r;  // lhs column that this warp computes
+    const dim_t kcol = warp_id % num_cols_r;  // rhs column that this warp computes
+
+    // Compute range of nnz elements in this column
+    const dim_t low  = static_cast<dim_t>(indptr_l[icol]);
 Review comment:
   I'm confused here. If `icol` is the column id of lhs, how come it is used to index `indptr_l` (which is indexed by the row id of lhs)?
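   For reference, the standard CSR layout that the question turns on (a generic recap, not code from this PR): `indptr` has `num_rows + 1` entries and is always indexed by a row id of the stored matrix.

      // nonzeros of row i of a CSR matrix:
      //   values:      data[indptr[i] .. indptr[i+1])
      //   column ids:  col_idx[indptr[i] .. indptr[i+1])
      const dim_t low  = static_cast<dim_t>(indptr_l[i]);      // first nnz of row i
      const dim_t high = static_cast<dim_t>(indptr_l[i + 1]);  // one past the last nnz of row i

   So indexing `indptr_l` by `icol` is only consistent if `icol` is actually a row id of the stored (untransposed) csr, i.e. a column id of `csr.T`; the in-code comment may simply be using "lhs column" in that transposed sense.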
 



[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator

2017-08-08
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132100597
 
 

 ##
 File path: src/operator/tensor/dot-inl.h
 ##
 @@ -616,28 +677,27 @@ inline void DotCsrRspRspImpl(mshadow::Stream<cpu>* s,
   MSHADOW_TYPE_SWITCH(data_l.type_flag_, DType, {  // data type
     MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, {  // indptr type
       MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, {  // col idx type
-        MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, {  // col idx type
-          if (kWriteTo == req) {
-            mxnet_op::Kernel<set_zero, cpu>::Launch(
-                s, data_out.Size(), data_out.dptr<DType>());
-          }
-          int num_threads = mxnet_op::get_num_threads<cpu>(data_out.shape_[0]);
-          size_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads;
+        MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, {  // row idx type
+          dim_t num_threads, seg_len;
 
 Review comment:
   Same here: please initialize `num_threads` at its declaration and declare `seg_len` where it is first used.
 




[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator

2017-08-08
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132099745
 
 

 ##
 File path: src/operator/tensor/dot-inl.h
 ##
 @@ -187,8 +187,8 @@ inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(out_attrs->size(), 1U);
   const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
   // csr has many zero columns, so the result of dot(csr.T, matrix) should be rsp
-  // dot(csr.T,dns)=rsp not yet implemented on gpu
-  if (param.transpose_a && kCSRStorage == (*in_attrs)[0] && ctx.dev_type != Context::kGPU) {
+  // TODO(stefan/haibin): don't enforce kRowSparseStorage if out_attrs has already been set
+  if (param.transpose_a && kCSRStorage == (*in_attrs)[0]) {
     STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage);
 
 Review comment:
   Could you change it as @eric-haibin-lin said? `STORAGE_TYPE_ASSIGN_CHECK` would abort if the assignment fails. We want fallback for all unsupported combinations of stypes, so there should be no abort at the storage inference stage.
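   Roughly, the behavioral difference (an approximation, not the macro's literal expansion in src/operator/operator_common.h):

      // STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage) acts like:
      //   CHECK(type_assign(&(*out_attrs)[0], kRowSparseStorage))
      //     << "storage type inconsistency";  // fatal: aborts on conflict
      // whereas a bare type_assign merely reports the conflict:
      if (!type_assign(&(*out_attrs)[0], kRowSparseStorage)) {
        // non-fatal: dispatch can later fall back to the dense FCompute path
      }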
 



[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator

2017-08-08
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132054895
 
 

 ##
 File path: benchmark/python/dot.py
 ##

[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator

2017-08-08
URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132052984
 
 

 ##
 File path: benchmark/python/dot.py
 ##
 @@ -0,0 +1,265 @@
+def measure_cost(wait, repeat, f, *args, **kwargs):
 
 Review comment:
   I got the point of adding `wait` as an argument here from reading the code below. Could you add a comment explaining that `wait=True` is for the mxnet benchmark and `False` for the scipy one?
 
