[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132261720 ## File path: src/operator/tensor/dot-inl.h ## @@ -187,8 +187,8 @@ inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const DotParam& param = nnvm::get(attrs.parsed); // csr has many zero columns, so the result of dot(csr.T, matrix) should be rsp - // dot(csr.T,dns)=rsp not yet implemented on gpu - if (param.transpose_a && kCSRStorage == (*in_attrs)[0] && ctx.dev_type != Context::kGPU) { + // TODO(stefan/haibin): don't enforce kRowSparseStorage if out_attrs has already been set + if (param.transpose_a && kCSRStorage == (*in_attrs)[0]) { STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage); Review comment: I discussed this with @eric-haibin-lin. We should change all `STORAGE_TYPE_ASSIGN_CHECK` to `type_assign` to support the fallback mechanism. In the long run, we should also save the `type_assign` return value to determine whether to fall back to `FCompute` inside `FComputeEx`. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132054842 ## File path: benchmark/python/dot.py ## @@ -0,0 +1,265 @@ +import ctypes + +from mxnet.test_utils import * +import scipy.sparse as sp +import os +import time +import argparse + +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +parser = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +# some data information +kdda = { +'data_mini': 'kdda.t.mini', +'data_name': 'kdda.t', +'data_origin_name': 'kdda.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2";, +'feature_dim': 20216830, +'m': 200, +'batch_size': [64] +} + +avazu = { +'data_mini': 'avazu-app.t.mini', +'data_name': 'avazu-app.t', +'data_origin_name': 'avazu-app.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2";, +'feature_dim': 100, +'m': 500, +'batch_size': [64, 128] +} + + +def measure_cost(wait, repeat, f, *args, **kwargs): +start = time.time() +if wait: +for i in range(repeat): +(f(*args, **kwargs)).wait_to_read() +else: +for i in range(repeat): +f(*args, **kwargs) +end = time.time() +diff = end - start +return diff / repeat + + +def test_dot_real(data_dict): +def get_iter(path, data_shape, batch_size): +data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) +data_iter = iter(data_train) +return data_iter + +data_dir = os.path.join(os.getcwd(), 'data') + +path = os.path.join(data_dir, data_dict['data_name']) +if not os.path.exists(path): +get_data( +data_dir, +data_dict['data_name'], +data_dict['url'], +data_dict['data_origin_name'] +) +assert os.path.exists(path) + +k 
= data_dict['feature_dim'] +m = data_dict['m'] +density = estimate_density(path, data_dict['feature_dim']) + +mini_path = os.path.join(data_dir, data_dict['data_mini']) +if not os.path.exists(mini_path): +os.system("head -n 2000 %r > %r" % (path, mini_path)) +assert os.path.exists(mini_path) + +print "Running Benchmarking on %r data" % data_dict['data_mini'] +for batch_size in data_dict['batch_size']: # iterator through different batch size of choice +print "batch_size is %d" % batch_size +# model +data_shape = (k, ) +train_iter = get_iter(mini_path, data_shape, batch_size) +weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m)) + +csr_data = [] +dns_data = [] +num_batch = 0 +for batch in train_iter: +data = train_iter.getdata() +csr_data.append(data) +dns_data.append(data.todense()) +num_batch += 1 +bag_of_data = [csr_data, dns_data] +num_repeat = 5 +costs = [] +for d in bag_of_data: +weight.wait_to_read() +cost = 0. +count = 0 +for d_batch in d: +d_batch.wait_to_read() +cost += measure_cost(True, num_repeat, mx.nd.dot, d_batch, weight) +count += 1 +costs.append(cost/count) +t_sparse = costs[0] +t_dense = costs[1] +ratio = t_dense / t_sparse +print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse') +fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f" +print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse)) + + +def test_dot_synthetic(): +"""benchmark sparse mxnet dot and scipy dot operator with matrices of given density. +`t_sparse` is the runtime of the invoked sparse dot operator in ms, while `t_dense` is the +runtime of dot(dns, dns), with the same matrices except that they are in default storage type. 
+""" +# Benchmark MXNet's sparse dot operator +def bench_mx_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, repeat): +set_default_context(ctx) +# Create matrix instances +lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den) +rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den) +lhs_dns = lhs_nd if lhs_stype == 'default' else lhs_nd.todense() +rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.todense() +# One warm up run, verify correctness +out =
[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132054939 ## File path: benchmark/python/dot.py ## @@ -0,0 +1,265 @@ +import ctypes + +from mxnet.test_utils import * +import scipy.sparse as sp +import os +import time +import argparse + +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +parser = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +# some data information +kdda = { +'data_mini': 'kdda.t.mini', +'data_name': 'kdda.t', +'data_origin_name': 'kdda.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2";, +'feature_dim': 20216830, +'m': 200, +'batch_size': [64] +} + +avazu = { +'data_mini': 'avazu-app.t.mini', +'data_name': 'avazu-app.t', +'data_origin_name': 'avazu-app.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2";, +'feature_dim': 100, +'m': 500, +'batch_size': [64, 128] +} + + +def measure_cost(wait, repeat, f, *args, **kwargs): +start = time.time() +if wait: +for i in range(repeat): +(f(*args, **kwargs)).wait_to_read() +else: +for i in range(repeat): +f(*args, **kwargs) +end = time.time() +diff = end - start +return diff / repeat + + +def test_dot_real(data_dict): +def get_iter(path, data_shape, batch_size): +data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) +data_iter = iter(data_train) +return data_iter + +data_dir = os.path.join(os.getcwd(), 'data') + +path = os.path.join(data_dir, data_dict['data_name']) +if not os.path.exists(path): +get_data( +data_dir, +data_dict['data_name'], +data_dict['url'], +data_dict['data_origin_name'] +) +assert os.path.exists(path) + +k 
= data_dict['feature_dim'] +m = data_dict['m'] +density = estimate_density(path, data_dict['feature_dim']) + +mini_path = os.path.join(data_dir, data_dict['data_mini']) +if not os.path.exists(mini_path): +os.system("head -n 2000 %r > %r" % (path, mini_path)) +assert os.path.exists(mini_path) + +print "Running Benchmarking on %r data" % data_dict['data_mini'] +for batch_size in data_dict['batch_size']: # iterator through different batch size of choice +print "batch_size is %d" % batch_size +# model +data_shape = (k, ) +train_iter = get_iter(mini_path, data_shape, batch_size) +weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m)) + +csr_data = [] +dns_data = [] +num_batch = 0 +for batch in train_iter: +data = train_iter.getdata() +csr_data.append(data) +dns_data.append(data.todense()) +num_batch += 1 +bag_of_data = [csr_data, dns_data] +num_repeat = 5 +costs = [] +for d in bag_of_data: +weight.wait_to_read() +cost = 0. +count = 0 +for d_batch in d: +d_batch.wait_to_read() +cost += measure_cost(True, num_repeat, mx.nd.dot, d_batch, weight) +count += 1 +costs.append(cost/count) +t_sparse = costs[0] +t_dense = costs[1] +ratio = t_dense / t_sparse +print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse') +fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f" +print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse)) + + +def test_dot_synthetic(): +"""benchmark sparse mxnet dot and scipy dot operator with matrices of given density. +`t_sparse` is the runtime of the invoked sparse dot operator in ms, while `t_dense` is the +runtime of dot(dns, dns), with the same matrices except that they are in default storage type. 
+""" +# Benchmark MXNet's sparse dot operator +def bench_mx_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, repeat): +set_default_context(ctx) +# Create matrix instances +lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den) +rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den) +lhs_dns = lhs_nd if lhs_stype == 'default' else lhs_nd.todense() +rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.todense() +# One warm up run, verify correctness +out =
[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132100222 ## File path: src/operator/tensor/dot-inl.h ## @@ -484,27 +534,26 @@ inline void DotCsrDnsRspImpl(mshadow::Stream* s, MSHADOW_TYPE_SWITCH(data_l.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type -MSHADOW_IDX_TYPE_SWITCH(row_idx_out.type_flag_, RType, { // col idx type - if (kWriteTo == req) { -mxnet_op::Kernel::Launch( -s, data_out.Size(), data_out.dptr()); - } +MSHADOW_IDX_TYPE_SWITCH(row_idx_out.type_flag_, RType, { // row idx type + dim_t num_threads, seg_len; Review comment: It's not recommended to leave variables uninitialized. It's also good practice to minimize the lifetime of a variable. Could you change it to `dim_t num_threads = data_out.Size()` and move the definition of `seg_len` to line 545? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132098902 ## File path: src/operator/tensor/dot-inl.cuh ## @@ -199,37 +316,203 @@ struct DotCsrTransDnsDnsThreadBlockKernel { }; /*! - * \brief Warp block kernel of dot(csr.T(), dns1) = dns2 + * \brief GPU warp block kernel of dot(csr.T, dns1) = dns2 * Parallelization by columns: 1 warp computes one lhs column for all rhs columns */ -template struct DotCsrTransDnsDnsWarpBlockKernel { + /*! + * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. + */ template - __device__ __forceinline__ static void Map(int tid, DType* out, const DType* data_l, const IType* indptr_l, - const CType* col_idx_l, const DType* data_r, - const int num_cols_r) { -const int warp_id = tid / 32; // global warp id -const int lane = tid & (32-1); // local thread id within warp -const int icol = warp_id; // lhs column that this warp computes + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { +using nnvm::dim_t; +const dim_t warp_id = tid / 32; // global warp id +const dim_t lane = tid & (32-1); // local thread id within warp +const dim_t icol = warp_id; // lhs column that this warp computes // Compute range of nnz elements in this column -const int low = static_cast(indptr_l[icol]); -const int high = static_cast(indptr_l[icol+1]); +const dim_t low = static_cast(indptr_l[icol]); +const dim_t high = static_cast(indptr_l[icol+1]); // Iterate through the nnz elements in lhs column -for (int j = low+lane; j < high; j+=32) { - const int irow = static_cast(col_idx_l[j]); +for (dim_t j = low+lane; j < high; j+=32) { + const dim_t irow = static_cast(col_idx_l[j]); const DType datum_l = data_l[j]; // Iterate over all rhs columns - for (int k = 0; k < num_cols_r; k++) { + for (dim_t k = 0; k < 
num_cols_r; k++) { const DType val = datum_l*data_r[icol*num_cols_r+k]; atomicAdd(static_cast(&(out[irow*num_cols_r+k])), val); } } } }; -inline void DotCsrDnsDnsImpl(mshadow::Stream* s, +/*! + * \brief GPU warp kernel of dot(csr.T, dns) = rsp + * Parallelization by columns: 1 warp computes one lhs column for one rhs column + */ +struct DotCsrTransDnsRspWarpKernel { + /*! + * \brief + * \param tid global thread id + * \param out output rsp matrix data + * \param row_flg_sum_out inclusive prefix sum array over 0/1 marked row flag array + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_lcsr matrix column indices + * \param data_r dns matrix data + * \param num_cols_r dns matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const nnvm::dim_t* row_flg_sum_out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { +using nnvm::dim_t; +const dim_t warp_id = tid / 32; // global warp id +const dim_t lane = tid & (32-1); // local thread id within warp +const dim_t icol = warp_id / num_cols_r; // lhs column that this warp computes +const dim_t kcol = warp_id % num_cols_r; // rhs column that this warp computes + +// Compute range of nnz elements in this column +const dim_t low = static_cast(indptr_l[icol]); Review comment: I'm confused here. If `icol` is the column id of lhs, how come it is applied in `indptr_l` (indexed by the row id of lhs)? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132100597 ## File path: src/operator/tensor/dot-inl.h ## @@ -616,28 +677,27 @@ inline void DotCsrRspRspImpl(mshadow::Stream* s, MSHADOW_TYPE_SWITCH(data_l.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type -MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // col idx type - if (kWriteTo == req) { -mxnet_op::Kernel::Launch( -s, data_out.Size(), data_out.dptr()); - } - int num_threads = mxnet_op::get_num_threads(data_out.shape_[0]); - size_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads; +MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type + dim_t num_threads, seg_len; Review comment: same here. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132054842 ## File path: benchmark/python/dot.py ## @@ -0,0 +1,265 @@ +import ctypes + +from mxnet.test_utils import * +import scipy.sparse as sp +import os +import time +import argparse + +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +parser = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +# some data information +kdda = { +'data_mini': 'kdda.t.mini', +'data_name': 'kdda.t', +'data_origin_name': 'kdda.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2";, +'feature_dim': 20216830, +'m': 200, +'batch_size': [64] +} + +avazu = { +'data_mini': 'avazu-app.t.mini', +'data_name': 'avazu-app.t', +'data_origin_name': 'avazu-app.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2";, +'feature_dim': 100, +'m': 500, +'batch_size': [64, 128] +} + + +def measure_cost(wait, repeat, f, *args, **kwargs): +start = time.time() +if wait: +for i in range(repeat): +(f(*args, **kwargs)).wait_to_read() +else: +for i in range(repeat): +f(*args, **kwargs) +end = time.time() +diff = end - start +return diff / repeat + + +def test_dot_real(data_dict): +def get_iter(path, data_shape, batch_size): +data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) +data_iter = iter(data_train) +return data_iter + +data_dir = os.path.join(os.getcwd(), 'data') + +path = os.path.join(data_dir, data_dict['data_name']) +if not os.path.exists(path): +get_data( +data_dir, +data_dict['data_name'], +data_dict['url'], +data_dict['data_origin_name'] +) +assert os.path.exists(path) + +k 
= data_dict['feature_dim'] +m = data_dict['m'] +density = estimate_density(path, data_dict['feature_dim']) + +mini_path = os.path.join(data_dir, data_dict['data_mini']) +if not os.path.exists(mini_path): +os.system("head -n 2000 %r > %r" % (path, mini_path)) +assert os.path.exists(mini_path) + +print "Running Benchmarking on %r data" % data_dict['data_mini'] +for batch_size in data_dict['batch_size']: # iterator through different batch size of choice +print "batch_size is %d" % batch_size +# model +data_shape = (k, ) +train_iter = get_iter(mini_path, data_shape, batch_size) +weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m)) + +csr_data = [] +dns_data = [] +num_batch = 0 +for batch in train_iter: +data = train_iter.getdata() +csr_data.append(data) +dns_data.append(data.todense()) +num_batch += 1 +bag_of_data = [csr_data, dns_data] +num_repeat = 5 +costs = [] +for d in bag_of_data: +weight.wait_to_read() +cost = 0. +count = 0 +for d_batch in d: +d_batch.wait_to_read() +cost += measure_cost(True, num_repeat, mx.nd.dot, d_batch, weight) +count += 1 +costs.append(cost/count) +t_sparse = costs[0] +t_dense = costs[1] +ratio = t_dense / t_sparse +print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse') +fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f" +print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse)) + + +def test_dot_synthetic(): +"""benchmark sparse mxnet dot and scipy dot operator with matrices of given density. +`t_sparse` is the runtime of the invoked sparse dot operator in ms, while `t_dense` is the +runtime of dot(dns, dns), with the same matrices except that they are in default storage type. 
+""" +# Benchmark MXNet's sparse dot operator +def bench_mx_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, repeat): +set_default_context(ctx) +# Create matrix instances +lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den) +rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den) +lhs_dns = lhs_nd if lhs_stype == 'default' else lhs_nd.todense() +rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.todense() +# One warm up run, verify correctness +out =
[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132099745 ## File path: src/operator/tensor/dot-inl.h ## @@ -187,8 +187,8 @@ inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const DotParam& param = nnvm::get(attrs.parsed); // csr has many zero columns, so the result of dot(csr.T, matrix) should be rsp - // dot(csr.T,dns)=rsp not yet implemented on gpu - if (param.transpose_a && kCSRStorage == (*in_attrs)[0] && ctx.dev_type != Context::kGPU) { + // TODO(stefan/haibin): don't enforce kRowSparseStorage if out_attrs has already been set + if (param.transpose_a && kCSRStorage == (*in_attrs)[0]) { STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage); Review comment: Could you change it as @eric-haibin-lin said? `STORAGE_TYPE_ASSIGN_CHECK` aborts if the assignment fails. We want to fall back for all unsupported combinations of stypes, so there should be no abort at the storage inference stage. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132054895 ## File path: benchmark/python/dot.py ## @@ -0,0 +1,265 @@ +import ctypes + +from mxnet.test_utils import * +import scipy.sparse as sp +import os +import time +import argparse + +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +parser = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +# some data information +kdda = { +'data_mini': 'kdda.t.mini', +'data_name': 'kdda.t', +'data_origin_name': 'kdda.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2";, +'feature_dim': 20216830, +'m': 200, +'batch_size': [64] +} + +avazu = { +'data_mini': 'avazu-app.t.mini', +'data_name': 'avazu-app.t', +'data_origin_name': 'avazu-app.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2";, +'feature_dim': 100, +'m': 500, +'batch_size': [64, 128] +} + + +def measure_cost(wait, repeat, f, *args, **kwargs): +start = time.time() +if wait: +for i in range(repeat): +(f(*args, **kwargs)).wait_to_read() +else: +for i in range(repeat): +f(*args, **kwargs) +end = time.time() +diff = end - start +return diff / repeat + + +def test_dot_real(data_dict): +def get_iter(path, data_shape, batch_size): +data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) +data_iter = iter(data_train) +return data_iter + +data_dir = os.path.join(os.getcwd(), 'data') + +path = os.path.join(data_dir, data_dict['data_name']) +if not os.path.exists(path): +get_data( +data_dir, +data_dict['data_name'], +data_dict['url'], +data_dict['data_origin_name'] +) +assert os.path.exists(path) + +k 
= data_dict['feature_dim'] +m = data_dict['m'] +density = estimate_density(path, data_dict['feature_dim']) + +mini_path = os.path.join(data_dir, data_dict['data_mini']) +if not os.path.exists(mini_path): +os.system("head -n 2000 %r > %r" % (path, mini_path)) +assert os.path.exists(mini_path) + +print "Running Benchmarking on %r data" % data_dict['data_mini'] +for batch_size in data_dict['batch_size']: # iterator through different batch size of choice +print "batch_size is %d" % batch_size +# model +data_shape = (k, ) +train_iter = get_iter(mini_path, data_shape, batch_size) +weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m)) + +csr_data = [] +dns_data = [] +num_batch = 0 +for batch in train_iter: +data = train_iter.getdata() +csr_data.append(data) +dns_data.append(data.todense()) +num_batch += 1 +bag_of_data = [csr_data, dns_data] +num_repeat = 5 +costs = [] +for d in bag_of_data: +weight.wait_to_read() +cost = 0. +count = 0 +for d_batch in d: +d_batch.wait_to_read() +cost += measure_cost(True, num_repeat, mx.nd.dot, d_batch, weight) +count += 1 +costs.append(cost/count) +t_sparse = costs[0] +t_dense = costs[1] +ratio = t_dense / t_sparse +print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse') +fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f" +print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse)) + + +def test_dot_synthetic(): +"""benchmark sparse mxnet dot and scipy dot operator with matrices of given density. +`t_sparse` is the runtime of the invoked sparse dot operator in ms, while `t_dense` is the +runtime of dot(dns, dns), with the same matrices except that they are in default storage type. 
+""" +# Benchmark MXNet's sparse dot operator +def bench_mx_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, repeat): +set_default_context(ctx) +# Create matrix instances +lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den) +rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den) +lhs_dns = lhs_nd if lhs_stype == 'default' else lhs_nd.todense() +rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.todense() +# One warm up run, verify correctness +out =
[GitHub] reminisce commented on a change in pull request #7226: Extending the GPU dot operator
reminisce commented on a change in pull request #7226: Extending the GPU dot operator URL: https://github.com/apache/incubator-mxnet/pull/7226#discussion_r132052984 ## File path: benchmark/python/dot.py ## @@ -0,0 +1,265 @@ +import ctypes + +from mxnet.test_utils import * +import scipy.sparse as sp +import os +import time +import argparse + +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +parser = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +# some data information +kdda = { +'data_mini': 'kdda.t.mini', +'data_name': 'kdda.t', +'data_origin_name': 'kdda.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2";, +'feature_dim': 20216830, +'m': 200, +'batch_size': [64] +} + +avazu = { +'data_mini': 'avazu-app.t.mini', +'data_name': 'avazu-app.t', +'data_origin_name': 'avazu-app.t.bz2', +'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2";, +'feature_dim': 100, +'m': 500, +'batch_size': [64, 128] +} + + +def measure_cost(wait, repeat, f, *args, **kwargs): Review comment: I got the point of adding `wait` as an argument here from reading the code below. Could you add a comment explaining that `wait=True` is for mxnet benchmark and `False` for scipy? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services