[SYSTEMML-445] Improved performance of GPU right indexing - Added slice_dense_dense kernel for right indexing - Fixed GPU bufferpool performance bug where we were doing unnecessary recomputation of nnz for sparse input. - Refactored asserts to throw exceptions to be consistent with our other operators. Also, this allows our other APIs as well as production jar to catch unexpected bugs.
Closes #663. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4cf95c92 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4cf95c92 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4cf95c92 Branch: refs/heads/master Commit: 4cf95c92e61902ebcc17a377d4c16efc108a60f4 Parents: e624d14 Author: Niketan Pansare <[email protected]> Authored: Wed Sep 13 15:58:56 2017 -0700 Committer: Niketan Pansare <[email protected]> Committed: Wed Sep 13 15:58:56 2017 -0700 ---------------------------------------------------------------------- conf/SystemML-config.xml.template | 3 + src/main/cpp/kernels/SystemML.cu | 29 +- src/main/cpp/kernels/SystemML.ptx | 2864 +++++++++--------- .../java/org/apache/sysml/api/DMLScript.java | 2 + .../apache/sysml/api/ScriptExecutorUtils.java | 1 + .../java/org/apache/sysml/conf/DMLConfig.java | 6 +- .../context/ExecutionContext.java | 16 +- .../runtime/instructions/cp/CPInstruction.java | 1 + .../instructions/gpu/GPUInstruction.java | 16 +- .../instructions/gpu/context/CSRPointer.java | 3 +- .../instructions/gpu/context/GPUContext.java | 87 +- .../instructions/gpu/context/GPUObject.java | 170 +- .../runtime/matrix/data/LibMatrixCUDA.java | 208 +- .../runtime/matrix/data/LibMatrixCuDNN.java | 42 +- .../sysml/runtime/matrix/data/MatrixBlock.java | 9 + 15 files changed, 1879 insertions(+), 1578 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/conf/SystemML-config.xml.template ---------------------------------------------------------------------- diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template index 454d0cc..aaf7316 100644 --- a/conf/SystemML-config.xml.template +++ b/conf/SystemML-config.xml.template @@ -84,6 +84,9 @@ <!-- sets the GPUs to use per process, -1 for all GPUs, a specific GPU number (5), a range (eg: 0-2) or a comma separated list (eg: 0,2,4)--> <systemml.gpu.availableGPUs>-1</systemml.gpu.availableGPUs> + <!-- whether to synchronize GPUs after every GPU instruction --> + <systemml.gpu.sync.postProcess>true</systemml.gpu.sync.postProcess> + <!-- maximum wrap length for instruction and miscellaneous timer column of statistics --> <systemml.stats.maxWrapLength>30</systemml.stats.maxWrapLength> </root> http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index bb6482d..231a32a 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -39,13 +39,13 @@ nvcc -ptx -arch=sm_30 SystemML.cu * @param ru row upper * @param cl column lower * @param cu column upper + * @param retClen number of columns of output matrix */ extern "C" -__global__ void slice_sparse_dense(double* inVal, int* inRowPtr, int* colInd, double* ret, int rl, int ru, int cl, int cu) { +__global__ void slice_sparse_dense(double* inVal, int* inRowPtr, int* colInd, double* ret, int rl, int ru, int cl, int cu, int retClen) { int index = blockIdx.x * blockDim.x + threadIdx.x; int rowIndex = index + rl; if (rowIndex <= ru){ - int retClen = cu - cl + 1; // Iterate over elements of the row 'rowIndex'. for(int i = inRowPtr[rowIndex]; i < inRowPtr[rowIndex+1]; i++) { // Only slice if the index falls into the given range @@ -56,6 +56,31 @@ __global__ void slice_sparse_dense(double* inVal, int* inRowPtr, int* colInd, do } } +/** + * Performs a slice operation where the input matrix is dense and the output matrix is dense. + * + * @params in dense input pointer + * @params ret dense output pointer + * @param rl row lower + * @param ru row upper + * @param cl column lower + * @param cu column upper + * @param inClen number of columns of input matrix + * @param retClen number of columns of output matrix + */ +extern "C" +__global__ void slice_dense_dense(double* in, double* ret, int rl, int ru, int cl, int cu, int inClen, int retClen) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int rowIndex = index + rl; + if (rowIndex <= ru){ + int inIndex = rowIndex*inClen + cl; + int retIndex = index*retClen; + for(int i = retIndex; i < retIndex+retClen; i++, inIndex++) { + ret[i] = in[inIndex]; + } + } +} + /** * Does a copy of upper to lower triangle of the given matrix
