[3/3] systemml git commit: [SYSTEMML-445] Improved performance of GPU right indexing

niketanpansare Wed, 13 Sep 2017 16:04:56 -0700

[SYSTEMML-445] Improved performance of GPU right indexing

- Added slice_dense_dense kernel for right indexing
- Fixed GPU bufferpool performance bug where we were doing unnecessary
  recomputation of nnz for sparse input.
- Refactored asserts to throw exceptions to be consistent with our other
  operators. Also, this allows our other APIs as well as production jar to
  catch unexpected bugs.


Closes #663.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4cf95c92
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4cf95c92
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4cf95c92

Branch: refs/heads/master
Commit: 4cf95c92e61902ebcc17a377d4c16efc108a60f4
Parents: e624d14
Author: Niketan Pansare <[email protected]>
Authored: Wed Sep 13 15:58:56 2017 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Wed Sep 13 15:58:56 2017 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |    3 +
 src/main/cpp/kernels/SystemML.cu                |   29 +-
 src/main/cpp/kernels/SystemML.ptx               | 2864 +++++++++---------
 .../java/org/apache/sysml/api/DMLScript.java    |    2 +
 .../apache/sysml/api/ScriptExecutorUtils.java   |    1 +
 .../java/org/apache/sysml/conf/DMLConfig.java   |    6 +-
 .../context/ExecutionContext.java               |   16 +-
 .../runtime/instructions/cp/CPInstruction.java  |    1 +
 .../instructions/gpu/GPUInstruction.java        |   16 +-
 .../instructions/gpu/context/CSRPointer.java    |    3 +-
 .../instructions/gpu/context/GPUContext.java    |   87 +-
 .../instructions/gpu/context/GPUObject.java     |  170 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  208 +-
 .../runtime/matrix/data/LibMatrixCuDNN.java     |   42 +-
 .../sysml/runtime/matrix/data/MatrixBlock.java  |    9 +
 15 files changed, 1879 insertions(+), 1578 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template 
b/conf/SystemML-config.xml.template
index 454d0cc..aaf7316 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -84,6 +84,9 @@
     <!-- sets the GPUs to use per process, -1 for all GPUs, a specific GPU 
number (5), a range (eg: 0-2) or a comma separated list (eg: 0,2,4)-->
     <systemml.gpu.availableGPUs>-1</systemml.gpu.availableGPUs>
     
+    <!-- whether to synchronize GPUs after every GPU instruction -->
+    <systemml.gpu.sync.postProcess>true</systemml.gpu.sync.postProcess>
+    
     <!-- maximum wrap length for instruction and miscellaneous timer column of 
statistics -->
    <systemml.stats.maxWrapLength>30</systemml.stats.maxWrapLength>
 </root>

http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index bb6482d..231a32a 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -39,13 +39,13 @@ nvcc -ptx -arch=sm_30 SystemML.cu
  * @param ru row upper
  * @param cl column lower
  * @param cu column upper
+ * @param retClen number of columns of output matrix
  */
 extern "C"
-__global__ void slice_sparse_dense(double* inVal, int* inRowPtr, int* colInd, 
double* ret, int rl, int ru, int cl, int cu) {
+__global__ void slice_sparse_dense(double* inVal, int* inRowPtr, int* colInd, 
double* ret, int rl, int ru, int cl, int cu, int retClen) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        int rowIndex = index + rl;
     if (rowIndex <= ru){
-       int retClen = cu - cl + 1;
        // Iterate over elements of the row 'rowIndex'.
        for(int i = inRowPtr[rowIndex]; i < inRowPtr[rowIndex+1]; i++) {
                // Only slice if the index falls into the given range
@@ -56,6 +56,31 @@ __global__ void slice_sparse_dense(double* inVal, int* 
inRowPtr, int* colInd, do
     }
 }
 
+/**
+ * Performs a slice operation where the input matrix is dense and the output 
matrix is dense.
+ * 
+ * @params in dense input pointer
+ * @params ret dense output pointer
+ * @param rl row lower
+ * @param ru row upper
+ * @param cl column lower
+ * @param cu column upper
+ * @param inClen number of columns of input matrix
+ * @param retClen number of columns of output matrix
+ */
+extern "C"
+__global__ void slice_dense_dense(double* in, double* ret, int rl, int ru, int 
cl, int cu, int inClen, int retClen) {
+       int index = blockIdx.x * blockDim.x + threadIdx.x;
+       int rowIndex = index + rl;
+    if (rowIndex <= ru){
+       int inIndex = rowIndex*inClen + cl;
+       int retIndex = index*retClen;
+       for(int i = retIndex; i < retIndex+retClen; i++, inIndex++) {
+                       ret[i] = in[inIndex];
+               }
+    }
+}
+
 
 /**
  * Does a copy of upper to lower triangle of the given matrix

[3/3] systemml git commit: [SYSTEMML-445] Improved performance of GPU right indexing

Reply via email to