incubator-systemml git commit: [HOTFIX] for sparse GPU transpose

nakul02 Wed, 17 May 2017 18:47:13 -0700

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 1fc764b9b -> c3aeb48bf



[HOTFIX] for sparse GPU transpose


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c3aeb48b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c3aeb48b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c3aeb48b

Branch: refs/heads/master
Commit: c3aeb48bf6b54febb861b7b4381c3d7af450a8e8
Parents: 1fc764b
Author: Nakul Jindal <[email protected]>
Authored: Wed May 17 18:46:21 2017 -0700
Committer: Nakul Jindal <[email protected]>
Committed: Wed May 17 18:46:21 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixCUDA.java      | 118 +++++++++++--------
 1 file changed, 68 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c3aeb48b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 074119b..b023159 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -52,6 +52,7 @@ import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
 import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
 import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
 import static 
jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -61,6 +62,8 @@ import static 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
+import jcuda.jcusparse.cusparseAction;
+import jcuda.jcusparse.cusparseIndexBase;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
@@ -2732,7 +2735,7 @@ public class LibMatrixCUDA {
         * Performs sparse and dense dgeam given two input matrices
         * C = alpha* op( A ) + beta* op ( B )
         * where op = transpose or not (specified by isLeftTransposed and 
isRightTransposed).
-        *
+        * To request a plain transpose, pass the same matrix as both in1 and in2 and 
set both isLeftTransposed and isRightTransposed to true.
         * @param ec execution context
         * @param gCtx a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link 
Statistics}.
@@ -2756,35 +2759,6 @@ public class LibMatrixCUDA {
                int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
                int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
 
-               int lda = (int) in1.getNumColumns();
-               int ldb = (int) in2.getNumColumns();
-               int m = (int) in1.getNumColumns();
-               int n = (int) in2.getNumRows();
-               if (isLeftTransposed && isRightTransposed) {
-                       m = (int) in1.getNumRows();
-                       n = (int) in2.getNumColumns();
-               }
-               else if (isLeftTransposed) {
-                       m = (int) in1.getNumRows();
-               } else if (isRightTransposed) {
-                       n = (int) in2.getNumColumns();
-               }
-               int ldc = m;
-
-
-
-               /**
-               int m = (int) in1.getNumRows();
-               int n = (int) in1.getNumColumns();
-               if(!isLeftTransposed && isRightTransposed) {
-                       m = (int) in1.getNumColumns();
-                       n = (int) in1.getNumRows();
-               }
-               int lda = isLeftTransposed ? n : m;
-               int ldb = isRightTransposed ? n : m;
-               int ldc = m;
-               **/
-
                MatrixObject out = ec.getMatrixObject(outputName);
                boolean isSparse1 = isInSparseFormat(gCtx, in1);
                boolean isSparse2 = isInSparseFormat(gCtx, in2);
@@ -2792,39 +2766,83 @@ public class LibMatrixCUDA {
                long t0=0,t1=0;
                // TODO: Implement sparse-dense matrix cublasDgeam kernel
                if(isSparse1 || isSparse2) {
+                       int m = (int)in1.getNumRows();
+                       int n = (int)in1.getNumColumns();
                        // Invoke cuSparse when either are in sparse format
                        // Perform sparse-sparse dgeam
-                       if(!isInSparseFormat(gCtx, in1)) {
-                               if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+                       if (!isInSparseFormat(gCtx, in1)) {
+                               if (GPUStatistics.DISPLAY_STATISTICS)
+                                       t0 = System.nanoTime();
                                in1.getGPUObject(gCtx).denseToSparse();
-                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
+                               if (GPUStatistics.DISPLAY_STATISTICS)
+                                       
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE,
+                                                       System.nanoTime() - t0);
                        }
                        CSRPointer A = 
in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-                       if(!isInSparseFormat(gCtx, in2)) {
-                               if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+                       if (!isInSparseFormat(gCtx, in2)) {
+                               if (GPUStatistics.DISPLAY_STATISTICS)
+                                       t0 = System.nanoTime();
                                in2.getGPUObject(gCtx).denseToSparse();
-                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
+                               if (GPUStatistics.DISPLAY_STATISTICS)
+                                       
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE,
+                                                       System.nanoTime() - t0);
                        }
                        CSRPointer B = 
in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 
                        ec.allocateGPUMatrixObject(outputName);
+                       out.getGPUObject(gCtx).addReadLock();
 
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       CSRPointer C = CSRPointer.allocateForDgeam(gCtx, 
getCusparseHandle(gCtx), A, B, m, n);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1);
+                       if (in1 == in2 && isLeftTransposed == true && 
isLeftTransposed == isRightTransposed) {
+                               // Special case for transpose
 
-                       out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-                       //long sizeOfC = CSRPointer.estimateSize(C.nnz, 
out.getNumRows());
-                       out.getGPUObject(gCtx).addReadLock();
-                       if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
-                       JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, 
n, alphaPtr, A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, betaPtr,
-                                                       B.descr, (int)B.nnz, 
B.val, B.rowPtr, B.colInd,
-                                                       C.descr, C.val, 
C.rowPtr, C.colInd);
-                       //cudaDeviceSynchronize;
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0);
-               }
-               else {
+                               int nnz = (int)A.nnz;
+                               CSRPointer C = CSRPointer.allocateEmpty(gCtx, 
nnz, n);
+                               
out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+                               cusparseDcsr2csc(getCusparseHandle(gCtx), m, n, 
nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, 
cusparseAction.CUSPARSE_ACTION_NUMERIC, 
cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
+                       } else {
+                               // General case (cusparse does not accept 
the transpose operator for dgeam)
+                               // TODO: to implement the transposed + dgeam 
for sparse matrices, they need to be converted to csc, which is effectively a 
transpose
+                               if (isLeftTransposed || isRightTransposed) {
+                                       throw new DMLRuntimeException(
+                                                       "Transpose in 
cusparseDcsrgeam not supported for sparse matrices on GPU");
+                               }
+
+                               if (GPUStatistics.DISPLAY_STATISTICS)
+                                       t1 = System.nanoTime();
+                               CSRPointer C = 
CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
+                               if (GPUStatistics.DISPLAY_STATISTICS)
+                                       
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB,
+                                                       System.nanoTime() - t1);
+
+                               
out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+                               //long sizeOfC = CSRPointer.estimateSize(C.nnz, 
out.getNumRows());
+                               if (GPUStatistics.DISPLAY_STATISTICS)
+                                       t0 = System.nanoTime();
+                               
JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, 
(int) A.nnz, A.val, A.rowPtr, A.colInd, betaPtr,
+                                               B.descr, (int) B.nnz, B.val, 
B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
+                               //cudaDeviceSynchronize;
+                               if (GPUStatistics.DISPLAY_STATISTICS)
+                                       
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB,
+                                                       System.nanoTime() - t0);
+                       }
+               } else {
                        // Dense-Dense dgeam
+
+                       int lda = (int) in1.getNumColumns();
+                       int ldb = (int) in2.getNumColumns();
+                       int m = (int) in1.getNumColumns();
+                       int n = (int) in2.getNumRows();
+                       if (isLeftTransposed && isRightTransposed) {
+                               m = (int) in1.getNumRows();
+                               n = (int) in2.getNumColumns();
+                       }
+                       else if (isLeftTransposed) {
+                               m = (int) in1.getNumRows();
+                       } else if (isRightTransposed) {
+                               n = (int) in2.getNumColumns();
+                       }
+                       int ldc = m;
+
                        Pointer A = getDensePointer(gCtx, in1, instName);
                        Pointer B = getDensePointer(gCtx, in2, instName);
                        getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix

incubator-systemml git commit: [HOTFIX] for sparse GPU transpose

Reply via email to