[SYSTEMML-446] [SYSTEMML-702] Updated the sparse matrix multiplication to 
minimize sparse-to-dense as well as dense-to-sparse conversion

1. The goal of this PR is not to improve performance (for example: by 
considering the cost of sparse-to-dense vs FLOPs required given a memory 
budget) but instead to minimize sparse-to-dense conversion in the GPU matrix 
multiplication operator.

2. If matmult uses unnecessary sparse-to-dense conversion, then we run into
  risk of one of the two situations:
- In best case some of the matmult won't be pushed to GPU under worst-case 
memory budget.
- On other hand, if these conversion are not accounted for, they may cause OOMs.

3. Every operator (except dense-sparse matrix multiplication) uses only memory 
allocated to input and output matrices.

4. Since there is no CuSPARSE kernel for dense-sparse matrix multiplication 
operator, we either have to transpose the output after performing sparse-dense 
matrix multiplication or perform dense-dense matrix multiplication after 
converting the sparse input to dense format.

Closes #686.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6de8f051
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6de8f051
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6de8f051

Branch: refs/heads/master
Commit: 6de8f051daaefdab403c0edd3c7beb30c9619033
Parents: 323dd72
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Fri Oct 20 12:29:20 2017 -0800
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Fri Oct 20 13:29:20 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/sysml/hops/AggBinaryOp.java |  30 +-
 .../gpu/AggregateBinaryGPUInstruction.java      |   3 +-
 .../instructions/gpu/GPUInstruction.java        |   1 +
 .../instructions/gpu/context/CSRPointer.java    |   7 +-
 .../instructions/gpu/context/GPUObject.java     |   2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 541 +------------------
 .../runtime/matrix/data/LibMatrixCuMatMult.java | 480 ++++++++++++++++
 .../test/gpu/MatrixMultiplicationOpTest.java    |  98 +++-
 8 files changed, 581 insertions(+), 581 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java 
b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
index cfa99a4..d733d6a 100644
--- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
@@ -371,40 +371,16 @@ public class AggBinaryOp extends Hop implements 
MultiThreadedHop
                double ret = 0;
 
                if (isGPUEnabled()) {
-                       // In GPU Mode, intermediate memory is only needed in 
case of one of the matrix blocks is sparse
-                       // When sparse block is converted to dense and a dense 
MM takes place, we need (dim1 * dim2)
-                       // When dense block is converted to sparse and a sparse 
MM takes place, we need (dim1 * dim2 * 2)
-
                        Hop in1 = _input.get(0);
                        Hop in2 = _input.get(1);
                        double in1Sparsity = 
OptimizerUtils.getSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
                        double in2Sparsity = 
OptimizerUtils.getSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
-
                        boolean in1Sparse = in1Sparsity < 
MatrixBlock.SPARSITY_TURN_POINT;
                        boolean in2Sparse = in2Sparsity < 
MatrixBlock.SPARSITY_TURN_POINT;
-
-                       boolean in1UltraSparse = in1Sparsity < 
MatrixBlock.ULTRA_SPARSITY_TURN_POINT;
-                       boolean in2UltraSparse = in2Sparsity < 
MatrixBlock.ULTRA_SPARSITY_TURN_POINT;
-
-                       // For Matmult X * Y, if X is sparse, Y is dense, X is 
converted to dense
-                       // If X is ultrasparse, Y is converted to sparse
-                       if (in1Sparse ^ in2Sparse) { // one sparse, one dense
-                               if (in1Sparse) {
-                                       if (in1UltraSparse) {
-                                               ret += 2 * 
OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), 
in2.getNnz());
-                                       } else {
-                                               ret += 
OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), 
in1.getNnz());
-                                       }
-                               } else if (in2Sparse) {
-                                       if (in2UltraSparse) {
-                                               ret += 2 * 
OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), 
in1.getNnz());
-                                       } else {
-                                               ret += 
OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), 
in2.getNnz());
-                                       }
-                               }
-
+                       if(in1Sparse && !in2Sparse) {
+                               // Only in sparse-dense cases, we need 
additional memory budget for GPU
+                               ret += 
OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, 1.0);
                        }
-
                }
 
                //account for potential final dense-sparse transformation 
(worst-case sparse representation)

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index dfc000f..29a1ead 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuMatMult;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
@@ -94,7 +95,7 @@ public class AggregateBinaryGPUInstruction extends 
GPUInstruction {
                int clen = (int) (_isRightTransposed ? m2.getNumRows() : 
m2.getNumColumns());
 
                ec.setMetaData(_output.getName(), rlen, clen);
-               LibMatrixCUDA.matmult(ec, ec.getGPUContext(0), 
getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, 
_isRightTransposed);
+               LibMatrixCuMatMult.matmult(ec, ec.getGPUContext(0), 
getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, 
_isRightTransposed);
                
                //release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index ed2a4a2..9dd163d 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -78,6 +78,7 @@ public abstract class GPUInstruction extends Instruction {
        public final static String MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB =   
"Mdmdm";        // time spent in matrix mult of dense matrices
        public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB =  
"Msmdv";        // time spent in matrix mult of sparse matrix and dense vector
        public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = 
"Msmsm";  // time spent in matrix mult of sparse matrices
+       public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB = 
"Msmdm";  // time spent in matrix mult of sparse matrices
        public final static String MISC_TIMER_SYRK_LIB =                        
                                                                        
"Msyrk";        // time spent in symmetric rank-k update
 
        // Other BLAS instructions

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 5a6e21c..7176a9c 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -37,7 +37,9 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
 
 import jcuda.Pointer;
 import jcuda.Sizeof;
@@ -495,11 +497,13 @@ public class CSRPointer {
         * @param cublasHandle   a valid {@link cublasHandle}
         * @param rows           number of rows in this CSR matrix
         * @param cols           number of columns in this CSR matrix
+        * @param instName          name of the invoking instruction to 
record{@link Statistics}.
         * @return A {@link Pointer} to the allocated dense matrix (in 
column-major format)
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, 
cublasHandle cublasHandle, int rows,
-                       int cols) throws DMLRuntimeException {
+                       int cols, String instName) throws DMLRuntimeException {
+               long t0 = GPUStatistics.DISPLAY_STATISTICS && instName != null 
? System.nanoTime() : 0;
                LOG.trace("GPU : sparse -> column major dense (inside 
CSRPointer) on " + this + ", GPUContext="
                                + getGPUContext());
                long size = ((long) rows) * getDoubleSizeOf((long) cols);
@@ -512,6 +516,7 @@ public class CSRPointer {
                } else {
                        LOG.debug("in CSRPointer, the values array, row 
pointers array or column indices array was null");
                }
+               if (GPUStatistics.DISPLAY_STATISTICS && instName != null) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
                return A;
        }
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index feb34bc..06327db 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -458,7 +458,7 @@ public class GPUObject {
                        throw new DMLRuntimeException("Expected cusparse to be 
initialized");
                int rows = toIntExact(mat.getNumRows());
                int cols = toIntExact(mat.getNumColumns());
-               
setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle,
 null, rows, cols));
+               
setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle,
 null, rows, cols, null));
        }
 
        /**

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index f4a00ab..7e25299 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -22,18 +22,11 @@ package org.apache.sysml.runtime.matrix.data;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
 import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
-import static 
jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
@@ -175,11 +168,11 @@ public class LibMatrixCUDA {
        }
 
 
-       private static cusparseHandle getCusparseHandle(GPUContext gCtx) throws 
DMLRuntimeException{
+       protected static cusparseHandle getCusparseHandle(GPUContext gCtx) 
throws DMLRuntimeException{
                return gCtx.getCusparseHandle();
        }
 
-       private static cublasHandle getCublasHandle(GPUContext gCtx) throws 
DMLRuntimeException {
+       protected static cublasHandle getCublasHandle(GPUContext gCtx) throws 
DMLRuntimeException {
                return gCtx.getCublasHandle();
        }
 
@@ -410,7 +403,7 @@ public class LibMatrixCUDA {
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
                if(isInSparseFormat(gCtx, left)) {
                        // For sparse TSMM, invoke matmult (TODO: possible 
performance improvement)
-                       matmult(ec, gCtx, instName, left, left, outputName, 
isLeftTransposed, !isLeftTransposed);
+                       LibMatrixCuMatMult.matmult(ec, gCtx, instName, left, 
left, outputName, isLeftTransposed, !isLeftTransposed);
                        return;
                }
 
@@ -481,534 +474,6 @@ public class LibMatrixCUDA {
        //******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/
        //********************************************************************/
 
-       //********************************************************************/
-       //***************** MATRIX MULTIPLY Functions ************************/
-       //********************************************************************/
-
-       /**
-        * Matrix multiply on GPU
-        * Examines sparsity and shapes and routes call to appropriate method
-        * from cuBLAS or cuSparse
-        * C = op(A) x op(B)
-        * <p>
-        * Memory Requirements -
-        * Both dense - inputs, output, no intermediate
-        * Both sparse - inputs, output, no intermediate
-        * One sparse, one dense - inputs, output, intermediates - (input_dim1 
* input_dim2) OR (input_dim1 * input_dim2 + input in sparse format)
-        *
-        * @param ec                Current {@link ExecutionContext} instance
-        * @param gCtx              a valid {@link GPUContext}
-        * @param instName          name of the invoking instruction to 
record{@link Statistics}.
-        * @param left              Matrix A
-        * @param right             Matrix B
-        * @param outputName        Name of the output matrix C (in code 
generated after LOP layer)
-        * @param isLeftTransposed  op for A, transposed or not
-        * @param isRightTransposed op for B, tranposed or not
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        * @return output of matrix multiply
-        */
-       public static MatrixObject matmult(ExecutionContext ec, GPUContext 
gCtx, String instName, MatrixObject left, MatrixObject right, String outputName,
-                       boolean isLeftTransposed, boolean isRightTransposed) 
throws DMLRuntimeException {
-               if (ec.getGPUContext(0) != gCtx)
-                       throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
-               }
-               if(!left.getGPUObject(gCtx).isAllocated() || 
!right.getGPUObject(gCtx).isAllocated())
-                       throw new DMLRuntimeException("One of input is not 
allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + 
right.getGPUObject(gCtx).isAllocated());
-
-               boolean bothDense = !left.getGPUObject(gCtx).isSparse() && 
!right.getGPUObject(gCtx).isSparse();
-               boolean bothSparse = left.getGPUObject(gCtx).isSparse() && 
right.getGPUObject(gCtx).isSparse();
-
-               MatrixObject output = ec.getMatrixObject(outputName);
-
-               long outRLen = isLeftTransposed ? left.getNumColumns() : 
left.getNumRows();
-               long outCLen = isRightTransposed ? right.getNumRows() : 
right.getNumColumns();
-
-               if (bothDense) {                // Dense C = Dense A * Dense B
-                       // For both dense, do cuBLAS
-                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName, outRLen, outCLen); // Allocated the dense output matrix
-                       denseDenseMatmult(gCtx, instName, output, left, right, 
isLeftTransposed, isRightTransposed);
-               }
-               else if (bothSparse){   // Sparse C = Sparse A * Sparse B
-                       ec.allocateGPUMatrixObject(outputName, outRLen, 
outCLen);
-                       bothSparseMatmult(gCtx, instName, output, left, right, 
isLeftTransposed, isRightTransposed);
-               }
-               else {  // Either of A or B is sparse, Sparse C = Sparse/Dense 
A * Dense/Sparse B
-                       // Convert the dense to sparse and use the 
cusparseDcsrgemm routine
-                       ec.allocateGPUMatrixObject(outputName, outRLen, 
outCLen);
-                       eitherSparseMatmult(gCtx, instName, output, left, 
right, isLeftTransposed, isRightTransposed);
-               }
-
-               return output;
-       }
-
-       /**
-        * One of the matrices is sparse, the other dense
-        * C = op(A) x op(B)
-        *
-        * @param gCtx              a valid {@link GPUContext}
-        * @param instName          the invoking instruction's name for record 
{@link Statistics}.
-        * @param output            allocated output object for C on host to 
which GPU output will be attached
-        * @param left              Matrix A on host
-        * @param right             Matrix B on host
-        * @param isLeftTransposed  op for A, tranposed or not
-        * @param isRightTransposed op for B, transposed or not
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       private static void eitherSparseMatmult(GPUContext gCtx, String 
instName, MatrixObject output, MatrixObject left, MatrixObject right,
-                       boolean isLeftTransposed, boolean isRightTransposed) 
throws DMLRuntimeException {
-
-               int m = toInt(isLeftTransposed ? left.getNumColumns() : 
left.getNumRows()) ;
-               int n = toInt(isRightTransposed ? right.getNumRows() : 
right.getNumColumns());
-               int k = toInt(isLeftTransposed ? left.getNumRows() :  
left.getNumColumns());
-               int k1 = toInt(isRightTransposed ? right.getNumColumns() : 
right.getNumRows());
-               if(k != k1)
-                       throw new DMLRuntimeException("Dimension mismatch: " + 
k + " != " + k1);
-
-               if(m == -1 || n == -1 || k == -1)
-                       throw new DMLRuntimeException("Incorrect dimensions");
-
-
-               if (left.getGPUObject(gCtx).isSparse()) {
-                       // Left sparse, right dense
-                       sparseDenseMatmult(gCtx, instName, output, left, right, 
isLeftTransposed, isRightTransposed, m, n, k);
-               } else {
-                       // Left dense, right sparse
-                       denseSparseMatmult(gCtx, instName, left, right, output, 
isLeftTransposed, isRightTransposed, m, n, k);
-               }
-       }
-
-       /**
-        * C = op(A) * op(B) where A is dense and B is sparse
-        * If B is ultrasparse, A is converted to a sparse matrix and {@code 
sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, 
CSRPointer)} is invoked
-        * otherwise B is converted to a dense matrix and {@code 
denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, 
Pointer)} is invoked.
-        *
-        * @param gCtx              a valid {@link GPUContext}
-        * @param instName          the invoking instruction's name for record 
{@link Statistics}.
-        * @param left              {@link MatrixObject} of A
-        * @param right             {@link MatrixObject} of B
-        * @param output            {@link MatrixObject} of the output matrix C
-        * @param isLeftTransposed  whether matrix A needs to be transposed
-        * @param isRightTransposed whether matrix B needs to be transposed
-        * @param m                 ?
-        * @param n                 ?
-        * @param k                 ?
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       private static void denseSparseMatmult(GPUContext gCtx, String 
instName, MatrixObject left, MatrixObject right, MatrixObject output,
-                       boolean isLeftTransposed, boolean isRightTransposed, 
int m, int n, int k)
-                                       throws DMLRuntimeException {
-               // right sparse, left dense
-               CSRPointer B = 
right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-               Pointer ADense = getDensePointer(gCtx, left, instName);
-               if (B.isUltraSparse(k, n)){
-                       if(LOG.isTraceEnabled()) {
-                               LOG.trace(" GPU : Convert d M %*% sp M --> sp M 
%*% sp M)" + ", GPUContext=" + gCtx);
-                       }
-
-                       // Convert left to CSR and do cuSparse matmul
-                       int rowsA = (int)left.getNumRows();
-                       int colsA = (int)left.getNumColumns();
-
-                       long t0=0,t1=0, t2=0;
-                       if (DMLScript.STATISTICS) t0 = System.nanoTime();
-                       Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, 
colsA, colsA, rowsA);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
-
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       CSRPointer A = 
GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, 
rowsA, colsA);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
-
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaDenseToSparseCount.add(1);
-                       sparseSparseMatmult(gCtx, instName, A, B, output, 
isLeftTransposed, isRightTransposed, m, n, k);
-
-                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
-                       A.deallocate();
-                       gCtx.cudaFreeHelper(AT);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
-
-               } else {
-                       if(LOG.isTraceEnabled()) {
-                               LOG.trace(" GPU : Convert d M %*% sp M --> d M 
%*% d M" + ", GPUContext=" + gCtx);
-                       }
-                       // Convert right to dense and do a cuBlas matmul
-                       // BDenseTransposed is a column major matrix
-                       // Note the arguments to denseDenseMatmult to 
accommodate for this.
-                       long t0=0, t1=0;
-                       if (DMLScript.STATISTICS) t0 = System.nanoTime();
-                       Pointer BDenseTransposed = 
B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), 
(int)right.getNumRows(), (int)right.getNumColumns());
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
-
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       boolean allocated = 
output.getGPUObject(gCtx).acquireDeviceModifyDense();       // To allocate the 
dense matrix
-                       if (GPUStatistics.DISPLAY_STATISTICS && allocated) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
-                       Pointer C = getDensePointer(gCtx, output, instName);
-                       denseDenseMatmult(gCtx, instName, C,
-                                       toInt(left.getNumRows()), 
toInt(left.getNumColumns()),
-                                       toInt(right.getNumColumns()), 
toInt(right.getNumRows()),
-                                       isLeftTransposed, !isRightTransposed,
-                                       ADense, BDenseTransposed);
-
-                       gCtx.cudaFreeHelper(instName, BDenseTransposed);
-               }
-       }
-
-       /**
-        * * C = op(A) * op(B) where A is sparse and B is dense
-        * If A is ultrasparse, B is converted to a sparse matrix and {@code 
sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, 
CSRPointer)} is invoked
-        * otherwise A is converted to a dense matrix and {@code 
denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, 
Pointer)} is invoked.
-        *
-        * @param gCtx              a valid {@link GPUContext}
-        * @param instName          the invoking instruction's name for record 
{@link Statistics}.
-        * @param output            the output matrix object
-        * @param left              matrix A
-        * @param right             matrix B
-        * @param isLeftTransposed  if A needs to be transposed
-        * @param isRightTransposed if B needs to be transposed
-        * @param m                 ?
-        * @param n                 ?
-        * @param k                 ?
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       private static void sparseDenseMatmult(GPUContext gCtx, String 
instName, MatrixObject output, MatrixObject left, MatrixObject right,
-                       boolean isLeftTransposed, boolean isRightTransposed, 
int m, int n, int k)
-                                       throws DMLRuntimeException {
-               CSRPointer A = 
left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-               Pointer BDense = getDensePointer(gCtx, right, instName);
-
-               if (n == 1){
-                       // Sparse Matrix - Dense Vector multiply
-                       sparseMatrixDenseVectorMult(gCtx, instName, output, A, 
BDense, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns());
-
-               } else {
-
-                       long t0=0, t1=0, t2=0;
-                       // Sparse Matrix Dense Matrix multiply
-                       if (A.isUltraSparse(m, k)){
-                               if(LOG.isTraceEnabled()) {
-                                       LOG.trace(" GPU : Convert sp M %*% d M 
--> sp M %*% sp M" + ", GPUContext=" + gCtx);
-                               }
-                               // Convert right to CSR and do cuSparse matmul
-                               int rowsB = (int)right.getNumRows();
-                               int colsB = (int)right.getNumColumns();
-
-                               if (DMLScript.STATISTICS) t0 = 
System.nanoTime();
-                               Pointer BT = GPUObject.transpose(gCtx, BDense, 
rowsB, colsB, colsB, rowsB);
-                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
-
-                               if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                               CSRPointer B = 
GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, 
rowsB, colsB);
-                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
-
-                               if (DMLScript.STATISTICS) 
GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
-                               if (DMLScript.STATISTICS) 
GPUStatistics.cudaDenseToSparseCount.add(1);
-
-                               sparseSparseMatmult(gCtx, instName, A, B, 
output, isLeftTransposed, isRightTransposed, m, n, k);
-
-                               if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
-                               B.deallocate();
-                               gCtx.cudaFreeHelper(BT);
-                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
-
-                       } else {
-                               if(LOG.isTraceEnabled()) {
-                                       LOG.trace(" GPU : Convert sp M %*% d M 
--> d M %*% d M" + ", GPUContext=" + gCtx);
-                               }
-                               // Convert left to dense and do a cuBlas matmul
-                               // ADenseTransposed is a column major matrix
-                               // Note the arguments to denseDenseMatmult to 
accommodate for this.
-                               if (DMLScript.STATISTICS) t0 = 
System.nanoTime();
-                               Pointer ADenseTransposed = 
A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), 
(int)left.getNumRows(), (int)left.getNumColumns());
-                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-                               if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
-                               if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
-
-                               if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                               boolean allocated = 
output.getGPUObject(gCtx).acquireDeviceModifyDense();       // To allocate the 
dense matrix
-                               if (allocated && 
GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
-
-                               Pointer C = getDensePointer(gCtx, output, 
instName);
-                               denseDenseMatmult(gCtx, instName, C,
-                                               toInt(left.getNumColumns()), 
toInt(left.getNumRows()),
-                                               toInt(right.getNumRows()), 
toInt(right.getNumColumns()),
-                                               !isLeftTransposed, 
isRightTransposed,
-                                               ADenseTransposed, BDense);
-
-                               gCtx.cudaFreeHelper(instName, ADenseTransposed);
-                       }
-               }
-       }
-
-       /**
-        * C = op(A) x B
-        * A is a sparse matrix, B is a dense vector
-        *
-        * @param gCtx         a valid {@link GPUContext}
-        * @param instName     the invoking instruction's name for record 
{@link Statistics}.
-        * @param output       allocated output on the host, to which the GPU 
output C will be attached
-        * @param A            sparse matrix A on the GPU
-        * @param B_dense      dense matrix/vector B on the GPU
-        * @param isATranposed op for A, tranposed or not
-        * @param m            number of rows in A (not op(A))
-        * @param k            number of cols in A or number of rows in B (not 
op(A) or op(B))
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String 
instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean 
isATranposed,
-                       int m, int k) throws DMLRuntimeException {
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + 
gCtx);
-               }
-               int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-               long size = m * Sizeof.DOUBLE;
-               if (isATranposed){
-                       size = k * Sizeof.DOUBLE;
-                       transA = CUSPARSE_OPERATION_TRANSPOSE;
-               }
-               Pointer C_dense = gCtx.allocate(instName, (int)size);
-               long t1=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-               cusparseDcsrmv(getCusparseHandle(gCtx), transA, m, k, 
(int)A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), 
C_dense);
-               //cudaDeviceSynchronize;        // Since cusparseDcsrmv is 
asynchronously executed
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - 
t1);
-
-               output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
-       }
-
-       /**
-        * Sparse C = Sparse op(A) * Sparse op(B)
-        * Reroutes call to sparse matrix-vector mult if needed
-        *
-        * @param gCtx              a valid {@link GPUContext}
-        * @param instName          the invoking instruction's name for record 
{@link Statistics}.
-        * @param output            ?
-        * @param instName          name of the invoking instruction to 
record{@link Statistics}.
-        * @param left              ?
-        * @param right             ?
-        * @param isLeftTransposed  ?
-        * @param isRightTransposed ?
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       private static void bothSparseMatmult(GPUContext gCtx, String instName, 
MatrixObject output, MatrixObject left, MatrixObject right,
-                       boolean isLeftTransposed, boolean isRightTransposed) 
throws DMLRuntimeException {
-               int m = toInt(isLeftTransposed ? left.getNumColumns() : 
left.getNumRows()) ;
-               int n = toInt(isRightTransposed ? right.getNumRows() : 
right.getNumColumns());
-               int k = toInt(isLeftTransposed ? left.getNumRows() :  
left.getNumColumns());
-               int k1 = toInt(isRightTransposed ? right.getNumColumns() : 
right.getNumRows());
-               if(k != k1)
-                       throw new DMLRuntimeException("Dimension mismatch: " + 
k + " != " + k1);
-
-               if(m == -1 || n == -1 || k == -1)
-                       throw new DMLRuntimeException("Incorrect dimensions");
-
-               CSRPointer A = 
left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-               CSRPointer B = 
right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-
-               // TODO if (m == 1) {   // Vector-matrix multiplication
-
-               if (!isRightTransposed && right.getNumColumns() == 1){  // 
Matrix-Vector multiplication
-                       sparseMatrixVectorMult(gCtx, instName, output, 
isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns(), 
(int)right.getNumRows(), A, B);
-               } else {                                                        
                                        // Matrix-Matrix multiplication
-                       sparseSparseMatmult(gCtx, instName, A, B, output, 
isLeftTransposed, isRightTransposed, m, n, k);
-               }
-       }
-
-       /**
-        * Does a sparse matrix-vector multiply.
-        * C = op(A) x B, A is a sparse matrix, B is a sparse vector with 
numCols = 1.
-        *
-        * @param gCtx         a valid {@link GPUContext}
-        * @param instName     the invoking instruction's name for record 
{@link Statistics}.
-        * @param output       allocated output object C to which the GPU 
output matrix will be attached
-        * @param isATranposed if A is to be transposed or not (the op in op(A))
-        * @param m            number of rows in A (not op(A))
-        * @param n            number of cols in A (not op(A))
-        * @param k            number of rows in B, (cols in B is assumed to be 
1)
-        * @param A            left sparse matrix on GPU
-        * @param B            right sparse vector on GPU
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       private static void sparseMatrixVectorMult(GPUContext gCtx, String 
instName, MatrixObject output, boolean isATranposed, int m, int n, int k,
-                       CSRPointer A, CSRPointer B) throws DMLRuntimeException {
-               long t0=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-               Pointer BDenseVector = 
B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 
1);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-               sparseMatrixDenseVectorMult(gCtx, instName, output, A, 
BDenseVector, isATranposed, m, k);
-       }
-
-       /**
-        * Does a sparse-sparse Matrix multiply
-        * C = op(A) x op(B), A, B are sparse matrices
-        *
-        * @param gCtx              a valid {@link GPUContext}
-        * @param instName          the invoking instruction's name for record 
{@link Statistics}.
-        * @param A                 left sparse matrix on GPU
-        * @param B                 right sparse matrix on GPU
-        * @param output            allocated output object on host to which 
the GPU output matrix will be attached
-        * @param isLeftTransposed  op for A - to be transposed or not
-        * @param isRightTransposed op for B
-        * @param m                 number of rows in op(A)
-        * @param n                 number of cols in op(B)
-        * @param k                 number of cols in op(A) or rows in op(B)
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       private static void sparseSparseMatmult(GPUContext gCtx, String 
instName, CSRPointer A, CSRPointer B, MatrixObject output,
-                       boolean isLeftTransposed, boolean isRightTransposed, 
int m, int n, int k) throws DMLRuntimeException {
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + 
gCtx);
-               }
-
-               int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
-               int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
-
-               long t0=0, t1=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-               CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, 
getCusparseHandle(gCtx), A, transA, B, transB, m, n, k);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
-
-               output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-
-               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-               cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, 
k,
-                               A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
-                               B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
-                               C.descr, C.val, C.rowPtr, C.colInd);
-               //cudaDeviceSynchronize;
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - 
t1);
-       }
-
-       /**
-        * Dense dense matrix multiply
-        * C = op(A) * op(B), A and B are dense matrices
-        *
-        * @param gCtx              a valid {@link GPUContext}
-        * @param instName          name of the invoking instruction to 
record{@link Statistics}.
-        * @param output            output object C on host with GPU data 
allocated
-        * @param left              left matrix A (in row-major order)
-        * @param right             right matrix B (in row-major order)
-        * @param isLeftTransposed  op for A, transposed or not
-        * @param isRightTransposed op for B, transposed or not
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       private static void denseDenseMatmult(GPUContext gCtx, String instName, 
MatrixObject output, MatrixObject left, MatrixObject right,
-                       boolean isLeftTransposed, boolean isRightTransposed) 
throws DMLRuntimeException {
-
-               Pointer leftPtr = getDensePointer(gCtx, left, instName);
-               Pointer rightPtr = getDensePointer(gCtx, right, instName);
-
-               int leftRows = toInt(left.getNumRows());
-               int leftCols = toInt(left.getNumColumns());
-               int rightRows = toInt(right.getNumRows());
-               int rightCols = toInt(right.getNumColumns());
-               Pointer C = getDensePointer(gCtx, output, instName);
-               denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, 
rightRows, rightCols, isLeftTransposed, isRightTransposed,
-                               leftPtr, rightPtr);
-       }
-
-       /**
-        * Dense-dense matrix multiply
-        * C = op(A) * op(B), A and B are dense matrices
-        * On the host, the matrices are in row-major format; cuBLAS expects 
them in column-major format.
-        * What we have as input is t(A) and t(B), t(X) = transpose of X.
-        * We do t(B) %*% t(A) to get t(C);
-        * If we were to calculate t(t(C), we would get the resultant matrix C, 
but this would be in column-major format.
-        * What we really want is t(C). This we already have as the result of 
t(B) %*% t(A).
-        *
-        * @param gCtx               a valid {@link GPUContext}
-        * @param instName           name of the invoking instruction to 
record{@link Statistics}.
-        * @param output             output allocated on GPU in column major 
format
-        * @param leftRows1          number of rows in A
-        * @param leftCols1          number of cols in A
-        * @param rightRows1         number of rows in B
-        * @param rightCols1         number of cols in B
-        * @param isLeftTransposed1  op for A, transposed or not
-        * @param isRightTransposed1 op for B, transposed or not
-        * @param leftPtr            A allocated on the GPU in row-major format
-        * @param rightPtr           B allocated on the GPU in row-major format
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public static void denseDenseMatmult(GPUContext gCtx, String instName, 
Pointer output, int leftRows1, int leftCols1, int rightRows1,
-                       int rightCols1, boolean isLeftTransposed1, boolean 
isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
-                                       throws DMLRuntimeException {
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
-               }
-
-               Pointer A = rightPtr;
-               Pointer B = leftPtr;
-
-               // To compensate for the input matrices being in row-major 
format instead of column-major (the way cublas expects)
-               int leftRows = rightCols1;
-               int leftCols = rightRows1;
-               int rightRows = leftCols1;
-               int rightCols = leftRows1;
-
-               boolean isLeftTransposed = isRightTransposed1;
-               boolean isRightTransposed = isLeftTransposed1;
-
-               // Note: the dimensions are swapped
-               int m = isLeftTransposed ? leftCols : leftRows ;
-               int n = isRightTransposed ? rightRows : rightCols;
-               int k = isLeftTransposed ?  leftRows : leftCols;
-               int k1 = isRightTransposed ?  rightCols : rightRows;
-               if(k != k1)
-                       throw new DMLRuntimeException("Dimension mismatch: " + 
k + " != " + k1);
-
-               if(m == -1 || n == -1 || k == -1)
-                       throw new DMLRuntimeException("Incorrect dimensions");
-
-               double[] one = { 1 };
-               double[] zero = { 0 };
-
-               //int lda = leftRows;
-               //int ldb = leftCols;
-               int lda = isLeftTransposed ?  k : m;
-               int ldb = isRightTransposed ? n : k;
-               int ldc = m;
-
-               int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : 
cublasOperation.CUBLAS_OP_N;
-               int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : 
cublasOperation.CUBLAS_OP_N;
-
-               long t0=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-               Pointer C = output;
-               if (m == 1 && n == 1){
-                       // Vector product
-                       LOG.debug(" GPU Dense-dense Vector Product");
-                       double[] result = {0};
-                       JCublas2.cublasDdot(getCublasHandle(gCtx), k, A, 1, B, 
1, Pointer.to(result));
-                       // By default in CuBlas V2, cublas pointer mode is set 
to CUBLAS_POINTER_MODE_HOST.
-                       // This means that scalar values passed are on host (as 
opposed to on device).
-                       // The result is copied from the host back to the 
device so that the rest of
-                       // infrastructure can treat it uniformly.
-                       cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, 
cudaMemcpyHostToDevice);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0);
-               } else if (m == 1) {
-                       // Vector-matrix multiply
-                       LOG.debug(" GPU Dense Vector-Matrix Multiply");
-                       transb = isRightTransposed ? 
cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
-                       JCublas2.cublasDgemv(getCublasHandle(gCtx), transb, 
rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - 
t0);
-               } else if (n == 1){
-                       // Matrix-vector multiply
-                       LOG.debug(" GPU Dense Matrix-Vector Multiply");
-                       JCublas2.cublasDgemv(getCublasHandle(gCtx), transa, 
leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - 
t0);
-               } else {
-                       LOG.debug(" GPU Dense-Dense Matrix Multiply ");
-                       JCublas2.cublasDgemm(getCublasHandle(gCtx), transa, 
transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - 
t0);
-               }
-       }
-
-       //********************************************************************/
-       //***************** END OF MATRIX MULTIPLY Functions *****************/
-       //********************************************************************/
-
 
        //********************************************************************/
        //****************  UNARY AGGREGATE Functions ************************/

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
new file mode 100644
index 0000000..21a2a35
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -0,0 +1,480 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static 
jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcublas.cublasOperation;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixCuMatMult extends LibMatrixCUDA {
+
+       private static final Log LOG = 
LogFactory.getLog(LibMatrixCuMatMult.class.getName());
+
+       private static class CuMatMultParameters {
+               /*
+                * For the operation, C = op(A) %*% op(B), the below parameters 
are used
+                * to invoke the corresponding kernels in CuBLAS and CuSPARSE.
+                * 
+                * All the below values have to be valid or else this class has 
to throw
+                * an exception. No special values like -1 for unknowns allowed.
+                */
+               public int m; // number of rows of matrix op(A) and C.
+               public int n; // number of columns of matrix op(B) and C.
+               public int k; // number of columns of op(A) and rows of op(B).
+               public int lda; // leading dimension of two-dimensional array 
used to
+                                               // store the matrix A.
+               public int ldb; // leading dimension of two-dimensional array 
used to
+                                               // store matrix B.
+               public int ldc; // leading dimension of a two-dimensional array 
used to
+                                               // store the matrix C.
+               public long leftNumRows; // number of rows of A
+               public long leftNumCols; // number of cols of A
+               public long rightNumRows; // number of rows of B
+               public long rightNumCols; // number of cols of B
+               private boolean isLeftTransposed; // is op(A) = t(A)
+               private boolean isRightTransposed; // is op(B) = t(B)
+
+               public CuMatMultParameters(long leftNumRows1, long 
leftNumCols1, long rightNumRows1, long rightNumCols1,
+                               boolean isLeftTransposed1, boolean 
isRightTransposed1) throws DMLRuntimeException {
+                       leftNumRows = leftNumRows1;
+                       leftNumCols = leftNumCols1;
+                       rightNumRows = rightNumRows1;
+                       rightNumCols = rightNumCols1;
+                       isLeftTransposed = isLeftTransposed1;
+                       isRightTransposed = isRightTransposed1;
+                       setDimensions();
+               }
+
+               public void rowToColumnMajor() throws DMLRuntimeException {
+                       // To compensate for the input matrices being in 
row-major format
+                       // instead of column-major (the way cublas expects)
+                       isRightTransposed = swap(isLeftTransposed, 
isLeftTransposed = isRightTransposed);
+                       rightNumCols = swap(leftNumRows, leftNumRows = 
rightNumCols);
+                       rightNumRows = swap(leftNumCols, leftNumCols = 
rightNumRows);
+                       setDimensions();
+               }
+
+               private void validate() throws DMLRuntimeException {
+                       int k1 = toInt(isRightTransposed ? rightNumCols : 
rightNumRows);
+                       if (k != k1)
+                               throw new DMLRuntimeException("Dimension 
mismatch: " + k + " != " + k1 + " [" + leftNumRows + ","
+                                               + leftNumCols + "," + 
rightNumRows + "," + rightNumCols + "], " + isLeftTransposed + " "
+                                               + isRightTransposed);
+               }
+
+               private void setDimensions() throws DMLRuntimeException {
+                       // Validate the dimensions
+                       m = toInt(isLeftTransposed ? leftNumCols : leftNumRows);
+                       n = toInt(isRightTransposed ? rightNumRows : 
rightNumCols);
+                       k = toInt(isLeftTransposed ? leftNumRows : leftNumCols);
+                       lda = isLeftTransposed ? k : m;
+                       ldb = isRightTransposed ? n : k;
+                       ldc = m;
+                       if (m == -1 || n == -1 || k == -1)
+                               throw new DMLRuntimeException("Incorrect 
dimensions");
+               }
+       }
+
+       /**
+        * Matrix multiply on GPU Examines sparsity and shapes and routes call 
to
+        * appropriate method from cuBLAS or cuSparse C = op(A) x op(B)
+        *
+        * The user is expected to call
+        * ec.releaseMatrixOutputForGPUInstruction(outputName);
+        *
+        * @param ec
+        *            Current {@link ExecutionContext} instance
+        * @param gCtx
+        *            a valid {@link GPUContext}
+        * @param instName
+        *            name of the invoking instruction to record{@link 
Statistics}.
+        * @param left
+        *            Matrix A
+        * @param right
+        *            Matrix B
+        * @param outputName
+        *            Name of the output matrix C (in code generated after LOP
+        *            layer)
+        * @param isLeftTransposed
+        *            op for A, transposed or not
+        * @param isRightTransposed
+        *            op for B, tranposed or not
+        * @throws DMLRuntimeException
+        *             if DMLRuntimeException occurs
+        * @return output of matrix multiply
+        */
+       public static MatrixObject matmult(ExecutionContext ec, GPUContext 
gCtx, String instName, MatrixObject left,
+                       MatrixObject right, String outputName, boolean 
isLeftTransposed, boolean isRightTransposed)
+                       throws DMLRuntimeException {
+               boolean isM1Sparse = isInSparseFormat(gCtx, left);
+               boolean isM2Sparse = isInSparseFormat(gCtx, right);
+               MatrixObject output = ec.getMatrixObject(outputName);
+               long outRLen = isLeftTransposed ? left.getNumColumns() : 
left.getNumRows();
+               long outCLen = isRightTransposed ? right.getNumRows() : 
right.getNumColumns();
+
+               CuMatMultParameters params = new 
CuMatMultParameters(left.getNumRows(), left.getNumColumns(),
+                               right.getNumRows(), right.getNumColumns(), 
isLeftTransposed, isRightTransposed);
+
+               if (isM1Sparse && isM2Sparse) {
+                       // 
-------------------------------------------------------------------------------------
+                       // sparse-sparse matrix multiplication
+                       params.validate();
+                       int transa = cusparseOp(isLeftTransposed);
+                       int transb = cusparseOp(isRightTransposed);
+
+                       // Step 1: Allocate output => sparse format
+                       ec.allocateGPUMatrixObject(outputName, outRLen, 
outCLen);
+
+                       // Step 2: Get the handles to sparse/dense pointers for 
left, right
+                       // and output
+                       CSRPointer A = 
left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+                       CSRPointer B = 
right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+                       long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       CSRPointer C = 
CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, 
B, transb,
+                                       params.m, params.n, params.k);
+                       if (GPUStatistics.DISPLAY_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB,
+                                               System.nanoTime() - t0);
+
+                       // Step 3: Invoke the kernel
+                       long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       JCusparse.cusparseDcsrgemm(getCusparseHandle(gCtx), 
transa, transb, params.m, params.n, params.k, A.descr,
+                                       (int) A.nnz, A.val, A.rowPtr, A.colInd, 
B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
+                                       C.val, C.rowPtr, C.colInd);
+                       if (GPUStatistics.DISPLAY_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB,
+                                               System.nanoTime() - t1);
+                       output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+                       // 
-------------------------------------------------------------------------------------
+               } else if (!isM1Sparse && isM2Sparse) {
+                       // 
-------------------------------------------------------------------------------------
+                       // dense-sparse matrix multiplication
+                       // Step 1: Allocate output => dense format
+                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName, outRLen, outCLen);
+
+                       // Step 2: Get the handles to sparse/dense pointers for 
left, right
+                       // and output
+                       Pointer A = getDensePointer(gCtx, left, instName);
+                       CSRPointer B = 
right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+                       Pointer C = getDensePointer(gCtx, output, instName);
+
+                       // Step 3: Invoke the kernel
+                       denseSparseMatMult(getCusparseHandle(gCtx), instName, 
C, A, B, params);
+                       // 
-------------------------------------------------------------------------------------
+               } else if (isM1Sparse && !isM2Sparse) {
+                       // 
-------------------------------------------------------------------------------------
+                       // sparse-dense matrix multiplication
+                       // Step 1: Allocate output => dense format
+                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName, outRLen, outCLen);
+
+                       // Step 2: Get the handles to sparse/dense pointers for 
left, right
+                       // and output
+                       CSRPointer A = 
left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+                       Pointer B = getDensePointer(gCtx, right, instName);
+                       Pointer C = getDensePointer(gCtx, output, instName);
+
+                       // Step 3: Invoke the kernel
+                       sparseDenseMatMult(gCtx, instName, C, A, B, 
left.getNumRows(), left.getNumColumns(), right.getNumRows(),
+                                       right.getNumColumns(), outRLen, 
outCLen, isLeftTransposed, isRightTransposed);
+                       // 
-------------------------------------------------------------------------------------
+               } else {
+                       // 
-------------------------------------------------------------------------------------
+                       // dense-dense matrix multiplication
+                       // Step 1: Allocate output => dense format
+                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName, outRLen, outCLen);
+
+                       // Step 2: Get the handles to sparse/dense pointers for 
left, right
+                       // and output
+                       Pointer A = getDensePointer(gCtx, left, instName);
+                       Pointer B = getDensePointer(gCtx, right, instName);
+                       Pointer C = getDensePointer(gCtx, output, instName);
+
+                       // Step 3: Invoke the kernel
+                       denseDenseMatMult(getCublasHandle(gCtx), instName, C, 
A, B, params);
+                       // 
-------------------------------------------------------------------------------------
+               }
+               return output;
+       }
+
+       /**
+        * Internal method to invoke the appropriate CuSPARSE kernel for matrix
+        * multiplication for operation: C = op(A) * op(B) This assumes B and C 
are
+        * allocated in dense row-major format and A is sparse.
+        * 
+        * Other than input and output, this method requires additional memory =
+        * outRLen * outCLen * Sizeof.DOUBLE
+        * 
+        * @param gCtx
+        *            a valid {@link GPUContext}
+        * @param instName
+        *            name of the invoking instruction to record{@link 
Statistics}.
+        * @param C
+        *            output matrix pointer
+        * @param A
+        *            left matrix pointer
+        * @param B
+        *            right matrix pointer
+        * @param leftNumRows
+        *            number of rows of A
+        * @param leftNumColumns
+        *            number of cols of A
+        * @param rightNumRows
+        *            number of rows of B
+        * @param rightNumColumns
+        *            number of cols of B
+        * @param outRLen
+        *            number of rows of C
+        * @param outCLen
+        *            number of cols of C
+        * @param isLeftTransposed
+        *            is op(A) = t(A)
+        * @param isRightTransposed
+        *            is op(B) = t(B)
+        * @throws DMLRuntimeException
+        *             if error
+        */
+       private static void sparseDenseMatMult(GPUContext gCtx, String 
instName, Pointer C, CSRPointer A, Pointer B,
+                       long leftNumRows, long leftNumColumns, long 
rightNumRows, long rightNumColumns, long outRLen, long outCLen,
+                       boolean isLeftTransposed, boolean isRightTransposed) 
throws DMLRuntimeException {
+               // t(C) = t(B) %*% t(A)
+               Pointer output = null;
+               if (outRLen != 1 && outCLen != 1) {
+                       output = gCtx.allocate(outRLen * outCLen * 
Sizeof.DOUBLE);
+               } else {
+                       // no transpose required for vector output
+                       output = C;
+               }
+               CuMatMultParameters params = new 
CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows,
+                               leftNumColumns, !isRightTransposed, 
!isLeftTransposed);
+               denseSparseMatMult(getCusparseHandle(gCtx), instName, output, 
B, A, params);
+               if (outRLen != 1 && outCLen != 1) {
+                       // Transpose: C = t(output)
+                       long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       JCublas2.cublasDgeam(gCtx.getCublasHandle(), 
cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
+                                       toInt(outCLen), toInt(outRLen), one(), 
output, toInt(outRLen), zero(), new Pointer(),
+                                       toInt(outRLen), C, toInt(outCLen));
+                       if (!DMLScript.EAGER_CUDA_FREE)
+                               JCuda.cudaDeviceSynchronize();
+                       gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE);
+                       if (GPUStatistics.DISPLAY_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime()
+                                               - t0);
+               }
+       }
+
+       /**
+        * Internal method to invoke the appropriate CuSPARSE kernel for matrix
+        * multiplication for operation: C = op(A) * op(B) This assumes B and C 
are
+        * allocated in dense row-major format and A is sparse.
+        * 
+        * @param handle
+        *            cusparse handle
+        * @param instName
+        *            name of the invoking instruction to record{@link 
Statistics}.
+        * @param C
+        *            output matrix pointer
+        * @param A
+        *            left matrix pointer
+        * @param B
+        *            right matrix pointer
+        * @param param
+        *            BLAS parameters
+        * @throws DMLRuntimeException
+        *             if error
+        */
+       private static void denseSparseMatMult(cusparseHandle handle, String 
instName, Pointer C, Pointer A, CSRPointer B,
+                       CuMatMultParameters param) throws DMLRuntimeException {
+               long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               String kernel = 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB;
+               // Ignoring sparse vector dense matrix multiplication and dot 
product
+               boolean isVector = (param.leftNumRows == 1 && 
!param.isLeftTransposed)
+                               || (param.leftNumCols == 1 && 
param.isLeftTransposed);
+               if (isVector) {
+                       LOG.debug(" GPU Sparse-Dense Matrix Vector ");
+                       int m = toInt(param.rightNumRows);
+                       int n = toInt(param.rightNumCols);
+                       int transa = 
reverseCusparseOp(cusparseOp(param.isLeftTransposed));
+                       JCusparse.cusparseDcsrmv(handle, transa, m, n, 
toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
+                                       zero(), C);
+                       kernel = 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB;
+               } else {
+                       int m = toInt(param.rightNumRows);
+                       int k = toInt(param.rightNumCols);
+                       param.rowToColumnMajor();
+                       param.validate();
+                       int transa = 
reverseCusparseOp(cusparseOp(param.isLeftTransposed));
+                       int transb = cusparseOp(param.isRightTransposed);
+                       LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs 
transpose) ");
+                       JCusparse.cusparseDcsrmm2(handle, transa, transb, m, 
param.n, k, toInt(B.nnz), one(), B.descr, B.val,
+                                       B.rowPtr, B.colInd, A, param.ldb, 
zero(), C, param.ldc);
+               }
+               if (GPUStatistics.DISPLAY_STATISTICS)
+                       GPUStatistics.maintainCPMiscTimes(instName, kernel, 
System.nanoTime() - t0);
+       }
+
+       /**
+        * Internal method to invoke the appropriate CuBLAS kernel for matrix
+        * multiplication for operation: C = op(A) * op(B) This assumes A, B 
and C
+        * are allocated in dense format. The caller is expected to invoke
+        * params.rowToColumnMajor().
+        * 
+        * @param handle
+        *            cublas handle
+        * @param instName
+        *            name of the invoking instruction to record{@link 
Statistics}.
+        * @param C
+        *            output matrix pointer
+        * @param A
+        *            left matrix pointer
+        * @param B
+        *            right matrix pointer
+        * @param param
+        *            BLAS parameters
+        * @throws DMLRuntimeException
+        *             if error
+        */
+       private static void denseDenseMatMult(cublasHandle handle, String 
instName, Pointer C, Pointer A, Pointer B,
+                       CuMatMultParameters param) throws DMLRuntimeException {
+               long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               String kernel = null;
+               param.rowToColumnMajor();
+               param.validate();
+               int transa = cublasOp(param.isLeftTransposed);
+               int transb = cublasOp(param.isRightTransposed);
+               B = swap(A, A = B);
+               if (param.m == 1 && param.n == 1) {
+                       // Vector product
+                       LOG.debug(" GPU Dense-dense Vector Product");
+                       double[] result = { 0 };
+                       JCublas2.cublasDdot(handle, param.k, A, 1, B, 1, 
Pointer.to(result));
+                       // By default in CuBlas V2, cublas pointer mode is set 
to
+                       // CUBLAS_POINTER_MODE_HOST.
+                       // This means that scalar values passed are on host (as 
opposed to
+                       // on device).
+                       // The result is copied from the host back to the 
device so that the
+                       // rest of
+                       // infrastructure can treat it uniformly.
+                       cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, 
cudaMemcpyHostToDevice);
+                       kernel = GPUInstruction.MISC_TIMER_DENSE_DOT_LIB;
+               } else if (param.m == 1) {
+                       // Vector-matrix multiply
+                       LOG.debug(" GPU Dense Vector-Matrix Multiply");
+                       transb = reverseCublasOp(transb);
+                       int rightNumRows = (transb == 
CUSPARSE_OPERATION_TRANSPOSE) ? param.k : param.n;
+                       int rightNumCols = (transb == 
CUSPARSE_OPERATION_TRANSPOSE) ? param.n : param.k;
+                       JCublas2.cublasDgemv(handle, transb, rightNumRows, 
rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
+                       kernel = 
GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB;
+               } else if (param.n == 1) {
+                       // Matrix-vector multiply
+                       LOG.debug(" GPU Dense Matrix-Vector Multiply");
+                       int leftNumRows = (transa == 
CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.m : param.k;
+                       int leftNumCols = (transa == 
CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.k : param.m;
+                       JCublas2.cublasDgemv(handle, transa, leftNumRows, 
leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
+                       kernel = 
GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB;
+               } else {
+                       LOG.debug(" GPU Dense-Dense Matrix Multiply ");
+                       JCublas2.cublasDgemm(handle, transa, transb, param.m, 
param.n, param.k, one(), A, param.lda, B, param.ldb,
+                                       zero(), C, param.ldc);
+                       kernel = 
GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB;
+               }
+               if (GPUStatistics.DISPLAY_STATISTICS)
+                       GPUStatistics.maintainCPMiscTimes(instName, kernel, 
System.nanoTime() - t0);
+       }
+
+       // Convenient methods to swap two values
+       // Usage: y = swap(x, x=y);
+       private static long swap(long x, long y) {
+               return x;
+       }
+
+       private static boolean swap(boolean x, boolean y) {
+               return x;
+       }
+
+       private static Pointer swap(Pointer x, Pointer y) {
+               return x;
+       }
+
+       /**
+        * Convenient wrapper to return appropriate cuSPARSE trans value
+        * 
+        * @param isTransposed
+        *            is op(input) = t(input)
+        * @return CUSPARSE_OPERATION_TRANSPOSE or 
CUSPARSE_OPERATION_NON_TRANSPOSE
+        */
+       private static int cusparseOp(boolean isTransposed) {
+               return isTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
+       }
+
+       /**
+        * Convenient wrapper to return appropriate cuBLAS trans value
+        * 
+        * @param isTransposed
+        *            is op(input) = t(input)
+        * @return CUBLAS_OP_T or CUBLAS_OP_N
+        */
+       private static int cublasOp(boolean isTransposed) {
+               return isTransposed ? cublasOperation.CUBLAS_OP_T : 
cublasOperation.CUBLAS_OP_N;
+       }
+
+       /**
+        * Flips the cuBLAS trans value
+        * 
+        * @param trans
+        *            can be CUBLAS_OP_T or CUBLAS_OP_N
+        * @return CUBLAS_OP_N if trans is CUBLAS_OP_T else CUBLAS_OP_T
+        */
+       private static int reverseCublasOp(int trans) {
+               return trans == cublasOperation.CUBLAS_OP_T ? 
cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
+       }
+
+       /**
+        * Flips the cuSPARSE trans value
+        * 
+        * @param trans
+        *            can be CUSPARSE_OPERATION_NON_TRANSPOSE or
+        *            CUSPARSE_OPERATION_TRANSPOSE
+        * @return CUSPARSE_OPERATION_NON_TRANSPOSE if trans is
+        *         CUSPARSE_OPERATION_TRANSPOSE else 
CUSPARSE_OPERATION_TRANSPOSE
+        */
+       private static int reverseCusparseOp(int trans) {
+               return trans == CUSPARSE_OPERATION_TRANSPOSE ? 
CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
+       }
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java 
b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
index 81bc254..d983716 100644
--- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -50,9 +50,81 @@ public class MatrixMultiplicationOpTest extends GPUTests {
        public void matrixMatrixTest1() {
                String scriptStr = "O = X %*% Y";
 
-               int[] X1 = { 1, 128, 513, 1024 };
-               int[] X2 = { 128, 512, 1024 };
-               int[] Y2 = { 1, 128, 513, 1024 };
+               int[] X1 = { 1, 128, 1024 };
+               int[] X2 = { 1, 128, 1024 };
+               int[] Y2 = { 1, 128, 1024 };
+               double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+               double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+               for (int x1 = 0; x1 < X1.length; x1++) {
+                       for (int x2 = 0; x2 < X2.length; x2++) {
+                               int y1 = x2;
+                               for (int y2 = 0; y2 < Y2.length; y2++) {
+                                       for (int sx = 0; sx < SX.length; sx++) {
+                                               for (int sy = 0; sy < 
SY.length; sy++) {
+                                                       
assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], 
SY[sy]);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       @Test
+       public void commonCaseMLMatrixMatrixTest1() {
+               String scriptStr = "O = X %*% Y";
+
+               int[] X1 = { 1000000 };
+               int[] X2 = { 1000 };
+               int[] Y2 = { 1, 20 };
+               double[] SX = { 0.0, 0.03, 0.3 };
+               double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+               for (int x1 = 0; x1 < X1.length; x1++) {
+                       for (int x2 = 0; x2 < X2.length; x2++) {
+                               int y1 = x2;
+                               for (int y2 = 0; y2 < Y2.length; y2++) {
+                                       for (int sx = 0; sx < SX.length; sx++) {
+                                               for (int sy = 0; sy < 
SY.length; sy++) {
+                                                       
assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], 
SY[sy]);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       @Test
+       public void commonCaseDLMatrixMatrixTest1() {
+               String scriptStr = "O = X %*% Y";
+
+               int[] X1 = { 100 };
+               int[] X2 = { 600, 900  };
+               int[] Y2 = { 205800 };
+               double[] SX = { 0.0, 0.03, 0.3 };
+               double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+               for (int x1 = 0; x1 < X1.length; x1++) {
+                       for (int x2 = 0; x2 < X2.length; x2++) {
+                               int y1 = x2;
+                               for (int y2 = 0; y2 < Y2.length; y2++) {
+                                       for (int sx = 0; sx < SX.length; sx++) {
+                                               for (int sy = 0; sy < 
SY.length; sy++) {
+                                                       
assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], 
SY[sy]);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       @Test
+       public void commonCaseDLMatrixMatrixTest2() {
+               String scriptStr = "O = X %*% Y";
+
+               int[] X1 = { 64 };
+               int[] X2 = { 196608   };
+               int[] Y2 = { 512 };
                double[] SX = { 0.0, 0.03, 0.3, 0.9 };
                double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -74,9 +146,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
        public void matrixMatrixTest2() {
                String scriptStr = "O = X %*% t(Y)";
 
-               int[] X1 = { 1, 128, 513, 1024 };
-               int[] X2 = { 128, 512, 1024 };
-               int[] Y1 = { 1, 128, 513, 1024 };
+               int[] X1 = { 1, 128, 1024 };
+               int[] X2 = { 1, 128, 1024 };
+               int[] Y1 = { 1, 128, 1024 };
                double[] SX = { 0.0, 0.03, 0.3, 0.9 };
                double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -98,9 +170,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
        public void matrixMatrixTest3() {
                String scriptStr = "O = t(X) %*% Y";
 
-               int[] X1 = { 1, 128, 513, 1024 };
-               int[] X2 = { 128, 512, 1024 };
-               int[] Y2 = { 1, 128, 513, 1024 };
+               int[] X1 = { 1, 128, 1024 };
+               int[] X2 = { 1, 128, 1024 };
+               int[] Y2 = { 1, 128, 1024 };
                double[] SX = { 0.0, 0.03, 0.3, 0.9 };
                double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -122,9 +194,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
        public void matrixMatrixTest4() {
                String scriptStr = "O = t(X) %*% t(Y)";
 
-               int[] X1 = { 1, 128, 513, 1024 };
-               int[] X2 = { 128, 512, 1024 };
-               int[] Y1 = { 1, 128, 513, 1024 };
+               int[] X1 = { 1, 128, 1024 };
+               int[] X2 = { 1, 128, 1024 };
+               int[] Y1 = { 1, 128, 1024 };
                double[] SX = { 0.0, 0.03, 0.3, 0.9 };
                double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -146,7 +218,7 @@ public class MatrixMultiplicationOpTest extends GPUTests {
        public void transposeSelfMatrixMultiply() {
                String scriptStr = "O = t(X) %*% X";
 
-               int[] sizes = { 1, 128, 512, 1024, 2049 };
+               int[] sizes = { 1, 128, 1024 };
                double[] sparsities = { 0.0, 0.03, 0.3, 0.9 };
 
                for (int i = 0; i < sizes.length; i++) {

Reply via email to