Repository: systemml
Updated Branches:
  refs/heads/master e0006a272 -> 8e3c6f8b8


http://git-wip-us.apache.org/repos/asf/systemml/blob/8e3c6f8b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 87bac47..fe3edd2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -141,6 +141,9 @@ public abstract class GPUInstruction extends Instruction {
        public final static String MISC_TIMER_RELU_KERNEL =                    "nnrk";  // time spent in the relu kernel
        public final static String MISC_TIMER_CUDNN_INIT =                     "nni";   // time spent in initializations for cudnn call
        public final static String MISC_TIMER_CUDNN_CLEANUP =                  "nnc";   // time spent in cleanup for cudnn call
+       public final static String MISC_TIMER_DENSE_IM2COL_KERNEL =            "nndim2c";  // time spent in dense im2col CUDA kernel
+       public final static String MISC_TIMER_SPARSE_IM2COL_KERNEL =           "nnsim2c";  // time spent in sparse im2col CUDA kernel
+       public final static String MISC_TIMER_DENSE_REORG_KNPQ_KERNEL =        "nndrknpq"; // time spent in dense reorg_knpq CUDA kernel
 
        protected GPUINSTRUCTION_TYPE _gputype;
        protected Operator _optr;

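For context, the three new timer constants are consumed through the fine-grained statistics pattern used throughout the hunks below; a minimal sketch of that pattern (the kernel-launch call is a hypothetical placeholder, everything else appears verbatim in this patch):

    // Sketch of the fine-grained GPU timer pattern; launchDenseIm2colKernel()
    // is a hypothetical stand-in for the actual CUDA kernel launch.
    long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
    launchDenseIm2colKernel();
    if (DMLScript.FINEGRAINED_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(instName,
            GPUInstruction.MISC_TIMER_DENSE_IM2COL_KERNEL, System.nanoTime() - t1);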
http://git-wip-us.apache.org/repos/asf/systemml/blob/8e3c6f8b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 22cdbcb..c8e70bf 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -514,6 +514,57 @@ public class GPUObject {
                boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && getJcudaSparseMatrixPtr().nnz == 0;
                return isEmptyAndSparseAndAllocated;
        }
+
+       /**
+        * Being allocated is a prerequisite for computing nnz.
+        * Note: if the matrix is in dense format, it explicitly re-computes the number of nonzeros.
+        *
+        * @param instName instruction name
+        * @param recomputeDenseNNZ recompute NNZ if dense
+        * @return the number of nonzeros
+        * @throws DMLRuntimeException if error
+        */
+       public long getNnz(String instName, boolean recomputeDenseNNZ) throws DMLRuntimeException {
+               if(isAllocated()) {
+                       if(LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
+                               return getJcudaSparseMatrixPtr().nnz;
+                       }
+                       else {
+                               if(!recomputeDenseNNZ)
+                                       return -1;
+
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
+                               GPUContext gCtx = getGPUContext();
+                               cusparseHandle cusparseHandle = gCtx.getCusparseHandle();
+                               cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
+                               if (cusparseHandle == null)
+                                       throw new DMLRuntimeException("Expected cusparse to be initialized");
+                               int rows = toIntExact(mat.getNumRows());
+                               int cols = toIntExact(mat.getNumColumns());
+                               Pointer nnzPerRowPtr = null;
+                               Pointer nnzTotalDevHostPtr = null;
+                               gCtx.ensureFreeSpace(getIntSizeOf(rows + 1));
+                               nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows));
+                               nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1));
+                               LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getJcudaDenseMatrixPtr(), rows,
+                                               nnzPerRowPtr, nnzTotalDevHostPtr);
+                               int[] nnzC = { -1 };
+                               cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+                               if (nnzC[0] == -1) {
+                                       throw new DMLRuntimeException(
+                                                       "cusparseDnnz did not calculate the correct number of nnz on the GPU");
+                               }
+                               gCtx.cudaFreeHelper(nnzPerRowPtr);
+                               gCtx.cudaFreeHelper(nnzTotalDevHostPtr);
+                               if(DMLScript.FINEGRAINED_STATISTICS) {
+                                       GPUStatistics.maintainCPMiscTimes(instName, CPInstruction.MISC_TIMER_RECOMPUTE_NNZ, System.nanoTime()-t1);
+                               }
+                               return nnzC[0];
+                       }
+               }
+               else
+                       throw new DMLRuntimeException("Expected the GPU object to be allocated");
+       }
 
        public boolean acquireDeviceRead(String opcode) throws DMLRuntimeException {
                if(LOG.isTraceEnabled()) {

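A hedged usage sketch of the new GPUObject.getNnz (variable names assumed from the surrounding code): with recomputeDenseNNZ=false it returns the tracked nnz for sparse data and -1 for dense data; with true it runs the cusparse<t>nnz pass shown above on dense data:

    // Hypothetical call site; mo, gCtx and instName as in the hunks below.
    long nnz = mo.getGPUObject(gCtx).getNnz(instName, /*recomputeDenseNNZ=*/ true);
    if (nnz == 0) {
        // matrix is all zeros; a consuming operation can short-circuit
    }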
http://git-wip-us.apache.org/repos/asf/systemml/blob/8e3c6f8b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 1433b5a..4f2de29 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -192,6 +192,25 @@ public class LibMatrixCUDA {
                        return mo.getGPUObject(gCtx).isSparse();
                return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
        }
+
+       /**
+        * Note: if the matrix is in dense format, it explicitly re-computes the number of nonzeros.
+        *
+        * @param gCtx a valid GPU context
+        * @param instName instruction name
+        * @param mo matrix object
+        * @param recomputeDenseNNZ recompute NNZ if dense
+        * @return number of non-zeroes
+        * @throws DMLRuntimeException if error
+        */
+       public static long getNnz(GPUContext gCtx, String instName, MatrixObject mo, boolean recomputeDenseNNZ) throws DMLRuntimeException {
+               if(mo.getGPUObject(gCtx) != null && mo.getGPUObject(gCtx).isAllocated()) {
+                       return mo.getGPUObject(gCtx).getNnz(instName, recomputeDenseNNZ);
+               }
+               else {
+                       return mo.getNnz();
+               }
+       }
 
 
        protected static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{

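Note the interplay with the -1 sentinel: when the GPU copy is dense and recomputeDenseNNZ is false, this wrapper returns -1 rather than 0, so the empty-input checks in the conv2d hunks below stay conservative (they only fire when nnz is known exactly). A sketch of such a check, with names taken from those hunks:

    // Conservative emptiness test: -1 (unknown nnz) does not trigger the fast path.
    long filterNnz = LibMatrixCUDA.getNnz(gCtx, instName, filter, false);
    if (filterNnz == 0)
        return; // output stays zero, skip the convolution entirely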
http://git-wip-us.apache.org/repos/asf/systemml/blob/8e3c6f8b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index c88cfd2..d07c2b2 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -50,6 +50,7 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
 import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.utils.GPUStatistics;
@@ -63,6 +64,10 @@ import static jcuda.jcudnn.cudnnSoftmaxMode.CUDNN_SOFTMAX_MODE_CHANNEL;
  */
 public class LibMatrixCuDNN extends LibMatrixCUDA {
 
+       // Currently we only use the nnz information of the sparse matrix, which is pre-computed.
+       // TODO: experiment with how often the dense matrix is empty, i.e. whether recomputing nnz before calling CuDNN would help.
+       private static final boolean RECOMPUTE_DENSE_NNZ = false;
+
        protected static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
        private static final Log LOG = LogFactory.getLog(LibMatrixCuDNN.class.getName());
 
@@ -103,7 +108,60 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                biasAdd(gCtx, instName, output, bias, output);
        }
        
-
+       /**
+        * Performs im2col operation on GPU
+        *
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link Statistics}.
+        * @param image input matrix object
+        * @param isSparseImage is input image sparse
+        * @param N        number of input images
+        * @param C        number of channels
+        * @param H        height of each image
+        * @param W        width of each image
+        * @param R        height of filter
+        * @param S        width of filter
+        * @param pad_h    padding height
+        * @param pad_w    padding width
+        * @param stride_h stride height
+        * @param stride_w stride width
+        * @param P        output height
+        * @param Q        output width
+        * @return output im2col pointer (the caller is expected to free this pointer) or null if image is an empty matrix
+        * @throws DMLRuntimeException if error
+        */
+       private static Pointer denseIm2col(GPUContext gCtx, String instName, MatrixObject image, boolean isSparseImage, long N, long C, long H, long W,
+                       int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
+               Pointer im2colPointer = null;
+               long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
+               if(isSparseImage) {
+                       CSRPointer inPointer = getSparsePointer(gCtx, image, instName);
+                       if(inPointer.nnz < 0) {
+                               throw new DMLRuntimeException("Unknown number of nonzeroes in denseIm2col");
+                       }
+                       else if(inPointer.nnz > 0) {
+                               im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType);
+                               getCudaKernels(gCtx).launchKernel("sparse_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(inPointer.nnz)),
+                                               inPointer.val, inPointer.rowPtr, inPointer.colInd, im2colPointer, inPointer.nnz, N,
+                                               C*H*W, H*W, W, R, S, P, Q, P*Q, R*S, N*P*Q, stride_h, stride_w, pad_h, pad_w);
+                               if (DMLScript.FINEGRAINED_STATISTICS)
+                                       GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_IM2COL_KERNEL, System.nanoTime() - t1);
+                       }
+                       else
+                               return null;
+               }
+               else {
+                       im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType);
+                       Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
+                       getCudaKernels(gCtx).launchKernel("dense_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*C*H*W)),
+                                       imagePointer, im2colPointer, N*C*H*W,
+                                       C*H*W, H*W, W, R, S, P, Q, P*Q, R*S, N*P*Q, stride_h, stride_w, pad_h, pad_w);
+                       if (DMLScript.FINEGRAINED_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_IM2COL_KERNEL, System.nanoTime() - t1);
+               }
+               return im2colPointer;
+       }
+
        /**
         * Performs a 2D convolution
         * 
@@ -133,32 +191,68 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
+               long NPQ = N*P*Q;
+
+               boolean isSparseFilter = isInSparseFormat(gCtx, filter);
+               long filterNnz = getNnz(gCtx, instName, filter, RECOMPUTE_DENSE_NNZ);
+               if(filterNnz == 0) {
+                       return; // since filter is empty
+               }
+               boolean isSparseImage = isInSparseFormat(gCtx, image);
+               long imageNnz = getNnz(gCtx, instName, image, RECOMPUTE_DENSE_NNZ);
+               if(imageNnz == 0) {
+                       return; // since image is empty
+               }
+               Pointer dstPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
                
                if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
-                       // Filter and output are accounted as dense in the memory estimation for conv2d
-                       double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
-                       overhead += isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
-
-                       Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
-                       Pointer dstPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
-                       
-                       // Required for LibMatrixCuDNNConvolutionAlgorithm
-                       long workspaceLimit = (long) (intermediateMemoryBudget-overhead);
-                       int localN = overhead <= intermediateMemoryBudget ? N : 1;
-                       
-                       try(LibMatrixCuDNNConvolutionAlgorithm algo = 
-                                       LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionForwardAlgorithm(gCtx, instName, 
-                                       localN, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, workspaceLimit)) {
-                               if(localN == N) {
-                                       // Perform all-input all-channel conv2d
-                                       Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
-                                       cudnnConv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, algo);
-                               }
-                               else {
-                                       try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image)) {
-                                               for(int n = 0; n < N; n++) {
-                                                       // Perform one-input all-channel conv2d
-                                                       cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*sizeOfDataType), algo);
+                       if(isSparseFilter && 
+                               (OptimizerUtils.estimateSizeExactSparsity(CRS, NPQ, 1.0) + OptimizerUtils.estimateSizeExactSparsity(K, NPQ, 1.0)) < intermediateMemoryBudget) {
+                               // Sparse filter conv2d
+                               // Perform dense im2col
+                               Pointer im2colPointer = denseIm2col(gCtx, instName, image, isSparseImage,
+                                               N, C, H, W, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+
+                               // Perform matrix multiplication
+                               CSRPointer filterPointer = filter.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+                               Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType);
+                               LibMatrixCuMatMult.sparseDenseMatMult(gCtx, instName, matmultOutputPointer, filterPointer, im2colPointer, K, CRS, CRS, NPQ, K, NPQ, false, false);
+                               gCtx.cudaFreeHelper(instName, im2colPointer);
+
+                               // Perform reorg_knpq: reorganize the matmultOutputPointer matrix of dimensions [K, NPQ]
+                               // into the dstPointer matrix of dimensions [N, KPQ]
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
+                               getCudaKernels(gCtx).launchKernel("reorg_knpq", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(NKPQ)),
+                                               matmultOutputPointer, dstPointer, NKPQ, NPQ, KPQ, P*Q);
+                               if (DMLScript.FINEGRAINED_STATISTICS)
+                                       GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_REORG_KNPQ_KERNEL, System.nanoTime() - t1);
+                               gCtx.cudaFreeHelper(instName, matmultOutputPointer);
+                       }
+                       else {
+                               // Filter and output are accounted as dense in the memory estimation for conv2d
+                               double overhead = isSparseFilter ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
+                               overhead += isSparseImage ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
+
+                               Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
+
+                               // Required for LibMatrixCuDNNConvolutionAlgorithm
+                               long workspaceLimit = (long) (intermediateMemoryBudget-overhead);
+                               int localN = overhead <= intermediateMemoryBudget ? N : 1;
+
+                               try(LibMatrixCuDNNConvolutionAlgorithm algo = 
+                                               LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionForwardAlgorithm(gCtx, instName, 
+                                               localN, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, workspaceLimit)) {
+                                       if(localN == N) {
+                                               // Perform all-input all-channel conv2d
+                                               Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
+                                               cudnnConv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, algo);
+                                       }
+                                       else {
+                                               try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image)) {
+                                                       for(int n = 0; n < N; n++) {
+                                                               // Perform one-input all-channel conv2d
+                                                               cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*sizeOfDataType), algo);
+                                                       }
                                                }
                                        }
                                }
@@ -313,11 +407,21 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
                
+               boolean isSparseDout = isInSparseFormat(gCtx, dout);
+               long doutNnz = getNnz(gCtx, instName, dout, RECOMPUTE_DENSE_NNZ);
+               if(doutNnz == 0) {
+                       return; // since dout is empty
+               }
+               boolean isSparseImage = isInSparseFormat(gCtx, image);
+               long imageNnz = getNnz(gCtx, instName, image, RECOMPUTE_DENSE_NNZ);
+               if(imageNnz == 0) {
+                       return; // since image is empty
+               }
                
                if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
                        Pointer dwPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
-                       double overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
-                       overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
+                       double overhead = isSparseImage ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
+                       overhead += isSparseDout ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
 
                        // Required for LibMatrixCuDNNConvolutionAlgorithm
                        long workspaceLimit = (long) (intermediateMemoryBudget-overhead);
@@ -420,10 +524,21 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
+               boolean isSparseFilter = isInSparseFormat(gCtx, filter);
+               long filterNnz = getNnz(gCtx, instName, filter, RECOMPUTE_DENSE_NNZ);
+               if(filterNnz == 0) {
+                       return; // since filter is empty
+               }
+               boolean isSparseDout = isInSparseFormat(gCtx, dout);
+               long doutNnz = getNnz(gCtx, instName, dout, RECOMPUTE_DENSE_NNZ);
+               if(doutNnz == 0) {
+                       return; // since dout is empty
+               }
+
                if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
                        // Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
-                       double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
-                       overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
+                       double overhead = isSparseFilter ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
+                       overhead += isSparseDout ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
                        Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
                        Pointer dstPointer = getDensePointerForCuDNN(gCtx, output, instName);
                        

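The sparse-filter path above is conv2d expressed as a matrix multiplication: im2col expands the image into a dense [CRS, NPQ] matrix, the sparse filter [K, CRS] is multiplied against it to produce [K, NPQ], and the reorg_knpq kernel permutes that result into the [N, KPQ] row-major layout of the conv2d output. A CPU reference sketch of the reorg step; the index math is inferred from the kernel arguments (NKPQ, NPQ, KPQ, P*Q) and should be read as an assumption, not the kernel's actual code:

    // Reference reorg_knpq (index math inferred, not authoritative):
    // in is [K, N*P*Q] row-major (matmult output), out is [N, K*P*Q] row-major.
    static void reorgKnpq(double[] in, double[] out, int N, int K, int PQ) {
        for (int k = 0; k < K; k++)
            for (int n = 0; n < N; n++)
                for (int pq = 0; pq < PQ; pq++)
                    out[n*K*PQ + k*PQ + pq] = in[k*N*PQ + n*PQ + pq];
    }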
http://git-wip-us.apache.org/repos/asf/systemml/blob/8e3c6f8b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index ce0ad5b..21d9fd1 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -268,7 +268,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
         * @throws DMLRuntimeException
         *             if error
         */
-       private static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
+       static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
                        long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen,
                        boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
                // t(C) = t(B) %*% t(A)

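The change from private to package-private exists so the sparse-filter conv2d path in LibMatrixCuDNN (same package) can reuse this routine; its call, as it appears in the conv2d hunk above:

    // A = filter as CSRPointer [K, CRS], B = im2col output [CRS, NPQ], C = [K, NPQ].
    LibMatrixCuMatMult.sparseDenseMatMult(gCtx, instName, matmultOutputPointer,
        filterPointer, im2colPointer, K, CRS, CRS, NPQ, K, NPQ, false, false);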
http://git-wip-us.apache.org/repos/asf/systemml/blob/8e3c6f8b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java
index 92ae8a3..e30de2c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java
@@ -634,20 +634,27 @@ public class LibMatrixDNNConv2d
                        LibMatrixMult.vectAddInPlace(bias[k], out, cix, PQ);
        }
        
+       // ----------------------------------------------------------------------------------------------
+       // TODO: Support sparse native convolution operations without dense intermediates and a dense matmult.
+       // Currently, these paths are disabled and fall back to the more optimized sparse Java-based operators.
        private static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) {
                // NativeHelper.conv2dBackwardFilterSparseDense only if input is sparse.
                // dout converted to dense if sparse.
-               return params.enableNative && params.input1.isInSparseFormat();
+               // return params.enableNative && params.input1.isInSparseFormat();
+               return false;
        }
        
        private static boolean isEligibleForConv2dSparse(ConvolutionParameters params) {
                // NativeHelper.conv2dSparse only if filter is dense and input is sparse
-               return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat();
+               // return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat();
+               return false;
        }
        
        private static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) {
                // NativeHelper.conv2dBackwardDataDense only if filter is dense.
                // dout converted to dense if sparse.
-               return params.enableNative && !params.input1.isInSparseFormat();
+               // return params.enableNative && !params.input1.isInSparseFormat();
+               return false;
        }
+       // ----------------------------------------------------------------------------------------------
 }
