[MINOR] Refactoring and cleanup of CP convolution operations

This patch refactors the convolution operations to remove unnecessary
and unused code as a preparation step for the support of large dense
blocks.

Furthermore, this includes a fix for special cases of sparse-dense
matrix multiplications over large dense blocks.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/45eec2d2
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/45eec2d2
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/45eec2d2

Branch: refs/heads/master
Commit: 45eec2d258a5239413df8071716011aaabd4d28a
Parents: 20b1b5a
Author: Matthias Boehm <[email protected]>
Authored: Wed Jan 10 19:03:07 2018 -0800
Committer: Matthias Boehm <[email protected]>
Committed: Thu Jan 11 11:45:09 2018 -0800

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 281 ++++----
 .../runtime/matrix/data/LibMatrixDNNConv2d.java | 653 +++++++++++++++++++
 .../LibMatrixDNNConv2dBackwardDataHelper.java   | 114 ----
 .../LibMatrixDNNConv2dBackwardFilterHelper.java | 206 ------
 .../matrix/data/LibMatrixDNNConv2dHelper.java   | 307 ---------
 .../runtime/matrix/data/LibMatrixDNNHelper.java | 526 +--------------
 .../runtime/matrix/data/LibMatrixDNNIm2Col.java | 351 ++++++++++
 .../matrix/data/LibMatrixDNNIm2ColHelper.java   | 419 ------------
 .../matrix/data/LibMatrixDNNPooling.java        | 532 +++++++++++++++
 .../data/LibMatrixDNNPoolingBackwardHelper.java | 299 ---------
 .../matrix/data/LibMatrixDNNPoolingHelper.java  | 170 -----
 .../runtime/matrix/data/LibMatrixDNNRelu.java   |  89 +++
 .../matrix/data/LibMatrixDNNRotate180.java      | 109 ++++
 .../data/LibMatrixDNNRotate180Helper.java       | 110 ----
 .../runtime/matrix/data/LibMatrixMult.java      |   2 +-
 15 files changed, 1865 insertions(+), 2303 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index ca38db3..e8a88d8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -31,8 +31,6 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.instructions.InstructionUtils;
-import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 
 /*
@@ -51,7 +49,7 @@ import org.apache.sysml.runtime.util.ConvolutionUtils;
  * followed by the above mentioned functions are as follows:
  *   execute(LibMatrixDNNHelper.get__Workers(params), params);
  * 3. LibMatrixDNN's execute() method ensures the creation and shutdown of the 
ExecutorService.
- * 4. LibMatrixDNNHelper.get__Workers creates appropriate workers based on the 
runtime characteristics of
+ * 4. LibMatrixDNN__.getWorkers creates appropriate workers based on the 
runtime characteristics of
  * the input data (for example: input activations, filter, dout, ...). For 
code maintenance, these workers
  * are placed in the respective LibMatrixDNN__Helper files.
  * 5. The above mentioned workers may also use additional workers such as 
im2col and rotate180.
@@ -129,18 +127,7 @@ public class LibMatrixDNN {
                loopedConvBwdDataMatMultTime.set(0);
                loopedConvBwdDataCol2ImTime.set(0);
        }
-       
-       // Commonly used operators
-       static BinaryOperator _binaryElementWiseAddition = null;
-       static BinaryOperator _binaryElementWiseMultiplication = null;
-       static {
-               try {
-                       _binaryElementWiseAddition = 
InstructionUtils.parseBinaryOperator("+");
-                       _binaryElementWiseMultiplication = 
InstructionUtils.parseBinaryOperator("*");
-               } catch (DMLRuntimeException e) {
-                       throw new RuntimeException("ERROR initializing 
LibMatrixDNN", e);
-               }
-       }
+
        // 
------------------------------------------------------------------------------------------------
        
        /**
@@ -154,11 +141,10 @@ public class LibMatrixDNN {
         */
        public static void conv2d(MatrixBlock input, MatrixBlock filter, 
MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
                LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, 
params);
-               
                if(params.bias != null && params.bias.isInSparseFormat())
                        params.bias.sparseToDense(); // Since bias is extremely 
small array
                
-               long nnz = execute(LibMatrixDNNHelper.getConv2dWorkers(params), 
params);
+               long nnz = execute(LibMatrixDNNConv2d.getConv2dWorkers(params), 
params);
                
                //post-processing: maintain nnz
                outputBlock.setNonZeros(nnz);
@@ -177,7 +163,7 @@ public class LibMatrixDNN {
        public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock 
dout, MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
                checkInputsConv2dBackwardData(filter, dout, outputBlock, 
params);
                
-               long nnz = 
execute(LibMatrixDNNHelper.getConv2dBackwardDataWorkers(params), params);
+               long nnz = 
execute(LibMatrixDNNConv2d.getConv2dBackwardDataWorkers(params), params);
                
                //post-processing: maintain nnz
                outputBlock.setNonZeros(nnz);
@@ -196,99 +182,34 @@ public class LibMatrixDNN {
        public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock 
dout, MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
                checkInputsConv2dBackwardFilter(input, dout, outputBlock, 
params);
                
-               
execute(LibMatrixDNNHelper.getConv2dBackwardFilterWorkers(params), params);
+               
execute(LibMatrixDNNConv2d.getConv2dBackwardFilterWorkers(params), params);
                
                //post-processing: maintain nnz
                outputBlock.recomputeNonZeros(); 
                outputBlock.examSparsity();
        }
        
-       
-       private static void checkOrThrowException(String msg, long lhs, long 
rhs) throws DMLRuntimeException {
-               if(lhs != rhs)
-                       throw new DMLRuntimeException(msg + ":" + lhs + " != " 
+ rhs);
-       }
-       private static void checkOrThrowException(String msg, long lhs, long 
rhs1, long rhs2, long rhs3) throws DMLRuntimeException {
-               if(lhs != (rhs1*rhs2*rhs3))
-                       throw new DMLRuntimeException(msg + ":" + lhs + " != (" 
+ rhs1 + " * " + rhs2 + " * " + rhs3);
-       }
-       
-       static void checkInputsConv2dBackwardData(MatrixBlock filter, 
MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  
throws DMLRuntimeException {
-               params.input1 = filter;
-               params.input2 = dout;
-               params.output = outputBlock;
-               checkOrThrowException("Incorrect input to conv2d_backward_data: 
Number of rows of input filter != "
-                               + "number of filters in filter_shape", 
filter.getNumRows(), params.K);
-               checkOrThrowException("Incorrect input to conv2d_backward_data: 
Number of columns of input filter != "
-                               + "channels*filter_height*filter_height in 
filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
-               checkOrThrowException("Incorrect input to conv2d_backward_data: 
Number of rows of input errors != "
-                               + "batch size in input_shape", 
dout.getNumRows(), params.N);
-               checkOrThrowException("Incorrect input to conv2d_backward_data: 
Number of columns of input errors != "
-                               + "expected input error channels*height*width", 
dout.getNumColumns(), params.K, params.P, params.Q);
-               if(params.stride_h <= 0 || params.stride_w <= 0) 
-                       throw new DMLRuntimeException("Only positive strides 
supported:" + params.stride_h + ", " + params.stride_w);
-               
-               if(DMLScript.FINEGRAINED_STATISTICS) {
-                       if(filter.isInSparseFormat() || 
dout.isInSparseFormat()) {
-                               conv2dBwdDataSparseCount.addAndGet(1);
-                       }
-                       else {
-                               conv2dBwdDataDenseCount.addAndGet(1);
-                       }
-               }
-       }
-       
-       static void checkInputsConv2dBackwardFilter(MatrixBlock input, 
MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  
throws DMLRuntimeException {
+       public static void maxpooling(MatrixBlock input, MatrixBlock output, 
ConvolutionParameters params) throws DMLRuntimeException {
                params.input1 = input;
-               params.input2 = dout;
-               params.output = outputBlock;
-               checkOrThrowException("Incorrect input to 
conv2d_backward_filter: Number of rows of input data != "
-                               + "batch size in input_shape", 
input.getNumRows(), params.N);
-               checkOrThrowException("Incorrect input to 
conv2d_backward_filter: Number of columns of input data != "
-                               + "channels*input_height*input_height in 
input_shape", input.getNumColumns(), params.C, params.H, params.W);
-               checkOrThrowException("Incorrect input to 
conv2d_backward_filter: Number of rows of input errors != "
-                               + "batch size in input_shape", 
dout.getNumRows(), params.N);
-               checkOrThrowException("Incorrect input to 
conv2d_backward_filter: Number of columns of input errors != "
-                               + "expected input error channels*height*width", 
dout.getNumColumns(), params.K, params.P, params.Q);
-               if(params.stride_h <= 0 || params.stride_w <= 0) 
-                       throw new DMLRuntimeException("Only positive strides 
supported:" + params.stride_h + ", " + params.stride_w);
+               params.output = output;
                
-               if(DMLScript.FINEGRAINED_STATISTICS) {
-                       if(input.isInSparseFormat() || dout.isInSparseFormat()) 
{
-                               conv2dBwdFilterSparseCount.addAndGet(1);
-                       }
-                       else {
-                               conv2dBwdFilterDenseCount.addAndGet(1);
-                       }
+               if(input.getNumColumns() != params.C*params.H*params.W || 
input.getNumRows() != params.N) {
+                       throw new DMLRuntimeException("Incorrect input 
dimensions in maxpooling:" + input.getNumRows() + " " 
+                               + input.getNumColumns() + " " + params.N + " " 
+ params.C*params.H*params.W);
                }
-       }
-       
-       static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, 
MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
-               params.input1 = input;
-               params.input2 = filter;
-               params.output = outputBlock;
                
-               checkOrThrowException("Incorrect input to conv2d: Number of 
rows of input filter != "
-                               + "number of filters in filter_shape", 
filter.getNumRows(), params.K);
-               checkOrThrowException("Incorrect input to conv2d: Number of 
columns of input filter != "
-                               + "channels*filter_height*filter_height in 
filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
-               checkOrThrowException("Incorrect input to conv2d: Number of 
rows of input data != "
-                               + "batch size in input_shape", 
input.getNumRows(), params.N);
-               checkOrThrowException("Incorrect input to conv2d: Number of 
columns of input data != "
-                               + "channels*input_height*input_height in 
input_shape", input.getNumColumns(), params.C, params.H, params.W);
-               if(params.stride_h <= 0 || params.stride_w <= 0) 
-                       throw new DMLRuntimeException("Only positive strides 
supported:" + params.stride_h + ", " + params.stride_w);
+               //materialize indexes unless basic case with stride=1 and pad=0
+               if( !params.isStride1Pad0() || input.sparse )
+                       fillIndexesArray(params);
                
-               if(DMLScript.FINEGRAINED_STATISTICS) {
-                       if(input.isInSparseFormat() || 
filter.isInSparseFormat()) {
-                               conv2dSparseCount.addAndGet(1);
-                       }
-                       else {
-                               conv2dDenseCount.addAndGet(1);
-                       }
-               }
+               long nnz = 
execute(LibMatrixDNNPooling.getMaxPoolingWorkers(params), params);
+               
+               // post-processing: maintain nnz
+               output.setNonZeros(nnz);
+               output.examSparsity();
        }
        
+
        /**
         * This method computes the backpropogation errors for previous layer 
of maxpooling operation
         * 
@@ -325,7 +246,7 @@ public class LibMatrixDNN {
                if( !(params.input1.isInSparseFormat() && 
!params.input2.isInSparseFormat()) )
                        fillIndexesArray(params); //not needed for sparse-dense
                
-               long nnz = 
execute(LibMatrixDNNHelper.getMaxPoolingBackwardWorkers(params, 
performReluBackward), params);
+               long nnz = 
execute(LibMatrixDNNPooling.getMaxPoolingBackwardWorkers(params, 
performReluBackward), params);
                
                //post-processing: maintain nnz 
                outputBlock.setNonZeros(nnz); 
@@ -333,29 +254,6 @@ public class LibMatrixDNN {
        }
        
        /**
-        * This method computes start and end indexes required for max_pool and 
max_pool_backward operations.
-        * This speeds up the performance of max_pool and  max_pool_backward
-        * 
-        * @param params parameters required for max_pool and max_pool_backward 
operations
-        */
-       private static void fillIndexesArray(ConvolutionParameters params) {
-               params.start_indexes_h = new int[params.P];
-               params.end_indexes_h = new int[params.P];
-               params.start_indexes_w = new int[params.Q];
-               params.end_indexes_w = new int[params.Q];
-               for( int p=0, ix=-params.pad_h; p < params.P; p++, 
ix+=params.stride_h ) {
-                       // Note: We do not treat pad as zero
-                       params.start_indexes_h[p] = Math.max(ix, 0);
-                       params.end_indexes_h[p] = Math.min(ix+params.R, 
params.H);
-               }
-               for( int q=0, ix=-params.pad_w; q < params.Q; q++, 
ix+=params.stride_w) {
-                       // Note: We do not treat pad as zero
-                       params.start_indexes_w[q] = Math.max(ix, 0);
-                       params.end_indexes_w[q] = Math.min(ix+params.S, 
params.W);
-               }
-       }
-       
-       /**
         * This method computes the backpropagation errors for previous layer 
of relu operation
         * 
         * @param input input matrix
@@ -375,7 +273,7 @@ public class LibMatrixDNN {
                                input.getNumRows() + " != " + dout.getNumRows() 
+ " || " + input.getNumColumns() + " != " + dout.getNumColumns());
                }
                
-               execute(LibMatrixDNNHelper.getReluBackwardWorkers(params), 
params);
+               execute(LibMatrixDNNRelu.getReluBackwardWorkers(params), 
params);
                
                // post-processing: maintain nnz
                outputBlock.recomputeNonZeros(); 
@@ -503,7 +401,7 @@ public class LibMatrixDNN {
                        }
                        
                        //post-processing: maintain nnz
-                       params.output.recomputeNonZeros(); 
+                       params.output.recomputeNonZeros();
                        params.output.examSparsity();
                }
                else {
@@ -511,26 +409,6 @@ public class LibMatrixDNN {
                }
        }
        
-       public static void maxpooling(MatrixBlock input, MatrixBlock output, 
ConvolutionParameters params) throws DMLRuntimeException {
-               params.input1 = input;
-               params.output = output;
-               
-               if(input.getNumColumns() != params.C*params.H*params.W || 
input.getNumRows() != params.N) {
-                       throw new DMLRuntimeException("Incorrect input 
dimensions in maxpooling:" + input.getNumRows() + " " 
-                               + input.getNumColumns() + " " + params.N + " " 
+ params.C*params.H*params.W);
-               }
-               
-               //materialize indexes unless basic case with stride=1 and pad=0
-               if( !params.isStride1Pad0() || input.sparse )
-                       fillIndexesArray(params);
-               
-               long nnz = 
execute(LibMatrixDNNHelper.getMaxPoolingWorkers(params), params);
-               
-               // post-processing: maintain nnz
-               output.setNonZeros(nnz);
-               output.examSparsity();
-       }
-       
        /**
         * Executes the tasks in parallel using java's ExecutorService.
         *  
@@ -564,18 +442,111 @@ public class LibMatrixDNN {
                return lnnz;
        }
        
-       static boolean 
isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) {
-               // NativeHelper.conv2dBackwardFilterSparseDense only if input 
is sparse. 
-               // dout converted to dense if sparse.
-               return params.enableNative && params.input1.isInSparseFormat();
+       private static void checkOrThrowException(String msg, long lhs, long 
rhs) throws DMLRuntimeException {
+               if(lhs != rhs)
+                       throw new DMLRuntimeException(msg + ":" + lhs + " != " 
+ rhs);
+       }
+       private static void checkOrThrowException(String msg, long lhs, long 
rhs1, long rhs2, long rhs3) throws DMLRuntimeException {
+               if(lhs != (rhs1*rhs2*rhs3))
+                       throw new DMLRuntimeException(msg + ":" + lhs + " != (" 
+ rhs1 + " * " + rhs2 + " * " + rhs3);
+       }
+       
+       static void checkInputsConv2dBackwardData(MatrixBlock filter, 
MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  
throws DMLRuntimeException {
+               params.input1 = filter;
+               params.input2 = dout;
+               params.output = outputBlock;
+               checkOrThrowException("Incorrect input to conv2d_backward_data: 
Number of rows of input filter != "
+                               + "number of filters in filter_shape", 
filter.getNumRows(), params.K);
+               checkOrThrowException("Incorrect input to conv2d_backward_data: 
Number of columns of input filter != "
+                               + "channels*filter_height*filter_height in 
filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
+               checkOrThrowException("Incorrect input to conv2d_backward_data: 
Number of rows of input errors != "
+                               + "batch size in input_shape", 
dout.getNumRows(), params.N);
+               checkOrThrowException("Incorrect input to conv2d_backward_data: 
Number of columns of input errors != "
+                               + "expected input error channels*height*width", 
dout.getNumColumns(), params.K, params.P, params.Q);
+               if(params.stride_h <= 0 || params.stride_w <= 0) 
+                       throw new DMLRuntimeException("Only positive strides 
supported:" + params.stride_h + ", " + params.stride_w);
+               
+               if(DMLScript.FINEGRAINED_STATISTICS) {
+                       if(filter.isInSparseFormat() || 
dout.isInSparseFormat()) {
+                               conv2dBwdDataSparseCount.addAndGet(1);
+                       }
+                       else {
+                               conv2dBwdDataDenseCount.addAndGet(1);
+                       }
+               }
+       }
+       
+       static void checkInputsConv2dBackwardFilter(MatrixBlock input, 
MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  
throws DMLRuntimeException {
+               params.input1 = input;
+               params.input2 = dout;
+               params.output = outputBlock;
+               checkOrThrowException("Incorrect input to 
conv2d_backward_filter: Number of rows of input data != "
+                               + "batch size in input_shape", 
input.getNumRows(), params.N);
+               checkOrThrowException("Incorrect input to 
conv2d_backward_filter: Number of columns of input data != "
+                               + "channels*input_height*input_height in 
input_shape", input.getNumColumns(), params.C, params.H, params.W);
+               checkOrThrowException("Incorrect input to 
conv2d_backward_filter: Number of rows of input errors != "
+                               + "batch size in input_shape", 
dout.getNumRows(), params.N);
+               checkOrThrowException("Incorrect input to 
conv2d_backward_filter: Number of columns of input errors != "
+                               + "expected input error channels*height*width", 
dout.getNumColumns(), params.K, params.P, params.Q);
+               if(params.stride_h <= 0 || params.stride_w <= 0) 
+                       throw new DMLRuntimeException("Only positive strides 
supported:" + params.stride_h + ", " + params.stride_w);
+               
+               if(DMLScript.FINEGRAINED_STATISTICS) {
+                       if(input.isInSparseFormat() || dout.isInSparseFormat()) 
{
+                               conv2dBwdFilterSparseCount.addAndGet(1);
+                       }
+                       else {
+                               conv2dBwdFilterDenseCount.addAndGet(1);
+                       }
+               }
        }
-       static boolean isEligibleForConv2dSparse(ConvolutionParameters params) {
-               // NativeHelper.conv2dSparse only if filter is dense and input 
is sparse
-               return params.enableNative && params.input1.isInSparseFormat() 
&& !params.input2.isInSparseFormat();
+       
+       static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, 
MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
+               params.input1 = input;
+               params.input2 = filter;
+               params.output = outputBlock;
+               
+               checkOrThrowException("Incorrect input to conv2d: Number of 
rows of input filter != "
+                               + "number of filters in filter_shape", 
filter.getNumRows(), params.K);
+               checkOrThrowException("Incorrect input to conv2d: Number of 
columns of input filter != "
+                               + "channels*filter_height*filter_height in 
filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
+               checkOrThrowException("Incorrect input to conv2d: Number of 
rows of input data != "
+                               + "batch size in input_shape", 
input.getNumRows(), params.N);
+               checkOrThrowException("Incorrect input to conv2d: Number of 
columns of input data != "
+                               + "channels*input_height*input_height in 
input_shape", input.getNumColumns(), params.C, params.H, params.W);
+               if(params.stride_h <= 0 || params.stride_w <= 0) 
+                       throw new DMLRuntimeException("Only positive strides 
supported:" + params.stride_h + ", " + params.stride_w);
+               
+               if(DMLScript.FINEGRAINED_STATISTICS) {
+                       if(input.isInSparseFormat() || 
filter.isInSparseFormat()) {
+                               conv2dSparseCount.addAndGet(1);
+                       }
+                       else {
+                               conv2dDenseCount.addAndGet(1);
+                       }
+               }
        }
-       static boolean 
isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) {
-               // NativeHelper.conv2dBackwardDataDense only if filter is 
dense. 
-               // dout converted to dense if sparse.
-               return params.enableNative && !params.input1.isInSparseFormat();
+       
+       /**
+        * This method computes start and end indexes required for max_pool and 
max_pool_backward operations.
+        * This speeds up the performance of max_pool and  max_pool_backward
+        * 
+        * @param params parameters required for max_pool and max_pool_backward 
operations
+        */
+       private static void fillIndexesArray(ConvolutionParameters params) {
+               params.start_indexes_h = new int[params.P];
+               params.end_indexes_h = new int[params.P];
+               params.start_indexes_w = new int[params.Q];
+               params.end_indexes_w = new int[params.Q];
+               for( int p=0, ix=-params.pad_h; p < params.P; p++, 
ix+=params.stride_h ) {
+                       // Note: We do not treat pad as zero
+                       params.start_indexes_h[p] = Math.max(ix, 0);
+                       params.end_indexes_h[p] = Math.min(ix+params.R, 
params.H);
+               }
+               for( int q=0, ix=-params.pad_w; q < params.Q; q++, 
ix+=params.stride_w) {
+                       // Note: We do not treat pad as zero
+                       params.start_indexes_w[q] = Math.max(ix, 0);
+                       params.end_indexes_w[q] = Math.min(ix+params.S, 
params.W);
+               }
        }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java
new file mode 100644
index 0000000..92ae8a3
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java
@@ -0,0 +1,653 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.concurrent.Callable;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2Col.Im2colWorker;
+import 
org.apache.sysml.runtime.matrix.data.LibMatrixDNNRotate180.Rotate180Worker;
+import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
+
+/**
+ * This class contains the set of operators used for performing conv2d
+ */
+public class LibMatrixDNNConv2d 
+{
+       /**
+        * Factory method that returns list of callable tasks for performing 
conv2d
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing conv2d
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static ArrayList<Callable<Long>> 
getConv2dWorkers(ConvolutionParameters params) throws DMLRuntimeException {
+               ArrayList<Callable<Long>> ret = new ArrayList<>();
+               
+               // Try to create twice as many tasks as threads for improved 
load balance
+               // (due to constant-sized intermediates, GC works well, so the 
overhead per task is small)
+               int k = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               int taskSize = (int)(Math.ceil((double)params.N / k / 2));
+               
+               MatrixBlock in1 = params.input1;
+               boolean isEmptyDenseInput = !in1.isInSparseFormat() && 
in1.denseBlock == null;
+               boolean isTransPref = in1.sparse && !params.input2.sparse && 
+                       MatrixBlock.evalSparseFormatInMemory(in1.clen, 
in1.rlen, in1.nonZeros);
+               boolean applyNative = isEligibleForConv2dSparse(params)
+                       && !(!isEmptyDenseInput && isTransPref);
+               if( applyNative )
+                       Statistics.numNativeSparseConv2dCalls.increment();
+               
+               //transpose filter once for efficient sparse-dense multiplies 
in LoopedIm2ColConv2dTransAllChan
+               //in order to share the temporary object and its creation costs 
across threads
+               if( !applyNative && !isEmptyDenseInput && isTransPref ) {
+                       params.input2 = LibMatrixReorg.transpose(params.input2, 
+                               new MatrixBlock(params.input2.clen, 
params.input2.rlen, false), k);
+               }
+               
+               for(int i = 0; i*taskSize < params.N; i++) {
+                       //note: we prefer the java backend for sparse inputs 
because the native 
+                       //implementation simply converts the sparse input into 
dense rows
+                       if( applyNative ) 
+                               ret.add(new SparseNativeConv2d(i*taskSize, 
Math.min((i+1)*taskSize, params.N), params));
+                       else if(!isEmptyDenseInput && isTransPref)
+                               ret.add(new 
LoopedIm2ColConv2dTransAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), 
params));
+                       else if(!isEmptyDenseInput)
+                               ret.add(new 
LoopedIm2ColConv2dAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), 
params));
+                       else
+                               throw new DMLRuntimeException("Unsupported 
operator");
+               }
+               return ret;
+       }
+       
+       /**
+        * Factory method that returns list of callable tasks for performing 
conv2d backward filter
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing conv2d backward filter
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static ArrayList<Callable<Long>> 
getConv2dBackwardFilterWorkers(ConvolutionParameters params) throws 
DMLRuntimeException {
+               ArrayList<Callable<Long>> ret = new ArrayList<>();
+               // Try to create as many tasks as threads. 
+               // Creating more tasks will help in tail, but would have 
additional overhead of maintaining the intermediate
+               // data structures such as im2col blocks.
+               int k = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               int taskSize = (int)(Math.ceil((double)params.N / k));
+               
+               boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() 
&& params.input1.denseBlock == null) || 
+                       (!params.input2.isInSparseFormat() && 
params.input2.denseBlock == null);
+               boolean applyNative = 
isEligibleForConv2dBackwardFilterSparseDense(params)
+                       && !params.input2.isInSparseFormat();
+               if( applyNative )
+                       
Statistics.numNativeSparseConv2dBwdFilterCalls.increment();
+               
+               for(int i = 0; i*taskSize < params.N; i++) {
+                       //note: we prefer the java backend for sparse filters 
because the native 
+                       //implementation simply rotates the sparse filters into 
dense rows
+                       if( applyNative ) 
+                               ret.add(new 
SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, 
params.N), params));
+                       else if( params.input2.sparse && 
params.input1.getSparsity() > params.input2.getSparsity() )
+                               ret.add(new 
Conv2dBackwardFilterTrans(i*taskSize, Math.min((i+1)*taskSize, params.N), 
params));
+                       else if(!isEmptyDenseInput)
+                               ret.add(new Conv2dBackwardFilter(i*taskSize, 
Math.min((i+1)*taskSize, params.N), params));
+                       else
+                               throw new DMLRuntimeException("Unsupported 
operator");
+               }
+               return ret;
+       }
+       
+       /**
+        * Factory method that returns list of callable tasks for performing 
conv2d backward data
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing conv2d backward data
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static ArrayList<Callable<Long>> 
getConv2dBackwardDataWorkers(ConvolutionParameters params) throws 
DMLRuntimeException {
+               ArrayList<Callable<Long>> ret = new ArrayList<>();
+               
+               // Try to create as many tasks as threads. 
+               // Creating more tasks will help in tail, but would have 
additional overhead of maintaining the intermediate
+               // data structures such as im2col blocks.
+               int k = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               int taskSize = (int)(Math.ceil((double)params.N / k));
+               
+               boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() 
&& params.input1.denseBlock == null) || 
+                       (!params.input2.isInSparseFormat() && 
params.input2.denseBlock == null);
+               boolean applyNative = 
isEligibleForConv2dBackwardDataDense(params)
+                       && !params.input2.isInSparseFormat();
+               if( applyNative )
+                       
Statistics.numNativeSparseConv2dBwdDataCalls.increment();
+               
+               for(int i = 0; i*taskSize < params.N; i++) {
+                       //note: we prefer the java backend for sparse filters 
because the native 
+                       //implementation simply converts the sparse filters 
into dense rows
+                       if( applyNative ) 
+                               ret.add(new 
SparseNativeConv2dBackwardDataDense(i*taskSize, Math.min((i+1)*taskSize, 
params.N), params));
+                       else if(!isEmptyDenseInput)
+                               ret.add(new Conv2dBackwardData(i*taskSize, 
Math.min((i+1)*taskSize, params.N), params));
+                       else
+                               throw new DMLRuntimeException("Unsupported 
operator");
+               }
+               
+               return ret;
+       }
+       
+       /**
+        * Performs convolution via: partialCopy1(filter %*% im2col(input)) = 
output
+        */
+       private static class LoopedIm2ColConv2dAllChan implements 
Callable<Long> 
+       {
+               protected final int _rl, _ru; 
+               protected final ConvolutionParameters _params;
+               
+               public LoopedIm2ColConv2dAllChan(int rl, int ru, 
ConvolutionParameters params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+               }
+
+               @Override
+               public Long call() throws Exception {
+                       final int PQ = _params.P*_params.Q, K = _params.K, CRS 
= _params.C*_params.R*_params.S;
+                       MatrixBlock outIm2col = new MatrixBlock(CRS, PQ, false);
+                       MatrixBlock outMM = new MatrixBlock(K, PQ, false);
+                       Im2colWorker im2ColWorker = Im2colWorker.getWorker( 
_params.input1, outIm2col, _params, false);
+                       long time1 = 0; long time2 = 0;
+                       for(int n = _rl; n < _ru; n++)  {
+                               // im2col(input) => _im2ColOutBlock
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               im2ColWorker.execute(n);
+                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               
+                               // filter %*% _im2ColOutBlock => matMultOutBlock
+                               outMM.reset(outMM.rlen, outMM.clen, false);
+                               
LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, outIm2col, outMM, 
false, true, _params);
+                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               
+                               if(DMLScript.FINEGRAINED_STATISTICS) {
+                                       time1 += t2 - t1;
+                                       time2 += t3 - t2;
+                               }
+                               
+                               // Copy the matrix matMultOutBlock of shape [K 
X PQ] to params.output.denseBlock + destPos
+                               partialCopy1(outMM, 
_params.output.getDenseBlockValues(), n*K*PQ, K, PQ);
+                               
+                               // Add bias to current row if necessary, always 
dense
+                               if(_params.bias != null)
+                                       addBias(n, 
_params.output.getDenseBlockValues(),
+                                               
_params.bias.getDenseBlockValues(), K, PQ);
+                       }
+                       
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
+                               
LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1);
+                               
LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2);
+                       }
+                       
+                       //multi-threaded nnz maintenance of current working set
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
+               }
+               
+               // Copy the matrix src of shape [K X PQ] to 
params.output.denseBlock + destPos
+               private static void partialCopy1(MatrixBlock src, double [] 
dest, int destPos, int K, int PQ) {
+                       // Copying is required as LibMatrixMult.matrixMult 
(and/or Java) is not pointer aware.
+                       // This is not required in Native implementation
+                       if( src.isEmptyBlock() )
+                               return;
+                       if(src.isInSparseFormat()) {
+                               SparseBlock sblock = src.sparseBlock;
+                               for(int k = 0; k < src.getNumRows(); k++) {
+                                       if( sblock.isEmpty(k) ) continue;
+                                       int apos = sblock.pos(k);
+                                       int alen = sblock.size(k);
+                                       int[] aix = sblock.indexes(k);
+                                       double[] avals = sblock.values(k);
+                                       int desPosK = destPos + k*PQ;
+                                       for(int j = apos; j < apos+alen; j++)
+                                               dest[desPosK+aix[j]] = avals[j];
+                               }
+                       }
+                       else 
+                               System.arraycopy(src.getDenseBlockValues(), 0, 
dest, destPos, K * PQ);
+               }
+       }
+       
+       /**
+        * This implementation is similar to LoopedIm2ColConv2dAllChan, except 
for using a 
+        * sparse-dense matrix multiplication with t(t(Xi) %*% t(F)) instead of 
a 
+        * dense-sparse matrix multiplication with Xi %*% F.
+        * 
+        * NOTE: this implementation assumes that the filter is passed in 
transposed form
+        * in order to share this temporary matrix (and its creation cost) 
across threads.
+        */
+       private static class LoopedIm2ColConv2dTransAllChan extends 
LoopedIm2ColConv2dAllChan
+       {
+               public LoopedIm2ColConv2dTransAllChan(int rl, int ru, 
ConvolutionParameters params) {
+                       super(rl, ru, params);
+               }
+
+               @Override
+               public Long call() throws Exception {
+                       final int PQ = _params.P*_params.Q, K = _params.K, CRS 
= _params.C*_params.R*_params.S;
+                       MatrixBlock outIm2col = new MatrixBlock(PQ, CRS, false);
+                       MatrixBlock outMM = new MatrixBlock(PQ, K, false);
+                       Im2colWorker im2ColWorker = Im2colWorker.getWorker( 
_params.input1, outIm2col, _params, true);
+                       
+                       for(int n = _rl; n < _ru; n++)  {
+                               // im2col(input) => _im2ColOutBlock
+                               im2ColWorker.execute(n);
+                               
+                               // t(_im2ColOutBlock) %*% t(filter) => 
t(matMultOutBlock)
+                               outMM.reset(outMM.rlen, outMM.clen, false);
+                               
LibMatrixDNNHelper.singleThreadedMatMult(outIm2col, _params.input2, outMM, 
false, false, _params);
+                               
+                               // Copy the matrix matMultOutBlock of shape [K 
X PQ] to params.output.denseBlock + destPos
+                               partialCopyTrans(outMM, _params.output, n*K*PQ, 
K, PQ);
+                               
+                               // Add bias to current row if necessary, always 
dense
+                               if(_params.bias != null)
+                                       addBias(n, 
_params.output.getDenseBlockValues(),
+                                               
_params.bias.getDenseBlockValues(), K, PQ);
+                       }
+                       
+                       //multi-threaded nnz maintenance of current working set
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
+               }
+               
+               private static void partialCopyTrans(MatrixBlock src, 
MatrixBlock dest, int destPos, int K, int PQ) {
+                       if( src.isEmptyBlock() )
+                               return;
+                       //copy src into its destination row w/ piggybacked 
transpose
+                       //src is [PQ x K] -> [K x PQ] -> [1 x KPQ]
+                       if(src.isInSparseFormat()) {
+                               SparseBlock sblock = src.sparseBlock;
+                               double[] c = dest.getDenseBlockValues();
+                               for(int i = 0; i < src.getNumRows(); i++) {
+                                       if( sblock.isEmpty(i) ) continue;
+                                       int apos = sblock.pos(i);
+                                       int alen = sblock.size(i);
+                                       int[] aix = sblock.indexes(i);
+                                       double[] avals = sblock.values(i);
+                                       int desPosK = destPos + i;
+                                       for(int j = apos; j < apos+alen; j++)
+                                               c[desPosK+aix[j]*PQ] = avals[j];
+                               }
+                       }
+                       else {
+                               double[] a = src.getDenseBlockValues();
+                               double[] c = dest.getDenseBlockValues();
+                               final int blocksizeIJ = 128; //128KB for L2
+                               //cache-conscious blocked execution
+                               for( int bi = 0; bi < PQ; bi+=blocksizeIJ )
+                                       for( int bj = 0; bj < K; 
bj+=blocksizeIJ ) {
+                                               int bimin = 
Math.min(bi+blocksizeIJ, PQ);
+                                               int bjmin = 
Math.min(bj+blocksizeIJ, K);
+                                               //core transpose operation
+                                               for(int i=bi, aix=bi*K+bj, 
cix=bj*PQ+bi; i<bimin; i++, aix+=K, cix++)
+                                                       
LibMatrixReorg.transposeRow(a, c, aix, destPos+cix, PQ, bjmin-bj);
+                                       }
+                       }
+               }
+       }
+       
+       /**
+        * This operator is used only if native is enabled, filter is dense and 
input is sparse
+        */
+       private static class SparseNativeConv2d implements Callable<Long> 
+       {
+               public final int _rl, _ru; 
+               private final ConvolutionParameters _params;
+               public SparseNativeConv2d(int rl, int ru, ConvolutionParameters 
params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+               }
+
+               @Override
+               public Long call() throws Exception {
+                       int KPQ = _params.K*_params.P*_params.Q;
+                       double[] temp = new double[KPQ];
+                       for(int n = _rl; n < _ru; n++)  {
+                               if( !_params.input1.getSparseBlock().isEmpty(n) 
) {
+                                       int apos = 
_params.input1.getSparseBlock().pos(n);
+                                       int alen = 
_params.input1.getSparseBlock().size(n);
+                                       int[] aix = 
_params.input1.getSparseBlock().indexes(n);
+                                       double[] avals = 
_params.input1.getSparseBlock().values(n);
+                                       NativeHelper.conv2dSparse(apos, alen, 
aix, avals, _params.input2.getDenseBlockValues(), temp, 
+                                                       1, _params.C, 
_params.H, _params.W, _params.K, _params.R, _params.S, 
+                                                       _params.stride_h, 
_params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+                                       System.arraycopy(temp, 0, 
_params.output.getDenseBlockValues(), n*KPQ, KPQ);
+                               }
+                       }
+                       //multi-threaded nnz maintenance of current working set
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
+               }
+       }
+       
+       // BACKWARD DATA
+       
+       /**
+        * This operator is used only if native is enabled and filter is 
sparse. 
+        * dout is converted into dense if sparse.
+        */
+       private static class SparseNativeConv2dBackwardDataDense implements 
Callable<Long> 
+       {
+               public final int _rl, _ru; 
+               private final ConvolutionParameters _params; 
+               public SparseNativeConv2dBackwardDataDense(int rl, int ru, 
ConvolutionParameters params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+               }
+
+               @Override
+               public Long call() throws Exception {
+                       int CHW = _params.C*_params.H*_params.W;
+                       double [] ret = new double[CHW];
+                       double [] filterArr = 
_params.input1.getDenseBlockValues();
+                       double [] dout_n = new 
double[_params.P*_params.Q*_params.K];
+                       for(int n = _rl; n < _ru; n++) {
+                               getRowInDenseFormat(_params.input2, n, dout_n);
+                               if(n > _rl)
+                                       Arrays.fill(ret, 0);
+                               NativeHelper.conv2dBackwardDataDense(filterArr, 
dout_n, ret, 1, 
+                                               _params.C, _params.H, 
_params.W, _params.K, 
+                                               _params.R, _params.S, 
_params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, 
_params.Q, 1);
+                               System.arraycopy(ret, 0, 
_params.output.getDenseBlockValues(), n*CHW, CHW);
+                       }
+                       //multi-threaded nnz maintenance of current working set
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
+               }
+       }
+       
+       /**
+        * General conv2d backward data operator
+        */
+       private static class Conv2dBackwardData implements Callable<Long> {
+
+               public final int _rl, _ru; 
+               private final ConvolutionParameters _params; 
+               public Conv2dBackwardData(int rl, int ru, ConvolutionParameters 
params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       int PQ = _params.P*_params.Q; int K = _params.K; int 
CRS = _params.C*_params.R*_params.S;
+                       MatrixBlock filter = _params.input1;
+                       MatrixBlock dout = _params.input2;
+                       MatrixBlock outRotate = new MatrixBlock(PQ, K, 
dout.sparse);
+                       MatrixBlock outMM = new MatrixBlock(PQ, CRS, false);
+                       outRotate.allocateBlock();
+                       LibMatrixDNNRotate180.Rotate180Worker rotate180Worker = 
+                               
LibMatrixDNNRotate180.Rotate180Worker.getWorker( dout, outRotate, _params, 
true, false);
+                       long time1 = 0; long time2 = 0;
+                       for(int n = _rl; n < _ru; n++)  {
+                               // rotate180(dout[n,]) => dout_reshaped
+                               rotate180Worker.execute(n, 0);
+                               // dout_reshaped %*% filter => temp
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               outMM.reset(PQ, CRS, false);
+                               
LibMatrixDNNHelper.singleThreadedMatMult(outRotate, filter, outMM, 
!outRotate.sparse, false, _params);
+                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               // col2im(temp) => output[n,] 
+                               LibMatrixDNNIm2Col.doCol2imOverSingleImage(n, 
outMM, _params);
+                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               
+                               if(DMLScript.FINEGRAINED_STATISTICS) {
+                                       time1 += t2 - t1;
+                                       time2 += t3 - t2;
+                               }
+                       }
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
+                               
LibMatrixDNN.loopedConvBwdDataMatMultTime.addAndGet(time1);
+                               
LibMatrixDNN.loopedConvBwdDataCol2ImTime.addAndGet(time2);
+                       }
+                       
+                       //multi-threaded nnz maintenance of current working set
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
+               }
+       }
+       
+       //BACKWARD FILTER
+       
+       /**
+        * This operator is used only if native is enabled and input is sparse. 
+        * dout is converted into dense if sparse.
+        */
+       private static class SparseNativeConv2dBackwardFilterDense implements 
Callable<Long> 
+       {
+               public final int _rl, _ru;
+               private final ConvolutionParameters _params;
+               public SparseNativeConv2dBackwardFilterDense(int rl, int ru, 
ConvolutionParameters params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       int CRS = _params.C*_params.R*_params.S, PQ = 
_params.P*_params.Q, K = _params.K;
+                       MatrixBlock dout_n = new MatrixBlock(PQ, K, false);
+                       dout_n.allocateBlock();
+                       LibMatrixDNNRotate180.Rotate180Worker rotate180Worker = 
+                                       
LibMatrixDNNRotate180.Rotate180Worker.getWorker( _params.input2, dout_n, 
_params, true, false);
+                       double [] ldout_n = dout_n.getDenseBlockValues();
+                       double [] partRet = new double[CRS*_params.K]; //CRS x K
+                       for(int n = _rl; n < _ru; n++) {
+                               if( !_params.input1.getSparseBlock().isEmpty(n) 
) {
+                                       // rotate180(dout[n,]) => dout_n
+                                       rotate180Worker.execute(n, 0);
+                                       
+                                       int apos = 
_params.input1.getSparseBlock().pos(n);
+                                       int alen = 
_params.input1.getSparseBlock().size(n);
+                                       int[] aix = 
_params.input1.getSparseBlock().indexes(n);
+                                       double[] avals = 
_params.input1.getSparseBlock().values(n);
+                                       
NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, 
+                                                       ldout_n, partRet, 1, 
_params.C, _params.H, _params.W, _params.K, 
+                                                       _params.R, _params.S, 
_params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, 
_params.Q, 1);
+                               }
+                       }
+                       inplaceTransAdd(partRet, _params);
+                       return 0L;
+               }
+       }
+       
+       /**
+        * General conv2d backward data operator
+        */
+       private static class Conv2dBackwardFilter implements Callable<Long> {
+               private final int _rl, _ru; 
+               private final ConvolutionParameters _params; 
+               
+               public Conv2dBackwardFilter(int rl, int ru, 
ConvolutionParameters params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       int PQ = _params.P*_params.Q, K = _params.K, CRS = 
_params.C*_params.R*_params.S;
+                       MatrixBlock dout = _params.input2;
+                       MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, 
false);
+                       MatrixBlock outRotate = new MatrixBlock(PQ, K, 
dout.sparse);
+                       MatrixBlock outMM = new MatrixBlock(CRS, K, false);
+                       outRotate.allocateBlock();
+                       
+                       Im2colWorker im2ColWorker = Im2colWorker.getWorker( 
_params.input1, im2ColOutBlock, _params, false);
+                       Rotate180Worker rotate180Worker = 
Rotate180Worker.getWorker( dout, outRotate, _params, true, false);
+                       double [] partRet = new double[CRS*_params.K];
+                       long time1 = 0; long time2 = 0;
+                       for(int n = _rl; n < _ru; n++) {
+                               // rotate180(dout[n,]) => dout_reshaped
+                               rotate180Worker.execute(n, 0);
+                               
+                               // im2col(input) => _im2ColOutBlock
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               im2ColWorker.execute(n);
+                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               
+                               outMM.reset(CRS, K, false);
+                               
LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, outRotate, outMM, 
!im2ColOutBlock.sparse, !outRotate.sparse, _params);
+                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               
+                               if( !outMM.isEmptyBlock() ) //accumulate row 
results
+                                       
LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS);
+                               
+                               if(DMLScript.FINEGRAINED_STATISTICS) {
+                                       time1 += t2 - t1;
+                                       time2 += t3 - t2;
+                               }
+                       }
+                       inplaceTransAdd(partRet, _params);
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
+                               
LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1);
+                               
LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2);
+                       }
+                       return 0L;
+               }
+       }
+       
+       private static class Conv2dBackwardFilterTrans implements 
Callable<Long> {
+               private final int _rl, _ru; 
+               private final ConvolutionParameters _params;
+               
+               public Conv2dBackwardFilterTrans(int rl, int ru, 
ConvolutionParameters params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       int PQ = _params.P*_params.Q, K = _params.K, CRS = 
_params.C*_params.R*_params.S;
+                       MatrixBlock dout = _params.input2;
+                       MatrixBlock im2ColOutBlock = new MatrixBlock(PQ, CRS, 
false).allocateBlock();
+                       MatrixBlock outRotate = new MatrixBlock(K, PQ, 
dout.sparse).allocateBlock();
+                       MatrixBlock outMM = new MatrixBlock(K, CRS, 
false).allocateBlock();
+                       
+                       Im2colWorker im2ColWorker = Im2colWorker.getWorker( 
_params.input1, im2ColOutBlock, _params, true);
+                       Rotate180Worker rotate180Worker = 
Rotate180Worker.getWorker( dout, outRotate, _params, true, true);
+                       double [] partRet = new double[CRS*_params.K];
+                       long time1 = 0; long time2 = 0;
+                       for(int n = _rl; n < _ru; n++) {
+                               // rotate180(dout[n,]) => dout_reshaped
+                               rotate180Worker.execute(n, 0);
+                               
+                               // im2col(input) => _im2ColOutBlock
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               im2ColWorker.execute(n);
+                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               
+                               outMM.reset(K, CRS, false);
+                               //Timing time = new Timing(true);
+                               
LibMatrixDNNHelper.singleThreadedMatMult(outRotate, im2ColOutBlock, 
+                                       outMM, !outRotate.sparse, 
!im2ColOutBlock.sparse, _params);
+                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                               
+                               if( !outMM.isEmptyBlock() ) //accumulate row 
results
+                                       
LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS);
+                               
+                               if(DMLScript.FINEGRAINED_STATISTICS) {
+                                       time1 += t2 - t1;
+                                       time2 += t3 - t2;
+                               }
+                       }
+                       //no need to transpose because t(t(out)) cancel out
+                       inplaceAdd(partRet, _params);
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
+                               
LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1);
+                               
LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2);
+                       }
+                       return 0L;
+               }
+       }
+       
+       private static void inplaceAdd(double[] a, ConvolutionParameters 
params) {
+               synchronized (params.output.denseBlock) {
+                       LibMatrixMult.vectAdd(a, 
params.output.getDenseBlockValues(), 0, 0, a.length);
+               }
+       }
+       
+       private static void inplaceTransAdd(double[] a, ConvolutionParameters 
params) {
+               synchronized (params.output.denseBlock) {
+                       // Perform transposed addition: output of size [K, CRS] 
+= input of size [CRS,K]
+                       double [] c = params.output.getDenseBlockValues();
+                       final int CRS = params.C*params.R*params.S, K = 
params.K;
+                       final int blocksizeIJ = 128; //L2 cache
+                       
+                       //cache-conscious blocked execution
+                       for( int bi=0; bi<CRS; bi+=blocksizeIJ )
+                               for( int bj=0; bj<K; bj+=blocksizeIJ ) {
+                                       int bimin = Math.min(bi+blocksizeIJ, 
CRS);
+                                       int bjmin = Math.min(bj+blocksizeIJ, K);
+                                       //core transpose add operation
+                                       for(int i=bi, aix=bi*K; i<bimin; i++, 
aix+=K)
+                                               for(int j=bj, cix=i+bj*CRS; 
j<bjmin; j++, cix+=CRS)
+                                                       c[cix] += a[aix+j];
+                               }
+               }
+       }
+       
+       private static void getRowInDenseFormat(MatrixBlock input, int n, 
double []  ret) throws DMLRuntimeException {
+               if(input.getNumColumns() != ret.length) {
+                       throw new DMLRuntimeException("Invalid parameters");
+               }
+               // Use temporary array to avoid binary search
+               if(input.isInSparseFormat()) {
+                       Arrays.fill(ret, 0);
+                       if( !input.sparseBlock.isEmpty(n) ) {
+                               int apos = input.sparseBlock.pos(n);
+                               int alen = input.sparseBlock.size(n);
+                               int[] aix = input.sparseBlock.indexes(n);
+                               double[] avals = input.sparseBlock.values(n);
+                               for(int j=apos; j<apos+alen; j++)
+                                       ret[ aix[j] ] = avals[j];
+                       }
+               }
+               else {
+                       System.arraycopy(input.getDenseBlockValues(),
+                               n*input.getNumColumns(), ret, 0, 
input.getNumColumns());
+               }
+       }
+       
+       private static void addBias(int r, double [] out, double [] bias, int 
K, int PQ) {
+               for(int k=0, cix=r*K*PQ; k<K; k++, cix+=PQ)
+                       LibMatrixMult.vectAddInPlace(bias[k], out, cix, PQ);
+       }
+       
+       private static boolean 
isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) {
+               // NativeHelper.conv2dBackwardFilterSparseDense only if input 
is sparse. 
+               // dout converted to dense if sparse.
+               return params.enableNative && params.input1.isInSparseFormat();
+       }
+       
+       private static boolean isEligibleForConv2dSparse(ConvolutionParameters 
params) {
+               // NativeHelper.conv2dSparse only if filter is dense and input 
is sparse
+               return params.enableNative && params.input1.isInSparseFormat() 
&& !params.input2.isInSparseFormat();
+       }
+       
+       private static boolean 
isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) {
+               // NativeHelper.conv2dBackwardDataDense only if filter is 
dense. 
+               // dout converted to dense if sparse.
+               return params.enableNative && !params.input1.isInSparseFormat();
+       }
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
deleted file mode 100644
index 03dfa93..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.matrix.data;
-
-import java.util.Arrays;
-import java.util.concurrent.Callable;
-
-import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.utils.NativeHelper;
-
-/**
- * This class contains the set of operators used for performing conv2d 
backward data
- */
-public class LibMatrixDNNConv2dBackwardDataHelper {
-
-       /**
-        * This operator is used only if native is enabled and filter is 
sparse. 
-        * dout is converted into dense if sparse.
-        */
-       public static class SparseNativeConv2dBackwardDataDense implements 
Callable<Long> 
-       {
-               public int _rl; public int _ru; 
-               private final ConvolutionParameters _params; 
-               public SparseNativeConv2dBackwardDataDense(int rl, int ru, 
ConvolutionParameters params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-               }
-
-               @Override
-               public Long call() throws Exception {
-                       int CHW = _params.C*_params.H*_params.W;
-                       double [] ret = new double[CHW];
-                       double [] filterArr = 
_params.input1.getDenseBlockValues();
-                       double [] dout_n = new 
double[_params.P*_params.Q*_params.K];
-                       for(int n = _rl; n < _ru; n++) {
-                               
LibMatrixDNNHelper.getRowInDenseFormat(_params.input2, n, dout_n);
-                               if(n > _rl)
-                                       Arrays.fill(ret, 0);
-                               NativeHelper.conv2dBackwardDataDense(filterArr, 
dout_n, ret, 1, 
-                                               _params.C, _params.H, 
_params.W, _params.K, 
-                                               _params.R, _params.S, 
_params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, 
_params.Q, 1);
-                               System.arraycopy(ret, 0, 
_params.output.getDenseBlockValues(), n*CHW, CHW);
-                       }
-                       //multi-threaded nnz maintenance of current working set
-                       return _params.output.recomputeNonZeros(_rl, _ru-1);
-               }
-       }
-       
-       /**
-        * General conv2d backward data operator
-        */
-       public static class Conv2dBackwardData implements Callable<Long> {
-
-               public int _rl; public int _ru; 
-               private final ConvolutionParameters _params; 
-               public Conv2dBackwardData(int rl, int ru, ConvolutionParameters 
params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       int PQ = _params.P*_params.Q; int K = _params.K; int 
CRS = _params.C*_params.R*_params.S;
-                       MatrixBlock filter = _params.input1;
-                       MatrixBlock dout = _params.input2;
-                       MatrixBlock outRotate = new MatrixBlock(PQ, K, 
dout.sparse);
-                       MatrixBlock outMM = new MatrixBlock(PQ, CRS, false);
-                       outRotate.allocateBlock();
-                       LibMatrixDNNRotate180Helper.Rotate180Worker 
rotate180Worker = 
-                               
LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, outRotate, 
_params, true, false);
-                       long time1 = 0; long time2 = 0;
-                       for(int n = _rl; n < _ru; n++)  {
-                               // rotate180(dout[n,]) => dout_reshaped
-                               rotate180Worker.execute(n, 0);
-                               // dout_reshaped %*% filter => temp
-                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               outMM.reset(PQ, CRS, false);
-                               
LibMatrixDNNHelper.singleThreadedMatMult(outRotate, filter, outMM, 
!outRotate.sparse, false, _params);
-                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               // col2im(temp) => output[n,] 
-                               LibMatrixDNNHelper.doCol2imOverSingleImage(n, 
outMM, _params);
-                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               
-                               if(DMLScript.FINEGRAINED_STATISTICS) {
-                                       time1 += t2 - t1;
-                                       time2 += t3 - t2;
-                               }
-                       }
-                       if(DMLScript.FINEGRAINED_STATISTICS) {
-                               
LibMatrixDNN.loopedConvBwdDataMatMultTime.addAndGet(time1);
-                               
LibMatrixDNN.loopedConvBwdDataCol2ImTime.addAndGet(time2);
-                       }
-                       
-                       //multi-threaded nnz maintenance of current working set
-                       return _params.output.recomputeNonZeros(_rl, _ru-1);
-               }
-       }
-}

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
deleted file mode 100644
index f30916c..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.matrix.data;
-
-import java.util.concurrent.Callable;
-
-import org.apache.sysml.api.DMLScript;
-import 
org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2ColHelper.Im2colWorker;
-import 
org.apache.sysml.runtime.matrix.data.LibMatrixDNNRotate180Helper.Rotate180Worker;
-import org.apache.sysml.utils.NativeHelper;
-
-public class LibMatrixDNNConv2dBackwardFilterHelper {
-
-       /**
-        * This operator is used only if native is enabled and input is sparse. 
-        * dout is converted into dense if sparse.
-        */
-       public static class SparseNativeConv2dBackwardFilterDense implements 
Callable<Long> 
-       {
-
-               public int _rl; public int _ru; 
-               private final ConvolutionParameters _params; 
-               public SparseNativeConv2dBackwardFilterDense(int rl, int ru, 
ConvolutionParameters params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       int CRS = _params.C*_params.R*_params.S, PQ = 
_params.P*_params.Q, K = _params.K;
-                       MatrixBlock dout_n = new MatrixBlock(PQ, K, false);
-                       dout_n.allocateBlock();
-                       LibMatrixDNNRotate180Helper.Rotate180Worker 
rotate180Worker = 
-                                       
LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( _params.input2, dout_n, 
_params, true, false);
-                       double [] ldout_n = dout_n.getDenseBlockValues();
-                       double [] partRet = new double[CRS*_params.K]; //CRS x K
-                       for(int n = _rl; n < _ru; n++) {
-                               if( !_params.input1.getSparseBlock().isEmpty(n) 
) {
-                                       // rotate180(dout[n,]) => dout_n
-                                       rotate180Worker.execute(n, 0);
-                                       
-                                       int apos = 
_params.input1.getSparseBlock().pos(n);
-                                       int alen = 
_params.input1.getSparseBlock().size(n);
-                                       int[] aix = 
_params.input1.getSparseBlock().indexes(n);
-                                       double[] avals = 
_params.input1.getSparseBlock().values(n);
-                                       
NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, 
-                                                       ldout_n, partRet, 1, 
_params.C, _params.H, _params.W, _params.K, 
-                                                       _params.R, _params.S, 
_params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, 
_params.Q, 1);
-                               }
-                       }
-                       inplaceTransAdd(partRet, _params);
-                       return 0L;
-               }
-       }
-       
-       /**
-        * General conv2d backward data operator
-        */
-       public static class Conv2dBackwardFilter implements Callable<Long> {
-               private final int _rl, _ru; 
-               private final ConvolutionParameters _params; 
-               
-               public Conv2dBackwardFilter(int rl, int ru, 
ConvolutionParameters params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       int PQ = _params.P*_params.Q, K = _params.K, CRS = 
_params.C*_params.R*_params.S;
-                       MatrixBlock dout = _params.input2;
-                       MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, 
false);
-                       MatrixBlock outRotate = new MatrixBlock(PQ, K, 
dout.sparse);
-                       MatrixBlock outMM = new MatrixBlock(CRS, K, false);
-                       outRotate.allocateBlock();
-                       
-                       Im2colWorker im2ColWorker = Im2colWorker.getWorker( 
_params.input1, im2ColOutBlock, _params, true, false);
-                       Rotate180Worker rotate180Worker = 
Rotate180Worker.getWorker( dout, outRotate, _params, true, false);
-                       double [] partRet = new double[CRS*_params.K];
-                       long time1 = 0; long time2 = 0;
-                       for(int n = _rl; n < _ru; n++) {
-                               // rotate180(dout[n,]) => dout_reshaped
-                               rotate180Worker.execute(n, 0);
-                               
-                               // im2col(input) => _im2ColOutBlock
-                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               im2ColWorker.execute(n);
-                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               
-                               outMM.reset(CRS, K, false);
-                               
LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, outRotate, outMM, 
!im2ColOutBlock.sparse, !outRotate.sparse, _params);
-                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               
-                               if( !outMM.isEmptyBlock() ) //accumulate row 
results
-                                       
LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS);
-                               
-                               if(DMLScript.FINEGRAINED_STATISTICS) {
-                                       time1 += t2 - t1;
-                                       time2 += t3 - t2;
-                               }
-                       }
-                       inplaceTransAdd(partRet, _params);
-                       if(DMLScript.FINEGRAINED_STATISTICS) {
-                               
LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1);
-                               
LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2);
-                       }
-                       return 0L;
-               }
-       }
-       
-       public static class Conv2dBackwardFilterTrans implements Callable<Long> 
{
-               private final int _rl, _ru; 
-               private final ConvolutionParameters _params; 
-               
-               public Conv2dBackwardFilterTrans(int rl, int ru, 
ConvolutionParameters params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       int PQ = _params.P*_params.Q, K = _params.K, CRS = 
_params.C*_params.R*_params.S;
-                       MatrixBlock dout = _params.input2;
-                       MatrixBlock im2ColOutBlock = new MatrixBlock(PQ, CRS, 
false).allocateBlock();
-                       MatrixBlock outRotate = new MatrixBlock(K, PQ, 
dout.sparse).allocateBlock();
-                       MatrixBlock outMM = new MatrixBlock(K, CRS, 
false).allocateBlock();
-                       
-                       Im2colWorker im2ColWorker = Im2colWorker.getWorker( 
_params.input1, im2ColOutBlock, _params, true, true);
-                       Rotate180Worker rotate180Worker = 
Rotate180Worker.getWorker( dout, outRotate, _params, true, true);
-                       double [] partRet = new double[CRS*_params.K];
-                       long time1 = 0; long time2 = 0;
-                       for(int n = _rl; n < _ru; n++) {
-                               // rotate180(dout[n,]) => dout_reshaped
-                               rotate180Worker.execute(n, 0);
-                               
-                               // im2col(input) => _im2ColOutBlock
-                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               im2ColWorker.execute(n);
-                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               
-                               outMM.reset(K, CRS, false);
-                               //Timing time = new Timing(true);
-                               
LibMatrixDNNHelper.singleThreadedMatMult(outRotate, im2ColOutBlock, 
-                                       outMM, !outRotate.sparse, 
!im2ColOutBlock.sparse, _params);
-                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               
-                               if( !outMM.isEmptyBlock() ) //accumulate row 
results
-                                       
LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS);
-                               
-                               if(DMLScript.FINEGRAINED_STATISTICS) {
-                                       time1 += t2 - t1;
-                                       time2 += t3 - t2;
-                               }
-                       }
-                       //no need to transpose because t(t(out)) cancel out
-                       inplaceAdd(partRet, _params);
-                       if(DMLScript.FINEGRAINED_STATISTICS) {
-                               
LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1);
-                               
LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2);
-                       }
-                       return 0L;
-               }
-       }
-       
-       private static void inplaceAdd(double[] a, ConvolutionParameters 
params) {
-               synchronized (params.output.denseBlock) {
-                       LibMatrixMult.vectAdd(a, 
params.output.getDenseBlockValues(), 0, 0, a.length);
-               }
-       }
-       
-       private static void inplaceTransAdd(double[] a, ConvolutionParameters 
params) {
-               synchronized (params.output.denseBlock) {
-                       // Perform transposed addition: output of size [K, CRS] 
+= input of size [CRS,K]
-                       double [] c = params.output.getDenseBlockValues();
-                       final int CRS = params.C*params.R*params.S, K = 
params.K;
-                       final int blocksizeIJ = 128; //L2 cache
-                       
-                       //cache-conscious blocked execution
-                       for( int bi=0; bi<CRS; bi+=blocksizeIJ )
-                               for( int bj=0; bj<K; bj+=blocksizeIJ ) {
-                                       int bimin = Math.min(bi+blocksizeIJ, 
CRS);
-                                       int bjmin = Math.min(bj+blocksizeIJ, K);
-                                       //core transpose add operation
-                                       for(int i=bi, aix=bi*K; i<bimin; i++, 
aix+=K)
-                                               for(int j=bj, cix=i+bj*CRS; 
j<bjmin; j++, cix+=CRS)
-                                                       c[cix] += a[aix+j];
-                               }
-               }
-       }
-}

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
deleted file mode 100644
index 3699512..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.matrix.data;
-
-import java.util.ArrayList;
-import java.util.concurrent.Callable;
-
-import org.apache.sysml.api.DMLScript;
-import 
org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2ColHelper.Im2colWorker;
-import org.apache.sysml.utils.NativeHelper;
-
-/**
- * This class contains the set of operators used for performing conv2d
- */
-public class LibMatrixDNNConv2dHelper {
-
-       /**
-        * Performs convolution via: partialCopy1(filter %*% im2col(input)) = 
output.
-        * This operator has less memory pressure than 
LoopedIm2ColConv2dAllChannels.
-        */
-       public static class LoopedIm2ColConv2dOneChan implements Callable<Long> 
-       {
-               protected final int _rl, _ru; 
-               protected final ConvolutionParameters _params; 
-               protected final ArrayList<MatrixBlock> _filters;
-               
-               public LoopedIm2ColConv2dOneChan(int rl, int ru, 
ConvolutionParameters params, ArrayList<MatrixBlock> filters) {
-                       _rl = rl; _ru = ru;
-                       _params = params; 
-                       _filters = filters;
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       int PQ = _params.P*_params.Q; int K = _params.K;
-                       int RS = _params.R*_params.S;
-                       MatrixBlock im2ColOutBlock = new MatrixBlock(RS, PQ, 
false);
-                       Im2colWorker im2ColWorker = Im2colWorker.getWorker( 
_params.input1, im2ColOutBlock, _params, false, false);
-                       long time1 = 0; long time2 = 0;
-                       for(int n = _rl; n < _ru; n++)  {
-                               for(int c = 0; c < _params.C; c++)  {
-                                       // im2col(input) => _im2ColOutBlock
-                                       long t1 = 
DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
-                                       im2ColWorker.execute(n, c);
-                                       long t2 = 
DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
-                                       
-                                       // filter %*% _im2ColOutBlock => 
matMultOutBlock
-                                       MatrixBlock matMultOutBlock = new 
MatrixBlock(K, PQ, false);
-                                       
LibMatrixDNNHelper.singleThreadedMatMult(_filters.get(c), im2ColOutBlock, 
matMultOutBlock, false, true, _params);
-                                       long t3 = 
DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
-                                       
-                                       if(DMLScript.FINEGRAINED_STATISTICS) {
-                                               time1 += t2 - t1;
-                                               time2 += t3 - t2;
-                                       }
-                                       
-                                       // Add the matrix matMultOutBlock of 
shape [K X PQ] to params.output.denseBlock + destPos
-                                       add(matMultOutBlock, 
_params.output.getDenseBlockValues(), n*K*PQ, K, PQ);
-                               }
-                               // Add bias to current row if necessary, always 
dense
-                               if(_params.bias != null)
-                                       LibMatrixDNNHelper.addBias(n, 
_params.output.getDenseBlockValues(),
-                                               
_params.bias.getDenseBlockValues(), K, PQ);
-                       }
-                       if(DMLScript.FINEGRAINED_STATISTICS) {
-                               
LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1);
-                               
LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2);
-                       }
-                       
-                       //multi-threaded nnz maintenance of current working set
-                       return _params.output.recomputeNonZeros(_rl, _ru-1);
-               }
-               
-               // Copy the matrix src of shape [K X PQ] to 
params.output.denseBlock + destPos
-               private static void add(MatrixBlock src, double [] dest, int 
destPos, int K, int PQ) {
-                       // Copying is required as LibMatrixMult.matrixMult 
(and/or Java) is not pointer aware.
-                       // This is not required in Native implementation
-                       if(!src.isEmptyBlock()) {
-                               if(src.isInSparseFormat()) {
-                                       // Copy the sparse matrix 
matMultOutBlock of shape [K X PQ] to 
-                                       // params.output.denseBlock + destPos
-                                       for(int k = 0; k < src.getNumRows(); 
k++) {
-                                               if( !src.sparseBlock.isEmpty(k) 
) {
-                                                       int apos = 
src.sparseBlock.pos(k);
-                                                       int alen = 
src.sparseBlock.size(k);
-                                                       int[] aix = 
src.sparseBlock.indexes(k);
-                                                       double[] avals = 
src.sparseBlock.values(k);
-                                                       int desPosK = destPos + 
k*PQ;
-                                                       for(int j = apos; j < 
apos+alen; j++) {
-                                                               int pqIndex = 
aix[j];
-                                                               dest[desPosK + 
pqIndex ] += avals[j];
-                                                       }
-                                               }
-                                       }
-                               }
-                               else {
-                                       
LibMatrixMult.vectAdd(src.getDenseBlockValues(), dest, 0, destPos, K*PQ);
-                               }
-                       }
-               }
-       }       
-       
-       /**
-        * Performs convolution via: partialCopy1(filter %*% im2col(input)) = 
output
-        */
-       public static class LoopedIm2ColConv2dAllChan implements Callable<Long> 
-       {
-               protected final int _rl, _ru; 
-               protected final ConvolutionParameters _params;
-               
-               public LoopedIm2ColConv2dAllChan(int rl, int ru, 
ConvolutionParameters params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-               }
-
-               @Override
-               public Long call() throws Exception {
-                       final int PQ = _params.P*_params.Q, K = _params.K, CRS 
= _params.C*_params.R*_params.S;
-                       MatrixBlock outIm2col = new MatrixBlock(CRS, PQ, false);
-                       MatrixBlock outMM = new MatrixBlock(K, PQ, false);
-                       Im2colWorker im2ColWorker = Im2colWorker.getWorker( 
_params.input1, outIm2col, _params, true, false);
-                       long time1 = 0; long time2 = 0;
-                       for(int n = _rl; n < _ru; n++)  {
-                               // im2col(input) => _im2ColOutBlock
-                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               im2ColWorker.execute(n);
-                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               
-                               // filter %*% _im2ColOutBlock => matMultOutBlock
-                               outMM.reset(outMM.rlen, outMM.clen, false);
-                               
LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, outIm2col, outMM, 
false, true, _params);
-                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               
-                               if(DMLScript.FINEGRAINED_STATISTICS) {
-                                       time1 += t2 - t1;
-                                       time2 += t3 - t2;
-                               }
-                               
-                               // Copy the matrix matMultOutBlock of shape [K 
X PQ] to params.output.denseBlock + destPos
-                               partialCopy1(outMM, 
_params.output.getDenseBlockValues(), n*K*PQ, K, PQ);
-                               
-                               // Add bias to current row if necessary, always 
dense
-                               if(_params.bias != null)
-                                       LibMatrixDNNHelper.addBias(n, 
_params.output.getDenseBlockValues(),
-                                               
_params.bias.getDenseBlockValues(), K, PQ);
-                       }
-                       
-                       if(DMLScript.FINEGRAINED_STATISTICS) {
-                               
LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1);
-                               
LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2);
-                       }
-                       
-                       //multi-threaded nnz maintenance of current working set
-                       return _params.output.recomputeNonZeros(_rl, _ru-1);
-               }
-               
-               // Copy the matrix src of shape [K X PQ] to 
params.output.denseBlock + destPos
-               private static void partialCopy1(MatrixBlock src, double [] 
dest, int destPos, int K, int PQ) {
-                       // Copying is required as LibMatrixMult.matrixMult 
(and/or Java) is not pointer aware.
-                       // This is not required in Native implementation
-                       if( src.isEmptyBlock() )
-                               return;
-                       if(src.isInSparseFormat()) {
-                               SparseBlock sblock = src.sparseBlock;
-                               for(int k = 0; k < src.getNumRows(); k++) {
-                                       if( sblock.isEmpty(k) ) continue;
-                                       int apos = sblock.pos(k);
-                                       int alen = sblock.size(k);
-                                       int[] aix = sblock.indexes(k);
-                                       double[] avals = sblock.values(k);
-                                       int desPosK = destPos + k*PQ;
-                                       for(int j = apos; j < apos+alen; j++)
-                                               dest[desPosK+aix[j]] = avals[j];
-                               }
-                       }
-                       else 
-                               System.arraycopy(src.getDenseBlockValues(), 0, 
dest, destPos, K * PQ);
-               }
-       }
-       
-       /**
-        * This implementation is similar to LoopedIm2ColConv2dAllChan, except 
for using a 
-        * sparse-dense matrix multiplication with t(t(Xi) %*% t(F)) instead of 
a 
-        * dense-sparse matrix multiplication with Xi %*% F.
-        * 
-        * NOTE: this implementation assumes that the filter is passed in 
transposed form
-        * in order to share this temporary matrix (and its creation cost) 
across threads.
-        */
-       public static class LoopedIm2ColConv2dTransAllChan extends 
LoopedIm2ColConv2dAllChan
-       {
-               public LoopedIm2ColConv2dTransAllChan(int rl, int ru, 
ConvolutionParameters params) {
-                       super(rl, ru, params);
-               }
-
-               @Override
-               public Long call() throws Exception {
-                       final int PQ = _params.P*_params.Q, K = _params.K, CRS 
= _params.C*_params.R*_params.S;
-                       MatrixBlock outIm2col = new MatrixBlock(PQ, CRS, false);
-                       MatrixBlock outMM = new MatrixBlock(PQ, K, false);
-                       Im2colWorker im2ColWorker = Im2colWorker.getWorker( 
_params.input1, outIm2col, _params, true, true);
-                       
-                       for(int n = _rl; n < _ru; n++)  {
-                               // im2col(input) => _im2ColOutBlock
-                               im2ColWorker.execute(n);
-                               
-                               // t(_im2ColOutBlock) %*% t(filter) => 
t(matMultOutBlock)
-                               outMM.reset(outMM.rlen, outMM.clen, false);
-                               
LibMatrixDNNHelper.singleThreadedMatMult(outIm2col, _params.input2, outMM, 
false, false, _params);
-                               
-                               // Copy the matrix matMultOutBlock of shape [K 
X PQ] to params.output.denseBlock + destPos
-                               partialCopyTrans(outMM, _params.output, n*K*PQ, 
K, PQ);
-                               
-                               // Add bias to current row if necessary, always 
dense
-                               if(_params.bias != null)
-                                       LibMatrixDNNHelper.addBias(n, 
_params.output.getDenseBlockValues(),
-                                               
_params.bias.getDenseBlockValues(), K, PQ);
-                       }
-                       
-                       //multi-threaded nnz maintenance of current working set
-                       return _params.output.recomputeNonZeros(_rl, _ru-1);
-               }
-               
-               private static void partialCopyTrans(MatrixBlock src, 
MatrixBlock dest, int destPos, int K, int PQ) {
-                       if( src.isEmptyBlock() )
-                               return;
-                       //copy src into its destination row w/ piggybacked 
transpose
-                       //src is [PQ x K] -> [K x PQ] -> [1 x KPQ]
-                       if(src.isInSparseFormat()) {
-                               SparseBlock sblock = src.sparseBlock;
-                               double[] c = dest.getDenseBlockValues();
-                               for(int i = 0; i < src.getNumRows(); i++) {
-                                       if( sblock.isEmpty(i) ) continue;
-                                       int apos = sblock.pos(i);
-                                       int alen = sblock.size(i);
-                                       int[] aix = sblock.indexes(i);
-                                       double[] avals = sblock.values(i);
-                                       int desPosK = destPos + i;
-                                       for(int j = apos; j < apos+alen; j++)
-                                               c[desPosK+aix[j]*PQ] = avals[j];
-                               }
-                       }
-                       else {
-                               double[] a = src.getDenseBlockValues();
-                               double[] c = dest.getDenseBlockValues();
-                               final int blocksizeIJ = 128; //128KB for L2
-                               //cache-conscious blocked execution
-                               for( int bi = 0; bi < PQ; bi+=blocksizeIJ )
-                                       for( int bj = 0; bj < K; 
bj+=blocksizeIJ ) {
-                                               int bimin = 
Math.min(bi+blocksizeIJ, PQ);
-                                               int bjmin = 
Math.min(bj+blocksizeIJ, K);
-                                               //core transpose operation
-                                               for(int i=bi, aix=bi*K+bj, 
cix=bj*PQ+bi; i<bimin; i++, aix+=K, cix++)
-                                                       
LibMatrixReorg.transposeRow(a, c, aix, destPos+cix, PQ, bjmin-bj);
-                                       }
-                       }
-               }
-       }
-       
-       /**
-        * This operator is used only if native is enabled, filter is dense and 
input is sparse
-        */
-       public static class SparseNativeConv2d implements Callable<Long> 
-       {
-               public int _rl; public int _ru; 
-               private final ConvolutionParameters _params;
-               public SparseNativeConv2d(int rl, int ru, ConvolutionParameters 
params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-               }
-
-               @Override
-               public Long call() throws Exception {
-                       int KPQ = _params.K*_params.P*_params.Q;
-                       double[] temp = new double[KPQ];
-                       for(int n = _rl; n < _ru; n++)  {
-                               if( !_params.input1.getSparseBlock().isEmpty(n) 
) {
-                                       int apos = 
_params.input1.getSparseBlock().pos(n);
-                                       int alen = 
_params.input1.getSparseBlock().size(n);
-                                       int[] aix = 
_params.input1.getSparseBlock().indexes(n);
-                                       double[] avals = 
_params.input1.getSparseBlock().values(n);
-                                       NativeHelper.conv2dSparse(apos, alen, 
aix, avals, _params.input2.getDenseBlockValues(), temp, 
-                                                       1, _params.C, 
_params.H, _params.W, _params.K, _params.R, _params.S, 
-                                                       _params.stride_h, 
_params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
-                                       System.arraycopy(temp, 0, 
_params.output.getDenseBlockValues(), n*KPQ, KPQ);
-                               }
-                       }
-                       //multi-threaded nnz maintenance of current working set
-                       return _params.output.recomputeNonZeros(_rl, _ru-1);
-               }
-       }
-}

Reply via email to