Repository: incubator-systemml
Updated Branches:
  refs/heads/master 1570ee0e1 -> efbfd987c


[SYSTEMML-769] Adding a memoryless loop-based convolution operator that is
useful in low-memory situations

The initial set of experiments on Lenet has shown that there is a
performance degradation due to Cache RLS for the im2col approach for a large
number of images and a small memory budget (as an example: with a 20g JVM,
the overhead is an additional 1000 seconds). This degradation is not observed
when we increase the memory budget (as an example: with an 80g JVM).

Additional features:
1. Moved setting of nnz from instruction to LibMatrixDNN
2. Improved the degree of parallelism of ConvTask

TODOs:
- Improve the performance of LibMatrixDNN.doConv2d_Backward_Filter. Since
  this was not among the top 10 heavy hitters, I have left this for a
  subsequent PR.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/efbfd987
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/efbfd987
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/efbfd987

Branch: refs/heads/master
Commit: efbfd987c9d27aefdfa45525bd09c4c8b40482df
Parents: 1570ee0
Author: Niketan Pansare <[email protected]>
Authored: Wed Jul 6 11:50:47 2016 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Wed Jul 6 11:50:47 2016 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/ConvolutionOp.java    |   5 +-
 .../instructions/CPInstructionParser.java       |   4 +-
 .../cp/ConvolutionCPInstruction.java            |  26 +-
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 321 ++++++++++++++++++-
 .../sysml/runtime/util/ConvolutionUtils.java    |  42 ++-
 .../functions/tensor/Conv2DBackwardTest.java    |  56 +++-
 .../functions/tensor/Conv2DTest.java            |  58 +++-
 7 files changed, 473 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java 
b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 3ec597b..3da2cd2 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -39,6 +39,8 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
        private Hop.ConvOp op;
 
        private int _maxNumThreads = -1; //-1 for unlimited
+       
+       public static boolean FORCE_NON_IM2COL = false;
 
        private ConvolutionOp() {
                //default constructor for clone
@@ -173,8 +175,9 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                }
                
                Lop in = inputs.get(0).constructLops();
+               int numThreads = et == ExecType.CP ? 
OptimizerUtils.getConstrainedNumThreads(primaryOp.getMaxNumThreads()) : 1;
                ConvolutionTransform transform1 = new ConvolutionTransform( in, 
-                               HopsConv2Lops.get(op), primaryOp.getDataType(), 
primaryOp.getValueType(), et, 1);
+                               HopsConv2Lops.get(op), primaryOp.getDataType(), 
primaryOp.getValueType(), et, numThreads);
                
                // setOutputDimensions(transform1);
                transform1.getOutputParameters().setDimensions(

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index 23ca5ef..f3a1621 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -220,7 +220,9 @@ public class CPInstructionParser extends InstructionParser
                String2CPInstructionType.put( "col2im"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "maxpooling"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "maxpooling_backward"      , 
CPINSTRUCTION_TYPE.Convolution);
-
+               String2CPInstructionType.put( "conv2d"      , 
CPINSTRUCTION_TYPE.Convolution);
+               String2CPInstructionType.put( "conv2d_backward_filter"      , 
CPINSTRUCTION_TYPE.Convolution);
+               
                // Quaternary instruction opcodes
                String2CPInstructionType.put( "wsloss"  , 
CPINSTRUCTION_TYPE.Quaternary);
                String2CPInstructionType.put( "wsigmoid", 
CPINSTRUCTION_TYPE.Quaternary);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 1030071..15b043f 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -116,7 +116,9 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                                        padding, input_shape, filter_shape, k);
                } 
                else if (opcode.equalsIgnoreCase("pooling_backward_reshape")
-                               || 
opcode.equalsIgnoreCase("maxpooling_backward")) {
+                               || 
opcode.equalsIgnoreCase("maxpooling_backward")
+                               || opcode.equalsIgnoreCase("conv2d")
+                               || 
opcode.equalsIgnoreCase("conv2d_backward_filter")) {
                        InstructionUtils.checkNumFields(parts, 16);
                        // dout, stride1, stride2, padding1, padding2
                        // input_shape1, input_shape2, input_shape3, 
input_shape4,
@@ -191,7 +193,6 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        outputBlock = getDenseOutputBlock(ec, C * R * S, N * P 
* Q, true);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
                        LibMatrixDNN.im2col(matBlock, outputBlock, params);
-                       outputBlock.setNonZeros(params.outputNNZ.get());
                }
                else if (instOpcode.equalsIgnoreCase("reshape_col")) {
                        checkHeightWidth(ec, params);
@@ -200,7 +201,6 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        outputBlock = getDenseOutputBlock(ec, N, K * P * Q, 
true);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
                        LibMatrixDNN.reshape_col(matBlock, outputBlock, params);
-                       outputBlock.setNonZeros(matBlock.getNonZeros()); // As 
number of non-zeros doesnot change for reshape_col
                }
                else if (instOpcode.equalsIgnoreCase("rotate180")) {
                        checkHeightWidth(ec, params);
@@ -208,7 +208,6 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        outputBlock = getDenseOutputBlock(ec, N * P * Q, K, 
true);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
                        LibMatrixDNN.rotate180(matBlock, outputBlock, params);
-                       outputBlock.setNonZeros(matBlock.getNonZeros()); // As 
number of non-zeros doesnot change for rotate180
                }
                else if (instOpcode.equalsIgnoreCase("col2im")) {
                        checkHeightWidth(ec, params);
@@ -216,7 +215,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        // needs to be zeroed-out
                        outputBlock = getDenseOutputBlock(ec, N, C * H * W, 
false);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
-                       LibMatrixDNN.col2im(matBlock, outputBlock, params); // 
No efficient nnz computation, so setting it to -1
+                       LibMatrixDNN.col2im(matBlock, outputBlock, params);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling")) {
                        // Is eligible for REUSE_NONZEROED_OUTPUT but cannot 
guarantee that previous output has been rmvar-ed
@@ -224,7 +223,6 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        outputBlock = getDenseOutputBlock(ec, N, C*P*Q, true);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
                        LibMatrixDNN.maxpooling(matBlock, outputBlock, params);
-                       outputBlock.setNonZeros(params.outputNNZ.get());
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
                        MatrixBlock dout = ec.getMatrixInput(_in2.getName());
@@ -232,7 +230,21 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        // without somewhat expensive HashMap checks
                        outputBlock = getDenseOutputBlock(ec, N, C*H*W, false);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
-                       LibMatrixDNN.maxpooling_backward(matBlock, dout, 
outputBlock, params); // No efficient nnz computation, so setting it to -1
+                       LibMatrixDNN.maxpooling_backward(matBlock, dout, 
outputBlock, params);
+                       ec.releaseMatrixInput(_in2.getName());
+               }
+               else if (instOpcode.equalsIgnoreCase("conv2d")) {
+                       MatrixBlock filter = ec.getMatrixInput(_in2.getName());
+                       outputBlock = getDenseOutputBlock(ec, N, K*P*Q, false);
+                       params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
+                       LibMatrixDNN.conv2d(matBlock, filter, outputBlock, 
params);
+                       ec.releaseMatrixInput(_in2.getName());
+               }
+               else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) 
{
+                       MatrixBlock filter = ec.getMatrixInput(_in2.getName());
+                       outputBlock = getDenseOutputBlock(ec, K, C*R*S, false);
+                       params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
+                       LibMatrixDNN.conv2d_backward_filter(matBlock, filter, 
outputBlock, params);
                        ec.releaseMatrixInput(_in2.getName());
                }
                else {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 0565136..0f6b646 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -20,16 +20,20 @@ package org.apache.sysml.runtime.matrix.data;
 
 import java.lang.ref.SoftReference;
 import java.util.ArrayList;
+import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 
+
 public class LibMatrixDNN {
 
        public static final boolean ALLOW_MULTI_THREADED_OPS = true;
@@ -56,10 +60,17 @@ public class LibMatrixDNN {
        }
        
        enum TaskType {
-               ReshapeCol, Rotate180, Im2Col, Col2Im, MaxPooling_Forward, 
MaxPooling_Backward
+               ReshapeCol, Rotate180, Im2Col, Col2Im, MaxPooling_Forward, 
MaxPooling_Backward, LoopBasedConv2d
        }
        public static final int TASK_SIZE = 64; // to take care of extremely 
small tasks
        
+       public static class TemporaryConvolutionData {
+               public int [] minIndexArrR;
+               public int [] minIndexArrS;
+               public int [] maxIndexArrR;
+               public int [] maxIndexArrS;
+       }
+       
        public static class ConvolutionParameters {
                public int N; public int C; public int H; public int W;
                public int K; public int R; public int S; public int stride_h; 
public int stride_w; public int pad_h; public int pad_w;
@@ -70,6 +81,8 @@ public class LibMatrixDNN {
                MatrixBlock input1; MatrixBlock input2; MatrixBlock output;
                boolean reuseNonZeroedOutput = false;
                
+               public TemporaryConvolutionData tmpData;
+               
                private int convertToInt(long val) throws DMLRuntimeException {
                        if( val > Integer.MAX_VALUE ) {
                                throw new DMLRuntimeException("The value for 
ConvolutionParameters is too large:" + val);
@@ -138,6 +151,146 @@ public class LibMatrixDNN {
                }
        }
        
+       public static void conv2d_backward_filter(MatrixBlock input, 
MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
+               params.input1 = input;
+               params.input2 = dout;
+               params.output = outputBlock;
+               if(input.getNumRows() != params.N || input.getNumColumns() != 
params.C*params.H*params.W || 
+                               dout.getNumRows() != params.N || 
dout.getNumColumns() != params.K*params.P*params.Q) {
+                       throw new DMLRuntimeException("Incorrect input to 
conv2d_backward_filter");
+               }
+               
+               int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+                       for (int c = 0; c < params.C; c++) {
+                               for (int k = 0; k < params.K; k++) {
+                                       for (int r = 0; r < params.R; r++) {
+                                               for (int s = 0; s < params.S; 
s++) {
+                                                       
doConv2d_Backward_Filter(k, c, r, s, params);
+                                               }
+                                       }
+                               }
+                       }
+               }
+               else {
+                       ArrayList<ConvBackwardFilterTask> tasks = new 
ArrayList<ConvBackwardFilterTask>();              
+                       for (int c = 0; c < params.C; c++) {
+                               for (int k = 0; k < params.K; k++) {
+                                       for (int r = 0; r < params.R; r++) {
+                                               for (int s = 0; s < params.S; 
s++) {
+                                                       tasks.add(new 
ConvBackwardFilterTask(k, c, r, s, params));
+                                               }
+                                       }
+                               }
+                       }
+                       ExecutorService pool = Executors.newFixedThreadPool( 
Math.min(constrainedNumThreads, tasks.size()) );
+                       List<Future<Object>> taskret;
+                       try {
+                               taskret = pool.invokeAll(tasks);
+                               pool.shutdown();
+                               for( Future<Object> task : taskret )
+                                       task.get();
+                       } catch (InterruptedException e) {
+                               throw new DMLRuntimeException("Error while 
executing multi-threaded ConvBackwardFilterTask", e);
+                       } catch (ExecutionException e) {
+                               throw new DMLRuntimeException("Error while 
executing multi-threaded ConvBackwardFilterTask", e);
+                       }
+               }
+       }
+       
+       public static void doConv2d_Backward_Filter(int k, int c, int r, int s, 
ConvolutionParameters params) {
+               double [] inputArray = null;
+               if (!params.input1.isInSparseFormat())
+                       inputArray = params.input1.getDenseBlock();
+               double [] doutArray = null;
+               if (!params.input2.isInSparseFormat())
+                       doutArray = params.input2.getDenseBlock();
+               double [] outputArray = params.output.getDenseBlock();
+               
+               long outputVal = 0;
+               for (int n = 0; n < params.N; n++) {
+                       for (int p = 0; p < params.P; p++) {
+                               for (int q = 0; q < params.Q; q++) {
+                                       double doutVal = 0;
+                                       if(doutArray != null)
+                                               doutVal = 
doutArray[n*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q + q];
+                                       else
+                                               doutVal = 
params.input2.quickGetValue(n, k*params.P*params.Q + p*params.Q + q);
+                                       if(doutVal != 0) {
+                                               // TODO: Improve the 
performance by striding
+                                               for (int h = 0; h < params.H; 
h++) {
+                                                       for (int w = 0; w < 
params.W; w++) { 
+                                                               if(h == 
p*params.stride_h + r - params.pad_h &&
+                                                                               
w == q*params.stride_w + s - params.pad_w) {
+                                                                       
if(inputArray != null)
+                                                                               
outputVal += doutVal*inputArray[n*params.C*params.H*params.W + 
c*params.H*params.W + h*params.W+w];
+                                                                       else 
+                                                                               
outputVal += doutVal*params.input1.quickGetValue(n, c*params.H*params.W + 
h*params.W + w);
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+               outputArray[k*params.C*params.R*params.S + c*params.R*params.S 
+ r*params.S + s] = outputVal;
+       }
+       
+       private static class ConvBackwardFilterTask implements Callable<Object> 
{
+               int k; int c; int r; int s;
+               ConvolutionParameters params;
+               public ConvBackwardFilterTask(int k, int c, int r, int s, 
ConvolutionParameters params) {
+                       this.k = k;
+                       this.c = c;
+                       this.r = r;
+                       this.s = s;
+                       this.params = params;
+               }
+
+               @Override
+               public Object call() throws Exception {
+                       doConv2d_Backward_Filter(k, c, r, s, params);
+                       return null;
+               }
+               
+       }
+       
+       public static void conv2d(MatrixBlock input, MatrixBlock filter, 
MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
+               params.input1 = input;
+               params.input2 = filter;
+               params.output = outputBlock;
+               
+               if(input.getNumRows() != params.N || input.getNumColumns() != 
params.C*params.H*params.W || 
+                               filter.getNumRows() != params.K || 
filter.getNumColumns() != params.C*params.R*params.S) {
+                       throw new DMLRuntimeException("Incorrect input to 
conv2d");
+               }
+               
+               params.tmpData = new TemporaryConvolutionData();                
+               params.tmpData.minIndexArrR = new int[params.R];
+               params.tmpData.maxIndexArrR = new int[params.R];
+               params.tmpData.minIndexArrS = new int[params.S];
+               params.tmpData.maxIndexArrS = new int[params.S];
+               for (int r = 0; r < params.R; r++) {
+                       params.tmpData.minIndexArrR[r] = getMinPQ(params.pad_h, 
r, params.stride_h);
+                       params.tmpData.maxIndexArrR[r] = getMaxPQ(params.pad_h, 
r, params.stride_h, params.P, params.H);
+               }
+               for (int s = 0; s < params.S; s++) {
+                       params.tmpData.minIndexArrS[s] = getMinPQ(params.pad_w, 
s, params.stride_w);
+                       params.tmpData.maxIndexArrS[s] = getMaxPQ(params.pad_w, 
s, params.stride_w, params.Q, params.W);
+               }
+               
+               int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+                       for (int n = 0; n < params.N; n++) {
+                               for (int k = 0; k < params.K; k++) {
+                                       doLoopBasedConv2d(n, k, params);
+                               }
+                       }
+               }
+               else
+                       runParallelConvTask(constrainedNumThreads, params.K, 
TaskType.LoopBasedConv2d, params);
+       }
+       
        public static void maxpooling_backward(MatrixBlock input, MatrixBlock 
dout, MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
                params.input1 = input;
                params.input2 = dout;
@@ -163,6 +316,122 @@ public class LibMatrixDNN {
                }
        }
        
+       /**
+        * This is essentially memory-less operation and can be used when the 
memory pressure is extremely high.
+        * @param n
+        * @param k
+        * @param params
+        */
+       private static void doLoopBasedConv2d(int n, int k, 
ConvolutionParameters params) {
+               double [] inputArray = null;
+               if (!params.input1.isInSparseFormat())
+                       inputArray = params.input1.getDenseBlock();
+               double [] filterArray = null;
+               if (!params.input2.isInSparseFormat())
+                       filterArray = params.input2.getDenseBlock();
+               double [] outputArray = params.output.getDenseBlock();
+               
+               int outputOffset = n*params.K*params.P*params.Q + 
k*params.P*params.Q;
+               
+               int [] minIndexArrR = params.tmpData.minIndexArrR;
+               int [] maxIndexArrR = params.tmpData.maxIndexArrR;
+               int [] minIndexArrS = params.tmpData.minIndexArrS;
+               int [] maxIndexArrS = params.tmpData.maxIndexArrS;
+               
+               if(inputArray != null && filterArray != null) {
+                       for (int c = 0; c < params.C; c++) {
+                               for (int r = 0; r < params.R; r++) {
+                                       int filterOffset = 
k*params.C*params.R*params.S + c*params.R*params.S + r*params.S;
+                                       for (int p = minIndexArrR[r]; p < 
maxIndexArrR[r]; p++) {
+                                               for (int s = 0; s < params.S; 
s++) {
+                                                       double filterVal = 
filterArray[filterOffset + s];
+                                                       if(filterVal != 0) {
+                                                               int h = 
p*params.stride_h + r - params.pad_h;
+                                                               for (int q = 
minIndexArrS[s]; q < maxIndexArrS[s]; q++) {
+                                                                       int w = 
q*params.stride_w + s - params.pad_w;
+                                                                       
outputArray[outputOffset + p*params.Q + q] += denseConvMultiply(inputArray, 
filterVal, params, n, c, h, w);
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+               else if(inputArray != null && filterArray == null) {
+                       for (int c = 0; c < params.C; c++) {
+                               for (int r = 0; r < params.R; r++) {
+                                       for (int p = minIndexArrR[r]; p < 
maxIndexArrR[r]; p++) {
+                                               for (int s = 0; s < params.S; 
s++) {
+                                                       double filterVal = 
params.input2.quickGetValue(k, c*params.R*params.S + r*params.S + s);
+                                                       if(filterVal != 0) {
+                                                               int h = 
p*params.stride_h + r - params.pad_h;
+                                                               for (int q = 
minIndexArrS[s]; q < maxIndexArrS[s]; q++) {
+                                                                       int w = 
q*params.stride_w + s - params.pad_w;
+                                                                       
outputArray[outputOffset + p*params.Q + q] += denseConvMultiply(inputArray, 
filterVal, params, n, c, h, w);
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+               else if(inputArray == null && filterArray != null) {
+                       for (int c = 0; c < params.C; c++) {
+                               for (int r = 0; r < params.R; r++) {
+                                       int filterOffset = 
k*params.C*params.R*params.S + c*params.R*params.S + r*params.S;
+                                       for (int p = minIndexArrR[r]; p < 
maxIndexArrR[r]; p++) {
+                                               for (int s = 0; s < params.S; 
s++) {
+                                                       double filterVal = 
filterArray[filterOffset + s];
+                                                       if(filterVal != 0) {
+                                                               int h = 
p*params.stride_h + r - params.pad_h;
+                                                               for (int q = 
minIndexArrS[s]; q < maxIndexArrS[s]; q++) {
+                                                                       int w = 
q*params.stride_w + s - params.pad_w;
+                                                                       
outputArray[outputOffset + p*params.Q + q] += sparseConvMultiply(inputArray, 
filterVal, params, n, c, h, w);
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+               else if(inputArray == null && filterArray == null) {
+                       for (int c = 0; c < params.C; c++) {
+                               for (int r = 0; r < params.R; r++) {
+                                       for (int p = minIndexArrR[r]; p < 
maxIndexArrR[r]; p++) {
+                                               for (int s = 0; s < params.S; 
s++) {
+                                                       double filterVal = 
params.input2.quickGetValue(k, c*params.R*params.S + r*params.S + s);
+                                                       if(filterVal != 0) {
+                                                               int h = 
p*params.stride_h + r - params.pad_h;
+                                                               for (int q = 
minIndexArrS[s]; q < maxIndexArrS[s]; q++) {
+                                                                       int w = 
q*params.stride_w + s - params.pad_w;
+                                                                       
outputArray[outputOffset + p*params.Q + q] += sparseConvMultiply(inputArray, 
filterVal, params, n, c, h, w);
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       private static int getMinPQ(int pad, int filterSize, int stride) {
+               return Math.max(0, (int)Math.ceil(((double)(pad - 
filterSize))/stride));
+       }
+       
+       private static int getMaxPQ(int pad, int filterSize, int stride, int 
outputSize, int inputSize) {
+               return Math.min(outputSize, (int)Math.ceil(((double)(inputSize 
+ pad - filterSize)) / stride));
+       }
+       
+       private static double denseConvMultiply(double [] inputArray, double 
filterVal, ConvolutionParameters params,
+                       int n, int c, int h, int w) {
+               return inputArray[n*params.C*params.H*params.W + 
c*params.H*params.W + h*params.W+w]*filterVal;
+       }
+       
+       private static double sparseConvMultiply(double [] inputArray, double 
filterVal, ConvolutionParameters params,
+                       int n, int c, int h, int w) {
+               return params.input1.quickGetValue(n, c*params.H*params.W + 
h*params.W + w)*filterVal;
+       }
+       
        private static void doPoolingBackward(int n, int c, 
ConvolutionParameters params) {
                double [] inputArray = null;
                if (!params.input1.isInSparseFormat())
@@ -234,7 +503,8 @@ public class LibMatrixDNN {
                }
                else {
                        runParallelConvTask(constrainedNumThreads, params.C, 
TaskType.MaxPooling_Forward, params);
-               }       
+               }
+               outputBlock.setNonZeros(params.outputNNZ.get());
        }
 
        private static void doPooling(int n, int c, ConvolutionParameters 
params) {
@@ -291,6 +561,7 @@ public class LibMatrixDNN {
                else {
                        runParallelConvTask(constrainedNumThreads, 1, 
TaskType.Rotate180, params);
                }
+               outputBlock.setNonZeros(input.getNonZeros()); // As number of 
non-zeros doesnot change for rotate180
        }
        
        private static void doRotate180(int n, ConvolutionParameters params) {
@@ -332,27 +603,44 @@ public class LibMatrixDNN {
                else {
                        runParallelConvTask(constrainedNumThreads, 1, 
TaskType.ReshapeCol, params);
                }
-               
+               outputBlock.setNonZeros(input.getNonZeros()); // As number of 
non-zeros does not change for reshape_col
        }
        
        private static void runParallelConvTask(int constrainedNumThreads, int 
Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
-               ArrayList<ConvTask> tasks = new ArrayList<ConvTask>();          
-               
                // Total number of compute units available: 
constrainedNumThreads
                // Static task allocation. TODO: Do this in dynamic way
+               int taskSize = TASK_SIZE;
+               while(true) {
+                       if(params.N * Math.ceil(Z/taskSize) > 
constrainedNumThreads || taskSize == 1) {
+                               doRunParallelConvTask(constrainedNumThreads, Z, 
type, params, taskSize);
+                               return;
+                       }
+                       taskSize = Math.max(taskSize/2, 1);
+               }
+       }
+       
+       private static void doRunParallelConvTask(int constrainedNumThreads, 
int Z, TaskType type, ConvolutionParameters params, int taskSize) throws 
DMLRuntimeException {
+               ArrayList<ConvTask> tasks = new ArrayList<ConvTask>();          
+               
                for (int n = 0; n < params.N; n++) {
-                       for (int z = 0; z < Z; z += TASK_SIZE) {
-                               tasks.add(new ConvTask(n, n+1, z, Math.min(Z, 
z+TASK_SIZE), type, params));
+                       for (int z = 0; z < Z; z += taskSize) {
+                               tasks.add(new ConvTask(n, n+1, z, Math.min(Z, 
z+taskSize), type, params));
                        }
                }
 
                ExecutorService pool = Executors.newFixedThreadPool( 
Math.min(constrainedNumThreads, tasks.size()) );
+               List<Future<Object>> taskret;
                try {
-                       pool.invokeAll(tasks);
+                       taskret = pool.invokeAll(tasks);
+                       pool.shutdown();
+                       for( Future<Object> task : taskret )
+                               task.get();
                } catch (InterruptedException e) {
                        throw new DMLRuntimeException("Error while executing 
multi-threaded " + type.name(), e);
-               }       
-               pool.shutdown();
+               } catch (ExecutionException e) {
+                       throw new DMLRuntimeException("Error while executing 
multi-threaded " + type.name(), e);
+               }
+               
        }
        
        private static class ConvTask implements Callable<Object> {
@@ -369,7 +657,7 @@ public class LibMatrixDNN {
                }
                
                @Override
-               public Object call() throws Exception {
+               public Object call() throws DMLRuntimeException {
                        switch(type) {
                                case ReshapeCol:
                                        for (int n = n1; n < n2; n++) {
@@ -409,8 +697,15 @@ public class LibMatrixDNN {
                                                }
                                        }
                                        break;
+                               case LoopBasedConv2d:
+                                       for (int n = n1; n < n2; n++) {
+                                               for (int z = z1; z < z2; z++) {
+                                                       
LibMatrixDNN.doLoopBasedConv2d(n, z, params);
+                                               }
+                                       }
+                                       break;
                                default:
-                                       throw new RuntimeException("Unsupported 
ConvTask:" + type.name());
+                                       throw new 
DMLRuntimeException("Unsupported ConvTask:" + type.name());
                        }
                        return null;
                }
@@ -457,6 +752,7 @@ public class LibMatrixDNN {
                else {
                        runParallelConvTask(constrainedNumThreads, params.C, 
TaskType.Im2Col, params);
                }
+               outputBlock.setNonZeros(params.outputNNZ.get());
        }
        
        // Converts a matrix of dimension (CRS, NPQ) to a 4D tensor (N, C, H, W)
@@ -524,6 +820,7 @@ public class LibMatrixDNN {
                
        }
        
+       
        private static void doIm2colOverInputPath_NCHW(int n, int c, 
ConvolutionParameters params) {
                double [] inputArray = null;
                if (!params.input1.isInSparseFormat())

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java 
b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index 9def65a..0e469c2 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -73,11 +73,22 @@ public class ConvolutionUtils {
                return false;
        }
        
+       // Simple heuristic that prefers im2col for non-test/non-validation 
cases.
+       private static boolean preferIm2Col(ExecType et, long N, long K, long 
C, long R, long S, long P, long Q) throws HopsException {
+               if(et == ExecType.CP && ConvolutionOp.FORCE_NON_IM2COL) {
+                       return false;
+               }
+               else if(et == ExecType.CP && N < 256 ) {
+                       return true; // Prefer im2col for non-test/non-validation cases
+               }
+               return false;
+       }
+       
        public static Lop constructConvolutionBackwardFilterLops(Hop 
currentHop) throws HopsException, LopsException {
-               ExecType et = ExecType.CP;
+               ExecType et = ExecType.CP; // TODO: Check memory estimates
                if(DMLScript.USE_ACCELERATOR)
                        et = ExecType.GPU; // TODO: Add memory estimate checks
-               else
+               else if(et == ExecType.MR || et == ExecType.SPARK)
                        return null;
                
                if(currentHop != null && isTranspose(currentHop)) {
@@ -96,10 +107,27 @@ public class ConvolutionUtils {
                                        }
                                        
                                        // K, C * R * S
-                                       long K = 
currentHop.computeSizeInformation(inputs.get(10));
+                                       long N = 
currentHop.computeSizeInformation(inputs.get(6));
                                        long C = 
currentHop.computeSizeInformation(inputs.get(7));
+                                       long H = 
currentHop.computeSizeInformation(inputs.get(8));
+                                       long W = 
currentHop.computeSizeInformation(inputs.get(9));
+                                       long K = 
currentHop.computeSizeInformation(inputs.get(10));
                                        long R = 
currentHop.computeSizeInformation(inputs.get(12));
                                        long S = 
currentHop.computeSizeInformation(inputs.get(13));
+                                       long stride_h = 
currentHop.computeSizeInformation(inputs.get(2));
+                                       long stride_w = 
currentHop.computeSizeInformation(inputs.get(3));
+                                       long pad_h = 
currentHop.computeSizeInformation(inputs.get(4));
+                                       long pad_w = 
currentHop.computeSizeInformation(inputs.get(5));
+                                       long P = -1; long Q = -1;
+                                       if(H > 0 && R > 0 && stride_h > 0 && 
pad_h > 0)
+                                               P = ConvolutionUtils.getP(H, R, 
stride_h, pad_h);
+                                       if(W > 0 && S > 0 && stride_w > 0 && 
pad_w > 0)
+                                               Q = ConvolutionUtils.getQ(W, S, 
stride_w, pad_w);
+                                       
+                                       if(preferIm2Col(et, N, K, C, R, S, P, 
Q)) {
+                                               return null;
+                                       }
+                                       
                                        long rlen = K;
                                        long clen = 
ConvolutionOp.getExtractedVal(C, R, S);
                                        return 
ConvolutionOp.constructFusedConvolutionLops(et, inputs, 
ConvOp.DIRECT_CONV2D_BACKWARD_FILTER, (ConvolutionOp) x_col, rlen, clen);
@@ -112,7 +140,7 @@ public class ConvolutionUtils {
        public static Lop constructConvolutionLops(Hop currentHop, ExecType et) 
throws HopsException, LopsException {
                if(DMLScript.USE_ACCELERATOR)
                        et = ExecType.GPU; // TODO: Add memory estimate checks
-               else
+               else if(et == ExecType.MR || et == ExecType.SPARK)
                        return null;
                
                if(currentHop != null && isConvolutionOp(currentHop, 
ConvOp.RESHAPE_COL)) {
@@ -131,6 +159,7 @@ public class ConvolutionUtils {
                                        
                                        // N, K * P * Q
                                        long N = 
currentHop.computeSizeInformation(inputs.get(6));
+                                       long C = 
currentHop.computeSizeInformation(inputs.get(7));
                                        long H = 
currentHop.computeSizeInformation(inputs.get(8));
                                        long W = 
currentHop.computeSizeInformation(inputs.get(9));
                                        long K = 
currentHop.computeSizeInformation(inputs.get(10));
@@ -145,6 +174,11 @@ public class ConvolutionUtils {
                                                P = ConvolutionUtils.getP(H, R, 
stride_h, pad_h);
                                        if(W > 0 && S > 0 && stride_w > 0 && 
pad_w > 0)
                                                Q = ConvolutionUtils.getQ(W, S, 
stride_w, pad_w);
+                                       
+                                       if(preferIm2Col(et, N, K, C, R, S, P, 
Q)) {
+                                               return null;
+                                       }
+                                       
                                        long rlen = N;
                                        long clen = 
ConvolutionOp.getExtractedVal(K, P, Q);
                                        return 
ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D, 
(ConvolutionOp) x_col, rlen, clen);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
index fd674d9..c213b55 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
@@ -22,6 +22,7 @@ import java.util.HashMap;
 
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.ConvolutionOp;
 import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
@@ -49,35 +50,70 @@ public class Conv2DBackwardTest extends AutomatedTestBase
        public void testConv2DBackwardFilterDense1() 
        {
                int numImg = 3; int imgSize = 3; int numChannels = 3; int 
numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
-               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad);
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DBackwardFilterDense2() 
        {
                int numImg = 3; int imgSize = 3; int numChannels = 3; int 
numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
-               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad);
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DBackwardFilterDense3() 
        {
                int numImg = 3; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
-               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad);
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DBackwardFilterDense4() 
        {
-               int numImg = 3; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 3; int stride = 1; int pad = 1;
-               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad);
+               int numImg = 3; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DBackwardFilterDense5() 
        {
-               int numImg = 3; int imgSize = 10; int numChannels = 2; int 
numFilters = 3; int filterSize = 3; int stride = 3; int pad = 1;
-               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad);
+               int numImg = 3; int imgSize = 10; int numChannels = 2; int 
numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, false);
+       }
+       
+       @Test
+       public void testConv2DBackwardFilterDense6() 
+       {
+               int numImg = 3; int imgSize = 3; int numChannels = 3; int 
numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DBackwardFilterDense7() 
+       {
+               int numImg = 3; int imgSize = 3; int numChannels = 3; int 
numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DBackwardFilterDense8() 
+       {
+               int numImg = 3; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DBackwardFilterDense9() 
+       {
+               int numImg = 3; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DBackwardFilterDense10() 
+       {
+               int numImg = 3; int imgSize = 10; int numChannels = 2; int 
numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, 
numChannels, numFilters, filterSize, stride, pad, true);
        }
        
        /**
@@ -86,12 +122,13 @@ public class Conv2DBackwardTest extends AutomatedTestBase
         * @param sparse
         */
        public void runConv2DBackwardFilterTest( ExecType et, int imgSize, int 
numImg, int numChannels, int numFilters, 
-                       int filterSize, int stride, int pad) 
+                       int filterSize, int stride, int pad, boolean 
forceNonIm2Col) 
        {
                RUNTIME_PLATFORM oldRTP = rtplatform;
                        
                boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
-               
+               boolean oldForceNonIm2col = ConvolutionOp.FORCE_NON_IM2COL;
+               ConvolutionOp.FORCE_NON_IM2COL = forceNonIm2Col;
                try
                {
                    TestConfiguration config = getTestConfiguration(TEST_NAME);
@@ -139,6 +176,7 @@ public class Conv2DBackwardTest extends AutomatedTestBase
                {
                        rtplatform = oldRTP;
                        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+                       ConvolutionOp.FORCE_NON_IM2COL = oldForceNonIm2col;
                }
        }
        

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
index 3737801..8b87372 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
@@ -22,6 +22,7 @@ import java.util.HashMap;
 
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.ConvolutionOp;
 import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
 import org.apache.sysml.test.integration.AutomatedTestBase;
@@ -47,42 +48,88 @@ public class Conv2DTest extends AutomatedTestBase
        public void testConv2DDense1() 
        {
                int numImg = 5; int imgSize = 3; int numChannels = 3; int 
numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
-               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad);
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DDense2() 
        {
                int numImg = 1; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
-               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad);
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DDense3() 
        {
                int numImg = 1; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
-               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad);
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DDense4() 
        {
                int numImg = 3; int imgSize = 10; int numChannels = 1; int 
numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
-               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad);
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
        }
        
+       @Test
+       public void testConv2DDense5() 
+       {
+               int numImg = 3; int imgSize = 8; int numChannels = 2; int 
numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
+       }
+       
+       @Test
+       public void testConv2DDense6() 
+       {
+               int numImg = 5; int imgSize = 3; int numChannels = 3; int 
numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DDense7() 
+       {
+               int numImg = 1; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DDense8() 
+       {
+               int numImg = 1; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DDense9() 
+       {
+               int numImg = 3; int imgSize = 10; int numChannels = 1; int 
numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DDense10() 
+       {
+               int numImg = 3; int imgSize = 8; int numChannels = 2; int 
numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
+       }
+       
+       
        /**
         * 
         * @param et
         * @param sparse
         */
        public void runConv2DTest( ExecType et, int imgSize, int numImg, int 
numChannels, int numFilters, 
-                       int filterSize, int stride, int pad) 
+                       int filterSize, int stride, int pad, boolean 
FORCE_NON_IM2COL) 
        {
                RUNTIME_PLATFORM oldRTP = rtplatform;
                        
                boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
                
+               boolean oldForceNonIm2col = ConvolutionOp.FORCE_NON_IM2COL; 
+               ConvolutionOp.FORCE_NON_IM2COL = FORCE_NON_IM2COL;
+               
                try
                {
                    TestConfiguration config = getTestConfiguration(TEST_NAME);
@@ -128,6 +175,7 @@ public class Conv2DTest extends AutomatedTestBase
                {
                        rtplatform = oldRTP;
                        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+                       ConvolutionOp.FORCE_NON_IM2COL = oldForceNonIm2col;
                }
        }
 }


Reply via email to