pooling builtin function

niketanpansare Mon, 16 May 2016 17:47:14 -0700

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 946b6634b -> c334c2c85



http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
new file mode 100644
index 0000000..b68a51c
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -0,0 +1,564 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.lang.ref.SoftReference;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+
+public class LibMatrixDNN {
+
+       // When false, every operation in this class takes its sequential code path.
+       public static boolean ALLOW_MULTI_THREADED_OPS = true;
+       // Using hashmap to avoid any performance impacts of multimap.
+       // Keyed by array length; holds at most one softly referenced array per length.
+       private static final ConcurrentHashMap<Integer, 
SoftReference<double[]>> non_zeroed_double_arr = new ConcurrentHashMap<Integer, 
SoftReference<double[]>>();
+       // Arrays shorter than this are neither cached nor looked up.
+       private static final int NON_ZEROED_DOUBLE_ARR_THRESHOLD = 100;
+       /**
+        * Offers an array for later reuse as a (non-zeroed) dense block.
+        * Arrays that are null or shorter than NON_ZEROED_DOUBLE_ARR_THRESHOLD are ignored.
+        * An array already cached under the same length is replaced by this one.
+        *
+        * @param arr array to cache; may be null
+        */
+       public static void cacheReuseableData(double[] arr) {
+               if(arr == null || arr.length < NON_ZEROED_DOUBLE_ARR_THRESHOLD) {
+                       return;
+               }
+               // The most recently released array replaces any older one of the same length
+               // (the original author's rationale: it is less likely to have been garbage collected).
+               // The int key is autoboxed, so no explicit Integer is needed here.
+               final SoftReference<double[]> ref = new SoftReference<double[]>(arr);
+               non_zeroed_double_arr.put(arr.length, ref);
+       }
+       /**
+        * Removes and returns a cached array of exactly the requested length.
+        * The contents of the returned array are NOT zeroed.
+        *
+        * @param length required array length
+        * @return a cached array of that length, or null if the length is below
+        *         NON_ZEROED_DOUBLE_ARR_THRESHOLD, does not fit in an int, nothing is cached
+        *         for it, or the soft reference has already been cleared
+        */
+       public static double[] getReuseableData(long length) {
+               // A length above Integer.MAX_VALUE can never match a cached array; narrowing it
+               // to int would wrap around and could hand back an array of an unrelated size.
+               if(length < NON_ZEROED_DOUBLE_ARR_THRESHOLD || length > Integer.MAX_VALUE) {
+                       return null;
+               }
+               // Map.remove(Object) takes the boxed key; Integer.valueOf avoids the deprecated constructor.
+               SoftReference<double[]> ref = non_zeroed_double_arr.remove(Integer.valueOf((int) length));
+               return (ref != null) ? ref.get() : null;
+       }
+       
+       // Kinds of work a ConvTask can execute; each value selects one of the do* helpers of this class.
+       enum TaskType {
+               ReshapeCol, Rotate180, Im2Col, Col2Im, MaxPooling_Forward, 
MaxPooling_Backward
+       }
+       // Maximum number of z-indices handed to a single ConvTask by runParallelConvTask.
+       public static final int TASK_SIZE = 64; // to take care of extremely small tasks
+       
+       /**
+        * Shape parameters of a convolution/pooling operation plus the blocks it operates on.
+        * The helpers of LibMatrixDNN index the input as an (N, C, H, W) tensor, use an
+        * R x S patch with strides (stride_h, stride_w) and padding (pad_h, pad_w), and
+        * produce P x Q output positions per channel. K is used as the channel count of
+        * (N, K, P, Q) tensors (presumably the number of filters - confirm with callers).
+        * The input1/input2/output blocks are set by the static entry points of LibMatrixDNN.
+        */
+       public static class ConvolutionParameters {
+               public int N; public int C; public int H; public int W;
+               public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
+               public int P; public int Q; public int numThreads;
+
+               MatrixBlock input1; MatrixBlock input2; MatrixBlock output;
+               boolean reuseNonZeroedOutput = false;
+
+               /**
+                * Narrows a long to int, refusing values outside the int range instead of
+                * silently wrapping them.
+                *
+                * @throws DMLRuntimeException if val does not fit in an int
+                */
+               private int convertToInt(long val) throws DMLRuntimeException {
+                       if( val > Integer.MAX_VALUE ) {
+                               throw new DMLRuntimeException("The value for ConvolutionParameters is too large:" + val);
+                       }
+                       if( val < Integer.MIN_VALUE ) {
+                               throw new DMLRuntimeException("The value for ConvolutionParameters is too small:" + val);
+                       }
+                       return (int) val;
+               }
+
+               /**
+                * Computes one output dimension as (in + 2*pad - filter) / stride + 1.
+                * Returns -1 when any operand is negative or the stride is not positive;
+                * a zero stride would otherwise cause a division by zero.
+                *
+                * @throws DMLRuntimeException if the result does not fit in an int
+                */
+               private int outputDim(long in, long filter, long stride, long pad) throws DMLRuntimeException {
+                       if(in >= 0 && pad >= 0 && filter >= 0 && stride > 0) {
+                               return convertToInt((in + 2 * pad - filter) / stride + 1);
+                       }
+                       return -1;
+               }
+
+               /**
+                * Compares the input shape, filter shape, strides, padding and numThreads.
+                * The derived P and Q and the attached blocks are not compared.
+                *
+                * @param that parameters to compare against (must not be null)
+                * @return true if all compared fields are equal
+                */
+               public boolean compare(ConvolutionParameters that) {
+                       return this.N == that.N && this.C == that.C && this.H == that.H && this.W == that.W
+                                       && this.K == that.K && this.R == that.R && this.S == that.S
+                                       && this.stride_h == that.stride_h && this.stride_w == that.stride_w
+                                       && this.pad_h == that.pad_h && this.pad_w == that.pad_w
+                                       && this.numThreads == that.numThreads;
+               }
+
+               /**
+                * Creates parameters from long values. P and Q are derived here and set to -1
+                * when they cannot be computed (negative operand or non-positive stride).
+                *
+                * @throws DMLRuntimeException if any value or derived dimension does not fit in an int
+                */
+               public ConvolutionParameters(long N, long C, long H, long W,
+                               long K, long R, long S, long stride_h, long stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
+                       this.N = convertToInt(N);
+                       this.C = convertToInt(C);
+                       this.H = convertToInt(H);
+                       this.W = convertToInt(W);
+                       this.K = convertToInt(K);
+                       this.R = convertToInt(R);
+                       this.S = convertToInt(S);
+                       this.stride_h = convertToInt(stride_h);
+                       this.stride_w = convertToInt(stride_w);
+                       this.pad_h = convertToInt(pad_h);
+                       this.pad_w = convertToInt(pad_w);
+                       this.P = outputDim(H, R, stride_h, pad_h);
+                       this.Q = outputDim(W, S, stride_w, pad_w);
+                       this.numThreads = numThreads;
+               }
+
+               /**
+                * Creates parameters from int values; P and Q are obtained from
+                * ConvolutionUtils.getP / ConvolutionUtils.getQ.
+                */
+               public ConvolutionParameters(int N, int C, int H, int W,
+                       int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int numThreads) {
+                       this.N = N;
+                       this.C = C;
+                       this.H = H;
+                       this.W = W;
+                       this.K = K;
+                       this.R = R;
+                       this.S = S;
+                       this.stride_h = stride_h;
+                       this.stride_w = stride_w;
+                       this.pad_h = pad_h;
+                       this.pad_w = pad_w;
+                       P = (int) ConvolutionUtils.getP(H, R, stride_h, pad_h);
+                       Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
+                       this.numThreads = numThreads;
+               }
+
+               /**
+                * Controls whether im2col explicitly writes zeros into padded output cells,
+                * which is needed when the output buffer is a reused, non-zeroed array.
+                */
+               public void setReuseNonZeroedOutput(boolean reuseNonZeroedOutput) {
+                       this.reuseNonZeroedOutput = reuseNonZeroedOutput;
+               }
+       }
+       
+       /**
+        * Backward pass of max pooling: for every pooling window, adds the matching dout
+        * value to the output cell at the position of the window's maximum input value.
+        * Values are accumulated with += into the dense block of outputBlock, so that block
+        * must be dense (presumably allocated and zeroed by the caller - confirm).
+        *
+        * @param input       pooling input, N rows by C*H*W columns
+        * @param dout        incoming gradient, N rows by C*P*Q columns
+        * @param outputBlock gradient w.r.t. input, indexed like input
+        * @param params      shape parameters; its input1/input2/output fields are overwritten
+        * @throws DMLRuntimeException if input or dout does not have the expected dimensions
+        */
+       public static void maxpooling_backward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+               params.input1 = input;
+               params.input2 = dout;
+               params.output = outputBlock;
+               if(input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N) {
+                       // Report actual rows/cols followed by the expected rows/cols that were checked above
+                       throw new DMLRuntimeException("Incorrect input dimensions in maxpooling_backward:" + input.getNumRows() + " "
+                                       + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W);
+               }
+
+               if(dout.getNumColumns() != params.C*params.P*params.Q || dout.getNumRows() != params.N) {
+                       // Report the dimensions of dout (not of input) and the expected C*P*Q columns
+                       throw new DMLRuntimeException("Incorrect dout dimensions in maxpooling_backward:" + dout.getNumRows() + " "
+                                       + dout.getNumColumns() + " " + params.N + " " + params.C*params.P*params.Q);
+               }
+
+               int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+                       for (int n = 0; n < params.N; n++) {
+                               for (int c = 0; c < params.C; c++) {
+                                       doPoolingBackward(n, c, params);
+                               }
+                       }
+               }
+               else {
+                       // Tasks are split per image and per range of channels
+                       runParallelConvTask(constrainedNumThreads, params.C, TaskType.MaxPooling_Backward, params);
+               }
+       }
+       
+       /**
+        * Max-pooling backward pass for one (image n, channel c) slice. For each of the
+        * P x Q pooling windows it locates the first cell holding the largest input value
+        * and adds the corresponding dout value to the output at that cell.
+        * Inputs may be sparse (read through quickGetValue); the output is written through
+        * its dense block.
+        */
+       private static void doPoolingBackward(int n, int c, ConvolutionParameters params) {
+               final double [] denseInput = params.input1.isInSparseFormat() ? null : params.input1.getDenseBlock();
+               final double [] denseDout = params.input2.isInSparseFormat() ? null : params.input2.getDenseBlock();
+               final double [] denseOutput = params.output.isInSparseFormat() ? null : params.output.getDenseBlock();
+
+               // Offsets of this slice: within a row (column offsets) and within the linearized arrays
+               final int inColOffset = c*params.H*params.W;
+               final int inOffset = n*params.C*params.H*params.W + inColOffset;
+               final int doutColOffset = c*params.P*params.Q;
+               final int doutOffset = n*params.C*params.P*params.Q + doutColOffset;
+
+               for (int p = 0; p < params.P; p++) {
+                       // Row range of the window, clipped to the unpadded input
+                       final int rawStartH = p * params.stride_h - params.pad_h;
+                       final int hEnd = Math.min(rawStartH + params.R, params.H);
+                       final int hStart = Math.max(rawStartH, 0);
+
+                       for (int q = 0; q < params.Q; q++) {
+                               // Column range of the window, clipped to the unpadded input
+                               final int rawStartW = q * params.stride_w - params.pad_w;
+                               final int wEnd = Math.min(rawStartW + params.S, params.W);
+                               final int wStart = Math.max(rawStartW, 0);
+
+                               // Defaults to the window's top-left cell if no value exceeds -Double.MAX_VALUE
+                               int argMax = inOffset + hStart*params.W + wStart;
+                               double best = -Double.MAX_VALUE;
+
+                               for (int h = hStart; h < hEnd; h++) {
+                                       for (int w = wStart; w < wEnd; w++) {
+                                               final double candidate;
+                                               if(denseInput != null)
+                                                       candidate = denseInput[inOffset + h*params.W + w];
+                                               else
+                                                       candidate = params.input1.quickGetValue(n, inColOffset + h*params.W + w);
+
+                                               // Strict comparison: on ties the first cell encountered wins
+                                               if(candidate > best) {
+                                                       argMax = inOffset + h*params.W + w;
+                                                       best = candidate;
+                                               }
+                                       }
+                               }
+
+                               final double grad;
+                               if(denseDout != null)
+                                       grad = denseDout[doutOffset + p * params.Q + q];
+                               else
+                                       grad = params.input2.quickGetValue(n, doutColOffset + p * params.Q + q);
+
+                               denseOutput[argMax] += grad;
+                       }
+               }
+       }
+
+       /**
+        * Forward max pooling: writes, for every image, channel and pooling window, the
+        * maximum input value of the window into the dense block of outputBlock
+        * (indexed as N rows by C*P*Q columns).
+        *
+        * @param input       pooling input, N rows by C*H*W columns
+        * @param outputBlock result block; written through its dense array
+        * @param params      shape parameters; its input1/output fields are overwritten
+        * @throws DMLRuntimeException if input does not have the expected dimensions
+        */
+       public static void maxpooling(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+               params.input1 = input;
+               params.output = outputBlock;
+
+               if(input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N) {
+                       // Report actual rows/cols followed by the expected rows/cols that were checked above
+                       throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " "
+                                       + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W);
+               }
+
+               int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+                       for (int n = 0; n < params.N; n++) {
+                               for (int c = 0; c < params.C; c++) {
+                                       doPooling(n, c, params);
+                               }
+                       }
+               }
+               else {
+                       // Tasks are split per image and per range of channels
+                       runParallelConvTask(constrainedNumThreads, params.C, TaskType.MaxPooling_Forward, params);
+               }
+       }
+
+       /**
+        * Forward max pooling for one (image n, channel c) slice: every one of the P x Q
+        * output cells receives the maximum over its (clipped) input window, starting
+        * from -Double.MAX_VALUE. The input may be sparse; the output is written through
+        * its dense block.
+        */
+       private static void doPooling(int n, int c, ConvolutionParameters params) {
+               final double [] denseInput = params.input1.isInSparseFormat() ? null : params.input1.getDenseBlock();
+               final double [] denseOutput = params.output.isInSparseFormat() ? null : params.output.getDenseBlock();
+
+               // Offsets of this slice within a row and within the linearized arrays
+               final int inColOffset = c*params.H*params.W;
+               final int inOffset = n*params.C*params.H*params.W + inColOffset;
+               final int outOffset = n*params.C*params.P*params.Q + c*params.P*params.Q;
+
+               for (int p = 0; p < params.P; p++) {
+                       // Row range of the window, clipped to the unpadded input
+                       final int rawStartH = p * params.stride_h - params.pad_h;
+                       final int hEnd = Math.min(rawStartH + params.R, params.H);
+                       final int hStart = Math.max(rawStartH, 0);
+
+                       for (int q = 0; q < params.Q; q++) {
+                               // Column range of the window, clipped to the unpadded input
+                               final int rawStartW = q * params.stride_w - params.pad_w;
+                               final int wEnd = Math.min(rawStartW + params.S, params.W);
+                               final int wStart = Math.max(rawStartW, 0);
+
+                               final int outPos = outOffset + p * params.Q + q;
+                               denseOutput[outPos] = -Double.MAX_VALUE;
+                               for (int h = hStart; h < hEnd; h++) {
+                                       for (int w = wStart; w < wEnd; w++) {
+                                               final double val;
+                                               if(denseInput != null)
+                                                       val = denseInput[inOffset + h*params.W + w];
+                                               else
+                                                       val = params.input1.quickGetValue(n, inColOffset + h*params.W + w);
+                                               denseOutput[outPos] = Math.max(denseOutput[outPos], val);
+                                       }
+                               }
+                       }
+               }
+       }
+               
+       /**
+        * Rearranges a 4D tensor stored in (N, K, P, Q) order into (N, P, Q, K) order,
+        * i.e. k becomes the fastest-varying index of the output.
+        *
+        * @param input       block with N rows and K*P*Q columns
+        * @param outputBlock result block; written through its dense array
+        * @param params      shape parameters; its input1/output fields are overwritten
+        * @throws DMLRuntimeException if input does not have the expected dimensions
+        */
+       public static void rotate180(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+               params.input1 = input;
+               params.output = outputBlock;
+
+               final boolean dimsOk = input.getNumColumns() == params.K*params.P*params.Q
+                               && input.getNumRows() == params.N;
+               if(!dimsOk) {
+                       throw new DMLRuntimeException("Incorrect input dimensions in rotate180:" + input.getNumRows() + " "
+                                       + input.getNumColumns() + " " + params.N + " " + params.K*params.P*params.Q);
+               }
+
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if(ALLOW_MULTI_THREADED_OPS && numThreads > 1) {
+                       // One task per image
+                       runParallelConvTask(numThreads, 1, TaskType.Rotate180, params);
+                       return;
+               }
+               for (int img = 0; img < params.N; img++) {
+                       doRotate180(img, params);
+               }
+       }
+       
+       /**
+        * Rearranges image n from (K, P, Q) order in the input to (P, Q, K) order in the
+        * output. The input may be sparse; the output is written through its dense block.
+        */
+       private static void doRotate180(int n, ConvolutionParameters params) {
+               final double [] denseInput = params.input1.isInSparseFormat() ? null : params.input1.getDenseBlock();
+               final double [] denseOutput = params.output.isInSparseFormat() ? null : params.output.getDenseBlock();
+
+               // Both layouts use K*P*Q cells per image
+               final int imgOffset = n*params.K*params.P*params.Q;
+
+               for (int k = 0; k < params.K; k++) {
+                       for (int p = 0; p < params.P; p++) {
+                               for (int q = 0; q < params.Q; q++) {
+                                       final int srcCol = k*params.P*params.Q + p*params.Q + q;
+                                       final int dstPos = imgOffset + p*params.Q*params.K + q*params.K + k;
+                                       if(denseInput != null)
+                                               denseOutput[dstPos] = denseInput[imgOffset + srcCol];
+                                       else
+                                               denseOutput[dstPos] = params.input1.quickGetValue(n, srcCol);
+                               }
+                       }
+               }
+       }
+       
+       
+       /**
+        * Reshapes a matrix of dimension (K, NPQ) into a 4D tensor of dimension (N, K, P, Q).
+        *
+        * @param input       block with K rows and N*P*Q columns
+        * @param outputBlock result block; written through its dense array
+        * @param params      shape parameters; its input1/output fields are overwritten
+        * @throws DMLRuntimeException if input does not have the expected dimensions
+        */
+       public static void reshape_col(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+               params.input1 = input;
+               params.output = outputBlock;
+
+               final boolean dimsOk = input.getNumColumns() == params.N*params.P*params.Q
+                               && input.getNumRows() == params.K;
+               if(!dimsOk) {
+                       throw new DMLRuntimeException("Incorrect input dimensions in reshape_col:" + input.getNumRows() + " " + input.getNumColumns());
+               }
+
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if(ALLOW_MULTI_THREADED_OPS && numThreads > 1) {
+                       // One task per image
+                       runParallelConvTask(numThreads, 1, TaskType.ReshapeCol, params);
+                       return;
+               }
+               for (int img = 0; img < params.N; img++) {
+                       doReshapeCol(img, params);
+               }
+       }
+       
+       /**
+        * Runs the given task type over all images and z-indices [0, Z) on a fixed thread pool.
+        * Work is split statically: one task per image and per chunk of at most TASK_SIZE z-indices.
+        *
+        * @param constrainedNumThreads upper bound on the number of worker threads
+        * @param Z                     number of z-indices per image (e.g. channels); 1 for per-image tasks
+        * @param type                  kind of work each task performs
+        * @param params                shared parameters and blocks read/written by the tasks
+        * @throws DMLRuntimeException if the calling thread is interrupted or any task fails
+        */
+       private static void runParallelConvTask(int constrainedNumThreads, int Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
+               ArrayList<ConvTask> tasks = new ArrayList<ConvTask>();
+
+               // Total number of compute units available: constrainedNumThreads
+               // Static task allocation. TODO: Do this in dynamic way
+               for (int n = 0; n < params.N; n++) {
+                       for (int z = 0; z < Z; z += TASK_SIZE) {
+                               tasks.add(new ConvTask(n, n+1, z, Math.min(Z, z+TASK_SIZE), type, params));
+                       }
+               }
+
+               // Nothing to do; also avoids newFixedThreadPool(0), which throws IllegalArgumentException
+               if(tasks.isEmpty()) {
+                       return;
+               }
+
+               ExecutorService pool = Executors.newFixedThreadPool( Math.min(constrainedNumThreads, tasks.size()) );
+               try {
+                       List<Future<Object>> results = pool.invokeAll(tasks);
+                       // invokeAll does not rethrow task failures; get() surfaces them so that
+                       // a failed task cannot silently leave the output partially computed.
+                       for (Future<Object> result : results) {
+                               result.get();
+                       }
+               } catch (InterruptedException e) {
+                       // Restore the interrupt flag for callers further up the stack
+                       Thread.currentThread().interrupt();
+                       throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e);
+               } catch (ExecutionException e) {
+                       throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e);
+               } finally {
+                       // Always release the worker threads, also on failure
+                       pool.shutdown();
+               }
+       }
+       
+       /**
+        * Unit of parallel work: applies the helper selected by the task type to the
+        * images [n1, n2) and, for the types that take a second index, to the
+        * z-indices [z1, z2).
+        */
+       private static class ConvTask implements Callable<Object> {
+               private final int n1;
+               private final int n2;
+               private final int z1;
+               private final int z2;
+               private final ConvolutionParameters params;
+               private final TaskType type;
+
+               public ConvTask(int n1, int n2, int z1, int z2, TaskType type, ConvolutionParameters params) {
+                       this.n1 = n1;
+                       this.n2 = n2;
+                       this.z1 = z1;
+                       this.z2 = z2;
+                       this.type = type;
+                       this.params = params;
+               }
+
+               /**
+                * Executes the task.
+                *
+                * @return always null
+                * @throws RuntimeException for a task type without a matching case
+                */
+               @Override
+               public Object call() throws Exception {
+                       switch(type) {
+                               // Per-image operations: the z range is not used
+                               case ReshapeCol:
+                                       for (int img = n1; img < n2; img++) {
+                                               LibMatrixDNN.doReshapeCol(img, params);
+                                       }
+                                       break;
+                               case Rotate180:
+                                       for (int img = n1; img < n2; img++) {
+                                               LibMatrixDNN.doRotate180(img, params);
+                                       }
+                                       break;
+                               // Per-image, per-z operations
+                               case Im2Col:
+                                       for (int img = n1; img < n2; img++) {
+                                               for (int ch = z1; ch < z2; ch++) {
+                                                       LibMatrixDNN.doIm2colOverInputPath_NCHW(img, ch, params);
+                                               }
+                                       }
+                                       break;
+                               case Col2Im:
+                                       for (int img = n1; img < n2; img++) {
+                                               for (int ch = z1; ch < z2; ch++) {
+                                                       LibMatrixDNN.doCol2imOverInputPath_NCHW(img, ch, params);
+                                               }
+                                       }
+                                       break;
+                               case MaxPooling_Forward:
+                                       for (int img = n1; img < n2; img++) {
+                                               for (int ch = z1; ch < z2; ch++) {
+                                                       LibMatrixDNN.doPooling(img, ch, params);
+                                               }
+                                       }
+                                       break;
+                               case MaxPooling_Backward:
+                                       for (int img = n1; img < n2; img++) {
+                                               for (int ch = z1; ch < z2; ch++) {
+                                                       LibMatrixDNN.doPoolingBackward(img, ch, params);
+                                               }
+                                       }
+                                       break;
+                               default:
+                                       throw new RuntimeException("Unsupported ConvTask:" + type.name());
+                       }
+                       return null;
+               }
+       }
+               
+       /**
+        * Copies the P*Q values of every channel k of image n from the (K, NPQ) input
+        * layout to the (N, K, P, Q) output layout. A dense input is copied row segment
+        * by row segment; a sparse input is read cell by cell through quickGetValue.
+        */
+       private static void doReshapeCol(int n, ConvolutionParameters params) {
+               final double [] denseInput = params.input1.isInSparseFormat() ? null : params.input1.getDenseBlock();
+               final double [] denseOutput = params.output.isInSparseFormat() ? null : params.output.getDenseBlock();
+
+               if(denseInput == null) {
+                       // Sparse input: fetch each cell individually
+                       for (int k = 0; k < params.K; k++) {
+                               final int dstOffset = n*params.K*params.P*params.Q + k*params.P*params.Q;
+                               for (int p = 0; p < params.P; p++) {
+                                       for (int q = 0; q < params.Q; q++) {
+                                               denseOutput[dstOffset + p*params.Q + q] =
+                                                       params.input1.quickGetValue(k, n*params.P*params.Q + p*params.Q + q);
+                                       }
+                               }
+                       }
+                       return;
+               }
+
+               // Dense input: the P*Q cells of (k, n) are contiguous in both layouts
+               for (int k = 0; k < params.K; k++)  {
+                       final int srcPos = k*params.N*params.P*params.Q + n*params.P*params.Q;
+                       final int dstPos = n*params.K*params.P*params.Q + k*params.P*params.Q;
+                       System.arraycopy(denseInput, srcPos, denseOutput, dstPos, params.P*params.Q);
+               }
+       }
+       
+       /**
+        * Converts a 4D tensor (N, C, H, W) to a matrix of dimension (CRS, NPQ).
+        *
+        * @param input       input tensor; read as N rows by C*H*W columns
+        * @param outputBlock result block; written through its dense array
+        * @param params      shape parameters; its input1/output fields are overwritten
+        * @throws DMLRuntimeException if the parallel execution fails
+        */
+       public static void im2col(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+               params.input1 = input;
+               params.output = outputBlock;
+
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if(ALLOW_MULTI_THREADED_OPS && numThreads > 1) {
+                       // Tasks are split per image and per range of channels
+                       runParallelConvTask(numThreads, params.C, TaskType.Im2Col, params);
+                       return;
+               }
+               // Sequential: all images, and all channels of each image (format is NCHW)
+               for (int img = 0; img < params.N; img++) {
+                       for (int ch = 0; ch < params.C; ch++) {
+                               doIm2colOverInputPath_NCHW(img, ch, params);
+                       }
+               }
+       }
+       
+       /**
+        * Converts a matrix of dimension (CRS, NPQ) to a 4D tensor (N, C, H, W).
+        * Values are accumulated with += into the dense block of outputBlock.
+        *
+        * @param input       matrix in (CRS, NPQ) layout
+        * @param outputBlock result block; written through its dense array
+        * @param params      shape parameters; its input1/output fields are overwritten
+        * @throws DMLRuntimeException if the parallel execution fails
+        */
+       public static void col2im(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+               params.input1 = input;
+               params.output = outputBlock;
+
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if(ALLOW_MULTI_THREADED_OPS && numThreads > 1) {
+                       // Parallel col2im: tasks are split per image and per range of channels
+                       runParallelConvTask(numThreads, params.C, TaskType.Col2Im, params);
+                       return;
+               }
+               // Sequential col2im: all images, and all channels of each image (format is NCHW)
+               for (int img = 0; img < params.N; img++) {
+                       for (int ch = 0; ch < params.C; ch++) {
+                               doCol2imOverInputPath_NCHW(img, ch, params);
+                       }
+               }
+       }
+       
+               
+       /**
+        * col2im for one (image n, channel c) slice: walks the (CRS, NPQ) input in the
+        * order (r, s, p, q) and adds every value whose source position lies inside the
+        * unpadded H x W image to the matching output cell. Positions that fall into the
+        * padding are skipped while the input cursor still advances.
+        */
+       private static void doCol2imOverInputPath_NCHW(int n, int c, ConvolutionParameters params) {
+               final double [] denseInput = params.input1.isInSparseFormat() ? null : params.input1.getDenseBlock();
+               final double [] denseOutput = params.output.isInSparseFormat() ? null : params.output.getDenseBlock();
+
+               // Start of this (n, c) image plane in the output
+               final int planeOffset = n*params.C*params.H*params.W + c*params.H*params.W;
+
+               for (int r = 0; r < params.R; r++) { // Patch of size R X S
+                       for (int s = 0; s < params.S; s++) {
+                               // Cursor into the linearized input: row (c, r, s), column (n, 0, 0)
+                               int cursor = ((c*params.R*params.S*params.N + r*params.S*params.N + s*params.N + n)*params.P*params.Q);
+
+                               int imgRow = r - params.pad_h;
+                               for (int p = 0; p < params.P; p++, imgRow += params.stride_h) {
+                                       if (imgRow < 0 || imgRow >= params.H) {
+                                               // Whole output row lies in the padding: skip its Q input values
+                                               cursor += params.Q;
+                                               continue;
+                                       }
+                                       int imgCol = s - params.pad_w;
+                                       for (int q = 0; q < params.Q; q++, cursor++, imgCol += params.stride_w) {
+                                               if (imgCol < 0 || imgCol >= params.W) {
+                                                       continue;
+                                               }
+                                               // Target cell [channel c, height imgRow, width imgCol]
+                                               final int dst = planeOffset + imgRow*params.W + imgCol;
+                                               if (denseInput != null) {
+                                                       denseOutput[dst] += denseInput[cursor];
+                                               }
+                                               else {
+                                                       // TODO: Optimize for sparse input
+                                                       // Note: cursor = row*N*P*Q + col
+                                                       final int srcRow = cursor / (params.N*params.P*params.Q);
+                                                       final int srcCol = cursor % (params.N*params.P*params.Q);
+                                                       denseOutput[dst] += params.input1.quickGetValue(srcRow, srcCol);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * im2col for one (image n, channel c) slice: for every patch offset (r, s) and
+        * every output position (p, q) it copies the input value at the corresponding
+        * image position into the (CRS, NPQ) output. Positions that fall into the padding
+        * are left untouched, unless params.reuseNonZeroedOutput is set, in which case
+        * they are explicitly written as 0 because the output buffer may hold stale values.
+        * The input may be sparse; the output is written through its dense block.
+        */
+       private static void doIm2colOverInputPath_NCHW(int n, int c, ConvolutionParameters params) {
+               double [] inputArray = null;
+               if (!params.input1.isInSparseFormat())
+                       inputArray = params.input1.getDenseBlock();
+               double [] outputArray = null;
+               if (!params.output.isInSparseFormat())
+                       outputArray = params.output.getDenseBlock();
+
+               // Start of the (n, c) image plane in the input and of row (c, 0, 0), column (n, 0, 0) in the output
+               final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
+               final int outputOffset = (c*params.R*params.S*params.N + n)*params.P*params.Q;
+
+               for (int r = 0; r < params.R; r++) { // Get an input patch of size R X S
+                       for (int s = 0; s < params.S; s++) {
+                               int localIndex = outputOffset + ((r*params.S*params.N + s*params.N)*params.P*params.Q);
+
+                               int input_row = r - params.pad_h;
+                               // And copy it to outputArray[i] (taking care of padding & striding)
+                               for (int p = params.P; p > 0; p--) {
+                                       if (input_row >= 0 && input_row < params.H) {
+                                               int input_col = s - params.pad_w;
+                                               for (int q = params.Q; q > 0; q--, localIndex++) {
+                                                       if (input_col >= 0 && input_col < params.W) {
+                                                               // Copy from [channel c, height input_row, width input_col]
+                                                               if(inputArray != null)
+                                                                       outputArray[localIndex] = inputArray[inputOffset + input_row*params.W + input_col];
+                                                               else
+                                                                       outputArray[localIndex] = params.input1.quickGetValue(n, c*params.H*params.W + input_row*params.W + input_col);
+                                                       }
+                                                       else if(params.reuseNonZeroedOutput) {
+                                                               outputArray[localIndex] = 0;
+                                                       }
+                                                       input_col += params.stride_w;
+                                               }
+                                       } else {
+                                               // The whole output row lies in the padding
+                                               if(params.reuseNonZeroedOutput) {
+                                                       // Zero all Q cells of this row: index with i, not localIndex,
+                                                       // otherwise only the first cell is cleared
+                                                       for(int i = localIndex; i < localIndex + params.Q; i++) {
+                                                               outputArray[i] = 0;
+                                                       }
+                                               }
+                                               localIndex += params.Q;
+                                       }
+                                       input_row += params.stride_h;
+                               }
+                       }
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index c3af41f..19831a4 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -30,9 +30,9 @@ import java.io.ObjectOutputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
-
 import org.apache.commons.math3.random.Well1024a;
 import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.OptimizerUtils;
@@ -403,13 +403,8 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                        allocateDenseBlock();
        }
        
-       /**
-        * 
-        * @param clearNNZ
-        * @throws DMLRuntimeException
-        */
-       public void allocateDenseBlock(boolean clearNNZ) 
-               throws RuntimeException 
+       public void allocateDenseBlock(boolean clearNNZ, boolean zeroOut) 
+                       throws RuntimeException 
        {
                long limit = (long)rlen * clen;
                
@@ -420,14 +415,34 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                }
                
                //allocate block if non-existing or too small (guaranteed to be 
0-initialized),
-               if(denseBlock == null || denseBlock.length < limit ) {
+               if(!zeroOut && DMLScript.REUSE_NONZEROED_OUTPUT 
+                               && (denseBlock == null || denseBlock.length < 
limit)
+                               // Not a column vector
+                               && rlen != 1 && clen != 1) {
+                       denseBlock = LibMatrixDNN.getReuseableData(limit);
+               }
+               if(denseBlock == null || denseBlock.length < limit) {
                        denseBlock = new double[(int)limit];
                }
                
+               
                //clear nnz if necessary
                if( clearNNZ ) {
                        nonZeros = 0;
                }
+               
+               sparse = false;
+       }
+       
+       /**
+        * Convenience overload that delegates to
+        * {@link #allocateDenseBlock(boolean, boolean)} with zeroOut=true.
+        * With zeroOut=true the two-argument overload skips the branch that
+        * borrows a recycled, non-zeroed array via LibMatrixDNN.getReuseableData.
+        * 
+        * @param clearNNZ if true, the non-zero count is reset to 0
+        * @throws RuntimeException propagated unchanged from the two-argument overload
+        */
+       public void allocateDenseBlock(boolean clearNNZ) 
+               throws RuntimeException 
+       {
+               allocateDenseBlock(clearNNZ, true);
        }
        
        /**

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java 
b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
new file mode 100644
index 0000000..80b20cd
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.util;
+
+
+/**
+ * Helpers for computing the output patch size of a strided, padded window
+ * that slides over an image.
+ */
+public class ConvolutionUtils {
+
+       /**
+        * Computes the output patch height
+        * floor((H + 2 * heightPadding - R) / verticalStride) + 1.
+        * 
+        * @param H image height
+        * @param R filter height
+        * @param verticalStride stride along the height dimension; must be positive
+        * @param heightPadding padding added on each side of the height dimension
+        * @return the output patch height, always positive
+        * @throws RuntimeException if the stride is not positive or the resulting size is not positive
+        */
+       public static long getP(long H, long R, long verticalStride, long heightPadding) {
+               return getOutputSize(H, R, verticalStride, heightPadding,
+                               "(image_height + 2 * pad_h - filter_height) / verticalStride + 1");
+       }
+
+       /**
+        * Computes the output patch width
+        * floor((W + 2 * widthPadding - S) / horizontalStride) + 1.
+        * 
+        * @param W image width
+        * @param S filter width
+        * @param horizontalStride stride along the width dimension; must be positive
+        * @param widthPadding padding added on each side of the width dimension
+        * @return the output patch width, always positive
+        * @throws RuntimeException if the stride is not positive or the resulting size is not positive
+        */
+       public static long getQ(long W, long S, long horizontalStride, long widthPadding) {
+               return getOutputSize(W, S, horizontalStride, widthPadding,
+                               "(image_width + 2 * pad_w - filter_width) / horizontalStride + 1");
+       }
+
+       /**
+        * Shared implementation of {@link #getP} and {@link #getQ}.
+        * 
+        * @param inputSize image extent along the dimension
+        * @param filterSize filter extent along the dimension
+        * @param stride step between consecutive window positions
+        * @param padding padding added on each side
+        * @param formula symbolic formula, used only in error messages
+        * @return floor((inputSize + 2 * padding - filterSize) / stride) + 1
+        */
+       private static long getOutputSize(long inputSize, long filterSize, long stride, long padding, String formula) {
+               // Reject a zero/negative stride explicitly instead of failing with a bare "/ by zero".
+               if(stride <= 0) {
+                       throw new RuntimeException("Incorrect stride: " + formula
+                                       + " requires a positive stride, but the stride is " + stride);
+               }
+               long span = inputSize + 2 * padding - filterSize;
+               long quotient = span / stride;
+               // Java's '/' truncates toward zero. Without this correction a filter that is
+               // larger than the padded image (negative span, |span| < stride) would yield
+               // 0 + 1 = 1 and slip past the positivity check below.
+               if(span < 0 && span % stride != 0) {
+                       quotient--;
+               }
+               long ret = quotient + 1;
+               if(ret <= 0) {
+                       throw new RuntimeException("Incorrect output patch size: " + formula
+                                       + " needs to be positive, but is " + ret
+                                       + " ((" + inputSize + " + 2 * " + padding + " - " + filterSize + ") / " + stride + " + 1)");
+               }
+               return ret;
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java 
b/src/main/java/org/apache/sysml/utils/Statistics.java
index a517136..edb3493 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -102,6 +102,15 @@ public class Statistics
        private static AtomicLong lTotalLixUIP = new AtomicLong(0);
        
        
+       private static AtomicLong denseBlockAllocationTime = new AtomicLong(0);
+       private static AtomicLong sparseBlockAllocationTime = new AtomicLong(0);
+       
+       /**
+        * Adds an elapsed allocation time to the running dense-block or
+        * sparse-block total. The totals are divided by 1e9 and labelled "sec."
+        * when displayed, so the value is expected in nanoseconds.
+        * 
+        * @param allocationTime elapsed time to add to the selected total
+        * @param isSparse true to charge the sparse-block total, false for the dense-block total
+        */
+       public static void incrementAllocationTime(long allocationTime, boolean isSparse) {
+               AtomicLong total = isSparse ? sparseBlockAllocationTime : denseBlockAllocationTime;
+               total.addAndGet(allocationTime);
+       }
        
        public static synchronized void setNoOfExecutedMRJobs(int 
iNoOfExecutedMRJobs) {
                Statistics.iNoOfExecutedMRJobs = iNoOfExecutedMRJobs;
@@ -340,6 +349,9 @@ public class Statistics
                resetJVMgcTime();
                resetJVMgcCount();
                resetCPHeavyHitters();
+               
+               denseBlockAllocationTime.set(0);
+               sparseBlockAllocationTime.set(0);
        }
        
        /**
@@ -591,6 +603,10 @@ public class Statistics
                        sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + 
CacheStatistics.displayHits() + ".\n");
                        sb.append("Cache writes (WB, FS, HDFS):\t" + 
CacheStatistics.displayWrites() + ".\n");
                        sb.append("Cache times (ACQr/m, RLS, EXP):\t" + 
CacheStatistics.displayTime() + " sec.\n");
+                       if(DMLScript.REUSE_NONZEROED_OUTPUT) {
+                               sb.append("Allocation time (Dense/Sparse):\t" + 
String.format("%.3f", denseBlockAllocationTime.doubleValue()/1000000000) 
+                                               + "/" + String.format("%.3f", 
sparseBlockAllocationTime.doubleValue()/1000000000)  + " sec.\n");
+                       }
                        sb.append("HOP DAGs recompiled (PRED, SB):\t" + 
getHopRecompiledPredDAGs() + "/" + getHopRecompiledSBDAGs() + ".\n");
                        sb.append("HOP DAGs recompile time:\t" + 
String.format("%.3f", ((double)getHopRecompileTime())/1000000000) + " sec.\n");
                        if( getFunRecompiles()>0 ) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardDataTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardDataTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardDataTest.java
new file mode 100644
index 0000000..b42b061
--- /dev/null
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardDataTest.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.test.integration.functions.tensor;
+
+import java.util.HashMap;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Runs the Conv2DBackwardDataTest DML script for several configurations and
+ * compares its output matrix B against the output of the R script of the
+ * same name.
+ */
+public class Conv2DBackwardDataTest extends AutomatedTestBase
+{
+
+       private final static String TEST_NAME = "Conv2DBackwardDataTest";
+       private final static String TEST_DIR = "functions/tensor/";
+       // Derived from this class's own name so that the outputs do not land in
+       // the directory of another test class.
+       private final static String TEST_CLASS_DIR = TEST_DIR + Conv2DBackwardDataTest.class.getSimpleName() + "/";
+       private final static double epsilon=0.0000000001;
+
+       @Override
+       public void setUp() {
+               addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, 
+                               new String[] {"B"}));
+       }
+
+       @Test
+       public void testConv2DDense1() 
+       {
+               int numImg = 2; int imgSize = 10; int numChannels = 3; int numFilters = 2; int filterSize = 2; int stride = 1; int pad = 0;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
+       }
+
+       @Test
+       public void testConv2DDense2() 
+       {
+               int numImg = 5; int imgSize = 3; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 1;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
+       }
+
+       @Test
+       public void testConv2DDense3() 
+       {
+               int numImg = 5; int imgSize = 3; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 2; int pad = 1;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
+       }
+
+       @Test
+       public void testConv2DDense4() 
+       {
+               int numImg = 5; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
+       }
+
+
+       /**
+        * Runs the DML script and the R script with the same arguments and
+        * asserts that both produce the same matrix B (within epsilon). The
+        * runtime platform and the local Spark flag are restored afterwards.
+        * 
+        * @param et execution type: SPARK selects the Spark platform, MR selects Hadoop, anything else runs single-node
+        * @param imgSize image size argument passed to both scripts; also used to derive the output patch size
+        * @param numImg number-of-images argument passed to both scripts
+        * @param numChannels number-of-channels argument passed to both scripts
+        * @param numFilters number-of-filters argument passed to both scripts
+        * @param filterSize filter size argument passed to both scripts
+        * @param stride stride argument passed to both scripts
+        * @param pad padding argument passed to both scripts
+        */
+       public void runConv2DTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, 
+                       int filterSize, int stride, int pad) 
+       {
+               RUNTIME_PLATFORM oldRTP = rtplatform;
+               boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+
+               try
+               {
+                       TestConfiguration config = getTestConfiguration(TEST_NAME);
+                       if(et == ExecType.SPARK) {
+                               rtplatform = RUNTIME_PLATFORM.SPARK;
+                       }
+                       else {
+                               rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP : RUNTIME_PLATFORM.SINGLE_NODE;
+                       }
+                       if( rtplatform == RUNTIME_PLATFORM.SPARK )
+                               DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+                       loadTestConfiguration(config);
+
+                       /* This is for running the junit test the new way, i.e., construct the arguments directly */
+                       String RI_HOME = SCRIPT_DIR + TEST_DIR;
+                       fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
+
+                       // Output patch size; the same value is passed for both output dimensions.
+                       long P = ConvolutionUtils.getP(imgSize, filterSize, stride, pad);
+                       programArgs = new String[]{"-explain", "-args",  "" + imgSize, "" + numImg, 
+                                       "" + numChannels, "" + numFilters, 
+                                       "" + filterSize, "" + stride, "" + pad,
+                                       "" + P, "" + P, 
+                                       output("B")};
+
+                       boolean exceptionExpected = false;
+                       int expectedNumberOfJobs = -1;
+                       runTest(true, exceptionExpected, null, expectedNumberOfJobs);
+
+                       fullRScriptName = RI_HOME + TEST_NAME + ".R";
+                       rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg + 
+                                       " " + numChannels + " " + numFilters + 
+                                       " " + filterSize + " " + stride + " " + pad + " " + P + " " + P + " " + expectedDir();
+                       // Run comparison R script
+                       runRScript(true);
+                       HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
+
+                       HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
+                       // The reference comes from the R script, so label it accordingly.
+                       TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "B-R");
+
+               }
+               finally
+               {
+                       rtplatform = oldRTP;
+                       DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+               }
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
new file mode 100644
index 0000000..2789ab9
--- /dev/null
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.test.integration.functions.tensor;
+
+import java.util.HashMap;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Runs the Conv2DBackwardTest DML script for several configurations and
+ * compares its output matrix B against the output of the R script of the
+ * same name.
+ */
+public class Conv2DBackwardTest extends AutomatedTestBase
+{
+
+       private final static String TEST_NAME = "Conv2DBackwardTest";
+       private final static String TEST_DIR = "functions/tensor/";
+       // Derived from this class's own name so that the outputs do not land in
+       // the directory of another test class.
+       private final static String TEST_CLASS_DIR = TEST_DIR + Conv2DBackwardTest.class.getSimpleName() + "/";
+       private final static double epsilon=0.0000000001;
+
+       @Override
+       public void setUp() {
+               addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, 
+                               new String[] {"B"}));
+       }
+
+
+       @Test
+       public void testConv2DBackwardFilterDense1() 
+       {
+               int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
+       }
+
+       @Test
+       public void testConv2DBackwardFilterDense2() 
+       {
+               int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
+       }
+
+       @Test
+       public void testConv2DBackwardFilterDense3() 
+       {
+               int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
+       }
+
+       @Test
+       public void testConv2DBackwardFilterDense4() 
+       {
+               int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 1;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
+       }
+
+       @Test
+       public void testConv2DBackwardFilterDense5() 
+       {
+               int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 3; int pad = 1;
+               runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad);
+       }
+
+       /**
+        * Runs the DML script and the R script with the same arguments and
+        * asserts that both produce the same matrix B (within epsilon). The
+        * runtime platform and the local Spark flag are restored afterwards.
+        * 
+        * @param et execution type: SPARK selects the Spark platform, MR selects Hadoop, anything else runs single-node
+        * @param imgSize image size argument passed to both scripts; also used to derive the output patch size
+        * @param numImg number-of-images argument passed to both scripts
+        * @param numChannels number-of-channels argument passed to both scripts
+        * @param numFilters number-of-filters argument passed to both scripts
+        * @param filterSize filter size argument passed to both scripts
+        * @param stride stride argument passed to both scripts
+        * @param pad padding argument passed to both scripts
+        */
+       public void runConv2DBackwardFilterTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, 
+                       int filterSize, int stride, int pad) 
+       {
+               RUNTIME_PLATFORM oldRTP = rtplatform;
+               boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+
+               try
+               {
+                       TestConfiguration config = getTestConfiguration(TEST_NAME);
+                       if(et == ExecType.SPARK) {
+                               rtplatform = RUNTIME_PLATFORM.SPARK;
+                       }
+                       else {
+                               rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP : RUNTIME_PLATFORM.SINGLE_NODE;
+                       }
+                       if( rtplatform == RUNTIME_PLATFORM.SPARK )
+                               DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+                       loadTestConfiguration(config);
+
+                       /* This is for running the junit test the new way, i.e., construct the arguments directly */
+                       String RI_HOME = SCRIPT_DIR + TEST_DIR;
+                       fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
+
+                       // Output patch size; the same value is passed for both output dimensions.
+                       long P = ConvolutionUtils.getP(imgSize, filterSize, stride, pad);
+
+                       programArgs = new String[]{"-explain", "-args",  "" + imgSize, "" + numImg, 
+                               "" + numChannels, "" + numFilters, 
+                               "" + filterSize, "" + stride, "" + pad,
+                               "" + P, "" + P, 
+                               output("B")};
+
+                       boolean exceptionExpected = false;
+                       int expectedNumberOfJobs = -1;
+                       runTest(true, exceptionExpected, null, expectedNumberOfJobs);
+
+                       fullRScriptName = RI_HOME + TEST_NAME + ".R";
+                       rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg + 
+                                       " " + numChannels + " " + numFilters + 
+                                       " " + filterSize + " " + stride + " " + pad + " " + P + " " + P + " " + expectedDir();
+                       // Run comparison R script
+                       runRScript(true);
+                       HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
+
+                       HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
+                       // The reference comes from the R script, so label it accordingly.
+                       TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "B-R");
+
+               }
+               finally
+               {
+                       rtplatform = oldRTP;
+                       DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+               }
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
new file mode 100644
index 0000000..3737801
--- /dev/null
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.test.integration.functions.tensor;
+
+import java.util.HashMap;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Runs the Conv2DTest DML script for several configurations and compares its
+ * output matrix B against the output of the R script of the same name.
+ */
+public class Conv2DTest extends AutomatedTestBase
+{
+
+       private final static String TEST_NAME = "Conv2DTest";
+       private final static String TEST_DIR = "functions/tensor/";
+       private final static String TEST_CLASS_DIR = TEST_DIR + Conv2DTest.class.getSimpleName() + "/";
+       private final static double epsilon=0.0000000001;
+
+       @Override
+       public void setUp() {
+               TestConfiguration conf = new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"B"});
+               addTestConfiguration(TEST_NAME, conf);
+       }
+
+       @Test
+       public void testConv2DDense1() 
+       {
+               // imgSize=3, numImg=5, numChannels=3, numFilters=6, filterSize=2, stride=1, pad=0
+               runConv2DTest(ExecType.CP, 3, 5, 3, 6, 2, 1, 0);
+       }
+
+       @Test
+       public void testConv2DDense2() 
+       {
+               // imgSize=10, numImg=1, numChannels=4, numFilters=3, filterSize=4, stride=2, pad=0
+               runConv2DTest(ExecType.CP, 10, 1, 4, 3, 4, 2, 0);
+       }
+
+       @Test
+       public void testConv2DDense3() 
+       {
+               // imgSize=10, numImg=1, numChannels=4, numFilters=3, filterSize=4, stride=2, pad=1
+               runConv2DTest(ExecType.CP, 10, 1, 4, 3, 4, 2, 1);
+       }
+
+       @Test
+       public void testConv2DDense4() 
+       {
+               // imgSize=10, numImg=3, numChannels=1, numFilters=3, filterSize=2, stride=2, pad=1
+               runConv2DTest(ExecType.CP, 10, 3, 1, 3, 2, 2, 1);
+       }
+
+       /**
+        * Runs the DML script and the R script with the same arguments and
+        * asserts that both produce the same matrix B (within epsilon). The
+        * runtime platform and the local Spark flag are restored afterwards.
+        * 
+        * @param et execution type: SPARK selects the Spark platform, MR selects Hadoop, anything else runs single-node
+        * @param imgSize image size argument passed to both scripts
+        * @param numImg number-of-images argument passed to both scripts
+        * @param numChannels number-of-channels argument passed to both scripts
+        * @param numFilters number-of-filters argument passed to both scripts
+        * @param filterSize filter size argument passed to both scripts
+        * @param stride stride argument passed to both scripts
+        * @param pad padding argument passed to both scripts
+        */
+       public void runConv2DTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, 
+                       int filterSize, int stride, int pad) 
+       {
+               final RUNTIME_PLATFORM platformBefore = rtplatform;
+               final boolean localSparkBefore = DMLScript.USE_LOCAL_SPARK_CONFIG;
+
+               try
+               {
+                       TestConfiguration config = getTestConfiguration(TEST_NAME);
+
+                       // Map the requested execution type onto a runtime platform.
+                       if( et == ExecType.SPARK ) {
+                               rtplatform = RUNTIME_PLATFORM.SPARK;
+                               DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+                       }
+                       else if( et == ExecType.MR ) {
+                               rtplatform = RUNTIME_PLATFORM.HADOOP;
+                       }
+                       else {
+                               rtplatform = RUNTIME_PLATFORM.SINGLE_NODE;
+                       }
+
+                       loadTestConfiguration(config);
+
+                       // Both scripts live next to each other and share the test name.
+                       final String scriptHome = SCRIPT_DIR + TEST_DIR;
+                       fullDMLScriptName = scriptHome + TEST_NAME + ".dml";
+
+                       programArgs = new String[] {
+                               "-explain", "-args",
+                               String.valueOf(imgSize), String.valueOf(numImg),
+                               String.valueOf(numChannels), String.valueOf(numFilters),
+                               String.valueOf(filterSize), String.valueOf(stride), String.valueOf(pad),
+                               output("B")
+                       };
+
+                       fullRScriptName = scriptHome + TEST_NAME + ".R";
+                       StringBuilder rCommand = new StringBuilder("Rscript");
+                       rCommand.append(' ').append(fullRScriptName);
+                       for( int value : new int[] {imgSize, numImg, numChannels, numFilters, filterSize, stride, pad} ) {
+                               rCommand.append(' ').append(value);
+                       }
+                       rCommand.append(' ').append(expectedDir());
+                       rCmd = rCommand.toString();
+
+                       // No exception expected; -1 disables the check on the number of jobs.
+                       runTest(true, false, null, -1);
+
+                       // Run comparison R script
+                       runRScript(true);
+                       HashMap<CellIndex, Double> expected = readRMatrixFromFS("B");
+                       HashMap<CellIndex, Double> actual = readDMLMatrixFromHDFS("B");
+                       TestUtils.compareMatrices(actual, expected, epsilon, "B-DML", "B-R");
+               }
+               finally
+               {
+                       rtplatform = platformBefore;
+                       DMLScript.USE_LOCAL_SPARK_CONFIG = localSparkBefore;
+               }
+       }
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolBackwardTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolBackwardTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolBackwardTest.java
new file mode 100644
index 0000000..db53d0b
--- /dev/null
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolBackwardTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.test.integration.functions.tensor;
+
+import java.util.HashMap;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Integration test that runs PoolBackwardTest.dml (pool mode "max") and the
+ * reference script PoolBackwardTest.R with identical arguments and compares
+ * the matrix "B" written by each of them.
+ */
+public class PoolBackwardTest extends AutomatedTestBase
+{
+       
+       private final static String TEST_NAME = "PoolBackwardTest";
+       private final static String TEST_DIR = "functions/tensor/";
+       // Per-class directory: derived from this class so that results are isolated from other tensor tests.
+       private final static String TEST_CLASS_DIR = TEST_DIR + PoolBackwardTest.class.getSimpleName() + "/";
+       private final static double epsilon=0.0000000001;
+       
+       @Override
+       public void setUp() {
+               addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, 
+                               new String[] {"B"}));
+       }
+       
+       @Test
+       public void testMaxPool2DBackwardDense1() 
+       {
+               int numImg = 1; int imgSize = 4; int numChannels = 1;  int stride = 2; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
+               runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max");
+       }
+       
+       @Test
+       public void testMaxPool2DBackwardDense2() 
+       {
+               int numImg = 3; int imgSize = 6; int numChannels = 3;  int stride = 1; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
+               runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max");
+       }
+       
+       @Test
+       public void testMaxPool2DBackwardDense3() 
+       {
+               int numImg = 2; int imgSize = 7; int numChannels = 2;  int stride = 2; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
+               runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max");
+       }
+       
+       /**
+        * Runs PoolBackwardTest.dml and PoolBackwardTest.R with the same arguments and
+        * asserts that the matrix "B" produced by both agrees within {@code epsilon}.
+        * The runtime platform and the local Spark flag are restored afterwards.
+        * 
+        * @param et execution type; SPARK and MR select the matching platform, anything else SINGLE_NODE
+        * @param imgSize height and width of the (square) input image
+        * @param numImg number of images
+        * @param numChannels number of channels per image
+        * @param stride stride, forwarded once and used by the scripts for both dimensions
+        * @param pad padding, forwarded once and used by the scripts for both dimensions
+        * @param poolSize1 first pooling window extent (used to derive P)
+        * @param poolSize2 second pooling window extent (used to derive Q)
+        * @param poolMode pooling mode string forwarded to the DML script only
+        */
+       public void runPoolTest( ExecType et, int imgSize, int numImg, int numChannels, int stride, 
+                       int pad, int poolSize1, int poolSize2, String poolMode) 
+       {
+               RUNTIME_PLATFORM oldRTP = rtplatform;
+                       
+               boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+               
+               try
+               {
+                       TestConfiguration config = getTestConfiguration(TEST_NAME);
+                       if(et == ExecType.SPARK) {
+                               rtplatform = RUNTIME_PLATFORM.SPARK;
+                       }
+                       else {
+                               rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP : RUNTIME_PLATFORM.SINGLE_NODE;
+                       }
+                       if( rtplatform == RUNTIME_PLATFORM.SPARK )
+                               DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+                       
+                       loadTestConfiguration(config);
+               
+                       /* This is for running the junit test the new way, i.e., construct the arguments directly */
+                       String RI_HOME = SCRIPT_DIR + TEST_DIR;
+                       fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
+                       
+                       // Output extents of the pooled image. Q is derived from poolSize2 rather than
+                       // reusing P, so non-square pooling windows get a matching width.
+                       // NOTE(review): assumes getP's output-size formula is dimension-agnostic - confirm.
+                       long P = ConvolutionUtils.getP(imgSize, poolSize1, stride, pad);
+                       long Q = ConvolutionUtils.getP(imgSize, poolSize2, stride, pad);
+                       programArgs = new String[]{"-explain", "-args",  "" + imgSize, "" + numImg, 
+                                       "" + numChannels, "" + poolSize1, "" + poolSize2, 
+                                       "" + stride, "" + pad, poolMode, 
+                                       "" + P, "" + Q, 
+                                       output("B")};
+                               
+                       boolean exceptionExpected = false;
+                       int expectedNumberOfJobs = -1;
+                       runTest(true, exceptionExpected, null, expectedNumberOfJobs);
+                       
+                       // The R script takes the same values minus poolMode, followed by the expected-output directory.
+                       fullRScriptName = RI_HOME + TEST_NAME + ".R";
+                       rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg + 
+                                       " " + numChannels + " " + poolSize1 + 
+                                       " " + poolSize2 + " " + stride + " " + pad + " " +  P + " " + Q + " " + expectedDir(); 
+                       
+                       // Run comparison R script
+                       runRScript(true);
+                       HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
+                       
+                       HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
+                       // The reference result comes from R, so label it accordingly in failure messages.
+                       TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "B-R");
+                       
+               }
+               finally
+               {
+                       rtplatform = oldRTP;
+                       DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+               }
+       }
+       
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
new file mode 100644
index 0000000..dc0599a
--- /dev/null
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.test.integration.functions.tensor;
+
+import java.util.HashMap;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+public class PoolTest extends AutomatedTestBase
+{
+       
+       private final static String TEST_NAME = "PoolTest";
+       private final static String TEST_DIR = "functions/tensor/";
+       private final static String TEST_CLASS_DIR = TEST_DIR + 
Conv2DTest.class.getSimpleName() + "/";
+       private final static double epsilon=0.0000000001;
+       
+       @Override
+       public void setUp() {
+               addTestConfiguration(TEST_NAME, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME, 
+                               new String[] {"B"}));
+       }
+       
+       @Test
+       public void testMaxPool2DDense1() 
+       {
+               int numImg = 1; int imgSize = 6; int numChannels = 1;  int 
stride = 2; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
+               runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, 
pad, poolSize1, poolSize2, "max");
+       }
+       
+       @Test
+       public void testMaxPool2DDense2() 
+       {
+               int numImg = 2; int imgSize = 6; int numChannels = 1;  int 
stride = 1; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
+               runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, 
pad, poolSize1, poolSize2, "max");
+       }
+       
+       
+       @Test
+       public void testMaxPool2DDense3() 
+       {
+               int numImg = 3; int imgSize = 7; int numChannels = 2;  int 
stride = 2; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
+               runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, 
pad, poolSize1, poolSize2, "max");
+       }
+       
+       @Test
+       public void testMaxPool2DDense4() 
+       {
+               int numImg = 2; int imgSize = 4; int numChannels = 2;  int 
stride = 1; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
+               runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, 
pad, poolSize1, poolSize2, "max");
+       }
+       
+       /**
+        * 
+        * @param et
+        * @param sparse
+        */
+       public void runPoolTest( ExecType et, int imgSize, int numImg, int 
numChannels, int stride, 
+                       int pad, int poolSize1, int poolSize2, String poolMode) 
+       {
+               RUNTIME_PLATFORM oldRTP = rtplatform;
+                       
+               boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+               
+               try
+               {
+                   TestConfiguration config = getTestConfiguration(TEST_NAME);
+                   if(et == ExecType.SPARK) {
+                       rtplatform = RUNTIME_PLATFORM.SPARK;
+                   }
+                   else {
+                       rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP 
: RUNTIME_PLATFORM.SINGLE_NODE;
+                   }
+                       if( rtplatform == RUNTIME_PLATFORM.SPARK )
+                               DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+                       
+                       loadTestConfiguration(config);
+               
+                       /* This is for running the junit test the new way, 
i.e., construct the arguments directly */
+                       String RI_HOME = SCRIPT_DIR + TEST_DIR;
+                       fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
+                       
+                       programArgs = new String[]{"-explain", "-args",  "" + 
imgSize, "" + numImg, 
+                                       "" + numChannels, "" + poolSize1, "" + 
poolSize2, 
+                                       "" + stride, "" + pad, poolMode, 
+                                       output("B")};
+                               
+                       boolean exceptionExpected = false;
+                       int expectedNumberOfJobs = -1;
+                       runTest(true, exceptionExpected, null, 
expectedNumberOfJobs);
+                       
+                       fullRScriptName = RI_HOME + TEST_NAME + ".R";
+                       rCmd = "Rscript" + " " + fullRScriptName + " " + 
imgSize + " " + numImg + 
+                                       " " + numChannels + " " + poolSize1 + 
+                                       " " + poolSize2 + " " + stride + " " + 
pad + " " + expectedDir(); 
+                       
+                       // Run comparison R script
+                       runRScript(true);
+                       HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
+                       
+                       HashMap<CellIndex, Double> dmlfile = 
readDMLMatrixFromHDFS("B");
+                       TestUtils.compareMatrices(dmlfile, bHM, epsilon, 
"B-DML", "NumPy");
+                       
+               }
+               finally
+               {
+                       rtplatform = oldRTP;
+                       DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+               }
+       }
+       
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.R 
b/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.R
new file mode 100644
index 0000000..e66d9e2
--- /dev/null
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.R
@@ -0,0 +1,104 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Script setup: load Matrix (needed for writeMM) and read the nine integer
+# arguments in the order the JUnit driver passes them.
+library("Matrix")
+args <- commandArgs(TRUE)
+imgSize     <- as.integer(args[1])
+numImg      <- as.integer(args[2])
+numChannels <- as.integer(args[3])
+numFilters  <- as.integer(args[4])
+filterSize  <- as.integer(args[5])
+stride      <- as.integer(args[6])
+pad         <- as.integer(args[7])
+P           <- as.integer(args[8])
+Q           <- as.integer(args[9])
+
+# Assumption: NCHW image format
+# Deterministic inputs: consecutive integers laid out row by row.
+w <- matrix(seq(1, numFilters*numChannels*filterSize*filterSize),
+            nrow=numFilters, ncol=numChannels*filterSize*filterSize, byrow=TRUE)
+dout <- matrix(seq(1, numImg*numFilters*P*Q),
+               nrow=numImg, ncol=numFilters*P*Q, byrow=TRUE)
+
+
+# Scatters patch columns back into an image.
+#   img_cols   matrix with one column per output position, indexed by
+#              (hout-1)*Wout + wout; each column is reshaped to (C, Hf*Wf)
+#   C          number of channels
+#   Hin, Win   height/width of the image to build, as passed by the caller
+#   Hf, Wf     patch height/width
+#   strideh, stridew  vertical/horizontal stride
+#   reduction  "add" accumulates overlapping patches; any other value
+#              overwrites the region with the patch written last
+# Returns a (C, Hin*Win) matrix, each channel flattened row by row.
+col2im <- function(img_cols, C, Hin, Win, Hf, Wf,
+                  strideh, stridew, reduction) {
+
+  Hout = as.integer((Hin - Hf) / strideh + 1)
+  Wout = as.integer((Win - Wf) / stridew + 1)
+
+  img = matrix(0, C, Hin*Win, byrow=TRUE)  # zeros
+  for (hout in 1:Hout) {  # all output rows
+    hin = (hout-1) * strideh + 1
+    for (wout in 1:Wout) {  # all output columns
+      win = (wout-1) * stridew + 1
+      # Extract a local patch of the input image corresponding spatially to 
the filter sizes.
+      # (Here the patch is taken from the column of img_cols for this output position.)
+      img_patch = matrix(img_cols[,(hout-1)*Wout + wout], C, Hf*Wf, 
byrow=TRUE)  # reshape column to (C, Hf*Wf)
+      for (c in 1:C) {  # all channels
+        img_patch_slice = matrix(img_patch[c,], Hf, Wf, byrow=TRUE)  # reshape
+        if (reduction == "add") {
+          # Place the patch on an empty canvas and accumulate it into the channel.
+          img_slice = matrix(0, Hin, Win, byrow=TRUE)
+          img_slice[hin:(hin+Hf-1), win:(win+Wf-1)] = img_patch_slice
+          img[c,] = img[c,] + matrix(t(img_slice), 1, Hin*Win)
+        } else {
+          # Overwrite the covered region of the current channel content.
+          img_slice = matrix(img[c,], Hin, Win, byrow=TRUE)
+          img_slice[hin:(hin+Hf-1), win:(win+Wf-1)] = img_patch_slice
+          img[c,] = matrix(t(img_slice), 1, Hin*Win)
+        }
+      }
+    }
+  }
+  img
+}
+
+# Strips padh rows and padw columns of border from every channel of a padded
+# image. Input rows are channels flattened row by row with extents
+# (Hin+2*padh) x (Win+2*padw); the result is a (C, Hin*Win) matrix.
+unpad_image <- function(img_padded, Hin, Win, padh, padw) {
+  num_channels <- nrow(img_padded)
+  keep_rows <- (padh+1):(padh+Hin)
+  keep_cols <- (padw+1):(padw+Win)
+  unpadded <- matrix(0, num_channels, Hin*Win, byrow=TRUE)
+  for (ch in 1:num_channels) {
+    padded_plane <- matrix(img_padded[ch,], (Hin+2*padh), (Win+2*padw), byrow=TRUE)
+    inner_plane <- padded_plane[keep_rows, keep_cols]
+    # transpose before flattening so the values come out row by row
+    unpadded[ch,] <- matrix(t(inner_plane), 1, Hin*Win)
+  }
+  unpadded
+}
+
+# Reference computation of the convolution gradient with respect to the input.
+#   dout        (N, F*Hout*Wout) matrix, one row per example
+#   Hout, Wout  spatial extents used to reshape each row of dout
+#   W           filter matrix with F rows
+#   N, C        number of examples and channels
+#   Hin, Win    input height/width; Hf, Wf filter height/width
+#   strideh, stridew, padh, padw  strides and paddings per dimension
+# Returns dX as an (N, C*Hin*Win) matrix.
+conv2d_backward_data <- function(dout, Hout, Wout,
+                    W, N, C, Hin, Win, Hf, Wf,
+                    strideh, stridew, padh, padw) {
+
+  F = nrow(W)
+  
+  # Create gradient volumes
+  dX = matrix(0, N, C*Hin*Win, byrow=TRUE)
+  
+  # Partial derivatives for convolution - im2col implementation
+  for (n in 1:N) {  # all examples
+    doutn = matrix(dout[n,], F, Hout*Wout, byrow=TRUE)
+
+    # Compute dX
+    dXn_padded_cols = t(W) %*% doutn  # shape (C*Hf*Wf, Hout*Wout)
+    # Overlapping patches are summed ("add") on the padded image, then the border is cut off.
+    dXn_padded = col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf, 
strideh, stridew, "add")
+    dXn = unpad_image(dXn_padded, Hin, Win, padh, padw)
+    dX[n,] = matrix(t(dXn), 1, C*Hin*Win)  # reshape
+  }
+  
+  dX
+}
+
+# Compute the reference gradient; the same stride, pad, image and filter size
+# are used for both spatial dimensions.
+dx <- conv2d_backward_data(dout, P, Q, w, numImg, numChannels,
+                           imgSize, imgSize, filterSize, filterSize,
+                           stride, stride, pad, pad)
+
+# Write the result as file "B" under the directory given as the 10th argument.
+writeMM(as(dx, "CsparseMatrix"), paste0(args[10], "B"))
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.dml 
b/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.dml
new file mode 100644
index 0000000..78b2dee
--- /dev/null
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.dml
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Positional arguments: $1-$7 describe the image/filter configuration,
+# $8/$9 are the output extents P and Q, and $10 is the output file.
+imgSize=$1
+numImg=$2
+numChannels=$3
+numFilters=$4
+filterSize=$5
+stride=$6
+pad=$7
+
+P = $8
+Q = $9
+
+# Assumption: NCHW image format
+# Inputs are filled with consecutive integers so that results are deterministic.
+w=matrix(seq(1, numFilters*numChannels*filterSize*filterSize), 
rows=numFilters, cols=numChannels*filterSize*filterSize)
+dout=matrix(seq(1, numImg*numFilters*P*Q), rows=numImg, cols=numFilters*P*Q)
+# The same stride, pad, image size and filter size are used for both spatial dimensions.
+dx = conv2d_backward_data(w, dout, stride=[stride, stride], padding=[pad, 
pad], input_shape=[numImg, numChannels, imgSize, imgSize], 
filter_shape=[numFilters, numChannels, filterSize, filterSize])
+write(dx, $10, format="text")
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardTest.R 
b/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
new file mode 100644
index 0000000..91e0065
--- /dev/null
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
@@ -0,0 +1,107 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Script setup: load Matrix (needed for writeMM) and read the nine integer
+# arguments in the order the JUnit driver passes them.
+library("Matrix")
+args <- commandArgs(TRUE)
+imgSize     <- as.integer(args[1])
+numImg      <- as.integer(args[2])
+numChannels <- as.integer(args[3])
+numFilters  <- as.integer(args[4])
+filterSize  <- as.integer(args[5])
+stride      <- as.integer(args[6])
+pad         <- as.integer(args[7])
+P           <- as.integer(args[8])
+Q           <- as.integer(args[9])
+
+# Assumption: NCHW image format
+# Deterministic inputs: consecutive integers laid out row by row.
+x <- matrix(seq(1, numImg*numChannels*imgSize*imgSize),
+            nrow=numImg, ncol=numChannels*imgSize*imgSize, byrow=TRUE)
+dout <- matrix(seq(1, numImg*numFilters*P*Q),
+               nrow=numImg, ncol=numFilters*P*Q, byrow=TRUE)
+
+
+# Surrounds every channel of img with padh rows and padw columns of zeros.
+# Each row of img is one channel flattened row by row (Hin x Win); the result
+# has one row per channel of length (Hin+2*padh)*(Win+2*padw).
+pad_image <- function(img, Hin, Win, padh, padw){
+  num_channels <- nrow(img)
+  Hpad <- Hin+2*padh
+  Wpad <- Win+2*padw
+  padded <- matrix(0, num_channels, Hpad*Wpad)  # zeros
+  for (ch in 1:num_channels) {
+    canvas <- matrix(0, Hpad, Wpad)
+    # drop the channel, reshaped to Hin x Win, into the centre of the canvas
+    canvas[(padh+1):(padh+Hin), (padw+1):(padw+Win)] <- matrix(img[ch,], Hin, Win, byrow=TRUE)
+    padded[ch,] <- matrix(t(canvas), 1, Hpad*Wpad)  # flatten row by row
+  }
+  padded
+}
+
+# Rearranges image patches into columns.
+#   img        (C, Hin*Win) matrix, each channel flattened row by row
+#   Hin, Win   image height/width; Hf, Wf patch height/width
+#   strideh, stridew  vertical/horizontal stride
+# Returns a (C*Hf*Wf, Hout*Wout) matrix whose column (hout-1)*Wout + wout
+# holds the patch for output position (hout, wout).
+im2col <- function(img, Hin, Win, Hf, Wf, strideh, stridew) {
+  C = nrow(img)
+  Hout = as.integer((Hin - Hf) / strideh + 1)
+  Wout = as.integer((Win - Wf) / stridew + 1)
+
+  img_cols = matrix(0, C*Hf*Wf, Hout*Wout)  # zeros
+  for (hout in 1:Hout) {  # all output rows
+    hin = (hout-1) * strideh + 1
+    for (wout in 1:Wout) {  # all output columns
+      win = (wout-1) * stridew + 1
+      # Extract a local patch of the input image corresponding spatially to 
the filter sizes.
+      img_patch = matrix(0, C, Hf*Wf)  # zeros
+      for (c in 1:C) {  # all channels
+        img_slice = matrix(img[c,], Hin, Win, byrow=TRUE)  # reshape
+        img_patch[c,] = matrix(t(img_slice[hin:(hin+Hf-1), win:(win+Wf-1)]), 
1, Hf*Wf)
+      }
+      img_cols[,(hout-1)*Wout + wout] = matrix(t(img_patch), C*Hf*Wf, 1)  # 
reshape
+    }
+  }
+  img_cols
+}
+
+# Reference computation of the convolution gradient with respect to the filter.
+#   dout        (N, F*Hout*Wout) matrix, one row per example
+#   Hout, Wout  spatial extents used to reshape each row of dout
+#   X           (N, C*Hin*Win) input matrix, one row per example
+#   N, K, C     number of examples, filters and channels
+#   Hin, Win    input height/width; Hf, Wf filter height/width
+#   strideh, stridew, padh, padw  strides and paddings per dimension
+# Returns dW as an (F, C*Hf*Wf) matrix, summed over all examples.
+conv2d_backward_filter <- function(dout, Hout, Wout,
+                    X, N, K, C, Hin, Win, Hf, Wf,
+                    strideh, stridew, padh, padw) {
+  
+  F = K
+  
+  # Create gradient volumes
+  # (this initial value is replaced by the reduction at the end)
+  dW = matrix(0, F, C*Hf*Wf, byrow=TRUE)
+  
+  # Create a convenience gradient volume for dW that will allow
+  # for one gradient to be stored per example, allowing for parallel
+  # computation at the expense of memory.  We will reduce at the end.
+  dWN = matrix(0, N, F*C*Hf*Wf, byrow=TRUE)
+
+  # Partial derivatives for convolution - im2col implementation
+  for (n in 1:N) {  # all examples
+    doutn = matrix(dout[n,], F, Hout*Wout, byrow=TRUE)
+
+    # Compute dW
+    Xn = matrix(X[n,], C, Hin*Win, byrow=TRUE)  # reshape
+    Xn_padded = pad_image(Xn, Hin, Win, padh, padw)  # shape (C, 
(Hin+2*padh)*(Win+2*padw))
+    Xn_padded_cols = im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, 
strideh, stridew)
+    # Direct accumulation variant, kept for reference:
+    #dW = dW + doutn %*% t(Xn_padded_cols)
+    dWN[n,] = matrix(t(doutn %*% t(Xn_padded_cols)), 1, F*C*Hf*Wf)
+  }
+
+  # Reduce the convenience gradient volume with one gradient per example
+  # into a single gradient for W.
+  dW = matrix(colSums(dWN), F, C*Hf*Wf, byrow=TRUE)
+  dW
+}
+
+# Compute the reference filter gradient; the same stride, pad, image and
+# filter size are used for both spatial dimensions.
+dw <- conv2d_backward_filter(dout, P, Q, x, numImg, numFilters, numChannels,
+                             imgSize, imgSize, filterSize, filterSize,
+                             stride, stride, pad, pad)
+
+
+# Write the result as file "B" under the directory given as the 10th argument.
+writeMM(as(dw, "CsparseMatrix"), paste0(args[10], "B"))
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml 
b/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
new file mode 100644
index 0000000..155c77b
--- /dev/null
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Positional arguments: $1-$7 describe the image/filter configuration,
+# $8/$9 are the output extents P and Q, and $10 is the output file.
+imgSize=$1
+numImg=$2
+numChannels=$3
+numFilters=$4
+filterSize=$5
+stride=$6
+pad=$7
+
+P = $8
+Q = $9
+
+# Assumption: NCHW image format
+# Inputs are filled with consecutive integers so that results are deterministic.
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), rows=numImg, 
cols=numChannels*imgSize*imgSize)
+dout=matrix(seq(1, numImg*numFilters*P*Q), rows=numImg, cols=numFilters*P*Q)
+# The same stride, pad, image size and filter size are used for both spatial dimensions.
+dw = conv2d_backward_filter(x, dout, stride=[stride, stride], padding=[pad, 
pad], input_shape=[numImg, numChannels, imgSize, imgSize], 
filter_shape=[numFilters, numChannels, filterSize, filterSize])
+write(dw, $10, format="text")
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/Conv2DTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DTest.R 
b/src/test/scripts/functions/tensor/Conv2DTest.R
new file mode 100644
index 0000000..fe34c8f
--- /dev/null
+++ b/src/test/scripts/functions/tensor/Conv2DTest.R
@@ -0,0 +1,98 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Script setup: load Matrix (needed for writeMM) and read the seven integer
+# arguments in the order the JUnit driver passes them.
+library("Matrix")
+args <- commandArgs(TRUE)
+imgSize     <- as.integer(args[1])
+numImg      <- as.integer(args[2])
+numChannels <- as.integer(args[3])
+numFilters  <- as.integer(args[4])
+filterSize  <- as.integer(args[5])
+stride      <- as.integer(args[6])
+pad         <- as.integer(args[7])
+
+# Assumption: NCHW image format
+# Deterministic inputs: consecutive integers laid out row by row.
+x <- matrix(seq(1, numImg*numChannels*imgSize*imgSize),
+            nrow=numImg, ncol=numChannels*imgSize*imgSize, byrow=TRUE)
+w <- matrix(seq(1, numFilters*numChannels*filterSize*filterSize),
+            nrow=numFilters, ncol=numChannels*filterSize*filterSize, byrow=TRUE)
+
+# Zero-pads each channel of img by padh rows (top and bottom) and padw columns
+# (left and right). Channels are stored one per row, flattened row by row; the
+# returned matrix keeps that layout with (Hin+2*padh)*(Win+2*padw) columns.
+pad_image <- function(img, Hin, Win, padh, padw){
+  n_channels <- nrow(img)
+  padded_len <- (Hin+2*padh)*(Win+2*padw)
+  row_range <- (padh+1):(padh+Hin)
+  col_range <- (padw+1):(padw+Win)
+  result <- matrix(0, n_channels, padded_len, byrow=TRUE)  # zeros
+  for (k in 1:n_channels) {
+    plane <- matrix(img[k,], Hin, Win, byrow=TRUE)  # channel k as Hin x Win
+    framed <- matrix(0, Hin+2*padh, Win+2*padw)
+    framed[row_range, col_range] <- plane
+    result[k,] <- matrix(t(framed), 1, padded_len)  # flatten row by row
+  }
+  result
+}
+
+# Rearranges image patches into columns.
+#   img        (C, Hin*Win) matrix, each channel flattened row by row
+#   Hin, Win   image height/width; Hf, Wf patch height/width
+#   strideh, stridew  vertical/horizontal stride
+# Returns a (C*Hf*Wf, Hout*Wout) matrix whose column (hout-1)*Wout + wout
+# holds the patch for output position (hout, wout).
+im2col <- function(img, Hin, Win, Hf, Wf, strideh, stridew) {
+  C = nrow(img)
+  Hout = as.integer((Hin - Hf) / strideh + 1)
+  Wout = as.integer((Win - Wf) / stridew + 1)
+
+  img_cols = matrix(0, C*Hf*Wf, Hout*Wout, byrow=TRUE)  # zeros
+  for (hout in 1:Hout) {  # all output rows
+    hin = (hout-1) * strideh + 1
+    for (wout in 1:Wout) {  # all output columns
+      win = (wout-1) * stridew + 1
+      # Extract a local patch of the input image corresponding spatially to 
the filter sizes.
+      img_patch = matrix(0, C, Hf*Wf, byrow=TRUE)  # zeros
+      for (c in 1:C) {  # all channels
+        img_slice = matrix(img[c,], Hin, Win, byrow=TRUE)  # reshape
+        img_patch[c,] = matrix(t(img_slice[hin:(hin+Hf-1), win:(win+Wf-1)]), 
1, Hf*Wf)
+      }
+      img_cols[,(hout-1)*Wout + wout] = matrix(t(img_patch), C*Hf*Wf, 1)  # 
reshape
+    }
+  }
+  img_cols
+}
+               
+# Reference 2D convolution built from pad_image and im2col.
+#   X           (N, C*Hin*Win) input matrix, one row per example
+#   W           (F, C*Hf*Wf) filter matrix, one row per filter
+#   C           number of channels
+#   Hin, Win    input height/width; Hf, Wf filter height/width
+#   strideh, stridew, padh, padw  strides and paddings per dimension
+# Returns an (N, F*Hout*Wout) matrix, one row per example.
+conv2d <- function(X, W, C, Hin, Win, Hf, Wf, strideh, stridew, padh, padw) {
+  N = nrow(X)
+  F = nrow(W)
+  # as.integer truncates, so strides that do not divide evenly round the extent down
+  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
+  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  
+  # Create output volume
+  out = matrix(0, N, F*Hout*Wout, byrow=TRUE)
+
+  # Convolution - im2col implementation
+  for (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], C, Hin*Win, byrow=TRUE)  # reshape
+
+    # Pad image
+    Xn_padded = pad_image(Xn, Hin, Win, padh, padw)  # shape (C, 
(Hin+2*padh)*(Win+2*padw))
+
+    # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, 
Hout*Wout)
+    Xn_padded_cols = im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, 
strideh, stridew)
+
+    # Convolve patches with filters
+    outn = W %*% Xn_padded_cols   # shape (F, Hout*Wout)
+    out[n,] = matrix(t(outn), 1, F*Hout*Wout)  # reshape
+  }
+  
+  out
+}
+
+# Compute the reference convolution; the same stride, pad, image and filter
+# size are used for both spatial dimensions.
+output <- conv2d(x, w, numChannels,
+                 imgSize, imgSize, filterSize, filterSize,
+                 stride, stride, pad, pad)
+
+# Write the result as file "B" under the directory given as the 8th argument.
+writeMM(as(output, "CsparseMatrix"), paste0(args[8], "B"))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/Conv2DTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DTest.dml 
b/src/test/scripts/functions/tensor/Conv2DTest.dml
new file mode 100644
index 0000000..a03bcdd
--- /dev/null
+++ b/src/test/scripts/functions/tensor/Conv2DTest.dml
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Positional arguments: $1-$7 describe the image/filter configuration and
+# $8 is the output file.
+imgSize=$1
+numImg=$2
+numChannels=$3
+numFilters=$4
+filterSize=$5
+stride=$6
+pad=$7
+
+# Assumption: NCHW image format
+# Inputs are filled with consecutive integers so that results are deterministic.
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), rows=numImg, 
cols=numChannels*imgSize*imgSize)
+w=matrix(seq(1, numFilters*numChannels*filterSize*filterSize), 
rows=numFilters, cols=numChannels*filterSize*filterSize)
+# The same stride, pad, image size and filter size are used for both spatial dimensions.
+output = conv2d(x, w, padding=[pad, pad], stride=[stride, stride], 
input_shape=[numImg, numChannels, imgSize, imgSize], filter_shape=[numFilters, 
numChannels, filterSize, filterSize])
+
+write(output, $8, format="text")
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/PoolBackwardTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolBackwardTest.R 
b/src/test/scripts/functions/tensor/PoolBackwardTest.R
new file mode 100644
index 0000000..8cb8a7c
--- /dev/null
+++ b/src/test/scripts/functions/tensor/PoolBackwardTest.R
@@ -0,0 +1,76 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+args <- commandArgs(TRUE)  # args[1..9] are parsed as integers below; args[10] is used for the output path at the end of the script
+library("Matrix")
+library("matrixStats") 
+imgSize=as.integer(args[1])  # input height and width (later passed as both Hin and Win)
+numImg=as.integer(args[2])  # number of images (rows of x and dout)
+numChannels=as.integer(args[3])  # channels per image
+poolSize1=as.integer(args[4])  # pooling window height (later passed as Hf)
+poolSize2=as.integer(args[5])  # pooling window width (later passed as Wf)
+stride=as.integer(args[6])  # stride, later passed for both rows and columns
+pad=as.integer(args[7])  # NOTE(review): parsed but not used anywhere else in this script
+P=as.integer(args[8])  # output height (later passed as Hout)
+Q=as.integer(args[9])  # output width (later passed as Wout)
+
+# Assumption: NCHW image format. x and dout hold the sequences 1..n, filled row by row (byrow=TRUE).
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, 
numChannels*imgSize*imgSize, byrow=TRUE)
+dout=matrix(seq(1, numImg*numChannels*P*Q), numImg, numChannels*P*Q, 
byrow=TRUE)
+
+max_pool_backward <- function(dout, Hout, Wout, X, C,  # gradient of max pooling with respect to X, given the upstream gradient dout
+                    Hin, Win, Hf, Wf, strideh, stridew)  # X is N x (C*Hin*Win); dout is indexed as N x (C*Hout*Wout); the window is Hf x Wf
+     {  # returns dX, laid out as N x (C*Hin*Win) like X; there is no padding parameter
+  N = nrow(X)  # number of examples
+  
+  # Create gradient volume (all zeros; window contributions are accumulated into it below)
+  dX = matrix(0, N, C*Hin*Win, byrow=TRUE)
+  
+  # Gradient of max pooling
+  for (n in 1:N) {  # all examples
+    img = matrix(X[n,], C, Hin*Win, byrow=TRUE)  # one row per channel
+    dimg = matrix(0, C, Hin*Win, byrow=TRUE)  # per-channel gradient rows for this example
+    for (c in 1:C) {  # all channels
+      img_slice = matrix(img[c,], Hin, Win, byrow=TRUE)  # this channel as an Hin x Win matrix
+      dimg_slice = matrix(0, Hin, Win, byrow=TRUE)  # gradient accumulator for this channel
+      for (hout in 1:Hout) {  # all output rows
+        hin = (hout-1) * strideh + 1  # first input row of the window (1-based)
+        for (wout in 1:Wout) {  # all output columns
+          win = (wout-1) * stridew + 1  # first input column of the window (1-based)
+          img_slice_patch = img_slice[hin:(hin+Hf-1), win:(win+Wf-1)]
+          max_val = max(img_slice_patch)
+          max_val_ind = (img_slice_patch == max_val)  # max value indicator
+          # gradient passes through only for the max value in this patch; every cell equal to max_val receives the upstream value
+          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + 
(hout-1)*Wout + wout]
+          dimg_slice[hin:(hin+Hf-1), win:(win+Wf-1)] =
+            dimg_slice[hin:(hin+Hf-1), win:(win+Wf-1)] + dimg_slice_patch  # added, not assigned, so overlapping windows sum up
+        }
+      }
+      dimg[c,] = matrix(t(dimg_slice), 1, Hin*Win)  # flatten row by row (t() because matrix() fills column-wise by default)
+    }
+    dX[n,] = matrix(t(dimg), 1, C*Hin*Win)  # flatten channel by channel into row n
+  }
+  
+  dX  # value of the function
+}
+
+output = max_pool_backward(dout, P, Q, x, numChannels, imgSize, imgSize, 
poolSize1, poolSize2, stride, stride)  # square input, same stride for rows and columns; pad is not passed
+writeMM(as(output,"CsparseMatrix"), paste(args[10], "B", sep=""))  # written to the path args[10] with "B" appended
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/PoolBackwardTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolBackwardTest.dml 
b/src/test/scripts/functions/tensor/PoolBackwardTest.dml
new file mode 100644
index 0000000..0ee80df
--- /dev/null
+++ b/src/test/scripts/functions/tensor/PoolBackwardTest.dml
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# 
+#-------------------------------------------------------------
+imgSize=$1  # image height and width (the same value fills both spatial entries of input_shape)
+numImg=$2  # number of images; x and dout have one row per image
+numChannels=$3  # channels per image
+poolSize1=$4  # first entry of pool_size
+poolSize2=$5  # second entry of pool_size
+stride=$6  # stride, used for both entries of stride=[...]
+pad=$7  # padding, used for both entries of padding=[...]
+poolMode=$8  # only "max" produces a result below
+
+P = $9  # with Q, fixes the column count of dout: numChannels*P*Q
+Q = $10
+
+# Assumption: NCHW image format. x and dout are filled with the sequences 1..n.
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), rows=numImg, 
cols=numChannels*imgSize*imgSize)
+dout=matrix(seq(1, numImg*numChannels*P*Q), rows=numImg, cols=numChannels*P*Q)
+if(poolMode == "max") {
+       output = max_pool_backward(x, dout, stride=[stride, stride], 
padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], 
pool_size=[poolSize1, poolSize2])
+}
+else {
+       # Not supported yet. NOTE(review): output is never assigned on this path, yet it is written below.
+       # output = avg_pool_backward(x, dout, stride=[stride, stride], 
padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], 
pool_size=[poolSize1, poolSize2])
+}
+write(output, $11, format="text")  # the result is written in text format to the target given as $11
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/PoolTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolTest.R 
b/src/test/scripts/functions/tensor/PoolTest.R
new file mode 100644
index 0000000..3731807
--- /dev/null
+++ b/src/test/scripts/functions/tensor/PoolTest.R
@@ -0,0 +1,98 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+args <- commandArgs(TRUE)  # args[1..7] are parsed as integers below; args[8] is used for the output path at the end of the script
+library("Matrix")
+library("matrixStats") 
+imgSize=as.integer(args[1])  # input height and width (later passed as both Hin and Win)
+numImg=as.integer(args[2])  # number of images (rows of x)
+numChannels=as.integer(args[3])  # channels per image
+poolSize1=as.integer(args[4])  # pooling window height (later passed as Hf)
+poolSize2=as.integer(args[5])  # pooling window width (later passed as Wf)
+stride=as.integer(args[6])  # stride, later passed for both rows and columns
+pad=as.integer(args[7])  # NOTE(review): parsed but not used anywhere else in this script
+
+# Assumption: NCHW image format. x holds the sequence 1..n, filled row by row (byrow=TRUE).
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, 
numChannels*imgSize*imgSize, byrow=TRUE)
+
+pad_image <- function(img, Hin, Win, padh, padw){  # zero-pads every channel row of img by padh rows and padw columns on each side; NOTE(review): never called in this script
+  C = nrow(img)  # one row of img per channel
+  img_padded = matrix(0, C, (Hin+2*padh)*(Win+2*padw))  # zeros
+  for (c in 1:C) {
+    img_slice = matrix(img[c,], Hin, Win, byrow=TRUE)  # depth slice C reshaped
+    img_padded_slice = matrix(0, Hin+2*padh, Win+2*padw)  # zero canvas of the padded size
+    img_padded_slice[(padh+1):(padh+Hin), (padw+1):(padw+Win)] = img_slice  # original slice placed in the centre
+    img_padded[c,] = matrix(t(img_padded_slice), 1, (Hin+2*padh)*(Win+2*padw)) 
 # reshape back to a single row, row by row
+  }
+  img_padded  # value of the function: C x ((Hin+2*padh)*(Win+2*padw))
+}
+
+im2col <- function(img, Hin, Win, Hf, Wf, strideh, stridew) {  # turns each Hf x Wf window of img (one row per channel) into one column of the result
+  C = nrow(img)  # number of channels
+  Hout = as.integer((Hin - Hf) / strideh + 1)  # window positions along the height; as.integer drops any fractional part
+  Wout = as.integer((Win - Wf) / stridew + 1)  # window positions along the width
+
+  img_cols = matrix(0, C*Hf*Wf, Hout*Wout, byrow=TRUE)  # zeros
+  for (hout in 1:Hout) {  # all output rows
+    hin = (hout-1) * strideh + 1  # first input row of the window (1-based)
+    for (wout in 1:Wout) {  # all output columns
+      win = (wout-1) * stridew + 1  # first input column of the window (1-based)
+      # Extract a local patch of the input image corresponding spatially to 
the filter sizes.
+      img_patch = matrix(0, C, Hf*Wf, byrow=TRUE)  # zeros
+      for (c in 1:C) {  # all channels
+        img_slice = matrix(img[c,], Hin, Win, byrow=TRUE)  # reshape
+        img_patch[c,] = matrix(t(img_slice[hin:(hin+Hf-1), win:(win+Wf-1)]), 
1, Hf*Wf)
+      }
+      img_cols[,(hout-1)*Wout + wout] = matrix(t(img_patch), C*Hf*Wf, 1)  # 
reshape
+    }
+  }
+  img_cols  # value of the function: (C*Hf*Wf) x (Hout*Wout)
+}
+
+max_pool <- function(X, N, C, Hin, Win, Hf, Wf,  # max pooling over Hf x Wf windows; X is N x (C*Hin*Win); there is no padding parameter
+                   strideh, stridew) {  # returns an N x (C*Hout*Wout) matrix
+  Hout = as.integer((Hin - Hf) / strideh + 1)  # window positions along the height; as.integer drops any fractional part
+  Wout = as.integer((Win - Wf) / stridew + 1)  # window positions along the width
+
+  # Create output volume
+  out = matrix(0, N, C*Hout*Wout, byrow=TRUE)
+
+  # Max pooling - im2col implementation
+  for (n in 1:N) {  # all examples
+    img = matrix(X[n,], C, Hin*Win, byrow=TRUE)  # reshape
+    img_maxes = matrix(0, C, Hout*Wout, byrow=TRUE)  # zeros
+
+    for (c in 1:C) {  # all channels
+      # Extract local image slice patches into columns with im2col, of shape 
(Hf*Wf, Hout*Wout)
+      img_slice_cols = im2col(matrix(t(img[c,]), 1, Hin*Win) , Hin, Win, Hf, 
Wf, strideh, stridew)
+
+      # Max pooling on patches
+      img_maxes[c,] = colMaxs(img_slice_cols)  # one maximum per column, i.e. per window position
+    }
+
+    out[n,] = matrix(t(img_maxes), 1, C*Hout*Wout)  # flatten channel by channel into row n
+  }
+  
+  out  # value of the function
+}
+
+output = max_pool(x, numImg, numChannels, imgSize, imgSize, poolSize1, 
poolSize2, stride, stride)  # square input, same stride for rows and columns; pad is not passed
+
+writeMM(as(output,"CsparseMatrix"), paste(args[8], "B", sep=""))  # written to the path args[8] with "B" appended
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c334c2c8/src/test/scripts/functions/tensor/PoolTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolTest.dml 
b/src/test/scripts/functions/tensor/PoolTest.dml
new file mode 100644
index 0000000..e163e89
--- /dev/null
+++ b/src/test/scripts/functions/tensor/PoolTest.dml
@@ -0,0 +1,38 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# 
+#-------------------------------------------------------------
+imgSize=$1  # image height and width (the same value fills both spatial entries of input_shape)
+numImg=$2  # number of images; x has one row per image
+numChannels=$3  # channels per image
+poolSize1=$4  # first entry of pool_size
+poolSize2=$5  # second entry of pool_size
+stride=$6  # stride, used for both entries of stride=[...]
+pad=$7  # padding, used for both entries of padding=[...]
+poolMode=$8  # only "max" produces a result below
+
+# Assumption: NCHW image format. x is filled with the sequence 1..n.
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), rows=numImg, 
cols=numChannels*imgSize*imgSize)
+if(poolMode == "max") {
+       output = max_pool(x, stride=[stride, stride], padding=[pad, pad], 
input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, 
poolSize2])
+}  # NOTE(review): output is assigned only when poolMode is "max"; the write below uses it unconditionally
+#else {
+       #output = avg_pool(x, stride=[stride, stride], padding=[pad, pad], 
input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, 
poolSize2])
+#}
+write(output, $9, format="text")  # the result is written in text format to the target given as $9
\ No newline at end of file

[1/2] incubator-systemml git commit: [SYSTEMML-540] Initial implementation of conv2d/pooling builtin function

Reply via email to