http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
index e99c2ef..f81e929 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -18,26 +18,13 @@
  */
 package org.apache.sysml.runtime.matrix.data;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.BitSet;
-import java.util.concurrent.Callable;
 
-import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardDataHelper.*;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardFilterHelper.*;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dHelper.*;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNNPoolingBackwardHelper.*;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNNPoolingHelper.*;
-import org.apache.sysml.runtime.instructions.InstructionUtils;
-import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.NativeHelper;
-import org.apache.sysml.utils.Statistics;
 
 
-public class LibMatrixDNNHelper {
-       
+public class LibMatrixDNNHelper 
+{
        protected static class CellIndex3 {
                public int ix1;
                public int ix2;
@@ -48,243 +35,6 @@ public class LibMatrixDNNHelper {
                }
        }
        
-       // *********************************** low-level runtime operator selection ***********************************************
-       // *********************************** based on runtime properties (sparsity, native, etc) ********************************
-       // These methods help reduce branch miss predictions and instruction-cache misses.
-       // Also, they simplify the design of LibMatrixDNN and help in code-maintenance.
-       
-       /**
-        * Factory method that returns list of callable tasks for performing maxpooling operation
-        * 
-        * @param params convolution parameters
-        * @return list of callable tasks for performing maxpooling operation
-        * @throws DMLRuntimeException if error occurs
-        */
-       public static ArrayList<Callable<Long>> getMaxPoolingWorkers(ConvolutionParameters params) throws DMLRuntimeException {
-               ArrayList<Callable<Long>> ret = new ArrayList<>();
-               int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               int taskSize = (int)(Math.ceil((double)params.N / k));
-               for(int i = 0; i*taskSize < params.N; i++) {
-                       if(params.input1.isInSparseFormat())
-                               ret.add(new SparseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-                       else
-                               ret.add(new DenseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-               }
-               return ret;
-       }
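
For orientation, here is a minimal sketch of how a caller might drive the Callable workers produced by these factory methods; the helper method name, the ExecutorService setup, and the surrounding error handling are illustrative assumptions (the usual java.util.concurrent imports are implied), not the exact code in LibMatrixDNN:

// Illustrative driver (not the actual LibMatrixDNN code): run the workers returned
// by one of the factory methods above and aggregate their per-task results.
static long runWorkers(ArrayList<Callable<Long>> tasks, int numThreads)
        throws InterruptedException, ExecutionException {
    ExecutorService pool = Executors.newFixedThreadPool(numThreads);
    try {
        long ret = 0;
        for( Future<Long> f : pool.invokeAll(tasks) )
            ret += f.get(); // e.g., a partial non-zero count, depending on the worker
        return ret;
    }
    finally {
        pool.shutdown();
    }
}

Each worker covers the row range [i*taskSize, min((i+1)*taskSize, N)), so the tasks partition the N input rows without overlap.
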
-       
-       /**
-        * Factory method that returns list of callable tasks for performing maxpooling backward operation
-        * 
-        * @param params convolution parameters
-        * @param performReluBackward whether to perform ReLU backward
-        * @return list of callable tasks for performing maxpooling backward operation
-        * @throws DMLRuntimeException if error occurs
-        */
-       public static ArrayList<Callable<Long>> getMaxPoolingBackwardWorkers(ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
-               ArrayList<Callable<Long>> ret = new ArrayList<>();
-               int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               int taskSize = (int)(Math.ceil((double)params.N / k));
-               boolean sparse1 = params.input1.isInSparseFormat();
-               boolean sparse2 = params.input2.isInSparseFormat();
-               for(int i = 0; i*taskSize < params.N; i++) {
-                       if( !sparse1 && !sparse2 )
-                               ret.add(new PoolingBackwardDenseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
-                       else if( !sparse1 && sparse2 )
-                               ret.add(new PoolingBackwardDenseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
-                       else if( sparse1 && !sparse2 ) 
-                               ret.add(new PoolingBackwardSparseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
-                       else if( sparse1 && sparse2 )
-                               ret.add(new PoolingBackwardSparseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
-               }
-               return ret;
-       }
-       
-       /**
-        * Factory method that returns list of callable tasks for performing relu backward operation
-        * 
-        * @param params convolution parameters
-        * @return list of callable tasks for performing relu backward operation
-        * @throws DMLRuntimeException if error occurs
-        */
-       public static ArrayList<Callable<Long>> getReluBackwardWorkers(ConvolutionParameters params) throws DMLRuntimeException {
-               ArrayList<Callable<Long>> ret = new ArrayList<>();
-               int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               int taskSize = (int)(Math.ceil((double)params.N / k));
-               for(int i = 0; i*taskSize < params.N; i++) {
-                       ret.add(new ReluBackward(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-               }
-               return ret;
-       }
-       
-       /**
-        * Factory method that returns list of callable tasks for performing conv2d
-        * 
-        * @param params convolution parameters
-        * @return list of callable tasks for performing conv2d
-        * @throws DMLRuntimeException if error occurs
-        */
-       public static ArrayList<Callable<Long>> getConv2dWorkers(ConvolutionParameters params) throws DMLRuntimeException {
-               ArrayList<Callable<Long>> ret = new ArrayList<>();
-               
-               // Try to create twice as many tasks as threads for improved load balance
-               // (due to constant-sized intermediates, GC works well, so the overhead per task is small)
-               int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               int taskSize = (int)(Math.ceil((double)params.N / k / 2));
-               
-               // TODO: Decide here based on params whether to use LoopedIm2ColConv2dAllChannels or LoopedIm2ColConv2dOneChannel
-               // For now, let's stick to the existing approach of converting [1, CHW] to [CRS, PQ] as it allows matrix multiplication large enough matrix.
-               boolean allChannels = true; ArrayList<MatrixBlock> filters = null;
-               if(!allChannels)
-                       filters = splitFilter(params);
-               
-               MatrixBlock in1 = params.input1;
-               boolean isEmptyDenseInput = !in1.isInSparseFormat() && in1.denseBlock == null;
-               boolean isTransPref = in1.sparse && !params.input2.sparse && 
-                       MatrixBlock.evalSparseFormatInMemory(in1.clen, in1.rlen, in1.nonZeros);
-               boolean applyNative = LibMatrixDNN.isEligibleForConv2dSparse(params)
-                       && !(!isEmptyDenseInput && allChannels && isTransPref);
-               if( applyNative )
-                       Statistics.numNativeSparseConv2dCalls.increment();
-               
-               //transpose filter once for efficient sparse-dense multiplies in LoopedIm2ColConv2dTransAllChan
-               //in order to share the temporary object and its creation costs across threads
-               if( !applyNative && !isEmptyDenseInput && allChannels && isTransPref ) {
-                       params.input2 = LibMatrixReorg.transpose(params.input2, 
-                               new MatrixBlock(params.input2.clen, params.input2.rlen, false), k);
-               }
-               
-               for(int i = 0; i*taskSize < params.N; i++) {
-                       //note: we prefer the java backend for sparse inputs because the native 
-                       //implementation simply converts the sparse input into dense rows
-                       if( applyNative ) 
-                               ret.add(new SparseNativeConv2d(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-                       else if(!isEmptyDenseInput && allChannels && isTransPref)
-                               ret.add(new LoopedIm2ColConv2dTransAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-                       else if(!isEmptyDenseInput && allChannels)
-                               ret.add(new LoopedIm2ColConv2dAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-                       else if(!isEmptyDenseInput && !allChannels)
-                               ret.add(new LoopedIm2ColConv2dOneChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params, filters));
-                       else
-                               throw new DMLRuntimeException("Unsupported operator");
-               }
-               return ret;
-       }
-       
-       /**
-        * Factory method that returns list of callable tasks for performing conv2d backward filter
-        * 
-        * @param params convolution parameters
-        * @return list of callable tasks for performing conv2d backward filter
-        * @throws DMLRuntimeException if error occurs
-        */
-       public static ArrayList<Callable<Long>> getConv2dBackwardFilterWorkers(ConvolutionParameters params) throws DMLRuntimeException {
-               ArrayList<Callable<Long>> ret = new ArrayList<>();
-               // Try to create as many tasks as threads. 
-               // Creating more tasks will help in tail, but would have additional overhead of maintaining the intermediate
-               // data structures such as im2col blocks.
-               int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               int taskSize = (int)(Math.ceil((double)params.N / k));
-               
-               boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || 
-                       (!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
-               boolean applyNative = LibMatrixDNN.isEligibleForConv2dBackwardFilterSparseDense(params)
-                       && !params.input2.isInSparseFormat();
-               if( applyNative )
-                       Statistics.numNativeSparseConv2dBwdFilterCalls.increment();
-               
-               for(int i = 0; i*taskSize < params.N; i++) {
-                       //note: we prefer the java backend for sparse filters because the native 
-                       //implementation simply rotates the sparse filters into dense rows
-                       if( applyNative ) 
-                               ret.add(new SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-                       else if( params.input2.sparse && params.input1.getSparsity() > params.input2.getSparsity() )
-                               ret.add(new Conv2dBackwardFilterTrans(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-                       else if(!isEmptyDenseInput)
-                               ret.add(new Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-                       else
-                               throw new DMLRuntimeException("Unsupported operator");
-               }
-               return ret;
-       }
-       
-       /**
-        * Factory method that returns list of callable tasks for performing conv2d backward data
-        * 
-        * @param params convolution parameters
-        * @return list of callable tasks for performing conv2d backward data
-        * @throws DMLRuntimeException if error occurs
-        */
-       public static ArrayList<Callable<Long>> getConv2dBackwardDataWorkers(ConvolutionParameters params) throws DMLRuntimeException {
-               ArrayList<Callable<Long>> ret = new ArrayList<>();
-               
-               // Try to create as many tasks as threads. 
-               // Creating more tasks will help in tail, but would have additional overhead of maintaining the intermediate
-               // data structures such as im2col blocks.
-               int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               int taskSize = (int)(Math.ceil((double)params.N / k));
-               
-               boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || 
-                       (!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
-               boolean applyNative = LibMatrixDNN.isEligibleForConv2dBackwardDataDense(params)
-                       && !params.input2.isInSparseFormat();
-               if( applyNative )
-                       Statistics.numNativeSparseConv2dBwdDataCalls.increment();
-               
-               for(int i = 0; i*taskSize < params.N; i++) {
-                       //note: we prefer the java backend for sparse filters because the native 
-                       //implementation simply converts the sparse filters into dense rows
-                       if( applyNative ) 
-                               ret.add(new SparseNativeConv2dBackwardDataDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-                       else if(!isEmptyDenseInput)
-                               ret.add(new Conv2dBackwardData(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
-                       else
-                               throw new DMLRuntimeException("Unsupported operator");
-               }
-                       
-               return ret;
-       }
-       
-       // *********************************** relu backward operator ******************************************************
-       
-       /**
-        * Performs the operation: (X gt 0) * dout
-        */
-       public static class ReluBackward implements Callable<Long> 
-       {
-               public int _rl; public int _ru; 
-               private final ConvolutionParameters _params; 
-               double [] outputArray; int numOutCols;
-               public ReluBackward(int rl, int ru, ConvolutionParameters params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-                       outputArray= params.output.getDenseBlockValues();
-                       numOutCols = params.input1.getNumColumns();
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       if(!_params.input1.isInSparseFormat() && !_params.input2.isInSparseFormat()) {
-                               double [] inputArr = _params.input1.getDenseBlockValues();
-                               double [] doutArr = _params.input2.getDenseBlockValues();
-                               for(int i = _rl*numOutCols; i < _ru*numOutCols; i++) {
-                                       outputArray[i] = inputArr[i] > 0 ? doutArr[i] : 0;
-                               }
-                       }
-                       else {
-                               // Perform (X > 0)
-                               ConvolutionUtils.scalarOperations(_params.input1, outputArray, _rl*numOutCols, numOutCols, _rl, _ru, 
-                                               InstructionUtils.parseScalarBinaryOperator(">", false, 0));
-                               // Then perform (X > 0) * dout
-                               ConvolutionUtils.binaryOperationInPlace(_params.input2, outputArray, _rl*numOutCols, numOutCols, _rl, _ru, 
-                                               LibMatrixDNN._binaryElementWiseMultiplication);
-                       }
-                       return 0L;
-               }
-       }
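
To make the (X gt 0) * dout semantics concrete, here is a tiny stand-alone illustration of the dense code path above; the array values are made up for illustration only:

// Dense relu-backward on a small row: out[i] = (x[i] > 0) ? dout[i] : 0
double[] x    = { -1.0, 0.0, 2.5, 3.0 };
double[] dout = {  0.4, 0.7, 0.1, 0.9 };
double[] out  = new double[x.length];
for( int i = 0; i < x.length; i++ )
    out[i] = x[i] > 0 ? dout[i] : 0;
// out == { 0.0, 0.0, 0.1, 0.9 }
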
-       
-       // *********************************** utility methods ******************************************************
-       
        protected static CellIndex3 computeTensorIndexes(int j, int H, int W) {
                return computeTensorIndexes(j, H, W, new CellIndex3());
        }
@@ -307,54 +57,8 @@ public class LibMatrixDNNHelper {
                return ret;
        }
        
-       //Split a filter of size [K, CRS] into c filters of [K, RS]
-       private static ArrayList<MatrixBlock> splitFilter(ConvolutionParameters _params) {
-               ArrayList<MatrixBlock> ret = new ArrayList<>();
-               int RS = _params.R*_params.S; int CRS = _params.C*_params.R*_params.S;
-               double [] filter = _params.input2.getDenseBlockValues(); int S = _params.S;
-               for(int c = 0; c < _params.C; c++) {
-                       MatrixBlock mb = new MatrixBlock(_params.K, RS, false);
-                       mb.allocateDenseBlock(); long nnz = 0;
-                       double [] outputArr = mb.getDenseBlockValues();
-                       if(filter != null) {
-                               for(int k = 0; k < _params.K; k++) {
-                                       int outOffset = k*RS;
-                                       int filterOffset = k*CRS + c*RS;
-                                       for(int rs = 0; rs < RS; rs++) {
-                                               int outIndex = outOffset + rs;
-                                               outputArr[outIndex] = filter[filterOffset + rs];
-                                               nnz += outputArr[outIndex] != 0 ? 1 : 0;
-                                       }
-                               }
-                       }
-                       else {
-                               SparseBlock sblock = _params.input2.sparseBlock;
-                               CellIndex3 ix = new CellIndex3();
-                               for(int k = 0; k < _params.K; k++) {
-                                       if( sblock.isEmpty(k) ) continue;
-                                       // Find maxIndex
-                                       int apos = sblock.pos(k);
-                                       int alen = sblock.size(k);
-                                       int[] aix = sblock.indexes(k);
-                                       double[] avals = sblock.values(k);
-                                       for(int j=apos; j<apos+alen; j++) {
-                                               ix = computeTensorIndexes(aix[j], _params.R, _params.S, ix);
-                                               if(c != ix.ix1)
-                                                       continue;
-                                               outputArr[k*RS + ix.ix2*S + ix.ix3] = avals[j];
-                                               nnz += outputArr[k*RS + ix.ix2*S + ix.ix3] != 0 ? 1 : 0;
-                                       }
-                               }
-                       }
-                       mb.setNonZeros(nnz);
-                       ret.add(mb);
-               }
-               return ret;
-       }
-       
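
The [K, CRS] to C x [K, RS] split performed by splitFilter can be seen in a minimal sketch of the dense-branch index arithmetic; the concrete sizes (K=2, C=2, R=S=2, so RS=4 and CRS=8) are assumed purely for illustration:

// Row k of the [K, CRS] filter stores its channel blocks back to back at k*CRS + c*RS + r*S + s,
// so per-channel filters are just contiguous slices of each row.
double[] filter = new double[2 * 8];           // dense [K, CRS] filter, K=2, CRS=8
double[][] perChannel = new double[2][2 * 4];  // C dense [K, RS] blocks, RS=4
for (int c = 0; c < 2; c++)
    for (int k = 0; k < 2; k++)
        System.arraycopy(filter, k*8 + c*4, perChannel[c], k*4, 4);
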
-       // Single-threaded matrix multiplication
-       static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, 
-                       boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException {
+       protected static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, 
+               boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException {
                if( !params.enableNative || m1.sparse || m2.sparse ) {
                        prepNonZerosForMatrixMult(m1, recomputeNNZM1);
                        prepNonZerosForMatrixMult(m2, recomputeNNZM2);
@@ -372,228 +76,6 @@ public class LibMatrixDNNHelper {
                ret.setNonZeros((long)ret.rlen*ret.clen);
        }
        
-       static void addBias(int r, double [] out, double [] bias, int K, int PQ) {
-               for(int k=0, cix=r*K*PQ; k<K; k++, cix+=PQ)
-                       LibMatrixMult.vectAddInPlace(bias[k], out, cix, PQ);
-       }
-       
-       /**
-        * Returns the index of cell with maximum value. This method is optimized for dense input
-        * 
-        * @param p output feature map height
-        * @param q output feature map width
-        * @param inputOffset offset to be used for input index
-        * @param inputArray input array
-        * @param params convolution parameters
-        * @param performReluBackward perform ReLU backward
-        * @return index of cell with maximum value
-        */
-       static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) {
-               int start_index_h = params.start_indexes_h[p];
-               int end_index_h = params.end_indexes_h[p];
-               int start_index_w = params.start_indexes_w[q];
-               int end_index_w = params.end_indexes_w[q];
-               
-               int maxIndex = -1; 
-               double maxVal = -Double.MAX_VALUE;
-               
-               // Note: We do not treat pad as zero and hence we don't do:  
-               // maxVal = 0 
-               // if start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W
-               
-               // Find maxIndex
-               double currDoutVal = -1;
-               for (int h = start_index_h; h < end_index_h; h++) {
-                       for (int w = start_index_w; w < end_index_w; w++) {
-                               currDoutVal = inputArray[inputOffset +  h*params.W + w];
-                               currDoutVal = performReluBackward && currDoutVal < 0 ? 0 : currDoutVal;
-                               if(maxVal < currDoutVal) {
-                                       maxIndex = inputOffset +  h*params.W + w;
-                                       maxVal = currDoutVal;
-                               }
-                       }
-               }
-               return maxIndex;
-       }
-       
-       /**
-        * Returns the index of cell with maximum value. This method is optimized for sparse input
-        * 
-        * @param p output feature map height
-        * @param q output feature map width
-        * @param inputOffset offset to be used for input index
-        * @param n number of images
-        * @param c number of channels 
-        * @param input input matrix
-        * @param params convolution parameters
-        * @param performReluBackward perform ReLU on input
-        * @return index of the cell with maximum value
-        * @throws DMLRuntimeException if error occurs
-        */
-       static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
-               int start_h = params.start_indexes_h[p];
-               int end_h = params.end_indexes_h[p];
-               int start_w = params.start_indexes_w[q];
-               int end_w = params.end_indexes_w[q];
-               int numVals = (end_h-start_h)*(end_w-start_w);
-               BitSet bmap = new BitSet(numVals);
-               
-               int maxIndex = -1; 
-               double maxVal = -Double.MAX_VALUE;
-               
-               // Note: We do not treat pad as zero and hence we don't do:  
-               // maxVal = 0 
-               // if start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W
-
-               // input.isEmptyBlock() check is done by the caller
-               CellIndex3 ix = new CellIndex3();
-               SparseBlock sblock = input.sparseBlock;
-               if( !sblock.isEmpty(n) ) {
-                       // Find maxIndex
-                       int apos = sblock.pos(n);
-                       int alen = sblock.size(n);
-                       int[] aix = sblock.indexes(n);
-                       double[] avals = sblock.values(n);
-                       for(int j=apos; j<apos+alen; j++) {
-                               ix = computeTensorIndexes(aix[j], params.H, params.W, ix);
-                               if(c != ix.ix1) continue;
-                               int h = ix.ix2;
-                               int w = ix.ix3;
-                               if(h >= start_h && h < end_h && w >= start_w && w < end_w) {
-                                       double val = performReluBackward && avals[j] < 0 ? 0 : avals[j]; 
-                                       if(maxVal < val) {
-                                               maxIndex = inputOffset +  h*params.W + w;
-                                               maxVal = val;
-                                       }
-                                       bmap.set((h-start_h)*(end_w-start_w) + (w-start_w));
-                               }
-                       }
-               }
-               
-               //correction for 0 maximum and subset of evaluated entries
-               int firstMiss = bmap.nextClearBit(0);
-               if( firstMiss < numVals && maxVal<0 ) {
-                       int lh = firstMiss/(end_w-start_w)+start_h;
-                       int lw = firstMiss%(end_w-start_w)+start_w;
-                       maxIndex = inputOffset + lh * params.W + lw;
-                       maxVal = 0;
-               }
-               return maxIndex;
-       }
-       
-       // Returns the row of matrix in dense format
-       static void getRowInDenseFormat(MatrixBlock input, int n, double []  ret) throws DMLRuntimeException {
-               if(input.getNumColumns() != ret.length) {
-                       throw new DMLRuntimeException("Invalid parameters");
-               }
-               // Use temporary array to avoid binary search
-               if(input.isInSparseFormat()) {
-                       Arrays.fill(ret, 0);
-                       if( !input.sparseBlock.isEmpty(n) ) {
-                               int apos = input.sparseBlock.pos(n);
-                               int alen = input.sparseBlock.size(n);
-                               int[] aix = input.sparseBlock.indexes(n);
-                               double[] avals = input.sparseBlock.values(n);
-                               for(int j=apos; j<apos+alen; j++)
-                                       ret[ aix[j] ] = avals[j];
-                       }
-               }
-               else {
-                       System.arraycopy(input.getDenseBlockValues(),
-                               n*input.getNumColumns(), ret, 0, input.getNumColumns());
-               }
-       }
-       
-       // ------------------------------------------------------------------------------------------------------
-       // Since col2im always operates on intermediate generated as part of matmult, it is not clear which operator to select apriori.
-       // Therefore, it is provided as utility function rather than an operator (like im2col or rotate180)
-       
-       //Converts input: PQ X CRS matrix and writes to 1 X CHW
-       static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
-               if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) {
-                       throw new DMLRuntimeException("Incorrect input dimensions");
-               }
-               
-               double [] outputArray = null;
-               if (!params.output.isInSparseFormat())
-                       outputArray = params.output.getDenseBlockValues();
-               else {
-                       throw new DMLRuntimeException("Only dense output is implemented");
-               }
-               
-               if(!input.isInSparseFormat()) {
-                       double [] inputArray = input.getDenseBlockValues();
-                       doCol2IMDenseInput(0, outputN, inputArray, outputArray, params);
-               }
-               else {
-                       if(!input.isEmptyBlock()) {
-                               int outOffset = outputN*params.C*params.H*params.W;
-                               int HW = params.H*params.W;
-                               CellIndex3 ix = new CellIndex3();
-                               SparseBlock sblock = input.sparseBlock;
-                               for(int i = 0; i < input.getNumRows(); i++) {
-                                       if( sblock.isEmpty(i) ) continue;
-                                       ix = computeTensorIndexes(i, params.P, params.Q, ix);
-                                       int tmpP = ix.ix2*params.stride_h - params.pad_h;
-                                       int tmpQ = ix.ix3*params.stride_w - params.pad_w;
-                                       if(ix.ix1 != 0) 
-                                               throw new DMLRuntimeException("Incorrect tensor indexes: "+ ix + ", " + params.P + " " + params.Q);
-                                       int apos = sblock.pos(i);
-                                       int alen = sblock.size(i);
-                                       int[] aix = sblock.indexes(i);
-                                       double[] avals = sblock.values(i);
-                                       for(int j = apos; j < apos+alen; j++) {
-                                               ix = computeTensorIndexes(aix[j], params.R, params.S, ix);
-                                               int h = tmpP + ix.ix2;
-                                               int w = tmpQ + ix.ix3;
-                                               if(h >= 0 && h < params.H && w >= 0 && w < params.W) {
-                                                       int outIndex = outOffset + ix.ix1*HW + h*params.W + w;
-                                                       outputArray[outIndex] += avals[j];
-                                               }
-                                       }
-                               }
-                       }
-               }
-       }
-       
-       // Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0
-       // Or converts input: NPQ X CRS matrix and writes to N X CHW 
-       private static void doCol2IMDenseInput(int inputN, int outputN, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
-               final int outputNOffset = outputN*params.C*params.H*params.W;
-               final int HW = params.H*params.W;
-               final int inputNPQ = inputN*params.P*params.Q;
-               final int CRS = params.C*params.R*params.S;
-               final int RS = params.R*params.S;
-               for (int p = 0; p < params.P; p++) {
-                       // h = p*params.stride_h + r - params.pad_h
-                       //   = r + hOffset
-                       // Based on restrictions: h >= 0 and r >= 0 and h < params.H and r < params.R, we get
-                       // max(0, - hOffset) <= r < min(params.R, params.H - hOffset)
-                       final int hOffset = p*params.stride_h - params.pad_h;
-                       final int rStart = Math.max(0, - hOffset);
-                       final int rEnd = Math.min(params.R, params.H - hOffset);
-                       for (int q = 0; q < params.Q; q++) {
-                               // Using the same logic as above on following:
-                               // w = q*params.stride_w + s - params.pad_w
-                               final int wOffset = q*params.stride_w - params.pad_w;
-                               final int sStart = Math.max(0, - wOffset);
-                               final int sEnd = Math.min(params.S, params.W - wOffset);
-                               final int tempOffset = (inputNPQ + p*params.Q + q)*CRS;
-                               for (int c = 0; c < params.C; c++) {
-                                       final int outOffset = outputNOffset + c*HW;
-                                       final int inputOffset = tempOffset + c*RS;
-                                       for (int r = rStart; r < rEnd; r++) {
-                                               for (int s = sStart; s < sEnd; s++) {
-                                                       int inputIndex = inputOffset + r*params.S + s;
-                                                       int outIndex = outOffset + (hOffset + r)*params.W + wOffset + s;
-                                                       outputArray[outIndex] += inputArray[inputIndex];
-                                               }
-                                       }
-                               }
-                       }
-               }
-       }
-       
        private static void prepNonZerosForMatrixMult(MatrixBlock mb, boolean update) {
                if( !update )
                        return;

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2Col.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2Col.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2Col.java
new file mode 100644
index 0000000..ac7a6a8
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2Col.java
@@ -0,0 +1,351 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNHelper.CellIndex3;
+
+/**
+ * This class contains the different implementations of the im2col operation
+ */
+public class LibMatrixDNNIm2Col {
+       private static final Log LOG = LogFactory.getLog(LibMatrixDNNIm2Col.class.getName());
+       static interface Im2colWorker {
+               public void execute(int n);
+               public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock out, ConvolutionParameters params, boolean trans) {
+                       if(!input.isInSparseFormat()) {
+                               boolean stride1Pad0 = params.stride_h == 1 && params.stride_w == 1 
+                                               && params.pad_h == 0 && params.pad_w == 0;
+                               // Note: Only dense im2col operators require the im2ColOutBlock to be allocated in the dense format.
+                               out.reset(out.rlen, out.clen, false);
+                               out.allocateDenseBlock();
+                               if( LOG.isTraceEnabled() ) 
+                                       LOG.trace("Using DenseIm2colWorkerAllChannels operator to perform im2col (stride1pad0="+stride1Pad0+").");
+                               if( stride1Pad0 && !trans )
+                                       return new DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlockValues(), out.getDenseBlockValues(), params);
+                               else
+                                       return new DenseIm2colWorkerAllChannels(input.getDenseBlockValues(), out.getDenseBlockValues(), params, trans);
+                       }
+                       else {
+                               if(LOG.isTraceEnabled()) 
+                                       LOG.trace("Using SparseIm2colWorker operator to perform im2col.");
+                               out.reset(out.rlen, out.clen, true);
+                               out.allocateSparseRowsBlock();
+                               //preallocate sparse-rows (larger than average sparsity to account for skew)
+                               int estnnz = (int)Math.ceil(4*input.getSparsity()*out.clen);
+                               for(int r = 0; r < out.rlen; r++)
+                                       out.getSparseBlock().allocate(r, Math.min(estnnz, out.clen));
+                               return new SparseSparseIm2colWorkerAllChan(input, out, params, trans);
+                       }
+               }
+       }
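
The factory above picks a dense or sparse im2col worker based on the input format. As a rough sketch of how a conv2d worker might use it for its row range, assuming hypothetical bounds rl/ru and a worker that declares the checked DMLRuntimeException (the allocation and the final output copy are simplified; this is not the exact LoopedIm2ColConv2d* code):

// Illustrative per-image loop inside a hypothetical conv2d worker.
MatrixBlock im2col = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
Im2colWorker worker = Im2colWorker.getWorker(params.input1, im2col, params, false);
MatrixBlock partialOut = new MatrixBlock(params.K, params.P*params.Q, false);
for( int n = rl; n < ru; n++ ) {
    worker.execute(n);                          // row n of [N, CHW] -> [CRS, PQ]
    LibMatrixDNNHelper.singleThreadedMatMult(params.input2, im2col,
        partialOut, false, true, params);       // filter [K, CRS] %*% im2col [CRS, PQ]
    // ... copy partialOut into row n of params.output ...
}
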
+       
+       /**
+        * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy
+        */
+       private static class DenseIm2colWorkerStride1Pad0AllChannels implements Im2colWorker {
+               private final double [] inputArray, outputArray; 
+               private final int CRS, S, R, P, Q, CHW, H, W;
+               public DenseIm2colWorkerStride1Pad0AllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
+                       this.inputArray = inputArray;
+                       this.outputArray = outputArray;
+                       this.CRS = params.C * params.R * params.S;
+                       this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
+                       this.CHW = params.C*params.H*params.W;
+               }
+               
+               @Override
+               public void execute(int n) {
+                       int nOffset = n * CHW;
+                       for (int c = 0; c < CRS; ++c) {
+                               int wOffset = c % S;
+                               int hOffset = (c / S) % R;
+                               int cInput = c / R / S;
+                               for (int h = 0; h < P; ++h) {
+                                       int hPadded = h + hOffset;
+                                       int outOffset = (c * P + h) * Q;
+                                       int inputOffset = nOffset + (cInput * H + hPadded) * W;
+                                       System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q);
+                                       int w = Q - 1;
+                                       int wPadded = w + wOffset;
+                                       boolean assign = (hPadded < H && wPadded < W);
+                                       outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
+                               }
+                       }
+               }
+       }
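
For readers unfamiliar with the layout, here is a tiny worked example of the dense im2col mapping produced by the workers in this file; the concrete sizes (C=1, H=W=3, R=S=2, stride=1, pad=0, hence P=Q=2) are assumed purely for illustration:

// Input row n, viewed as one 3x3 channel:          im2col output [CRS=4, PQ=4]:
//   a b c                                          row (r,s)=(0,0): a b d e
//   d e f          --- im2col --->                 row (r,s)=(0,1): b c e f
//   g h i                                          row (r,s)=(1,0): d e g h
//                                                  row (r,s)=(1,1): e f h i
// Output row index is c*RS + r*S + s and column index is p*Q + q, holding input value
// (h, w) = (p*stride_h - pad_h + r, q*stride_w - pad_w + s); conv2d then reduces to the
// matrix multiplication filter [K, CRS] %*% im2col [CRS, PQ].
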
+       
+       /**
+        * Performing dense im2col (general case)
+        */
+       private static class DenseIm2colWorkerAllChannels implements Im2colWorker {
+               private final double[] inputArray, outputArray; 
+               private final int CRS, S, R, P, Q, CHW, H, W; 
+               private final int stride_h, stride_w, pad_h, pad_w;
+               private final boolean trans;
+               public DenseIm2colWorkerAllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params, boolean trans) {
+                       this.inputArray = inputArray;
+                       this.outputArray = outputArray;
+                       this.CRS = params.C * params.R * params.S;
+                       this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
+                       this.CHW = params.C*params.H*params.W;
+                       this.stride_h = params.stride_h; this.stride_w = params.stride_w;
+                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
+                       this.trans = trans;
+               }
+               
+               @Override
+               public void execute(int n) {
+                       //reset for selective copy
+                       Arrays.fill(outputArray, 0);
+                       
+                       int nOffset = n * CHW;
+                       for (int c = 0; c < CRS; ++c) {
+                               int wOffset = c % S;
+                               int hOffset = (c / S) % R;
+                               int cInput = c / R / S;
+                               for (int h = 0; h < P; ++h) {
+                                       int outOffset = trans ? c+(h*Q*CRS) : (c*P+h)*Q;
+                                       int hPadded = h * stride_h - pad_h + hOffset;
+                                       int inputOffset = nOffset + (cInput * H + hPadded) * W;
+                                       if (hPadded < 0 || hPadded >= H ) continue;
+                                       for (int w = 0; w < Q; ++w) {
+                                               int wPadded = w * stride_w - pad_w + wOffset;
+                                               if( wPadded >= 0 && wPadded < W )
+                                                       outputArray[outOffset + (trans?w*CRS:w)] 
+                                                               = inputArray[inputOffset + wPadded];
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * Performing sparse im2col for all channels for a given row n of the input matrix.
+        */
+       private static class SparseSparseIm2colWorkerAllChan implements Im2colWorker {
+               private final MatrixBlock input, output;
+               private final int S, R, P, Q, W, HW, RS;
+               private final int stride_h, stride_w, pad_h, pad_w;
+               private final boolean trans;
+               private final boolean simple;
+               public SparseSparseIm2colWorkerAllChan(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean trans) {
+                       this.input = input;
+                       this.output = im2ColOutBlock;
+                       this.HW = params.H * params.W;
+                       this.RS = params.R * params.S;
+                       this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
+                       this.stride_h = params.stride_h; this.stride_w = params.stride_w;
+                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
+                       this.trans = trans;
+                       this.simple = params.isStride1Pad0() && W == S && Q == 1;
+                       if(!input.isInSparseFormat()) 
+                               throw new RuntimeException("Incorrect operator selection. Expected sparse input for SparseIm2colWorkerAllChannels");
+               }
+               
+               @Override
+               public void execute(int n) {
+                       output.reset();
+                       SparseBlock sblock = input.sparseBlock;
+                       if( sblock.isEmpty(n) )
+                               return;
+                       int apos = sblock.pos(n);
+                       int alen = sblock.size(n);
+                       int[] aix = sblock.indexes(n);
+                       double[] avals = sblock.values(n);
+                       
+                       // Iterate over the sparse block
+                       for(int j=apos; j<apos+alen; j++) {
+                               // Note: the input is of shape [N, CHW]
+                               int chw = aix[j];
+                               
+                               // Get individual zero-based c,h,w indexes from zero-based 'chw'
+                               int cInput = chw / HW;
+                               int hInput = (chw - cInput*HW)/W;
+                               int wInput = chw % W; 
+                               
+                               if( simple )
+                                       appendInputValueToIm2colOutputSimple(output, cInput, hInput, wInput, 
+                                               avals[j], R, S, RS, P, trans);
+                               else
+                                       appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], 
+                                               R, S, P, Q, stride_h, stride_w, pad_h, pad_w, trans);
+                       }
+                       
+                       output.sortSparseRows();
+               }
+       }
+       
+       /**
+        * Appends the value corresponding to the given [, cInput, hInput, wInput] to the appropriate im2col location of the output
+        * 
+        * @param output output matrix block
+        * @param cInput input channel index (zero-based)
+        * @param hInput input height index (zero-based)
+        * @param wInput input width index (zero-based)
+        * @param value input value
+        * @param R filter height
+        * @param S filter width
+        * @param P output height
+        * @param Q output width
+        * @param stride_h stride height
+        * @param stride_w stride width
+        * @param pad_h pad height
+        * @param pad_w pad width
+        */
+       private static void appendInputValueToIm2colOutput(MatrixBlock output, int cInput, int hInput, int wInput, double value, 
+                       int R, int S, int P, int Q, int stride_h, int stride_w, int pad_h, int pad_w, boolean trans) {
+               int RS = R*S;
+               // For the given h,w index, insert avals[j] into respective r,s,p,q locations
+               
+               // Constraints: for(int r = 0; r < R; r++) { if(0 <= p && p < P && (hInput - r + pad_h) % stride_h == 0) { ... } }
+               // Constraint 1: p >= 0 and p = (hInput - r + pad_h)  / stride_h
+               // Therefore,  r <= hInput + pad_h 
+               // Constraint 2: p < P and p = (hInput - r + pad_h)  / stride_h
+               // Therefore,  hInput + pad_h - P*stride_h < r
+               // Math.max(0, hInput + pad_h - P*stride_h + 1) <= r <= Math.min(R-1, hInput + pad_h)
+               int rMin = Math.max(0, hInput + pad_h - P*stride_h + 1);
+               int rMax = Math.min(R-1, hInput + pad_h);
+               int sMin = Math.max(0, wInput + pad_w - Q*stride_w + 1);
+               int sMax = Math.min(S-1, wInput + pad_w);
+               // Constraint 3: (hInput - r + pad_h) % stride_h == 0
+               while((hInput - rMin + pad_h) % stride_h != 0 && rMin <= rMax) rMin++;
+               while((wInput - sMin + pad_w) % stride_w != 0 && sMin <= sMax) sMin++;
+               
+               for(int r = rMin; r <= rMax; r += stride_h) {
+                       // Only append value if h == hInput, where h = (r - pad_h) + p*stride_h and 0 <= p < P
+                       // Therefore, p = (hInput - r + pad_h)  / stride_h. Use the same logic for q.
+                       final int p = (hInput - r + pad_h)  / stride_h;
+                       final int pQ = p*Q;
+                       final int outRowIndex = cInput*RS + r*S;
+                       for(int s = sMin; s <= sMax; s += stride_w) {
+                               int q = (wInput - s + pad_w)  / stride_w;
+                               // chw -> [crs, pq]
+                               if( trans )
+                                       output.appendValue(pQ + q, outRowIndex + s, value);
+                               else
+                                       output.appendValue(outRowIndex + s, pQ + q, value);
+                       }
+               }
+       }
+       
+       private static void appendInputValueToIm2colOutputSimple(MatrixBlock output, int c, int h, int w,
+               double value, int R, int S, int RS, int P, boolean trans) {
+               int rMin = Math.max(0, h - P + 1);
+               int rMax = Math.min(R-1, h);
+               int cix = c*RS+w+rMin*S;
+               for(int p=h-rMin; p >= h-rMax; p--, cix+=S)
+                       output.appendValue(trans?p:cix, trans?cix:p, value);
+       }
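
A quick numeric check of the bounds derived in the comments of appendInputValueToIm2colOutput, using assumed values hInput=5, pad_h=1, stride_h=2, R=3, P=4 (chosen only for illustration):

// p = (hInput - r + pad_h) / stride_h = (6 - r) / 2
// rMax = min(R-1, hInput + pad_h)                = min(2, 6)          = 2
// rMin = max(0, hInput + pad_h - P*stride_h + 1) = max(0, 6 - 8 + 1)  = 0
// divisibility: (6 - r) % 2 == 0 already holds for r = 0, so rMin stays 0
// The loop then visits r = 0 (p = 3) and r = 2 (p = 2); both satisfy
// h = p*stride_h + r - pad_h = 5 = hInput with 0 <= p < P, as required.
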
+       
+
+       // ------------------------------------------------------------------------------------------------------
+       // Since col2im always operates on an intermediate generated as part of matmult, it is not clear which operator to select a priori.
+       // Therefore, it is provided as a utility function rather than an operator (like im2col or rotate180).
+       
+       //Converts input: PQ X CRS matrix and writes to 1 X CHW
+       static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
+               if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) {
+                       throw new DMLRuntimeException("Incorrect input dimensions");
+               }
+               
+               double [] outputArray = null;
+               if (!params.output.isInSparseFormat())
+                       outputArray = params.output.getDenseBlockValues();
+               else {
+                       throw new DMLRuntimeException("Only dense output is implemented");
+               }
+               
+               if(!input.isInSparseFormat()) {
+                       double [] inputArray = input.getDenseBlockValues();
+                       doCol2IMDenseInput(0, outputN, inputArray, outputArray, params);
+               }
+               else {
+                       if(!input.isEmptyBlock()) {
+                               int outOffset = outputN*params.C*params.H*params.W;
+                               int HW = params.H*params.W;
+                               CellIndex3 ix = new CellIndex3();
+                               SparseBlock sblock = input.sparseBlock;
+                               for(int i = 0; i < input.getNumRows(); i++) {
+                                       if( sblock.isEmpty(i) ) continue;
+                                       ix = LibMatrixDNNHelper.computeTensorIndexes(i, params.P, params.Q, ix);
+                                       int tmpP = ix.ix2*params.stride_h - params.pad_h;
+                                       int tmpQ = ix.ix3*params.stride_w - params.pad_w;
+                                       if(ix.ix1 != 0) 
+                                               throw new DMLRuntimeException("Incorrect tensor indexes: "+ ix + ", " + params.P + " " + params.Q);
+                                       int apos = sblock.pos(i);
+                                       int alen = sblock.size(i);
+                                       int[] aix = sblock.indexes(i);
+                                       double[] avals = sblock.values(i);
+                                       for(int j = apos; j < apos+alen; j++) {
+                                               ix = LibMatrixDNNHelper.computeTensorIndexes(aix[j], params.R, params.S, ix);
+                                               int h = tmpP + ix.ix2;
+                                               int w = tmpQ + ix.ix3;
+                                               if(h >= 0 && h < params.H && w >= 0 && w < params.W) {
+                                                       int outIndex = outOffset + ix.ix1*HW + h*params.W + w;
+                                                       outputArray[outIndex] += avals[j];
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       // Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0
+       // Or converts input: NPQ X CRS matrix and writes to N X CHW 
+       private static void doCol2IMDenseInput(int inputN, int outputN, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+               final int outputNOffset = outputN*params.C*params.H*params.W;
+               final int HW = params.H*params.W;
+               final int inputNPQ = inputN*params.P*params.Q;
+               final int CRS = params.C*params.R*params.S;
+               final int RS = params.R*params.S;
+               for (int p = 0; p < params.P; p++) {
+                       // h = p*params.stride_h + r - params.pad_h
+                       //   = r + hOffset
+                       // Based on restrictions: h >= 0 and r >= 0 and h < params.H and r < params.R, we get
+                       // max(0, - hOffset) <= r < min(params.R, params.H - hOffset)
+                       final int hOffset = p*params.stride_h - params.pad_h;
+                       final int rStart = Math.max(0, - hOffset);
+                       final int rEnd = Math.min(params.R, params.H - hOffset);
+                       for (int q = 0; q < params.Q; q++) {
+                               // Using the same logic as above on following:
+                               // w = q*params.stride_w + s - params.pad_w
+                               final int wOffset = q*params.stride_w - params.pad_w;
+                               final int sStart = Math.max(0, - wOffset);
+                               final int sEnd = Math.min(params.S, params.W - wOffset);
+                               final int tempOffset = (inputNPQ + p*params.Q + q)*CRS;
+                               for (int c = 0; c < params.C; c++) {
+                                       final int outOffset = outputNOffset + c*HW;
+                                       final int inputOffset = tempOffset + c*RS;
+                                       for (int r = rStart; r < rEnd; r++) {
+                                               for (int s = sStart; s < sEnd; s++) {
+                                                       int inputIndex = inputOffset + r*params.S + s;
+                                                       int outIndex = outOffset + (hOffset + r)*params.W + wOffset + s;
+                                                       outputArray[outIndex] += inputArray[inputIndex];
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+}
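
For context, col2im is the summed inverse of im2col: every [CRS, PQ] cell is scattered back to its [C, H, W] location, and positions covered by several overlapping patches accumulate, which is exactly the += in the methods above. A minimal sketch, reusing the assumed sizes from the im2col illustration earlier (C=1, H=W=3, R=S=2, stride=1, pad=0):

// If the [CRS=4, PQ=4] intermediate contains all ones, col2im yields a 3x3 block whose
// entries count how many 2x2 patches cover each pixel:
//   1 2 1
//   2 4 2      e.g., the center pixel is covered by all four patches, the corners by one.
//   1 2 1
// This accumulation is what the backward-data path needs when scattering gradients from
// the [CRS, PQ] matmult result back to the input layout.
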

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
deleted file mode 100644
index bd29848..0000000
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.matrix.data;
-
-import java.util.Arrays;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-/**
- * This class contains the different implementation of im2col operation
- */
-public class LibMatrixDNNIm2ColHelper {
-       private static final Log LOG = 
LogFactory.getLog(LibMatrixDNNIm2ColHelper.class.getName());
-       static interface Im2colWorker {
-               public void execute(int n);
-               public void execute(int n, int c);
-               public static Im2colWorker getWorker(MatrixBlock input, 
MatrixBlock out, ConvolutionParameters params, boolean allChannels, boolean 
trans) {
-                       if(!input.isInSparseFormat()) {
-                               boolean stride1Pad0 = params.stride_h == 1 && 
params.stride_w == 1 
-                                               && params.pad_h == 0 && 
params.pad_w == 0;
-                               // Note: Only dense im2col operators require 
the im2ColOutBlock to be allocated in the dense format.
-                               out.reset(out.rlen, out.clen, false);
-                               out.allocateDenseBlock();
-                               if( LOG.isTraceEnabled() ) 
-                                       LOG.trace("Using 
DenseIm2colWorkerAllChannels operator to perform "
-                                               + "im2col 
(stride1pad0="+stride1Pad0+", allChannels="+allChannels+").");
-                               if(allChannels && stride1Pad0 && !trans )
-                                       return new 
DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlockValues(), 
out.getDenseBlockValues(), params);
-                               else if( allChannels )
-                                       return new 
DenseIm2colWorkerAllChannels(input.getDenseBlockValues(), 
out.getDenseBlockValues(), params, trans);
-                               else if( stride1Pad0 )
-                                       return new 
DenseIm2colWorkerStride1Pad0(input.getDenseBlockValues(), 
out.getDenseBlockValues(), params);
-                               else
-                                       return new 
DenseIm2colWorker(input.getDenseBlockValues(), out.getDenseBlockValues(), 
params);
-                       }
-                       else {
-                               if(LOG.isTraceEnabled()) 
-                                       LOG.trace("Using SparseIm2colWorker 
operator to perform im2col.");
-                               out.reset(out.rlen, out.clen, true);
-                               out.allocateSparseRowsBlock();
-                               //preallocate sparse-rows (larger than average 
sparsity to account for skew)
-                               int estnnz = 
(int)Math.ceil(4*input.getSparsity()*out.clen);
-                               for(int r = 0; r < out.rlen; r++)
-                                       out.getSparseBlock().allocate(r, 
Math.min(estnnz, out.clen));
-                               return new 
SparseSparseIm2colWorkerAllChan(input, out, params, trans);
-                       }
-               }
-       }
-       
-       /**
-        * Special case operator for performing dense im2col when stride = [1, 
1] and pad = [0, 0] by using System.arraycopy
-        */
-       static class DenseIm2colWorkerStride1Pad0 implements Im2colWorker {
-               private final double [] inputArray, outputArray; 
-               private final int S, R, P, Q, CHW, H, W;
-               public DenseIm2colWorkerStride1Pad0(double [] inputArray, 
double [] outputArray, ConvolutionParameters params) {
-                       this.inputArray = inputArray;
-                       this.outputArray = outputArray;
-                       this.H = params.H; this.W = params.W; this.R = 
params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
-                       this.CHW = params.C*params.H*params.W;
-               }
-               
-               @Override
-               public void execute(int n) {
-                       throw new RuntimeException("Not supported");
-               }
-
-               @Override
-               public void execute(int n, int cInput) {
-                       int nOffset = n * CHW;
-                       int RS = R*S;
-                       for (int rs = 0; rs < RS; ++rs) {
-                               int wOffset = rs % S;
-                               int hOffset = rs / S;
-                               for (int h = 0; h < P; ++h) {
-                                       int hPadded = h + hOffset;
-                                       int outOffset = (rs * P + h) * Q;
-                                       int inputOffset = nOffset + (cInput * H 
+ hPadded) * W;
-                                       System.arraycopy(inputArray, 
inputOffset + wOffset, outputArray, outOffset, Q);
-                                       int w = Q - 1;
-                                       int wPadded = w + wOffset;
-                                       boolean assign = (hPadded < H && 
wPadded < W);
-                                       outputArray[outOffset + w] = assign ? 
inputArray[inputOffset + wPadded] : 0;
-                               }
-                       }
-               }
-       }
-
-       
-               
-       /**
-        * Special case operator for performing dense im2col when stride = [1, 
1] and pad = [0, 0] by using System.arraycopy
-        */
-       static class DenseIm2colWorkerStride1Pad0AllChannels implements 
Im2colWorker {
-               private final double [] inputArray, outputArray; 
-               private final int CRS, S, R, P, Q, CHW, H, W;
-               public DenseIm2colWorkerStride1Pad0AllChannels(double [] 
inputArray, double [] outputArray, ConvolutionParameters params) {
-                       this.inputArray = inputArray;
-                       this.outputArray = outputArray;
-                       this.CRS = params.C * params.R * params.S;
-                       this.H = params.H; this.W = params.W; this.R = 
params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
-                       this.CHW = params.C*params.H*params.W;
-               }
-               
-               @Override
-               public void execute(int n, int c) {
-                       throw new RuntimeException("Not supported");
-               }
-
-               @Override
-               public void execute(int n) {
-                       int nOffset = n * CHW;
-                       for (int c = 0; c < CRS; ++c) {
-                               int wOffset = c % S;
-                               int hOffset = (c / S) % R;
-                               int cInput = c / R / S;
-                               for (int h = 0; h < P; ++h) {
-                                       int hPadded = h + hOffset;
-                                       int outOffset = (c * P + h) * Q;
-                                       int inputOffset = nOffset + (cInput * H 
+ hPadded) * W;
-                                       System.arraycopy(inputArray, 
inputOffset + wOffset, outputArray, outOffset, Q);
-                                       int w = Q - 1;
-                                       int wPadded = w + wOffset;
-                                       boolean assign = (hPadded < H && 
wPadded < W);
-                                       outputArray[outOffset + w] = assign ? 
inputArray[inputOffset + wPadded] : 0;
-                               }
-                       }
-               }
-       }
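
The System.arraycopy trick used by the two stride=[1,1]/pad=[0,0] workers above relies on the fact that, for fixed (c, r, s, p), the Q consecutive output cells of an im2col row correspond to Q consecutive input cells. A minimal sketch of that property, not SystemML code, with arbitrary dimensions:

public class Im2colStride1Pad0Demo {
    public static void main(String[] args) {
        final int C = 2, H = 4, W = 4, R = 3, S = 3;
        final int P = H - R + 1, Q = W - S + 1;
        double[] in = new double[C * H * W];
        for (int i = 0; i < in.length; i++) in[i] = i;
        double[] out = new double[C * R * S * P * Q];
        // out is laid out as [C*R*S, P*Q]; each row segment of length Q is one arraycopy
        for (int c = 0; c < C; c++)
            for (int r = 0; r < R; r++)
                for (int s = 0; s < S; s++)
                    for (int p = 0; p < P; p++)
                        System.arraycopy(in, (c * H + p + r) * W + s,
                            out, (((c * R + r) * S + s) * P + p) * Q, Q);
        // spot-check one entry against the naive definition out[crs, pq] = in[c, p+r, q+s]
        int c = 1, r = 2, s = 1, p = 1, q = 1;
        double naive = in[(c * H + p + r) * W + q + s];
        double copied = out[(((c * R + r) * S + s) * P + p) * Q + q];
        System.out.println(naive == copied); // prints true
    }
}

The production workers above additionally re-check the trailing cell of each copied row, which this sketch omits.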
-       
-       /**
-        * Performing dense im2col (general case)
-        */
-       static class DenseIm2colWorker implements Im2colWorker {
-               private final double [] inputArray, outputArray; 
-               private final int S, R, P, Q, CHW, H, W; 
-               private final int stride_h, stride_w, pad_h, pad_w;
-               public DenseIm2colWorker(double [] inputArray, double [] 
outputArray, ConvolutionParameters params) {
-                       this.inputArray = inputArray;
-                       this.outputArray = outputArray;
-                       this.H = params.H; this.W = params.W; this.R = 
params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
-                       this.CHW = params.C*params.H*params.W;
-                       this.stride_h = params.stride_h; this.stride_w = 
params.stride_w;
-                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
-               }
-               
-               @Override
-               public void execute(int n) {
-                       throw new RuntimeException("Not supported");
-               }
-
-               @Override
-               public void execute(int n, int cInput) {
-                       int nOffset = n * CHW; int RS = R*S;
-                       for (int rs = 0; rs < RS; ++rs) {
-                               int wOffset = rs % S;
-                               int hOffset = rs / S;
-                               for (int h = 0; h < P; ++h) {
-                                       int outOffset = (rs * P + h) * Q;
-                                       int hPadded = h * stride_h - pad_h + 
hOffset;
-                                       int inputOffset = nOffset + (cInput * H 
+ hPadded) * W;
-                                       if (hPadded < 0 || hPadded >= H) {
-                                               Arrays.fill(outputArray, 
outOffset, outOffset+Q, 0);
-                                       } else {
-                                               for (int w = 0; w < Q; ++w) {
-                                                       int wPadded = w * 
stride_w - pad_w + wOffset;
-                                                       boolean assign = 
(wPadded >= 0 && wPadded < W);
-                                                       outputArray[outOffset + 
w] = assign ? inputArray[inputOffset + wPadded] : 0;
-                                               }
-                                       }
-                               }
-                       }
-               }
-       }
-       
-       /**
-        * Performing dense im2col (general case)
-        */
-       private static class DenseIm2colWorkerAllChannels implements 
Im2colWorker {
-               private final double[] inputArray, outputArray; 
-               private final int CRS, S, R, P, Q, CHW, H, W; 
-               private final int stride_h, stride_w, pad_h, pad_w;
-               private final boolean trans;
-               public DenseIm2colWorkerAllChannels(double [] inputArray, 
double [] outputArray, ConvolutionParameters params, boolean trans) {
-                       this.inputArray = inputArray;
-                       this.outputArray = outputArray;
-                       this.CRS = params.C * params.R * params.S;
-                       this.H = params.H; this.W = params.W; this.R = 
params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
-                       this.CHW = params.C*params.H*params.W;
-                       this.stride_h = params.stride_h; this.stride_w = 
params.stride_w;
-                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
-                       this.trans = trans;
-               }
-               
-               @Override
-               public void execute(int n, int c) {
-                       throw new RuntimeException("Not supported");
-               }
-
-               @Override
-               public void execute(int n) {
-                       //reset for selective copy
-                       Arrays.fill(outputArray, 0);
-                       
-                       int nOffset = n * CHW;
-                       for (int c = 0; c < CRS; ++c) {
-                               int wOffset = c % S;
-                               int hOffset = (c / S) % R;
-                               int cInput = c / R / S;
-                               for (int h = 0; h < P; ++h) {
-                                       int outOffset = trans ? c+(h*Q*CRS) : 
(c*P+h)*Q;
-                                       int hPadded = h * stride_h - pad_h + 
hOffset;
-                                       int inputOffset = nOffset + (cInput * H 
+ hPadded) * W;
-                                       if (hPadded < 0 || hPadded >= H ) 
continue;
-                                       for (int w = 0; w < Q; ++w) {
-                                               int wPadded = w * stride_w - 
pad_w + wOffset;
-                                               if( wPadded >= 0 && wPadded < W 
)
-                                                       outputArray[outOffset + 
(trans?w*CRS:w)] 
-                                                               = 
inputArray[inputOffset + wPadded];
-                                       }
-                               }
-                       }
-               }
-       }
-       
-       /**
-        * Performing sparse im2col for all channels for a given row n of the 
input matrix.
-        */
-       private static class SparseSparseIm2colWorkerAllChan implements 
Im2colWorker {
-               private final MatrixBlock input, output;
-               private final int S, R, P, Q, W, HW, RS;
-               private final int stride_h, stride_w, pad_h, pad_w;
-               private final boolean trans;
-               private final boolean simple;
-               public SparseSparseIm2colWorkerAllChan(MatrixBlock input, 
MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean trans) {
-                       this.input = input;
-                       this.output = im2ColOutBlock;
-                       this.HW = params.H * params.W;
-                       this.RS = params.R * params.S;
-                       this.W = params.W; this.R = params.R; this.S = 
params.S; this.P = params.P; this.Q = params.Q;
-                       this.stride_h = params.stride_h; this.stride_w = 
params.stride_w;
-                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
-                       this.trans = trans;
-                       this.simple = params.isStride1Pad0() && W == S && Q == 
1;
-                       if(!input.isInSparseFormat()) 
-                               throw new RuntimeException("Incorrect operator 
selection. Expected dense input for SparseIm2colWorkerAllChannels");
-               }
-               
-               @Override
-               public void execute(int n, int c) {
-                       throw new RuntimeException("Not supported");
-               }
-               
-               @Override
-               public void execute(int n) {
-                       output.reset();
-                       SparseBlock sblock = input.sparseBlock;
-                       if( sblock.isEmpty(n) )
-                               return;
-                       int apos = sblock.pos(n);
-                       int alen = sblock.size(n);
-                       int[] aix = sblock.indexes(n);
-                       double[] avals = sblock.values(n);
-                       
-                       // Iterate over the sparse block
-                       for(int j=apos; j<apos+alen; j++) {
-                               // Note: the input is of shape [N, CHW]
-                               int chw = aix[j];
-                               
-                               // Get individual zero-based c,h,w indexes from 
zero-based 'chw'
-                               int cInput = chw / HW;
-                               int hInput = (chw - cInput*HW)/W;
-                               int wInput = chw % W; 
-                               
-                               if( simple )
-                                       
appendInputValueToIm2colOutputSimple(output, cInput, hInput, wInput, 
-                                               avals[j], R, S, RS, P, trans);
-                               else
-                                       appendInputValueToIm2colOutput(output, 
cInput, hInput, wInput, avals[j], 
-                                               R, S, P, Q, stride_h, stride_w, 
pad_h, pad_w, trans);
-                       }
-                       
-                       output.sortSparseRows();
-               }
-       }
-       
-       /**
-        * Performing sparse im2col for a given channel c and for a given row n 
of the input matrix.
-        */
-       @SuppressWarnings("unused")
-       private static class SparseSparseIm2colWorker implements Im2colWorker {
-               private final MatrixBlock input, output;
-               private final int S, R, P, Q, W, HW;
-               private final int stride_h, stride_w, pad_h, pad_w; 
-               private final boolean trans;
-               
-               public SparseSparseIm2colWorker(MatrixBlock input, MatrixBlock 
im2ColOutBlock, ConvolutionParameters params, boolean trans) {
-                       this.input = input;
-                       this.output = im2ColOutBlock;
-                       this.HW = params.H*params.W;
-                       this.W = params.W; this.R = params.R; this.S = 
params.S; this.P = params.P; this.Q = params.Q;
-                       this.stride_h = params.stride_h; this.stride_w = 
params.stride_w;
-                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
-                       this.trans = trans;
-               }
-               
-               @Override
-               public void execute(int n) {
-                       throw new RuntimeException("Not supported");
-               }
-
-               @Override
-               public void execute(int n, int cInput) {
-                       output.reset();
-                       SparseBlock sblock = input.sparseBlock;
-                       if( sblock.isEmpty(n) )
-                               return;
-                       int apos = sblock.pos(n);
-                       int alen = sblock.size(n);
-                       int[] aix = sblock.indexes(n);
-                       double[] avals = sblock.values(n);
-                               
-                       // Iterate over the sparse block
-                       for(int j=apos; j<apos+alen; j++) {
-                               // Note: the input is of shape [N, CHW]
-                               int chw = aix[j];
-                               
-                               if(cInput == (chw / HW)) {
-                                       // Get individual zero-based c,h,w 
indexes from zero-based 'chw'
-                                       int hInput = (chw - cInput*HW)/W;
-                                       int wInput = chw % W; 
-                                       
-                                       appendInputValueToIm2colOutput(output, 
cInput, hInput, wInput, avals[j], 
-                                                       R, S, P, Q, stride_h, 
stride_w, pad_h, pad_w, trans);
-                               }
-                       }
-                       output.sortSparseRows();
-               }
-       }
-       
-       /**
-        * Appends the value at the given input position (cInput, hInput, wInput) to the appropriate im2col location of the output
-        * 
-        * @param output output matrix block
-        * @param cInput input channel index (zero-based)
-        * @param hInput input height index (zero-based)
-        * @param wInput input width index (zero-based)
-        * @param value input value
-        * @param R filter height
-        * @param S filter width
-        * @param P output height
-        * @param Q output width
-        * @param stride_h stride height
-        * @param stride_w stride width
-        * @param pad_h pad height
-        * @param pad_w pad width
-        */
-       private static void appendInputValueToIm2colOutput(MatrixBlock output, 
int cInput, int hInput, int wInput, double value, 
-                       int R, int S, int P, int Q, int stride_h, int stride_w, 
int pad_h, int pad_w, boolean trans) {
-               int RS = R*S;
-               // For the given h,w index, insert avals[j] into respective 
r,s,p,q locations
-               
-               // Constraints: for(int r = 0; r < R; r++) { if(0 <= p && p < P && (hInput - r + pad_h) % stride_h == 0) { ... } }
-               // Constraint 1: p >= 0 and p = (hInput - r + pad_h) / stride_h
-               // Therefore, r <= hInput + pad_h
-               // Constraint 2: p < P and p = (hInput - r + pad_h) / stride_h
-               // Therefore, hInput + pad_h - P*stride_h < r
-               // Hence: Math.max(0, hInput + pad_h - P*stride_h + 1) <= r <= Math.min(R-1, hInput + pad_h)
-               int rMin = Math.max(0, hInput + pad_h - P*stride_h + 1);
-               int rMax = Math.min(R-1, hInput + pad_h);
-               int sMin = Math.max(0, wInput + pad_w - Q*stride_w + 1);
-               int sMax = Math.min(S-1, wInput + pad_w);
-               // Constraint 3: (hInput - r + pad_h) % stride_h == 0
-               while((hInput - rMin + pad_h) % stride_h != 0 && rMin <= rMax) 
rMin++;
-               while((wInput - sMin + pad_w) % stride_w != 0 && sMin <= sMax) 
sMin++;
-               
-               for(int r = rMin; r <= rMax; r += stride_h) {
-                       // Only append the value if h == hInput, where h = (r - pad_h) + p*stride_h and 0 <= p < P
-                       // Therefore, p = (hInput - r + pad_h) / stride_h. The same logic applies to q.
-                       final int p = (hInput - r + pad_h)  / stride_h;
-                       final int pQ = p*Q;
-                       final int outRowIndex = cInput*RS + r*S;
-                       for(int s = sMin; s <= sMax; s += stride_w) {
-                               int q = (wInput - s + pad_w)  / stride_w;
-                               // chw -> [crs, pq]
-                               if( trans )
-                                       output.appendValue(pQ + q, outRowIndex 
+ s, value);
-                               else
-                                       output.appendValue(outRowIndex + s, pQ 
+ q, value);
-                       }
-               }
-       }
-       
-       private static void appendInputValueToIm2colOutputSimple(MatrixBlock 
output, int c, int h, int w,
-               double value, int R, int S, int RS, int P, boolean trans) {
-               int rMin = Math.max(0, h - P + 1);
-               int rMax = Math.min(R-1, h);
-               int cix = c*RS+w+rMin*S;
-               for(int p=h-rMin; p >= h-rMax; p--, cix+=S)
-                       output.appendValue(trans?p:cix, trans?cix:p, value);
-       }
-}
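
The constraint derivation inside appendInputValueToIm2colOutput (rMin/rMax, sMin/sMax plus the stride-divisibility adjustment) determines which (r, p) pairs a single input row h contributes to. The following standalone sketch, not SystemML code, replays that mapping for arbitrary parameters:

public class Im2colReverseMappingDemo {
    public static void main(String[] args) {
        final int R = 3, P = 3, strideH = 2, padH = 1, H = 5;
        for (int h = 0; h < H; h++) {
            // same bounds and divisibility skip as in the deleted method above
            int rMin = Math.max(0, h + padH - P * strideH + 1);
            int rMax = Math.min(R - 1, h + padH);
            while (rMin <= rMax && (h - rMin + padH) % strideH != 0)
                rMin++;
            for (int r = rMin; r <= rMax; r += strideH) {
                int p = (h - r + padH) / strideH;
                // every emitted p lies in [0, P) by construction of rMin/rMax
                System.out.println("h=" + h + ", r=" + r + " -> p=" + p);
            }
        }
    }
}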

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPooling.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPooling.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPooling.java
new file mode 100644
index 0000000..067e9ef
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPooling.java
@@ -0,0 +1,532 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.concurrent.Callable;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNHelper.CellIndex3;
+
+/**
+ * This class contains the set of operators used for performing pooling
+ */
+public class LibMatrixDNNPooling {
+       
+       // *********************************** low-level runtime operator 
selection ***********************************************
+       // *********************************** based on runtime properties 
(sparsity, native, etc) ********************************
+       // These methods help reduce branch miss predictions and 
instruction-cache misses.
+       // Also, they simplify the design of LibMatrixDNN and help in 
code-maintenance.
+       
+       /**
+        * Factory method that returns list of callable tasks for performing 
maxpooling operation
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing maxpooling operation
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static ArrayList<Callable<Long>> 
getMaxPoolingWorkers(ConvolutionParameters params) throws DMLRuntimeException {
+               ArrayList<Callable<Long>> ret = new ArrayList<>();
+               int k = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               int taskSize = (int)(Math.ceil((double)params.N / k));
+               for(int i = 0; i*taskSize < params.N; i++) {
+                       if(params.input1.isInSparseFormat())
+                               ret.add(new SparseMaxPooling(i*taskSize, 
Math.min((i+1)*taskSize, params.N), params));
+                       else
+                               ret.add(new DenseMaxPooling(i*taskSize, 
Math.min((i+1)*taskSize, params.N), params));
+               }
+               return ret;
+       }
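
How the returned task list is executed is not part of this diff; the call site presumably submits the Callable<Long> workers to a thread pool and sums the returned thread-local non-zero counts. A hedged sketch of such a driver follows; the class name, thread-count handling and aggregation are assumptions for illustration only, not the actual SystemML call site.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class PoolingTaskRunnerSketch {
    public static long runAll(ArrayList<Callable<Long>> tasks, int numThreads) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        try {
            long nnz = 0;
            // invokeAll blocks until all row-partition tasks have completed
            List<Future<Long>> results = pool.invokeAll(tasks);
            for (Future<Long> f : results)
                nnz += f.get(); // each task returns its thread-local nnz count
            return nnz;
        } finally {
            pool.shutdown();
        }
    }
}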
+       
+       /**
+        * Factory method that returns list of callable tasks for performing 
maxpooling backward operation
+        * 
+        * @param params convolution parameters
+        * @param performReluBackward whether to perform ReLU backward
+        * @return list of callable tasks for performing maxpooling backward 
operation
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static ArrayList<Callable<Long>> 
getMaxPoolingBackwardWorkers(ConvolutionParameters params, boolean 
performReluBackward) throws DMLRuntimeException {
+               ArrayList<Callable<Long>> ret = new ArrayList<>();
+               int k = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               int taskSize = (int)(Math.ceil((double)params.N / k));
+               boolean sparse1 = params.input1.isInSparseFormat();
+               boolean sparse2 = params.input2.isInSparseFormat();
+               for(int i = 0; i*taskSize < params.N; i++) {
+                       if( !sparse1 && !sparse2 )
+                               ret.add(new 
PoolingBackwardDenseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), 
params, performReluBackward));
+                       else if( !sparse1 && sparse2 )
+                               ret.add(new 
PoolingBackwardDenseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), 
params, performReluBackward));
+                       else if( sparse1 && !sparse2 ) 
+                               ret.add(new 
PoolingBackwardSparseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), 
params, performReluBackward));
+                       else if( sparse1 && sparse2 )
+                               ret.add(new 
PoolingBackwardSparseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), 
params, performReluBackward));
+               }
+               return ret;
+       }
+       
+       private static class DenseMaxPooling implements Callable<Long> 
+       {
+               private final int _rl, _ru; 
+               private final ConvolutionParameters _params;
+               
+               public DenseMaxPooling(int rl, int ru, ConvolutionParameters 
params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       final int C = _params.C, P = _params.P, Q = _params.Q;
+                       final int R = _params.R, S = _params.S, H = _params.H, 
W = _params.W;
+                       final int HW = _params.H*_params.W;
+                       final int CHW = _params.C*_params.H*_params.W;
+                       final int CPQ = C*P*Q;
+                       double[] in = _params.input1.getDenseBlockValues();
+                       double[] out = _params.output.getDenseBlockValues();
+                       
+                       double minValForMaxPoolOperations = 
_params.minValForMaxPoolOperations;
+                       
+                       //thread-local initialization of output block 
+                       if( !(_params.isStride1Pad0() && _params.isAllOnes(P, 
Q, W)) )
+                               Arrays.fill(out, _rl*CPQ, _ru*CPQ, 
minValForMaxPoolOperations);
+                       
+                       if( _params.isStride1Pad0() && _params.isAllOnes(P, Q, 
W) ) { 
+                               //quick-path w/o materialized index arrays and 
+                               //simplified inner loops for P = 1, Q = 1, W = 1
+                               int lenh = Math.min(R,H);
+                               for(int i = _rl, oix=_rl*C; i < _ru; i++, 
oix+=C)
+                                       for (int c = 0, off=i*CHW; c < C; c++, 
off+=H)
+                                               out[oix+c] = 
max(minValForMaxPoolOperations, in, off, lenh);
+                       }
+                       else if( _params.isStride1Pad0() ) {
+                               //quick-path w/o materialized index arrays 
+                               for(int i = _rl; i < _ru; i++)
+                                       for (int c = 0, off=i*CHW, oix=i*CPQ; c 
< C; c++, off+=HW)
+                                               for (int p = 0; p < P; p++, 
oix+=Q)
+                                                       for (int h = p; h < 
Math.min(p+R,H); h++)
+                                                               for (int q = 0, 
off2=off+h*W; q < Q; q++)
+                                                                       
out[oix+q] = max(out[oix+q], in, off2+q, Math.min(S,W-q));
+                       }
+                       else { //general case
+                               int[] hl = _params.start_indexes_h, hu = 
_params.end_indexes_h;
+                               int[] wl = _params.start_indexes_w, wu = 
_params.end_indexes_w;
+                               for(int i = _rl; i < _ru; i++)
+                                       for (int c = 0, off=i*CHW, oix=i*CPQ; c 
< C; c++, off+=HW)
+                                               for (int p = 0; p < P; p++, 
oix+=Q)
+                                                       for (int h = hl[p]; h < 
hu[p]; h++)
+                                                               for (int q = 0, 
off2=off+h*W; q < Q; q++)
+                                                                       
out[oix+q] = max(out[oix+q], in, off2+wl[q], wu[q]-wl[q]);
+                       }
+                       
+                       //thread-local recomputation of non-zeros
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
+               }
+       }
+       
+       private static class SparseMaxPooling implements Callable<Long> 
+       {
+               private final int _rl, _ru; 
+               private final ConvolutionParameters _params;
+               private double [] outputArray;
+               private final int C, P, Q, W, H, CPQ, PQ;
+               
+               public SparseMaxPooling(int rl, int ru, ConvolutionParameters 
params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       outputArray = params.output.getDenseBlockValues();
+                       C = params.C; P = params.P; Q = params.Q; H = params.H; 
+                       W = params.W;
+                       CPQ = C*P*Q;
+                       PQ = P*Q;
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       //thread-local initialization of output block 
+                       Arrays.fill(outputArray, _rl *CPQ, _ru*CPQ, 
_params.minValForMaxPoolOperations);
+                       
+                       for(int n = _rl; n < _ru; n++)  {
+                               if( !_params.input1.sparseBlock.isEmpty(n) ) {
+                                       final int apos = 
_params.input1.sparseBlock.pos(n);
+                                       final int alen = 
_params.input1.sparseBlock.size(n);
+                                       final int [] aix = 
_params.input1.sparseBlock.indexes(n);
+                                       final double [] avals = 
_params.input1.sparseBlock.values(n);
+                                       int chw = 0; int index = apos;
+                                       for (int c = 0; c < C; c++) {
+                                               final int outOffset = n*CPQ + 
c*PQ;
+                                               for(int h = 0; h < H; h++) {
+                                                       for(int w = 0; w < W; 
w++, chw++) {
+                                                               // Take into 
account zero values as well
+                                                               double nchwVal 
= 0;
+                                                               if(aix[index] 
== chw) {
+                                                                       nchwVal 
= avals[index++];
+                                                                       // 
Ensure that we satisfy the condition index < apos+alen
+                                                                       
if(index >= apos+alen) index--;
+                                                               }
+                                                               // Perform maxpooling without binary search.
+                                                               // Tradeoff compared to dense maxpooling:
+                                                               // in dense maxpooling the iteration space is CPQHW, where the H and W iterations
+                                                               // are restricted by _params.start_indexes_h[p] and thus eligible for JIT optimizations;
+                                                               // in sparse maxpooling the iteration space is CHWPQ without HW restrictions.
+                                                               for (int p = 0; 
p < P; p++) {
+                                                                       if(h >= 
_params.start_indexes_h[p] && h < _params.end_indexes_h[p]) {
+                                                                               
final int outOffsetWithp = outOffset + p*Q;
+                                                                               
for (int q = 0; q < Q; q++) {
+                                                                               
        if(w >= _params.start_indexes_w[q] && w < _params.end_indexes_w[q]) {
+                                                                               
                outputArray[outOffsetWithp + q] = 
Math.max(outputArray[outOffsetWithp + q], nchwVal);
+                                                                               
        }
+                                                                               
}
+                                                                       }
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                               else {
+                                       // Empty input image
+                                       Arrays.fill(outputArray, n*CPQ, 
(n+1)*CPQ, 0);
+                               }
+                       }
+                       
+                       //thread-local recomputation of non-zeros
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
+               }
+       }
+       
+       //BACKWARD
+       
+       /**
+        * Performs the maxpooling backward operation for dense input and dense 
error (dout)
+        */
+       private static class PoolingBackwardDenseDense implements 
Callable<Long> 
+       {
+               public int _rl; public int _ru; 
+               private final ConvolutionParameters _params; 
+               boolean performReluBackward;
+               double [] inputArray, doutArray;
+               MatrixBlock output;
+               int C; int CHW; int P; int Q; int HW; int CPQ; int PQ;
+               public PoolingBackwardDenseDense(int rl, int ru, 
ConvolutionParameters params, boolean performReluBackward) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       this.performReluBackward = performReluBackward;
+                       inputArray = params.input1.getDenseBlockValues();
+                       doutArray = params.input2.getDenseBlockValues();
+                       output = params.output;
+                       C = params.C; CHW = params.C*params.H*params.W; HW = 
params.H*params.W;
+                       P = params.P; Q = params.Q; CPQ = 
params.C*params.P*params.Q;
+                       PQ = params.P*params.Q;
+                       if (inputArray == null || doutArray == null || 
output.getDenseBlock() == null )
+                               throw new RuntimeException("Incorrect usage: 
empty inputs");
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       double[] out = output.getDenseBlockValues();
+                       for(int n = _rl; n < _ru; n++)  {
+                               for (int c = 0; c < C; c++) {
+                                       final int inputOffset = n*CHW + c*HW;
+                                       final int outputOffset = n*CPQ + c*PQ;
+                                       for (int p = 0; p < P; p++) {
+                                               for (int q = 0; q < Q; q++) {
+                                                       int maxIndex = 
getMaxIndex(p, q, inputOffset, inputArray, _params, performReluBackward);
+                                                       if(maxIndex != -1)
+                                                               out[maxIndex] 
+= doutArray[outputOffset +  p * Q + q];
+                                               }
+                                       }
+                               }
+                       }
+                       //thread-local nnz maintenance
+                       return output.recomputeNonZeros(_rl, _ru-1);
+               }
+       }
+       
+       /**
+        * Performs the maxpooling backward operation for dense input and 
sparse error (dout)
+        */
+       private static class PoolingBackwardDenseSparse implements 
Callable<Long> 
+       {
+               public int _rl; public int _ru; 
+               private final ConvolutionParameters _params; 
+               MatrixBlock output; 
+               boolean performReluBackward;
+               double [] inputArray;  MatrixBlock dout;
+               int CHW; int P; int Q; int HW;
+               public PoolingBackwardDenseSparse(int rl, int ru, 
ConvolutionParameters params, boolean performReluBackward) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       this.performReluBackward = performReluBackward;
+                       inputArray = params.input1.getDenseBlockValues();
+                       dout = params.input2;
+                       output = params.output;
+                       CHW = params.C*params.H*params.W; HW = 
params.H*params.W;
+                       P = params.P; Q = params.Q; 
+                       if (inputArray == null || output.getDenseBlock() == 
null )
+                               throw new RuntimeException("Incorrect usage: 
empty inputs");
+                       if (!params.input2.isInSparseFormat())
+                               throw new RuntimeException("Incorrect usage: 
Call optimized versions");
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       CellIndex3 ix = new CellIndex3();
+                       double[] out = output.getDenseBlockValues();
+                       SparseBlock sblock = dout.sparseBlock;
+                       for(int n = _rl; n < _ru; n++)  {
+                               if( sblock.isEmpty(n) ) continue;
+                               int apos = sblock.pos(n);
+                               int alen = sblock.size(n);
+                               int[] aix = sblock.indexes(n);
+                               double[] avals = sblock.values(n);
+                               for(int j = apos; j < apos+alen; j++) {
+                                       ix = 
LibMatrixDNNHelper.computeTensorIndexes(aix[j], P, Q, ix);
+                                       final int inputOffset = n*CHW + 
ix.ix1*HW;
+                                       int maxIndex = getMaxIndex(ix.ix2, 
ix.ix3,
+                                               inputOffset, inputArray, 
_params, performReluBackward);
+                                       if(maxIndex != -1)
+                                               out[maxIndex] += avals[j];
+                               }
+                       }
+                       //thread-local nnz maintenance
+                       return output.recomputeNonZeros(_rl, _ru-1);
+               }
+       }
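
LibMatrixDNNHelper.computeTensorIndexes is referenced here but not shown in this diff. Based on its use above (decomposing a linear dout column index over [C, P, Q] into channel, output row and output column), its presumed behavior is sketched below; this is an assumption for illustration, not the actual SystemML implementation.

public class TensorIndexSketch {
    static final class CellIndex3 { int ix1, ix2, ix3; }

    // presumed mapping: cpq is a row-major linear index over [C, P, Q]
    static CellIndex3 computeTensorIndexes(int cpq, int P, int Q, CellIndex3 ret) {
        ret.ix1 = cpq / (P * Q);       // channel c
        ret.ix2 = (cpq % (P * Q)) / Q; // output row p
        ret.ix3 = cpq % Q;             // output column q
        return ret;
    }

    public static void main(String[] args) {
        CellIndex3 ix = computeTensorIndexes(23, 3, 4, new CellIndex3());
        System.out.println(ix.ix1 + "," + ix.ix2 + "," + ix.ix3); // prints 1,2,3 for P=3, Q=4
    }
}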
+       
+       /**
+        * Performs the maxpooling backward operation for sparse input and 
dense error (dout)
+        */
+       private static class PoolingBackwardSparseDense implements 
Callable<Long> 
+       {
+               private final int _rl, _ru; 
+               private final ConvolutionParameters _params; 
+               private final boolean reluBack;
+               protected final MatrixBlock doutput, output;
+               
+               protected PoolingBackwardSparseDense(int rl, int ru, 
ConvolutionParameters params, boolean relu, MatrixBlock dout, MatrixBlock out) {
+                       _rl = rl; _ru = ru; 
+                       _params = params;
+                       reluBack = relu;
+                       doutput = dout;
+                       output = out;
+               }
+               
+               public PoolingBackwardSparseDense(int rl, int ru, 
ConvolutionParameters params, boolean relu) {
+                       this(rl, ru, params, relu, params.input2, 
params.output);
+                       if (doutput.getDenseBlock() == null || 
output.getDenseBlock() == null )
+                               throw new RuntimeException("Incorrect usage: 
empty inputs");
+                       if (!params.input1.isInSparseFormat())
+                               throw new RuntimeException("Incorrect usage: 
sparse input1 expected");
+               }
+               
+               @Override
+               public Long call() throws Exception 
+               {
+                       final int P = _params.P, Q = _params.Q, W = _params.W;
+                       final int C = _params.C, R = _params.R, S = _params.S;
+                       final int padh = _params.pad_h, padw = _params.pad_w;
+                       final int strideh = _params.stride_h, stridew = 
_params.stride_w;
+                       final int PQ = _params.P * _params.Q;
+                       final int CPQ = _params.C * _params.P * _params.Q;
+                       final int HW = _params.H * _params.W;
+                       final int CHW = _params.C * _params.H * _params.W;
+                       
+                       //allocate auxiliary data structures
+                       double[] maxVal = new double[PQ];
+                       int[] maxIx = new int[PQ];
+                       
+                       for(int n = _rl; n < _ru; n++)  {
+                               for (int c = 0; c < C; c++) {
+                                       //step 0: basic initializations
+                                       final int outOffset = n*CHW + c*HW;
+                                       
+                                       //step 1: perform maxpooling w/ index 
maintenance in a 
+                                       //single, sequential pass over the 
sparse input matrix
+                                       maxpoolingForward(maxVal, maxIx, n, c,
+                                               padh, padw, strideh, stridew, 
C, P, Q, R, S, HW, W);
+                                       
+                                       //step 2: perform maxpooling backward
+                                       maxpoolingBackward(maxIx, outOffset, n, 
c, C, Q, PQ, CPQ);
+                               }
+                       }
+                       //thread-local nnz maintenance
+                       return output.recomputeNonZeros(_rl, _ru-1);
+               }
+               
+               protected void maxpoolingForward(double[] maxVal, int[] maxIx, 
int n, int c, int padh, int padw, int strideh, int stridew, int C, int P, int 
Q, int R, int S, int HW, int W) {
+                       SparseBlock sblock = _params.input1.getSparseBlock();
+                       if( !sblock.isEmpty(n) ) {
+                               Arrays.fill(maxVal, -Double.MAX_VALUE);
+                               int apos = sblock.pos(n);
+                               int alen = sblock.size(n);
+                               int[] aix = sblock.indexes(n);
+                               double[] avals = sblock.values(n);
+                               //find channel start and end, w/ robustness for 
non-existing entries
+                               int cpos = (c==0) ? 0 : sblock.posFIndexGTE(n, 
c*HW);
+                               int cpos2 = (c+1==C) ? alen : 
sblock.posFIndexGTE(n, (c+1)*HW);
+                               cpos = (cpos>=0) ? cpos : alen;
+                               cpos2 = (cpos2>=0) ? cpos2 : alen;
+                               int lastix = c*HW-1;
+                               for(int j=apos+cpos; j<apos+cpos2; j++) {
+                                       //handle skipped zero values
+                                       update0(lastix+1, aix[j], maxVal, 
maxIx, padh, padw, strideh, stridew, P, Q, R, S, HW, W);
+                                       //handle current non-zero value
+                                       int h = (aix[j] % HW) / W;
+                                       int w = aix[j] % W;
+                                       double val = reluBack && avals[j] < 0 ? 
0 : avals[j];
+                                       update(val, maxVal, maxIx, h, w, padh, 
padw, strideh, stridew, P, Q, R, S, W);
+                                       //memoize last seen index
+                                       lastix = aix[j];
+                               }
+                               //handle skipped zero values at end of row
+                               update0(lastix+1, (c+1)*HW, maxVal, maxIx, 
padh, padw, strideh, stridew, P, Q, R, S, HW, W);
+                       }
+                       else {
+                               //handle empty row
+                               Arrays.fill(maxVal, 0);
+                               for(int p = 0, ix=0; p < P; p++) {
+                                       int h = Math.max(-padh+p*strideh, 0);
+                                       for(int q = 0; q < Q; q++, ix++) {
+                                               int w = 
Math.max(-padw+q*stridew, 0);
+                                               maxIx[ix] = h * W + w;
+                                       }
+                               }
+                       }
+               }
+               
+               protected void maxpoolingBackward(int[] maxIx, int outOffset, 
int n, int c, int C, int Q, int PQ, int CPQ) {
+                       double[] dout = doutput.getDenseBlockValues();
+                       double[] out = output.getDenseBlockValues();
+                       final int doutOffset = n*CPQ + c*PQ;
+                       for( int pq = 0; pq < PQ; pq++ )
+                               out[ outOffset + maxIx[pq] ] += dout[ 
doutOffset + pq ];
+               }
+               
+               private static void update0(int lix, int uix, double[] maxVal, 
int[] maxIx, int padh, int padw, int strideh, int stridew, int P, int Q, int R, 
int S, int HW, int W) {
+                       //TODO exploit constant value and overlap for potential 
early abort
+                       for(int i = lix; i<uix; i++)
+                               update(0, maxVal, maxIx, (i%HW)/W, i%W, padh, 
padw, strideh, stridew, P, Q, R, S, W);
+               }
+               
+               private static void update(double val, double[] maxVal, int[] 
maxIx, int h, int w, int padh, int padw, int strideh, int stridew, int P, int 
Q, int R, int S, int W) {
+                       //determine lower and upper bounds for p and q
+                       //(see fillIndexesArray, solved for p and q, reversed)
+                       int lp = Math.max((h+padh-R+strideh)/strideh, 0);
+                       int up = Math.min((h+padh+strideh)/strideh, P);
+                       int lq = Math.max((w+padw-S+stridew)/stridew, 0);
+                       int uq = Math.min((w+padw+stridew)/stridew, Q);
+                       
+                       //maintain max index for all relevant p and q
+                       int maxIndex = h * W + w;
+                       for(int p = lp; p < up; p++) 
+                               for(int q = lq; q < uq; q++) {
+                                       int ix = p * Q + q;
+                                       if( maxVal[ix] < val ) {
+                                               maxVal[ix] = val;
+                                               maxIx[ix] = maxIndex;
+                                       }
+                               }
+               }
+       }
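
The lp/up (and analogously lq/uq) bounds computed in update() can be validated against the direct window-membership test: a pooling window p covers input row h iff p*stride_h - pad_h <= h < p*stride_h - pad_h + R. A minimal standalone check, not SystemML code, with arbitrary parameters:

public class PoolingWindowBoundsDemo {
    public static void main(String[] args) {
        final int H = 7, R = 3, P = 4, strideH = 2, padH = 1;
        for (int h = 0; h < H; h++) {
            // same integer-division bounds as in update() above
            int lp = Math.max((h + padH - R + strideH) / strideH, 0);
            int up = Math.min((h + padH + strideH) / strideH, P);
            for (int p = 0; p < P; p++) {
                boolean covers = p * strideH - padH <= h && h < p * strideH - padH + R;
                boolean inBounds = lp <= p && p < up;
                if (covers != inBounds)
                    throw new AssertionError("mismatch at h=" + h + ", p=" + p);
            }
        }
        System.out.println("bounds formula matches window membership");
    }
}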
+       
+       /**
+        * Performs the maxpooling backward operation for sparse input and 
sparse error (dout)
+        */
+       private static class PoolingBackwardSparseSparse extends 
PoolingBackwardSparseDense
+       {
+               public PoolingBackwardSparseSparse(int rl, int ru, 
ConvolutionParameters params, boolean relu) {
+                       super(rl, ru, params, relu, params.input2, 
params.output);
+                       if (output.getDenseBlock() == null )
+                               throw new RuntimeException("Incorrect usage: 
empty outputs");
+                       if (!params.input1.isInSparseFormat() || 
!params.input2.isInSparseFormat())
+                               throw new RuntimeException("Incorrect usage: 
Call optimized versions");
+               }
+               
+               @Override
+               protected void maxpoolingBackward(int[] maxIx, int outOffset, 
int n, int c, int C, int Q, int PQ, int CPQ) {
+                       SparseBlock sblock = doutput.getSparseBlock();
+                       double[] out = output.getDenseBlockValues();
+                       if( sblock.isEmpty(n) )
+                               return;
+                       int apos = sblock.pos(n);
+                       int alen = sblock.size(n);
+                       int[] aix = sblock.indexes(n);
+                       double[] avals = sblock.values(n);
+                       //find channel start and end, w/ robustness for 
non-existing entries
+                       int cpos = (c==0) ? 0 : sblock.posFIndexGTE(n, c*PQ);
+                       int cpos2 = (c+1==C) ? alen : sblock.posFIndexGTE(n, 
(c+1)*PQ);
+                       cpos = (cpos>=0) ? cpos : alen;
+                       cpos2 = (cpos2>=0) ? cpos2 : alen;
+                       for(int j = apos+cpos; j<apos+cpos2; j++) {
+                               int p = (aix[j] % PQ) / Q;
+                               int q = aix[j] % Q;
+                               int pq = p * Q + q;
+                               out[ outOffset + maxIx[pq] ] += avals[j];
+                       }
+               }
+       }
+       
+       private static double max(final double aval, double[] b, final int bi, 
final int len) {
+               double ret = aval;
+               for( int i = bi; i < bi+len; i++ )
+                       ret = Math.max(ret, b[i]);
+               return ret;
+       }
+       
+       /**
+        * Returns the index of cell with maximum value. This method is 
optimized for dense input
+        * 
+        * @param p output feature map height
+        * @param q output feature map width
+        * @param inputOffset offset to be used for input index
+        * @param inputArray input array
+        * @param params convolution parameters
+        * @param performReluBackward perform ReLU backward
+        * @return index of cell with maximum value
+        */
+       private static int getMaxIndex(int p, int q, int inputOffset, double [] 
inputArray, ConvolutionParameters params, boolean performReluBackward) {
+               int start_index_h = params.start_indexes_h[p];
+               int end_index_h = params.end_indexes_h[p];
+               int start_index_w = params.start_indexes_w[q];
+               int end_index_w = params.end_indexes_w[q];
+               
+               int maxIndex = -1; 
+               double maxVal = -Double.MAX_VALUE;
+               
+               // Note: we do not treat padded cells as zeros and hence do not initialize maxVal = 0
+               // when start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W
+               
+               // Find maxIndex
+               double currDoutVal = -1;
+               for (int h = start_index_h; h < end_index_h; h++) {
+                       for (int w = start_index_w; w < end_index_w; w++) {
+                               currDoutVal = inputArray[inputOffset +  
h*params.W + w];
+                               currDoutVal = performReluBackward && 
currDoutVal < 0 ? 0 : currDoutVal;
+                               if(maxVal < currDoutVal) {
+                                       maxIndex = inputOffset +  h*params.W + 
w;
+                                       maxVal = currDoutVal;
+                               }
+                       }
+               }
+               return maxIndex;
+       }
+}
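
For context, the gradient routing implemented via getMaxIndex corresponds to the textbook rule that each output gradient flows to the argmax cell of its pooling window. A compact reference implementation of that rule (stride 1, no padding, single channel), written as an illustration rather than SystemML code:

public class MaxPoolBackwardReference {
    // routes each dout[p,q] to the input cell that attained the window maximum
    public static double[] backward(double[] in, double[] dout, int H, int W, int R, int S) {
        final int P = H - R + 1, Q = W - S + 1;
        double[] din = new double[H * W];
        for (int p = 0; p < P; p++)
            for (int q = 0; q < Q; q++) {
                int maxIndex = -1;
                double maxVal = -Double.MAX_VALUE;
                for (int h = p; h < p + R; h++)
                    for (int w = q; w < q + S; w++)
                        if (in[h * W + w] > maxVal) {
                            maxVal = in[h * W + w];
                            maxIndex = h * W + w;
                        }
                din[maxIndex] += dout[p * Q + q];
            }
        return din;
    }

    public static void main(String[] args) {
        double[] in   = { 1, 3, 2,  0, 5, 4,  7, 6, 8 }; // 3x3 input
        double[] dout = { 1, 1,  1, 1 };                 // 2x2 output gradient for R=S=2
        System.out.println(java.util.Arrays.toString(backward(in, dout, 3, 3, 2, 2)));
    }
}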
