http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java
deleted file mode 100644
index 3dfb545..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.matrix.data;
-
-import java.util.Arrays;
-import java.util.concurrent.Callable;
-
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNNHelper.CellIndex3;
-
-/**
- * This class contains the set of operators used for performing pooling 
backward
- */
-public class LibMatrixDNNPoolingBackwardHelper {
-       /**
-        * Performs the maxpooling backward operation for dense input and dense 
error (dout)
-        */
-       public static class PoolingBackwardDenseDense implements Callable<Long> 
-       {
-               public int _rl; public int _ru; 
-               private final ConvolutionParameters _params; 
-               boolean performReluBackward;
-               double [] inputArray, doutArray;
-               MatrixBlock output;
-               int C; int CHW; int P; int Q; int HW; int CPQ; int PQ;
-               public PoolingBackwardDenseDense(int rl, int ru, 
ConvolutionParameters params, boolean performReluBackward) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-                       this.performReluBackward = performReluBackward;
-                       inputArray = params.input1.getDenseBlockValues();
-                       doutArray = params.input2.getDenseBlockValues();
-                       output = params.output;
-                       C = params.C; CHW = params.C*params.H*params.W; HW = 
params.H*params.W;
-                       P = params.P; Q = params.Q; CPQ = 
params.C*params.P*params.Q;
-                       PQ = params.P*params.Q;
-                       if (inputArray == null || doutArray == null || 
output.getDenseBlock() == null )
-                               throw new RuntimeException("Incorrect usage: 
empty inputs");
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       double[] out = output.getDenseBlockValues();
-                       for(int n = _rl; n < _ru; n++)  {
-                               for (int c = 0; c < C; c++) {
-                                       final int inputOffset = n*CHW + c*HW;
-                                       final int outputOffset = n*CPQ + c*PQ;
-                                       for (int p = 0; p < P; p++) {
-                                               for (int q = 0; q < Q; q++) {
-                                                       int maxIndex = 
LibMatrixDNNHelper.getMaxIndex(p, q, inputOffset, inputArray, _params, 
performReluBackward);
-                                                       if(maxIndex != -1)
-                                                               out[maxIndex] 
+= doutArray[outputOffset +  p * Q + q];
-                                               }
-                                       }
-                               }
-                       }
-                       //thread-local nnz maintenance
-                       return output.recomputeNonZeros(_rl, _ru-1);
-               }
-       }
-       
-       /**
-        * Performs the maxpooling backward operation for dense input and 
sparse error (dout)
-        */
-       public static class PoolingBackwardDenseSparse implements 
Callable<Long> 
-       {
-               public int _rl; public int _ru; 
-               private final ConvolutionParameters _params; 
-               MatrixBlock output; 
-               boolean performReluBackward;
-               double [] inputArray;  MatrixBlock dout;
-               int C; int CHW; int P; int Q; int HW;
-               public PoolingBackwardDenseSparse(int rl, int ru, 
ConvolutionParameters params, boolean performReluBackward) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-                       this.performReluBackward = performReluBackward;
-                       inputArray = params.input1.getDenseBlockValues();
-                       dout = params.input2;
-                       output = params.output;
-                       C = params.C; CHW = params.C*params.H*params.W; HW = 
params.H*params.W;
-                       P = params.P; Q = params.Q; 
-                       if (inputArray == null || output.getDenseBlock() == 
null )
-                               throw new RuntimeException("Incorrect usage: 
empty inputs");
-                       if (!params.input2.isInSparseFormat())
-                               throw new RuntimeException("Incorrect usage: 
Call optimized versions");
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       CellIndex3 ix = new CellIndex3();
-                       double[] out = output.getDenseBlockValues();
-                       SparseBlock sblock = dout.sparseBlock;
-                       for(int n = _rl; n < _ru; n++)  {
-                               if( sblock.isEmpty(n) ) continue;
-                               int apos = sblock.pos(n);
-                               int alen = sblock.size(n);
-                               int[] aix = sblock.indexes(n);
-                               double[] avals = sblock.values(n);
-                               for(int j = apos; j < apos+alen; j++) {
-                                       ix = 
LibMatrixDNNHelper.computeTensorIndexes(aix[j], P, Q, ix);
-                                       final int inputOffset = n*CHW + 
ix.ix1*HW;
-                                       int maxIndex = 
LibMatrixDNNHelper.getMaxIndex(ix.ix2, ix.ix3,
-                                               inputOffset, inputArray, 
_params, performReluBackward);
-                                       if(maxIndex != -1)
-                                               out[maxIndex] += avals[j];
-                               }
-                       }
-                       //thread-local nnz maintenance
-                       return output.recomputeNonZeros(_rl, _ru-1);
-               }
-       }
-       
-       /**
-        * Performs the maxpooling backward operation for sparse input and 
dense error (dout)
-        */
-       public static class PoolingBackwardSparseDense implements 
Callable<Long> 
-       {
-               private final int _rl, _ru; 
-               private final ConvolutionParameters _params; 
-               private final boolean reluBack;
-               protected final MatrixBlock doutput, output;
-               
-               protected PoolingBackwardSparseDense(int rl, int ru, 
ConvolutionParameters params, boolean relu, MatrixBlock dout, MatrixBlock out) {
-                       _rl = rl; _ru = ru; 
-                       _params = params;
-                       reluBack = relu;
-                       doutput = dout;
-                       output = out;
-               }
-               
-               public PoolingBackwardSparseDense(int rl, int ru, 
ConvolutionParameters params, boolean relu) {
-                       this(rl, ru, params, relu, params.input2, 
params.output);
-                       if (doutput.getDenseBlock() == null || 
output.getDenseBlock() == null )
-                               throw new RuntimeException("Incorrect usage: 
empty inputs");
-                       if (!params.input1.isInSparseFormat())
-                               throw new RuntimeException("Incorrect usage: 
sparse input1 expected");
-               }
-               
-               @Override
-               public Long call() throws Exception 
-               {
-                       final int P = _params.P, Q = _params.Q, W = _params.W;
-                       final int C = _params.C, R = _params.R, S = _params.S;
-                       final int padh = _params.pad_h, padw = _params.pad_w;
-                       final int strideh = _params.stride_h, stridew = 
_params.stride_w;
-                       final int PQ = _params.P * _params.Q;
-                       final int CPQ = _params.C * _params.P * _params.Q;
-                       final int HW = _params.H * _params.W;
-                       final int CHW = _params.C * _params.H * _params.W;
-                       
-                       //allocate auxiliary data structures
-                       double[] maxVal = new double[PQ];
-                       int[] maxIx = new int[PQ];
-                       
-                       for(int n = _rl; n < _ru; n++)  {
-                               for (int c = 0; c < C; c++) {
-                                       //step 0: basic initializations
-                                       final int outOffset = n*CHW + c*HW;
-                                       
-                                       //step 1: perform maxpooling w/ index 
maintenance in a 
-                                       //single, sequential pass over the 
sparse input matrix
-                                       maxpoolingForward(maxVal, maxIx, n, c,
-                                               padh, padw, strideh, stridew, 
C, P, Q, R, S, HW, W);
-                                       
-                                       //step 2: perform maxpooling backward
-                                       maxpoolingBackward(maxIx, outOffset, n, 
c, C, Q, PQ, CPQ);
-                               }
-                       }
-                       //thread-local nnz maintenance
-                       return output.recomputeNonZeros(_rl, _ru-1);
-               }
-               
-               protected void maxpoolingForward(double[] maxVal, int[] maxIx, 
int n, int c, int padh, int padw, int strideh, int stridew, int C, int P, int 
Q, int R, int S, int HW, int W) {
-                       SparseBlock sblock = _params.input1.getSparseBlock();
-                       if( !sblock.isEmpty(n) ) {
-                               Arrays.fill(maxVal, -Double.MAX_VALUE);
-                               int apos = sblock.pos(n);
-                               int alen = sblock.size(n);
-                               int[] aix = sblock.indexes(n);
-                               double[] avals = sblock.values(n);
-                               //find channel start and end, w/ robustness for 
non-existing entries
-                               int cpos = (c==0) ? 0 : sblock.posFIndexGTE(n, 
c*HW);
-                               int cpos2 = (c+1==C) ? alen : 
sblock.posFIndexGTE(n, (c+1)*HW);
-                               cpos = (cpos>=0) ? cpos : alen;
-                               cpos2 = (cpos2>=0) ? cpos2 : alen;
-                               int lastix = c*HW-1;
-                               for(int j=apos+cpos; j<apos+cpos2; j++) {
-                                       //handle skipped zero values
-                                       update0(lastix+1, aix[j], maxVal, 
maxIx, padh, padw, strideh, stridew, P, Q, R, S, HW, W);
-                                       //handle current non-zero value
-                                       int h = (aix[j] % HW) / W;
-                                       int w = aix[j] % W;
-                                       double val = reluBack && avals[j] < 0 ? 
0 : avals[j];
-                                       update(val, maxVal, maxIx, h, w, padh, 
padw, strideh, stridew, P, Q, R, S, W);
-                                       //memoize last seen index
-                                       lastix = aix[j];
-                               }
-                               //handle skipped zero values at end of row
-                               update0(lastix+1, (c+1)*HW, maxVal, maxIx, 
padh, padw, strideh, stridew, P, Q, R, S, HW, W);
-                       }
-                       else {
-                               //handle empty row
-                               Arrays.fill(maxVal, 0);
-                               for(int p = 0, ix=0; p < P; p++) {
-                                       int h = Math.max(-padh+p*strideh, 0);
-                                       for(int q = 0; q < Q; q++, ix++) {
-                                               int w = 
Math.max(-padw+q*stridew, 0);
-                                               maxIx[ix] = h * W + w;
-                                       }
-                               }
-                       }
-               }
-               
-               protected void maxpoolingBackward(int[] maxIx, int outOffset, 
int n, int c, int C, int Q, int PQ, int CPQ) {
-                       double[] dout = doutput.getDenseBlockValues();
-                       double[] out = output.getDenseBlockValues();
-                       final int doutOffset = n*CPQ + c*PQ;
-                       for( int pq = 0; pq < PQ; pq++ )
-                               out[ outOffset + maxIx[pq] ] += dout[ 
doutOffset + pq ];
-               }
-               
-               private static void update0(int lix, int uix, double[] maxVal, 
int[] maxIx, int padh, int padw, int strideh, int stridew, int P, int Q, int R, 
int S, int HW, int W) {
-                       //TODO exploit constant value and overlap for potential 
early abort
-                       for(int i = lix; i<uix; i++)
-                               update(0, maxVal, maxIx, (i%HW)/W, i%W, padh, 
padw, strideh, stridew, P, Q, R, S, W);
-               }
-               
-               private static void update(double val, double[] maxVal, int[] 
maxIx, int h, int w, int padh, int padw, int strideh, int stridew, int P, int 
Q, int R, int S, int W) {
-                       //determine lower and upper bounds for p and q
-                       //(see fillIndexesArray, solved for p and q, reversed)
-                       int lp = Math.max((h+padh-R+strideh)/strideh, 0);
-                       int up = Math.min((h+padh+strideh)/strideh, P);
-                       int lq = Math.max((w+padw-S+stridew)/stridew, 0);
-                       int uq = Math.min((w+padw+stridew)/stridew, Q);
-                       
-                       //maintain max index for all relevant p and q
-                       int maxIndex = h * W + w;
-                       for(int p = lp; p < up; p++) 
-                               for(int q = lq; q < uq; q++) {
-                                       int ix = p * Q + q;
-                                       if( maxVal[ix] < val ) {
-                                               maxVal[ix] = val;
-                                               maxIx[ix] = maxIndex;
-                                       }
-                               }
-               }
-       }
-       
-       /**
-        * Performs the maxpooling backward operation for sparse input and 
sparse error (dout)
-        */
-       public static class PoolingBackwardSparseSparse extends 
PoolingBackwardSparseDense
-       {
-               public PoolingBackwardSparseSparse(int rl, int ru, 
ConvolutionParameters params, boolean relu) {
-                       super(rl, ru, params, relu, params.input2, 
params.output);
-                       if (output.getDenseBlock() == null )
-                               throw new RuntimeException("Incorrect usage: 
empty outputs");
-                       if (!params.input1.isInSparseFormat() || 
!params.input2.isInSparseFormat())
-                               throw new RuntimeException("Incorrect usage: 
Call optimized versions");
-               }
-               
-               @Override
-               protected void maxpoolingBackward(int[] maxIx, int outOffset, 
int n, int c, int C, int Q, int PQ, int CPQ) {
-                       SparseBlock sblock = doutput.getSparseBlock();
-                       double[] out = output.getDenseBlockValues();
-                       if( sblock.isEmpty(n) )
-                               return;
-                       int apos = sblock.pos(n);
-                       int alen = sblock.size(n);
-                       int[] aix = sblock.indexes(n);
-                       double[] avals = sblock.values(n);
-                       //find channel start and end, w/ robustness for 
non-existing entries
-                       int cpos = (c==0) ? 0 : sblock.posFIndexGTE(n, c*PQ);
-                       int cpos2 = (c+1==C) ? alen : sblock.posFIndexGTE(n, 
(c+1)*PQ);
-                       cpos = (cpos>=0) ? cpos : alen;
-                       cpos2 = (cpos2>=0) ? cpos2 : alen;
-                       for(int j = apos+cpos; j<apos+cpos2; j++) {
-                               int p = (aix[j] % PQ) / Q;
-                               int q = aix[j] % Q;
-                               int pq = p * Q + q;
-                               out[ outOffset + maxIx[pq] ] += avals[j];
-                       }
-               }
-       }
-}

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
deleted file mode 100644
index 0377c50..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.matrix.data;
-
-import java.util.Arrays;
-import java.util.concurrent.Callable;
-
-/**
- * This class contains the set of operators used for performing pooling
- */
-public class LibMatrixDNNPoolingHelper {
-       
-       /**
-        * Performs the dense maxpooling
-        */
-       public static class DenseMaxPooling implements Callable<Long> 
-       {
-               private final int _rl, _ru; 
-               private final ConvolutionParameters _params;
-               
-               public DenseMaxPooling(int rl, int ru, ConvolutionParameters 
params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       final int C = _params.C, P = _params.P, Q = _params.Q;
-                       final int R = _params.R, S = _params.S, H = _params.H, 
W = _params.W;
-                       final int HW = _params.H*_params.W;
-                       final int CHW = _params.C*_params.H*_params.W;
-                       final int CPQ = C*P*Q;
-                       double[] in = _params.input1.getDenseBlockValues();
-                       double[] out = _params.output.getDenseBlockValues();
-                       
-                       double minValForMaxPoolOperations = 
_params.minValForMaxPoolOperations;
-                       
-                       //thread-local initialization of output block 
-                       if( !(_params.isStride1Pad0() && _params.isAllOnes(P, 
Q, W)) )
-                               Arrays.fill(out, _rl*CPQ, _ru*CPQ, 
minValForMaxPoolOperations);
-                       
-                       if( _params.isStride1Pad0() && _params.isAllOnes(P, Q, 
W) ) { 
-                               //quick-path w/o materialized index arrays and 
-                               //simplified inner loops for P = 1, Q = 1, W = 1
-                               int lenh = Math.min(R,H);
-                               for(int i = _rl, oix=_rl*C; i < _ru; i++, 
oix+=C)
-                                       for (int c = 0, off=i*CHW; c < C; c++, 
off+=H)
-                                               out[oix+c] = 
max(minValForMaxPoolOperations, in, off, lenh);
-                       }
-                       else if( _params.isStride1Pad0() ) {
-                               //quick-path w/o materialized index arrays 
-                               for(int i = _rl; i < _ru; i++)
-                                       for (int c = 0, off=i*CHW, oix=i*CPQ; c 
< C; c++, off+=HW)
-                                               for (int p = 0; p < P; p++, 
oix+=Q)
-                                                       for (int h = p; h < 
Math.min(p+R,H); h++)
-                                                               for (int q = 0, 
off2=off+h*W; q < Q; q++)
-                                                                       
out[oix+q] = max(out[oix+q], in, off2+q, Math.min(S,W-q));
-                       }
-                       else { //general case
-                               int[] hl = _params.start_indexes_h, hu = 
_params.end_indexes_h;
-                               int[] wl = _params.start_indexes_w, wu = 
_params.end_indexes_w;
-                               for(int i = _rl; i < _ru; i++)
-                                       for (int c = 0, off=i*CHW, oix=i*CPQ; c 
< C; c++, off+=HW)
-                                               for (int p = 0; p < P; p++, 
oix+=Q)
-                                                       for (int h = hl[p]; h < 
hu[p]; h++)
-                                                               for (int q = 0, 
off2=off+h*W; q < Q; q++)
-                                                                       
out[oix+q] = max(out[oix+q], in, off2+wl[q], wu[q]-wl[q]);
-                       }
-                       
-                       //thread-local recomputation of non-zeros
-                       return _params.output.recomputeNonZeros(_rl, _ru-1);
-               }
-       }
-       
-       /**
-        * Performs the sparse maxpooling
-        */
-       public static class SparseMaxPooling implements Callable<Long> 
-       {
-               private final int _rl, _ru; 
-               private final ConvolutionParameters _params;
-               private double [] outputArray;
-               private final int C, P, Q, W, H, CPQ, PQ;
-               
-               public SparseMaxPooling(int rl, int ru, ConvolutionParameters 
params) {
-                       _rl = rl; _ru = ru;
-                       _params = params;
-                       outputArray = params.output.getDenseBlockValues();
-                       C = params.C; P = params.P; Q = params.Q; H = params.H; 
-                       W = params.W;
-                       CPQ = C*P*Q;
-                       PQ = P*Q;
-               }
-               
-               @Override
-               public Long call() throws Exception {
-                       //thread-local initialization of output block 
-                       Arrays.fill(outputArray, _rl *CPQ, _ru*CPQ, 
_params.minValForMaxPoolOperations);
-                       
-                       for(int n = _rl; n < _ru; n++)  {
-                               if( !_params.input1.sparseBlock.isEmpty(n) ) {
-                                       final int apos = 
_params.input1.sparseBlock.pos(n);
-                                       final int alen = 
_params.input1.sparseBlock.size(n);
-                                       final int [] aix = 
_params.input1.sparseBlock.indexes(n);
-                                       final double [] avals = 
_params.input1.sparseBlock.values(n);
-                                       int chw = 0; int index = apos;
-                                       for (int c = 0; c < C; c++) {
-                                               final int outOffset = n*CPQ + 
c*PQ;
-                                               for(int h = 0; h < H; h++) {
-                                                       for(int w = 0; w < W; 
w++, chw++) {
-                                                               // Take into 
account zero values as well
-                                                               double nchwVal 
= 0;
-                                                               if(aix[index] 
== chw) {
-                                                                       nchwVal 
= avals[index++];
-                                                                       // 
Ensure that we satisfy the condition index < apos+alen
-                                                                       
if(index >= apos+alen) index--;
-                                                               }
-                                                               // Perform 
maxpooling without binary search :)
-                                                               // Tradeoff as 
compared to dense maxpooling: 
-                                                               // In dense 
maxpooling, iteration space CPQHW where H and W iterations are restricted by 
_params.start_indexes_h[p] 
-                                                               // and are 
eligible for JIT optimizations.
-                                                               // In sparse 
maxpooling, iteration space CHWPQ without HW restrictions.
-                                                               for (int p = 0; 
p < P; p++) {
-                                                                       if(h >= 
_params.start_indexes_h[p] && h < _params.end_indexes_h[p]) {
-                                                                               
final int outOffsetWithp = outOffset + p*Q;
-                                                                               
for (int q = 0; q < Q; q++) {
-                                                                               
        if(w >= _params.start_indexes_w[q] && w < _params.end_indexes_w[q]) {
-                                                                               
                outputArray[outOffsetWithp + q] = 
Math.max(outputArray[outOffsetWithp + q], nchwVal);
-                                                                               
        }
-                                                                               
}
-                                                                       }
-                                                               }
-                                                       }
-                                               }
-                                       }
-                               }
-                               else {
-                                       // Empty input image
-                                       Arrays.fill(outputArray, n*CPQ, 
(n+1)*CPQ, 0);
-                               }
-                       }
-                       
-                       //thread-local recomputation of non-zeros
-                       return _params.output.recomputeNonZeros(_rl, _ru-1);
-               }
-       }
-       
-       private static double max(final double aval, double[] b, final int bi, 
final int len) {
-               double ret = aval;
-               for( int i = bi; i < bi+len; i++ )
-                       ret = Math.max(ret, b[i]);
-               return ret;
-       }
-}

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRelu.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRelu.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRelu.java
new file mode 100644
index 0000000..c44a032
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRelu.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.ArrayList;
+import java.util.concurrent.Callable;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.functionobjects.Plus;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+
+/**
+ * This class contains the different implementations of the relu backward operation
+ */
+public class LibMatrixDNNRelu
+{
+       private static BinaryOperator PLUS = new 
BinaryOperator(Plus.getPlusFnObject());
+
+       
+       /**
+        * Factory method that returns list of callable tasks for performing 
relu backward operation
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing relu backward operation
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static ArrayList<Callable<Long>> 
getReluBackwardWorkers(ConvolutionParameters params) throws DMLRuntimeException 
{
+               ArrayList<Callable<Long>> ret = new ArrayList<>();
+               int k = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               int taskSize = (int)(Math.ceil((double)params.N / k));
+               for(int i = 0; i*taskSize < params.N; i++)
+                       ret.add(new ReluBackward(i*taskSize, 
Math.min((i+1)*taskSize, params.N), params));
+               return ret;
+       }
+       
+       /**
+        * Performs the operation: (X gt 0) * dout
+        */
+       public static class ReluBackward implements Callable<Long> 
+       {
+               public int _rl; public int _ru; 
+               private final ConvolutionParameters _params; 
+               double [] outputArray; int numOutCols;
+               public ReluBackward(int rl, int ru, ConvolutionParameters 
params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       outputArray= params.output.getDenseBlockValues();
+                       numOutCols = params.input1.getNumColumns();
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       if(!_params.input1.isInSparseFormat() && 
!_params.input2.isInSparseFormat()) {
+                               double [] inputArr = 
_params.input1.getDenseBlockValues();
+                               double [] doutArr = 
_params.input2.getDenseBlockValues();
+                               for(int i = _rl*numOutCols; i < _ru*numOutCols; 
i++) {
+                                       outputArray[i] = inputArr[i] > 0 ? 
doutArr[i] : 0;
+                               }
+                       }
+                       else {
+                               // Perform (X > 0)
+                               
ConvolutionUtils.scalarOperations(_params.input1, outputArray, _rl*numOutCols, 
numOutCols, _rl, _ru, 
+                                       
InstructionUtils.parseScalarBinaryOperator(">", false, 0));
+                               // Then perform (X > 0) * dout
+                               
ConvolutionUtils.binaryOperationInPlace(_params.input2, outputArray, 
_rl*numOutCols, numOutCols, _rl, _ru, PLUS);
+                       }
+                       return 0L;
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180.java
new file mode 100644
index 0000000..b463794
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNHelper.CellIndex3;
+
+/**
+ * This class contains the different implementation of rotate180 operation
+ */
+public class LibMatrixDNNRotate180
+{
+       public static interface Rotate180Worker {
+               public void execute(int inputN, int outputN);
+               public static Rotate180Worker getWorker(MatrixBlock in, 
MatrixBlock out,
+                       ConvolutionParameters params, boolean 
zeroOutSparseOutput, boolean trans) {
+                       if(!in.isInSparseFormat()) 
+                               return new DenseRotate180Worker(in, 
out.getDenseBlockValues(), params);
+                       else
+                               return new SparseRotate180Worker(in, out, 
params, trans);
+               }
+       }
+       
+       /**
+        * Performing dense rotate180 (general case)
+        */
+       private static class DenseRotate180Worker implements Rotate180Worker {
+               private final double[] inputArray, outputArray;
+               private final ConvolutionParameters params;
+               public DenseRotate180Worker(MatrixBlock input, double[] 
outputArray,  ConvolutionParameters params) {
+                       this.outputArray = outputArray;
+                       this.params = params;
+                       inputArray = input.getDenseBlockValues();
+                       if(inputArray == null || outputArray == null)
+                               throw new RuntimeException("Incorrect usage: 
empty inputs");
+               }
+               
+               @Override
+               public void execute(int inputN, int outputN) {
+                       int outputOffset = outputN*params.K*params.P*params.Q;
+                       for (int k = 0; k < params.K; k++) {
+                               for (int p = 0; p < params.P; p++) {
+                                       for (int q = 0; q < params.Q; q++) {
+                                               outputArray[outputOffset + 
p*params.Q*params.K + q*params.K + k] = 
+                                                               
inputArray[inputN*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q 
+ q];
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * Performing rotate180 when input is sparse (general case)
+        * 
+        * Why are we allocating the output of rotate180 in dense format ? 
+        * Because the number of rows of output (i.e. NPQ) is much larger than 
number of columns (i.e. K) 
+        */
+       private static class SparseRotate180Worker implements Rotate180Worker {
+               private final MatrixBlock in, out;
+               private final ConvolutionParameters params;
+               private final boolean trans;
+               
+               public SparseRotate180Worker(MatrixBlock input, MatrixBlock 
output,
+                       ConvolutionParameters params, boolean trans) {
+                       this.in = input;
+                       this.out = output;
+                       this.params = params;
+                       this.trans = trans;
+               }
+               
+               @Override
+               public void execute(int inputN, int outputN) {
+                       out.reset();
+                       
+                       SparseBlock sblock = in.sparseBlock;
+                       if( sblock==null || sblock.isEmpty(inputN) )
+                               return;
+                       
+                       CellIndex3 ix = new CellIndex3();
+                       int outputOffset = outputN*params.P*params.Q;
+                       int apos = sblock.pos(inputN);
+                       int alen = sblock.size(inputN);
+                       int[] aix = sblock.indexes(inputN);
+                       double[] avals = sblock.values(inputN);
+                       for(int j = apos; j < apos+alen; j++) {
+                               ix = 
LibMatrixDNNHelper.computeTensorIndexes(aix[j], params.P, params.Q, ix);
+                               if( trans )
+                                       out.appendValue(ix.ix1, outputOffset + 
ix.ix2*params.Q + ix.ix3, avals[j]);
+                               else
+                                       out.appendValue(outputOffset + 
ix.ix2*params.Q + ix.ix3, ix.ix1, avals[j]);
+                       }
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
deleted file mode 100644
index 74e2baa..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.matrix.data;
-
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNNHelper.CellIndex3;
-
-/**
- * This class contains the different implementation of rotate180 operation
- */
-public class LibMatrixDNNRotate180Helper {
-
-       static interface Rotate180Worker {
-               public void execute(int inputN, int outputN);
-               public static Rotate180Worker getWorker(MatrixBlock in, 
MatrixBlock out, 
-                       ConvolutionParameters params, boolean 
zeroOutSparseOutput, boolean trans) {
-                       if(!in.isInSparseFormat()) 
-                               return new DenseRotate180Worker(in, 
out.getDenseBlockValues(), params);
-                       else
-                               return new SparseRotate180Worker(in, out, 
params, trans);
-               }
-       }
-       
-       /**
-        * Performing dense rotate180 (general case)
-        */
-       static class DenseRotate180Worker implements Rotate180Worker {
-
-               double [] inputArray; double [] outputArray;  
-               ConvolutionParameters params;
-               public DenseRotate180Worker(MatrixBlock input, double [] 
outputArray,  ConvolutionParameters params) {
-                       this.outputArray = outputArray;
-                       this.params = params;
-                       inputArray = input.getDenseBlockValues();
-                       if(inputArray == null || outputArray == null)
-                               throw new RuntimeException("Incorrect usage: 
empty inputs");
-               }
-               
-               @Override
-               public void execute(int inputN, int outputN) {
-                       int outputOffset = outputN*params.K*params.P*params.Q;
-                       for (int k = 0; k < params.K; k++) {
-                               for (int p = 0; p < params.P; p++) {
-                                       for (int q = 0; q < params.Q; q++) {
-                                               outputArray[outputOffset + 
p*params.Q*params.K + q*params.K + k] = 
-                                                               
inputArray[inputN*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q 
+ q];
-                                       }
-                               }
-                       }
-               }
-       }
-       
-       /**
-        * Performing rotate180 when input is sparse (general case)
-        * 
-        * Why are we allocating the output of rotate180 in dense format ? 
-        * Because the number of rows of output (i.e. NPQ) is much larger than 
number of columns (i.e. K) 
-        */
-       static class SparseRotate180Worker implements Rotate180Worker {
-               private final MatrixBlock in, out;
-               private final ConvolutionParameters params;
-               private final boolean trans;
-               
-               public SparseRotate180Worker(MatrixBlock input, MatrixBlock 
output, 
-                       ConvolutionParameters params, boolean trans) {
-                       this.in = input;
-                       this.out = output;
-                       this.params = params;
-                       this.trans = trans;
-               }
-               
-               @Override
-               public void execute(int inputN, int outputN) {
-                       out.reset();
-                       
-                       SparseBlock sblock = in.sparseBlock;
-                       if( sblock==null || sblock.isEmpty(inputN) )
-                               return;
-                       
-                       CellIndex3 ix = new CellIndex3();
-                       int outputOffset = outputN*params.P*params.Q;
-                       int apos = sblock.pos(inputN);
-                       int alen = sblock.size(inputN);
-                       int[] aix = sblock.indexes(inputN);
-                       double[] avals = sblock.values(inputN);
-                       for(int j = apos; j < apos+alen; j++) {
-                               ix = 
LibMatrixDNNHelper.computeTensorIndexes(aix[j], params.P, params.Q, ix);
-                               if( trans )
-                                       out.appendValue(ix.ix1, outputOffset + 
ix.ix2*params.Q + ix.ix3, avals[j]);
-                               else
-                                       out.appendValue(outputOffset + 
ix.ix2*params.Q + ix.ix3, ix.ix1, avals[j]);
-                       }
-               }
-       }
-}

http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 36d3dc2..6119e95 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -1369,7 +1369,7 @@ public class LibMatrixMult
                        int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
                        k2 = (k2>=0) ? apos+k2 : apos+alen;
                        
-                       if( b.isContiguous(aix[k1], aix[k2-1]) ) {
+                       if( k1==k2 || b.isContiguous(aix[k1], aix[k2-1]) ) {
                                double[] bvals = b.values(aix[k1]);
                                int base = aix[k1]*n - b.pos(aix[k1]);
                                //rest not aligned to blocks of 4 rows

Reply via email to