Repository: incubator-systemml
Updated Branches:
  refs/heads/master 4cd982917 -> 2b5b12557


[SYSTEMML-540] Added fused relu_maxpooling

- Fused relu_maxpooling reduces the unnecessary dense-to-sparse-to-dense
  conversion. This operator makes relu a "no op".
- Note: fused relu_maxpooling is only supported in CP, not on GPU, since on
  the GPU relu and maxpooling each invoke dedicated CuDNN kernels and are
  therefore not fused.
- Also, improved the performance of maxpooling by computing the pooling-window
  start/end indexes a priori.

Closes #329.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/2b5b1255
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/2b5b1255
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/2b5b1255

Branch: refs/heads/master
Commit: 2b5b12557556d95c499730ae105807d996ad7a47
Parents: 4cd9829
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Fri Jan 6 10:23:47 2017 -0800
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Fri Jan 6 10:23:47 2017 -0800

----------------------------------------------------------------------
 .../org/apache/sysml/hops/ConvolutionOp.java    |  15 ++-
 .../apache/sysml/lops/ConvolutionTransform.java |   5 +-
 .../instructions/CPInstructionParser.java       |   1 +
 .../cp/ConvolutionCPInstruction.java            |   7 +-
 .../matrix/data/ConvolutionParameters.java      |   5 +-
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 124 +++++++++++--------
 6 files changed, 93 insertions(+), 64 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2b5b1255/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java 
b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index f1efbb1..3f9ca7e 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -25,6 +25,7 @@ import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.Hop.MultiThreadedHop;
 import org.apache.sysml.lops.ConvolutionTransform;
+import org.apache.sysml.lops.ConvolutionTransform.OperationTypes;
 import org.apache.sysml.lops.Lop;
 import org.apache.sysml.lops.LopsException;
 import org.apache.sysml.lops.LopProperties.ExecType;
@@ -136,10 +137,18 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                        throw new HopsException("Incorrect number of inputs for 
" + op.name());
                }
                
-               Lop in = inputs.get(0).constructLops();
+               Lop in = null;
+               OperationTypes lopOp = HopsConv2Lops.get(op);
                int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-               ConvolutionTransform transform1 = new ConvolutionTransform( in, 
-                               HopsConv2Lops.get(op), getDataType(), 
getValueType(), et, k);
+               if(op == ConvOp.MAX_POOLING && et == ExecType.CP && 
inputs.get(0) instanceof UnaryOp
+                               && ((UnaryOp) inputs.get(0)).getOp() == 
OpOp1.SELP) {
+                       in = inputs.get(0).getInput().get(0).constructLops();
+                       lopOp = OperationTypes.RELU_MAX_POOLING;
+               }
+               else {
+                       in = inputs.get(0).constructLops();
+               }
+               ConvolutionTransform transform1 = new ConvolutionTransform( in, 
lopOp, getDataType(), getValueType(), et, k);
                setOutputDimensions(transform1);
                setLineNumbers(transform1);
                in.addOutput(transform1);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2b5b1255/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java 
b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index d4821ac..d69bd93 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -30,7 +30,7 @@ public class ConvolutionTransform extends Lop
 
        
        public enum OperationTypes {
-               MAX_POOLING, MAX_POOLING_BACKWARD,
+               MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING,
                DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, 
DIRECT_CONV2D_BACKWARD_DATA,
                BIAS_ADD
        };
@@ -99,6 +99,9 @@ public class ConvolutionTransform extends Lop
                case MAX_POOLING:
                        return "maxpooling";
                        
+               case RELU_MAX_POOLING:
+                       return "relu_maxpooling";
+                       
                case MAX_POOLING_BACKWARD:
                        return "maxpooling_backward";
                

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2b5b1255/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index 344cde4..18e3a48 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -218,6 +218,7 @@ public class CPInstructionParser extends InstructionParser
                String2CPInstructionType.put( "rsort"      , 
CPINSTRUCTION_TYPE.Reorg);
 
                // Opcodes related to convolutions
+               String2CPInstructionType.put( "relu_maxpooling"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "maxpooling"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "maxpooling_backward"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "conv2d"      , 
CPINSTRUCTION_TYPE.Convolution);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2b5b1255/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index e0238aa..56f1460 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -20,6 +20,7 @@
 package org.apache.sysml.runtime.instructions.cp;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
@@ -89,7 +90,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
 
                String[] parts = 
InstructionUtils.getInstructionPartsWithValueType(str);
                String opcode = parts[0];
-               if (opcode.equalsIgnoreCase("maxpooling")) {
+               if (opcode.equalsIgnoreCase("maxpooling") || 
opcode.equalsIgnoreCase("relu_maxpooling")) {
                        InstructionUtils.checkNumFields(parts, 15);
                        // stride1, stride2, padding1, padding2
                        // input_shape1, input_shape2, input_shape3, 
input_shape4,
@@ -231,12 +232,14 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
                
                ConvolutionParameters params = new ConvolutionParameters(N, C, 
H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, _numThreads);
-               if (instOpcode.equalsIgnoreCase("maxpooling")) {
+               if (instOpcode.equalsIgnoreCase("maxpooling") || 
instOpcode.equalsIgnoreCase("relu_maxpooling")) {
                        if(matBlock.isEmptyBlock()) {
                                outputBlock = new MatrixBlock(N, C*P*Q, true, 
0);
                        }
                        else {
                                outputBlock = getDenseOutputBlock(ec, N, C*P*Q);
+                               if(instOpcode.equalsIgnoreCase("maxpooling"))
+                                       
Arrays.fill(outputBlock.getDenseBlock(), -Double.MAX_VALUE);
                                LibMatrixDNN.maxpooling(matBlock, outputBlock, 
params);
                        }
                }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2b5b1255/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index 27fcf87..cd37c06 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -19,8 +19,6 @@
 
 package org.apache.sysml.runtime.matrix.data;
 
-import java.util.concurrent.atomic.AtomicLong;
-
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 
@@ -33,10 +31,11 @@ public class ConvolutionParameters {
        public int K; public int R; public int S; public int stride_h; public 
int stride_w; public int pad_h; public int pad_w;
        public int P; public int Q; public int numThreads;
        
-       public AtomicLong outputNNZ = new AtomicLong(-1);
        
        MatrixBlock input1; MatrixBlock input2; MatrixBlock output;
        
+       public int [] start_indexes_h, end_indexes_h, start_indexes_w, 
end_indexes_w; 
+       
        private int convertToInt(long val) throws DMLRuntimeException {
                if( val > Integer.MAX_VALUE ) {
                        throw new DMLRuntimeException("The value for 
ConvolutionParameters is too large:" + val);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2b5b1255/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 62b1513..1400b31 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -379,9 +379,31 @@ public class LibMatrixDNN {
                if (params.output.isInSparseFormat())
                        throw new DMLRuntimeException("Sparse 
maxpooling_backward is not supported");
 
+               fillIndexesArray(params);
                runConvTask(TaskType.MaxPooling_Backward, params);
        }
        
+       private static void fillIndexesArray(ConvolutionParameters params) {
+               params.start_indexes_h = new int[params.P];
+               params.end_indexes_h = new int[params.P];
+               params.start_indexes_w = new int[params.Q];
+               params.end_indexes_w = new int[params.Q];
+               for (int p = 0; p < params.P; p++) {
+                       int start_index_h = p * params.stride_h - params.pad_h;
+                       final int end_index_h = Math.min(start_index_h + 
params.R, params.H);
+                       start_index_h = Math.max(start_index_h, 0);
+                       params.start_indexes_h[p] = start_index_h;
+                       params.end_indexes_h[p] = end_index_h;
+               }
+               for (int q = 0; q < params.Q; q++) {
+                       int start_index_w = Math.max(q * params.stride_w - 
params.pad_w, 0);
+                       int end_index_w = Math.min(start_index_w + params.S, 
params.W);
+                       start_index_w = Math.max(start_index_w, 0);
+                       params.start_indexes_w[q] = start_index_w;
+                       params.end_indexes_w[q] = end_index_w;
+               }
+       }
+       
        private static void doPoolingBackward(int n, ConvolutionParameters 
params) throws DMLRuntimeException {
                double [] inputArray = null;
                if (!params.input1.isInSparseFormat())
@@ -419,10 +441,7 @@ public class LibMatrixDNN {
                                        double inVal = 
doutArray[n*params.C*params.P*params.Q + c*params.P*params.Q +  p * params.Q + 
q];
                                        if(inVal != 0) {
                                                final int inputOffset = 
n*params.C*params.H*params.W + c*params.H*params.W;
-                                               int start_index_h = p * 
params.stride_h - params.pad_h;
-                                               final int end_index_h = 
Math.min(start_index_h + params.R, params.H);
-                                               start_index_h = 
Math.max(start_index_h, 0);
-                                               int maxIndex = 
getMaxIndexSparse(start_index_h, end_index_h, q, inputOffset, n, c, 
params.input1, params);
+                                               int maxIndex = 
getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
                                                outputArray[maxIndex] += inVal;
                                        }
                                }
@@ -446,10 +465,7 @@ public class LibMatrixDNN {
                        int q = tensorIndexes[2];
                        
                        final int inputOffset = n*params.C*params.H*params.W + 
c*params.H*params.W;
-                       int start_index_h = p * params.stride_h - params.pad_h;
-                       final int end_index_h = Math.min(start_index_h + 
params.R, params.H);
-                       start_index_h = Math.max(start_index_h, 0);
-                       int maxIndex = getMaxIndexSparse(start_index_h, 
end_index_h, q, inputOffset, n, c, params.input1, params);
+                       int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, 
c, params.input1, params);
                        outputArray[maxIndex] += ijv.getV();
                }
                
@@ -469,10 +485,7 @@ public class LibMatrixDNN {
                        int q = tensorIndexes[2];
                        
                        final int inputOffset = n*params.C*params.H*params.W + 
c*params.H*params.W;
-                       int start_index_h = p * params.stride_h - params.pad_h;
-                       final int end_index_h = Math.min(start_index_h + 
params.R, params.H);
-                       start_index_h = Math.max(start_index_h, 0);
-                       int maxIndex = getMaxIndex(start_index_h, end_index_h, 
q, inputOffset, inputArray, params);
+                       int maxIndex = getMaxIndex(p, q, inputOffset, 
inputArray, params);
                        outputArray[maxIndex] += ijv.getV();
                }
        }
@@ -484,20 +497,15 @@ public class LibMatrixDNN {
                        final int outputOffset = n*params.C*params.P*params.Q + 
c*params.P*params.Q;
                        
                        for (int p = 0; p < params.P; p++) {
-                               int start_index_h = p * params.stride_h - 
params.pad_h;
-                               final int end_index_h = Math.min(start_index_h 
+ params.R, params.H);
-                               start_index_h = Math.max(start_index_h, 0);
-                               
                                for (int q = 0; q < params.Q; q++) {
-                                       int maxIndex = 
getMaxIndex(start_index_h, end_index_h, q, inputOffset, inputArray, params);
+                                       int maxIndex = getMaxIndex(p, q, 
inputOffset, inputArray, params);
                                        outputArray[maxIndex] += 
doutArray[outputOffset +  p * params.Q + q];
                                }
                        }
                }
        }
        
-       private static int getMaxIndexSparse(int start_index_h, int 
end_index_h, 
-                       int q, int inputOffset, int n, int c, MatrixBlock 
input, ConvolutionParameters params) throws DMLRuntimeException {
+       private static int getMaxIndexSparse(int p, int q, int inputOffset, int 
n, int c, MatrixBlock input, ConvolutionParameters params) throws 
DMLRuntimeException {
                if(!input.isInSparseFormat())
                        throw new DMLRuntimeException("Incorrect usage: Only 
sparse format supported");
                
@@ -505,9 +513,10 @@ public class LibMatrixDNN {
                Iterator<IJV> iter = input.sparseBlock.getIterator(n, n+1);
                int [] tensorIndexes = new int[3];
                
-               int start_index_w = Math.max(q * params.stride_w - 
params.pad_w, 0);
-               int end_index_w = Math.min(start_index_w + params.S, params.W);
-               start_index_w = Math.max(start_index_w, 0);
+               int start_index_h = params.start_indexes_h[p];
+               int end_index_h = params.end_indexes_h[p];
+               int start_index_w = params.start_indexes_w[q];
+               int end_index_w = params.end_indexes_w[q];
                
                int maxIndex = inputOffset +  start_index_h*params.W + 
start_index_w; 
                double maxVal = -Double.MAX_VALUE;
@@ -532,11 +541,11 @@ public class LibMatrixDNN {
                return maxIndex;
        }
        
-       private static int getMaxIndex(int start_index_h, int end_index_h, 
-                       int q, int inputOffset, double [] inputArray, 
ConvolutionParameters params) {
-               int start_index_w = q * params.stride_w - params.pad_w;
-               int end_index_w = Math.min(start_index_w + params.S, params.W);
-               start_index_w = Math.max(start_index_w, 0);
+       private static int getMaxIndex(int p, int q, int inputOffset, double [] 
inputArray, ConvolutionParameters params) {
+               int start_index_h = params.start_indexes_h[p];
+               int end_index_h = params.end_indexes_h[p];
+               int start_index_w = params.start_indexes_w[q];
+               int end_index_w = params.end_indexes_w[q];
                
                int maxIndex = inputOffset +  start_index_h*params.W + 
start_index_w; 
                double maxVal = -Double.MAX_VALUE;
@@ -619,12 +628,11 @@ public class LibMatrixDNN {
                        throw new DMLRuntimeException("Incorrect input 
dimensions in maxpooling:" + input.getNumRows() + " " + input.getNumColumns() + 
" " + params.N + " " + params.K*params.P*params.Q);
                }
                
-               params.outputNNZ.set(0);
+               fillIndexesArray(params);
                runConvTask(TaskType.MaxPooling_Forward, params);
-               outputBlock.setNonZeros(params.outputNNZ.get());
        }
 
-       private static void doPooling(int n, int c, ConvolutionParameters 
params) throws DMLRuntimeException {
+       private static void doPooling(int n, ConvolutionParameters params) 
throws DMLRuntimeException {
                double [] inputArray = null;
                if (!params.input1.isInSparseFormat())
                        inputArray = params.input1.getDenseBlock();
@@ -634,32 +642,40 @@ public class LibMatrixDNN {
                else
                        throw new DMLRuntimeException("Expected the output to 
be allocated in dense format");
                
-               long tmpNNZ = 0;
-               for (int p = 0; p < params.P; p++) {
-                       for (int q = 0; q < params.Q; q++) {
-                               int start_index_h = p * params.stride_h - 
params.pad_h;
-                               int start_index_w = q * params.stride_w - 
params.pad_w;
-                               int end_index_h = Math.min(start_index_h + 
params.R, params.H);
-                               int end_index_w = Math.min(start_index_w + 
params.S, params.W);
-                               start_index_h = Math.max(start_index_h, 0);
-                               start_index_w = Math.max(start_index_w, 0);
-                               int out_index = n*params.C*params.P*params.Q + 
c*params.P*params.Q +  p * params.Q + q;
-                               outputArray[out_index] = -Double.MAX_VALUE;
-                               for (int h = start_index_h; h < end_index_h; 
h++) {
-                                       for (int w = start_index_w; w < 
end_index_w; w++) {
-                                               double inVal = -1;
-                                               if(inputArray != null)
-                                                       inVal = 
inputArray[n*params.C*params.H*params.W + c*params.H*params.W +  h*params.W + 
w];
-                                               else
-                                                       inVal = 
params.input1.quickGetValue(n, c*params.H*params.W +  h*params.W + w);
-                                               outputArray[out_index] = 
Math.max(outputArray[out_index], inVal);
-                                               if(outputArray[out_index] != 0)
-                                                       tmpNNZ++;
+               final int inOffset = n*params.C*params.H*params.W;
+               int out_index = n*params.C*params.P*params.Q;
+               final int HW = params.H*params.W;
+               
+               if(inputArray != null) {
+                       for (int c = 0; c < params.C; c++) {
+                               final int inOffset1 = inOffset + c*HW;
+                               for (int p = 0; p < params.P; p++) {
+                                       for (int q = 0; q < params.Q; q++, 
out_index++) {
+                                               for (int h = 
params.start_indexes_h[p]; h < params.end_indexes_h[p]; h++) {
+                                                       for (int w = 
params.start_indexes_w[q]; w < params.end_indexes_w[q]; w++) {
+                                                               
outputArray[out_index] = Math.max(outputArray[out_index], inputArray[inOffset1 
+  h*params.W + w]);
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+               else {
+                       // TODO: Optimize sparse maxpooling
+                       // Low priority after adding fused relu_maxpooling 
operator as output of conv2d expected to be dense
+                       for (int c = 0; c < params.C; c++) {
+                               for (int p = 0; p < params.P; p++) {
+                                       for (int q = 0; q < params.Q; q++, 
out_index++) {
+                                               for (int h = 
params.start_indexes_h[p]; h < params.end_indexes_h[p]; h++) {
+                                                       for (int w = 
params.start_indexes_w[q]; w < params.end_indexes_w[q]; w++) {
+                                                               double inVal = 
params.input1.quickGetValue(n, c*HW +  h*params.W + w);
+                                                               
outputArray[out_index] = Math.max(outputArray[out_index], inVal);
+                                                       }
+                                               }
                                        }
                                }
                        }
                }
-               params.outputNNZ.addAndGet(tmpNNZ);
        }
        
        private static void doRotate180(int inputN, int outputN, MatrixBlock 
input, 
@@ -818,9 +834,7 @@ public class LibMatrixDNN {
                                case MaxPooling_Forward:
                                {
                                        for(int n = n1; n < n2; n++) {
-                                               for (int c = 0; c < params.C; 
c++) {
-                                                       doPooling(n, c, params);
-                                               }
+                                               doPooling(n, params);
                                        }
                                        break;
                                }

Reply via email to