[SYSTEMML-540] Avoid redundant computation of cudnnPoolingForward in max_pool_backward

- If max_pool is invoked in the forward pass, its output can be reused by
  max_pool_backward instead of calling cudnnPoolingForward again. For the
  sentence CNN with 2 epochs, this reduces the time spent in
  max_pool_backward from 6.361 to 2.966 seconds.
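
- For reference, the pattern that benefits is a DML script in which max_pool
  and max_pool_backward are invoked with identical parameters on the same
  input, as in the new test below (matrix names are placeholders):

      tmp = max_pool(image, padding=[padH, padW], stride=[strideH, strideW],
                     input_shape=[N,C,H,W], pool_size=[R,S])
      print(sum(tmp))
      O = max_pool_backward(image, dout, padding=[padH, padW],
                            stride=[strideH, strideW],
                            input_shape=[N,C,H,W], pool_size=[R,S])

  With this change, the GPU backend feeds the forward output (tmp) into
  max_pool_backward as an additional input instead of recomputing it via
  cudnnPoolingForward.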

Closes #691.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/06d5bb07
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/06d5bb07
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/06d5bb07

Branch: refs/heads/master
Commit: 06d5bb073792345f7c4b7ecd0fb4454a335cc421
Parents: 118e3c0
Author: Niketan Pansare <[email protected]>
Authored: Sat Oct 28 13:44:37 2017 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Sat Oct 28 13:45:52 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/ConvolutionOp.java    | 163 +++++++++++++------
 .../gpu/ConvolutionGPUInstruction.java          |  43 ++++-
 .../runtime/matrix/data/LibMatrixCuDNN.java     |  51 +++---
 .../sysml/test/gpu/NeuralNetworkOpTests.java    |  82 ++++++++++
 4 files changed, 260 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 50a7ca3..16a8b63 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -47,14 +47,23 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
        private static final boolean THROW_ERROR_IF_INFERRED_SHAPE_MISMATCH = true;
        // -------------------------------------------------------------------------
        
+       // Specifies the type of this hop
        private Hop.ConvOp op;
-
        private int _maxNumThreads = -1; //-1 for unlimited
 
        private ConvolutionOp() {
                //default constructor for clone
        }
 
+       /**
+        * Create a hop from the builtin expression
+        * 
+        * @param l name of the hop
+        * @param dt datatype (only supports matrix datatype)
+        * @param vt valuetype  (only supports matrix valuetype) 
+        * @param o type of this hop
+        * @param inp input hops
+        */
        public ConvolutionOp(String l, DataType dt, ValueType vt, ConvOp o, ArrayList<Hop> inp) 
        {
                super(l, dt, vt);
@@ -75,8 +84,7 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
                HopsException.check(_input.size() >= 1, this, "should have at least one input but has %d inputs", _input.size());
        }
 
-       public ConvOp getOp()
-       {
+       public ConvOp getOp() {
                return op;
        }
        
@@ -163,77 +171,129 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
                return input instanceof ConvolutionOp && ((ConvolutionOp) input).getOp() == ConvOp.DIRECT_CONV2D;
        }
        
+       /**
+        * Compares the input parameters for max_pool/max_pool_backward operations
+        * 
+        * @return true if the following parameters match: stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, poolSize2]
+        */
+       private static boolean isPoolingParametersEqualAndKnown(ConvolutionParameters param1, ConvolutionParameters param2) {
+               return isEqualAndKnown(param1.stride_h, param2.stride_h) && isEqualAndKnown(param1.stride_w, param2.stride_w) && 
+                       isEqualAndKnown(param1.pad_h, param2.pad_h) && isEqualAndKnown(param1.pad_w, param2.pad_w) &&
+                       isEqualAndKnown(param1.R, param2.R) && isEqualAndKnown(param1.S, param2.S) &&
+                       isEqualAndKnown(param1.N, param2.N) && isEqualAndKnown(param1.C, param2.C) &&
+                       isEqualAndKnown(param1.H, param2.H) && isEqualAndKnown(param1.W, param2.W);
+       }
+       
+       private static boolean isEqualAndKnown(int val1, int val2) {
+               return val1 >= 0 && val2 >= 0 && val1 == val2;
+       }
+       
+       /**
+        * Returns the output lop of the maxpool operation with the same parameters as this hop.
+        * If no corresponding output lop is found, or if this is not a max_pool_backward operation, this function returns null.
+        * 
+        * @return output lop of the maxpool operation with the same parameters as this hop
+        * @throws HopsException if error 
+        * @throws LopsException if error
+        */
+       private Lop getMaxPoolOutputLop() throws HopsException, LopsException {
+               if(op != ConvOp.MAX_POOLING_BACKWARD)
+                       return null;
+               
+               Hop inputImage = getInput().get(0);
+               for(Hop tmpParent : inputImage.getParent()) {
+                       if(!(tmpParent instanceof ConvolutionOp))
+                               continue;
+                       ConvolutionOp parent = (ConvolutionOp) tmpParent;
+                       if(parent.getOp() == ConvOp.MAX_POOLING && isPoolingParametersEqualAndKnown(parent._cachedParams, _cachedParams)) {
+                               return parent.constructLops();
+                       }
+               }
+               return null;
+       }
+       
        public Lop constructConvolutionLops(ExecType et, ArrayList<Hop> inputs) throws HopsException, LopsException {
                if(inputs.size() != getNumExpectedInputs()) 
                        throw new HopsException("Incorrect number of inputs for " + op.name());
                
-               Lop in = null; Lop in2 = null;
-               ArrayList<Hop> inputs1 = inputs;
-               int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
+               // ---------------------------------------------------------------
+               // Deal with fused operators and construct lhsInputLop/optionalRhsInputLop
+               Lop lhsInputLop = null; Lop optionalRhsInputLop = null;
+               ArrayList<Hop> inputsOfPotentiallyFusedOp = inputs;
                OperationTypes lopOp = HopsConv2Lops.get(op);
-
+               
                // RELU_MAX_POOLING and RELU_MAX_POOLING_BACKWARD are extremely useful for the CP backend 
                // by reducing unnecessary sparse-to-dense-to-sparse conversion.
                // For other backends, these operators are not necessary as they only save an additional relu operator.
                if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == ConvOp.MAX_POOLING && isInputReLU(inputs.get(0))) {
-                       in = inputs.get(0).getInput().get(0).constructLops();
+                       lhsInputLop = inputs.get(0).getInput().get(0).constructLops();
                        lopOp = OperationTypes.RELU_MAX_POOLING;
                }
                else if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == ConvOp.MAX_POOLING_BACKWARD && isInputReLU(inputs.get(0))) {
-                       in = inputs.get(0).getInput().get(0).constructLops();
+                       lhsInputLop = inputs.get(0).getInput().get(0).constructLops();
                        lopOp = OperationTypes.RELU_MAX_POOLING_BACKWARD;
                }
                else if(OptimizerUtils.ALLOW_OPERATOR_FUSION && op == ConvOp.BIAS_ADD && isInputConv2d(inputs.get(0))) {
                        lopOp = OperationTypes.DIRECT_CONV2D_BIAS_ADD;
                        
                        // the first lop is image 
-                       in = inputs.get(0).getInput().get(0).constructLops();
+                       lhsInputLop = inputs.get(0).getInput().get(0).constructLops();
                        // the second lop is bias
-                       in2 = inputs.get(1).constructLops();
+                       optionalRhsInputLop = inputs.get(1).constructLops();
                        
                        // Use the inputs from conv2d rather than bias_add
-                       inputs1 = inputs.get(0).getInput();
+                       inputsOfPotentiallyFusedOp = inputs.get(0).getInput();
                }
                else {
-                       in = inputs.get(0).constructLops();
+                       lhsInputLop = inputs.get(0).constructLops();
                }
+               // ---------------------------------------------------------------
                
-//             // TODO: Inserting reblock requires knowing columns apriori
-//             ConvolutionTransform transform1 = new ConvolutionTransform(addReblockIfNecessary(et, lopOp, in), lopOp, getDataType(), getValueType(), et, k);
-//             setReblockedOutputDimension(et, transform1);
-               double cpIntermediateMemEstimate = computeIntermediateMemEstimate(-1, -1, -1 );
+               // ---------------------------------------------------------------
+               // Compute intermediate memory budget that can be passed to GPU operators 
+               // for better CuDNN operator selection at runtime
+               double intermediateMemEstimate = computeIntermediateMemEstimate(-1, -1, -1 );
                if(et == ExecType.GPU && _dim1 > 0 && _dim2 > 0) {
                        // This enables us to compile a more efficient matrix-matrix CuDNN operation instead of 
                        // row-by-row invocation of multiple vector-matrix CuDNN operations.
                        // This is possible as the operations on GPU are single-threaded
                        double optimisticIntermediateMemEstimate = GPUContextPool.initialGPUMemBudget() - getOutputMemEstimate() - inputs.get(0).getOutputMemEstimate();
-                       if(in2 != null) {
+                       if(optionalRhsInputLop != null) {
                                optimisticIntermediateMemEstimate -= inputs.get(1).getOutputMemEstimate();
                        }
-                       cpIntermediateMemEstimate = Math.max(cpIntermediateMemEstimate, optimisticIntermediateMemEstimate);
+                       intermediateMemEstimate = Math.max(intermediateMemEstimate, optimisticIntermediateMemEstimate);
                }
-               ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k, cpIntermediateMemEstimate);
-               setOutputDimensions(transform1);
+               // ---------------------------------------------------------------
                
-               setLineNumbers(transform1);
-               in.addOutput(transform1);
+               // Construct the lop
+               ConvolutionTransform convolutionLop = new ConvolutionTransform(lhsInputLop, lopOp, 
+                               getDataType(), getValueType(), et, OptimizerUtils.getConstrainedNumThreads(_maxNumThreads), intermediateMemEstimate);
                
-               if(in2 != null) {
-                       transform1.addInput(in2);
-                       in2.addOutput(transform1);
-               }
+               // Propagate the output dimensions and the line number of ConvolutionOp to ConvolutionTransform
+               setOutputDimensions(convolutionLop);
+               setLineNumbers(convolutionLop);
                
-               // stride1, stride2, padding1, padding2  
-               // input_shape1, input_shape2, input_shape3, input_shape4, 
-               // filter_shape1, filter_shape2, filter_shape3, filter_shape4
-               for( int i=1; i < inputs1.size(); i++ )
-               {
-                       Lop ltmp = inputs1.get(i).constructLops();
-                       transform1.addInput(ltmp);
-                       ltmp.addOutput(transform1);
+               // ---------------------------------------------------------------
+               // Add input/output for parent lops of convolutionLop
+               lhsInputLop.addOutput(convolutionLop);
+               if(optionalRhsInputLop != null) {
+                       convolutionLop.addInput(optionalRhsInputLop);
+                       optionalRhsInputLop.addOutput(convolutionLop);
+               }
+               for( int i=1; i < inputsOfPotentiallyFusedOp.size(); i++ ) {
+                       Lop ltmp = inputsOfPotentiallyFusedOp.get(i).constructLops();
+                       convolutionLop.addInput(ltmp);
+                       ltmp.addOutput(convolutionLop);
                }
-               transform1.setLevel(); //force order of added lops
-               return transform1;
+               // Only valid for MAX_POOLING_BACKWARD on GPU
+               Lop optionalMaxPoolOutput = (et == ExecType.GPU) ? getMaxPoolOutputLop() : null; 
+               if(optionalMaxPoolOutput != null) {
+                       convolutionLop.addInput(optionalMaxPoolOutput);
+                       optionalMaxPoolOutput.addOutput(convolutionLop);
+               }
+               convolutionLop.setLevel(); //force order of added lops
+               // ---------------------------------------------------------------
+               return convolutionLop;
        }
 
                        
@@ -453,12 +513,10 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
                
                ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? ExecType.SPARK : ExecType.MR;
                
-               if( _etypeForced != null )                      
-               {
+               if( _etypeForced != null ) {
                        _etype = _etypeForced;
                }
-               else 
-               {       
+               else {  
                        if ( OptimizerUtils.isMemoryBasedOptLevel() ) {
                                _etype = findExecTypeByMemEstimate();
                        }
@@ -479,8 +537,9 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
                return _etype;
        }
        
-       // Caching parameters speed-ups dynamic recompilation time by avoiding unnecessary computeSizeInformation
+       // Parameters recomputed in refreshSizeInformation and passed across many calls of getDim
        private ConvolutionParameters _cachedParams = new ConvolutionParameters(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, _maxNumThreads);
+       
        // stride1, stride2, padding1, padding2  
        // input_shape1, input_shape2, input_shape3, input_shape4, 
        // filter_shape1, filter_shape2, filter_shape3, filter_shape4
@@ -494,16 +553,16 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
                        imageHeightHop = getInput().get(8);
                        filterHeightHop = getInput().get(12);
                        _cachedParams.setIfUnknown(
-                                       getInput().get(6),
-                                       getInput().get(7), 
-                                       imageHeightHop, 
-                                       getInput().get(9), 
-                                       getInput().get(10), 
-                                       filterHeightHop, 
-                                       getInput().get(13), 
-                                       getInput().get(2), 
-                                       getInput().get(3), 
-                                       getInput().get(4), 
+                                       getInput().get(6),  // N
+                                       getInput().get(7),  // C
+                                       imageHeightHop,     // H
+                                       getInput().get(9),  // W
+                                       getInput().get(10), // K
+                                       filterHeightHop,    // R
+                                       getInput().get(13), // S
+                                       getInput().get(2),  // stride_h
+                                       getInput().get(3),  // stride_w
+                                       getInput().get(4),  // pad_h
                                        getInput().get(5), _maxNumThreads);
                }
                else {

http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 354ea63..8565b5a 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -92,8 +92,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                
                if( ( opcode.equalsIgnoreCase("conv2d")
                         || opcode.equalsIgnoreCase("conv2d_backward_filter")
-                        || opcode.equalsIgnoreCase("conv2d_backward_data")
-                        || opcode.equalsIgnoreCase("maxpooling_backward")) ) {
+                        || opcode.equalsIgnoreCase("conv2d_backward_data")) ) {
                        InstructionUtils.checkNumFields(parts, 16);
                        CPOperand in1 = new CPOperand(parts[1]);
                        CPOperand in2 = new CPOperand(parts[2]);
@@ -119,6 +118,39 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                        return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, stride,
                                        padding, input_shape, filter_shape, Double.parseDouble(parts[16]));
                }
+               else if( opcode.equalsIgnoreCase("maxpooling_backward") ) {
+                       boolean withMaxPoolOut = false;
+                       if(parts.length == 18) {
+                               withMaxPoolOut = true;
+                       }
+                       else
+                               InstructionUtils.checkNumFields(parts, 16);
+                       CPOperand in1 = new CPOperand(parts[1]);
+                       CPOperand in2 = new CPOperand(parts[2]);
+                       CPOperand in3 = withMaxPoolOut ? new CPOperand(parts[15]) : null;
+                       CPOperand out = withMaxPoolOut ? new CPOperand(parts[16]) : new CPOperand(parts[15]);
+                       double memBudget = withMaxPoolOut ? Double.parseDouble(parts[17]) : Double.parseDouble(parts[16]);
+               
+                       ArrayList<CPOperand> stride = new ArrayList<>();
+                       ArrayList<CPOperand> padding = new ArrayList<>();
+                       ArrayList<CPOperand> input_shape = new ArrayList<>();
+                       ArrayList<CPOperand> filter_shape = new ArrayList<>();
+                       stride.add(new CPOperand(parts[3]));
+                       stride.add(new CPOperand(parts[4]));
+                       padding.add(new CPOperand(parts[5]));
+                       padding.add(new CPOperand(parts[6]));
+                       input_shape.add(new CPOperand(parts[7]));
+                       input_shape.add(new CPOperand(parts[8]));
+                       input_shape.add(new CPOperand(parts[9]));
+                       input_shape.add(new CPOperand(parts[10]));
+                       filter_shape.add(new CPOperand(parts[11]));
+                       filter_shape.add(new CPOperand(parts[12]));
+                       filter_shape.add(new CPOperand(parts[13]));
+                       filter_shape.add(new CPOperand(parts[14]));
+
+                       return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
+                                       padding, input_shape, filter_shape, memBudget);
+               }
                else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
                        InstructionUtils.checkNumFields(parts, 17);
                        CPOperand in1 = new CPOperand(parts[1]);
@@ -324,7 +356,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
                        MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
                        MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
-                       
+                       MatrixObject maxPoolOutput = _input3 != null ? getMatrixInputForGPUInstruction(ec, _input3.getName()) : null;
                        if(dout.getNumRows() != N || dout.getNumColumns() != C*P*Q) 
                                throw new DMLRuntimeException("Incorrect dimensions for dout in maxpooling_backward");
                        if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
@@ -333,7 +365,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                        
                        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
                        
-                       LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
+                       LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, maxPoolOutput, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
                }
                else {
@@ -346,7 +378,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                if ( !instOpcode.equalsIgnoreCase("maxpooling") )
                        ec.releaseMatrixInputForGPUInstruction(_input2.getName());
 
-               if (instOpcode.equalsIgnoreCase("conv2d_bias_add"))
+               if (instOpcode.equalsIgnoreCase("conv2d_bias_add") || 
+                       (instOpcode.equalsIgnoreCase("maxpooling_backward") && _input3 != null))
                        ec.releaseMatrixInputForGPUInstruction(_input3.getName());
 
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
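
The maxpooling_backward GPU instruction now comes in two shapes. As an inferred
summary read off the CPOperand construction above (not documented instruction
syntax), the operand layout is roughly:

    parts[1]        input image
    parts[2]        dout (gradient w.r.t. the max_pool output)
    parts[3..4]     stride_h, stride_w
    parts[5..6]     pad_h, pad_w
    parts[7..10]    input_shape N, C, H, W
    parts[11..14]   filter_shape (pool window R, S in the last two fields)
    16-field form:  parts[15] = output, parts[16] = intermediate memory budget
    18-field form:  parts[15] = max_pool forward output, parts[16] = output,
                    parts[17] = intermediate memory budget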

http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 7fd766c..e0a6a57 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -519,6 +519,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param image image as matrix object
         * @param dout                  delta matrix, output of previous layer
+        * @param maxpoolOutput (optional and can be null) output of maxpool forward function
         * @param outputBlock output matrix
         * @param N                             batch size
         * @param C                             number of channels
@@ -537,12 +538,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
-                       MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+                       MatrixObject maxpoolOutput, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                        int Q, double intermediateMemoryBudget) throws DMLRuntimeException {
                long CHW = C*H*W; long CPQ = C*P*Q;  
                long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
+               final boolean isMaxPoolOutputProvided = maxpoolOutput != null;
+               
                if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
                        // Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
                        long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
@@ -551,19 +554,26 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        if(overhead <= intermediateMemoryBudget) {
                                Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
                                Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
-                               cudnnMaxpoolingBackward(gCtx, instName, x, dy, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+                               Pointer y = isMaxPoolOutputProvided ? getDensePointerForCuDNN(gCtx, maxpoolOutput, instName) : null;
+                               cudnnMaxpoolingBackward(gCtx, instName, x, dy, y, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                        }
                        else {
                                LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
                                LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout);
+                               LibMatrixCuDNNInputRowFetcher maxPoolOutFetcher = isMaxPoolOutputProvided ? new LibMatrixCuDNNInputRowFetcher(gCtx, instName, maxpoolOutput) : null;
                                for(int n = 0; n < N; n++) {
-                                       cudnnMaxpoolingBackward(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), 
+                                       Pointer x = imgFetcher.getNthRow(n);
+                                       Pointer dy = doutFetcher.getNthRow(n);
+                                       Pointer y = isMaxPoolOutputProvided ? maxPoolOutFetcher.getNthRow(n) : null;
+                                       cudnnMaxpoolingBackward(gCtx, instName, x, dy, y, 
                                                        dx.withByteOffset(n*CHW*sizeOfDataType), 
                                                        1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                                }
                                // Deallocate temporary array to hold one element of input
                                imgFetcher.close();
                                doutFetcher.close();
+                               if(isMaxPoolOutputProvided)
+                                       maxPoolOutFetcher.close();
                        }
                }
                else {
@@ -572,36 +582,33 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
        }
        
        private static void cudnnMaxpoolingBackward(GPUContext gCtx, String instName, 
-                       Pointer x, Pointer dy, Pointer dx, 
+                       Pointer x, Pointer dy, Pointer y, Pointer dx, 
                        int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                        int Q) throws DMLRuntimeException {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
                }
-               Pointer y = null;
+               
+               boolean isMaxPoolOutputProvided = (y != null);
 
                try(LibMatrixCuDNNPoolingDescriptors desc = 
                                
LibMatrixCuDNNPoolingDescriptors.cudnnMaxpoolingBackwardDescriptors(gCtx, 
instName, N, C, H, W, K, R, S, 
                                                pad_h, pad_w, stride_h, 
stride_w, P, Q)) {
                        long t1=0, t2=0, t3=0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-                       
-                       // Calling PoolForward first, y is one of the inputs for poolBackward
-                       // TODO: Remove calling poolForward after necessary changes at language level for poolBackward
-                       long numBytes = N*C*P*Q*sizeOfDataType;
-                       y = gCtx.allocate(numBytes);
-                       
-                       if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-                       
-                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-                       int status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
-                       if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
-
-                       if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-                               throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       int status;
+                       if(!isMaxPoolOutputProvided) {
+                               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+                               long numBytes = N*C*P*Q*sizeOfDataType;
+                               y = gCtx.allocate(numBytes);
+                               if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+                               if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
+                               status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
+                               if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
+                               if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                                       throw new DMLRuntimeException("Could not execute cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                               }
                        }
-
                        if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
                        status = cudnnPoolingBackward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.yDesc, y, desc.dyDesc, dy, desc.xDesc, x, zero(), desc.dxDesc, dx);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
@@ -615,7 +622,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                finally {
                        long t4=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime();
-                       if(y != null)
+                       if(!isMaxPoolOutputProvided)
                                gCtx.cudaFreeHelper(instName, y);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
                }
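
Background on why reusing the forward output helps: cudnnPoolingBackward
requires the forward result y as one of its inputs (see the call above), so
whenever y is not available it must first be recomputed with
cudnnPoolingForward. The following is a minimal, illustrative JCuda sketch of
that dependency, not the SystemML code path; it assumes a JCudnn build
matching cuDNN 5+, single-precision data, and pre-allocated float device
buffers x (N,C,H,W), dy (N,C,P,Q), and dx (N,C,H,W):

    import jcuda.Pointer;
    import jcuda.Sizeof;
    import jcuda.jcudnn.cudnnHandle;
    import jcuda.jcudnn.cudnnPoolingDescriptor;
    import jcuda.jcudnn.cudnnTensorDescriptor;
    import static jcuda.jcudnn.JCudnn.*;
    import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_FLOAT;
    import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
    import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
    import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
    import static jcuda.runtime.JCuda.cudaFree;
    import static jcuda.runtime.JCuda.cudaMalloc;

    public class MaxPoolBackwardSketch {
        /** Computes dx; recomputes y via cudnnPoolingForward only if y == null. */
        static void maxPoolBackward(cudnnHandle handle, Pointer x, Pointer y,
                Pointer dy, Pointer dx, int N, int C, int H, int W, int R, int S,
                int padH, int padW, int strideH, int strideW, int P, int Q) {
            // Tensor descriptors: x/dx are N x C x H x W, y/dy are N x C x P x Q
            cudnnTensorDescriptor xDesc = new cudnnTensorDescriptor();
            cudnnTensorDescriptor yDesc = new cudnnTensorDescriptor();
            cudnnCreateTensorDescriptor(xDesc);
            cudnnCreateTensorDescriptor(yDesc);
            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W);
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, P, Q);

            cudnnPoolingDescriptor poolDesc = new cudnnPoolingDescriptor();
            cudnnCreatePoolingDescriptor(poolDesc);
            cudnnSetPooling2dDescriptor(poolDesc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN,
                    R, S, padH, padW, strideH, strideW);

            Pointer one = Pointer.to(new float[] { 1.0f });
            Pointer zero = Pointer.to(new float[] { 0.0f });

            boolean recomputedY = (y == null);
            if (recomputedY) {
                // The redundant forward pass: only needed when the pooling
                // output was not kept around from the forward computation.
                y = new Pointer();
                cudaMalloc(y, (long) N * C * P * Q * Sizeof.FLOAT);
                cudnnPoolingForward(handle, poolDesc, one, xDesc, x, zero, yDesc, y);
            }
            // The backward pass needs y in addition to x and dy.
            cudnnPoolingBackward(handle, poolDesc, one, yDesc, y, yDesc, dy,
                    xDesc, x, zero, xDesc, dx);

            if (recomputedY)
                cudaFree(y);
            cudnnDestroyPoolingDescriptor(poolDesc);
            cudnnDestroyTensorDescriptor(xDesc);
            cudnnDestroyTensorDescriptor(yDesc);
        }
    }

When y is passed in (the new fused path above), the forward call and the extra
N*C*P*Q scratch buffer are skipped entirely.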

http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
index aba0cae..c57e997 100644
--- a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
@@ -579,5 +579,87 @@ public class NeuralNetworkOpTests extends GPUTests {
                        }
                }
        }
+       
+       
+       @Test
+       @Ignore
+       public void testMaxPoolBackwardWithMaxpoolOut() {
+               String scriptStr = "tmp = max_pool(image, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S]); print(sum(tmp)); O = max_pool_backward(image, dout, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S])";
+
+               for (long N : Nlst) {
+                       for (long C : Clst) {
+                               for (long H : Hlst) {
+                                       long W = H;
+                                       for (long R : Rlst) {
+                                               long S = R;
+                                               for (long strideH : strideLst) {
+                                                       long strideW = strideH;
+                                                       for (long padH : padLst) {
+                                                               long padW = padH;
+                                                               for (double sparsity : sparsitylst) {
+
+                                                                       // pool is smaller than image + padding
+                                                                       if (R > (H + padH) || S > (W + padW))
+                                                                               continue;
+
+                                                                       // Make sure ops fit in GPU memory and within constraints of cudnn
+                                                                       long imageSize = N * C * H * W * 8l;
+                                                                       if (imageSize > MAX_OP_SIZE)  // image size
+                                                                               continue;
+                                                                       long poolSize = R * S * 8l;
+                                                                       if (poolSize > MAX_OP_SIZE)  // filter size
+                                                                               continue;
+
+                                                                       int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+                                                                       int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+                                                                       long doutSize = N * C * P * Q * 8l;
+                                                                       if (doutSize > MAX_OP_SIZE) // dout/output size
+                                                                               continue;
+
+                                                                       double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+                                                                       double poolSizeInMB = poolSize / (1024.0 * 1024.0);
+                                                                       double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+                                                                       System.out
+                                                                       .format("max_pool_backward, image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+                                                                                       N, C, H, W, imageSizeInMB, R, S, poolSizeInMB, N, C,
+                                                                                       P, Q, doutSizeInMB, strideH, strideW, padH, padW);
+
+                                                                       Matrix image = generateInputMatrix(spark, (int) N,
+                                                                                       (int) (C * H * W), -127.0, 127, sparsity, seed, true);
+                                                                       Matrix dout = generateInputMatrix(spark, (int) N, (int) (C * P * Q),
+                                                                                       -127.0, 127, sparsity, seed, true);
+                                                                       HashMap<String, Object> inputs = new HashMap<>();
+                                                                       inputs.put("N", N);
+                                                                       inputs.put("C", C);
+                                                                       inputs.put("H", H);
+                                                                       inputs.put("W", W);
+                                                                       inputs.put("R", R);
+                                                                       inputs.put("S", S);
+                                                                       inputs.put("strideH", strideH);
+                                                                       inputs.put("strideW", strideW);
+                                                                       inputs.put("padH", padH);
+                                                                       inputs.put("padW", padW);
+                                                                       inputs.put("image", image);
+                                                                       inputs.put("dout", dout);
+                                                                       List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+                                                                                       Arrays.asList("O"));
+                                                                       List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+                                                                                       Arrays.asList("O"));
+                                                                       assertHeavyHitterPresent("gpu_maxpooling_backward");
+                                                                       assertEqualObjects(outCPU.get(0), outGPU.get(0));
+                                                                       clearGPUMemory();
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
 
 }
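
To exercise the new code path locally, the test above can be run after
removing its @Ignore annotation; with the standard Maven Surefire setup and a
CUDA-capable GPU, an invocation along these lines should work (an assumed
command, not part of this commit):

    mvn test -Dtest=NeuralNetworkOpTests#testMaxPoolBackwardWithMaxpoolOut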
