Repository: systemml
Updated Branches:
  refs/heads/master cba082eb1 -> 276065f93


http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 0bb2a67..a9a22e0 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -77,7 +77,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	// global constant configuration parameters
 	public static final long    CACHING_THRESHOLD = (long)Math.max(4*1024,   //obj not s.t. caching
 		1e-5 * InfrastructureAnalyzer.getLocalMaxMemory());                  //if below threshold [in bytes]
-	public static double  CACHING_BUFFER_SIZE = 0.15; 
+	public static double CACHING_BUFFER_SIZE = 0.15; 
 	public static final RPolicy CACHING_BUFFER_POLICY = RPolicy.FIFO; 
 	public static final boolean CACHING_BUFFER_PAGECACHE = false; 
 	public static final boolean CACHING_WRITE_CACHE_ON_READ = false;

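For orientation, a minimal sketch of how these constants interact, assuming the write buffer is sized as a fraction of the local max memory (an assumption; the buffer-sizing code is not part of this hunk):

    // Hedged sketch (not part of this diff): objects smaller than
    // CACHING_THRESHOLD are not subject to caching, and the write buffer is
    // assumed to hold at most CACHING_BUFFER_SIZE (15%) of local max memory.
    long maxMem = InfrastructureAnalyzer.getLocalMaxMemory();
    long threshold = (long) Math.max(4*1024, 1e-5 * maxMem);
    long bufferCapacity = (long) (CacheableData.CACHING_BUFFER_SIZE * maxMem);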
http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 903dca6..049577b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -56,7 +56,10 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( "bias_add",               GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "bias_multiply",          GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "channel_sums",           GPUINSTRUCTION_TYPE.Convolution);
-
+		String2GPUInstructionType.put( "lstm",                   GPUINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "batch_norm2d",           GPUINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "batch_norm2d_backward",  GPUINSTRUCTION_TYPE.Convolution);
+
 		// Matrix Multiply Operators
 		String2GPUInstructionType.put( "ba+*",  GPUINSTRUCTION_TYPE.AggregateBinary);
 		String2GPUInstructionType.put( "tsmm",  GPUINSTRUCTION_TYPE.MMTSJ);
@@ -203,4 +206,4 @@ public class GPUInstructionParser  extends InstructionParser
                                throw new DMLRuntimeException("Invalid GPU 
Instruction Type: " + gputype );
                }
        }       
-}
\ No newline at end of file
+}

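The three opcodes registered above are parsed by ConvolutionGPUInstruction (next file). For reference, a sketch of the operand layout of the new lstm instruction string, reconstructed from the parsing and processing code below (operand names are illustrative):

    // parts[1] : X      (N, T*D)  input sequence
    // parts[2] : W      weights, (D+M, 4M) in SystemML layout
    // parts[3] : bias   (1, 4M)
    // parts[4] : out0   (N, M)   previous output (hx)
    // parts[5] : c0     (N, M)   previous cell state (cx)
    // parts[6] : return_sequences (boolean scalar)
    // parts[7] : out    output, (N, T*M) or (N, M)
    // parts[8] : cy     cell state for the final timestep
    // parts[9] : reserveSpace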
http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 591fd54..e523a45 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -20,14 +20,20 @@ package org.apache.sysml.runtime.instructions.gpu;
 
 import java.util.ArrayList;
 
+import jcuda.Pointer;
+
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN.PoolingType;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
@@ -37,7 +43,16 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
        private CPOperand _input1;
        private CPOperand _input2;
        private CPOperand _input3;
+       private CPOperand _input4;
+       private CPOperand _input5;
+       private CPOperand _input6;
+       private CPOperand _input7;
+       private CPOperand _input8;
        private CPOperand _output;
+       private CPOperand _output2;
+       private CPOperand _output3;
+       private CPOperand _output4;
+       private CPOperand _output5;
        private ArrayList<CPOperand> _input_shape;
        private ArrayList<CPOperand> _filter_shape;
        private ArrayList<CPOperand> _stride = new ArrayList<>();
@@ -57,8 +72,47 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                _output = out;
                _intermediateMemoryBudget = intermediateMemoryBudget;
        }
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand in4, CPOperand in5, CPOperand in6, 
+			CPOperand out, CPOperand out2, CPOperand out3, String opcode, String istr, 
+			double intermediateMemoryBudget) throws DMLRuntimeException {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
+               _input1 = in1;
+               _input2 = in2;
+               _input3 = in3;
+               _input4 = in4;
+               _input5 = in5;
+               _input6 = in6;
+               _gputype = GPUINSTRUCTION_TYPE.Convolution;
+               _output = out;
+               _output2 = out2;
+               _output3 = out3;
+               _intermediateMemoryBudget = intermediateMemoryBudget;
+       }
        
-	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, double intermediateMemoryBudget) {
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand in4, CPOperand in5,
+			CPOperand in6, CPOperand in7, CPOperand in8,
+			CPOperand out, CPOperand out2, CPOperand out3, CPOperand out4, CPOperand out5, String opcode, String istr, 
+			double intermediateMemoryBudget) throws DMLRuntimeException {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
+               _input1 = in1;
+               _input2 = in2;
+               _input3 = in3;
+               _input4 = in4;
+               _input5 = in5;
+               _input6 = in6;
+               _input7 = in7;
+               _input8 = in8;
+               _gputype = GPUINSTRUCTION_TYPE.Convolution;
+               _output = out;
+               _output2 = out2;
+               _output3 = out3;
+               _output4 = out4;
+               _output5 = out5;
+               _intermediateMemoryBudget = intermediateMemoryBudget;
+       }
+       
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, 
+			double intermediateMemoryBudget) throws DMLRuntimeException {
 		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
                if( !opcode.equals("channel_sums") ) {
                        throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be channel_sums, but found " + opcode);
@@ -228,6 +282,49 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                        CPOperand out = new CPOperand(parts[4]);
			return new ConvolutionGPUInstruction(in, in2, in3, out, opcode, str, 0);
                }
+               else if (opcode.equalsIgnoreCase("lstm")) {
+                       InstructionUtils.checkNumFields(parts, 9);
+                       CPOperand in1 = new CPOperand(parts[1]);
+                       CPOperand in2 = new CPOperand(parts[2]);
+                       CPOperand in3 = new CPOperand(parts[3]);
+                       CPOperand in4 = new CPOperand(parts[4]);
+                       CPOperand in5 = new CPOperand(parts[5]);
+                       CPOperand in6 = new CPOperand(parts[6]);
+                       CPOperand out = new CPOperand(parts[7]);
+                       CPOperand out2 = new CPOperand(parts[8]);
+                       CPOperand out3 = new CPOperand(parts[9]);
+			return new ConvolutionGPUInstruction(in1, in2, in3, in4, in5, in6, out, out2, out3, opcode, str, 0);
+               }
+               else if (opcode.equalsIgnoreCase("batch_norm2d")) {
+                       InstructionUtils.checkNumFields(parts, 13);
+                       CPOperand in1 = new CPOperand(parts[1]); // image
+                       CPOperand in2 = new CPOperand(parts[2]); // scale
+                       CPOperand in3 = new CPOperand(parts[3]); // bias
+                       CPOperand in4 = new CPOperand(parts[4]); // runningMean
+                       CPOperand in5 = new CPOperand(parts[5]); // runningVar
+                       CPOperand in6 = new CPOperand(parts[6]); // mode
+                       CPOperand in7 = new CPOperand(parts[7]); // epsilon
+			CPOperand in8 = new CPOperand(parts[8]); // exponentialAverageFactor
+			CPOperand out = new CPOperand(parts[9]);  // ret
+			CPOperand out2 = new CPOperand(parts[10]); // retRunningMean
+			CPOperand out3 = new CPOperand(parts[11]); // retRunningVar
+			CPOperand out4 = new CPOperand(parts[12]); // resultSaveMean
+			CPOperand out5 = new CPOperand(parts[13]); // resultSaveInvVariance
+			return new ConvolutionGPUInstruction(in1, in2, in3, in4, in5, in6, in7, in8, out, out2, out3, out4, out5, opcode, str, 0);
+               }
+               else if (opcode.equalsIgnoreCase("batch_norm2d_backward")) {
+                       InstructionUtils.checkNumFields(parts, 9);
+                       CPOperand in1 = new CPOperand(parts[1]); // image
+                       CPOperand in2 = new CPOperand(parts[2]); // dout
+                       CPOperand in3 = new CPOperand(parts[3]); // scale
+                       CPOperand in4 = new CPOperand(parts[4]); // epsilon
+			CPOperand in5 = new CPOperand(parts[5]); // resultSaveMean
+			CPOperand in6 = new CPOperand(parts[6]); // resultSaveInvVariance
+			CPOperand out = new CPOperand(parts[7]);  // dX
+			CPOperand out2 = new CPOperand(parts[8]); // dScale
+			CPOperand out3 = new CPOperand(parts[9]); // dBias
+			return new ConvolutionGPUInstruction(in1, in2, in3, in4, in5, in6, null, null, out, out2, out3, null, null, opcode, str, 0);
+               }
                else {
                        throw new DMLRuntimeException("Unknown opcode while 
parsing a ConvolutionGPUInstruction: " + str);      
                }
@@ -248,6 +345,82 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
+       
+	public void processBatchNorm2dInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		GPUStatistics.incrementNoOfExecutedGPUInst();
+		MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+		MatrixObject scale = getMatrixInputForGPUInstruction(ec, _input2.getName());
+		MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
+		MatrixObject runningMean = getMatrixInputForGPUInstruction(ec, _input4.getName());
+		MatrixObject runningVar = getMatrixInputForGPUInstruction(ec, _input5.getName());
+               
+		String phase = ec.getScalarInput(_input6.getName(), _input6.getValueType(), _input6.isLiteral()).getStringValue();
+		double epsilon = ec.getScalarInput(_input7.getName(), _input7.getValueType(), _input7.isLiteral()).getDoubleValue();
+		
+		MatrixObject ret = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), image.getNumRows(), image.getNumColumns());
+               
+		if(phase.equalsIgnoreCase("train")) {
+			double exponentialAverageFactor = 1-ec.getScalarInput(_input8.getName(), _input8.getValueType(), _input8.isLiteral()).getDoubleValue();
+			MatrixObject retRunningMean = getDenseMatrixOutputForGPUInstruction(ec, _output2.getName(), runningMean.getNumRows(), runningMean.getNumColumns());
+			MatrixObject retRunningVar = getDenseMatrixOutputForGPUInstruction(ec, _output3.getName(), runningVar.getNumRows(), runningVar.getNumColumns());
+			MatrixObject resultSaveMean = getDenseMatrixOutputForGPUInstruction(ec, _output4.getName(), runningMean.getNumRows(), runningMean.getNumColumns());
+			MatrixObject resultSaveInvVariance = getDenseMatrixOutputForGPUInstruction(ec, _output5.getName(), runningVar.getNumRows(), runningVar.getNumColumns());
+			LibMatrixCuDNN.batchNormalizationForwardTraining(ec.getGPUContext(0), getExtendedOpcode(), 
+					image, scale, bias, runningMean, runningVar, ret, 
+					retRunningMean, retRunningVar, epsilon, exponentialAverageFactor, resultSaveMean, resultSaveInvVariance);
+			ec.releaseMatrixOutputForGPUInstruction(_output2.getName());
+			ec.releaseMatrixOutputForGPUInstruction(_output3.getName());
+			ec.releaseMatrixOutputForGPUInstruction(_output4.getName());
+			ec.releaseMatrixOutputForGPUInstruction(_output5.getName());
+		}
+		else if(phase.equalsIgnoreCase("test")) {
+			LibMatrixCuDNN.batchNormalizationForwardInference(ec.getGPUContext(0), getExtendedOpcode(), 
+					image, scale, bias, runningMean, runningVar, ret, epsilon);
+			ec.setMatrixOutput(_output2.getName(), new MatrixBlock((int)runningMean.getNumRows(), (int)runningMean.getNumColumns(), true), getExtendedOpcode());
+			ec.setMatrixOutput(_output3.getName(), new MatrixBlock((int)runningVar.getNumRows(), (int)runningVar.getNumColumns(), true), getExtendedOpcode());
+			ec.setMatrixOutput(_output4.getName(), new MatrixBlock((int)runningMean.getNumRows(), (int)runningMean.getNumColumns(), true), getExtendedOpcode());
+			ec.setMatrixOutput(_output5.getName(), new MatrixBlock((int)runningVar.getNumRows(), (int)runningVar.getNumColumns(), true), getExtendedOpcode());
+		}
+		else {
+			throw new DMLRuntimeException("Incorrect mode: Expected either train or test, but found " + phase);
+		}
+               
+               // release inputs/outputs
+               ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input3.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input4.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input5.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+       }
+       
+	public void processBatchNorm2dBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		GPUStatistics.incrementNoOfExecutedGPUInst();
+		MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+		MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
+		MatrixObject scale = getMatrixInputForGPUInstruction(ec, _input3.getName());
+		double epsilon = ec.getScalarInput(_input4.getName(), _input4.getValueType(), _input4.isLiteral()).getDoubleValue();
+		MatrixObject resultSaveMean = getMatrixInputForGPUInstruction(ec, _input5.getName());
+		MatrixObject resultSaveInvVariance = getMatrixInputForGPUInstruction(ec, _input6.getName());
+		
+		MatrixObject dX = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), image.getNumRows(), image.getNumColumns());
+		MatrixObject dScale = getDenseMatrixOutputForGPUInstruction(ec, _output2.getName(), scale.getNumRows(), scale.getNumColumns());
+		MatrixObject dBias = getDenseMatrixOutputForGPUInstruction(ec, _output3.getName(), scale.getNumRows(), scale.getNumColumns());
+		
+		LibMatrixCuDNN.batchNormalizationBackward(ec.getGPUContext(0), getExtendedOpcode(), image, 
+				dout, scale, dX, dScale, dBias,
+				epsilon, resultSaveMean, resultSaveInvVariance);
+               
+               // release inputs/outputs
+               ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input3.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input5.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input6.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output2.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output3.getName());
+       }
 
        // (X > 0) * dout
        public void processReLUBackwardInstruction(ExecutionContext ec) {
@@ -281,6 +454,78 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
        
+       private static int toInt(long num) throws DMLRuntimeException {
+               if(num >= Integer.MAX_VALUE || num <= Integer.MIN_VALUE) {
+			throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
+               }
+               return (int)num;
+       }
+       
+//	private Pointer transpose(ExecutionContext ec, MatrixObject X) throws DMLRuntimeException {
+//		GPUContext gCtx = ec.getGPUContext(0);
+//		String instructionName = getExtendedOpcode();
+//		long numRowsX = X.getNumRows(); long numColsX = X.getNumColumns();
+//		Pointer tX = gCtx.allocate(instructionName, numRowsX*numColsX*LibMatrixCUDA.sizeOfDataType);
+//		jcuda.runtime.JCuda.cudaMemcpy(tX, LibMatrixCUDA.getDensePointer(gCtx, X, instructionName), numRowsX*numColsX*LibMatrixCUDA.sizeOfDataType, jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice);
+//		// LibMatrixCUDA.denseTranspose(ec, gCtx, instructionName, LibMatrixCUDA.getDensePointer(gCtx, X, instructionName), tX, numRowsX, numColsX);
+//		return tX;
+//	}
+       
+	private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		// batchSize=N, seqLength=T, numFeatures=D and hiddenSize=M
+		// input  X:(N, T*D)  ==> (T, D, N)
+		// weight W:(D+M+2, 4M) 
+		// previous output out0 (also represented by hx) and cell state c0 (also represented by cx): (N, M) ==> (1, M, N)
+		// out: (N, T*M) or (N, M) ==> (T, M, N)
+		GPUStatistics.incrementNoOfExecutedGPUInst();
+		GPUContext gCtx = ec.getGPUContext(0);
+		String instructionName = getExtendedOpcode();
+		
+		MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
+		int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0:(N, M)
+		Pointer out0Pointer = LibMatrixCUDA.getDensePointer(gCtx, out0, instructionName);
+		
+		MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
+		MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
+		long numRowsW = W.getNumRows();
+		int D = toInt(numRowsW) - M; // numFeatures .. since W:(D+M, 4M)
+		Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M);
+		Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M);
+		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
+		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
+				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+				sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
+		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+		
+		boolean return_sequences = ec.getScalarInput(_input6.getName(), _input6.getValueType(), _input6.isLiteral()).getBooleanValue();
+		
+		// Because the input matrices are released immediately, the output of the transpose need not be taken into account
+		MatrixObject X = getMatrixInputForGPUInstruction(ec, _input1.getName());
+		Pointer xPointer = LibMatrixCUDA.getDensePointer(gCtx, X, instructionName); 
+		int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D)
+		long numColsX = X.getNumColumns();
+		int T = toInt(numColsX/D); // seqLength .. since X:(N, T*D)
+		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
+		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
+				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+				xPointer, cudnnInput, N, D, T*D, N*T*D);
+		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+		
+		Pointer c0Pointer = LibMatrixCUDA.getDensePointer(gCtx, getMatrixInputForGPUInstruction(ec, _input5.getName()), instructionName); 
+		
+		LibMatrixCuDNN.lstm(ec, gCtx, instructionName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(), _output3.getName(), N, M, D, T);
+		gCtx.cudaFreeHelper(instructionName, cudnnWPointer, DMLScript.EAGER_CUDA_FREE);
+		gCtx.cudaFreeHelper(instructionName, cudnnInput, DMLScript.EAGER_CUDA_FREE);
+		
+		// release inputs/outputs
+		ec.releaseMatrixInputForGPUInstruction(_input3.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input4.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input5.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output2.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output3.getName());
+	}
+       
        @Override
        public void processInstruction(ExecutionContext ec) {
                if (instOpcode.equalsIgnoreCase("bias_add") || 
instOpcode.equalsIgnoreCase("bias_multiply")) {
@@ -295,6 +540,18 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
                        processChannelSumsInstruction(ec);
                        return;
                }
+               else if (instOpcode.equalsIgnoreCase("lstm")) {
+                       processLstmInstruction(ec);
+                       return;
+               }
+               else if (instOpcode.equalsIgnoreCase("batch_norm2d")) {
+                       processBatchNorm2dInstruction(ec);
+                       return;
+               }
+               else if (instOpcode.equalsIgnoreCase("batch_norm2d_backward")) {
+                       processBatchNorm2dBackwardInstruction(ec);
+                       return;
+               }
                
                GPUStatistics.incrementNoOfExecutedGPUInst();
                                        

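A note on the exponentialAverageFactor computed in processBatchNorm2dInstruction above: cuDNN and SystemML parameterize the moving average in complementary ways. A short sketch of the identity, based on cuDNN's documented update rule:

    // cudnnBatchNormalizationForwardTraining updates running statistics as
    //   runningMean' = (1 - factor)*runningMean + factor*batchMean
    // Passing factor = 1 - mu (as done above) yields the conventional form
    //   runningMean' = mu*runningMean + (1 - mu)*batchMean
    double mu = 0.9;                          // illustrative value only
    double exponentialAverageFactor = 1 - mu; // as computed in the instruction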
http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 7eb4033..ff14ec4 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -506,4 +506,4 @@ public class CSRPointer {
        public String toString() {
                return "CSRPointer{" + "nnz=" + nnz + '}';
        }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 2d3918c..4c0d1eb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -420,4 +420,4 @@ public class GPUContext {
        public String toString() {
                return "GPUContext{" + "deviceNum=" + deviceNum + '}';
        }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
index 830f7df..c90beef 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
@@ -168,4 +168,4 @@ public class GPULazyCudaFreeMemoryManager {
                }
        }
        
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
index 066905b..610df23 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
@@ -181,4 +181,4 @@ public class GPUMatrixMemoryManager {
                        gpuObjects.removeAll(unlockedGPUObjects);
                }
        }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index 411606d..35a31be 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -725,4 +725,4 @@ public class GPUMemoryManager {
                        }
                }
        }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index bf44895..576584b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -1029,4 +1029,4 @@ public class GPUObject {
                return sb.toString();
        }
 
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index cdb69f9..8b5043f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.matrix.data;
 
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
+import static jcuda.runtime.JCuda.cudaMalloc;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
@@ -88,8 +89,10 @@ import jcuda.jcublas.cublasHandle;
 import jcuda.jcublas.cublasOperation;
 import jcuda.jcublas.cublasSideMode;
 import jcuda.jcusparse.cusparseAction;
+import jcuda.jcusparse.cusparseDirection;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseIndexBase;
+import jcuda.jcusparse.cusparseMatDescr;
 
 /**
  * All CUDA kernels and library calls are redirected through this class
@@ -2577,4 +2580,4 @@ public class LibMatrixCUDA {
        }
 
 
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 2bfb8f2..e84dce7 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -32,6 +32,11 @@ import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
 import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
 import static jcuda.runtime.JCuda.cudaMemset;
 import jcuda.CudaException;
 import jcuda.Pointer;
@@ -841,7 +846,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                }
        }
        
-	private static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
+	static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
 			long numRows, long numCols) throws DMLRuntimeException {
 		MatrixObject output = ec.getMatrixObject(outputName);
 		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocates the dense output matrix
@@ -849,6 +854,270 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
        }
        
        /**
+        * Computes the forward pass for an LSTM layer with M neurons.
+        * The input data has N sequences of T examples, each with D features.
+        * 
+        * @param ec execution context 
+        * @param gCtx gpu context 
+        * @param instName name of the instruction
+        * @param X input matrix pointer
+        * @param wPointer weight matrix pointer
+        * @param out0 Outputs from previous timestep
+        * @param c0 Initial cell state
+	 * @param return_sequences Whether to return `out` at all timesteps, or just for the final timestep.
+	 * @param outputName name of the out variable. If `return_sequences` is True, outputs for all timesteps.
+	 * @param cyName name of the output cell state. Cell state for final timestep.
+        * @param reserveSpaceName name of reserve space.
+        * @param N minibatch size
+        * @param M hidden size
+        * @param D number of features
+        * @param T sequence length
+        * @throws DMLRuntimeException if error
+        */
+	public static void lstm(ExecutionContext ec, GPUContext gCtx, String instName,
+			Pointer X, Pointer wPointer, Pointer out0, Pointer c0, boolean return_sequences,
+			String outputName, String cyName, String reserveSpaceName, int N, int M, int D, int T) throws DMLRuntimeException {
+		singleLayerUnidirectionalRNNForward(ec, gCtx, instName, X, out0, c0, wPointer, outputName, cyName, reserveSpaceName, "lstm", return_sequences, N, M, D, T);
+	}
+       
+	private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
+			Pointer x, Pointer hx, Pointer cx, Pointer wPointer,  // input
+			String outputName, String cyName, String reserveSpaceName,  // output
+			String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
+		boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
+		// Get output pointers
+		Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
+		Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
+		Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
+		// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
+		
+		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer, reserveSpaceName)) {
+			jcuda.runtime.JCuda.cudaDeviceSynchronize();
+			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
+					algo.xDesc, x, 
+					algo.hxDesc, hx, 
+					algo.cxDesc, cx, 
+					algo.wDesc, wPointer, 
+					algo.yDesc, cudnnYPointer, 
+					algo.hyDesc, hyPointer, 
+					algo.cyDesc, cyPointer, 
+					algo.workSpace, algo.sizeInBytes, 
+					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
+		}
+		
+		if(return_sequences) {
+			gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
+			Pointer sysmlYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
+			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
+					ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
+					sysmlYPointer, cudnnYPointer, N, T, M, N*T*M);
+		}
+		gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE);
+	}
+       
+	/**
+	 * Performs the forward BatchNormalization layer computation for training
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName name of the instruction
+	 * @param image input image
+	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
+	 * @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
+	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
+	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
+	 * @param ret (output) normalized input
+	 * @param retRunningMean (output) updated running mean: shape [1, C, 1, 1]
+	 * @param retRunningVar (output) updated running variance: shape [1, C, 1, 1]
+	 * @param epsilon epsilon value used in the batch normalization formula
+	 * @param exponentialAverageFactor factor used in the moving average computation
+	 * @param resultSaveMean (output) batch mean saved for the backward pass: shape [1, C, 1, 1]
+	 * @param resultSaveInvVariance (output) batch inverse variance saved for the backward pass: shape [1, C, 1, 1]
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
+			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
+			MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, 
+			double epsilon, double exponentialAverageFactor,
+			MatrixObject resultSaveMean, MatrixObject resultSaveInvVariance) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
+		}
+
+		int N = toInt(image.getNumRows());
+		int C = toInt(scale.getNumRows());
+		long CHW = image.getNumColumns();
+		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
+
+		// Allocate descriptors
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
+				new MatrixObject[] {image}, new MatrixObject[] {ret});
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(1, C, 1, 1);
+
+		// Get underlying dense pointer
+		Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
+		Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
+		Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
+		Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
+		Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, runningMean, instName);
+		Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, runningVar, instName);
+
+		// To allow for copy-on-write
+		Pointer retRunningMeanPtr = getDensePointerForCuDNN(gCtx, retRunningMean, instName);
+		Pointer retRunningVarPtr = getDensePointerForCuDNN(gCtx, retRunningVar, instName);
+		cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * sizeOfDataType, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(retRunningVarPtr, runningVarPtr, C * sizeOfDataType, cudaMemcpyDeviceToDevice);
+
+		Pointer resultSaveMeanPtr = getDensePointerForCuDNN(gCtx, resultSaveMean, instName);
+		Pointer resultSaveInvVariancePtr = getDensePointerForCuDNN(gCtx, resultSaveInvVariance, instName);
+		
+		checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), 
+				jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL, one(), zero(),
+				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
+				scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
+				retRunningMeanPtr, retRunningVarPtr, epsilon, resultSaveMeanPtr, resultSaveInvVariancePtr));
+	}
+       
+	/**
+	 * Performs the forward BatchNormalization layer computation for inference
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName name of the instruction
+	 * @param image input image
+	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
+	 * @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
+	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
+	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
+	 * @param ret (output) normalized input
+	 * @param epsilon epsilon value used in the batch normalization formula
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
+			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
+			MatrixObject ret, double epsilon) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
+		}
+
+		int N = toInt(image.getNumRows());
+		int C = toInt(scale.getNumRows());
+		long CHW = image.getNumColumns();
+		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
+
+		// Allocate descriptors
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
+				new MatrixObject[] {image}, new MatrixObject[] {ret});
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(1, C, 1, 1);
+
+		// Get underlying dense pointer
+		Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
+		Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
+		Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
+		Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
+		Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, runningMean, instName);
+		Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, runningVar, instName);
+
+		checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), 
+				jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL, one(), zero(),
+				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
+				scaleTensorDesc, scalePtr, biasPtr,
+				runningMeanPtr, runningVarPtr, epsilon));
+	}
+       
+	/**
+	 * This method computes the backpropagation errors for image, scale and bias of the batch normalization layer
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param instName name of the instruction
+	 * @param image input image
+	 * @param dout input errors of shape C, H, W
+	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
+	 * @param dX (output) backpropagation errors for previous layer
+	 * @param dScale (output) backpropagation error for scale
+	 * @param dBias (output) backpropagation error for bias
+	 * @param epsilon epsilon value used in the batch normalization formula
+	 * @param resultSaveMean (input) batch mean saved by the forward training pass: shape [1, C, 1, 1]
+	 * @param resultSaveInvVariance (input) batch inverse variance saved by the forward training pass: shape [1, C, 1, 1]
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
+			MatrixObject scale, MatrixObject dX, MatrixObject dScale, MatrixObject dBias,
+			double epsilon, MatrixObject resultSaveMean, MatrixObject resultSaveInvVariance) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
+		}
+		
+		int N = toInt(image.getNumRows());
+		int C = toInt(scale.getNumRows());
+		long CHW = image.getNumColumns();
+
+		// Allocate descriptors
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
+				new MatrixObject[] {image, dout}, new MatrixObject[] {dX});
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(1, C, 1, 1);
+
+		// Get underlying dense pointer
+		Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
+		Pointer doutPtr = getDensePointerForCuDNN(gCtx, dout, instName);
+		Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
+		Pointer dXPtr = getDensePointerForCuDNN(gCtx, dX, instName);
+		Pointer dScalePtr = getDensePointerForCuDNN(gCtx, dScale, instName);
+		Pointer dBiasPtr = getDensePointerForCuDNN(gCtx, dBias, instName);
+		
+		Pointer resultSaveMeanPtr = getDensePointerForCuDNN(gCtx, resultSaveMean, instName);
+		Pointer resultSaveInvVariancePtr = getDensePointerForCuDNN(gCtx, resultSaveInvVariance, instName);
+
+		checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), 
+				jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL, one(), zero(), one(), zero(),
+				nCHWDescriptor, imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, dXPtr,
+				scaleTensorDesc, scalePtr, dScalePtr, dBiasPtr, epsilon, resultSaveMeanPtr, resultSaveInvVariancePtr));
+	}
+       
+	private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
+		if(scale.getNumRows() != C || scale.getNumColumns() != 1) {
+			throw new DMLRuntimeException("Incorrect dimensions for scale. Expected a column vector of size " + C + ", but found [" + scale.getNumRows() + ", " + scale.getNumColumns() + "]");
+		}
+		if(bias.getNumRows() != C || bias.getNumColumns() != 1) {
+			throw new DMLRuntimeException("Incorrect dimensions for bias. Expected a column vector of size " + C + ", but found [" + bias.getNumRows() + ", " + bias.getNumColumns() + "]");
+		}
+		if(runningMean.getNumRows() != C || runningMean.getNumColumns() != 1) {
+			throw new DMLRuntimeException("Incorrect dimensions for running mean. Expected a column vector of size " + C + ", but found [" + runningMean.getNumRows() + ", " + runningMean.getNumColumns() + "]");
+		}
+		if(runningVar.getNumRows() != C || runningVar.getNumColumns() != 1) {
+			throw new DMLRuntimeException("Incorrect dimensions for running variance. Expected a column vector of size " + C + ", but found [" + runningVar.getNumRows() + ", " + runningVar.getNumColumns() + "]");
+		}
+	}
+       
+	/**
+	 * Convenience utility for batch normalization that returns an NCHW tensor descriptor
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param CHW channels*height*width
+	 * @param input input matrix objects
+	 * @param output output matrix objects
+	 * @return an NCHW tensor descriptor
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext gCtx, int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
+		if(CHW > ((long)Integer.MAX_VALUE)*C) {
+			throw new DMLRuntimeException("image size (height*width) should be less than " + Integer.MAX_VALUE);
+		}
+		int HW = (int) (CHW / C);
+		int H = HW, W = 1; // fallback if H and W are not known
+		double potentialH = Math.sqrt(HW);
+		if(potentialH == ((int) potentialH)) { // perfect square: assume H == W
+			H = (int) potentialH;
+			W = H;
+		}
+		cudnnTensorDescriptor ret = new cudnnTensorDescriptor();
+		cudnnCreateTensorDescriptor(ret);
+		cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, N, C, H, W);
+		return ret;
+	}
+       
+       /**
	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
         * 
         * @param gCtx a valid {@link GPUContext}
@@ -900,4 +1169,4 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
                        throw new DMLRuntimeException("Error status returned by 
CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
-}
\ No newline at end of file
+}

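allocateNCHWDescriptors (above) must recover H and W from the flattened CHW column count, since SystemML stores an image as a single row of length C*H*W. A standalone sketch of that inference:

    // Only HW = CHW/C is known. A perfect square is read as H = W = sqrt(HW);
    // otherwise fall back to H = HW, W = 1. For CUDNN_BATCHNORM_SPATIAL the
    // statistics are per-channel over N*H*W, so any factorization with the
    // same product is equivalent.
    int HW = (int) (CHW / C);
    int H = HW, W = 1;
    double potentialH = Math.sqrt(HW);
    if (potentialH == (int) potentialH) {
        H = (int) potentialH;
        W = H;
    }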
http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index 432e79e..c95c3b3 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -276,4 +276,4 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 		cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION, LibMatrixCUDA.CUDNN_DATA_TYPE);
                return convDesc;
        }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index f3ce70d..70c33d6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -81,4 +81,4 @@ public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java
                        throw new RuntimeException(e);
                }
        }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
new file mode 100644
index 0000000..d772a55
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
@@ -0,0 +1,283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensorNdDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyDropoutDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyRNNDescriptor;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcudnn.JCudnn.cudnnCreateRNNDescriptor;
+import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT;
+import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL;
+import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+
+import jcuda.Pointer;
+import jcuda.jcudnn.JCudnn;
+import jcuda.jcudnn.cudnnDropoutDescriptor;
+import jcuda.jcudnn.cudnnFilterDescriptor;
+import jcuda.jcudnn.cudnnRNNDescriptor;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+
+public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
+       GPUContext gCtx;
+       String instName;
+       cudnnDropoutDescriptor dropoutDesc;
+       cudnnRNNDescriptor rnnDesc;
+       cudnnTensorDescriptor[] xDesc, yDesc; // of length T
+       cudnnTensorDescriptor hxDesc, cxDesc, hyDesc, cyDesc; 
+       cudnnFilterDescriptor wDesc;
+       long sizeInBytes; Pointer workSpace;
+       long reserveSpaceSizeInBytes; Pointer reserveSpace;
+       public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String instName, 
+                       String rnnMode, int N, int T, int M, int D, boolean isTraining, Pointer w, String reserveSpaceName) throws DMLRuntimeException {
+               this.gCtx = gCtx;
+               this.instName = instName;
+               
+               // Allocate input/output descriptors
+               xDesc = new cudnnTensorDescriptor[T];
+               yDesc = new cudnnTensorDescriptor[T];
+               for(int t = 0; t < T; t++) {
+                       xDesc[t] = allocateTensorDescriptorWithStride(N, D, 1);
+                       yDesc[t] = allocateTensorDescriptorWithStride(N, M, 1);
+               }
+               hxDesc = allocateTensorDescriptorWithStride(1, N, M); 
+               cxDesc = allocateTensorDescriptorWithStride(1, N, M);
+               hyDesc = allocateTensorDescriptorWithStride(1, N, M);
+               cyDesc = allocateTensorDescriptorWithStride(1, N, M);
+               
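+               // cuDNN's RNN API requires a dropout descriptor even when no dropout is
+               // applied; it is configured below with rate 0 and a fixed seed, and its
+               // state space is allocated only if cuDNN requests one.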
+               // Initialize dropout descriptor
+               dropoutDesc = new cudnnDropoutDescriptor();
+               JCudnn.cudnnCreateDropoutDescriptor(dropoutDesc);
+               long [] dropOutSizeInBytes = {-1};
+               JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), dropOutSizeInBytes);
+               Pointer dropOutStateSpace = new Pointer();
+               if (dropOutSizeInBytes[0] != 0)
+                       dropOutStateSpace = gCtx.allocate(dropOutSizeInBytes[0]);
+               JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes[0], 12345);
+               
+               // Initialize RNN descriptor
+               rnnDesc = new cudnnRNNDescriptor();
+               cudnnCreateRNNDescriptor(rnnDesc);
+               JCudnn.cudnnSetRNNDescriptor_v6(gCtx.getCudnnHandle(), rnnDesc, M, 1, dropoutDesc, 
+                               CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL, 
+                               getCuDNNRnnMode(rnnMode), CUDNN_RNN_ALGO_STANDARD, LibMatrixCUDA.CUDNN_DATA_TYPE);
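+               // This configures a single-layer (numLayers = 1), unidirectional RNN with
+               // hidden size M; the cell type (rnn_relu, rnn_tanh, lstm, or gru) is mapped
+               // to the corresponding cuDNN mode constant by getCuDNNRnnMode below.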
+               
+               // Allocate filter descriptor
+               int expectedNumWeights = getExpectedNumWeights();
+               if(rnnMode.equalsIgnoreCase("lstm") && (D+M+2)*4*M != expectedNumWeights) {
+                       throw new DMLRuntimeException("Incorrect number of RNN parameters " + (D+M+2)*4*M + " != " + expectedNumWeights + ", where numFeatures=" + D + ", hiddenSize=" + M);
+               }
+               wDesc = allocateFilterDescriptor(expectedNumWeights);
+               
+               // Setup workspace
+               workSpace = new Pointer(); reserveSpace = new Pointer();
+               sizeInBytes = getWorkspaceSize(T);
+               if(sizeInBytes != 0)
+                       workSpace = gCtx.allocate(sizeInBytes);
+               reserveSpaceSizeInBytes = 0;
+               if(isTraining) {
+                       reserveSpaceSizeInBytes = getReservespaceSize(T);
+                       if (reserveSpaceSizeInBytes != 0) {
+                               int numCols = (int) Math.ceil(((double)reserveSpaceSizeInBytes) / LibMatrixCUDA.sizeOfDataType);
+                               reserveSpace = LibMatrixCuDNN.getDenseOutputPointer(ec, gCtx, instName, reserveSpaceName, 1, numCols);
+                       }
+               }
+               if (reserveSpaceSizeInBytes == 0) {
+                       reserveSpace = LibMatrixCuDNN.getDenseOutputPointer(ec, gCtx, instName, reserveSpaceName, 1, 1);
+               }
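+               // The workspace is scratch memory for the cuDNN RNN calls, whereas the
+               // training reserve space is exposed as a (1 x numCols) matrix output
+               // (reserveSpaceName) so that the backward pass can read it; a dummy 1x1
+               // output is produced when no reserve space is required.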
+               
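+               // Debug-only snippet (intentionally commented out): inspects cuDNN's packed
+               // per-layer weight/bias segments and fills them with a constant value.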
+               /*
+               int numLinearLayers = getNumLinearLayers(rnnMode); 
+               for(int i = 0; i < numLinearLayers; i++) {
+                       cudnnFilterDescriptor linLayerMatDesc = new cudnnFilterDescriptor();
+                       cudnnCreateFilterDescriptor(linLayerMatDesc);
+                       Pointer linLayerMat = new Pointer();
+                       JCudnn.cudnnGetRNNLinLayerMatrixParams(gCtx.getCudnnHandle(), rnnDesc, 0, 
+                                       xDesc[0], wDesc, w, i, linLayerMatDesc, linLayerMat);
+                       int[] dataType = new int[] {-1};
+                       int[] format = new int[] {-1};
+                       int[] nbDims = new int[] {-1};
+                       int[] filterDimA = new int[3];
+                       JCudnn.cudnnGetFilterNdDescriptor(linLayerMatDesc, 3, dataType, format, nbDims, filterDimA);
+                       
+                       int filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
+                       double [] tmp = new double[filterDims];
+                       LibMatrixCUDA.cudaSupportFunctions.deviceToHost(gCtx, linLayerMat, tmp, instName, false);
+                       System.out.println();
+                       for(int j = 0 ; j < tmp.length; j++) {
+                               System.out.print(" " + tmp[j]);
+                       }
+                       System.out.println();
+                       LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill", 
+                                       org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims), 
+                                       linLayerMat, Math.pow(filterDims, -1), filterDims);
+                       JCudnn.cudnnDestroyFilterDescriptor(linLayerMatDesc);
+                       
+                       cudnnFilterDescriptor linLayerBiasDesc = new cudnnFilterDescriptor();
+                       cudnnCreateFilterDescriptor(linLayerBiasDesc);
+                       Pointer linLayerBias = new Pointer();
+                       JCudnn.cudnnGetRNNLinLayerBiasParams(gCtx.getCudnnHandle(), rnnDesc, 0,
+                                       xDesc[0], wDesc, w, i, linLayerBiasDesc, linLayerBias);
+                       JCudnn.cudnnGetFilterNdDescriptor(linLayerBiasDesc, 3, dataType, format, nbDims, filterDimA);
+                       filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
+                       LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill", 
+                                       org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims), 
+                                       linLayerBias, Math.pow(filterDims, -1), filterDims);
+                       JCudnn.cudnnDestroyFilterDescriptor(linLayerBiasDesc);
+               }
+               */
+       }
+       
+       @SuppressWarnings("unused")
+       private int getNumLinearLayers(String rnnMode) throws DMLRuntimeException {
+               int ret = 0;
+               if(rnnMode.equalsIgnoreCase("rnn_relu") || rnnMode.equalsIgnoreCase("rnn_tanh")) {
+                       ret = 2;
+               }
+               else if(rnnMode.equalsIgnoreCase("lstm")) {
+                       ret = 8;
+               }
+               else if(rnnMode.equalsIgnoreCase("gru")) {
+                       ret = 6;
+               }
+               else {
+                       throw new DMLRuntimeException("Unsupported rnn mode: " + rnnMode);
+               }
+               return ret;
+       }
+       
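+       // Queries cuDNN for the scratch workspace size needed by the RNN forward and
+       // backward calls for a sequence of the given length.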
+       private long getWorkspaceSize(int seqLength) {
+               long [] sizeInBytesArray = new long[1];
+               JCudnn.cudnnGetRNNWorkspaceSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
+               return sizeInBytesArray[0];
+       }
+       
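+       // Queries cuDNN for the size of the reserve space that carries intermediate
+       // results from the forward training pass to the backward pass.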
+       private long getReservespaceSize(int seqLength) {
+               long [] sizeInBytesArray = new long[1];
+               JCudnn.cudnnGetRNNTrainingReserveSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
+               return sizeInBytesArray[0];
+       }
+       
+       private int getCuDNNRnnMode(String rnnMode) throws DMLRuntimeException {
+               int rnnModeVal = -1;
+               if(rnnMode.equalsIgnoreCase("rnn_relu")) {
+                       rnnModeVal = jcuda.jcudnn.cudnnRNNMode.CUDNN_RNN_RELU;
+               }
+               else if(rnnMode.equalsIgnoreCase("rnn_tanh")) {
+                       rnnModeVal = jcuda.jcudnn.cudnnRNNMode.CUDNN_RNN_TANH;
+               }
+               else if(rnnMode.equalsIgnoreCase("lstm")) {
+                       rnnModeVal = jcuda.jcudnn.cudnnRNNMode.CUDNN_LSTM;
+               }
+               else if(rnnMode.equalsIgnoreCase("gru")) {
+                       rnnModeVal = jcuda.jcudnn.cudnnRNNMode.CUDNN_GRU;
+               }
+               else {
+                       throw new DMLRuntimeException("Unsupported rnn mode: " + rnnMode);
+               }
+               return rnnModeVal;
+       }
+       
+       private int getExpectedNumWeights() throws DMLRuntimeException {
+               long [] weightSizeInBytesArray = {-1}; // (D+M+2)*4*M
+               JCudnn.cudnnGetRNNParamsSize(gCtx.getCudnnHandle(), rnnDesc, xDesc[0], weightSizeInBytesArray, LibMatrixCUDA.CUDNN_DATA_TYPE);
+               // The constructor checks that (D+M+2)*4*M == weightsSize / sizeof(dataType), where weightsSize is given by 'cudnnGetRNNParamsSize'.
+               return LibMatrixCUDA.toInt(weightSizeInBytesArray[0]/LibMatrixCUDA.sizeOfDataType);
+       }
+       
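+       // cuDNN packs all RNN weights and biases into a single opaque filter of shape
+       // (numWeights, 1, 1) in NCHW layout.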
+       private cudnnFilterDescriptor allocateFilterDescriptor(int numWeights) {
+               cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
+               cudnnCreateFilterDescriptor(filterDesc);
+               JCudnn.cudnnSetFilterNdDescriptor(filterDesc, LibMatrixCUDA.CUDNN_DATA_TYPE, CUDNN_TENSOR_NCHW, 3, new int[] {numWeights, 1, 1});
+               return filterDesc;
+       }
+       
+       
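+       // Creates a 3D tensor descriptor with fully-packed strides, as the cuDNN RNN
+       // API expects for the per-timestep x/y descriptors and the h/c state descriptors.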
+       private static cudnnTensorDescriptor allocateTensorDescriptorWithStride(int firstDim, int secondDim, int thirdDim) throws DMLRuntimeException {
+               cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
+               cudnnCreateTensorDescriptor(tensorDescriptor);
+               int [] dimA = new int[] {firstDim, secondDim, thirdDim};
+               int [] strideA = new int[] {dimA[2] * dimA[1], dimA[2], 1};
+               cudnnSetTensorNdDescriptor(tensorDescriptor, LibMatrixCUDA.CUDNN_DATA_TYPE, 3, dimA, strideA);
+               return tensorDescriptor;
+       }
+
+
+       @Override
+       public void close() {
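+               // Destroy all cuDNN descriptors and null the fields so they are not
+               // destroyed twice, then free the device workspace and reserve space.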
+               if(dropoutDesc != null)
+                       cudnnDestroyDropoutDescriptor(dropoutDesc);
+               dropoutDesc = null;
+               if(rnnDesc != null)
+                       cudnnDestroyRNNDescriptor(rnnDesc);
+               rnnDesc = null;
+               if(hxDesc != null)
+                       cudnnDestroyTensorDescriptor(hxDesc);
+               hxDesc = null;
+               if(hyDesc != null)
+                       cudnnDestroyTensorDescriptor(hyDesc);
+               hyDesc = null;
+               if(cxDesc != null)
+                       cudnnDestroyTensorDescriptor(cxDesc);
+               cxDesc = null;
+               if(cyDesc != null)
+                       cudnnDestroyTensorDescriptor(cyDesc);
+               cyDesc = null;
+               if(wDesc != null)
+                       cudnnDestroyFilterDescriptor(wDesc);
+               wDesc = null;
+               if(xDesc != null) {
+                       for(cudnnTensorDescriptor dsc : xDesc) {
+                               cudnnDestroyTensorDescriptor(dsc);
+                       }
+                       xDesc = null;
+               }
+               if(yDesc != null) {
+                       for(cudnnTensorDescriptor dsc : yDesc) {
+                               cudnnDestroyTensorDescriptor(dsc);
+                       }
+                       yDesc = null;
+               }
+               if(sizeInBytes != 0) {
+                       try {
+                               gCtx.cudaFreeHelper(instName, workSpace, DMLScript.EAGER_CUDA_FREE);
+                       } catch (DMLRuntimeException e) {
+                               throw new RuntimeException(e);
+                       }
+               }
+               if(reserveSpaceSizeInBytes != 0) {
+                       try {
+                               gCtx.cudaFreeHelper(instName, reserveSpace, DMLScript.EAGER_CUDA_FREE);
+                       } catch (DMLRuntimeException e) {
+                               throw new RuntimeException(e);
+                       }
+               }       
+       }
+}
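
Usage note: the class implements java.lang.AutoCloseable, so the intended pattern is a try-with-resources block around the cuDNN RNN calls, which releases descriptors and scratch memory even on error. A minimal sketch (hypothetical variables: ec, gCtx, instName, the packed weight pointer w, and the sizes N/T/M/D would come from the surrounding GPU instruction):

    try (LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(
            ec, gCtx, instName, "lstm", N, T, M, D, true, w, reserveSpaceName)) {
        // invoke e.g. JCudnn.cudnnRNNForwardTraining(...) using algo.rnnDesc,
        // algo.xDesc, algo.yDesc, algo.wDesc, algo.workSpace, algo.reserveSpace
    } // close() destroys the descriptors and frees the workspace buffers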

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index 60b2352..adbbcb8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -466,4 +466,4 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
        private static int reverseCusparseOp(int trans) {
                return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
        }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 3bd101c..39371e6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -224,4 +224,4 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions
                                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, totalTime);
                }
        }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index f7bee4f..18270cc 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -240,4 +240,4 @@ public class GPUStatistics {
        }
 
 
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala
index 3e7aff3..869f1a1 100644
--- a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala
+++ b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala
@@ -1603,4 +1603,4 @@ class DeConvolution(val param: LayerParameter, val id: Int, val net: CaffeNetwork
     if (convParam.hasPadW) convParam.getPadW.toString
     else if (convParam.getPadCount > 0) convParam.getPad(0).toString
     else "0"
-}
\ No newline at end of file
+}
