Repository: systemml

Updated Branches:
  refs/heads/master cba082eb1 -> 276065f93
http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 0bb2a67..a9a22e0 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -77,7 +77,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	// global constant configuration parameters
 	public static final long CACHING_THRESHOLD = (long)Math.max(4*1024, //obj not s.t. caching
 		1e-5 * InfrastructureAnalyzer.getLocalMaxMemory()); //if below threshold [in bytes]
-	public static double CACHING_BUFFER_SIZE = 0.15; 
+	public static double CACHING_BUFFER_SIZE = 0.15;
 	public static final RPolicy CACHING_BUFFER_POLICY = RPolicy.FIFO;
 	public static final boolean CACHING_BUFFER_PAGECACHE = false;
 	public static final boolean CACHING_WRITE_CACHE_ON_READ = false;

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 903dca6..049577b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -56,7 +56,10 @@ public class GPUInstructionParser extends InstructionParser
 		String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "bias_multiply", GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "channel_sums", GPUINSTRUCTION_TYPE.Convolution);
-
+		String2GPUInstructionType.put( "lstm", GPUINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "batch_norm2d", GPUINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "batch_norm2d_backward", GPUINSTRUCTION_TYPE.Convolution);
+
 		// Matrix Multiply Operators
 		String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary);
 		String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ);
@@ -203,4 +206,4 @@ public class GPUInstructionParser extends InstructionParser
 			throw new DMLRuntimeException("Invalid GPU Instruction Type: " + gputype );
 		}
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 591fd54..e523a45 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -20,14 +20,20 @@ package org.apache.sysml.runtime.instructions.gpu;
 
 import java.util.ArrayList;
 
+import jcuda.Pointer;
+
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN.PoolingType;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
@@ -37,7 +43,16 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 	private CPOperand _input1;
 	private CPOperand _input2;
 	private CPOperand _input3;
+	private CPOperand _input4;
+	private CPOperand _input5;
+	private CPOperand _input6;
+	private CPOperand _input7;
+	private CPOperand _input8;
 	private CPOperand _output;
+	private CPOperand _output2;
+	private CPOperand _output3;
+	private CPOperand _output4;
+	private CPOperand _output5;
 	private ArrayList<CPOperand> _input_shape;
 	private ArrayList<CPOperand> _filter_shape;
 	private ArrayList<CPOperand> _stride = new ArrayList<>();
@@ -57,8 +72,47 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		_output = out;
 		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
+
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand in4, CPOperand in5, CPOperand in6,
+			CPOperand out, CPOperand out2, CPOperand out3, String opcode, String istr,
+			double intermediateMemoryBudget) throws DMLRuntimeException {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
+		_input1 = in1;
+		_input2 = in2;
+		_input3 = in3;
+		_input4 = in4;
+		_input5 = in5;
+		_input6 = in6;
+		_gputype = GPUINSTRUCTION_TYPE.Convolution;
+		_output = out;
+		_output2 = out2;
+		_output3 = out3;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
+	}
 
-	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, double intermediateMemoryBudget) {
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand in4, CPOperand in5,
+			CPOperand in6, CPOperand in7, CPOperand in8,
+			CPOperand out, CPOperand out2, CPOperand out3, CPOperand out4, CPOperand out5, String opcode, String istr,
+			double intermediateMemoryBudget) throws DMLRuntimeException {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
+		_input1 = in1;
+		_input2 = in2;
+		_input3 = in3;
+		_input4 = in4;
+		_input5 = in5;
+		_input6 = in6;
+		_input7 = in7;
+		_input8 = in8;
+		_gputype = GPUINSTRUCTION_TYPE.Convolution;
+		_output = out;
+		_output2 = out2;
+		_output3 = out3;
+		_output4 = out4;
+		_output5 = out5;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
+	}
+
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr,
+			double intermediateMemoryBudget) throws DMLRuntimeException {
 		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
 		if( !opcode.equals("channel_sums") ) {
 			throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be channel_sums, but found " + opcode);
@@ -228,6 +282,49 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			CPOperand out = new CPOperand(parts[4]);
 			return new ConvolutionGPUInstruction(in, in2, in3, out, opcode, str, 0);
 		}
+		else if (opcode.equalsIgnoreCase("lstm")) {
+			InstructionUtils.checkNumFields(parts, 9);
+			CPOperand in1 = new CPOperand(parts[1]);
+			CPOperand in2 = new CPOperand(parts[2]);
+			CPOperand in3 = new CPOperand(parts[3]);
+			CPOperand in4 = new CPOperand(parts[4]);
+			CPOperand in5 = new CPOperand(parts[5]);
+			CPOperand in6 = new CPOperand(parts[6]);
+			CPOperand out = new CPOperand(parts[7]);
+			CPOperand out2 = new CPOperand(parts[8]);
+			CPOperand out3 = new CPOperand(parts[9]);
+			return new ConvolutionGPUInstruction(in1, in2, in3, in4, in5, in6, out, out2, out3, opcode, str, 0);
+		}
+		else if (opcode.equalsIgnoreCase("batch_norm2d")) {
+			InstructionUtils.checkNumFields(parts, 13);
+			CPOperand in1 = new CPOperand(parts[1]);  // image
+			CPOperand in2 = new CPOperand(parts[2]);  // scale
+			CPOperand in3 = new CPOperand(parts[3]);  // bias
+			CPOperand in4 = new CPOperand(parts[4]);  // runningMean
+			CPOperand in5 = new CPOperand(parts[5]);  // runningVar
+			CPOperand in6 = new CPOperand(parts[6]);  // mode
+			CPOperand in7 = new CPOperand(parts[7]);  // epsilon
+			CPOperand in8 = new CPOperand(parts[8]);  // exponentialAverageFactor
+			CPOperand out = new CPOperand(parts[9]);  // ret
+			CPOperand out2 = new CPOperand(parts[10]); // retRunningMean
+			CPOperand out3 = new CPOperand(parts[11]); // retRunningVar
+			CPOperand out4 = new CPOperand(parts[12]); // resultSaveMean
+			CPOperand out5 = new CPOperand(parts[13]); // resultSaveInvVariance
+			return new ConvolutionGPUInstruction(in1, in2, in3, in4, in5, in6, in7, in8, out, out2, out3, out4, out5, opcode, str, 0);
+		}
+		else if (opcode.equalsIgnoreCase("batch_norm2d_backward")) {
+			InstructionUtils.checkNumFields(parts, 9);
+			CPOperand in1 = new CPOperand(parts[1]);  // image
+			CPOperand in2 = new CPOperand(parts[2]);  // dout
+			CPOperand in3 = new CPOperand(parts[3]);  // scale
+			CPOperand in4 = new CPOperand(parts[4]);  // epsilon
+			CPOperand in5 = new CPOperand(parts[5]);  // resultSaveMean
+			CPOperand in6 = new CPOperand(parts[6]);  // resultSaveInvVariance
+			CPOperand out = new CPOperand(parts[7]);  // dX
+			CPOperand out2 = new CPOperand(parts[8]); // dScale
+			CPOperand out3 = new CPOperand(parts[9]); // dBias
+			return new ConvolutionGPUInstruction(in1, in2, in3, in4, in5, in6, null, null, out, out2, out3, null, null, opcode, str, 0);
+		}
 		else {
 			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str);
 		}
@@ -248,6 +345,82 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
 		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
 	}
+
+	public void processBatchNorm2dInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		GPUStatistics.incrementNoOfExecutedGPUInst();
+		MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+		MatrixObject scale = getMatrixInputForGPUInstruction(ec, _input2.getName());
+		MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
+		MatrixObject runningMean = getMatrixInputForGPUInstruction(ec, _input4.getName());
+		MatrixObject runningVar = getMatrixInputForGPUInstruction(ec, _input5.getName());
+
+		String phase = ec.getScalarInput(_input6.getName(), _input6.getValueType(), _input6.isLiteral()).getStringValue();
+		double epsilon = ec.getScalarInput(_input7.getName(), _input7.getValueType(), _input7.isLiteral()).getDoubleValue();
+
+		MatrixObject ret = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), image.getNumRows(), image.getNumColumns());
+
+		if(phase.equalsIgnoreCase("train")) {
+			double exponentialAverageFactor = 1 - ec.getScalarInput(_input8.getName(), _input8.getValueType(), _input8.isLiteral()).getDoubleValue();
+			MatrixObject retRunningMean = getDenseMatrixOutputForGPUInstruction(ec, _output2.getName(), runningMean.getNumRows(), runningMean.getNumColumns());
+			MatrixObject retRunningVar = getDenseMatrixOutputForGPUInstruction(ec, _output3.getName(), runningVar.getNumRows(), runningVar.getNumColumns());
+			MatrixObject resultSaveMean = getDenseMatrixOutputForGPUInstruction(ec, _output4.getName(), runningMean.getNumRows(), runningMean.getNumColumns());
+			MatrixObject resultSaveInvVariance = getDenseMatrixOutputForGPUInstruction(ec, _output5.getName(), runningVar.getNumRows(), runningVar.getNumColumns());
+			LibMatrixCuDNN.batchNormalizationForwardTraining(ec.getGPUContext(0), getExtendedOpcode(),
+				image, scale, bias, runningMean, runningVar, ret,
+				retRunningMean, retRunningVar, epsilon, exponentialAverageFactor, resultSaveMean, resultSaveInvVariance);
+			ec.releaseMatrixOutputForGPUInstruction(_output2.getName());
+			ec.releaseMatrixOutputForGPUInstruction(_output3.getName());
+			ec.releaseMatrixOutputForGPUInstruction(_output4.getName());
+			ec.releaseMatrixOutputForGPUInstruction(_output5.getName());
+		}
+		else if(phase.equalsIgnoreCase("test")) {
+			LibMatrixCuDNN.batchNormalizationForwardInference(ec.getGPUContext(0), getExtendedOpcode(),
+				image, scale, bias, runningMean, runningVar, ret, epsilon);
+			ec.setMatrixOutput(_output2.getName(), new MatrixBlock((int)runningMean.getNumRows(), (int)runningMean.getNumColumns(), true), getExtendedOpcode());
+			ec.setMatrixOutput(_output3.getName(), new MatrixBlock((int)runningVar.getNumRows(), (int)runningVar.getNumColumns(), true), getExtendedOpcode());
+			ec.setMatrixOutput(_output4.getName(), new MatrixBlock((int)runningMean.getNumRows(), (int)runningMean.getNumColumns(), true), getExtendedOpcode());
+			ec.setMatrixOutput(_output5.getName(), new MatrixBlock((int)runningVar.getNumRows(), (int)runningVar.getNumColumns(), true), getExtendedOpcode());
+		}
+		else {
+			throw new DMLRuntimeException("Incorrect mode: Expected either train or test, but found " + phase);
+		}
+
+		// release inputs/outputs
+		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input3.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input4.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input5.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+	}
+
+	public void processBatchNorm2dBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		GPUStatistics.incrementNoOfExecutedGPUInst();
+		MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+		MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
+		MatrixObject scale = getMatrixInputForGPUInstruction(ec, _input3.getName());
+		double epsilon = ec.getScalarInput(_input4.getName(), _input4.getValueType(), _input4.isLiteral()).getDoubleValue();
+		MatrixObject resultSaveMean = getMatrixInputForGPUInstruction(ec, _input5.getName());
+		MatrixObject resultSaveInvVariance = getMatrixInputForGPUInstruction(ec, _input6.getName());
+
+		MatrixObject dX = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), image.getNumRows(), image.getNumColumns());
+		MatrixObject dScale = getDenseMatrixOutputForGPUInstruction(ec, _output2.getName(), scale.getNumRows(), scale.getNumColumns());
+		MatrixObject dBias = getDenseMatrixOutputForGPUInstruction(ec, _output3.getName(), scale.getNumRows(), scale.getNumColumns());
+
+		LibMatrixCuDNN.batchNormalizationBackward(ec.getGPUContext(0), getExtendedOpcode(), image,
+			dout, scale, dX, dScale, dBias,
+			epsilon, resultSaveMean, resultSaveInvVariance);
+
+		// release inputs/outputs
+		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input3.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input5.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input6.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output2.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output3.getName());
+	}
 
 	// (X > 0) * dout
 	public void processReLUBackwardInstruction(ExecutionContext ec) {
@@ -281,6 +454,78 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
 	}
 
+	private static int toInt(long num) throws DMLRuntimeException {
+		if(num >= Integer.MAX_VALUE || num <= Integer.MIN_VALUE) {
+			throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
+		}
+		return (int)num;
+	}
+
+//	private Pointer transpose(ExecutionContext ec, MatrixObject X) throws DMLRuntimeException {
+//		GPUContext gCtx = ec.getGPUContext(0);
+//		String instructionName = getExtendedOpcode();
+//		long numRowsX = X.getNumRows(); long numColsX = X.getNumColumns();
+//		Pointer tX = gCtx.allocate(instructionName, numRowsX*numColsX*LibMatrixCUDA.sizeOfDataType);
+//		jcuda.runtime.JCuda.cudaMemcpy(tX, LibMatrixCUDA.getDensePointer(gCtx, X, instructionName), numRowsX*numColsX*LibMatrixCUDA.sizeOfDataType, jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice);
+//		// LibMatrixCUDA.denseTranspose(ec, gCtx, instructionName, LibMatrixCUDA.getDensePointer(gCtx, X, instructionName), tX, numRowsX, numColsX);
+//		return tX;
+//	}
+
+	private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		// batchSize=N, seqLength=T, numFeatures=D and hiddenSize=M
+		// input X:(N, T*D), ==> (T, D, N)
+		// weight W:(D+M+2, 4M)
+		// previous output out0 (also represented by hx) and cell state c0 (also represented by cx): (N, M) ==> (1, M, N)
+		// out: (N, T*M) or (N, M) ==> (T, M, N)
+		GPUStatistics.incrementNoOfExecutedGPUInst();
+		GPUContext gCtx = ec.getGPUContext(0);
+		String instructionName = getExtendedOpcode();
+
+		MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
+		int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0: (N, M)
+		Pointer out0Pointer = LibMatrixCUDA.getDensePointer(gCtx, out0, instructionName);
+
+		MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
+		MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
+		long numRowsW = W.getNumRows();
+		int D = toInt(numRowsW) - M; // numFeatures .. since W:(D+M, 4M)
+		Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M);
+		Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M);
+		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
+		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
+			ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+			sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
+		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+
+		boolean return_sequences = ec.getScalarInput(_input6.getName(), _input6.getValueType(), _input6.isLiteral()).getBooleanValue();
+
+		// Because the matrices are released immediately, the output of the transpose need not be taken into account
+		MatrixObject X = getMatrixInputForGPUInstruction(ec, _input1.getName());
+		Pointer xPointer = LibMatrixCUDA.getDensePointer(gCtx, X, instructionName);
+		int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D)
+		long numColsX = X.getNumColumns();
+		int T = toInt(numColsX / D); // seqLength .. since X:(N, T*D)
+		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
+		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
+			ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+			xPointer, cudnnInput, N, D, T*D, N*T*D);
+		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+
+		Pointer c0Pointer = LibMatrixCUDA.getDensePointer(gCtx, getMatrixInputForGPUInstruction(ec, _input5.getName()), instructionName);
+
+		LibMatrixCuDNN.lstm(ec, gCtx, instructionName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(), _output3.getName(), N, M, D, T);
+		gCtx.cudaFreeHelper(instructionName, cudnnWPointer, DMLScript.EAGER_CUDA_FREE);
+		gCtx.cudaFreeHelper(instructionName, cudnnInput, DMLScript.EAGER_CUDA_FREE);
+
+		// release inputs/outputs
+		ec.releaseMatrixInputForGPUInstruction(_input3.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input4.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input5.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output2.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output3.getName());
+	}
+
 	@Override
 	public void processInstruction(ExecutionContext ec) {
 		if (instOpcode.equalsIgnoreCase("bias_add") || instOpcode.equalsIgnoreCase("bias_multiply")) {
@@ -295,6 +540,18 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			processChannelSumsInstruction(ec);
 			return;
 		}
+		else if (instOpcode.equalsIgnoreCase("lstm")) {
+			processLstmInstruction(ec);
+			return;
+		}
+		else if (instOpcode.equalsIgnoreCase("batch_norm2d")) {
+			processBatchNorm2dInstruction(ec);
+			return;
+		}
+		else if (instOpcode.equalsIgnoreCase("batch_norm2d_backward")) {
+			processBatchNorm2dBackwardInstruction(ec);
+			return;
+		}
 
 		GPUStatistics.incrementNoOfExecutedGPUInst();
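For intuition on the layout change performed by the prepare_lstm_input kernel above (not part of this patch): SystemML hands cuDNN a row-major (N, T*D) matrix, while the per-timestep descriptors created in LibMatrixCuDNNRnnAlgorithm below are (N, D) with unit stride, i.e. a time-major (T, N, D) buffer overall. The following CPU-side Java sketch is one plausible reading of that reordering; the actual CUDA kernel operates on device pointers and its exact element order is an assumption here.

    // Illustrative sketch only: reorder a row-major (N, T*D) input into a
    // time-major (T, N, D) buffer, mirroring what prepare_lstm_input is
    // expected to do on the device. Element order is an assumption.
    static double[] prepareLstmInputSketch(double[] x, int N, int T, int D) {
      double[] cudnnInput = new double[N * T * D];
      for (int n = 0; n < N; n++)
        for (int t = 0; t < T; t++)
          for (int d = 0; d < D; d++)
            // source: row n, column t*D + d of the (N, T*D) matrix
            // target: element [t][n][d] of the (T, N, D) tensor
            cudnnInput[(t * N + n) * D + d] = x[n * T * D + t * D + d];
      return cudnnInput;
    }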
http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 7eb4033..ff14ec4 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -506,4 +506,4 @@ public class CSRPointer {
 	public String toString() {
 		return "CSRPointer{" + "nnz=" + nnz + '}';
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 2d3918c..4c0d1eb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -420,4 +420,4 @@ public class GPUContext {
 	public String toString() {
 		return "GPUContext{" + "deviceNum=" + deviceNum + '}';
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
index 830f7df..c90beef 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
@@ -168,4 +168,4 @@ public class GPULazyCudaFreeMemoryManager {
 		}
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
index 066905b..610df23 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
@@ -181,4 +181,4 @@ public class GPUMatrixMemoryManager {
 			gpuObjects.removeAll(unlockedGPUObjects);
 		}
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index 411606d..35a31be 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -725,4 +725,4 @@ public class GPUMemoryManager {
 			}
 		}
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index bf44895..576584b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -1029,4 +1029,4 @@ public class GPUObject {
 
 		return sb.toString();
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index cdb69f9..8b5043f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.matrix.data;
 
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
+import static jcuda.runtime.JCuda.cudaMalloc;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
@@ -88,8 +89,10 @@ import jcuda.jcublas.cublasHandle;
 import jcuda.jcublas.cublasOperation;
 import jcuda.jcublas.cublasSideMode;
 import jcuda.jcusparse.cusparseAction;
+import jcuda.jcusparse.cusparseDirection;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseIndexBase;
+import jcuda.jcusparse.cusparseMatDescr;
 
 /**
  * All CUDA kernels and library calls are redirected through this class
@@ -2577,4 +2580,4 @@ public class LibMatrixCUDA {
 	}
 
-}
\ No newline at end of file
+}
http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 2bfb8f2..e84dce7 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -32,6 +32,11 @@ import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
 import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
 import static jcuda.runtime.JCuda.cudaMemset;
 import jcuda.CudaException;
 import jcuda.Pointer;
@@ -841,7 +846,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		}
 	}
 
-	private static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
+	static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
 			long numRows, long numCols) throws DMLRuntimeException {
 		MatrixObject output = ec.getMatrixObject(outputName);
 		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocates the dense output matrix
@@ -849,6 +854,270 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	}
 
 	/**
+	 * Computes the forward pass for an LSTM layer with M neurons.
+	 * The input data has N sequences of T time steps, each with D features.
+	 *
+	 * @param ec execution context
+	 * @param gCtx gpu context
+	 * @param instName name of the instruction
+	 * @param X input matrix pointer
+	 * @param wPointer weight matrix pointer
+	 * @param out0 outputs from the previous timestep
+	 * @param c0 initial cell state
+	 * @param return_sequences whether to return `out` at all timesteps, or just for the final timestep
+	 * @param outputName name of the out variable. If `return_sequences` is true, outputs for all timesteps.
+	 * @param cyName name of the output cell state. Cell state for final timestep.
+	 * @param reserveSpaceName name of reserve space.
+	 * @param N minibatch size
+	 * @param M hidden size
+	 * @param D number of features
+	 * @param T sequence length
+	 * @throws DMLRuntimeException if error
+	 */
+	public static void lstm(ExecutionContext ec, GPUContext gCtx, String instName,
+			Pointer X, Pointer wPointer, Pointer out0, Pointer c0, boolean return_sequences,
+			String outputName, String cyName, String reserveSpaceName, int N, int M, int D, int T) throws DMLRuntimeException {
+		singleLayerUnidirectionalRNNForward(ec, gCtx, instName, X, out0, c0, wPointer, outputName, cyName, reserveSpaceName, "lstm", return_sequences, N, M, D, T);
+	}
+
+	private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
+			Pointer x, Pointer hx, Pointer cx, Pointer wPointer, // input
+			String outputName, String cyName, String reserveSpaceName, // output
+			String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
+		boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
+		// Get output pointers
+		Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
+		Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
+		Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
+		// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
+
+		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer, reserveSpaceName)) {
+			jcuda.runtime.JCuda.cudaDeviceSynchronize();
+			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T,
+				algo.xDesc, x,
+				algo.hxDesc, hx,
+				algo.cxDesc, cx,
+				algo.wDesc, wPointer,
+				algo.yDesc, cudnnYPointer,
+				algo.hyDesc, hyPointer,
+				algo.cyDesc, cyPointer,
+				algo.workSpace, algo.sizeInBytes,
+				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
+		}
+
+		if(return_sequences) {
+			gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
+			Pointer sysmlYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
+			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
+				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
+				sysmlYPointer, cudnnYPointer, N, T, M, N*T*M);
+		}
+		gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE);
+	}
+
+	/**
+	 * Performs the forward BatchNormalization layer computation for training
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName name of the instruction
+	 * @param image input image
+	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
+	 * @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
+	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
+	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
+	 * @param ret (output) normalized input
+	 * @param retRunningMean (output) updated running mean: shape [1, C, 1, 1]
+	 * @param retRunningVar (output) updated running variance: shape [1, C, 1, 1]
+	 * @param epsilon epsilon value used in the batch normalization formula
+	 * @param exponentialAverageFactor factor used in the moving average computation
+	 * @param resultSaveMean (output) saved minibatch mean, reused in the backward pass: shape [1, C, 1, 1]
+	 * @param resultSaveInvVariance (output) saved minibatch inverse variance, reused in the backward pass: shape [1, C, 1, 1]
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
+			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
+			MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar,
+			double epsilon, double exponentialAverageFactor,
+			MatrixObject resultSaveMean, MatrixObject resultSaveInvVariance) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
+		}
+
+		int N = toInt(image.getNumRows());
+		int C = toInt(scale.getNumRows());
+		long CHW = image.getNumColumns();
+		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
+
+		// Allocate descriptors
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
+			new MatrixObject[] {image}, new MatrixObject[] {ret});
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(1, C, 1, 1);
+
+		// Get underlying dense pointers
+		Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
+		Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
+		Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
+		Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
+		Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, runningMean, instName);
+		Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, runningVar, instName);
+
+		// To allow for copy-on-write
+		Pointer retRunningMeanPtr = getDensePointerForCuDNN(gCtx, retRunningMean, instName);
+		Pointer retRunningVarPtr = getDensePointerForCuDNN(gCtx, retRunningVar, instName);
+		cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * sizeOfDataType, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(retRunningVarPtr, runningVarPtr, C * sizeOfDataType, cudaMemcpyDeviceToDevice);
+
+		Pointer resultSaveMeanPtr = getDensePointerForCuDNN(gCtx, resultSaveMean, instName);
+		Pointer resultSaveInvVariancePtr = getDensePointerForCuDNN(gCtx, resultSaveInvVariance, instName);
+
+		checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx),
+			jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL, one(), zero(),
+			nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
+			scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
+			retRunningMeanPtr, retRunningVarPtr, epsilon, resultSaveMeanPtr, resultSaveInvVariancePtr));
+	}
+
+	/**
+	 * Performs the forward BatchNormalization layer computation for inference
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName name of the instruction
+	 * @param image input image
+	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
+	 * @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
+	 * @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
+	 * @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
+	 * @param ret normalized input
+	 * @param epsilon epsilon value used in the batch normalization formula
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
+			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
+			MatrixObject ret, double epsilon) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
+		}
+
+		int N = toInt(image.getNumRows());
+		int C = toInt(scale.getNumRows());
+		long CHW = image.getNumColumns();
+		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
+
+		// Allocate descriptors
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
+			new MatrixObject[] {image}, new MatrixObject[] {ret});
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(1, C, 1, 1);
+
+		// Get underlying dense pointers
+		Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
+		Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
+		Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
+		Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
+		Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, runningMean, instName);
+		Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, runningVar, instName);
+
+		checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx),
+			jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL, one(), zero(),
+			nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
+			scaleTensorDesc, scalePtr, biasPtr,
+			runningMeanPtr, runningVarPtr, epsilon));
+	}
+
+	/**
+	 * This method computes the backpropagation errors for image, scale and bias of the batch normalization layer
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName name of the instruction
+	 * @param image input image
+	 * @param dout input errors of shape C, H, W
+	 * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
+	 * @param dX (output) backpropagation errors for previous layer
+	 * @param dScale backpropagation error for scale
+	 * @param dBias backpropagation error for bias
+	 * @param epsilon epsilon value used in the batch normalization formula
+	 * @param resultSaveMean (input) saved minibatch mean from the forward pass: shape [1, C, 1, 1]
+	 * @param resultSaveInvVariance (input) saved minibatch inverse variance from the forward pass: shape [1, C, 1, 1]
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
+			MatrixObject scale, MatrixObject dX, MatrixObject dScale, MatrixObject dBias,
+			double epsilon, MatrixObject resultSaveMean, MatrixObject resultSaveInvVariance) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
+		}
+
+		int N = toInt(image.getNumRows());
+		int C = toInt(scale.getNumRows());
+		long CHW = image.getNumColumns();
+
+		// Allocate descriptors
+		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
+			new MatrixObject[] {image, dout}, new MatrixObject[] {dX});
+		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(1, C, 1, 1);
+
+		// Get underlying dense pointers
+		Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
+		Pointer doutPtr = getDensePointerForCuDNN(gCtx, dout, instName);
+		Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
+		Pointer dXPtr = getDensePointerForCuDNN(gCtx, dX, instName);
+		Pointer dScalePtr = getDensePointerForCuDNN(gCtx, dScale, instName);
+		Pointer dBiasPtr = getDensePointerForCuDNN(gCtx, dBias, instName);
+
+		Pointer resultSaveMeanPtr = getDensePointerForCuDNN(gCtx, resultSaveMean, instName);
+		Pointer resultSaveInvVariancePtr = getDensePointerForCuDNN(gCtx, resultSaveInvVariance, instName);
+
+		// pass the minibatch statistics saved by the forward pass
+		checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx),
+			jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL, one(), zero(), one(), zero(),
+			nCHWDescriptor, imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, dXPtr,
+			scaleTensorDesc, scalePtr, dScalePtr, dBiasPtr, epsilon, resultSaveMeanPtr, resultSaveInvVariancePtr));
+	}
+
+	private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
+		if(scale.getNumRows() != C || scale.getNumColumns() != 1) {
+			throw new DMLRuntimeException("Incorrect dimensions for scale. Expected a column vector of size " + C + ", but found [" + scale.getNumRows() + ", " + scale.getNumColumns() + "]");
+		}
+		if(bias.getNumRows() != C || bias.getNumColumns() != 1) {
+			throw new DMLRuntimeException("Incorrect dimensions for bias. Expected a column vector of size " + C + ", but found [" + bias.getNumRows() + ", " + bias.getNumColumns() + "]");
+		}
+		if(runningMean.getNumRows() != C || runningMean.getNumColumns() != 1) {
+			throw new DMLRuntimeException("Incorrect dimensions for running mean. Expected a column vector of size " + C + ", but found [" + runningMean.getNumRows() + ", " + runningMean.getNumColumns() + "]");
+		}
+		if(runningVar.getNumRows() != C || runningVar.getNumColumns() != 1) {
+			throw new DMLRuntimeException("Incorrect dimensions for running variance. Expected a column vector of size " + C + ", but found [" + runningVar.getNumRows() + ", " + runningVar.getNumColumns() + "]");
+		}
+	}
+
+	/**
+	 * Convenience utility for batch normalization that returns an NCHW descriptor
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param N number of images
+	 * @param C number of channels
+	 * @param CHW channels*height*width
+	 * @param input input matrix objects
+	 * @param output output matrix objects
+	 * @return one of the NCHW descriptors
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext gCtx, int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
+		cudnnTensorDescriptor ret = null; // Return any one
+		if(CHW > ((long)Integer.MAX_VALUE)*C) {
+			throw new DMLRuntimeException("image size (height*width) should be less than " + Integer.MAX_VALUE);
+		}
+		int H = -1; int W = -1;
+		int HW = (int) (CHW / C);
+		H = HW; W = 1; // If not known
+		double potentialH = Math.sqrt(HW);
+		if(potentialH == ((int) potentialH)) {
+			H = (int) potentialH;
+			W = H;
+		}
+		// We are not sure about H and W, hence don't allocate them.
+		ret = new cudnnTensorDescriptor();
+		cudnnCreateTensorDescriptor(ret);
+		cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, N, C, H, W);
+		return ret;
+	}
+
+	/**
 	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
 	 *
 	 * @param gCtx a valid {@link GPUContext}
@@ -900,4 +1169,4 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
 			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
 	}
-}
\ No newline at end of file
+}
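A note on the H/W recovery in allocateNCHWDescriptors above: SystemML stores the image as (N, C*H*W) and only C is known, so the code recovers the spatial dimensions heuristically, using H = W = sqrt(HW) when HW is a perfect square and otherwise falling back to H = HW, W = 1. Since CUDNN_BATCHNORM_SPATIAL normalizes per channel over all of N*H*W, the exact H-by-W factorization should not change the computed statistics. A standalone Java sketch of that heuristic (illustration only, not part of the patch):

    // Minimal sketch of the H/W recovery heuristic: given HW = CHW / C,
    // prefer a square factorization, else treat the spatial block as HW x 1.
    static int[] recoverSpatialDims(long chw, int c) {
      int hw = (int) (chw / c);
      int h = hw, w = 1;              // fallback when H and W are unknown
      int root = (int) Math.sqrt(hw);
      if ((long) root * root == hw) { // perfect square => assume H == W
        h = root;
        w = root;
      }
      return new int[] { h, w };
    }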
http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index 432e79e..c95c3b3 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -276,4 +276,4 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 		cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION, LibMatrixCUDA.CUDNN_DATA_TYPE);
 		return convDesc;
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index f3ce70d..70c33d6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -81,4 +81,4 @@ public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java
 			throw new RuntimeException(e);
 		}
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
new file mode 100644
index 0000000..d772a55
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
@@ -0,0 +1,283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensorNdDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyDropoutDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyRNNDescriptor;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcudnn.JCudnn.cudnnCreateRNNDescriptor;
+import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT;
+import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL;
+import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+
+import jcuda.Pointer;
+import jcuda.jcudnn.JCudnn;
+import jcuda.jcudnn.cudnnDropoutDescriptor;
+import jcuda.jcudnn.cudnnFilterDescriptor;
+import jcuda.jcudnn.cudnnRNNDescriptor;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+
+public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
+	GPUContext gCtx;
+	String instName;
+	cudnnDropoutDescriptor dropoutDesc;
+	cudnnRNNDescriptor rnnDesc;
+	cudnnTensorDescriptor[] xDesc, yDesc; // of length T
+	cudnnTensorDescriptor hxDesc, cxDesc, hyDesc, cyDesc;
+	cudnnFilterDescriptor wDesc;
+	long sizeInBytes; Pointer workSpace;
+	long reserveSpaceSizeInBytes; Pointer reserveSpace;
+	public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String instName,
+			String rnnMode, int N, int T, int M, int D, boolean isTraining, Pointer w, String reserveSpaceName) throws DMLRuntimeException {
+		this.gCtx = gCtx;
+		this.instName = instName;
+
+		// Allocate input/output descriptors
+		xDesc = new cudnnTensorDescriptor[T];
+		yDesc = new cudnnTensorDescriptor[T];
+		for(int t = 0; t < T; t++) {
+			xDesc[t] = allocateTensorDescriptorWithStride(N, D, 1);
+			yDesc[t] = allocateTensorDescriptorWithStride(N, M, 1);
+		}
+		hxDesc = allocateTensorDescriptorWithStride(1, N, M);
+		cxDesc = allocateTensorDescriptorWithStride(1, N, M);
+		hyDesc = allocateTensorDescriptorWithStride(1, N, M);
+		cyDesc = allocateTensorDescriptorWithStride(1, N, M);
+
+		// Initialize dropout descriptor
+		dropoutDesc = new cudnnDropoutDescriptor();
+		JCudnn.cudnnCreateDropoutDescriptor(dropoutDesc);
+		long [] dropOutSizeInBytes = {-1};
+		JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), dropOutSizeInBytes);
+		Pointer dropOutStateSpace = new Pointer();
+		if (dropOutSizeInBytes[0] != 0)
+			dropOutStateSpace = gCtx.allocate(dropOutSizeInBytes[0]);
+		JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes[0], 12345);
+
+		// Initialize RNN descriptor
+		rnnDesc = new cudnnRNNDescriptor();
+		cudnnCreateRNNDescriptor(rnnDesc);
+		JCudnn.cudnnSetRNNDescriptor_v6(gCtx.getCudnnHandle(), rnnDesc, M, 1, dropoutDesc,
+			CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL,
+			getCuDNNRnnMode(rnnMode), CUDNN_RNN_ALGO_STANDARD, LibMatrixCUDA.CUDNN_DATA_TYPE);
+
+		// Allocate filter descriptor
+		int expectedNumWeights = getExpectedNumWeights();
+		if(rnnMode.equalsIgnoreCase("lstm") && (D+M+2)*4*M != expectedNumWeights) {
+			throw new DMLRuntimeException("Incorrect number of RNN parameters " + (D+M+2)*4*M + " != " + expectedNumWeights + ", where numFeatures=" + D + ", hiddenSize=" + M);
+		}
+		wDesc = allocateFilterDescriptor(expectedNumWeights);
+
+		// Setup workspace
+		workSpace = new Pointer(); reserveSpace = new Pointer();
+		sizeInBytes = getWorkspaceSize(T);
+		if(sizeInBytes != 0)
+			workSpace = gCtx.allocate(sizeInBytes);
+		reserveSpaceSizeInBytes = 0;
+		if(isTraining) {
+			reserveSpaceSizeInBytes = getReservespaceSize(T);
+			if (reserveSpaceSizeInBytes != 0) {
+				int numCols = (int) Math.ceil(((double)reserveSpaceSizeInBytes) / LibMatrixCUDA.sizeOfDataType);
+				reserveSpace = LibMatrixCuDNN.getDenseOutputPointer(ec, gCtx, instName, reserveSpaceName, 1, numCols);
+			}
+		}
+		if (reserveSpaceSizeInBytes == 0) {
+			reserveSpace = LibMatrixCuDNN.getDenseOutputPointer(ec, gCtx, instName, reserveSpaceName, 1, 1);
+		}
+
+		/*
+		int numLinearLayers = getNumLinearLayers(rnnMode);
+		for(int i = 0; i < numLinearLayers; i++) {
+			cudnnFilterDescriptor linLayerMatDesc = new cudnnFilterDescriptor();
+			cudnnCreateFilterDescriptor(linLayerMatDesc);
+			Pointer linLayerMat = new Pointer();
+			JCudnn.cudnnGetRNNLinLayerMatrixParams(gCtx.getCudnnHandle(), rnnDesc, 0,
+				xDesc[0], wDesc, w, i, linLayerMatDesc, linLayerMat);
+			int[] dataType = new int[] {-1};
+			int[] format = new int[] {-1};
+			int[] nbDims = new int[] {-1};
+			int[] filterDimA = new int[3];
+			JCudnn.cudnnGetFilterNdDescriptor(linLayerMatDesc, 3, dataType, format, nbDims, filterDimA);
+
+			int filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
+			double [] tmp = new double[filterDims];
+			LibMatrixCUDA.cudaSupportFunctions.deviceToHost(gCtx, linLayerMat, tmp, instName, false);
+			System.out.println();
+			for(int j = 0 ; j < tmp.length; j++) {
+				System.out.print(" " + tmp[j]);
+			}
+			System.out.println();
+			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill",
+				org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims),
+				linLayerMat, Math.pow(filterDims, -1), filterDims);
+			JCudnn.cudnnDestroyFilterDescriptor(linLayerMatDesc);
+
+			cudnnFilterDescriptor linLayerBiasDesc = new cudnnFilterDescriptor();
+			cudnnCreateFilterDescriptor(linLayerBiasDesc);
+			Pointer linLayerBias = new Pointer();
+			JCudnn.cudnnGetRNNLinLayerBiasParams(gCtx.getCudnnHandle(), rnnDesc, 0,
+				xDesc[0], wDesc, w, i, linLayerBiasDesc, linLayerBias);
+			JCudnn.cudnnGetFilterNdDescriptor(linLayerBiasDesc, 3, dataType, format, nbDims, filterDimA);
+			filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
+			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill",
+				org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims),
+				linLayerBias, Math.pow(filterDims, -1), filterDims);
+			JCudnn.cudnnDestroyFilterDescriptor(linLayerBiasDesc);
+		}
+		*/
+	}
+
+	@SuppressWarnings("unused")
+	private int getNumLinearLayers(String rnnMode) throws DMLRuntimeException {
+		int ret = 0;
+		if(rnnMode.equalsIgnoreCase("rnn_relu") || rnnMode.equalsIgnoreCase("rnn_tanh")) {
+			ret = 2;
+		}
+		else if(rnnMode.equalsIgnoreCase("lstm")) {
+			ret = 8;
+		}
+		else if(rnnMode.equalsIgnoreCase("gru")) {
+			ret = 6;
+		}
+		else {
+			throw new DMLRuntimeException("Unsupported rnn mode:" + rnnMode);
+		}
+		return ret;
+	}
+
+	private long getWorkspaceSize(int seqLength) {
+		long [] sizeInBytesArray = new long[1];
+		JCudnn.cudnnGetRNNWorkspaceSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
+		return sizeInBytesArray[0];
+	}
+
+	private long getReservespaceSize(int seqLength) {
+		long [] sizeInBytesArray = new long[1];
+		JCudnn.cudnnGetRNNTrainingReserveSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
+		return sizeInBytesArray[0];
+	}
+
+	private int getCuDNNRnnMode(String rnnMode) throws DMLRuntimeException {
+		int rnnModeVal = -1;
+		if(rnnMode.equalsIgnoreCase("rnn_relu")) {
+			rnnModeVal = jcuda.jcudnn.cudnnRNNMode.CUDNN_RNN_RELU;
+		}
+		else if(rnnMode.equalsIgnoreCase("rnn_tanh")) {
+			rnnModeVal = jcuda.jcudnn.cudnnRNNMode.CUDNN_RNN_TANH;
+		}
+		else if(rnnMode.equalsIgnoreCase("lstm")) {
+			rnnModeVal = jcuda.jcudnn.cudnnRNNMode.CUDNN_LSTM;
+		}
+		else if(rnnMode.equalsIgnoreCase("gru")) {
+			rnnModeVal = jcuda.jcudnn.cudnnRNNMode.CUDNN_GRU;
+		}
+		else {
+			throw new DMLRuntimeException("Unsupported rnn mode:" + rnnMode);
+		}
+		return rnnModeVal;
+	}
+
+	private int getExpectedNumWeights() throws DMLRuntimeException {
+		long [] weightSizeInBytesArray = {-1}; // (D+M+2)*4*M
+		JCudnn.cudnnGetRNNParamsSize(gCtx.getCudnnHandle(), rnnDesc, xDesc[0], weightSizeInBytesArray, LibMatrixCUDA.CUDNN_DATA_TYPE);
+		// check if (D+M+2)*4M == weightsSize / sizeof(dataType) where weightsSize is given by 'cudnnGetRNNParamsSize'.
+		return LibMatrixCUDA.toInt(weightSizeInBytesArray[0]/LibMatrixCUDA.sizeOfDataType);
+	}
+
+	private cudnnFilterDescriptor allocateFilterDescriptor(int numWeights) {
+		cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
+		cudnnCreateFilterDescriptor(filterDesc);
+		JCudnn.cudnnSetFilterNdDescriptor(filterDesc, LibMatrixCUDA.CUDNN_DATA_TYPE, CUDNN_TENSOR_NCHW, 3, new int[] {numWeights, 1, 1});
+		return filterDesc;
+	}
+
+	private static cudnnTensorDescriptor allocateTensorDescriptorWithStride(int firstDim, int secondDim, int thirdDim) throws DMLRuntimeException {
+		cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
+		cudnnCreateTensorDescriptor(tensorDescriptor);
+		int [] dimA = new int[] {firstDim, secondDim, thirdDim};
+		int [] strideA = new int[] {dimA[2] * dimA[1], dimA[2], 1};
+		cudnnSetTensorNdDescriptor(tensorDescriptor, LibMatrixCUDA.CUDNN_DATA_TYPE, 3, dimA, strideA);
+		return tensorDescriptor;
+	}
+
+	@Override
+	public void close() {
+		if(dropoutDesc != null)
+			cudnnDestroyDropoutDescriptor(dropoutDesc);
+		dropoutDesc = null;
+		if(rnnDesc != null)
+			cudnnDestroyRNNDescriptor(rnnDesc);
+		rnnDesc = null;
+		if(hxDesc != null)
+			cudnnDestroyTensorDescriptor(hxDesc);
+		hxDesc = null;
+		if(hyDesc != null)
+			cudnnDestroyTensorDescriptor(hyDesc);
+		hyDesc = null;
+		if(cxDesc != null)
+			cudnnDestroyTensorDescriptor(cxDesc);
+		cxDesc = null;
+		if(cyDesc != null)
+			cudnnDestroyTensorDescriptor(cyDesc);
+		cyDesc = null;
+		if(wDesc != null)
+			cudnnDestroyFilterDescriptor(wDesc);
+		wDesc = null;
+		if(xDesc != null) {
+			for(cudnnTensorDescriptor dsc : xDesc) {
+				cudnnDestroyTensorDescriptor(dsc);
+			}
+			xDesc = null;
+		}
+		if(yDesc != null) {
+			for(cudnnTensorDescriptor dsc : yDesc) {
+				cudnnDestroyTensorDescriptor(dsc);
+			}
+			yDesc = null;
+		}
+		if(sizeInBytes != 0) {
+			try {
+				gCtx.cudaFreeHelper(instName, workSpace, DMLScript.EAGER_CUDA_FREE);
+			} catch (DMLRuntimeException e) {
+				throw new RuntimeException(e);
+			}
+		}
+		if(reserveSpaceSizeInBytes != 0) {
+			try {
+				gCtx.cudaFreeHelper(instName, reserveSpace, DMLScript.EAGER_CUDA_FREE);
+			} catch (DMLRuntimeException e) {
+				throw new RuntimeException(e);
+			}
+		}
+	}
+}
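The (D+M+2)*4*M check in the constructor above matches the parameter count of a single-layer unidirectional cuDNN LSTM: four gates, each with a D-by-M input projection, an M-by-M recurrent projection, and two length-M bias vectors (cuDNN keeps separate input-side and recurrent-side biases). A small sanity-check sketch of that arithmetic (illustration only, not part of the patch):

    // Expected parameter count for a single-layer unidirectional LSTM with
    // numFeatures=D and hiddenSize=M, matching the (D+M+2)*4*M check above.
    static long expectedLstmParams(long D, long M) {
      long perGate = D * M   // input-to-hidden weights
                   + M * M   // hidden-to-hidden (recurrent) weights
                   + 2 * M;  // two bias vectors (input and recurrent)
      return 4 * perGate;    // an LSTM has 4 gates: i, f, g (cell), o
    }
    // expectedLstmParams(D, M) == (D + M + 2) * 4 * M for all D, M, since
    // 4*(D*M + M*M + 2*M) = 4*M*(D + M + 2).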
http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index 60b2352..adbbcb8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -466,4 +466,4 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 	private static int reverseCusparseOp(int trans) {
 		return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 3bd101c..39371e6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -224,4 +224,4 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions
 			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, totalTime);
 		}
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index f7bee4f..18270cc 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -240,4 +240,4 @@ public class GPUStatistics {
 	}
 
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/276065f9/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala
index 3e7aff3..869f1a1 100644
--- a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala
+++ b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala
@@ -1603,4 +1603,4 @@ class DeConvolution(val param: LayerParameter, val id: Int, val net: CaffeNetwor
     if (convParam.hasPadW) convParam.getPadW.toString
     else if (convParam.getPadCount > 0) convParam.getPad(0).toString
     else "0"
-}
\ No newline at end of file
+}
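One detail worth calling out in processBatchNorm2dInstruction above: the instruction passes 1 minus the script-supplied factor to cuDNN. Assuming cuDNN's documented update rule, where exponentialAverageFactor weights the current minibatch statistics, this flip makes a DML-level mu weight the old running statistics. A hedged sketch of the implied update (illustration only, not part of the patch):

    // Sketch of the running-mean update implied by the 1 - mu flip, assuming
    // cuDNN's documented rule: running = running*(1-factor) + batchMean*factor.
    // With factor = 1 - mu this becomes the familiar convention
    // running = mu*running + (1-mu)*batchMean.
    static double updateRunningMean(double running, double batchMean, double mu) {
      double factor = 1.0 - mu; // what the instruction passes to cuDNN
      return running * (1.0 - factor) + batchMean * factor;
    }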
