Repository: systemml
Updated Branches:
  refs/heads/master 77c98d693 -> e2dc85688
[SYSTEMML-445] Removed unnecessary long-to-int conversion in LSTM

- Minor cleanup of the GPUObject class.
- Also, fixed incorrect forced GPU configuration flag.

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e2dc8568
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e2dc8568
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e2dc8568

Branch: refs/heads/master
Commit: e2dc8568855d353265ac4e0755b9ac3d2b30b1d8
Parents: 77c98d6
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Thu Sep 13 11:17:33 2018 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Thu Sep 13 11:17:33 2018 -0700

----------------------------------------------------------------------
 .../apache/sysml/conf/ConfigurationManager.java |  2 +-
 .../instructions/gpu/DnnGPUInstruction.java     | 20 +++---
 .../instructions/gpu/context/CSRPointer.java    |  8 ---
 .../gpu/context/ExecutionConfig.java            |  4 +-
 .../gpu/context/GPUMemoryManager.java           | 12 +++-
 .../instructions/gpu/context/GPUObject.java     | 72 ++++++++++----------
 .../runtime/matrix/data/LibMatrixCuDNN.java     | 38 +++++++----
 .../matrix/data/LibMatrixCuDNNRnnAlgorithm.java | 56 ++++-----------
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  3 +-
 9 files changed, 100 insertions(+), 115 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/ConfigurationManager.java b/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
index d9f1906..96c3885 100644
--- a/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
+++ b/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
@@ -258,7 +258,7 @@ public class ConfigurationManager
      * @return true if GPU is enabled in forced mode
      */
     public static boolean isForcedGPU() {
-        return _ldmlOptions.get().isGPU();
+        return _ldmlOptions.get().isForceGPU();
     }
 
     /**
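The one-line fix above is easy to miss but consequential: isForcedGPU() previously
delegated to isGPU(), so the "force GPU" setting was indistinguishable from merely
enabling the GPU. A minimal sketch of the distinction, with illustrative fields and
main method (only the two getter names come from the diff):

    // Minimal sketch, not commit code: only the getter names mirror DMLOptions.
    public class ForcedGpuFlagSketch {
        static class Options {
            boolean gpu;      // GPU enabled: use GPU plans where available
            boolean forceGPU; // GPU forced: prefer GPU plans unconditionally
            boolean isGPU()      { return gpu; }
            boolean isForceGPU() { return forceGPU; }
        }
        public static void main(String[] args) {
            Options opts = new Options();
            opts.gpu = true; // enabled, but not forced
            // The buggy delegate reported "forced" whenever the GPU was merely enabled:
            System.out.println("buggy isForcedGPU() = " + opts.isGPU());      // true
            System.out.println("fixed isForcedGPU() = " + opts.isForceGPU()); // false
        }
    }
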
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
index d620de9..6094b6c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -595,18 +595,18 @@ public class DnnGPUInstruction extends GPUInstruction {
     private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException {
         MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
-        int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0: (N, M)
+        long M = out0.getNumColumns(); // hiddenSize .. since out0: (N, M)
         Pointer out0Pointer = LibMatrixCUDA.getDensePointer(gCtx, out0, instName);
 
         MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
         MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
         long numRowsW = W.getNumRows();
-        int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures
+        long D = numRowsW - M; // since W:(D+M, 4M) ... numFeatures
         Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instName, D+M, 4*M);
         Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instName, 1, 4*M);
         Pointer cudnnWPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
         LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
-                ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+                ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
                 sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
         ec.releaseMatrixInputForGPUInstruction(_input2.getName());
         ec.releaseMatrixInputForGPUInstruction(_input3.getName());
@@ -619,7 +619,7 @@ public class DnnGPUInstruction extends GPUInstruction {
         int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
         Pointer cudnnInput = gCtx.allocate(instName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
         LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
-                ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+                ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
                 xPointer, cudnnInput, N, D, T*D, N*T*D);
         ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 
@@ -656,18 +656,19 @@ public class DnnGPUInstruction extends GPUInstruction {
         // previous output out0 (also represented by hx) and cell state c0 (also represented by cx): (N, M) ==> (1, M, N)
         // out: (N, T*M) or (N, M) ==> (T, M, N)
         MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
-        int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0: (N, M)
+        long M = out0.getNumColumns(); // hiddenSize .. since out0: (N, M)
         Pointer out0Pointer = LibMatrixCUDA.getDensePointer(gCtx, out0, instName);
         MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
         MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
         long numRowsW = W.getNumRows();
-        int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures
+        long D = numRowsW - M; // since W:(D+M, 4M) ... numFeatures
+        
         Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instName, D+M, 4*M);
         Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instName, 1, 4*M);
         Pointer cudnnWPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
         LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
-                ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+                ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
                 sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
         ec.releaseMatrixInputForGPUInstruction(_input2.getName());
         ec.releaseMatrixInputForGPUInstruction(_input3.getName());
@@ -682,13 +683,14 @@ public class DnnGPUInstruction extends GPUInstruction {
         int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
         Pointer cudnnInput = gCtx.allocate(instName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
         LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
-                ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+                ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
                 xPointer, cudnnInput, N, D, T*D, N*T*D);
         ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 
         Pointer c0Pointer = LibMatrixCUDA.getDensePointer(gCtx, getMatrixInputForGPUInstruction(ec, _input5.getName()), instName);
 
-        LibMatrixCuDNN.lstm(ec, gCtx, instName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(), N, M, D, T);
+        LibMatrixCuDNN.lstm(ec, gCtx, instName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(),
+                toInt(N), toInt(M), toInt(D), toInt(T));
         gCtx.cudaFreeHelper(instName, cudnnWPointer, gCtx.EAGER_CUDA_FREE);
         gCtx.cudaFreeHelper(instName, cudnnInput, gCtx.EAGER_CUDA_FREE);
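The pattern above recurs throughout the commit: dimensions such as M and D now stay
long, and the narrowing to int is deferred to the kernel-launch boundary, where the
checked toInt(...) conversion can still reject an oversized cell count. A sketch of
such a checked cast (the actual SystemML helper may be implemented differently):

    // Sketch of an overflow-checked long-to-int cast with the same intent as the
    // toInt(...) helper used in the hunks above; assumed, not copied from SystemML.
    static int toInt(long val) {
        if (val > Integer.MAX_VALUE || val < Integer.MIN_VALUE)
            throw new IllegalArgumentException("Cannot cast to int: " + val);
        return (int) val;
    }

Casting early, as the removed "int D = toInt(numRowsW) - M" did, lets later products
such as (D+M+2)*(4*M) overflow in int arithmetic before any check can run; computing
in long and converting only the final cell count keeps the check meaningful.
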
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index d7e38b9..135e0b1 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -476,14 +476,6 @@ public class CSRPointer {
     }
 
     /**
-     * Calls cudaFree lazily on the allocated {@link Pointer} instances
-     *
-     */
-    public void deallocate() {
-        deallocate(getGPUContext().EAGER_CUDA_FREE);
-    }
-
-    /**
      * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
      *
      * @param eager whether to do eager or lazy cudaFrees


http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index 872fef7..d35e813 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -68,8 +68,8 @@ public class ExecutionConfig {
      * @return execution configuration
      */
     public static ExecutionConfig getConfigForSimpleVectorOperations(int numCells) {
-        if(numCells == 0)
-            throw new DMLRuntimeException("Attempting to invoke a kernel with 0 threads");
+        if(numCells <= 0)
+            throw new DMLRuntimeException("Attempting to invoke a kernel with " + numCells + " threads");
         int deviceNumber = 0;
         int blockDimX = getMaxBlockDim(deviceNumber);
         int gridDimX = (int) Math.ceil((double) numCells / blockDimX);
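For context, getConfigForSimpleVectorOperations assigns one thread per cell and
rounds up to whole blocks; the strengthened guard now also rejects the negative
counts that a silent int overflow upstream would produce, and reports the offending
value. A worked sketch of the launch math, assuming a block size of 256 for
getMaxBlockDim(deviceNumber):

    // Worked sketch of the launch-configuration math above; blockDimX = 256 is an
    // assumed, device-dependent value.
    int numCells = 1_000_000;
    if (numCells <= 0) // catches 0 as well as negative values from int overflow
        throw new RuntimeException("Attempting to invoke a kernel with " + numCells + " threads");
    int blockDimX = 256;
    int gridDimX = (int) Math.ceil((double) numCells / blockDimX); // 3907 blocks
    // 3907 * 256 = 1,000,192 threads; the kernel is expected to bounds-check the extra 192.
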
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index e01c71a..509aafe 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -53,7 +53,7 @@ public class GPUMemoryManager {
     // Developer flag: Use this flag to check for GPU memory leak in SystemML.
     // This has an additional overhead of maintaining stack trace of all the allocated GPU pointers via PointerInfo class.
     private static final boolean DEBUG_MEMORY_LEAK = false;
-    private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10}; // Avoids printing too much text while debuggin
+    private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10, 11}; // Avoids printing too much text while debugging
 
     private final boolean PRINT_GPU_MEMORY_INFO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
 
@@ -86,7 +86,15 @@ public class GPUMemoryManager {
     private Set<Pointer> getNonMatrixLockedPointers() {
         Set<Pointer> managedPointers = matrixMemoryManager.getPointers();
         managedPointers.addAll(lazyCudaFreeMemoryManager.getAllPointers());
-        return nonIn(allPointers.keySet(), managedPointers);
+        Set<Pointer> superSet = allPointers.keySet();
+        Set<Pointer> ret = nonIn(superSet, managedPointers);
+        if(DEBUG_MEMORY_LEAK) {
+            System.out.println(
+                    ret.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + " = notIn(>>>" +
+                    superSet.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + ">>>, <<<" +
+                    managedPointers.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + ">>>)");
+        }
+        return ret;
     }
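The new debug branch prints a plain set difference: every live allocation known to
the manager (allPointers) minus those accounted for by the matrix memory manager or
the lazy-free list; whatever remains is a leak candidate. A sketch of the nonIn(...)
semantics, assuming a HashSet-based implementation:

    import java.util.HashSet;
    import java.util.Set;

    // Sketch of the set difference computed by nonIn(superSet, managedPointers);
    // the actual helper in GPUMemoryManager may differ.
    static <T> Set<T> nonIn(Set<T> superSet, Set<T> subSet) {
        Set<T> ret = new HashSet<>(superSet); // copy, so the callers' sets are untouched
        ret.removeAll(subSet);                // keep only the unmanaged entries
        return ret;
    }
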
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index cfab0d4..1564f48 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -91,11 +91,6 @@ public class GPUObject {
     AtomicLong timestamp = new AtomicLong();
 
     /**
-     * Whether this block is in sparse format
-     */
-    protected boolean isSparse = false;
-
-    /**
      * Enclosing {@link MatrixObject} instance
      */
     MatrixObject mat = null;
@@ -131,10 +126,29 @@ public class GPUObject {
 
     /**
      * Removes the dense pointer and potential soft reference
+     *
+     * @param opcode opcode of the instruction
+     * @param eager whether to delete eagerly
      */
-    public void clearDensePointer() {
-        jcudaDenseMatrixPtr = null;
+    public void clearDensePointer(String opcode, boolean eager) {
+        if (!isDensePointerNull()) {
+            getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
+        }
         shadowBuffer.clearShadowPointer();
+        jcudaDenseMatrixPtr = null;
+    }
+
+    /**
+     * Removes the sparse pointer
+     *
+     * @param opcode opcode of the instruction
+     * @param eager whether to delete eagerly
+     */
+    public void clearSparsePointer(String opcode, boolean eager) {
+        if (getJcudaSparseMatrixPtr() != null) {
+            getJcudaSparseMatrixPtr().deallocate(eager);
+        }
+        jcudaSparseMatrixPtr = null;
     }
 
@@ -147,14 +161,14 @@ public class GPUObject {
         if (!this.isDensePointerNull()) {
             throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
         }
+        clearSparsePointer(null, true);
         this.jcudaDenseMatrixPtr = densePtr;
-        this.isSparse = false;
         if(LOG.isDebugEnabled()) {
             LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
         }
-        if (getJcudaSparseMatrixPtr() != null) {
-            getJcudaSparseMatrixPtr().deallocate();
-            jcudaSparseMatrixPtr = null;
+        if(!gpuContext.getMemoryManager().getGPUMatrixMemoryManager().gpuObjects.contains(this)) {
+            // Double-check if the matrix manager still has the current GPU object in case of eviction.
+            gpuContext.getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(this);
         }
     }
     // ----------------------------------------------------------------------
@@ -170,7 +184,6 @@ public class GPUObject {
         that.writeLock = false;
 
         that.timestamp = new AtomicLong(me.timestamp.get());
-        that.isSparse = me.isSparse;
 
         try {
             if (!me.isDensePointerNull()) {
@@ -197,10 +210,6 @@ public class GPUObject {
         return getGPUContext().allocate(null, size);
     }
 
-    private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
-        getGPUContext().cudaFreeHelper(null, toFree, gpuContext.EAGER_CUDA_FREE);
-    }
-
     public GPUContext getGPUContext() {
         return gpuContext;
     }
@@ -300,11 +309,11 @@ public class GPUObject {
         if (this.jcudaSparseMatrixPtr != null) {
             throw new DMLRuntimeException("jcudaSparseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
         }
+        clearDensePointer(null, true);
         this.jcudaSparseMatrixPtr = sparseMatrixPtr;
-        this.isSparse = true;
-        if (!isDensePointerNull() && !shadowBuffer.isBuffered()) {
-            cudaFreeHelper(getDensePointer());
-            clearDensePointer();
+        if(!gpuContext.getMemoryManager().getGPUMatrixMemoryManager().gpuObjects.contains(this)) {
+            // Double-check if the matrix manager still has the current GPU object in case of eviction.
+            gpuContext.getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(this);
         }
     }
 
@@ -354,8 +363,7 @@ public class GPUObject {
         }
 
         Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
-        cudaFreeHelper(getDensePointer());
-        clearDensePointer();
+        clearDensePointer(null, true);
         setDensePointer(tmp);
     }
 
@@ -376,8 +384,7 @@ public class GPUObject {
         }
 
         Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
-        cudaFreeHelper(getDensePointer());
-        clearDensePointer();
+        clearDensePointer(null, true);
         setDensePointer(tmp);
     }
 
@@ -446,7 +453,7 @@ public class GPUObject {
     }
 
     public boolean isSparse() {
-        return isSparse;
+        return jcudaSparseMatrixPtr != null;
     }
 
     private static long getDatatypeSizeOf(long numElems) {
@@ -602,7 +609,6 @@ public class GPUObject {
             LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
         }
         boolean allocated = false;
-        isSparse = true;
         if (!isAllocated()) {
             if(LOG.isTraceEnabled()) {
                 LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
@@ -995,22 +1001,15 @@ public class GPUObject {
      * Clears the data associated with this {@link GPUObject} instance
      *
      * @param opcode opcode of the instruction
-     * @param eager whether to be done synchronously or asynchronously
+     * @param eager whether to delete eagerly
      * @throws DMLRuntimeException if error occurs
      */
     public void clearData(String opcode, boolean eager) throws DMLRuntimeException {
         if(LOG.isTraceEnabled()) {
             LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext());
         }
-        if (!isDensePointerNull()) {
-            getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
-        }
-        if (getJcudaSparseMatrixPtr() != null) {
-            getJcudaSparseMatrixPtr().deallocate(eager);
-        }
-        clearDensePointer();
-        shadowBuffer.clearShadowPointer();
-        jcudaSparseMatrixPtr = null;
+        clearDensePointer(opcode, eager);
+        clearSparsePointer(opcode, eager);
         resetReadWriteLock();
         getGPUContext().getMemoryManager().removeGPUObject(this);
     }
@@ -1039,7 +1038,6 @@ public class GPUObject {
         sb.append(", dirty=").append(dirty);
         sb.append(", readLocks=").append(readLocks.longValue());
         sb.append(", writeLock=").append(writeLock);
-        sb.append(", sparse? ").append(isSparse);
         sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
         if(!isDensePointerNull())
             sb.append(", densePtr=").append(getDensePointer());
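Taken together, the GPUObject changes remove a manually maintained isSparse flag in
favor of state derived from which pointer is non-null, and funnel all device frees
through the two clear*Pointer methods. A condensed sketch of the resulting invariant
(placeholder types, not commit code):

    // Condensed sketch: sparsity is derived from pointer state, so flag and pointers
    // can never disagree, and each setter clears the other representation first.
    class GpuBufferSketch {
        private Object densePtr;  // stands in for the jcuda dense Pointer
        private Object sparsePtr; // stands in for the CSRPointer

        boolean isSparse() { return sparsePtr != null; } // no separate flag to keep in sync

        void setDense(Object p)  { clearSparse(); densePtr = p; }
        void setSparse(Object p) { clearDense(); sparsePtr = p; }

        void clearDense()  { /* cudaFree the dense pointer here */ densePtr = null; }
        void clearSparse() { /* deallocate the CSR pointer here */ sparsePtr = null; }
    }
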
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index e7955e1..8051cbc 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -849,14 +849,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
     static Pointer getDenseInputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String inputName,
             long numRows, long numCols) throws DMLRuntimeException {
         MatrixObject output = ec.getMatrixInputForGPUInstruction(inputName, instName);
-        return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+        return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols);
     }
 
     static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
             long numRows, long numCols) throws DMLRuntimeException {
         MatrixObject output = ec.getMatrixObject(outputName);
         getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocated the dense output matrix
-        return getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+        return getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols);
     }
 
     /**
@@ -890,9 +890,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
             String outputName, String cyName,  // output
             String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
         boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
+        if(LOG.isDebugEnabled()) {
+            long memRequired = (N*T*M + 2*N*M + N*T*M)*sizeOfDataType;
+            LOG.debug("Memory required for invoking lstmForward is " + memRequired + " bytes + workspace + reserve space + memory for descriptors.");
+        }
+        
         // Get output pointers
         Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-        Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
+        Pointer hyPointer = return_sequences ? gCtx.allocate(instName, N*M*sizeOfDataType) : getDenseOutputPointer(ec, gCtx, instName, outputName, N, M);
         Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
         // Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
 
@@ -922,20 +927,27 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
     public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
             Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName,  // input
             String dxName, String dwName, String dbName, String dhxName, String dcxName,  // output
-            boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
+            boolean return_sequences, long N, long M, long D, long T) throws DMLRuntimeException {
+        
+        if(LOG.isDebugEnabled()) {
+            long memRequired = (N*T*M + (return_sequences ? T*M : M) + N*T*M + 2*N*T*D + (D+M+2)*(4*M))*sizeOfDataType;
+            LOG.debug("Memory required for invoking lstmBackward is " + memRequired + " bytes + workspace + reserve space + memory for descriptors.");
+        }
+        
         // Transform the input dout and prepare them for cudnnRNNBackwardData
         Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-        int size = return_sequences ? N*T*M : N*M;
+        long size = return_sequences ? N*T*M : N*M;
         LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
-                ExecutionConfig.getConfigForSimpleVectorOperations(size),
+                ExecutionConfig.getConfigForSimpleVectorOperations(toInt(size)),
                 getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
                 dy, N, T, M, size, return_sequences ? 1 : 0);
         ec.releaseMatrixInputForGPUInstruction(doutName);
 
         // Allocate intermediate pointers computed by forward
         Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-        try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
-            JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T,
+        try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", toInt(N), toInt(T),
+                toInt(M), toInt(D), true, wPointer)) {
+            JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T),
                     algo.xDesc, x,
                     algo.hxDesc, hx,
                     algo.cxDesc, cx,
@@ -947,7 +959,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                     algo.reserveSpace, algo.reserveSpaceSizeInBytes);
 
             Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
-            JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T,
+            JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T),
                     algo.yDesc, yPointer,
                     // ----------------------
                     // Additional inputs:
@@ -973,14 +985,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 
             Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
             LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
-                    ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+                    ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
                     smlDx, cudnnDx, N, D, T*D, N*T*D);
             ec.releaseMatrixOutputForGPUInstruction(dxName);
             gCtx.cudaFreeHelper(instName, cudnnDx, gCtx.EAGER_CUDA_FREE);
             // -------------------------------------------------------------------------------------------
 
             Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
-            JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T,
+            JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T),
                     algo.xDesc, x,
                     algo.hxDesc, hx,
                     algo.yDesc, yPointer,
@@ -988,7 +1000,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                     algo.dwDesc, cudnnDwPointer,
                     algo.reserveSpace, algo.reserveSpaceSizeInBytes);
             LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
-                    ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+                    ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
                     getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
             gCtx.cudaFreeHelper(instName, cudnnDwPointer, gCtx.EAGER_CUDA_FREE);
@@ -1242,7 +1254,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
      * @return jcuda pointer
      * @throws DMLRuntimeException if error occurs while sparse to dense conversion
      */
-    public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, int numRows, int numCols) throws DMLRuntimeException {
+    public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, long numRows, long numCols) throws DMLRuntimeException {
         long numElems = image.getNumRows()*image.getNumColumns();
         if(image.getNumRows() != numRows || image.getNumColumns() != numCols) {
             throw new DMLRuntimeException("Expected input of size:[" + numRows + ", " + numCols + "], but found [" + image.getNumRows() + ", " + image.getNumColumns() + "].");
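For scale, a worked example of the lstmForward debug estimate in long arithmetic;
the dimensions and the 8-byte data type are assumed, not taken from the commit:

    // Assumed sizes: batch N=64, sequence length T=100, hidden size M=256, doubles.
    long N = 64, T = 100, M = 256, sizeOfDataType = 8;
    long memRequired = (N*T*M + 2*N*M + N*T*M) * sizeOfDataType;
    System.out.println(memRequired + " bytes"); // 26476544 bytes, about 25 MB,
    // excluding cuDNN workspace, reserve space, and descriptor memory.

Computing the estimate in long matters for the same reason as the rest of the commit:
for large inputs the product N*T*M overflows int arithmetic long before the
allocation itself would fail.
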
").append(isSparse); sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]"); if(!isDensePointerNull()) sb.append(", densePtr=").append(getDensePointer()); http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java index e7955e1..8051cbc 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java @@ -849,14 +849,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { static Pointer getDenseInputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String inputName, long numRows, long numCols) throws DMLRuntimeException { MatrixObject output = ec.getMatrixInputForGPUInstruction(inputName, instName); - return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols)); + return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols); } static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName, long numRows, long numCols) throws DMLRuntimeException { MatrixObject output = ec.getMatrixObject(outputName); getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocated the dense output matrix - return getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols)); + return getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols); } /** @@ -890,9 +890,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { String outputName, String cyName, // output String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException { boolean hasCarry = rnnMode.equalsIgnoreCase("lstm"); + if(LOG.isDebugEnabled()) { + long memRequired = (N*T*M + 2*N*M + N*T*M)*sizeOfDataType; + LOG.debug("Memory required for invoking lstmForward is " + memRequired + " bytes + workspace + reserve space + memory for descriptors."); + } + // Get output pointers Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType); - Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType); + Pointer hyPointer = return_sequences ? gCtx.allocate(instName, N*M*sizeOfDataType) : getDenseOutputPointer(ec, gCtx, instName, outputName, N, M); Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer(); // Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M); @@ -922,20 +927,27 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName, Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName, // input String dxName, String dwName, String dbName, String dhxName, String dcxName, // output - boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException { + boolean return_sequences, long N, long M, long D, long T) throws DMLRuntimeException { + + if(LOG.isDebugEnabled()) { + long memRequired = (N*T*M + (return_sequences ? 
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 7af164e..25423c1 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -516,7 +516,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
         //this method is used as a short-hand for all operations that
         //guaranteed only deal with dense blocks of a single block.
         if( denseBlock != null && denseBlock.numBlocks() > 1 ) {
-            throw new RuntimeException("Large dense in-memory block (with numblocks="+denseBlock.numBlocks()+") "
+            throw new RuntimeException("Large dense in-memory block (with numblocks="+denseBlock.numBlocks()+ ") with "
+                + "dimensions [" + getNumRows() + ", " + getNumColumns() + "] "
                 + "allocated but operation access to first block only, which might cause incorrect results.");
         }
         return (denseBlock != null) ? denseBlock.valuesAt(0) : null;
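The extended error message is useful because this guard only fires for multi-block
dense matrices, which arise when a dense block holds more cells than one int-indexed
Java array can address; printing the dimensions makes that case obvious at a glance.
A worked example of when numBlocks() exceeds one (the exact splitting threshold is a
DenseBlock implementation detail):

    // Java arrays are int-indexed, so a dense block with more than
    // Integer.MAX_VALUE (2^31 - 1) cells cannot fit in a single array.
    long rows = 100_000, cols = 50_000;
    long cells = rows * cols;                       // 5,000,000,000
    boolean multiBlock = cells > Integer.MAX_VALUE; // true -> numBlocks() > 1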