Repository: systemml
Updated Branches:
  refs/heads/master 77c98d693 -> e2dc85688


[SYSTEMML-445] Removed unnecessary long-to-int conversions in LSTM

- Minor cleanup of the GPUObject class.
- Also, fixed the incorrect forced-GPU configuration flag: isForcedGPU() previously returned the plain GPU flag instead of the force-GPU flag.
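
The pattern behind the conversion cleanup, as a minimal self-contained sketch (illustrative Java, not SystemML's actual toInt helper): dimensions stay long throughout and are narrowed to int only at kernel-launch boundaries, where an explicit range check turns silent truncation into a clear error.

    public final class SafeCast {
        private SafeCast() {}

        // Narrow a long dimension to int, failing loudly on overflow.
        public static int toInt(long val) {
            if (val > Integer.MAX_VALUE || val < Integer.MIN_VALUE)
                throw new ArithmeticException("Cannot cast " + val + " to int without overflow");
            return (int) val;
        }

        public static void main(String[] args) {
            System.out.println(toInt(1024L));                   // fine: 1024
            System.out.println(toInt(3L * 1024 * 1024 * 1024)); // throws: exceeds Integer.MAX_VALUE
        }
    }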


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e2dc8568
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e2dc8568
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e2dc8568

Branch: refs/heads/master
Commit: e2dc8568855d353265ac4e0755b9ac3d2b30b1d8
Parents: 77c98d6
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Thu Sep 13 11:17:33 2018 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Thu Sep 13 11:17:33 2018 -0700

----------------------------------------------------------------------
 .../apache/sysml/conf/ConfigurationManager.java |  2 +-
 .../instructions/gpu/DnnGPUInstruction.java     | 20 +++---
 .../instructions/gpu/context/CSRPointer.java    |  8 ---
 .../gpu/context/ExecutionConfig.java            |  4 +-
 .../gpu/context/GPUMemoryManager.java           | 12 +++-
 .../instructions/gpu/context/GPUObject.java     | 72 ++++++++++----------
 .../runtime/matrix/data/LibMatrixCuDNN.java     | 38 +++++++----
 .../matrix/data/LibMatrixCuDNNRnnAlgorithm.java | 56 ++++-----------
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  3 +-
 9 files changed, 100 insertions(+), 115 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/ConfigurationManager.java b/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
index d9f1906..96c3885 100644
--- a/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
+++ b/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
@@ -258,7 +258,7 @@ public class ConfigurationManager
         * @return true if GPU is enabled in forced mode
         */
        public static boolean isForcedGPU() {
-               return _ldmlOptions.get().isGPU();
+               return _ldmlOptions.get().isForceGPU();
        }
        
        /**

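The one-line fix above matters because isForcedGPU() previously mirrored the plain GPU flag, making forced mode indistinguishable from normal GPU mode. A hedged sketch of how a caller might consult the corrected flag (ConfigurationManager.isGPU() is assumed here as the non-forced sibling accessor; only isForcedGPU() is visible in this hunk):

    // Illustrative only: forced mode places every eligible operator on the GPU,
    // while plain GPU mode leaves the per-operator choice to the optimizer.
    static String chooseDevicePolicy() {
        if (ConfigurationManager.isForcedGPU())
            return "force-gpu";      // before this fix, true whenever plain GPU mode was on
        if (ConfigurationManager.isGPU())
            return "gpu-if-cheaper";
        return "cpu";
    }
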
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
index d620de9..6094b6c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -595,18 +595,18 @@ public class DnnGPUInstruction extends GPUInstruction {
        
        private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException {
                MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
-               int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0: (N, M)
+               long M = out0.getNumColumns(); // hiddenSize .. since out0: (N, M)
                Pointer out0Pointer =  LibMatrixCUDA.getDensePointer(gCtx, out0, instName);
                
                MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
                MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
                long numRowsW = W.getNumRows();
-               int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures 
+               long D = numRowsW - M; // since W:(D+M, 4M) ... numFeatures 
                Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instName, D+M, 4*M);
                Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instName, 1, 4*M);
                Pointer cudnnWPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
                LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
-                               ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+                               ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
                                sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
                ec.releaseMatrixInputForGPUInstruction(_input3.getName());
@@ -619,7 +619,7 @@ public class DnnGPUInstruction extends GPUInstruction {
                int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
                Pointer cudnnInput = gCtx.allocate(instName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
                LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
-                               ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+                               ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
                                xPointer, cudnnInput, N, D, T*D, N*T*D);
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                
@@ -656,18 +656,19 @@ public class DnnGPUInstruction extends GPUInstruction {
                // previous output out0 (also represented by hx) and cell state c0 (also represented by cx): (N, M) ==> (1, M, N)
                // out: (N, T*M) or (N, M) ==> (T, M, N)
                MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
-               int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0: (N, M)
+               long M = out0.getNumColumns(); // hiddenSize .. since out0: (N, M)
                Pointer out0Pointer =  LibMatrixCUDA.getDensePointer(gCtx, out0, instName);
                
                MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
                MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
                long numRowsW = W.getNumRows();
-               int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures 
+               long D = numRowsW - M; // since W:(D+M, 4M) ... numFeatures
+               
                Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instName, D+M, 4*M);
                Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instName, 1, 4*M);
                Pointer cudnnWPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
                LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
-                               ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+                               ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
                                sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
                ec.releaseMatrixInputForGPUInstruction(_input3.getName());
@@ -682,13 +683,14 @@ public class DnnGPUInstruction extends GPUInstruction {
                int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
                Pointer cudnnInput = gCtx.allocate(instName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
                LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
-                               ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+                               ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
                                xPointer, cudnnInput, N, D, T*D, N*T*D);
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                
                Pointer c0Pointer = LibMatrixCUDA.getDensePointer(gCtx, getMatrixInputForGPUInstruction(ec, _input5.getName()), instName); 
                
-               LibMatrixCuDNN.lstm(ec, gCtx, instName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(), N, M, D, T);
+               LibMatrixCuDNN.lstm(ec, gCtx, instName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(), 
+                               toInt(N), toInt(M), toInt(D), toInt(T));
                gCtx.cudaFreeHelper(instName, cudnnWPointer, gCtx.EAGER_CUDA_FREE);
                gCtx.cudaFreeHelper(instName, cudnnInput, gCtx.EAGER_CUDA_FREE);
                

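The switch to long arithmetic above is not cosmetic: with int operands, a product like N*T*D can silently wrap before it ever reaches toInt. A self-contained demonstration with invented sizes (not values from this commit):

    public class OverflowDemo {
        public static void main(String[] args) {
            int N = 512, T = 64, D = 131072;   // invented batch/seq/feature sizes
            int bad = N * T * D;               // int math wraps: 2^32 overflows to 0
            long good = (long) N * T * D;      // promote first, then multiply
            System.out.println(bad);           // 0
            System.out.println(good);          // 4294967296
        }
    }
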
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index d7e38b9..135e0b1 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -476,14 +476,6 @@ public class CSRPointer {
        }
 
        /**
-        * Calls cudaFree lazily on the allocated {@link Pointer} instances
-        *
-        */
-       public void deallocate() {
-               deallocate(getGPUContext().EAGER_CUDA_FREE);
-       }
-
-       /**
         * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
         *
         * @param eager whether to do eager or lazy cudaFrees

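With the no-argument overload removed, every call site now states the free policy explicitly. A hedged usage sketch (the releaseCsr wrapper is hypothetical; CSRPointer, GPUContext and the EAGER_CUDA_FREE field are from this repository):

    import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
    import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;

    class CsrRelease {
        // Free the CSR buffers either immediately or via the lazy cudaFree manager.
        static void releaseCsr(CSRPointer csr, GPUContext gCtx) {
            csr.deallocate(gCtx.EAGER_CUDA_FREE); // eager: cudaFree now
            // csr.deallocate(false);             // lazy: defer for potential reuse
        }
    }
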
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index 872fef7..d35e813 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -68,8 +68,8 @@ public class ExecutionConfig {
         * @return execution configuration
         */
        public static ExecutionConfig getConfigForSimpleVectorOperations(int numCells) {
-               if(numCells == 0)
-                       throw new DMLRuntimeException("Attempting to invoke a kernel with 0 threads");
+               if(numCells <= 0)
+                       throw new DMLRuntimeException("Attempting to invoke a kernel with " + numCells + " threads");
                int deviceNumber = 0;
                int blockDimX = getMaxBlockDim(deviceNumber);
                int gridDimX = (int) Math.ceil((double) numCells / blockDimX);

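The guard change from == 0 to <= 0 also rejects the negative counts produced by an upstream int overflow, and the message now reports the offending value. A minimal re-creation of the launch-config math under the assumption of a fixed block dimension of 256 (the real code queries the device via getMaxBlockDim):

    // Minimal sketch of getConfigForSimpleVectorOperations' grid sizing.
    static int gridDimX(int numCells) {
        if (numCells <= 0) // 0 threads, or a wrapped-negative count: fail with the value
            throw new IllegalArgumentException("Attempting to invoke a kernel with " + numCells + " threads");
        int blockDimX = 256; // assumed; the real code asks the device
        return (int) Math.ceil((double) numCells / blockDimX);
    }
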
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index e01c71a..509aafe 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -53,7 +53,7 @@ public class GPUMemoryManager {
        // Developer flag: Use this flag to check for GPU memory leak in SystemML.
        // This has an additional overhead of maintaining stack trace of all the allocated GPU pointers via PointerInfo class.
        private static final boolean DEBUG_MEMORY_LEAK = false;
-       private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10}; // Avoids printing too much text while debuggin
+       private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10, 11}; // Avoids printing too much text while debugging
        
        private final boolean PRINT_GPU_MEMORY_INFO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
        
@@ -86,7 +86,15 @@ public class GPUMemoryManager {
        private Set<Pointer> getNonMatrixLockedPointers() {
                Set<Pointer> managedPointers = matrixMemoryManager.getPointers();
                managedPointers.addAll(lazyCudaFreeMemoryManager.getAllPointers());
-               return nonIn(allPointers.keySet(), managedPointers);
+               Set<Pointer> superSet = allPointers.keySet();
+               Set<Pointer> ret = nonIn(superSet, managedPointers);
+               if(DEBUG_MEMORY_LEAK) {
+                       System.out.println(
+                               ret.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + " = notIn(>>>" + 
+                               superSet.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + ">>>, <<<" + 
+                               managedPointers.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + ">>>)");
+               }
+               return ret;
        }
        
        

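Note the debug branch assumes java.util.stream.Collectors is already imported in this file (the import is outside the hunk). The trace it prints is a plain set difference, reproducible standalone (nonIn below is a stand-in for SystemML's helper of the same name):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;
    import java.util.stream.Collectors;

    public class NonInDemo {
        // Stand-in for GPUMemoryManager.nonIn: elements of superSet not in exclude.
        static <T> Set<T> nonIn(Set<T> superSet, Set<T> exclude) {
            Set<T> ret = new HashSet<>(superSet);
            ret.removeAll(exclude);
            return ret;
        }

        public static void main(String[] args) {
            Set<String> all = new HashSet<>(Arrays.asList("p1", "p2", "p3"));
            Set<String> managed = new HashSet<>(Arrays.asList("p2"));
            Set<String> ret = nonIn(all, managed);
            System.out.println(ret.stream().collect(Collectors.joining(",")) + " = notIn(>>>"
                + all.stream().collect(Collectors.joining(",")) + ">>>, <<<"
                + managed.stream().collect(Collectors.joining(",")) + ">>>)");
        }
    }
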
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index cfab0d4..1564f48 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -91,11 +91,6 @@ public class GPUObject {
        AtomicLong timestamp = new AtomicLong();
 
        /**
-        * Whether this block is in sparse format
-        */
-       protected boolean isSparse = false;
-
-       /**
         * Enclosing {@link MatrixObject} instance
         */
        MatrixObject mat = null;
@@ -131,10 +126,29 @@ public class GPUObject {
        
        /**
         * Removes the dense pointer and potential soft reference
+        * 
+        * @param opcode opcode of the instruction
+        * @param eager whether to delete eagerly
         */
-       public void clearDensePointer() {
-               jcudaDenseMatrixPtr = null;
+       public void clearDensePointer(String opcode, boolean eager) {
+               if (!isDensePointerNull()) {
+                       getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
+               }
                shadowBuffer.clearShadowPointer();
+               jcudaDenseMatrixPtr = null;
+       }
+       
+       /**
+        * Removes the sparse pointer
+        * 
+        * @param opcode opcode of the instruction
+        * @param eager whether to delete eagerly
+        */
+       public void clearSparsePointer(String opcode, boolean eager) {
+               if (getJcudaSparseMatrixPtr() != null) {
+                       getJcudaSparseMatrixPtr().deallocate(eager);
+               }
+               jcudaSparseMatrixPtr = null;
        }
        
        
@@ -147,14 +161,14 @@ public class GPUObject {
                if (!this.isDensePointerNull()) {
                        throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
                }
+               clearSparsePointer(null, true);
                this.jcudaDenseMatrixPtr = densePtr;
-               this.isSparse = false;
                if(LOG.isDebugEnabled()) {
                        LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
                }
-               if (getJcudaSparseMatrixPtr() != null) {
-                       getJcudaSparseMatrixPtr().deallocate();
-                       jcudaSparseMatrixPtr = null;
+               if(!gpuContext.getMemoryManager().getGPUMatrixMemoryManager().gpuObjects.contains(this)) {
+                       // Double-check if the matrix manager still has the current GPU object in case of eviction.
+                       gpuContext.getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(this);
                }
        }
        // ----------------------------------------------------------------------
@@ -170,7 +184,6 @@ public class GPUObject {
                that.writeLock = false;
                
                that.timestamp = new AtomicLong(me.timestamp.get());
-               that.isSparse = me.isSparse;
 
                try {
                        if (!me.isDensePointerNull()) {
@@ -197,10 +210,6 @@ public class GPUObject {
                return getGPUContext().allocate(null, size);
        }
 
-       private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
-               getGPUContext().cudaFreeHelper(null, toFree, gpuContext.EAGER_CUDA_FREE);
-       }
-
        public GPUContext getGPUContext() {
                return gpuContext;
        }
@@ -300,11 +309,11 @@ public class GPUObject {
                if (this.jcudaSparseMatrixPtr != null) {
                        throw new DMLRuntimeException("jcudaSparseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
                }
+               clearDensePointer(null, true);
                this.jcudaSparseMatrixPtr = sparseMatrixPtr;
-               this.isSparse = true;
-               if (!isDensePointerNull() && !shadowBuffer.isBuffered()) {
-                       cudaFreeHelper(getDensePointer());
-                       clearDensePointer();
+               if(!gpuContext.getMemoryManager().getGPUMatrixMemoryManager().gpuObjects.contains(this)) {
+                       // Double-check if the matrix manager still has the current GPU object in case of eviction.
+                       gpuContext.getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(this);
                }
        }
        
@@ -354,8 +363,7 @@ public class GPUObject {
                }
 
                Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
-               cudaFreeHelper(getDensePointer());
-               clearDensePointer();
+               clearDensePointer(null, true);
                setDensePointer(tmp);
        }
 
@@ -376,8 +384,7 @@ public class GPUObject {
                }
 
                Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
-               cudaFreeHelper(getDensePointer());
-               clearDensePointer();
+               clearDensePointer(null, true);
                setDensePointer(tmp);
        }
 
@@ -446,7 +453,7 @@ public class GPUObject {
        }
 
        public boolean isSparse() {
-               return isSparse;
+               return jcudaSparseMatrixPtr != null;
        }
        
        private static long getDatatypeSizeOf(long numElems) {
@@ -602,7 +609,6 @@ public class GPUObject {
                        LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
                }
                boolean allocated = false;
-               isSparse = true;
                if (!isAllocated()) {
                        if(LOG.isTraceEnabled()) {
                                LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
@@ -995,22 +1001,15 @@ public class GPUObject {
         * Clears the data associated with this {@link GPUObject} instance
         *
         * @param opcode opcode of the instruction
-        * @param eager whether to be done synchronously or asynchronously
+        * @param eager whether to delete eagerly
         * @throws DMLRuntimeException if error occurs
         */
        public void clearData(String opcode, boolean eager) throws DMLRuntimeException {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext());
                }
-               if (!isDensePointerNull()) {
-                       getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
-               }
-               if (getJcudaSparseMatrixPtr() != null) {
-                       getJcudaSparseMatrixPtr().deallocate(eager);
-               }
-               clearDensePointer();
-               shadowBuffer.clearShadowPointer();
-               jcudaSparseMatrixPtr = null;
+               clearDensePointer(opcode, eager);
+               clearSparsePointer(opcode, eager);
                resetReadWriteLock();
                getGPUContext().getMemoryManager().removeGPUObject(this);
        }
@@ -1039,7 +1038,6 @@ public class GPUObject {
                sb.append(", dirty=").append(dirty);
                sb.append(", readLocks=").append(readLocks.longValue());
                sb.append(", writeLock=").append(writeLock);
-               sb.append(", sparse? ").append(isSparse);
                sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
                if(!isDensePointerNull())
                        sb.append(", densePtr=").append(getDensePointer());

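The net effect of the GPUObject changes: the storage format is now derived from which pointer is live, so the removed isSparse flag can never disagree with the actual allocation, and setDensePointer/setSparsePointer free the opposite representation before installing the new one. A reduced sketch of that invariant (names abbreviated; this is not the full class):

    // Reduced sketch: at most one representation is live, and the format is
    // a function of pointer state rather than a separately maintained flag.
    class GpuBufferSketch {
        private Object densePtr, sparsePtr; // stand-ins for jcuda Pointer / CSRPointer

        boolean isSparse() { return sparsePtr != null; } // derived, never stale

        void setDensePointer(Object p)  { sparsePtr = null; densePtr = p; }
        void setSparsePointer(Object p) { densePtr = null; sparsePtr = p; }

        void clearData() { densePtr = null; sparsePtr = null; }
    }
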
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index e7955e1..8051cbc 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -849,14 +849,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
        static Pointer getDenseInputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String inputName,
                        long numRows, long numCols) throws DMLRuntimeException {
                MatrixObject output = ec.getMatrixInputForGPUInstruction(inputName, instName);
-               return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+               return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols);
        }
        
        static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
                        long numRows, long numCols) throws DMLRuntimeException {
                MatrixObject output = ec.getMatrixObject(outputName);
                getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocated the dense output matrix
-               return getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+               return getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols);
        }
        
        /**
@@ -890,9 +890,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        String outputName, String cyName,                       // output
                        String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
                boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
+               if(LOG.isDebugEnabled()) {
+                       long memRequired = (N*T*M + 2*N*M + N*T*M)*sizeOfDataType;
+                       LOG.debug("Memory required for invoking lstmForward is " + memRequired + " bytes + workspace + reserve space + memory for descriptors.");
+               }
+               
                // Get output pointers
                Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-               Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
+               Pointer hyPointer = return_sequences ? gCtx.allocate(instName, N*M*sizeOfDataType) : getDenseOutputPointer(ec, gCtx, instName, outputName, N, M);
                Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
                // Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
                
@@ -922,20 +927,27 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
        public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
                        Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName,  // input
                        String dxName, String dwName, String dbName, String dhxName, String dcxName,    // output
-                       boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
+                       boolean return_sequences, long N, long M, long D, long T) throws DMLRuntimeException {
+               
+               if(LOG.isDebugEnabled()) {
+                       long memRequired = (N*T*M + (return_sequences ? T*M : M) + N*T*M + 2*N*T*D + (D+M+2)*(4*M))*sizeOfDataType;
+                       LOG.debug("Memory required for invoking lstmBackward is " + memRequired + " bytes + workspace + reserve space + memory for descriptors.");
+               }
+               
                // Transform the input dout and prepare them for cudnnRNNBackwardData
                Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-               int size = return_sequences ? N*T*M : N*M;
+               long size = return_sequences ? N*T*M : N*M;
                LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
-                               ExecutionConfig.getConfigForSimpleVectorOperations(size),
+                               ExecutionConfig.getConfigForSimpleVectorOperations(toInt(size)),
                                getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
                                dy, N, T, M, size, return_sequences ? 1 : 0);
                ec.releaseMatrixInputForGPUInstruction(doutName);
                                
                // Allocate intermediate pointers computed by forward
                Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-               try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
-                       JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
+               try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", toInt(N), toInt(T), 
+                               toInt(M), toInt(D), true, wPointer)) {
+                       JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T), 
                                        algo.xDesc, x, 
                                        algo.hxDesc, hx, 
                                        algo.cxDesc, cx, 
@@ -947,7 +959,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                                        algo.reserveSpace, algo.reserveSpaceSizeInBytes);
                        
                        Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
-                       JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
+                       JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T), 
                                        algo.yDesc, yPointer,
                                        // ----------------------
                                        // Additional inputs:
@@ -973,14 +985,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        
                        Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
                        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
-                                       ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+                                       ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
                                        smlDx, cudnnDx, N, D, T*D, N*T*D);
                        ec.releaseMatrixOutputForGPUInstruction(dxName);
                        gCtx.cudaFreeHelper(instName, cudnnDx, gCtx.EAGER_CUDA_FREE);
                        
                        // -------------------------------------------------------------------------------------------
                        Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
-                       JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
+                       JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T), 
                                        algo.xDesc, x, 
                                        algo.hxDesc, hx, 
                                        algo.yDesc, yPointer, 
@@ -988,7 +1000,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                                        algo.dwDesc, cudnnDwPointer, 
                                        algo.reserveSpace, algo.reserveSpaceSizeInBytes);
                        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
-                                       ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+                                       ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
                                        getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), 
                                        getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
                        gCtx.cudaFreeHelper(instName, cudnnDwPointer, gCtx.EAGER_CUDA_FREE);
@@ -1242,7 +1254,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
         * @return jcuda pointer
         * @throws DMLRuntimeException if error occurs while sparse to dense conversion
         */
-       public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, int numRows, int numCols) throws DMLRuntimeException {
+       public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, long numRows, long numCols) throws DMLRuntimeException {
                long numElems = image.getNumRows()*image.getNumColumns();
                if(image.getNumRows() != numRows || image.getNumColumns() != numCols) {
                        throw new DMLRuntimeException("Expected input of size:[" + numRows + ", " + numCols + "], but found [" + image.getNumRows() + ", " + image.getNumColumns() + "].");

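For scale, a worked instance of the lstmForward estimate introduced above, with invented dimensions (N=batch, T=sequence length, M=hidden size; sizeOfDataType=8 for double precision):

    public class LstmMemEstimate {
        public static void main(String[] args) {
            long N = 64, T = 32, M = 256, sizeOfDataType = 8;
            long memRequired = (N*T*M + 2*N*M + N*T*M) * sizeOfDataType; // terms as in the hunk above
            System.out.println(memRequired + " bytes"); // 8650752, before workspace/reserve space
        }
    }
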
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
index 7b2c601..a1d799d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
@@ -32,6 +32,8 @@ import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT;
 import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL;
 import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
@@ -44,6 +46,7 @@ import jcuda.jcudnn.cudnnRNNDescriptor;
 import jcuda.jcudnn.cudnnTensorDescriptor;
 
 public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
+       private static final Log LOG = LogFactory.getLog(LibMatrixCuDNNRnnAlgorithm.class.getName());
        GPUContext gCtx;
        String instName;
        cudnnDropoutDescriptor dropoutDesc;
@@ -87,8 +90,11 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
                JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), _dropOutSizeInBytes);
                dropOutSizeInBytes = _dropOutSizeInBytes[0];
                dropOutStateSpace = new Pointer();
-               if (dropOutSizeInBytes != 0)
+               if (dropOutSizeInBytes != 0) {
+                       if(LOG.isDebugEnabled()) 
+                               LOG.debug("Allocating " + dropOutSizeInBytes + " bytes for lstm dropout space.");
                        dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes);
+               }
                JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345);
                
                // Initialize RNN descriptor
@@ -109,55 +115,20 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
                // Setup workspace
                workSpace = new Pointer(); reserveSpace = new Pointer();
                sizeInBytes = getWorkspaceSize(T);
-               if(sizeInBytes != 0)
+               if(sizeInBytes != 0) {
+                       if(LOG.isDebugEnabled()) 
+                               LOG.debug("Allocating " + sizeInBytes + " bytes for lstm workspace.");
                        workSpace = gCtx.allocate(instName, sizeInBytes);
+               }
                reserveSpaceSizeInBytes = 0;
                if(isTraining) {
                        reserveSpaceSizeInBytes = getReservespaceSize(T);
                        if (reserveSpaceSizeInBytes != 0) {
+                               if(LOG.isDebugEnabled()) 
+                                       LOG.debug("Allocating " + reserveSpaceSizeInBytes + " bytes for lstm reserve space.");
                                reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes);
                        }
                }
-               /*
-               int numLinearLayers = getNumLinearLayers(rnnMode); 
-               for(int i = 0; i < numLinearLayers; i++) {
-                       cudnnFilterDescriptor linLayerMatDesc = new cudnnFilterDescriptor();
-                       cudnnCreateFilterDescriptor(linLayerMatDesc);
-                       Pointer linLayerMat = new Pointer();
-                       JCudnn.cudnnGetRNNLinLayerMatrixParams(gCtx.getCudnnHandle(), rnnDesc, 0, 
-                                       xDesc[0], wDesc, w, i, linLayerMatDesc, linLayerMat);
-                       int[] dataType = new int[] {-1};
-                       int[] format = new int[] {-1};
-                       int[] nbDims = new int[] {-1};
-                       int[] filterDimA = new int[3];
-                       JCudnn.cudnnGetFilterNdDescriptor(linLayerMatDesc, 3, dataType, format, nbDims, filterDimA);
-                       
-                       int filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
-                       double [] tmp = new double[filterDims];
-                       LibMatrixCUDA.cudaSupportFunctions.deviceToHost(gCtx, linLayerMat, tmp, instName, false);
-                       System.out.println();
-                       for(int j = 0 ; j < tmp.length; j++) {
-                               System.out.print(" " + tmp[j]);
-                       }
-                       System.out.println();
-                       LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill", 
-                                       org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims), 
-                                       linLayerMat, Math.pow(filterDims, -1), filterDims);
-                       JCudnn.cudnnDestroyFilterDescriptor(linLayerMatDesc);
-                       
-                       cudnnFilterDescriptor linLayerBiasDesc = new cudnnFilterDescriptor();
-                       cudnnCreateFilterDescriptor(linLayerBiasDesc);
-                       Pointer linLayerBias = new Pointer();
-                       JCudnn.cudnnGetRNNLinLayerBiasParams(gCtx.getCudnnHandle(), rnnDesc, 0,
-                                       xDesc[0], wDesc, w, i, linLayerBiasDesc, linLayerBias);
-                       JCudnn.cudnnGetFilterNdDescriptor(linLayerBiasDesc, 3, dataType, format, nbDims, filterDimA);
-                       filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
-                       LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill", 
-                                       org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims), 
-                                       linLayerBias, Math.pow(filterDims, -1), filterDims);
-                       JCudnn.cudnnDestroyFilterDescriptor(linLayerBiasDesc);
-               }
-               */
        }
        
        @SuppressWarnings("unused")
@@ -321,5 +292,6 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
                                throw new RuntimeException(e);
                        }
                }
+               dropOutStateSpace = null;
        }
 }

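The added dropOutStateSpace = null in close() is a free-once guard: once the pointer has been released, a repeated close() has nothing left to free. The pattern in isolation (illustrative; not the class's full close() logic):

    // Illustrative free-once pattern mirroring the change above.
    class RnnResourceSketch implements AutoCloseable {
        private Object dropOutStateSpace = new Object(); // stand-in for a jcuda Pointer

        @Override
        public void close() {
            if (dropOutStateSpace != null) {
                // release the GPU buffer here (gCtx.cudaFreeHelper(...) in the real class)
            }
            dropOutStateSpace = null; // a second close() is now harmless
        }
    }
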
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 7af164e..25423c1 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -516,7 +516,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
                //this method is used as a short-hand for all operations that
                //guaranteed only deal with dense blocks of a single block.
                if( denseBlock != null && denseBlock.numBlocks() > 1 ) {
-                       throw new RuntimeException("Large dense in-memory block (with numblocks="+denseBlock.numBlocks()+") "
+                       throw new RuntimeException("Large dense in-memory block (with numblocks="+denseBlock.numBlocks()+ ") with "
+                               + "dimensions [" + getNumRows() + ", " + getNumColumns() + "] "
                                + "allocated but operation access to first block only, which might cause incorrect results.");
                }
                return (denseBlock != null) ? denseBlock.valuesAt(0) : null;
