systemml git commit: [SYSTEMML-445] Refactoring to avoid potential memory leaks

niketanpansare Wed, 11 Oct 2017 15:53:09 -0700

Repository: systemml
Updated Branches:
  refs/heads/master 8f786aa22 -> 96ae6c7eb



[SYSTEMML-445] Refactoring to avoid potential memory leaks

- Removed tensor descriptors from GPUObject
- Created closeable LibMatrixCuDNNPoolingDescriptors class to manage the data 
structures required by maxpooling
- Enabled JCuda exceptions to catch CUDA errors eagerly
- Added debugging messages in eviction logic. The printing of these messages is 
guarded to avoid additional overhead
- Removed unused batch normalization methods from LibMatrixCuDNN

Closes #679.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/96ae6c7e
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/96ae6c7e
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/96ae6c7e

Branch: refs/heads/master
Commit: 96ae6c7eb34e792f9fe7c3a8b37c9130fb0ea7ae
Parents: 8f786aa
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Wed Oct 11 15:47:59 2017 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Wed Oct 11 15:49:14 2017 -0700

----------------------------------------------------------------------
 .../instructions/gpu/context/GPUContext.java    |  18 +-
 .../instructions/gpu/context/GPUObject.java     |  75 +---
 .../runtime/matrix/data/LibMatrixCuDNN.java     | 363 ++-----------------
 .../LibMatrixCuDNNConvolutionAlgorithm.java     |  56 ++-
 .../data/LibMatrixCuDNNPoolingDescriptors.java  | 164 +++++++++
 5 files changed, 251 insertions(+), 425 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/96ae6c7e/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 118602b..55cb95f 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -401,8 +401,12 @@ public class GPUContext {
         */
        public void cudaFreeHelper(String instructionName, final Pointer 
toFree, boolean eager) {
                Pointer dummy = new Pointer();
-               if (toFree == dummy) // trying to free a null pointer
+               if (toFree == dummy) { // trying to free a null pointer
+                       if (LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : trying to free an empty 
pointer");
+                       }
                        return;
+               }
                long t0 = 0;
                if (!cudaBlockSizeMap.containsKey(toFree))
                        throw new RuntimeException(
@@ -410,7 +414,7 @@ public class GPUContext {
                long size = cudaBlockSizeMap.get(toFree);
                if (eager) {
                        if (LOG.isTraceEnabled()) {
-                               LOG.trace("GPU : eagerly freeing cuda memory [ 
" + toFree + " ] for instruction " + instructionName
+                               LOG.trace("GPU : eagerly freeing cuda memory [ 
" + toFree + " ] of size " + size + " for instruction " + instructionName
                                                + " on " + this);
                        }
                        if (DMLScript.STATISTICS)
@@ -426,7 +430,7 @@ public class GPUContext {
                                                System.nanoTime() - t0);
                } else {
                        if (LOG.isTraceEnabled()) {
-                               LOG.trace("GPU : lazily freeing cuda memory for 
instruction " + instructionName + " on " + this);
+                               LOG.trace("GPU : lazily freeing cuda memory of 
size " + size + " for instruction " + instructionName + " on " + this);
                        }
                        Set<Pointer> freeList = freeCUDASpaceMap.get(size);
                        if (freeList == null) {
@@ -492,6 +496,10 @@ public class GPUContext {
                        LOG.trace("GPU : evict called from " + instructionName 
+ " for size " + neededSize + " on " + this);
                }
                GPUStatistics.cudaEvictionCount.add(1);
+               if (LOG.isDebugEnabled()) {
+                       printMemoryInfo("EVICTION_CUDA_FREE_SPACE");
+               }
+               
                // Release the set of free blocks maintained in a 
GPUObject.freeCUDASpaceMap
                // to free up space
                LRUCacheMap<Long, Set<Pointer>> lruCacheMap = freeCUDASpaceMap;
@@ -560,6 +568,9 @@ public class GPUContext {
                });
 
                while (neededSize > getAvailableMemory() && 
allocatedGPUObjects.size() > 0) {
+                       if (LOG.isDebugEnabled()) {
+                               printMemoryInfo("EVICTION_UNLOCKED");
+                       }
                        GPUObject toBeRemoved = 
allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
                        if (toBeRemoved.isLocked()) {
                                throw new DMLRuntimeException(
@@ -569,7 +580,6 @@ public class GPUContext {
                        if (toBeRemoved.dirty) {
                                toBeRemoved.copyFromDeviceToHost();
                        }
-
                        toBeRemoved.clearData(true);
                }
        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/96ae6c7e/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 31bf151..feb34bc 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -19,11 +19,6 @@
 package org.apache.sysml.runtime.instructions.gpu.context;
 
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
-import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
 import static jcuda.jcusparse.JCusparse.cusparseDnnz;
 import static jcuda.runtime.JCuda.cudaMemcpy;
@@ -32,7 +27,6 @@ import static 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
-import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.LongAdder;
 
@@ -55,7 +49,6 @@ import org.apache.sysml.utils.GPUStatistics;
 import jcuda.Pointer;
 import jcuda.Sizeof;
 import jcuda.jcublas.JCublas2;
-import jcuda.jcudnn.cudnnTensorDescriptor;
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseDirection;
 import jcuda.jcusparse.cusparseHandle;
@@ -84,17 +77,6 @@ public class GPUObject {
        private CSRPointer jcudaSparseMatrixPtr = null;
 
        /**
-        * An optional tensor descriptor (and shape) that can be set by a 
tensor instruction such as convolution,
-        * maxpooling and exploited by a subsequent non-tensor instruction such 
as relu
-        */
-       private cudnnTensorDescriptor tensorDescriptor = null;
-
-       /**
-        * the shape of this tensor, if in fact this is a tensor
-        */
-       private int[] tensorShape = null;
-
-       /**
         * whether the block attached to this {@link GPUContext} is dirty on 
the device and needs to be copied back to host
         */
        protected boolean dirty = false;
@@ -132,13 +114,7 @@ public class GPUObject {
        public Object clone() {
                GPUObject me = this;
                GPUObject that = new GPUObject(me.gpuContext, me.mat);
-               if (me.tensorShape != null) {
-                       that.tensorShape = new int[me.tensorShape.length];
-                       System.arraycopy(me.tensorShape, 0, that.tensorShape, 
0, me.tensorShape.length);
-                       that.allocateTensorDescriptor(me.tensorShape[0], 
me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]);
-               }
                that.dirty = me.dirty;
-               // TODO Nakul: Should the locks be cloned here ?
                // The only place clone is getting called: LibMatrixCUDA's solve
                that.readLocks.reset();
                that.writeLock = false;
@@ -498,51 +474,7 @@ public class GPUObject {
        public boolean isSparse() {
                return isSparse;
        }
-
-       /**
-        * Returns a previously allocated tensor shape or null
-        *
-        * @return int array of four elements or null
-        */
-       public int[] getTensorShape() {
-               return tensorShape;
-       }
-
-       /**
-        * Returns a previously allocated tensor descriptor or null
-        *
-        * @return cudnn tensor descriptor
-        */
-       public cudnnTensorDescriptor getTensorDescriptor() {
-               return tensorDescriptor;
-       }
-
-       /**
-        * Returns a previously allocated or allocates and returns a tensor 
descriptor
-        *
-        * @param N number of images
-        * @param C number of channels
-        * @param H height
-        * @param W width
-        * @return cudnn tensor descriptor
-        */
-       public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int 
H, int W) {
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : allocateTensorDescriptor with [N=" + N 
+ ",C=" + C + ",H=" + H + ",W=" + W + "] on " + this);
-               }
-               if (tensorDescriptor == null) {
-                       tensorDescriptor = new cudnnTensorDescriptor();
-                       cudnnCreateTensorDescriptor(tensorDescriptor);
-                       cudnnSetTensor4dDescriptor(tensorDescriptor, 
CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
-                       tensorShape = new int[4];
-                       tensorShape[0] = N;
-                       tensorShape[1] = C;
-                       tensorShape[2] = H;
-                       tensorShape[3] = W;
-               }
-               return tensorDescriptor;
-       }
-
+       
        private static long getDoubleSizeOf(long numElems) {
                return numElems * ((long) jcuda.Sizeof.DOUBLE);
        }
@@ -829,10 +761,6 @@ public class GPUObject {
                }
                jcudaDenseMatrixPtr = null;
                jcudaSparseMatrixPtr = null;
-               if (tensorDescriptor != null) {
-                       cudnnDestroyTensorDescriptor(tensorDescriptor);
-                       tensorDescriptor = null;
-               }
                resetReadWriteLock();
                getGPUContext().removeRecordedUsage(this);
        }
@@ -1094,7 +1022,6 @@ public class GPUObject {
        @Override
        public String toString() {
                final StringBuilder sb = new StringBuilder("GPUObject{");
-               sb.append(", 
tensorShape=").append(Arrays.toString(tensorShape));
                sb.append(", dirty=").append(dirty);
                sb.append(", readLocks=").append(readLocks.longValue());
                sb.append(", writeLock=").append(writeLock);

http://git-wip-us.apache.org/repos/asf/systemml/blob/96ae6c7e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 25dc604..bb74aa2 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -19,43 +19,27 @@
 package org.apache.sysml.runtime.matrix.data;
 
 import static jcuda.jcudnn.JCudnn.cudnnActivationForward;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
 import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
 import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
 import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
 import static jcuda.jcudnn.JCudnn.cudnnCreateActivationDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
 import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
 import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
 import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
-import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
 import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
-import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
-import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.JCuda.cudaMemset;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import jcuda.CudaException;
 import jcuda.Pointer;
 import jcuda.Sizeof;
 import jcuda.jcudnn.cudnnActivationDescriptor;
-import jcuda.jcudnn.cudnnBatchNormMode;
-import jcuda.jcudnn.cudnnConvolutionDescriptor;
 import jcuda.jcudnn.cudnnConvolutionFwdPreference;
-import jcuda.jcudnn.cudnnFilterDescriptor;
 import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcudnn.cudnnPoolingDescriptor;
 import jcuda.jcudnn.cudnnStatus;
 import jcuda.jcudnn.cudnnTensorDescriptor;
 
@@ -115,6 +99,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                //cudaDeviceSynchronize;
                biasAdd(gCtx, instName, output, bias, output);
        }
+       
 
        /**
         * Performs a 2D convolution
@@ -145,7 +130,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
-
+               
                if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < 
maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
                        // Filter and output are accounted as dense in the 
memory estimation for conv2d
                        double overhead = isInSparseFormat(gCtx, filter) ? 
OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
@@ -489,14 +474,12 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, 
instName);
                        if(overhead <= intermediateMemoryBudget) {
                                Pointer x = getDensePointerForCuDNN(gCtx, 
image, instName);
-                               cudnnTensorDescriptor xDesc = 
allocateTensorDescriptor(gCtx, image, N, C, H, W);
-                               cudnnMaxpooling(gCtx, instName, x, xDesc, y, N, 
C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+                               cudnnMaxpooling(gCtx, instName, x, y, N, C, H, 
W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                        }
                        else {
                                LibMatrixCuDNNInputRowFetcher imgFetcher = new 
LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
-                               cudnnTensorDescriptor xDesc = 
allocateTensorDescriptor(gCtx, image, N, C, H, W);
                                for(int n = 0; n < N; n++) {
-                                       cudnnMaxpooling(gCtx, instName, 
imgFetcher.getNthRow(n), xDesc, y.withByteOffset(n*CPQ*Sizeof.DOUBLE), 1, C, H, 
W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+                                       cudnnMaxpooling(gCtx, instName, 
imgFetcher.getNthRow(n), y.withByteOffset(n*CPQ*Sizeof.DOUBLE), 1, C, H, W, K, 
R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                                }
                                imgFetcher.close();
                        }
@@ -506,7 +489,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                }
        }
 
-       private static void cudnnMaxpooling(GPUContext gCtx, String instName, 
Pointer x, cudnnTensorDescriptor xDesc,
+       private static void cudnnMaxpooling(GPUContext gCtx, String instName, 
Pointer x,
                        Pointer y, int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
                        int Q) throws DMLRuntimeException {
@@ -514,33 +497,21 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + 
gCtx);
                }
 
-               cudnnPoolingDescriptor poolingDesc = null;
-
-               try {
+               try(LibMatrixCuDNNPoolingDescriptors desc = 
+                               
LibMatrixCuDNNPoolingDescriptors.cudnnMaxpoolingDescriptors(gCtx, instName, N, 
C, H, W, K, R, S, 
+                                               pad_h, pad_w, stride_h, 
stride_w, P, Q)) {
                        long t1=0,t2=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       // Allocate descriptors
-                       cudnnTensorDescriptor yDesc = 
allocateTensorDescriptor(N, C, P, Q);
-                       poolingDesc = allocatePoolingDescriptor(R, S, pad_h, 
pad_w, stride_h, stride_w);
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-
                        if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
-                       int status = cudnnPoolingForward(getCudnnHandle(gCtx), 
poolingDesc, one(), xDesc, x, zero(), yDesc, y);
+                       int status = cudnnPoolingForward(getCudnnHandle(gCtx), 
desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
-
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
                } catch (CudaException e) {
                        throw new DMLRuntimeException("Error in conv2d in 
GPUContext " + gCtx.toString() + " from Thread " + 
Thread.currentThread().toString(), e);
                }
-               finally {
-                       long t3=0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
-                       if(poolingDesc != null)
-                               
jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
-               }
        }
 
        /**
@@ -611,28 +582,22 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" 
+ gCtx);
                }
                Pointer y = null;
-               cudnnPoolingDescriptor poolingDesc = null;
 
-               try {
+               try(LibMatrixCuDNNPoolingDescriptors desc = 
+                               
LibMatrixCuDNNPoolingDescriptors.cudnnMaxpoolingBackwardDescriptors(gCtx, 
instName, N, C, H, W, K, R, S, 
+                                               pad_h, pad_w, stride_h, 
stride_w, P, Q)) {
                        long t1=0, t2=0, t3=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       // Allocate descriptors
-                       cudnnTensorDescriptor xDesc = 
allocateTensorDescriptor(N, C, H, W);
-                       cudnnTensorDescriptor yDesc = 
allocateTensorDescriptor(N, C, P, Q);
-                       cudnnTensorDescriptor dxDesc = 
allocateTensorDescriptor(N, C, H, W);
-                       cudnnTensorDescriptor dyDesc = 
allocateTensorDescriptor(N, C, P, Q);
-
-                       poolingDesc = allocatePoolingDescriptor(R, S, pad_h, 
pad_w, stride_h, stride_w);
-
+                       
                        // Calling PoolForward first, y is one of the inputs 
for poolBackward
                        // TODO: Remove calling poolForward after necessary 
changes at language level for poolBackward
                        long numBytes = N*C*P*Q*Sizeof.DOUBLE;
                        y = gCtx.allocate(numBytes);
                        
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-
+                       
                        if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
-                       int status = cudnnPoolingForward(getCudnnHandle(gCtx), 
poolingDesc, one(), xDesc, x, zero(), yDesc, y);
+                       int status = cudnnPoolingForward(getCudnnHandle(gCtx), 
desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
 
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
@@ -640,7 +605,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        }
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
-                       status = cudnnPoolingBackward(getCudnnHandle(gCtx), 
poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
+                       status = cudnnPoolingBackward(getCudnnHandle(gCtx), 
desc.poolingDesc, one(), desc.yDesc, y, desc.dyDesc, dy, desc.xDesc, x, zero(), 
desc.dxDesc, dx);
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
 
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
@@ -652,297 +617,12 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                finally {
                        long t4=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t4 = 
System.nanoTime();
-
                        if(y != null)
                                gCtx.cudaFreeHelper(instName, y);
-                       if(poolingDesc != null)
-                               
jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
-
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
                }
        }
 
-       static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int 
padding [], int strides []) {
-               cudnnConvolutionDescriptor convDesc = new 
cudnnConvolutionDescriptor();
-               cudnnCreateConvolutionDescriptor(convDesc);
-               cudnnSetConvolution2dDescriptor(convDesc, padding[0], 
padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION);
-               return convDesc;
-       }
-
-       protected static cudnnFilterDescriptor allocateFilterDescriptor(int K, 
int C, int R, int S) {
-               cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
-               cudnnCreateFilterDescriptor(filterDesc);
-               cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_DOUBLE, 
CUDNN_TENSOR_NCHW, K, C, R, S);
-               return filterDesc;
-       }
-
-       /**
-        * allocates pooling descriptor, used in poolingForward and 
poolingBackward
-        * @param R                     pooling window height
-        * @param S                     pooling window width
-        * @param pad_h         vertical padding
-        * @param pad_w         horizontal padding
-        * @param stride_h      pooling vertical stride
-        * @param stride_w      pooling horizontal stride
-        * @return cudnn pooling descriptor
-        */
-       private static cudnnPoolingDescriptor allocatePoolingDescriptor(int R, 
int S, int pad_h, int pad_w, int stride_h, int stride_w) {
-               cudnnPoolingDescriptor poolingDesc = new 
cudnnPoolingDescriptor();
-               cudnnCreatePoolingDescriptor(poolingDesc);
-               cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, 
CUDNN_PROPAGATE_NAN, R, S, pad_h, pad_w, stride_h, stride_w);
-               return poolingDesc;
-       }
-
-       /**
-        * Convenience method to get tensor descriptor
-        * @param N number of images
-        * @param C number of channels
-        * @param H height
-        * @param W width
-        * @return cudnn tensor descriptor
-        * @throws DMLRuntimeException if the input descriptor and matrix 
dimensions don't match
-        */
-       static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int 
H, int W) throws DMLRuntimeException {
-               cudnnTensorDescriptor tensorDescriptor = new 
cudnnTensorDescriptor();
-               cudnnCreateTensorDescriptor(tensorDescriptor);
-               cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_DOUBLE, N, C, H, W);
-               return tensorDescriptor;
-       }
-
-       /**
-        * Convenience method to get tensor descriptor from underlying GPUObject
-        * @param gCtx   a valid {@link GPUContext}
-        * @param mat matrix object
-        * @param N number of images
-        * @param C number of channels
-        * @param H height
-        * @param W width
-        * @return cudnn tensor descriptor
-        * @throws DMLRuntimeException if the input descriptor and matrix 
dimensions don't match
-        */
-       private static cudnnTensorDescriptor 
allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int 
H, int W) throws DMLRuntimeException {
-               if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) {
-                       throw new DMLRuntimeException("Mismatch 
descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N
-                                       + " || " + mat.getNumColumns() + " != " 
+ (C*H*W));
-               }
-               return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, 
W);
-       }
-
-       /**
-        * Performs the forward BatchNormalization layer computation for 
inference
-        * @param gCtx   a valid {@link GPUContext}
-        * @param instName name of the instruction
-        * @param image input image
-        * @param scale scale (as per CuDNN) and gamma as per original paper: 
shape [1, C, 1, 1]
-        * @param bias bias (as per CuDNN) and beta as per original paper: 
shape [1, C, 1, 1]
-        * @param runningMean running mean accumulated during training phase: 
shape [1, C, 1, 1]
-        * @param runningVar running variance accumulated during training 
phase: shape [1, C, 1, 1]
-        * @param ret normalized input
-        * @param epsilon epsilon value used in the batch normalization formula
-        * @throws DMLRuntimeException if error occurs
-        */
-       public static void batchNormalizationForwardInference(GPUContext gCtx, 
String instName, MatrixObject image,
-                       MatrixObject scale, MatrixObject bias, MatrixObject 
runningMean, MatrixObject runningVar,
-                       MatrixObject ret, double epsilon) throws 
DMLRuntimeException {
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : batchNormalizationForwardInference" + 
", GPUContext=" + gCtx);
-               }
-               int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-
-               int N = toInt(image.getNumRows());
-               int C = toInt(scale.getNumColumns());
-               long CHW = image.getNumColumns();
-               validateBatchNormalizationDimensions(scale, bias, runningMean, 
runningVar, C);
-
-               // Allocate descriptors
-               cudnnTensorDescriptor nCHWDescriptor = 
allocateNCHWDescriptors(gCtx, N, C, CHW,
-                               new MatrixObject[] {image},  new MatrixObject[] 
{ret});
-               cudnnTensorDescriptor scaleTensorDesc = 
allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-
-               // Get underlying dense pointer
-               Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, 
instName);
-               Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
-               Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
-               Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, 
instName);
-               Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, 
runningMean, instName);
-               Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, 
runningVar, instName);
-
-               
checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, 
one(), zero(),
-                               nCHWDescriptor, imagePtr, nCHWDescriptor, 
retPtr,
-                               scaleTensorDesc, scalePtr, biasPtr,
-                               runningMeanPtr, runningVarPtr, epsilon));
-       }
-
-       /**
-        * Performs the forward BatchNormalization layer computation for 
training
-        * @param gCtx   a valid {@link GPUContext}
-        * @param instName name of the instruction
-        * @param image input image
-        * @param scale scale (as per CuDNN) and gamma as per original paper: 
shape [1, C, 1, 1]
-        * @param bias bias (as per CuDNN) and beta as per original paper: 
shape [1, C, 1, 1]
-        * @param runningMean running mean accumulated during training phase: 
shape [1, C, 1, 1]
-        * @param runningVar running variance accumulated during training 
phase: shape [1, C, 1, 1]
-        * @param ret (output) normalized input
-        * @param retRunningMean (output) running mean accumulated during 
training phase: shape [1, C, 1, 1]
-        * @param retRunningVar (output) running variance accumulated during 
training phase: shape [1, C, 1, 1]
-        * @param epsilon epsilon value used in the batch normalization formula
-        * @param exponentialAverageFactor factor used in the moving average 
computation
-        * @throws DMLRuntimeException if error occurs
-        */
-       public static void batchNormalizationForwardTraining(GPUContext gCtx, 
String instName, MatrixObject image,
-                       MatrixObject scale,  MatrixObject bias, MatrixObject 
runningMean, MatrixObject runningVar,
-                       MatrixObject ret, MatrixObject retRunningMean, 
MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) 
throws DMLRuntimeException {
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : batchNormalizationForwardTraining" + 
", GPUContext=" + gCtx);
-               }
-               int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-
-               int N = toInt(image.getNumRows());
-               int C = toInt(scale.getNumColumns());
-               long CHW = image.getNumColumns();
-               validateBatchNormalizationDimensions(scale, bias, runningMean, 
runningVar, C);
-
-               // Allocate descriptors
-               cudnnTensorDescriptor nCHWDescriptor = 
allocateNCHWDescriptors(gCtx, N, C, CHW,
-                               new MatrixObject[] {image},  new MatrixObject[] 
{ret});
-               cudnnTensorDescriptor scaleTensorDesc = 
allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-
-               // Get underlying dense pointer
-               Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, 
instName);
-               Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
-               Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
-               Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, 
instName);
-               Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, 
runningMean, instName);
-               Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, 
runningVar, instName);
-
-               // To allow for copy-on-write
-               Pointer retRunningMeanPtr = getDensePointerForCuDNN(gCtx, 
retRunningMean, instName);
-               Pointer retRunningVarPtr = getDensePointerForCuDNN(gCtx, 
retRunningVar, instName);
-               cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * 
Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-               cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, 
cudaMemcpyDeviceToDevice);
-
-               // ignoring resultSaveMean and resultSaveVariance as it 
requires state management
-               
checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, 
one(), zero(),
-                               nCHWDescriptor, imagePtr, nCHWDescriptor, 
retPtr,
-                               scaleTensorDesc, scalePtr, biasPtr, 
exponentialAverageFactor,
-                               retRunningMeanPtr, retRunningVarPtr, epsilon, 
new Pointer(), new Pointer()));
-       }
-
-       private static void validateBatchNormalizationDimensions(MatrixObject 
scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, 
int C) throws DMLRuntimeException {
-               if(scale.getNumRows() != 1 || scale.getNumColumns() != C) {
-                       throw new DMLRuntimeException("Incorrect dimensions for 
scale");
-               }
-               if(bias.getNumRows() != 1 || bias.getNumColumns() != C) {
-                       throw new DMLRuntimeException("Incorrect dimensions for 
bias");
-               }
-               if(runningMean.getNumRows() != 1 || runningMean.getNumColumns() 
!= C) {
-                       throw new DMLRuntimeException("Incorrect dimensions for 
running mean");
-               }
-               if(runningVar.getNumRows() != 1 || runningVar.getNumColumns() 
!= C) {
-                       throw new DMLRuntimeException("Incorrect dimensions for 
running variance");
-               }
-       }
-
-       /**
-        * Convenient utility for batch normalization that returns a NCHW 
descriptor
-        * @param gCtx a valid {@link GPUContext}
-        * @param N number of images
-        * @param C number of channels
-        * @param CHW channels*height*width
-        * @param input input matrix objects
-        * @param output output matrix objects
-        * @return one of the NCHW descriptor
-        * @throws DMLRuntimeException if error occurs
-        */
-       private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext 
gCtx, int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) 
throws DMLRuntimeException {
-               cudnnTensorDescriptor ret  = null; // Return any one
-               if(CHW > ((long)Integer.MAX_VALUE)*C) {
-                       throw new DMLRuntimeException("image size 
(height*width) should be less than " + Integer.MAX_VALUE);
-               }
-               cudnnTensorDescriptor knownNCHWdescriptor = null;
-               int H = -1; int W = -1;
-               for(int i = 0; i < input.length; i++) {
-                       knownNCHWdescriptor = 
input[i].getGPUObject(gCtx).getTensorDescriptor();
-                       if(knownNCHWdescriptor != null) {
-                               int [] shape = 
input[i].getGPUObject(gCtx).getTensorShape();
-                               if(shape[0] != N || shape[1] != C) {
-                                       throw new 
DMLRuntimeException("Incorrect N and C:" + shape[0]  + " != " + N + " || " + 
shape[1]  + " != " +  C);
-                               }
-                               H = shape[2];
-                               W = shape[3];
-                               break;
-                       }
-               }
-               if(knownNCHWdescriptor != null) {
-                       // We precisely know N, C, H, W
-                       for(int i = 0; i < input.length; i++) {
-                               ret = allocateTensorDescriptor(gCtx, input[i], 
N, C, H, W);
-                       }
-                       for(int i = 0; i < output.length; i++) {
-                               ret = allocateTensorDescriptor(gCtx, output[i], 
N, C, H, W);
-                       }
-               }
-               else {
-                       int HW = (int) (CHW / C);
-                       H = HW; W = 1; // If not known
-                       double potentialH = Math.sqrt(HW);
-                       if(potentialH == ((int) potentialH)) {
-                               H = (int) potentialH;
-                               W = H;
-                       }
-                       // We are not sure about H and W, hence don't allocate 
them.
-                       ret = new cudnnTensorDescriptor();
-                       cudnnCreateTensorDescriptor(ret);
-                       cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_DOUBLE, N, C, H, W);
-               }
-               return ret;
-       }
-
-       /**
-        * This method computes the backpropagation errors for image, scale and 
bias of batch normalization layer
-        * @param gCtx   a valid {@link GPUContext}
-        * @param instName name of the instruction
-        * @param image input image
-        * @param dout input errors of shape C, H, W
-        * @param scale scale (as per CuDNN) and gamma as per original paper: 
shape [1, C, 1, 1]
-        * @param ret (output) backpropagation errors for previous layer
-        * @param retScale backpropagation error for scale
-        * @param retBias backpropagation error for bias
-        * @param epsilon epsilon value used in the batch normalization formula
-        * @throws DMLRuntimeException if error occurs
-        */
-       public static void batchNormalizationBackward(GPUContext gCtx, String 
instName, MatrixObject image, MatrixObject dout,
-                       MatrixObject scale, MatrixObject ret, MatrixObject 
retScale, MatrixObject retBias,
-                       double epsilon) throws DMLRuntimeException {
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : batchNormalizationBackward" + ", 
GPUContext=" + gCtx);
-               }
-               int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-
-               int N = toInt(image.getNumRows());
-               int C = toInt(scale.getNumColumns());
-               long CHW = image.getNumColumns();
-
-               // Allocate descriptors
-               cudnnTensorDescriptor nCHWDescriptor = 
allocateNCHWDescriptors(gCtx, N, C, CHW,
-                               new MatrixObject[] {image, dout},  new 
MatrixObject[] {ret});
-               cudnnTensorDescriptor scaleTensorDesc = 
allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-
-               // Get underlying dense pointer
-               Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, 
instName);
-               Pointer doutPtr = getDensePointerForCuDNN(gCtx, dout, instName);
-               Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, 
instName);
-               Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
-               Pointer retScalePtr = getDensePointerForCuDNN(gCtx, retScale, 
instName);
-               Pointer retBiasPtr = getDensePointerForCuDNN(gCtx, retBias, 
instName);
-
-               // ignoring resultSaveMean and resultSaveVariance as it 
requires state management
-               
checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), mode,  one(), 
zero(), one(), zero(),
-                               nCHWDescriptor,  imagePtr, nCHWDescriptor, 
doutPtr, nCHWDescriptor, retPtr,
-                               scaleTensorDesc, scalePtr, retScalePtr, 
retBiasPtr, epsilon, new Pointer(), new Pointer()));
-       }
-
-
        private static void cudnnReLU(GPUContext gCtx, String instName, 
MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws 
DMLRuntimeException {
                long t0=0;
                try {
@@ -988,8 +668,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                MatrixObject output = ec.getMatrixObject(outputName);
                getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, 
in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix
                long t0=0;
-               cudnnTensorDescriptor srcTensorDesc = 
in.getGPUObject(gCtx).getTensorDescriptor();
-               if(N*CHW >= maxNumDoublesOfCuDNNTensor ||  srcTensorDesc == 
null) {
+               if(N*CHW >= maxNumDoublesOfCuDNNTensor) {
                        if(LOG.isTraceEnabled()) {
                                LOG.trace("GPU : relu custom kernel" + ", 
GPUContext=" + gCtx);
                        }
@@ -1003,7 +682,11 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
                }
                else {
-                       cudnnReLU(gCtx, instName, in, 
getDensePointerForCuDNN(gCtx, output, instName), srcTensorDesc);
+                       cudnnTensorDescriptor tensorDescriptor = new 
cudnnTensorDescriptor();
+                       cudnnCreateTensorDescriptor(tensorDescriptor);
+                       cudnnSetTensor4dDescriptor(tensorDescriptor, 
CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, toInt(N), 1, 1, toInt(CHW));
+                       cudnnReLU(gCtx, instName, in, 
getDensePointerForCuDNN(gCtx, output, instName), tensorDescriptor);
+                       cudnnDestroyTensorDescriptor(tensorDescriptor);
                }
        }
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/96ae6c7e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index 2243b58..871194e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -31,9 +31,18 @@ import jcuda.jcudnn.cudnnConvolutionDescriptor;
 import jcuda.jcudnn.cudnnConvolutionFwdPreference;
 import jcuda.jcudnn.cudnnFilterDescriptor;
 import jcuda.jcudnn.cudnnTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 
 /**
  * This class is a wrapper that contain necessary data structures to invoke 
@@ -48,6 +57,9 @@ import static 
jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
  *  
  */
 public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseable {
+       // Limit the workspace available to cudnn convolution operation to 1 GB
+       private static long MAX_WORKSPACE_LIMIT_BYTES = (long) 1e+9;
+       
        public int algo = -1;
        public Pointer workSpace = new Pointer();
        public long sizeInBytes = 0;
@@ -61,12 +73,12 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                        int pad_h, int pad_w, int stride_h, int stride_w, int 
P, int Q) throws DMLRuntimeException {
                int padding[] = {pad_h, pad_w};
                int strides[] = {stride_h, stride_w};
-               convDesc = 
LibMatrixCuDNN.allocateConvolutionDescriptor(padding, strides);
+               convDesc = allocateConvolutionDescriptor(padding, strides);
                this.gCtx = gCtx;
                this.instName = instName;
-               nchwTensorDesc = LibMatrixCuDNN.allocateTensorDescriptor(N, C, 
H, W);
-               nkpqTensorDesc = LibMatrixCuDNN.allocateTensorDescriptor(N, K, 
P, Q);
-               filterDesc = LibMatrixCuDNN.allocateFilterDescriptor(K, C, R, 
S);
+               nchwTensorDesc = allocateTensorDescriptor(N, C, H, W);
+               nkpqTensorDesc = allocateTensorDescriptor(N, K, P, Q);
+               filterDesc = allocateFilterDescriptor(K, C, R, S);
        }
        
        /**
@@ -125,7 +137,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                }
                else {
                        int[] algos = {-1};
-                       long sizeInBytesArray[] = {workspaceLimit};
+                       long sizeInBytesArray[] = {Math.min(workspaceLimit, 
MAX_WORKSPACE_LIMIT_BYTES)};
                        
jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx),
 
                                        ret.nchwTensorDesc, ret.filterDesc, 
ret.convDesc, ret.nkpqTensorDesc,
                                        
cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 
sizeInBytesArray[0], algos);
@@ -177,7 +189,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                }
                else {
                        int[] algos = {-1};
-                       long sizeInBytesArray[] = {workspaceLimit};
+                       long sizeInBytesArray[] = {Math.min(workspaceLimit, 
MAX_WORKSPACE_LIMIT_BYTES)};
                        
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
                                        LibMatrixCuDNN.getCudnnHandle(gCtx), 
                                        ret.nchwTensorDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.filterDesc, 
@@ -230,7 +242,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                }
                else {
                        int[] algos = {-1};
-                       long sizeInBytesArray[] = {workspaceLimit};
+                       long sizeInBytesArray[] = {Math.min(workspaceLimit, 
MAX_WORKSPACE_LIMIT_BYTES)};
                        
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
                                        LibMatrixCuDNN.getCudnnHandle(gCtx), 
                                        ret.filterDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.nchwTensorDesc,
@@ -246,4 +258,34 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                        GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
                return ret;
        }
+       
+       /**
+        * Convenience method to get tensor descriptor
+        * @param N number of images
+        * @param C number of channels
+        * @param H height
+        * @param W width
+        * @return cudnn tensor descriptor
+        * @throws DMLRuntimeException if the input descriptor and matrix 
dimensions don't match
+        */
+       private static cudnnTensorDescriptor allocateTensorDescriptor(int N, 
int C, int H, int W) throws DMLRuntimeException {
+               cudnnTensorDescriptor tensorDescriptor = new 
cudnnTensorDescriptor();
+               cudnnCreateTensorDescriptor(tensorDescriptor);
+               cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_DOUBLE, N, C, H, W);
+               return tensorDescriptor;
+       }
+       
+       private static cudnnFilterDescriptor allocateFilterDescriptor(int K, 
int C, int R, int S) {
+               cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
+               cudnnCreateFilterDescriptor(filterDesc);
+               cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_DOUBLE, 
CUDNN_TENSOR_NCHW, K, C, R, S);
+               return filterDesc;
+       }
+       
+       private static cudnnConvolutionDescriptor 
allocateConvolutionDescriptor(int padding [], int strides []) {
+               cudnnConvolutionDescriptor convDesc = new 
cudnnConvolutionDescriptor();
+               cudnnCreateConvolutionDescriptor(convDesc);
+               cudnnSetConvolution2dDescriptor(convDesc, padding[0], 
padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION);
+               return convDesc;
+       }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/96ae6c7e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
new file mode 100644
index 0000000..f817bd5
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
+import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+
+import jcuda.jcudnn.cudnnPoolingDescriptor;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+
+/**
+ * This class is a wrapper that contain necessary data structures to invoke 
+ * a cudnn convolution* functions (such as cudnnConvolutionForward, etc)
+ * 
+ * It implements autocloseable to simplify the LibMatrixCuDNN code and also 
avoids potential memory leaks.
+ */
+public class LibMatrixCuDNNPoolingDescriptors implements 
java.lang.AutoCloseable {
+
+       public cudnnTensorDescriptor xDesc; 
+       public cudnnTensorDescriptor yDesc; 
+       public cudnnTensorDescriptor dxDesc; 
+       public cudnnTensorDescriptor dyDesc; 
+       public cudnnPoolingDescriptor poolingDesc;
+       
+       @Override
+       public void close() {
+               if(xDesc != null) 
+                       cudnnDestroyTensorDescriptor(xDesc);
+               if(yDesc != null) 
+                       cudnnDestroyTensorDescriptor(yDesc);
+               if(dxDesc != null) 
+                       cudnnDestroyTensorDescriptor(dxDesc);
+               if(dyDesc != null) 
+                       cudnnDestroyTensorDescriptor(dyDesc);
+               if(poolingDesc != null)
+                       
jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
+       }
+       
+       /**
+        * Get descriptors for maxpooling backward operation
+        * 
+        * @param gCtx gpu context
+        * @param instName instruction name
+        * @param N                             batch size
+        * @param C                             number of channels
+        * @param H                             height of image
+        * @param W                             width of image
+        * @param K                             number of filters
+        * @param R                             height of filter
+        * @param S                             width of filter
+        * @param pad_h                 vertical padding
+        * @param pad_w                 horizontal padding
+        * @param stride_h              horizontal stride
+        * @param stride_w              vertical stride
+        * @param P                             (H - R + 1 + 2*pad_h)/stride_h
+        * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
+        * @return decriptor wrapper
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static LibMatrixCuDNNPoolingDescriptors 
cudnnMaxpoolingBackwardDescriptors(GPUContext gCtx, 
+                       String instName, int N, int C, int H, int W, int K, int 
R,
+                       int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
+                       int Q) throws DMLRuntimeException {
+               LibMatrixCuDNNPoolingDescriptors ret = new 
LibMatrixCuDNNPoolingDescriptors();
+               ret.xDesc = allocateTensorDescriptor(N, C, H, W);
+               ret.yDesc = allocateTensorDescriptor(N, C, P, Q);
+               ret.dxDesc = allocateTensorDescriptor(N, C, H, W);
+               ret.dyDesc = allocateTensorDescriptor(N, C, P, Q);
+               ret.poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, 
stride_h, stride_w);
+               return ret;
+       }
+       
+       /**
+        * Get descriptors for maxpooling operation
+        * 
+        * @param gCtx gpu context
+        * @param instName instruction name
+        * @param N                             batch size
+        * @param C                             number of channels
+        * @param H                             height of image
+        * @param W                             width of image
+        * @param K                             number of filters
+        * @param R                             height of filter
+        * @param S                             width of filter
+        * @param pad_h                 vertical padding
+        * @param pad_w                 horizontal padding
+        * @param stride_h              horizontal stride
+        * @param stride_w              vertical stride
+        * @param P                             (H - R + 1 + 2*pad_h)/stride_h
+        * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
+        * @return decriptor wrapper
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static LibMatrixCuDNNPoolingDescriptors 
cudnnMaxpoolingDescriptors(GPUContext gCtx, 
+                       String instName, int N, int C, int H, int W, int K, int 
R,
+                       int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
+                       int Q) throws DMLRuntimeException {
+               LibMatrixCuDNNPoolingDescriptors ret = new 
LibMatrixCuDNNPoolingDescriptors();
+               ret.xDesc = allocateTensorDescriptor(N, C, H, W);
+               ret.yDesc = allocateTensorDescriptor(N, C, P, Q);
+               ret.poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, 
stride_h, stride_w);
+               return ret;
+       }
+
+       /**
+        * Convenience method to get tensor descriptor
+        * @param N number of images
+        * @param C number of channels
+        * @param H height
+        * @param W width
+        * @return cudnn tensor descriptor
+        * @throws DMLRuntimeException if the input descriptor and matrix 
dimensions don't match
+        */
+       private static cudnnTensorDescriptor allocateTensorDescriptor(int N, 
int C, int H, int W) throws DMLRuntimeException {
+               cudnnTensorDescriptor tensorDescriptor = new 
cudnnTensorDescriptor();
+               cudnnCreateTensorDescriptor(tensorDescriptor);
+               cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_DOUBLE, N, C, H, W);
+               return tensorDescriptor;
+       }
+       
+       /**
+        * allocates pooling descriptor, used in poolingForward and 
poolingBackward
+        * @param R                     pooling window height
+        * @param S                     pooling window width
+        * @param pad_h         vertical padding
+        * @param pad_w         horizontal padding
+        * @param stride_h      pooling vertical stride
+        * @param stride_w      pooling horizontal stride
+        * @return cudnn pooling descriptor
+        */
+       private static cudnnPoolingDescriptor allocatePoolingDescriptor(int R, 
int S, int pad_h, int pad_w, int stride_h, int stride_w) {
+               cudnnPoolingDescriptor poolingDesc = new 
cudnnPoolingDescriptor();
+               cudnnCreatePoolingDescriptor(poolingDesc);
+               cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, 
CUDNN_PROPAGATE_NAN, R, S, pad_h, pad_w, stride_h, stride_w);
+               return poolingDesc;
+       }
+}

systemml git commit: [SYSTEMML-445] Refactoring to avoid potential memory leaks

Reply via email to