Repository: systemml Updated Branches: refs/heads/master a97bc53f7 -> ea2a6e491
[SYSTEMML-445] Change the default CuDNN algorithm selector for conv2d_backward_data CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for at least one scenario: - sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1) which is 200x slower than CUDNN_CONVOLUTION_BWD_DATA_ALGO_0. Since it is difficult to debug a closed-source method cudnnGetConvolutionBackwardDataAlgorithm, we will always prefer to use the memory-less operator: CUDNN_CONVOLUTION_BWD_DATA_ALGO_0. We can revisit this for the next CuDNN version. This is not an ideal solution, but the simplest one I could think of. I welcome discussion on any alternative solutions you may have. And this goes without saying, we can revisit this for the next CuDNN version. For more details, please see https://github.com/apache/systemml/pull/682 Closes #682. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ea2a6e49 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ea2a6e49 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ea2a6e49 Branch: refs/heads/master Commit: ea2a6e4917e85c12784e3794fb5b5da214da8103 Parents: a97bc53 Author: Niketan Pansare <[email protected]> Authored: Thu Oct 12 10:35:27 2017 -0700 Committer: Niketan Pansare <[email protected]> Committed: Thu Oct 12 10:37:14 2017 -0700 ---------------------------------------------------------------------- .../LibMatrixCuDNNConvolutionAlgorithm.java | 101 ++++++++----------- 1 file changed, 43 insertions(+), 58 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/ea2a6e49/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java index 871194e..0378c7a 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java @@ -130,24 +130,17 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); - if(workspaceLimit <= 0) { - // If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement, - // i.e. CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM - ret.algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - } - else { - int[] algos = {-1}; - long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; - jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx), - ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, - cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos); - jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), - ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray); - if (sizeInBytesArray[0] != 0) - ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); - ret.sizeInBytes = sizeInBytesArray[0]; - ret.algo = algos[0]; - } + int[] algos = {-1}; + long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; + jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx), + ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, + 
cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos); + jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), + ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray); + if (sizeInBytesArray[0] != 0) + ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); + ret.sizeInBytes = sizeInBytesArray[0]; + ret.algo = algos[0]; if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); return ret; @@ -182,25 +175,19 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); - if(workspaceLimit <= 0) { - // If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement - // i.e. 
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 - ret.algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; - } - else { - int[] algos = {-1}; - long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; - jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm( - LibMatrixCuDNN.getCudnnHandle(gCtx), - ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, - cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos); - jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), - ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray); - if (sizeInBytesArray[0] != 0) - ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); - ret.sizeInBytes = sizeInBytesArray[0]; - ret.algo = algos[0]; - } + int[] algos = {-1}; + long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; + jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm( + LibMatrixCuDNN.getCudnnHandle(gCtx), + ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, + cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos); + jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), + ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray); + if (sizeInBytesArray[0] != 0) + ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); + ret.sizeInBytes = sizeInBytesArray[0]; + ret.algo = algos[0]; + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); return ret; @@ -235,27 +222,25 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab LibMatrixCuDNNConvolutionAlgorithm ret = new 
LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); - if(workspaceLimit <= 0) { - // If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement - // i.e. CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 - ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; - } - else { - int[] algos = {-1}; - long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; - jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm( - LibMatrixCuDNN.getCudnnHandle(gCtx), - ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, - cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos); - jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), - ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray); - if (sizeInBytesArray[0] != 0) - ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); - ret.sizeInBytes = sizeInBytesArray[0]; - ret.algo = algos[0]; - } - if (GPUStatistics.DISPLAY_STATISTICS) - GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + // CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for atleast one scenario + // for sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1). + // This causes more than 100x slowdown when compared with CUDNN_CONVOLUTION_BWD_DATA_ALGO_0. 
+ // To keep things simple for now, we will always prefer to use memory-less operator: CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 + ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; +// int[] algos = {-1}; +// long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; +// jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm( +// LibMatrixCuDNN.getCudnnHandle(gCtx), +// ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, +// cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos); +// jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), +// ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray); +// if (sizeInBytesArray[0] != 0) +// ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); +// ret.sizeInBytes = sizeInBytesArray[0]; +// ret.algo = algos[0]; +// if (GPUStatistics.DISPLAY_STATISTICS) +// GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); return ret; }
