Repository: systemml Updated Branches: refs/heads/master 9970fd814 -> f69047ea4
[SYSTEMML-445] Update algorithm selection logic on GPU for conv2d_backward_data Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f69047ea Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f69047ea Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f69047ea Branch: refs/heads/master Commit: f69047ea48fe5eb4c7269b49c3e6d1aef7f6967d Parents: 9970fd8 Author: Niketan Pansare <[email protected]> Authored: Wed Jan 31 16:49:41 2018 -0800 Committer: Niketan Pansare <[email protected]> Committed: Wed Jan 31 16:50:22 2018 -0800 ---------------------------------------------------------------------- .../LibMatrixCuDNNConvolutionAlgorithm.java | 45 +++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/f69047ea/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java index 6673e60..8050d1e 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java @@ -217,29 +217,32 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardDataAlgorithm( GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) throws DMLRuntimeException { - //long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); - - // CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for atleast one scenario - // for sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1). - // This causes more than 100x slowdown when compared with CUDNN_CONVOLUTION_BWD_DATA_ALGO_0. - // To keep things simple for now, we will always prefer to use memory-less operator: CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 - ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; -// int[] algos = {-1}; -// long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; -// jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm( -// LibMatrixCuDNN.getCudnnHandle(gCtx), -// ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, -// cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos); -// jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), -// ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray); -// if (sizeInBytesArray[0] != 0) -// ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); -// ret.sizeInBytes = sizeInBytesArray[0]; -// ret.algo = algos[0]; -// if (DMLScript.FINEGRAINED_STATISTICS) -// GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + if(H == R || W == S) { + // CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for atleast one scenario + // for sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1). + // This causes more than 100x slowdown when compared with CUDNN_CONVOLUTION_BWD_DATA_ALGO_0. + // To keep things simple for now, we will always prefer to use memory-less operator for conv1d: CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 + ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; + } + else { + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + int[] algos = {-1}; + long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; + jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm( + LibMatrixCuDNN.getCudnnHandle(gCtx), + ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, + jcuda.jcudnn.cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos); + jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), + ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray); + if (sizeInBytesArray[0] != 0) + ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); + ret.sizeInBytes = sizeInBytesArray[0]; + ret.algo = algos[0]; + if (DMLScript.FINEGRAINED_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + } return ret; }
