systemml git commit: [SYSTEMML-445] Update algorithm selection logic on GPU for conv2d_backward_data

niketanpansare Wed, 31 Jan 2018 16:52:04 -0800

Repository: systemml
Updated Branches:
  refs/heads/master 9970fd814 -> f69047ea4



[SYSTEMML-445] Update algorithm selection logic on GPU for
conv2d_backward_data

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f69047ea
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f69047ea
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f69047ea

Branch: refs/heads/master
Commit: f69047ea48fe5eb4c7269b49c3e6d1aef7f6967d
Parents: 9970fd8
Author: Niketan Pansare <[email protected]>
Authored: Wed Jan 31 16:49:41 2018 -0800
Committer: Niketan Pansare <[email protected]>
Committed: Wed Jan 31 16:50:22 2018 -0800

----------------------------------------------------------------------
 .../LibMatrixCuDNNConvolutionAlgorithm.java     | 45 +++++++++++---------
 1 file changed, 24 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/f69047ea/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index 6673e60..8050d1e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -217,29 +217,32 @@ public class LibMatrixCuDNNConvolutionAlgorithm 
implements java.lang.AutoCloseab
        public static LibMatrixCuDNNConvolutionAlgorithm 
cudnnGetConvolutionBackwardDataAlgorithm(
                        GPUContext gCtx, String instName, int N, int C, int H, 
int W, int K, int R, int S, 
                        int pad_h, int pad_w, int stride_h, int stride_w, int 
P, int Q, long workspaceLimit) throws DMLRuntimeException {
-               //long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                LibMatrixCuDNNConvolutionAlgorithm ret = new 
LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
                                pad_h, pad_w, stride_h, stride_w, P, Q);
-               
-               // CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns 
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for atleast one scenario 
-               // for sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, 
Wf=300, sparsity=0.1).
-               // This causes more than 100x slowdown when compared with 
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0.
-               // To keep things simple for now, we will always prefer to use 
memory-less operator: CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
-               ret.algo = 
jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-//             int[] algos = {-1};
-//             long sizeInBytesArray[] = {Math.min(workspaceLimit, 
MAX_WORKSPACE_LIMIT_BYTES)};
-//             jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
-//                             LibMatrixCuDNN.getCudnnHandle(gCtx), 
-//                             ret.filterDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.nchwTensorDesc,
-//                             
cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
 sizeInBytesArray[0], algos);
-//             
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 
-//                             ret.filterDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
-//             if (sizeInBytesArray[0] != 0)
-//                     ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
-//             ret.sizeInBytes = sizeInBytesArray[0];
-//             ret.algo = algos[0];
-//             if (DMLScript.FINEGRAINED_STATISTICS)
-//                     GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+               if(H == R || W == S) {
+                       // CuDNN's cudnnGetConvolutionBackwardDataAlgorithm 
returns CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for at least one scenario 
+                       // for sentence CNN (N=1, C=1, H=2060, W=300, F=500, 
Hf=5, Wf=300, sparsity=0.1).
+                       // This causes more than 100x slowdown when compared 
with CUDNN_CONVOLUTION_BWD_DATA_ALGO_0.
+                       // To keep things simple for now, we will always prefer 
to use memory-less operator for conv1d: CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
+                       ret.algo = 
jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+               }
+               else {
+                       long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
+                       int[] algos = {-1};
+                       long sizeInBytesArray[] = {Math.min(workspaceLimit, 
MAX_WORKSPACE_LIMIT_BYTES)};
+                       
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
+                                       LibMatrixCuDNN.getCudnnHandle(gCtx), 
+                                       ret.filterDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.nchwTensorDesc,
+                                       
jcuda.jcudnn.cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
 sizeInBytesArray[0], algos);
+                       
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 
+                                       ret.filterDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
+                       if (sizeInBytesArray[0] != 0)
+                               ret.workSpace = 
gCtx.allocate(sizeInBytesArray[0]);
+                       ret.sizeInBytes = sizeInBytesArray[0];
+                       ret.algo = algos[0];
+                       if (DMLScript.FINEGRAINED_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+               }
                return ret;
        }

systemml git commit: [SYSTEMML-445] Update algorithm selection logic on GPU for conv2d_backward_data

Reply via email to