Repository: systemml
Updated Branches:
  refs/heads/master 61dcc85e4 -> 0cb2f7f68


[MINOR] [SYSTEMML-446] Added time spent in jcuda sync to fine-grained statistics

- Also added a check of the force accelerator flag (DMLScript.FORCE_ACCELERATOR)
  in LibMatrixCuDNN to skip the worst-case memory budget restriction.
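
For readers skimming the diff below: the GPUInstruction change brackets
cudaDeviceSynchronize() with System.nanoTime() and records the elapsed time under
the new "sync" misc timer, but only when fine-grained statistics are enabled, so
there is no timing overhead otherwise. The following is a minimal, self-contained
Java sketch of that pattern; the class and helper names (SyncTimingSketch,
deviceSynchronize, recordMiscTime), the local flags, and the "gpu_conv2d" opcode
string are hypothetical stand-ins for the DMLScript, GPUStatistics and JCuda
members shown in the actual diff further down.

// Minimal sketch of the sync-timing pattern (hypothetical names; the real code
// uses DMLScript.SYNCHRONIZE_GPU, GPUStatistics.DISPLAY_STATISTICS,
// jcuda.runtime.JCuda.cudaDeviceSynchronize() and GPUStatistics.maintainCPMiscTimes()).
public class SyncTimingSketch {
	static final boolean SYNCHRONIZE_GPU = true;     // stand-in for DMLScript.SYNCHRONIZE_GPU
	static final boolean DISPLAY_STATISTICS = true;  // stand-in for GPUStatistics.DISPLAY_STATISTICS

	// Stand-in for jcuda.runtime.JCuda.cudaDeviceSynchronize(); sleeps to simulate a blocking sync.
	static void deviceSynchronize() {
		try { Thread.sleep(1); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
	}

	// Stand-in for GPUStatistics.maintainCPMiscTimes(opcode, timerName, durationNanos).
	static void recordMiscTime(String opcode, String timer, long nanos) {
		System.out.println(opcode + " " + timer + " " + nanos + " ns");
	}

	// Mirrors the guarded timing added to GPUInstruction: the nanoTime() calls
	// are only made when fine-grained statistics are enabled.
	static void postprocessInstruction(String opcode) {
		if (SYNCHRONIZE_GPU) {
			long t0 = DISPLAY_STATISTICS ? System.nanoTime() : 0;
			deviceSynchronize();
			if (DISPLAY_STATISTICS) {
				recordMiscTime(opcode, "sync", System.nanoTime() - t0);
			}
		}
	}

	public static void main(String[] args) {
		postprocessInstruction("gpu_conv2d");
	}
}

The LibMatrixCuDNN change is mechanical by comparison: each size and memory check
becomes "DMLScript.FORCE_ACCELERATOR || <original condition>", so forcing the
accelerator bypasses both the cuDNN tensor-size check and the intermediate memory
budget check.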


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/0cb2f7f6
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/0cb2f7f6
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/0cb2f7f6

Branch: refs/heads/master
Commit: 0cb2f7f68cb644c7fda6666bc84782e82069fb34
Parents: 61dcc85
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Thu Sep 28 12:14:28 2017 -0800
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Thu Sep 28 13:14:28 2017 -0700

----------------------------------------------------------------------
 .../instructions/gpu/GPUInstruction.java        |  7 +++++-
 .../runtime/matrix/data/LibMatrixCuDNN.java     | 26 ++++++++++++--------
 2 files changed, 22 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/0cb2f7f6/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index bc3ba9b..108a622 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -61,7 +61,8 @@ public abstract class GPUInstruction extends Instruction {
        public final static String MISC_TIMER_ROW_TO_COLUMN_MAJOR =     "r2c";  // time spent in converting data from row major to column major
        public final static String MISC_TIMER_COLUMN_TO_ROW_MAJOR =     "c2r";  // time spent in converting data from column major to row major
        public final static String MISC_TIMER_OBJECT_CLONE =            "clone";// time spent in cloning (deep copying) a GPUObject instance
-
+       public final static String MISC_TIMER_CUDA_SYNC =               "sync"; // time spent in device sync
+
        public final static String MISC_TIMER_CUDA_FREE =               "f";    // time spent in calling cudaFree
        public final static String MISC_TIMER_ALLOCATE =                "a";    // time spent to allocate memory on gpu
        public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT =   "ad";   // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE)
@@ -198,7 +199,11 @@ public abstract class GPUInstruction extends Instruction {
                                        throws DMLRuntimeException
        {
                if(DMLScript.SYNCHRONIZE_GPU) {
+                       long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
                        jcuda.runtime.JCuda.cudaDeviceSynchronize();
+                       if(GPUStatistics.DISPLAY_STATISTICS) {
+                               GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_CUDA_SYNC, System.nanoTime() - t0);
+                       }
                }
                if(LOG.isDebugEnabled()) {
                        for(GPUContext gpuCtx : ec.getGPUContexts()) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/0cb2f7f6/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 602edce..654bd9d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -67,6 +67,7 @@ import jcuda.jcudnn.cudnnTensorDescriptor;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
@@ -153,7 +154,8 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-               if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+               if(DMLScript.FORCE_ACCELERATOR ||
+                               (NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
                        // Filter and output are accounted as dense in the memory estimation for conv2d
                        double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
                        overhead += isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
@@ -161,7 +163,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
                        Pointer dstPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
 
-                       if(overhead <= intermediateMemoryBudget) {
+                       if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
                                // Perform all-input all-channel conv2d
                                Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
                                cudnnConv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
@@ -346,11 +348,12 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-               if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+               if(DMLScript.FORCE_ACCELERATOR ||
+                               (NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
                        Pointer dwPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
                        double overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
                        overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
-                       if(overhead <= intermediateMemoryBudget) {
+                       if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
                                // Perform all-input all-channel conv2dBackwardFilter
                                Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
                                Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
@@ -502,13 +505,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-               if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+               if(DMLScript.FORCE_ACCELERATOR ||
+                               (NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
                        // Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
                        double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
                        overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
                        Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
                        Pointer dstPointer = getDensePointerForCuDNN(gCtx, output, instName);
-                       if(overhead <= intermediateMemoryBudget) {
+                       if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
                                // Perform all-input all-channel conv2dBackwardData
                                Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
                                cudnnConv2dBackwardData(gCtx, instName, filterPointer, doutPointer, dstPointer, 
@@ -638,11 +642,12 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long CPQ = C*P*Q;  
                long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
-               if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+               if(DMLScript.FORCE_ACCELERATOR ||
+                               (NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor)) {
                        // Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
                        long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
                        Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, instName);
-                       if(overhead <= intermediateMemoryBudget) {
+                       if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
                                Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
                                cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
                                cudnnMaxpooling(gCtx, instName, x, xDesc, y, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
@@ -780,12 +785,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long CPQ = C*P*Q;  
                long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
-               if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+               if(DMLScript.FORCE_ACCELERATOR ||
+                               (NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor)) {
                        // Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
                        long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
                        overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, CPQ, 1.0) : 0;
                        Pointer dx = getDensePointerForCuDNN(gCtx, outputBlock, instName);
-                       if(overhead <= intermediateMemoryBudget) {
+                       if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
                                Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
                                Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
                                cudnnMaxpoolingBackward(gCtx, instName, x, dy, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
