Repository: systemml
Updated Branches:
  refs/heads/master 61dcc85e4 -> 0cb2f7f68
[MINOR] [SYSTEMML-446] Added time spent in jcuda sync to fine-grained statistics

- Also added a force accelerator flag to LibMatrixCuDNN to skip the worst-case memory budget restriction.

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/0cb2f7f6
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/0cb2f7f6
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/0cb2f7f6

Branch: refs/heads/master
Commit: 0cb2f7f68cb644c7fda6666bc84782e82069fb34
Parents: 61dcc85
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Thu Sep 28 12:14:28 2017 -0800
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Thu Sep 28 13:14:28 2017 -0700

----------------------------------------------------------------------
 .../instructions/gpu/GPUInstruction.java    |  7 +++++-
 .../runtime/matrix/data/LibMatrixCuDNN.java | 26 ++++++++++++--------
 2 files changed, 22 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/0cb2f7f6/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index bc3ba9b..108a622 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -61,7 +61,8 @@ public abstract class GPUInstruction extends Instruction {
 	public final static String MISC_TIMER_ROW_TO_COLUMN_MAJOR = "r2c";   // time spent in converting data from row major to column major
 	public final static String MISC_TIMER_COLUMN_TO_ROW_MAJOR = "c2r";   // time spent in converting data from column major to row major
 	public final static String MISC_TIMER_OBJECT_CLONE = "clone";        // time spent in cloning (deep copying) a GPUObject instance
-
+	public final static String MISC_TIMER_CUDA_SYNC = "sync";            // time spent in device sync
+
 	public final static String MISC_TIMER_CUDA_FREE = "f";               // time spent in calling cudaFree
 	public final static String MISC_TIMER_ALLOCATE = "a";                // time spent to allocate memory on gpu
 	public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ad";  // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE)
@@ -198,7 +199,11 @@ public abstract class GPUInstruction extends Instruction {
 			throws DMLRuntimeException
 	{
 		if(DMLScript.SYNCHRONIZE_GPU) {
+			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 			jcuda.runtime.JCuda.cudaDeviceSynchronize();
+			if(GPUStatistics.DISPLAY_STATISTICS) {
+				GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_CUDA_SYNC, System.nanoTime() - t0);
+			}
 		}
 		if(LOG.isDebugEnabled()) {
 			for(GPUContext gpuCtx : ec.getGPUContexts()) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/0cb2f7f6/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 602edce..654bd9d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -67,6 +67,7 @@ import jcuda.jcudnn.cudnnTensorDescriptor;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
@@ -153,7 +154,8 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
 		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+		if(DMLScript.FORCE_ACCELERATOR ||
+			(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
 			// Filter and output are accounted as dense in the memory estimation for conv2d
 			double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
@@ -161,7 +163,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 
 			Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
 			Pointer dstPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
-			if(overhead <= intermediateMemoryBudget) {
+			if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
 				// Perform all-input all-channel conv2d
 				Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
 				cudnnConv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
@@ -346,11 +348,12 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
 		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+		if(DMLScript.FORCE_ACCELERATOR ||
+			(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
 			Pointer dwPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
 			double overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
-			if(overhead <= intermediateMemoryBudget) {
+			if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
 				// Perform all-input all-channel conv2dBackwardFilter
 				Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
 				Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
@@ -502,13 +505,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
 		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+		if(DMLScript.FORCE_ACCELERATOR ||
+			(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
 			Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
 			Pointer dstPointer = getDensePointerForCuDNN(gCtx, output, instName);
-			if(overhead <= intermediateMemoryBudget) {
+			if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
 				// Perform all-input all-channel conv2dBackwardData
 				Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
 				cudnnConv2dBackwardData(gCtx, instName, filterPointer, doutPointer, dstPointer,
@@ -638,11 +642,12 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long CPQ = C*P*Q;
 		long NCHW = N*CHW; long NCPQ = N*CPQ;
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+		if(DMLScript.FORCE_ACCELERATOR ||
+			(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor)) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 			Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, instName);
-			if(overhead <= intermediateMemoryBudget) {
+			if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
 				Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
 				cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
 				cudnnMaxpooling(gCtx, instName, x, xDesc, y, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
@@ -780,12 +785,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long CPQ = C*P*Q;
 		long NCHW = N*CHW; long NCPQ = N*CPQ;
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+		if(DMLScript.FORCE_ACCELERATOR ||
+			(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor)) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, CPQ, 1.0) : 0;
 			Pointer dx = getDensePointerForCuDNN(gCtx, outputBlock, instName);
-			if(overhead <= intermediateMemoryBudget) {
+			if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
 				Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
 				Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
 				cudnnMaxpoolingBackward(gCtx, instName, x, dy, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
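
----------------------------------------------------------------------
The GPUInstruction change above wraps the device synchronization in a pair of
System.nanoTime() readings and records the elapsed time under the new "sync"
misc timer only when fine-grained statistics are enabled. Below is a minimal,
self-contained sketch of that pattern; the two flags, recordMiscTime(), and
deviceSynchronize() are hypothetical stand-ins for DMLScript.SYNCHRONIZE_GPU,
GPUStatistics.DISPLAY_STATISTICS, GPUStatistics.maintainCPMiscTimes(), and
jcuda.runtime.JCuda.cudaDeviceSynchronize(), not the SystemML classes themselves.

import java.util.concurrent.TimeUnit;

public class SyncTimerSketch {
	static final boolean SYNCHRONIZE_GPU = true;        // stand-in for DMLScript.SYNCHRONIZE_GPU
	static final boolean DISPLAY_STATISTICS = true;     // stand-in for GPUStatistics.DISPLAY_STATISTICS
	static final String MISC_TIMER_CUDA_SYNC = "sync";  // same short code the commit introduces

	// Stand-in for GPUStatistics.maintainCPMiscTimes(opcode, timerCode, elapsedNanos):
	// here it just prints the measurement instead of aggregating it.
	static void recordMiscTime(String opcode, String timerCode, long elapsedNanos) {
		System.out.printf("%s / %s: %d ms%n", opcode, timerCode,
			TimeUnit.NANOSECONDS.toMillis(elapsedNanos));
	}

	// Stand-in for jcuda.runtime.JCuda.cudaDeviceSynchronize(); sleeps to simulate a wait.
	static void deviceSynchronize() throws InterruptedException {
		Thread.sleep(5);
	}

	public static void main(String[] args) throws InterruptedException {
		if (SYNCHRONIZE_GPU) {
			// Only pay the nanoTime() cost when fine-grained statistics are displayed.
			long t0 = DISPLAY_STATISTICS ? System.nanoTime() : 0;
			deviceSynchronize();
			if (DISPLAY_STATISTICS)
				recordMiscTime("gpu_conv2d", MISC_TIMER_CUDA_SYNC, System.nanoTime() - t0);
		}
	}
}

The conditional t0 assignment mirrors the committed code: when statistics are
off, the sync path adds no timing overhead at all.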
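----------------------------------------------------------------------
The LibMatrixCuDNN changes OR DMLScript.FORCE_ACCELERATOR in front of both the
cuDNN tensor-size check and the intermediate memory-budget check, so Java's
short-circuit evaluation skips those worst-case restrictions entirely when the
accelerator is forced. A minimal sketch of that gating, assuming made-up limit
values and a hypothetical useCuDNNPath() helper (not a SystemML method):

public class ForceAcceleratorSketch {
	static final long MAX_CUDNN_TENSOR_DOUBLES = 2_000_000_000L;  // illustrative cap, not SystemML's
	static final double INTERMEDIATE_MEMORY_BUDGET = 1.0e9;       // illustrative budget in bytes

	static boolean useCuDNNPath(boolean forceAccelerator, long tensorDoubles, double overhead) {
		// With forceAccelerator == true, || short-circuits and neither limit is consulted.
		return (forceAccelerator || tensorDoubles < MAX_CUDNN_TENSOR_DOUBLES)
			&& (forceAccelerator || overhead <= INTERMEDIATE_MEMORY_BUDGET);
	}

	public static void main(String[] args) {
		long bigTensor = 3_000_000_000L;  // exceeds the illustrative cap
		double bigOverhead = 2.5e9;       // exceeds the illustrative budget
		System.out.println(useCuDNNPath(false, bigTensor, bigOverhead));  // false: falls back
		System.out.println(useCuDNNPath(true, bigTensor, bigOverhead));   // true: checks skipped
	}
}
----------------------------------------------------------------------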