Repository: systemml Updated Branches: refs/heads/master 3da574684 -> de69afdc8
[MINOR] Merge extra GPU and extra DNN flags into single "sysml.stats.finegrained" flag to simplify usage Closes #701. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/de69afdc Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/de69afdc Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/de69afdc Branch: refs/heads/master Commit: de69afdc84ad99e2f2a41ac03f69f2f1e64e921f Parents: 3da5746 Author: Niketan Pansare <[email protected]> Authored: Tue Nov 14 11:06:18 2017 -0800 Committer: Niketan Pansare <[email protected]> Committed: Tue Nov 14 11:06:18 2017 -0800 ---------------------------------------------------------------------- conf/SystemML-config.xml.template | 8 +- .../apache/sysml/api/ScriptExecutorUtils.java | 6 +- .../java/org/apache/sysml/conf/DMLConfig.java | 6 +- .../instructions/gpu/GPUInstruction.java | 8 +- .../instructions/gpu/context/CSRPointer.java | 4 +- .../instructions/gpu/context/GPUContext.java | 10 +- .../instructions/gpu/context/GPUObject.java | 14 +- .../DoublePrecisionCudaSupportFunctions.java | 9 +- .../runtime/matrix/data/LibMatrixCUDA.java | 148 +++++++++---------- .../runtime/matrix/data/LibMatrixCuDNN.java | 53 +++---- .../LibMatrixCuDNNConvolutionAlgorithm.java | 17 ++- .../data/LibMatrixCuDNNInputRowFetcher.java | 6 +- .../runtime/matrix/data/LibMatrixCuMatMult.java | 20 +-- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 12 +- .../LibMatrixDNNConv2dBackwardDataHelper.java | 10 +- .../LibMatrixDNNConv2dBackwardFilterHelper.java | 20 +-- .../matrix/data/LibMatrixDNNConv2dHelper.java | 20 +-- .../SinglePrecisionCudaSupportFunctions.java | 9 +- .../org/apache/sysml/utils/GPUStatistics.java | 5 +- .../java/org/apache/sysml/utils/Statistics.java | 8 +- 20 files changed, 190 insertions(+), 203 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/conf/SystemML-config.xml.template ---------------------------------------------------------------------- diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template index 8452e75..8a4a5d6 100644 --- a/conf/SystemML-config.xml.template +++ b/conf/SystemML-config.xml.template @@ -75,15 +75,9 @@ <!-- enables native blas for matrix multiplication and convolution, experimental feature (options: auto, mkl, openblas, none) --> <sysml.native.blas>none</sysml.native.blas> - <!-- prints finegrained statistics information --> + <!-- prints finegrained statistics information (includes extra GPU information and extra statistics information for Deep Neural Networks done in CP mode) --> <sysml.stats.finegrained>false</sysml.stats.finegrained> - <!-- prints extra statistics information for GPU --> - <sysml.stats.extraGPU>false</sysml.stats.extraGPU> - - <!-- prints extra statistics information for Deep Neural Networks done in CP mode --> - <sysml.stats.extraDNN>false</sysml.stats.extraDNN> - <!-- sets the GPUs to use per process, -1 for all GPUs, a specific GPU number (5), a range (eg: 0-2) or a comma separated list (eg: 0,2,4)--> <sysml.gpu.availableGPUs>-1</sysml.gpu.availableGPUs> http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java index 51ab6a1..cb39340 100644 --- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java +++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java @@ -30,8 +30,6 @@ import org.apache.sysml.runtime.controlprogram.Program; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; -import org.apache.sysml.runtime.matrix.data.LibMatrixDNN; -import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.Statistics; public class ScriptExecutorUtils { @@ -75,9 +73,7 @@ public class ScriptExecutorUtils { throws DMLRuntimeException { // Whether extra statistics useful for developers and others interested // in digging into performance problems are recorded and displayed - GPUStatistics.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS); - LibMatrixDNN.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_DNN_STATS); - DMLScript.FINEGRAINED_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS); + DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS); DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU); DMLScript.EAGER_CUDA_FREE = dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE); DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN); http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/conf/DMLConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index e8bde56..42037a1 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -84,8 +84,6 @@ public class DMLConfig public static final String EXTRA_FINEGRAINED_STATS = "sysml.stats.finegrained"; //boolean public static final String STATS_MAX_WRAP_LEN = "sysml.stats.maxWrapLength"; //int - public static final String EXTRA_GPU_STATS = "sysml.stats.extraGPU"; //boolean - public static final String EXTRA_DNN_STATS = "sysml.stats.extraDNN"; //boolean public static final String AVAILABLE_GPUS = "sysml.gpu.availableGPUs"; // String to specify which GPUs to use (a range, all GPUs, comma separated list or a specific GPU) public static final String SYNCHRONIZE_GPU = "sysml.gpu.sync.postProcess"; // boolean: whether to synchronize GPUs after every instruction public static final String EAGER_CUDA_FREE = "sysml.gpu.eager.cudaFree"; // boolean: whether to perform eager CUDA free on rmvar @@ -134,8 +132,6 @@ public class DMLConfig _defaultVals.put(NATIVE_BLAS, "none" ); _defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" ); _defaultVals.put(STATS_MAX_WRAP_LEN, "30" ); - _defaultVals.put(EXTRA_GPU_STATS, "false" ); - _defaultVals.put(EXTRA_DNN_STATS, "false" ); _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); _defaultVals.put(AVAILABLE_GPUS, "-1"); _defaultVals.put(SYNCHRONIZE_GPU, "true" ); @@ -422,7 +418,7 @@ public class DMLConfig CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, COMPRESSED_LINALG, CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, - EXTRA_GPU_STATS, EXTRA_DNN_STATS, EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, + EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION }; http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index f5d2f46..1883582 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -200,9 +200,9 @@ public abstract class GPUInstruction extends Instruction { throws DMLRuntimeException { if(DMLScript.SYNCHRONIZE_GPU) { - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; jcuda.runtime.JCuda.cudaDeviceSynchronize(); - if(GPUStatistics.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_CUDA_SYNC, System.nanoTime() - t0); } } @@ -238,9 +238,9 @@ public abstract class GPUInstruction extends Instruction { * @throws DMLRuntimeException if an error occurs */ protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols) throws DMLRuntimeException { - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols); - if (GPUStatistics.DISPLAY_STATISTICS && mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS && mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0); return mb.getKey(); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index 53f1a19..d165970 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -497,7 +497,7 @@ public class CSRPointer { */ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols, String instName) throws DMLRuntimeException { - long t0 = GPUStatistics.DISPLAY_STATISTICS && instName != null ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS && instName != null ? System.nanoTime() : 0; LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext()); long size = ((long) rows) * getDataTypeSizeOf((long) cols); @@ -510,7 +510,7 @@ public class CSRPointer { } else { LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null"); } - if (GPUStatistics.DISPLAY_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); return A; } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index 0a2f6c4..4b47117 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -298,7 +298,7 @@ public class GPUContext { "GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size / 1024.0) + " Kbytes from previously allocated block on " + this); } - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + if (instructionName != null && DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); Set<Pointer> freeList = freeCUDASpaceMap.get(size); @@ -308,7 +308,7 @@ public class GPUContext { if (freeList.isEmpty()) freeCUDASpaceMap.remove(size); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + if (instructionName != null && DMLScript.FINEGRAINED_STATISTICS) GPUStatistics .maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0); } else { @@ -326,7 +326,7 @@ public class GPUContext { GPUStatistics.cudaAllocTime.add(System.nanoTime() - t0); if (DMLScript.STATISTICS) GPUStatistics.cudaAllocCount.add(statsCount); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + if (instructionName != null && DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0); } @@ -340,7 +340,7 @@ public class GPUContext { cudaMemset(A, 0, size); if (DMLScript.STATISTICS) end = System.nanoTime(); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + if (instructionName != null && DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1); if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.add(end - t1); @@ -413,7 +413,7 @@ public class GPUContext { GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0); if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.add(1); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + if (instructionName != null && DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0); } else { http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 7e01166..c4a16fc 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -401,7 +401,7 @@ public class GPUObject { denseColumnMajorToRowMajor(); if (DMLScript.STATISTICS) end = System.nanoTime(); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + if (instructionName != null && DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start); if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(end - start); @@ -756,9 +756,9 @@ public class GPUObject { if (DMLScript.STATISTICS) start = System.nanoTime(); - long acqrTime = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long acqrTime = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; MatrixBlock tmp = mat.acquireRead(); - if(GPUStatistics.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { if(tmp.isInSparseFormat()) GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_GET_SPARSE_MB, System.nanoTime()-acqrTime); else @@ -826,10 +826,10 @@ public class GPUObject { allocateSparseMatrixOnDevice(); if (copyToDevice) { - long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; CSRPointer.copyToDevice(getGPUContext(), getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values); - if(GPUStatistics.DISPLAY_STATISTICS) + if(DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1); } } else { @@ -845,9 +845,9 @@ public class GPUObject { if (tmp.getNonZeros() == 0) { // Minor optimization: No need to allocate empty error for CPU // data = new double[tmp.getNumRows() * tmp.getNumColumns()]; - long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; cudaMemset(getJcudaDenseMatrixPtr(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns())); - if(GPUStatistics.DISPLAY_STATISTICS) + if(DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t1); } else { http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java index be5ade7..fb70c13 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java @@ -22,6 +22,7 @@ import static jcuda.runtime.JCuda.cudaMemcpy; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; @@ -159,17 +160,17 @@ public class DoublePrecisionCudaSupportFunctions implements CudaSupportFunctions @Override public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName, boolean isEviction) throws DMLRuntimeException { - long t1 = GPUStatistics.DISPLAY_STATISTICS && instName != null? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS && instName != null? System.nanoTime() : 0; cudaMemcpy(Pointer.to(dest), src, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost); - if(GPUStatistics.DISPLAY_STATISTICS && instName != null) + if(DMLScript.FINEGRAINED_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1); } @Override public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) throws DMLRuntimeException { - long t1 = GPUStatistics.DISPLAY_STATISTICS && instName != null? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS && instName != null? System.nanoTime() : 0; cudaMemcpy(dest, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice); - if(GPUStatistics.DISPLAY_STATISTICS && instName != null) + if(DMLScript.FINEGRAINED_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index fd99eb3..59a9e49 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -315,11 +315,11 @@ public class LibMatrixCUDA { Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName); long t1=0; - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("relu_backward", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols)); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); } @@ -387,11 +387,11 @@ public class LibMatrixCUDA { Pointer biasPointer = bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr(); Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr(); long t1 = 0; - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("bias_multiply", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ)); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); } @@ -441,11 +441,11 @@ public class LibMatrixCUDA { } int PQ = cols / k; long t1 = 0; - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("bias_add", ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols), image, bias, output, rows, cols, PQ); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); } @@ -512,13 +512,13 @@ public class LibMatrixCUDA { long t0=0, t1=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); cudaSupportFunctions.cublassyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0); - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); copyUpperToLowerTriangle(gCtx, instName, output); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - t1); } /** @@ -930,19 +930,19 @@ public class LibMatrixCUDA { long t1=0,t2=0; - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n); //cudaDeviceSynchronize; - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1); int s = blocks; while (s > 1) { tmp = getKernelParamsForReduceAll(gCtx, s); blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2]; - if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t2 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2); s = (s + (threads*2-1)) / (threads*2); } double[] result = {-1f}; @@ -971,11 +971,11 @@ public class LibMatrixCUDA { int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols); //cudaDeviceSynchronize; - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0); } @@ -999,11 +999,11 @@ public class LibMatrixCUDA { int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols); //cudaDeviceSynchronize; - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0); } /** @@ -1329,11 +1329,11 @@ public class LibMatrixCUDA { int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0; int size = rlenA * clenA; long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("matrix_scalar_op", ExecutionConfig.getConfigForSimpleVectorOperations(size), a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0); } /** @@ -1433,11 +1433,11 @@ public class LibMatrixCUDA { LOG.trace("GPU : matrix_matrix_cellwise_op" + ", GPUContext=" + gCtx); } long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen), a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn)); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - t0); } /** @@ -1529,11 +1529,11 @@ public class LibMatrixCUDA { int rlen = toInt(out.getNumRows()); int clen = toInt(out.getNumColumns()); long t0 = 0; - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); int size = rlen * clen; getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_FILL_KERNEL, System.nanoTime() - t0); } } @@ -1549,10 +1549,10 @@ public class LibMatrixCUDA { */ private static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) throws DMLRuntimeException { long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); int size = rlen * clen * sizeOfDataType; cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t0); } /** @@ -1633,19 +1633,19 @@ public class LibMatrixCUDA { // Invoke cuSparse when either are in sparse format // Perform sparse-sparse dgeam if (!isInSparseFormat(gCtx, in1)) { - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); in1.getGPUObject(gCtx).denseToSparse(); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0); } CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); if (!isInSparseFormat(gCtx, in2)) { - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); in2.getGPUObject(gCtx).denseToSparse(); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0); } @@ -1668,21 +1668,21 @@ public class LibMatrixCUDA { "Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU"); } - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1); out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); //long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows()); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); cudaSupportFunctions.cusparsecsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr, B.descr, toInt(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd); //cudaDeviceSynchronize; - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0); } @@ -1709,9 +1709,9 @@ public class LibMatrixCUDA { getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); // Allocated the dense output matrix Pointer C = getDensePointer(gCtx, out, instName); - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0); } } @@ -1823,7 +1823,7 @@ public class LibMatrixCUDA { */ protected static void sliceDenseDense(GPUContext gCtx, String instName, Pointer inPointer, Pointer outPointer, int rl, int ru, int cl, int cu, int inClen) throws DMLRuntimeException { - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; long retClen = cu - cl + 1; if (inClen == retClen) { cudaMemcpy(outPointer, inPointer.withByteOffset(rl * inClen * sizeOfDataType), (ru - rl + 1) * inClen @@ -1833,7 +1833,7 @@ public class LibMatrixCUDA { getCudaKernels(gCtx).launchKernel("slice_dense_dense", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(retRlen*retClen)), inPointer, outPointer, rl, ru, cl, cu, inClen, retRlen, retClen); } - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0); } /** @@ -1857,7 +1857,7 @@ public class LibMatrixCUDA { if(size == 0) return; int retRlen = ru - rl + 1; - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; int retClen = cu - cl + 1; String kernel = null; String timer = null; @@ -1879,7 +1879,7 @@ public class LibMatrixCUDA { // We can generalize this later to output sparse matrix. getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu, retClen); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, timer, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, timer, System.nanoTime() - t0); } /** @@ -1924,11 +1924,11 @@ public class LibMatrixCUDA { int maxRows = toInt(Math.max(rowsA, rowsB)); int maxCols = toInt(Math.max(colsA, colsB)); - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx) .launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, rowsA, colsA, rowsB, colsB); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CBIND_KERNEL, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CBIND_KERNEL, System.nanoTime() - t1); } @@ -1959,11 +1959,11 @@ public class LibMatrixCUDA { int maxRows = Math.max(rowsA, rowsB); int maxCols = Math.max(colsA, colsB); - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx) .launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, rowsA, colsA, rowsB, colsB); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RBIND_KERNEL, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RBIND_KERNEL, System.nanoTime() - t1); } @@ -2295,10 +2295,10 @@ public class LibMatrixCUDA { Pointer output = getDensePointer(gCtx, out, instName); Pointer input = getDensePointer(gCtx, in1, instName); int size = toInt(in1.getNumColumns() * in1.getNumRows()); - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), input, output, size); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1); } } @@ -2337,13 +2337,13 @@ public class LibMatrixCUDA { // becomes // C <- A // C <- alpha*B + C - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); cudaMemcpy(C, A, n*((long)sizeOfDataType), cudaMemcpyDeviceToDevice); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1); - if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t2 = System.nanoTime(); cudaSupportFunctions.cublasaxpy(getCublasHandle(gCtx), toInt(n), alphaPtr, B, 1, C, 1); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2); } else { if(LOG.isTraceEnabled()) { @@ -2353,12 +2353,12 @@ public class LibMatrixCUDA { // Matrix-Vector daxpy // Note: Vector-Matrix operation is not supported // daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB) - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); int rlenA = toInt(in1.getNumRows()); int clenA = toInt(in1.getNumColumns()); int rlenB = toInt(in2.getNumRows()); int clenB = toInt(in2.getNumColumns()); getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector", ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA), A, B, constant, C, rlenA, clenA, rlenB, clenB); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1); } } @@ -2406,20 +2406,20 @@ public class LibMatrixCUDA { // convert dense matrices to row major // Operation in cuSolver and cuBlas are for column major dense matrices // and are destructive to the original input - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); GPUObject ATobj = (GPUObject) Aobj.clone(); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); ATobj.denseRowMajorToColumnMajor(); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); Pointer A = ATobj.getJcudaDenseMatrixPtr(); - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); GPUObject bTobj = (GPUObject) bobj.clone(); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); bTobj.denseRowMajorToColumnMajor(); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); Pointer b = bTobj.getJcudaDenseMatrixPtr(); @@ -2428,18 +2428,18 @@ public class LibMatrixCUDA { // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1 // step 3: query working space of geqrf and ormqr - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); int[] lwork = {0}; cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0); // step 4: compute QR factorization Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType); Pointer tau = gCtx.allocate(instName, m * sizeOfDataType); Pointer devInfo = gCtx.allocate(Sizeof.INT); - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0); int[] qrError = {-1}; cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost); @@ -2448,24 +2448,24 @@ public class LibMatrixCUDA { } // step 5: compute Q^T*B - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); cudaSupportFunctions.cusolverDnormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0); cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost); if (qrError[0] != 0) { throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong"); } // step 6: compute x = R \ Q^T*B - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); cudaSupportFunctions.cublastrsm(gCtx.getCublasHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT, n, 1, dataTypePointerTo(1.0), A, m, b, m); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0); - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); bTobj.denseColumnMajorToRowMajor(); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0); // TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash // There is an avoidable copy happening here @@ -2497,10 +2497,10 @@ public class LibMatrixCUDA { */ protected static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) throws DMLRuntimeException { long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols); if (mb.getValue()) - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0); return mb.getKey(); } @@ -2519,10 +2519,10 @@ public class LibMatrixCUDA { */ private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, long nnz, String instName, String name) throws DMLRuntimeException { long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); Pair<MatrixObject, Boolean> mb = ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz); if (mb.getValue()) - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_SPARSE_OUTPUT, System.nanoTime() - t0); return mb.getKey(); } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java index 5935285..4e23953 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java @@ -43,6 +43,7 @@ import jcuda.jcudnn.cudnnTensorDescriptor; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; @@ -220,13 +221,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { } try { long t1 = 0; - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(), algo.nchwTensorDesc, image, algo.filterDesc, filter, algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.nkpqTensorDesc, output); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t1); if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + cudnnStatus.stringFor(status)); @@ -292,9 +293,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { // Perform one-input conv2dBackwardFilter Pointer tempdwPointer = gCtx.allocate(KCRS*sizeOfDataType); for(int n = 0; n < N; n++) { - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType); - if(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0); + if(DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0); // Perform one-input conv2dBackwardFilter cudnnConv2dBackwardFilter(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), tempdwPointer, algo); getCudaKernels(gCtx).launchKernel("inplace_add", @@ -330,10 +331,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx); } try { - long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), algo.nchwTensorDesc, imagePointer, algo.nkpqTensorDesc, doutPointer, algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.filterDesc, dwPointer); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t1); if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); @@ -424,10 +425,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx); } try { - long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), algo.filterDesc, w, algo.nkpqTensorDesc, dy, algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.nchwTensorDesc, dx); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t1); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); @@ -499,11 +500,11 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { LibMatrixCuDNNPoolingDescriptors.cudnnMaxpoolingDescriptors(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q)) { long t1=0,t2=0; - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); - if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) t2 = System.nanoTime(); int status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); } @@ -598,20 +599,20 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { long t1=0, t2=0, t3=0; int status; if(!isMaxPoolOutputProvided) { - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); long numBytes = N*C*P*Q*sizeOfDataType; y = gCtx.allocate(numBytes); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); - if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) t2 = System.nanoTime(); status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); } } - if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t3 = System.nanoTime(); status = cudnnPoolingBackward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.yDesc, y, desc.dyDesc, dy, desc.xDesc, x, zero(), desc.dxDesc, dx); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); @@ -621,10 +622,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { } finally { long t4=0; - if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t4 = System.nanoTime(); if(!isMaxPoolOutputProvided) gCtx.cudaFreeHelper(instName, y); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4); } } @@ -641,18 +642,18 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { cudnnCreateActivationDescriptor(activationDescriptor); double dummy = -1; cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy); - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor, one(), srcTensorDesc, srcData, zero(), dstTensorDesc, dstData); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0); } catch (CudaException e) { throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e); } finally { long t1=0; - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1); + if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1); } } @@ -678,13 +679,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx); } // Invokes relu(double* A, double* ret, int rlen, int clen) - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); Pointer dstData = getDensePointerForCuDNN(gCtx, output, instName); Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu getCudaKernels(gCtx).launchKernel("relu", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)), srcData, dstData, toInt(N), toInt(CHW)); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0); + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0); } else { cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor(); http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java index ee22541..17cd610 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java @@ -19,6 +19,7 @@ package org.apache.sysml.runtime.matrix.data; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; @@ -85,7 +86,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab @Override public void close() { long t3 = 0; - if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime(); + if (DMLScript.FINEGRAINED_STATISTICS) t3 = System.nanoTime(); if(nchwTensorDesc != null) cudnnDestroyTensorDescriptor(nchwTensorDesc); if(nkpqTensorDesc != null) @@ -96,7 +97,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab cudnnDestroyConvolutionDescriptor(convDesc); if(sizeInBytes != 0) gCtx.cudaFreeHelper(instName, workSpace); - if(GPUStatistics.DISPLAY_STATISTICS) + if(DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3); } @@ -125,7 +126,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionForwardAlgorithm( GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) throws DMLRuntimeException { - long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); int[] algos = {-1}; @@ -139,7 +140,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); return ret; } @@ -169,7 +170,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilterAlgorithm( GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) throws DMLRuntimeException { - long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); @@ -186,7 +187,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); return ret; } @@ -216,7 +217,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardDataAlgorithm( GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) throws DMLRuntimeException { - //long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + //long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); @@ -237,7 +238,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab // ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); // ret.sizeInBytes = sizeInBytesArray[0]; // ret.algo = algos[0]; -// if (GPUStatistics.DISPLAY_STATISTICS) +// if (DMLScript.FINEGRAINED_STATISTICS) // GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); return ret; } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java index 5121c87..5a7cad3 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java @@ -20,6 +20,8 @@ package org.apache.sysml.runtime.matrix.data; import static jcuda.runtime.JCuda.cudaMemset; import jcuda.Pointer; + +import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; @@ -59,10 +61,10 @@ public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java public Pointer getNthRow(int n) throws DMLRuntimeException { if(isInputInSparseFormat) { jcuda.runtime.JCuda.cudaDeviceSynchronize(); - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; cudaMemset(outPointer, 0, numColumns*sizeOfDataType); jcuda.runtime.JCuda.cudaDeviceSynchronize(); - if(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0); + if(DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0); LibMatrixCUDA.sliceSparseDense(gCtx, instName, (CSRPointer)inPointer, outPointer, n, n, 0, LibMatrixCUDA.toInt(numColumns-1), numColumns); } else { http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java index d962027..ce0ad5b 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java @@ -164,19 +164,19 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { // and output CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb, params.m, params.n, params.k); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0); // Step 3: Invoke the kernel - long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1); output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); @@ -284,14 +284,14 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params); if (outRLen != 1 && outCLen != 1) { // Transpose: C = t(output) - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T, toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(), toInt(outRLen), C, toInt(outCLen)); if (!DMLScript.EAGER_CUDA_FREE) JCuda.cudaDeviceSynchronize(); gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE); - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0); } @@ -319,7 +319,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { */ private static void denseSparseMatMult(cusparseHandle handle, String instName, Pointer C, Pointer A, CSRPointer B, CuMatMultParameters param) throws DMLRuntimeException { - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; String kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB; // Ignoring sparse vector dense matrix multiplication and dot product boolean isVector = (param.leftNumRows == 1 && !param.isLeftTransposed) @@ -343,7 +343,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc); } - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0); } @@ -370,7 +370,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { */ private static void denseDenseMatMult(cublasHandle handle, String instName, Pointer C, Pointer A, Pointer B, CuMatMultParameters param) throws DMLRuntimeException { - long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; String kernel = null; param.rowToColumnMajor(); param.validate(); @@ -412,7 +412,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { zero(), C, param.ldc); kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB; } - if (GPUStatistics.DISPLAY_STATISTICS) + if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0); } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index 096574a..67d4a1a 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -68,8 +68,6 @@ public class LibMatrixDNN { protected static final Log LOG = LogFactory.getLog(LibMatrixDNN.class.getName()); //library configurations and external contracts - public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in stats output - // ------------------------------------------------------------------------------------------------ private static AtomicLong conv2dSparseCount = new AtomicLong(0); private static AtomicLong conv2dDenseCount = new AtomicLong(0); @@ -89,7 +87,7 @@ public class LibMatrixDNN { static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0); public static void appendStatistics(StringBuilder sb) { - if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t" + conv2dDenseCount.get() + "/" + conv2dBwdFilterDenseCount.get() + "/" @@ -230,7 +228,7 @@ public class LibMatrixDNN { if(params.stride_h <= 0 || params.stride_w <= 0) throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); - if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { if(filter.isInSparseFormat() || dout.isInSparseFormat()) { conv2dBwdDataSparseCount.addAndGet(1); } @@ -255,7 +253,7 @@ public class LibMatrixDNN { if(params.stride_h <= 0 || params.stride_w <= 0) throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); - if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { if(input.isInSparseFormat() || dout.isInSparseFormat()) { conv2dBwdFilterSparseCount.addAndGet(1); } @@ -281,7 +279,7 @@ public class LibMatrixDNN { if(params.stride_h <= 0 || params.stride_w <= 0) throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); - if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { if(input.isInSparseFormat() || filter.isInSparseFormat()) { conv2dSparseCount.addAndGet(1); } @@ -314,7 +312,7 @@ public class LibMatrixDNN { throw new DMLRuntimeException("Incorrect dout dimensions in maxpooling_backward:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.K*params.P*params.Q); } - if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { if(input.isInSparseFormat() || dout.isInSparseFormat()) { maxPoolBwdSparseCount.addAndGet(1); } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java index 960cea6..55f263f 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java @@ -89,20 +89,20 @@ public class LibMatrixDNNConv2dBackwardDataHelper { // rotate180(dout[n,]) => dout_reshaped rotate180Worker.execute(n, 0); // dout_reshaped %*% filter => temp - long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; outMM.reset(PQ, CRS, false); LibMatrixDNNHelper.singleThreadedMatMult(outRotate, filter, outMM, !outRotate.sparse, false, _params); - long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; // col2im(temp) => output[n,] LibMatrixDNNHelper.doCol2imOverSingleImage(n, outMM, _params); - long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { time1 += t2 - t1; time2 += t3 - t2; } } - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { LibMatrixDNN.loopedConvBwdDataMatMultTime.addAndGet(time1); LibMatrixDNN.loopedConvBwdDataCol2ImTime.addAndGet(time2); } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java index 9698725..4a94838 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java @@ -99,24 +99,24 @@ public class LibMatrixDNNConv2dBackwardFilterHelper { rotate180Worker.execute(n, 0); // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; im2ColWorker.execute(n); - long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; outMM.reset(CRS, K, false); LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, outRotate, outMM, !im2ColOutBlock.sparse, !outRotate.sparse, _params); - long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; if( !outMM.isEmptyBlock() ) //accumulate row results LibMatrixMult.vectAdd(outMM.getDenseBlock(), partRet, 0, 0, K*CRS); - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { time1 += t2 - t1; time2 += t3 - t2; } } inplaceTransAdd(partRet, _params); - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1); LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2); } @@ -150,27 +150,27 @@ public class LibMatrixDNNConv2dBackwardFilterHelper { rotate180Worker.execute(n, 0); // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; im2ColWorker.execute(n); - long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; outMM.reset(K, CRS, false); //Timing time = new Timing(true); LibMatrixDNNHelper.singleThreadedMatMult(outRotate, im2ColOutBlock, outMM, !outRotate.sparse, !im2ColOutBlock.sparse, _params); - long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; if( !outMM.isEmptyBlock() ) //accumulate row results LibMatrixMult.vectAdd(outMM.getDenseBlock(), partRet, 0, 0, K*CRS); - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { time1 += t2 - t1; time2 += t3 - t2; } } //no need to transpose because t(t(out)) cancel out inplaceAdd(partRet, _params); - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1); LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2); } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java index 6a0205e..b390906 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java @@ -56,16 +56,16 @@ public class LibMatrixDNNConv2dHelper { for(int n = _rl; n < _ru; n++) { for(int c = 0; c < _params.C; c++) { // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; im2ColWorker.execute(n, c); - long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; // filter %*% _im2ColOutBlock => matMultOutBlock MatrixBlock matMultOutBlock = new MatrixBlock(K, PQ, false); LibMatrixDNNHelper.singleThreadedMatMult(_filters.get(c), im2ColOutBlock, matMultOutBlock, false, true, _params); - long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { time1 += t2 - t1; time2 += t3 - t2; } @@ -77,7 +77,7 @@ public class LibMatrixDNNConv2dHelper { if(_params.bias != null) LibMatrixDNNHelper.addBias(n, _params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ); } - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); } @@ -137,16 +137,16 @@ public class LibMatrixDNNConv2dHelper { long time1 = 0; long time2 = 0; for(int n = _rl; n < _ru; n++) { // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; im2ColWorker.execute(n); - long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; // filter %*% _im2ColOutBlock => matMultOutBlock outMM.reset(outMM.rlen, outMM.clen, false); LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, outIm2col, outMM, false, true, _params); - long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { time1 += t2 - t1; time2 += t3 - t2; } @@ -159,7 +159,7 @@ public class LibMatrixDNNConv2dHelper { LibMatrixDNNHelper.addBias(n, _params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ); } - if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + if(DMLScript.FINEGRAINED_STATISTICS) { LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java index 96a3887..8429d35 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java @@ -24,6 +24,7 @@ import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; @@ -163,7 +164,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions @Override public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName, boolean isEviction) throws DMLRuntimeException { - long t1 = GPUStatistics.DISPLAY_STATISTICS && instName != null? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS && instName != null? System.nanoTime() : 0; // We invoke transfer matrix from device to host in two cases: // 1. During eviction of unlocked matrices // 2. During acquireHostRead @@ -187,7 +188,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions dest[i] = floatData[i]; } } - if(GPUStatistics.DISPLAY_STATISTICS && instName != null) + if(DMLScript.FINEGRAINED_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1); } @@ -195,7 +196,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) throws DMLRuntimeException { LOG.debug("Potential OOM: Allocated additional space in hostToDevice"); // TODO: Perform conversion on GPU using double2float and float2double kernels - long t1 = GPUStatistics.DISPLAY_STATISTICS && instName != null? System.nanoTime() : 0; + long t1 = DMLScript.FINEGRAINED_STATISTICS && instName != null? System.nanoTime() : 0; if(PERFORM_CONVERSION_ON_DEVICE) { Pointer deviceDoubleData = gCtx.allocate(((long)src.length)*Sizeof.DOUBLE); cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice); @@ -210,7 +211,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions cudaMemcpy(dest, Pointer.to(floatData), ((long)src.length)*Sizeof.FLOAT, cudaMemcpyHostToDevice); } - if(GPUStatistics.DISPLAY_STATISTICS && instName != null) + if(DMLScript.FINEGRAINED_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/utils/GPUStatistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java index 2668ad9..33ab953 100644 --- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java +++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java @@ -35,9 +35,6 @@ import org.apache.sysml.api.DMLScript; * Printed as part of {@link Statistics}. */ public class GPUStatistics { - // Whether or not extra per-instruction statistics will be recorded and shown for the GPU - public static boolean DISPLAY_STATISTICS = false; - private static int iNoOfExecutedGPUInst = 0; public static long cudaInitTime = 0; @@ -117,7 +114,7 @@ public class GPUStatistics { */ public synchronized static void maintainCPMiscTimes( String instructionName, String miscTimer, long timeNanos, long incrementCount) { - if (!(DISPLAY_STATISTICS || DMLScript.FINEGRAINED_STATISTICS)) + if (!(DMLScript.FINEGRAINED_STATISTICS)) return; HashMap<String, Long> miscTimesMap = _cpInstMiscTime.get(instructionName); http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/utils/Statistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java index 5cc0650..44bb232 100644 --- a/src/main/java/org/apache/sysml/utils/Statistics.java +++ b/src/main/java/org/apache/sysml/utils/Statistics.java @@ -601,7 +601,7 @@ public class Statistics sb.append(String.format( " %" + maxNumLen + "s %-" + maxInstLen + "s %" + maxTimeSLen + "s %" + maxCountLen + "s", numCol, instCol, timeSCol, countCol)); - if (GPUStatistics.DISPLAY_STATISTICS || DMLScript.FINEGRAINED_STATISTICS) { + if (DMLScript.FINEGRAINED_STATISTICS) { sb.append(" "); sb.append(gpuCol); } @@ -618,15 +618,15 @@ public class Statistics int numLines = wrappedInstruction.length; String [] miscTimers = null; - if (GPUStatistics.DISPLAY_STATISTICS || DMLScript.FINEGRAINED_STATISTICS) { + if (DMLScript.FINEGRAINED_STATISTICS) { miscTimers = wrap(GPUStatistics.getStringForCPMiscTimesPerInstruction(instruction), DMLScript.STATISTICS_MAX_WRAP_LEN); numLines = Math.max(numLines, miscTimers.length); } - String miscFormatString = (GPUStatistics.DISPLAY_STATISTICS || DMLScript.FINEGRAINED_STATISTICS) ? " %" + DMLScript.STATISTICS_MAX_WRAP_LEN + "s" : "%s"; + String miscFormatString = (DMLScript.FINEGRAINED_STATISTICS) ? " %" + DMLScript.STATISTICS_MAX_WRAP_LEN + "s" : "%s"; for(int wrapIter = 0; wrapIter < numLines; wrapIter++) { String instStr = (wrapIter < wrappedInstruction.length) ? wrappedInstruction[wrapIter] : ""; - String miscTimerStr = ( (GPUStatistics.DISPLAY_STATISTICS || DMLScript.FINEGRAINED_STATISTICS) && wrapIter < miscTimers.length) ? miscTimers[wrapIter] : ""; + String miscTimerStr = ( (DMLScript.FINEGRAINED_STATISTICS) && wrapIter < miscTimers.length) ? miscTimers[wrapIter] : ""; if(wrapIter == 0) { // Display instruction count sb.append(String.format(
