Repository: systemml
Updated Branches:
  refs/heads/master 3da574684 -> de69afdc8


[MINOR] Merge extra GPU and extra DNN flags into a single
"sysml.stats.finegrained" flag to simplify usage

Closes #701.
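
The patch replaces the per-subsystem flags GPUStatistics.DISPLAY_STATISTICS and
LibMatrixDNN.DISPLAY_STATISTICS with the single DMLScript.FINEGRAINED_STATISTICS
flag, backed by the "sysml.stats.finegrained" configuration property. A minimal
sketch of the guard-and-time pattern the runtime now uses throughout (class,
field and timer names are taken from the patch; the enclosing method
timedCudaSync is purely illustrative):

    // Sketch only: how a GPU-side step is timed under the consolidated flag.
    private void timedCudaSync(String extendedOpcode) {
        long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
        jcuda.runtime.JCuda.cudaDeviceSynchronize();   // the step being timed
        if (DMLScript.FINEGRAINED_STATISTICS)          // single flag replaces the old GPU/DNN flags
            GPUStatistics.maintainCPMiscTimes(extendedOpcode,
                    GPUInstruction.MISC_TIMER_CUDA_SYNC, System.nanoTime() - t0);
    }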


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/de69afdc
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/de69afdc
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/de69afdc

Branch: refs/heads/master
Commit: de69afdc84ad99e2f2a41ac03f69f2f1e64e921f
Parents: 3da5746
Author: Niketan Pansare <[email protected]>
Authored: Tue Nov 14 11:06:18 2017 -0800
Committer: Niketan Pansare <[email protected]>
Committed: Tue Nov 14 11:06:18 2017 -0800

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   8 +-
 .../apache/sysml/api/ScriptExecutorUtils.java   |   6 +-
 .../java/org/apache/sysml/conf/DMLConfig.java   |   6 +-
 .../instructions/gpu/GPUInstruction.java        |   8 +-
 .../instructions/gpu/context/CSRPointer.java    |   4 +-
 .../instructions/gpu/context/GPUContext.java    |  10 +-
 .../instructions/gpu/context/GPUObject.java     |  14 +-
 .../DoublePrecisionCudaSupportFunctions.java    |   9 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 148 +++++++++----------
 .../runtime/matrix/data/LibMatrixCuDNN.java     |  53 +++----
 .../LibMatrixCuDNNConvolutionAlgorithm.java     |  17 ++-
 .../data/LibMatrixCuDNNInputRowFetcher.java     |   6 +-
 .../runtime/matrix/data/LibMatrixCuMatMult.java |  20 +--
 .../sysml/runtime/matrix/data/LibMatrixDNN.java |  12 +-
 .../LibMatrixDNNConv2dBackwardDataHelper.java   |  10 +-
 .../LibMatrixDNNConv2dBackwardFilterHelper.java |  20 +--
 .../matrix/data/LibMatrixDNNConv2dHelper.java   |  20 +--
 .../SinglePrecisionCudaSupportFunctions.java    |   9 +-
 .../org/apache/sysml/utils/GPUStatistics.java   |   5 +-
 .../java/org/apache/sysml/utils/Statistics.java |   8 +-
 20 files changed, 190 insertions(+), 203 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template 
b/conf/SystemML-config.xml.template
index 8452e75..8a4a5d6 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -75,15 +75,9 @@
    <!-- enables native blas for matrix multiplication and convolution, 
experimental feature (options: auto, mkl, openblas, none) -->
    <sysml.native.blas>none</sysml.native.blas>
 
-   <!-- prints finegrained statistics information -->
+   <!-- prints finegrained statistics information (includes extra GPU 
information and extra statistics information for Deep Neural Networks done in 
CP mode) -->
    <sysml.stats.finegrained>false</sysml.stats.finegrained>
    
-   <!-- prints extra statistics information for GPU -->
-   <sysml.stats.extraGPU>false</sysml.stats.extraGPU>
-
-   <!-- prints extra statistics information for Deep Neural Networks done in 
CP mode -->
-   <sysml.stats.extraDNN>false</sysml.stats.extraDNN>
-
     <!-- sets the GPUs to use per process, -1 for all GPUs, a specific GPU 
number (5), a range (eg: 0-2) or a comma separated list (eg: 0,2,4)-->
     <sysml.gpu.availableGPUs>-1</sysml.gpu.availableGPUs>
     

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java 
b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index 51ab6a1..cb39340 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -30,8 +30,6 @@ import org.apache.sysml.runtime.controlprogram.Program;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
-import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
 public class ScriptExecutorUtils {
@@ -75,9 +73,7 @@ public class ScriptExecutorUtils {
                        throws DMLRuntimeException {
                // Whether extra statistics useful for developers and others 
interested
                // in digging into performance problems are recorded and 
displayed
-               GPUStatistics.DISPLAY_STATISTICS = 
dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS);
-               LibMatrixDNN.DISPLAY_STATISTICS = 
dmlconf.getBooleanValue(DMLConfig.EXTRA_DNN_STATS);
-               DMLScript.FINEGRAINED_STATISTICS = 
dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS);
+               DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && 
dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS);
                DMLScript.SYNCHRONIZE_GPU = 
dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU);
                DMLScript.EAGER_CUDA_FREE = 
dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE);
                DMLScript.STATISTICS_MAX_WRAP_LEN = 
dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN);
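
With this hunk, fine-grained statistics are derived from both the global
statistics flag and the new configuration property, so "sysml.stats.finegrained"
has no effect unless statistics collection (-stats) is also enabled for the run.
A minimal sketch of the resulting semantics (assuming the DMLConfig instance
dmlconf from the surrounding method):

    // Sketch only: effective value of the consolidated flag after this change.
    boolean finegrainedConf = dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS); // sysml.stats.finegrained
    DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && finegrainedConf;           // false unless -stats is on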

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java 
b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index e8bde56..42037a1 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -84,8 +84,6 @@ public class DMLConfig
        
        public static final String EXTRA_FINEGRAINED_STATS = 
"sysml.stats.finegrained"; //boolean
        public static final String STATS_MAX_WRAP_LEN   = 
"sysml.stats.maxWrapLength"; //int
-       public static final String EXTRA_GPU_STATS      = 
"sysml.stats.extraGPU"; //boolean
-       public static final String EXTRA_DNN_STATS      = 
"sysml.stats.extraDNN"; //boolean
        public static final String AVAILABLE_GPUS       = 
"sysml.gpu.availableGPUs"; // String to specify which GPUs to use (a range, all 
GPUs, comma separated list or a specific GPU)
        public static final String SYNCHRONIZE_GPU      = 
"sysml.gpu.sync.postProcess"; // boolean: whether to synchronize GPUs after 
every instruction 
        public static final String EAGER_CUDA_FREE              = 
"sysml.gpu.eager.cudaFree"; // boolean: whether to perform eager CUDA free on 
rmvar
@@ -134,8 +132,6 @@ public class DMLConfig
                _defaultVals.put(NATIVE_BLAS,            "none" );
                _defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" );
                _defaultVals.put(STATS_MAX_WRAP_LEN,     "30" );
-               _defaultVals.put(EXTRA_GPU_STATS,        "false" );
-               _defaultVals.put(EXTRA_DNN_STATS,        "false" );
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
                _defaultVals.put(AVAILABLE_GPUS,         "-1");
                _defaultVals.put(SYNCHRONIZE_GPU,        "true" );
@@ -422,7 +418,7 @@ public class DMLConfig
                                CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS,
                                COMPRESSED_LINALG, 
                                CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, 
CODEGEN_PLANCACHE, CODEGEN_LITERALS,
-                               EXTRA_GPU_STATS, EXTRA_DNN_STATS, 
EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN,
+                               EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN,
                                AVAILABLE_GPUS, SYNCHRONIZE_GPU, 
EAGER_CUDA_FREE, FLOATING_POINT_PRECISION
                }; 
                

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index f5d2f46..1883582 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -200,9 +200,9 @@ public abstract class GPUInstruction extends Instruction {
                                        throws DMLRuntimeException
        {
                if(DMLScript.SYNCHRONIZE_GPU) {
-                       long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       long t0 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                        jcuda.runtime.JCuda.cudaDeviceSynchronize();
-                       if(GPUStatistics.DISPLAY_STATISTICS) {
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
                                
GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), 
GPUInstruction.MISC_TIMER_CUDA_SYNC, System.nanoTime() - t0);
                        }
                }
@@ -238,9 +238,9 @@ public abstract class GPUInstruction extends Instruction {
         * @throws DMLRuntimeException  if an error occurs
         */
        protected MatrixObject 
getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long 
numRows, long numCols) throws DMLRuntimeException {
-               long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() 
: 0;
                Pair<MatrixObject, Boolean> mb = 
ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols);
-               if (GPUStatistics.DISPLAY_STATISTICS && mb.getValue()) 
GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS && mb.getValue()) 
GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
                return mb.getKey();
        }
 }
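
Note that the allocation timer in the method above is only charged when the
Boolean component of the returned pair is true, which presumably indicates that
a new dense output buffer was actually allocated. A condensed sketch of that
guard (names as in the patch; the fragment is not a standalone method):

    // Sketch only: time the allocation path solely when a new buffer was created.
    long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
    Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols);
    if (DMLScript.FINEGRAINED_STATISTICS && mb.getValue())   // skip the timer if the output already existed
        GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(),
                GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
    return mb.getKey();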

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 53f1a19..d165970 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -497,7 +497,7 @@ public class CSRPointer {
         */
        public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, 
cublasHandle cublasHandle, int rows,
                        int cols, String instName) throws DMLRuntimeException {
-               long t0 = GPUStatistics.DISPLAY_STATISTICS && instName != null 
? System.nanoTime() : 0;
+               long t0 = DMLScript.FINEGRAINED_STATISTICS && instName != null 
? System.nanoTime() : 0;
                LOG.trace("GPU : sparse -> column major dense (inside 
CSRPointer) on " + this + ", GPUContext="
                                + getGPUContext());
                long size = ((long) rows) * getDataTypeSizeOf((long) cols);
@@ -510,7 +510,7 @@ public class CSRPointer {
                } else {
                        LOG.debug("in CSRPointer, the values array, row 
pointers array or column indices array was null");
                }
-               if (GPUStatistics.DISPLAY_STATISTICS && instName != null) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS && instName != null) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
                return A;
        }
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 0a2f6c4..4b47117 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -298,7 +298,7 @@ public class GPUContext {
                                                "GPU : in allocate from 
instruction " + instructionName + ", found free block of size " + (size
                                                                / 1024.0) + " 
Kbytes from previously allocated block on " + this);
                        }
-                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                       if (instructionName != null && 
DMLScript.FINEGRAINED_STATISTICS)
                                t0 = System.nanoTime();
                        Set<Pointer> freeList = freeCUDASpaceMap.get(size);
 
@@ -308,7 +308,7 @@ public class GPUContext {
 
                        if (freeList.isEmpty())
                                freeCUDASpaceMap.remove(size);
-                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                       if (instructionName != null && 
DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics
                                                
.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, 
System.nanoTime() - t0);
                } else {
@@ -326,7 +326,7 @@ public class GPUContext {
                                
GPUStatistics.cudaAllocTime.add(System.nanoTime() - t0);
                        if (DMLScript.STATISTICS)
                                GPUStatistics.cudaAllocCount.add(statsCount);
-                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                       if (instructionName != null && 
DMLScript.FINEGRAINED_STATISTICS)
                                
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_ALLOCATE,
                                                System.nanoTime() - t0);
                }
@@ -340,7 +340,7 @@ public class GPUContext {
                cudaMemset(A, 0, size);
                if (DMLScript.STATISTICS)
                        end = System.nanoTime();
-               if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+               if (instructionName != null && DMLScript.FINEGRAINED_STATISTICS)
                        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
                if (DMLScript.STATISTICS)
                        GPUStatistics.cudaMemSet0Time.add(end - t1);
@@ -413,7 +413,7 @@ public class GPUContext {
                                
GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0);
                        if (DMLScript.STATISTICS)
                                GPUStatistics.cudaDeAllocCount.add(1);
-                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                       if (instructionName != null && 
DMLScript.FINEGRAINED_STATISTICS)
                                
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE,
                                                System.nanoTime() - t0);
                } else {

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 7e01166..c4a16fc 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -401,7 +401,7 @@ public class GPUObject {
                denseColumnMajorToRowMajor();
                if (DMLScript.STATISTICS)
                        end = System.nanoTime();
-               if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+               if (instructionName != null && DMLScript.FINEGRAINED_STATISTICS)
                        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start);
                if (DMLScript.STATISTICS)
                        GPUStatistics.cudaSparseToDenseTime.add(end - start);
@@ -756,9 +756,9 @@ public class GPUObject {
                if (DMLScript.STATISTICS)
                        start = System.nanoTime();
 
-               long acqrTime = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+               long acqrTime = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                MatrixBlock tmp = mat.acquireRead();
-               if(GPUStatistics.DISPLAY_STATISTICS) {
+               if(DMLScript.FINEGRAINED_STATISTICS) {
                        if(tmp.isInSparseFormat())
                                GPUStatistics.maintainCPMiscTimes(opcode, 
CPInstruction.MISC_TIMER_GET_SPARSE_MB, System.nanoTime()-acqrTime);
                        else
@@ -826,10 +826,10 @@ public class GPUObject {
                        allocateSparseMatrixOnDevice();
 
                        if (copyToDevice) {
-                               long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                CSRPointer.copyToDevice(getGPUContext(), 
getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd,
                                                values);
-                               if(GPUStatistics.DISPLAY_STATISTICS) 
+                               if(DMLScript.FINEGRAINED_STATISTICS) 
                                        
GPUStatistics.maintainCPMiscTimes(opcode, 
GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
                        }
                } else {
@@ -845,9 +845,9 @@ public class GPUObject {
                        if (tmp.getNonZeros() == 0) {
                                // Minor optimization: No need to allocate 
empty error for CPU 
                                // data = new double[tmp.getNumRows() * 
tmp.getNumColumns()];
-                               long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                cudaMemset(getJcudaDenseMatrixPtr(), 0, 
getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
-                               if(GPUStatistics.DISPLAY_STATISTICS) 
+                               if(DMLScript.FINEGRAINED_STATISTICS) 
                                        
GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t1);
                        }
                        else {

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
index be5ade7..fb70c13 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -22,6 +22,7 @@ import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
@@ -159,17 +160,17 @@ public class DoublePrecisionCudaSupportFunctions 
implements CudaSupportFunctions
 
        @Override
        public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, 
String instName, boolean isEviction) throws DMLRuntimeException {
-               long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != 
null? System.nanoTime() : 0;
+               long t1 = DMLScript.FINEGRAINED_STATISTICS  && instName != 
null? System.nanoTime() : 0;
                cudaMemcpy(Pointer.to(dest), src, 
((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
-               if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+               if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
                        GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
        }
 
        @Override
        public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, 
String instName) throws DMLRuntimeException {
-               long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != 
null? System.nanoTime() : 0;
+               long t1 = DMLScript.FINEGRAINED_STATISTICS  && instName != 
null? System.nanoTime() : 0;
                cudaMemcpy(dest, Pointer.to(src), 
((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
-               if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+               if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
                        GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
        }
 }
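
As in CSRPointer and GPUContext above, the transfer timers here remain guarded
on instName being non-null, so fine-grained timing is skipped for calls that are
not attributed to a specific instruction. A condensed sketch of the
device-to-host case (names as in the patch; fragment only):

    // Sketch only: device-to-host copy timed only for attributed instructions.
    long t1 = DMLScript.FINEGRAINED_STATISTICS && instName != null ? System.nanoTime() : 0;
    cudaMemcpy(Pointer.to(dest), src, ((long) dest.length) * Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
    if (DMLScript.FINEGRAINED_STATISTICS && instName != null)
        GPUStatistics.maintainCPMiscTimes(instName,
                GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);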

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index fd99eb3..59a9e49 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -315,11 +315,11 @@ public class LibMatrixCUDA {
                Pointer outputPointer = getDensePointer(gCtx, outputBlock, 
instName);
 
                long t1=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("relu_backward",
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
                                imagePointer, doutPointer, outputPointer, 
toInt(rows), toInt(cols));
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
 
        }
        
@@ -387,11 +387,11 @@ public class LibMatrixCUDA {
                Pointer biasPointer = 
bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
                Pointer outputPointer = 
outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
                long t1 = 0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("bias_multiply",
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
                                imagePointer, biasPointer, outputPointer, 
toInt(rows), toInt(cols), toInt(PQ));
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
 
        }
 
@@ -441,11 +441,11 @@ public class LibMatrixCUDA {
                }
                int PQ = cols / k;
                long t1 = 0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("bias_add",
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
                                image, bias, output, rows, cols, PQ);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
        }
        
 
@@ -512,13 +512,13 @@ public class LibMatrixCUDA {
 
                long t0=0, t1=0;
 
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                cudaSupportFunctions.cublassyrk(getCublasHandle(gCtx), 
cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, 
ldc);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, 
System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, 
System.nanoTime() - t0);
 
-               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
                copyUpperToLowerTriangle(gCtx, instName, output);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - 
t1);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - 
t1);
        }
 
        /**
@@ -930,19 +930,19 @@ public class LibMatrixCUDA {
 
                long t1=0,t2=0;
 
-               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel(kernelFunction, new 
ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
                //cudaDeviceSynchronize;
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1);
 
                int s = blocks;
                while (s > 1) {
                        tmp = getKernelParamsForReduceAll(gCtx, s);
                        blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
-                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t2 = 
System.nanoTime();
                        getCudaKernels(gCtx).launchKernel(kernelFunction, new 
ExecutionConfig(blocks, threads, sharedMem),
                                        tempOut, tempOut, s);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
                        s = (s + (threads*2-1)) / (threads*2);
                }
                double[] result = {-1f};
@@ -971,11 +971,11 @@ public class LibMatrixCUDA {
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
                long t0=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel(kernelFunction, new 
ExecutionConfig(blocks, threads, sharedMem),
                                in, out, rows, cols);
                //cudaDeviceSynchronize;
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0);
 
        }
 
@@ -999,11 +999,11 @@ public class LibMatrixCUDA {
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
                long t0=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel(kernelFunction, new 
ExecutionConfig(blocks, threads, sharedMem),
                                in, out, rows, cols);
                //cudaDeviceSynchronize;
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0);
        }
 
        /**
@@ -1329,11 +1329,11 @@ public class LibMatrixCUDA {
                int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0;
                int size = rlenA * clenA;
                long t0=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("matrix_scalar_op",
                                
ExecutionConfig.getConfigForSimpleVectorOperations(size),
                                a, scalar, c, size, getBinaryOp(op.fn), 
isLeftScalar);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0);
        }
 
        /**
@@ -1433,11 +1433,11 @@ public class LibMatrixCUDA {
                        LOG.trace("GPU : matrix_matrix_cellwise_op" + ", 
GPUContext=" + gCtx);
                }
                long t0=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op",
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
                                a, b, c, maxRlen, maxClen, vecStatusA, 
vecStatusB, getBinaryOp(op.fn));
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - 
t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - 
t0);
        }
 
        /**
@@ -1529,11 +1529,11 @@ public class LibMatrixCUDA {
                        int rlen = toInt(out.getNumRows());
                        int clen = toInt(out.getNumColumns());
                        long t0 = 0;
-                       if (GPUStatistics.DISPLAY_STATISTICS)
+                       if (DMLScript.FINEGRAINED_STATISTICS)
                                t0 = System.nanoTime();
                        int size = rlen * clen;
                        getCudaKernels(gCtx).launchKernel("fill", 
ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size);
-                       if (GPUStatistics.DISPLAY_STATISTICS)
+                       if (DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_FILL_KERNEL, System.nanoTime() - t0);
                }
        }
@@ -1549,10 +1549,10 @@ public class LibMatrixCUDA {
         */
        private static void deviceCopy(String instName, Pointer src, Pointer 
dest, int rlen, int clen) throws DMLRuntimeException {
                long t0=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                int size = rlen * clen * sizeOfDataType;
                cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t0);
        }
 
        /**
@@ -1633,19 +1633,19 @@ public class LibMatrixCUDA {
                        // Invoke cuSparse when either are in sparse format
                        // Perform sparse-sparse dgeam
                        if (!isInSparseFormat(gCtx, in1)) {
-                               if (GPUStatistics.DISPLAY_STATISTICS)
+                               if (DMLScript.FINEGRAINED_STATISTICS)
                                        t0 = System.nanoTime();
                                in1.getGPUObject(gCtx).denseToSparse();
-                               if (GPUStatistics.DISPLAY_STATISTICS)
+                               if (DMLScript.FINEGRAINED_STATISTICS)
                                        
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE,
                                                        System.nanoTime() - t0);
                        }
                        CSRPointer A = 
in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
                        if (!isInSparseFormat(gCtx, in2)) {
-                               if (GPUStatistics.DISPLAY_STATISTICS)
+                               if (DMLScript.FINEGRAINED_STATISTICS)
                                        t0 = System.nanoTime();
                                in2.getGPUObject(gCtx).denseToSparse();
-                               if (GPUStatistics.DISPLAY_STATISTICS)
+                               if (DMLScript.FINEGRAINED_STATISTICS)
                                        
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE,
                                                        System.nanoTime() - t0);
                        }
@@ -1668,21 +1668,21 @@ public class LibMatrixCUDA {
                                                        "Transpose in 
cusparseDcsrgeam not supported for sparse matrices on GPU");
                                }
 
-                               if (GPUStatistics.DISPLAY_STATISTICS)
+                               if (DMLScript.FINEGRAINED_STATISTICS)
                                        t1 = System.nanoTime();
                                CSRPointer C = 
CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
-                               if (GPUStatistics.DISPLAY_STATISTICS)
+                               if (DMLScript.FINEGRAINED_STATISTICS)
                                        
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB,
                                                        System.nanoTime() - t1);
 
                                
out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
                                //long sizeOfC = CSRPointer.estimateSize(C.nnz, 
out.getNumRows());
-                               if (GPUStatistics.DISPLAY_STATISTICS)
+                               if (DMLScript.FINEGRAINED_STATISTICS)
                                        t0 = System.nanoTime();
                                
cudaSupportFunctions.cusparsecsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, 
A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr,
                                                B.descr, toInt(B.nnz), B.val, 
B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
                                //cudaDeviceSynchronize;
-                               if (GPUStatistics.DISPLAY_STATISTICS)
+                               if (DMLScript.FINEGRAINED_STATISTICS)
                                        
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB,
                                                        System.nanoTime() - t0);
                        }
@@ -1709,9 +1709,9 @@ public class LibMatrixCUDA {
                        getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName, outRLen, outCLen);      // Allocated the dense output matrix
                        Pointer C = getDensePointer(gCtx, out, instName);
 
-                       if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t0 = 
System.nanoTime();
                        cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), 
transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
                }
        }
 
@@ -1823,7 +1823,7 @@ public class LibMatrixCUDA {
         */
        protected static void sliceDenseDense(GPUContext gCtx, String instName, 
Pointer inPointer, Pointer outPointer, 
                        int rl, int ru, int cl, int cu, int inClen) throws 
DMLRuntimeException {
-               long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() 
: 0;
                long retClen = cu - cl + 1;
                if (inClen == retClen) {
                        cudaMemcpy(outPointer, inPointer.withByteOffset(rl * 
inClen * sizeOfDataType), (ru - rl + 1) * inClen
@@ -1833,7 +1833,7 @@ public class LibMatrixCUDA {
                        getCudaKernels(gCtx).launchKernel("slice_dense_dense", 
ExecutionConfig.getConfigForSimpleVectorOperations(toInt(retRlen*retClen)),
                                        inPointer, outPointer, rl, ru, cl, cu, 
inClen,  retRlen, retClen);
                }
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0);
        }
        
        /**
@@ -1857,7 +1857,7 @@ public class LibMatrixCUDA {
                if(size == 0) return;
                
                int retRlen = ru - rl + 1;
-               long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() 
: 0;
                int retClen = cu - cl + 1;
                
                String kernel = null; String timer = null;
@@ -1879,7 +1879,7 @@ public class LibMatrixCUDA {
                // We can generalize this later to output sparse matrix.
                getCudaKernels(gCtx).launchKernel(kernel, 
ExecutionConfig.getConfigForSimpleVectorOperations(size),
                                inPointer.val, inPointer.rowPtr, 
inPointer.colInd, outPointer, rl, ru, cl, cu, retClen);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, timer, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, timer, System.nanoTime() - t0);
        }
        
        /**
@@ -1924,11 +1924,11 @@ public class LibMatrixCUDA {
                int maxRows = toInt(Math.max(rowsA, rowsB));
                int maxCols = toInt(Math.max(colsA, colsB));
 
-               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx)
                .launchKernel("cbind", 
ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
                                rowsA, colsA, rowsB, colsB);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CBIND_KERNEL, System.nanoTime() - t1);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CBIND_KERNEL, System.nanoTime() - t1);
 
        }
 
@@ -1959,11 +1959,11 @@ public class LibMatrixCUDA {
                int maxRows = Math.max(rowsA, rowsB);
                int maxCols = Math.max(colsA, colsB);
 
-               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx)
                .launchKernel("rbind", 
ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
                                rowsA, colsA, rowsB, colsB);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RBIND_KERNEL, System.nanoTime() - t1);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RBIND_KERNEL, System.nanoTime() - t1);
 
        }
 
@@ -2295,10 +2295,10 @@ public class LibMatrixCUDA {
                        Pointer output = getDensePointer(gCtx, out, instName);
                        Pointer input = getDensePointer(gCtx, in1, instName);
                        int size = toInt(in1.getNumColumns() * 
in1.getNumRows());
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t1 = 
System.nanoTime();
                        getCudaKernels(gCtx).launchKernel(kernel, 
ExecutionConfig.getConfigForSimpleVectorOperations(size),
                                        input, output, size);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - 
t1);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - 
t1);
                }
        }
 
@@ -2337,13 +2337,13 @@ public class LibMatrixCUDA {
                        // becomes
                        // C <- A
                        // C <- alpha*B + C
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t1 = 
System.nanoTime();
                        cudaMemcpy(C, A, n*((long)sizeOfDataType), 
cudaMemcpyDeviceToDevice);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1);
 
-                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t2 = 
System.nanoTime();
                        cudaSupportFunctions.cublasaxpy(getCublasHandle(gCtx), 
toInt(n), alphaPtr, B, 1, C, 1);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
                }
                else {
                        if(LOG.isTraceEnabled()) {
@@ -2353,12 +2353,12 @@ public class LibMatrixCUDA {
                        // Matrix-Vector daxpy
                        // Note: Vector-Matrix operation is not supported
                        // daxpy_matrix_vector(double* A,  double* B, double 
alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB)
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t1 = 
System.nanoTime();
                        int rlenA = toInt(in1.getNumRows()); int clenA =  
toInt(in1.getNumColumns());
                        int rlenB = toInt(in2.getNumRows()); int clenB =  
toInt(in2.getNumColumns());
                        
getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector", 
ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA),
                                        A, B, constant, C, rlenA, clenA, rlenB, 
clenB);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1);
                }
        }
 
@@ -2406,20 +2406,20 @@ public class LibMatrixCUDA {
                // convert dense matrices to row major
                // Operation in cuSolver and cuBlas are for column major dense 
matrices
                // and are destructive to the original input
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                GPUObject ATobj = (GPUObject) Aobj.clone();
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                ATobj.denseRowMajorToColumnMajor();
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
                Pointer A = ATobj.getJcudaDenseMatrixPtr();
 
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                GPUObject bTobj = (GPUObject) bobj.clone();
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                bTobj.denseRowMajorToColumnMajor();
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
 
 
                Pointer b = bTobj.getJcudaDenseMatrixPtr();
@@ -2428,18 +2428,18 @@ public class LibMatrixCUDA {
                // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
 
                // step 3: query working space of geqrf and ormqr
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                int[] lwork = {0};
                
cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, 
n, A, m, lwork);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
 
                // step 4: compute QR factorization
                Pointer work = gCtx.allocate(instName, lwork[0] * 
sizeOfDataType);
                Pointer tau = gCtx.allocate(instName, m * sizeOfDataType);
                Pointer devInfo = gCtx.allocate(Sizeof.INT);
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                
cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, 
tau, work, lwork[0], devInfo);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, 
System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, 
System.nanoTime() - t0);
 
                int[] qrError = {-1};
                cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, 
cudaMemcpyDeviceToHost);
@@ -2448,24 +2448,24 @@ public class LibMatrixCUDA {
                }
 
                // step 5: compute Q^T*B
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                
cudaSupportFunctions.cusolverDnormqr(gCtx.getCusolverDnHandle(), 
cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, 
tau, b, m, work, lwork[0], devInfo);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, 
System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, 
System.nanoTime() - t0);
                cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, 
cudaMemcpyDeviceToHost);
                if (qrError[0] != 0) {
                        throw new DMLRuntimeException("GPU : Error in call to 
ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + 
qrError[0] + " was wrong");
                }
 
                // step 6: compute x = R \ Q^T*B
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                cudaSupportFunctions.cublastrsm(gCtx.getCublasHandle(),
                        cublasSideMode.CUBLAS_SIDE_LEFT, 
cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, 
cublasDiagType.CUBLAS_DIAG_NON_UNIT,
                        n, 1, dataTypePointerTo(1.0), A, m, b, m);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, 
System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, 
System.nanoTime() - t0);
 
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                bTobj.denseColumnMajorToRowMajor();
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0);
+               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0);
 
                // TODO  : Find a way to assign bTobj directly to the output 
and set the correct flags so as to not crash
                // There is an avoidable copy happening here
@@ -2497,10 +2497,10 @@ public class LibMatrixCUDA {
         */
        protected static MatrixObject 
getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, 
String name, long numRows, long numCols) throws DMLRuntimeException {
                long t0=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                Pair<MatrixObject, Boolean> mb = 
ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols);
                if (mb.getValue())
-                       if (GPUStatistics.DISPLAY_STATISTICS)
+                       if (DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
                return mb.getKey();
        }
@@ -2519,10 +2519,10 @@ public class LibMatrixCUDA {
         */
        private static MatrixObject 
getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long 
numCols, long nnz, String instName, String name) throws DMLRuntimeException {
                long t0=0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                Pair<MatrixObject, Boolean> mb = 
ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz);
                if (mb.getValue())
-                       if (GPUStatistics.DISPLAY_STATISTICS)
+                       if (DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ALLOCATE_SPARSE_OUTPUT, System.nanoTime() - t0);
                return mb.getKey();
        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 5935285..4e23953 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -43,6 +43,7 @@ import jcuda.jcudnn.cudnnTensorDescriptor;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
@@ -220,13 +221,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                }
                try {
                        long t1 = 0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t1 = 
System.nanoTime();
                        int status = 
cudnnConvolutionForward(getCudnnHandle(gCtx), one(),
                                        algo.nchwTensorDesc, image,
                                        algo.filterDesc, filter,
                                        algo.convDesc, algo.algo, 
algo.workSpace, algo.sizeInBytes, zero(),
                                        algo.nkpqTensorDesc, output);
-                       if (GPUStatistics.DISPLAY_STATISTICS)
+                       if (DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t1);
                        if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
@@ -292,9 +293,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                                                // Perform one-input 
conv2dBackwardFilter
                                                Pointer tempdwPointer = 
gCtx.allocate(KCRS*sizeOfDataType);
                                                for(int n = 0; n < N; n++) {
-                                                       long t0 = 
GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                                                       long t0 = 
DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
                                                        
cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType);
-                                                       
if(GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t0);
+                                                       
if(DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t0);
                                                        // Perform one-input 
conv2dBackwardFilter
                                                        
cudnnConv2dBackwardFilter(gCtx, instName, imgFetcher.getNthRow(n), 
doutFetcher.getNthRow(n), tempdwPointer, algo);
                                                        
getCudaKernels(gCtx).launchKernel("inplace_add",
@@ -330,10 +331,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        LOG.trace("GPU : conv2dBackwardFilter" + ", 
GPUContext=" + gCtx);
                }
                try {
-                       long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                        int status = 
cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), 
algo.nchwTensorDesc, imagePointer,
                                        algo.nkpqTensorDesc, doutPointer, 
algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), 
algo.filterDesc, dwPointer);
-                       if (GPUStatistics.DISPLAY_STATISTICS)
+                       if (DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - 
t1);
                        if (status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionBackwardFilter: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
@@ -424,10 +425,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" 
+ gCtx);
                }
                try {
-                       long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                        int status = 
cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), algo.filterDesc, w,
                                        algo.nkpqTensorDesc, dy, algo.convDesc, 
algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.nchwTensorDesc, dx);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - 
t1);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - 
t1);
 
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionBackwardData: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
@@ -499,11 +500,11 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                                
LibMatrixCuDNNPoolingDescriptors.cudnnMaxpoolingDescriptors(gCtx, instName, N, 
C, H, W, K, R, S, 
                                                pad_h, pad_w, stride_h, 
stride_w, P, Q)) {
                        long t1=0,t2=0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t1 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+                       if (DMLScript.FINEGRAINED_STATISTICS) t2 = 
System.nanoTime();
                        int status = cudnnPoolingForward(getCudnnHandle(gCtx), 
desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
@@ -598,20 +599,20 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        long t1=0, t2=0, t3=0;
                        int status;
                        if(!isMaxPoolOutputProvided) {
-                               if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                               if (DMLScript.FINEGRAINED_STATISTICS) t1 = 
System.nanoTime();
                                long numBytes = N*C*P*Q*sizeOfDataType;
                                y = gCtx.allocate(numBytes);
-                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-                               if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
+                               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+                               if (DMLScript.FINEGRAINED_STATISTICS) t2 = 
System.nanoTime();
                                status = 
cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, 
x, zero(), desc.yDesc, y);
-                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
+                               if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
                                if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                        throw new DMLRuntimeException("Could 
not executed cudnnPoolingForward before cudnnPoolingBackward: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
                                }
                        }
-                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t3 = 
System.nanoTime();
                        status = cudnnPoolingBackward(getCudnnHandle(gCtx), 
desc.poolingDesc, one(), desc.yDesc, y, desc.dyDesc, dy, desc.xDesc, x, zero(), 
desc.dxDesc, dx);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
 
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
@@ -621,10 +622,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                }
                finally {
                        long t4=0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t4 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t4 = 
System.nanoTime();
                        if(!isMaxPoolOutputProvided)
                                gCtx.cudaFreeHelper(instName, y);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
                }
        }
 
@@ -641,18 +642,18 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        cudnnCreateActivationDescriptor(activationDescriptor);
                        double dummy = -1;
                        cudnnSetActivationDescriptor(activationDescriptor, 
CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
-                       if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t0 = 
System.nanoTime();
                        cudnnActivationForward(getCudnnHandle(gCtx), 
activationDescriptor,
                                        one(), srcTensorDesc, srcData,
                                        zero(), dstTensorDesc, dstData);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
                } catch (CudaException e) {
                        throw new DMLRuntimeException("Error in conv2d in 
GPUContext " + gCtx.toString() + " from Thread " + 
Thread.currentThread().toString(), e);
                }
                finally {
                        long t1=0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
+                       if (DMLScript.FINEGRAINED_STATISTICS) t1 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
                }
        }
 
@@ -678,13 +679,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                                LOG.trace("GPU : relu custom kernel" + ", 
GPUContext=" + gCtx);
                        }
                        // Invokes relu(double* A,  double* ret, int rlen, int 
clen)
-                       if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+                       if (DMLScript.FINEGRAINED_STATISTICS) t0 = 
System.nanoTime();
                        Pointer dstData = getDensePointerForCuDNN(gCtx, output, 
instName);
                        Pointer srcData = getDensePointerForCuDNN(gCtx, in, 
instName); // TODO: FIXME: Add sparse kernel support for relu
                        getCudaKernels(gCtx).launchKernel("relu",
                                        
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
                                        srcData, dstData, toInt(N), toInt(CHW));
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
+                       if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
                }
                else {
                        cudnnTensorDescriptor tensorDescriptor = new 
cudnnTensorDescriptor();

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index ee22541..17cd610 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.runtime.matrix.data;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
@@ -85,7 +86,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
        @Override
        public void close() {
                long t3 = 0;
-               if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
+               if (DMLScript.FINEGRAINED_STATISTICS) t3 = System.nanoTime();
                if(nchwTensorDesc != null)
                        cudnnDestroyTensorDescriptor(nchwTensorDesc);
                if(nkpqTensorDesc != null)
@@ -96,7 +97,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                        cudnnDestroyConvolutionDescriptor(convDesc);
                if(sizeInBytes != 0)
                        gCtx.cudaFreeHelper(instName, workSpace);
-               if(GPUStatistics.DISPLAY_STATISTICS)
+               if(DMLScript.FINEGRAINED_STATISTICS)
                        GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
        }
        
@@ -125,7 +126,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
        public static LibMatrixCuDNNConvolutionAlgorithm 
cudnnGetConvolutionForwardAlgorithm(
                        GPUContext gCtx, String instName, int N, int C, int H, 
int W, int K, int R, int S, 
                        int pad_h, int pad_w, int stride_h, int stride_w, int 
P, int Q, long workspaceLimit) throws DMLRuntimeException {
-               long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() 
: 0;
                LibMatrixCuDNNConvolutionAlgorithm ret = new 
LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
                                pad_h, pad_w, stride_h, stride_w, P, Q);
                int[] algos = {-1};
@@ -139,7 +140,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                        ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
                ret.sizeInBytes = sizeInBytesArray[0];
                ret.algo = algos[0];
-               if (GPUStatistics.DISPLAY_STATISTICS)
+               if (DMLScript.FINEGRAINED_STATISTICS)
                        GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
                return ret;
        }
@@ -169,7 +170,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
        public static LibMatrixCuDNNConvolutionAlgorithm 
cudnnGetConvolutionBackwardFilterAlgorithm(
                        GPUContext gCtx, String instName, int N, int C, int H, 
int W, int K, int R, int S, 
                        int pad_h, int pad_w, int stride_h, int stride_w, int 
P, int Q, long workspaceLimit) throws DMLRuntimeException {
-               long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() 
: 0;
                LibMatrixCuDNNConvolutionAlgorithm ret = new 
LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
                                pad_h, pad_w, stride_h, stride_w, P, Q);
                
@@ -186,7 +187,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                ret.sizeInBytes = sizeInBytesArray[0];
                ret.algo = algos[0];
                
-               if (GPUStatistics.DISPLAY_STATISTICS)
+               if (DMLScript.FINEGRAINED_STATISTICS)
                        GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
                return ret;
        }
@@ -216,7 +217,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
        public static LibMatrixCuDNNConvolutionAlgorithm 
cudnnGetConvolutionBackwardDataAlgorithm(
                        GPUContext gCtx, String instName, int N, int C, int H, 
int W, int K, int R, int S, 
                        int pad_h, int pad_w, int stride_h, int stride_w, int 
P, int Q, long workspaceLimit) throws DMLRuntimeException {
-               //long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+               //long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                LibMatrixCuDNNConvolutionAlgorithm ret = new 
LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
                                pad_h, pad_w, stride_h, stride_w, P, Q);
                
@@ -237,7 +238,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
 //                     ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
 //             ret.sizeInBytes = sizeInBytesArray[0];
 //             ret.algo = algos[0];
-//             if (GPUStatistics.DISPLAY_STATISTICS)
+//             if (DMLScript.FINEGRAINED_STATISTICS)
 //                     GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
                return ret;
        }

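The convolution-algorithm wrapper above is AutoCloseable, so descriptor and workspace cleanup is timed inside close() and runs even when the convolution throws, provided callers use try-with-resources. A minimal sketch of that pattern, assuming hypothetical names rather than the real SystemML class:

// Hypothetical sketch: setup and cleanup are timed only under the fine-grained flag,
// and try-with-resources guarantees the timed close() executes.
public class TimedAlgorithmSketch {
    static volatile boolean FINEGRAINED_STATISTICS = true;

    static class TimedAlgorithm implements AutoCloseable {
        TimedAlgorithm() {
            long t1 = FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
            // ... allocate descriptors / workspace here ...
            if (FINEGRAINED_STATISTICS)
                System.out.println("init took " + (System.nanoTime() - t1) + " ns");
        }
        @Override
        public void close() {
            long t3 = FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
            // ... destroy descriptors / free workspace here ...
            if (FINEGRAINED_STATISTICS)
                System.out.println("cleanup took " + (System.nanoTime() - t3) + " ns");
        }
    }

    public static void main(String[] args) {
        try (TimedAlgorithm algo = new TimedAlgorithm()) {
            // ... launch the convolution with algo ...
        }
    }
}
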
http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index 5121c87..5a7cad3 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -20,6 +20,8 @@ package org.apache.sysml.runtime.matrix.data;
 
 import static jcuda.runtime.JCuda.cudaMemset;
 import jcuda.Pointer;
+
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
@@ -59,10 +61,10 @@ public class LibMatrixCuDNNInputRowFetcher extends 
LibMatrixCUDA implements java
        public Pointer getNthRow(int n) throws DMLRuntimeException {
                if(isInputInSparseFormat) {
                        jcuda.runtime.JCuda.cudaDeviceSynchronize();
-                       long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       long t0 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                        cudaMemset(outPointer, 0, numColumns*sizeOfDataType);
                        jcuda.runtime.JCuda.cudaDeviceSynchronize();
-                       if(GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t0);
+                       if(DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t0);
                        LibMatrixCUDA.sliceSparseDense(gCtx, instName, 
(CSRPointer)inPointer, outPointer, n, n, 0, LibMatrixCUDA.toInt(numColumns-1), 
numColumns);
                }
                else {

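The row fetcher above brackets the timed cudaMemset with cudaDeviceSynchronize calls, so the fine-grained timer reflects completed device work rather than only the asynchronous launch. A rough, CPU-only analogue of that idea, using CompletableFuture as a stand-in for the asynchronous device call (names are illustrative only):

// Hypothetical sketch: wait for asynchronous work to finish before reading the clock,
// otherwise the timer records only the submission cost.
import java.util.concurrent.CompletableFuture;

public class AsyncTimingSketch {
    static volatile boolean FINEGRAINED_STATISTICS = true;

    static CompletableFuture<Void> asyncWork(long millis) {
        return CompletableFuture.runAsync(() -> {
            try { Thread.sleep(millis); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
        });
    }

    public static void main(String[] args) {
        asyncWork(10).join();                 // analogous to the first cudaDeviceSynchronize()
        long t0 = FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
        asyncWork(5).join();                  // analogous to the second cudaDeviceSynchronize()
        if (FINEGRAINED_STATISTICS)
            System.out.println("timed work: " + (System.nanoTime() - t0) + " ns");
    }
}
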
http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index d962027..ce0ad5b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -164,19 +164,19 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
                        // and output
                        CSRPointer A = 
left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
                        CSRPointer B = 
right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-                       long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       long t0 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                        CSRPointer C = 
CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, 
B, transb,
                                        params.m, params.n, params.k);
-                       if (GPUStatistics.DISPLAY_STATISTICS)
+                       if (DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB,
                                                System.nanoTime() - t0);
 
                        // Step 3: Invoke the kernel
-                       long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                        
cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, 
params.m, params.n, params.k, A.descr,
                                        (int) A.nnz, A.val, A.rowPtr, A.colInd, 
B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
                                        C.val, C.rowPtr, C.colInd);
-                       if (GPUStatistics.DISPLAY_STATISTICS)
+                       if (DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB,
                                                System.nanoTime() - t1);
                        output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
@@ -284,14 +284,14 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
                denseSparseMatMult(getCusparseHandle(gCtx), instName, output, 
B, A, params);
                if (outRLen != 1 && outCLen != 1) {
                        // Transpose: C = t(output)
-                       long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       long t0 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                        cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), 
cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
                                        toInt(outCLen), toInt(outRLen), one(), 
output, toInt(outRLen), zero(), new Pointer(),
                                        toInt(outRLen), C, toInt(outCLen));
                        if (!DMLScript.EAGER_CUDA_FREE)
                                JCuda.cudaDeviceSynchronize();
                        gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE);
-                       if (GPUStatistics.DISPLAY_STATISTICS)
+                       if (DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime()
                                                - t0);
                }
@@ -319,7 +319,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
         */
        private static void denseSparseMatMult(cusparseHandle handle, String 
instName, Pointer C, Pointer A, CSRPointer B,
                        CuMatMultParameters param) throws DMLRuntimeException {
-               long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() 
: 0;
                String kernel = 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB;
                // Ignoring sparse vector dense matrix multiplication and dot 
product
                boolean isVector = (param.leftNumRows == 1 && 
!param.isLeftTransposed)
@@ -343,7 +343,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
                        cudaSupportFunctions.cusparsecsrmm2(handle, transa, 
transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
                                        B.rowPtr, B.colInd, A, param.ldb, 
zero(), C, param.ldc);
                }
-               if (GPUStatistics.DISPLAY_STATISTICS)
+               if (DMLScript.FINEGRAINED_STATISTICS)
                        GPUStatistics.maintainCPMiscTimes(instName, kernel, 
System.nanoTime() - t0);
        }
 
@@ -370,7 +370,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
         */
        private static void denseDenseMatMult(cublasHandle handle, String 
instName, Pointer C, Pointer A, Pointer B,
                        CuMatMultParameters param) throws DMLRuntimeException {
-               long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() 
: 0;
                String kernel = null;
                param.rowToColumnMajor();
                param.validate();
@@ -412,7 +412,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
                                        zero(), C, param.ldc);
                        kernel = 
GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB;
                }
-               if (GPUStatistics.DISPLAY_STATISTICS)
+               if (DMLScript.FINEGRAINED_STATISTICS)
                        GPUStatistics.maintainCPMiscTimes(instName, kernel, 
System.nanoTime() - t0);
        }
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 096574a..67d4a1a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -68,8 +68,6 @@ public class LibMatrixDNN {
        protected static final Log LOG =  
LogFactory.getLog(LibMatrixDNN.class.getName());
        
        //library configurations and external contracts
-       public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in 
stats output
-       
        // 
------------------------------------------------------------------------------------------------
        private static AtomicLong conv2dSparseCount = new AtomicLong(0);
        private static AtomicLong conv2dDenseCount = new AtomicLong(0);
@@ -89,7 +87,7 @@ public class LibMatrixDNN {
        static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0);
        
        public static void appendStatistics(StringBuilder sb) {
-               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
+               if(DMLScript.FINEGRAINED_STATISTICS) {
                        sb.append("LibMatrixDNN dense count 
(conv/bwdF/bwdD/im2col/maxBwd):\t" 
                                        + conv2dDenseCount.get() + "/"
                                        + conv2dBwdFilterDenseCount.get() + "/"
@@ -230,7 +228,7 @@ public class LibMatrixDNN {
                if(params.stride_h <= 0 || params.stride_w <= 0) 
                        throw new DMLRuntimeException("Only positive strides 
supported:" + params.stride_h + ", " + params.stride_w);
                
-               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
+               if(DMLScript.FINEGRAINED_STATISTICS) {
                        if(filter.isInSparseFormat() || 
dout.isInSparseFormat()) {
                                conv2dBwdDataSparseCount.addAndGet(1);
                        }
@@ -255,7 +253,7 @@ public class LibMatrixDNN {
                if(params.stride_h <= 0 || params.stride_w <= 0) 
                        throw new DMLRuntimeException("Only positive strides 
supported:" + params.stride_h + ", " + params.stride_w);
                
-               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
+               if(DMLScript.FINEGRAINED_STATISTICS) {
                        if(input.isInSparseFormat() || dout.isInSparseFormat()) 
{
                                conv2dBwdFilterSparseCount.addAndGet(1);
                        }
@@ -281,7 +279,7 @@ public class LibMatrixDNN {
                if(params.stride_h <= 0 || params.stride_w <= 0) 
                        throw new DMLRuntimeException("Only positive strides 
supported:" + params.stride_h + ", " + params.stride_w);
                
-               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
+               if(DMLScript.FINEGRAINED_STATISTICS) {
                        if(input.isInSparseFormat() || 
filter.isInSparseFormat()) {
                                conv2dSparseCount.addAndGet(1);
                        }
@@ -314,7 +312,7 @@ public class LibMatrixDNN {
                        throw new DMLRuntimeException("Incorrect dout 
dimensions in maxpooling_backward:" + input.getNumRows() + " " + 
input.getNumColumns() + " " + params.N + " " + params.K*params.P*params.Q);
                }
                
-               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
+               if(DMLScript.FINEGRAINED_STATISTICS) {
                        if(input.isInSparseFormat() || dout.isInSparseFormat()) 
{
                                maxPoolBwdSparseCount.addAndGet(1);
                        }

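In LibMatrixDNN the sparse/dense operation counters are now incremented only under DMLScript.FINEGRAINED_STATISTICS and reported from appendStatistics. A small self-contained sketch of that counter pattern, with illustrative names:

// Hypothetical sketch: AtomicLong call counters are maintained only when the
// fine-grained flag is on and appended to the statistics summary at the end.
import java.util.concurrent.atomic.AtomicLong;

public class DnnCounterSketch {
    static volatile boolean FINEGRAINED_STATISTICS = true;

    static final AtomicLong conv2dSparseCount = new AtomicLong(0);
    static final AtomicLong conv2dDenseCount  = new AtomicLong(0);

    static void conv2d(boolean anyInputSparse) {
        if (FINEGRAINED_STATISTICS) {
            if (anyInputSparse) conv2dSparseCount.incrementAndGet();
            else                conv2dDenseCount.incrementAndGet();
        }
        // ... actual convolution work ...
    }

    static void appendStatistics(StringBuilder sb) {
        if (FINEGRAINED_STATISTICS)
            sb.append("conv2d count (dense/sparse):\t")
              .append(conv2dDenseCount.get()).append('/')
              .append(conv2dSparseCount.get()).append('\n');
    }

    public static void main(String[] args) {
        conv2d(false);
        conv2d(true);
        StringBuilder sb = new StringBuilder();
        appendStatistics(sb);
        System.out.print(sb);
    }
}
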
http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
index 960cea6..55f263f 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
@@ -89,20 +89,20 @@ public class LibMatrixDNNConv2dBackwardDataHelper {
                                // rotate180(dout[n,]) => dout_reshaped
                                rotate180Worker.execute(n, 0);
                                // dout_reshaped %*% filter => temp
-                               long t1 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                outMM.reset(PQ, CRS, false);
                                
LibMatrixDNNHelper.singleThreadedMatMult(outRotate, filter, outMM, 
!outRotate.sparse, false, _params);
-                               long t2 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                // col2im(temp) => output[n,] 
                                LibMatrixDNNHelper.doCol2imOverSingleImage(n, 
outMM, _params);
-                               long t3 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                
-                               if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                               if(DMLScript.FINEGRAINED_STATISTICS) {
                                        time1 += t2 - t1;
                                        time2 += t3 - t2;
                                }
                        }
-                       if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
                                
LibMatrixDNN.loopedConvBwdDataMatMultTime.addAndGet(time1);
                                
LibMatrixDNN.loopedConvBwdDataCol2ImTime.addAndGet(time2);
                        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
index 9698725..4a94838 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
@@ -99,24 +99,24 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
                                rotate180Worker.execute(n, 0);
                                
                                // im2col(input) => _im2ColOutBlock
-                               long t1 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                im2ColWorker.execute(n);
-                               long t2 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                
                                outMM.reset(CRS, K, false);
                                
LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, outRotate, outMM, 
!im2ColOutBlock.sparse, !outRotate.sparse, _params);
-                               long t3 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                
                                if( !outMM.isEmptyBlock() ) //accumulate row 
results
                                        
LibMatrixMult.vectAdd(outMM.getDenseBlock(), partRet, 0, 0, K*CRS);
                                
-                               if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                               if(DMLScript.FINEGRAINED_STATISTICS) {
                                        time1 += t2 - t1;
                                        time2 += t3 - t2;
                                }
                        }
                        inplaceTransAdd(partRet, _params);
-                       if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
                                
LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1);
                                
LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2);
                        }
@@ -150,27 +150,27 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
                                rotate180Worker.execute(n, 0);
                                
                                // im2col(input) => _im2ColOutBlock
-                               long t1 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                im2ColWorker.execute(n);
-                               long t2 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                
                                outMM.reset(K, CRS, false);
                                //Timing time = new Timing(true);
                                
LibMatrixDNNHelper.singleThreadedMatMult(outRotate, im2ColOutBlock, 
                                        outMM, !outRotate.sparse, 
!im2ColOutBlock.sparse, _params);
-                               long t3 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                
                                if( !outMM.isEmptyBlock() ) //accumulate row 
results
                                        
LibMatrixMult.vectAdd(outMM.getDenseBlock(), partRet, 0, 0, K*CRS);
                                
-                               if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                               if(DMLScript.FINEGRAINED_STATISTICS) {
                                        time1 += t2 - t1;
                                        time2 += t3 - t2;
                                }
                        }
                        //no need to transpose because t(t(out)) cancel out
                        inplaceAdd(partRet, _params);
-                       if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
                                
LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1);
                                
LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2);
                        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
index 6a0205e..b390906 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
@@ -56,16 +56,16 @@ public class LibMatrixDNNConv2dHelper {
                        for(int n = _rl; n < _ru; n++)  {
                                for(int c = 0; c < _params.C; c++)  {
                                        // im2col(input) => _im2ColOutBlock
-                                       long t1 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                                       long t1 = 
DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
                                        im2ColWorker.execute(n, c);
-                                       long t2 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                                       long t2 = 
DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
                                        
                                        // filter %*% _im2ColOutBlock => 
matMultOutBlock
                                        MatrixBlock matMultOutBlock = new 
MatrixBlock(K, PQ, false);
                                        
LibMatrixDNNHelper.singleThreadedMatMult(_filters.get(c), im2ColOutBlock, 
matMultOutBlock, false, true, _params);
-                                       long t3 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                                       long t3 = 
DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
                                        
-                                       if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                                       if(DMLScript.FINEGRAINED_STATISTICS) {
                                                time1 += t2 - t1;
                                                time2 += t3 - t2;
                                        }
@@ -77,7 +77,7 @@ public class LibMatrixDNNConv2dHelper {
                                if(_params.bias != null)
                                        LibMatrixDNNHelper.addBias(n, 
_params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ);
                        }
-                       if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
                                
LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1);
                                
LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2);
                        }
@@ -137,16 +137,16 @@ public class LibMatrixDNNConv2dHelper {
                        long time1 = 0; long time2 = 0;
                        for(int n = _rl; n < _ru; n++)  {
                                // im2col(input) => _im2ColOutBlock
-                               long t1 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                im2ColWorker.execute(n);
-                               long t2 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t2 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                
                                // filter %*% _im2ColOutBlock => matMultOutBlock
                                outMM.reset(outMM.rlen, outMM.clen, false);
                                
LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, outIm2col, outMM, 
false, true, _params);
-                               long t3 = DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                               long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
                                
-                               if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                               if(DMLScript.FINEGRAINED_STATISTICS) {
                                        time1 += t2 - t1;
                                        time2 += t3 - t2;
                                }
@@ -159,7 +159,7 @@ public class LibMatrixDNNConv2dHelper {
                                        LibMatrixDNNHelper.addBias(n, 
_params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ);
                        }
                        
-                       if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
+                       if(DMLScript.FINEGRAINED_STATISTICS) {
                                
LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1);
                                
LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2);
                        }

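The conv2d helpers above time the im2col and matrix-multiply phases per iteration, accumulate the deltas in locals, and add them to the shared AtomicLong totals once per worker, which keeps contention on the shared counters low. A compact sketch of that accumulate-then-publish pattern, with hypothetical names:

// Hypothetical sketch: per-phase times are measured only under the fine-grained flag,
// summed locally inside the loop, and published to shared totals once per worker.
import java.util.concurrent.atomic.AtomicLong;

public class LoopPhaseTimingSketch {
    static volatile boolean FINEGRAINED_STATISTICS = true;

    static final AtomicLong loopedIm2ColTime  = new AtomicLong(0);
    static final AtomicLong loopedMatMultTime = new AtomicLong(0);

    static void worker(int rl, int ru) throws InterruptedException {
        long time1 = 0, time2 = 0;
        for (int n = rl; n < ru; n++) {
            long t1 = FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
            Thread.sleep(1); // placeholder for im2col(input)
            long t2 = FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
            Thread.sleep(2); // placeholder for filter %*% im2col output
            long t3 = FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
            if (FINEGRAINED_STATISTICS) {
                time1 += t2 - t1;
                time2 += t3 - t2;
            }
        }
        if (FINEGRAINED_STATISTICS) { // one shared update per worker, not per iteration
            loopedIm2ColTime.addAndGet(time1);
            loopedMatMultTime.addAndGet(time2);
        }
    }

    public static void main(String[] args) throws InterruptedException {
        worker(0, 4);
        System.out.println("im2col ns: " + loopedIm2ColTime.get()
                + ", matmult ns: " + loopedMatMultTime.get());
    }
}
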
http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 96a3887..8429d35 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -24,6 +24,7 @@ import static 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
@@ -163,7 +164,7 @@ public class SinglePrecisionCudaSupportFunctions implements 
CudaSupportFunctions
        
        @Override
        public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, 
String instName, boolean isEviction) throws DMLRuntimeException {
-               long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != 
null? System.nanoTime() : 0;
+               long t1 = DMLScript.FINEGRAINED_STATISTICS  && instName != 
null? System.nanoTime() : 0;
                // We invoke transfer matrix from device to host in two cases:
                // 1. During eviction of unlocked matrices
                // 2. During acquireHostRead
@@ -187,7 +188,7 @@ public class SinglePrecisionCudaSupportFunctions implements 
CudaSupportFunctions
                                dest[i] = floatData[i];
                        }
                }
-               if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+               if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
                        GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
        }
 
@@ -195,7 +196,7 @@ public class SinglePrecisionCudaSupportFunctions implements 
CudaSupportFunctions
        public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, 
String instName) throws DMLRuntimeException {
                LOG.debug("Potential OOM: Allocated additional space in 
hostToDevice");
                // TODO: Perform conversion on GPU using double2float and 
float2double kernels
-               long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != 
null? System.nanoTime() : 0;
+               long t1 = DMLScript.FINEGRAINED_STATISTICS  && instName != 
null? System.nanoTime() : 0;
                if(PERFORM_CONVERSION_ON_DEVICE) {
                        Pointer deviceDoubleData = 
gCtx.allocate(((long)src.length)*Sizeof.DOUBLE);
                        cudaMemcpy(deviceDoubleData, Pointer.to(src), 
((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
@@ -210,7 +211,7 @@ public class SinglePrecisionCudaSupportFunctions implements 
CudaSupportFunctions
                        cudaMemcpy(dest, Pointer.to(floatData), 
((long)src.length)*Sizeof.FLOAT, cudaMemcpyHostToDevice);
                }
                
-               if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+               if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
                        GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
        }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java 
b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index 2668ad9..33ab953 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -35,9 +35,6 @@ import org.apache.sysml.api.DMLScript;
  * Printed as part of {@link Statistics}.
  */
 public class GPUStatistics {
-       // Whether or not extra per-instruction statistics will be recorded and 
shown for the GPU
-       public static boolean DISPLAY_STATISTICS = false;
-
        private static int iNoOfExecutedGPUInst = 0;
 
        public static long cudaInitTime = 0;
@@ -117,7 +114,7 @@ public class GPUStatistics {
         */
        public synchronized static void maintainCPMiscTimes( String 
instructionName, String miscTimer, long timeNanos, long incrementCount)
        {
-               if (!(DISPLAY_STATISTICS || DMLScript.FINEGRAINED_STATISTICS))
+               if (!(DMLScript.FINEGRAINED_STATISTICS))
                        return;
 
                HashMap<String, Long> miscTimesMap = 
_cpInstMiscTime.get(instructionName);

http://git-wip-us.apache.org/repos/asf/systemml/blob/de69afdc/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java 
b/src/main/java/org/apache/sysml/utils/Statistics.java
index 5cc0650..44bb232 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -601,7 +601,7 @@ public class Statistics
                sb.append(String.format(
                                " %" + maxNumLen + "s  %-" + maxInstLen + "s  
%" + maxTimeSLen + "s  %" + maxCountLen + "s", numCol,
                                instCol, timeSCol, countCol));
-               if (GPUStatistics.DISPLAY_STATISTICS || 
DMLScript.FINEGRAINED_STATISTICS) {
+               if (DMLScript.FINEGRAINED_STATISTICS) {
                        sb.append("  ");
                        sb.append(gpuCol);
                }
@@ -618,15 +618,15 @@ public class Statistics
                        int numLines = wrappedInstruction.length;
                        String [] miscTimers = null;
                        
-                       if (GPUStatistics.DISPLAY_STATISTICS || 
DMLScript.FINEGRAINED_STATISTICS) {
+                       if (DMLScript.FINEGRAINED_STATISTICS) {
                                miscTimers = 
wrap(GPUStatistics.getStringForCPMiscTimesPerInstruction(instruction), 
DMLScript.STATISTICS_MAX_WRAP_LEN);
                                numLines = Math.max(numLines, 
miscTimers.length);
                        }
                        
-                       String miscFormatString = 
(GPUStatistics.DISPLAY_STATISTICS || DMLScript.FINEGRAINED_STATISTICS) ? " %" + 
DMLScript.STATISTICS_MAX_WRAP_LEN + "s" : "%s";
+                       String miscFormatString = 
(DMLScript.FINEGRAINED_STATISTICS) ? " %" + DMLScript.STATISTICS_MAX_WRAP_LEN + 
"s" : "%s";
                        for(int wrapIter = 0; wrapIter < numLines; wrapIter++) {
                                String instStr = (wrapIter < 
wrappedInstruction.length) ? wrappedInstruction[wrapIter] : "";
-                               String miscTimerStr = ( 
(GPUStatistics.DISPLAY_STATISTICS || DMLScript.FINEGRAINED_STATISTICS) && 
wrapIter < miscTimers.length) ? miscTimers[wrapIter] : ""; 
+                               String miscTimerStr = ( 
(DMLScript.FINEGRAINED_STATISTICS) && wrapIter < miscTimers.length) ? 
miscTimers[wrapIter] : ""; 
                                if(wrapIter == 0) {
                                        // Display instruction count
                                        sb.append(String.format(
