http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index 97597e0..798e74e 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -78,6 +78,8 @@ import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter;
 import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.controlprogram.parfor.util.IDHandler;
 import org.apache.sysml.runtime.matrix.CleanupMR;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
 import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
 import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
 import org.apache.sysml.runtime.util.LocalFileUtils;
@@ -85,6 +87,7 @@ import org.apache.sysml.runtime.util.MapReduceTool;
 import org.apache.sysml.utils.Explain;
 import org.apache.sysml.utils.Explain.ExplainCounts;
 import org.apache.sysml.utils.Explain.ExplainType;
+import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 import org.apache.sysml.yarn.DMLAppMasterUtils;
 import org.apache.sysml.yarn.DMLYarnClientProxy;
@@ -646,7 +649,11 @@ public class DMLScript
                
                //double costs = CostEstimationWrapper.getTimeEstimate(rtprog, ExecutionContextFactory.createContext());
                //System.out.println("Estimated costs: "+costs);
-               
+
+               // Whether extra statistics useful for developers and others interested in digging
+               // into performance problems are recorded and displayed
+               GPUStatistics.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS);
+               LibMatrixDNN.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_DNN_STATS);
                
                //Step 10: execute runtime program
                Statistics.startRunTimer();
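
The two flags above are read once per DMLScript invocation and stored in public static booleans, so the hot GPU code paths only pay a branch. A minimal sketch of the opt-in timing pattern they enable (hypothetical glue code, not part of this commit; only the DISPLAY_STATISTICS flag, maintainCPMiscTimes(String, String, long) and the MISC_TIMER_* names appear in this change set):

    import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
    import org.apache.sysml.utils.GPUStatistics;

    class ExtraGpuStatsSketch {
        // Hypothetical helper: time a unit of work and charge it to a
        // per-instruction misc timer, but only when the user has opted in
        // via systemml.stats.extraGPU.
        static void timed(String extendedOpcode, Runnable work) {
            long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
            work.run();
            if (GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(extendedOpcode,
                    GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t0);
        }
    }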

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 3d0fb28..a42b1ca 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -74,6 +74,8 @@ public class DMLConfig
        public static final String CODEGEN              = "codegen.enabled"; //boolean
        public static final String CODEGEN_PLANCACHE    = "codegen.plancache"; //boolean
        public static final String CODEGEN_LITERALS     = "codegen.literals"; //1..heuristic, 2..always
+       public static final String EXTRA_GPU_STATS      = "systemml.stats.extraGPU"; //boolean
+       public static final String EXTRA_DNN_STATS      = "systemml.stats.extraDNN"; //boolean
 
        // Fraction of available memory to use. The available memory is computed when the JCudaContext is created
        // to handle the tradeoff on calling cudaMemGetInfo too often.
@@ -114,7 +116,10 @@ public class DMLConfig
                _defaultVals.put(CODEGEN,                "false" );
                _defaultVals.put(CODEGEN_PLANCACHE,      "true" );
                _defaultVals.put(CODEGEN_LITERALS,       "1" );
-               
+
+               _defaultVals.put(EXTRA_GPU_STATS,       "false" );
+               _defaultVals.put(EXTRA_DNN_STATS,       "false" );
+
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
                _defaultVals.put(REFRESH_AVAILABLE_MEMORY_EVERY_TIME,      "true" );
        }
@@ -402,6 +407,7 @@ public class DMLConfig
                                YARN_APPMASTER, YARN_APPMASTERMEM, YARN_MAPREDUCEMEM,
                                CP_PARALLEL_MATRIXMULT, CP_PARALLEL_TEXTIO,
                                COMPRESSED_LINALG, CODEGEN, CODEGEN_LITERALS, CODEGEN_PLANCACHE,
+                               EXTRA_GPU_STATS, EXTRA_DNN_STATS
                }; 
                
                StringBuilder sb = new StringBuilder();
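
For reference, the new keys follow the class's existing lookup-with-default convention: getBooleanValue falls back to the _defaultVals entry (false for both), so extra statistics stay off unless the config file sets systemml.stats.extraGPU or systemml.stats.extraDNN. A short hedged sketch (only DMLConfig.getBooleanValue, as used in DMLScript above, is assumed):

    import org.apache.sysml.conf.DMLConfig;

    class ExtraStatsConfigSketch {
        // Sketch: returns false unless the user's SystemML config file
        // overrides the "false" default registered in _defaultVals above.
        static boolean extraGpuEnabled(DMLConfig dmlconf) {
            return dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS);
        }
    }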

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index f14123e..6455add 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -51,6 +51,7 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
 import org.apache.sysml.runtime.matrix.MetaData;
 import org.apache.sysml.runtime.matrix.data.FrameBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.Pair;
 import org.apache.sysml.runtime.util.MapReduceTool;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
@@ -224,14 +225,20 @@ public class ExecutionContext
                                ((MatrixFormatMetaData)oldMetaData).getOutputInfo(),
                                ((MatrixFormatMetaData)oldMetaData).getInputInfo()));
        }
-       
-       public MatrixObject getDenseMatrixOutputForGPUInstruction(String varName) 
+
+       /**
+        * Allocates a dense matrix on the GPU (for output)
+        * @param varName name of the output matrix (known by this {@link ExecutionContext})
+        * @return a pair containing the wrapping {@link MatrixObject} and a boolean indicating whether a cuda memory allocation took place (as opposed to the space already being allocated)
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String varName)
                throws DMLRuntimeException 
        {       
                MatrixObject mo = allocateGPUMatrixObject(varName);
-               mo.getGPUObject().acquireDeviceModifyDense();
+               boolean allocated = mo.getGPUObject().acquireDeviceModifyDense();
                mo.getMatrixCharacteristics().setNonZeros(-1);
-               return mo;
+               return new Pair<MatrixObject, Boolean>(mo, allocated);
        }
 
     /**
@@ -243,13 +250,13 @@ public class ExecutionContext
      * @return matrix object
      * @throws DMLRuntimeException if DMLRuntimeException occurs
      */
-    public MatrixObject getSparseMatrixOutputForGPUInstruction(String varName, long nnz)
+    public Pair<MatrixObject, Boolean> getSparseMatrixOutputForGPUInstruction(String varName, long nnz)
         throws DMLRuntimeException
     {
         MatrixObject mo = allocateGPUMatrixObject(varName);
         mo.getMatrixCharacteristics().setNonZeros(nnz);
-        mo.getGPUObject().acquireDeviceModifySparse();
-        return mo;
+        boolean allocated = mo.getGPUObject().acquireDeviceModifySparse();
+        return new Pair<MatrixObject, Boolean>(mo, allocated);
     } 
 
        /**
@@ -266,9 +273,10 @@ public class ExecutionContext
                return mo;
        }
 
-       public MatrixObject getMatrixInputForGPUInstruction(String varName) 
+       public Pair<MatrixObject, Boolean> getMatrixInputForGPUInstruction(String varName)
                        throws DMLRuntimeException 
-       {       
+       {
+               boolean copied = false;
                MatrixObject mo = getMatrixObject(varName);
                if(mo == null) {
                        throw new DMLRuntimeException("No matrix object available for variable:" + varName);
@@ -281,11 +289,11 @@ public class ExecutionContext
                        mo.acquireRead();
                        acquired = true;
                }
-               mo.getGPUObject().acquireDeviceRead();
+               copied = mo.getGPUObject().acquireDeviceRead();
                if(acquired) {
                        mo.release();
                }
-               return mo;
+               return new Pair<MatrixObject, Boolean>(mo, copied);
        }
        
        /**
@@ -301,7 +309,7 @@ public class ExecutionContext
                mo.release();
        }
        
-       public void releaseMatrixInputForGPUInstruction(String varName) 
+       public void releaseMatrixInputForGPUInstruction(String varName)
                throws DMLRuntimeException 
        {
                MatrixObject mo = getMatrixObject(varName);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 0be2139..23b5328 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -30,7 +30,7 @@ import org.apache.sysml.runtime.instructions.gpu.MatrixMatrixAxpyGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE;
 import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.ReorgGPUInstruction;
-import org.apache.sysml.runtime.instructions.gpu.context.AggregateUnaryGPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.AggregateUnaryGPUInstruction;
 
 public class GPUInstructionParser  extends InstructionParser 
 {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index 7219c6c..55c8f8d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -32,7 +32,7 @@ import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
-import org.apache.sysml.utils.Statistics;
+import org.apache.sysml.utils.GPUStatistics;
 
 public class AggregateBinaryGPUInstruction extends GPUInstruction
 {
@@ -74,12 +74,12 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction
                AggregateBinaryOperator aggbin = new AggregateBinaryOperator(Multiply.getMultiplyFnObject(), agg, 1);
                return new AggregateBinaryGPUInstruction(aggbin, in1, in2, out, opcode, str, isLeftTransposed, isRightTransposed);
        }
-       
+
        @Override
        public void processInstruction(ExecutionContext ec) 
                throws DMLRuntimeException 
        {
-               Statistics.incrementNoOfExecutedGPUInst();
+               GPUStatistics.incrementNoOfExecutedGPUInst();
                
                AggregateBinaryOperator op = (AggregateBinaryOperator) _optr;
                if( !(op.binaryFn instanceof Multiply && op.aggOp.increOp.fn instanceof Plus) ) {
@@ -87,15 +87,16 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction
                }
                
                //get inputs
-               MatrixObject m1 = ec.getMatrixInputForGPUInstruction(_input1.getName());
-               MatrixObject m2 = ec.getMatrixInputForGPUInstruction(_input2.getName());
+               MatrixObject m1 = getMatrixInputForGPUInstruction(ec, _input1.getName());
+               MatrixObject m2 = getMatrixInputForGPUInstruction(ec, _input2.getName());
+
 
                //compute matrix multiplication
                int rlen = (int) (_isLeftTransposed ? m1.getNumColumns() : m1.getNumRows());
                int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns());
 
                ec.setMetaData(_output.getName(), rlen, clen);
-               LibMatrixCUDA.matmult(ec, m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
+               LibMatrixCUDA.matmult(ec, getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
         
                //release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
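
Note the second change in this class: LibMatrixCUDA.matmult now takes the instruction's extended opcode as an extra String, so the library can attribute its internal timers to the specific instruction that triggered them. A hedged sketch of what that looks like inside such a library method (assumed shape; the LibMatrixCUDA body itself is not part of this diff):

    import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
    import org.apache.sysml.utils.GPUStatistics;

    class LibTimerSketch {
        // Assumed library-side pattern: wrap the GPU call and file the
        // elapsed time under a short timer code, keyed by the calling
        // instruction's extended opcode.
        static void timedDenseMatmult(String instName, Runnable cublasCall) {
            long t0 = System.nanoTime();
            cublasCall.run(); // stands in for the actual cuBLAS invocation
            GPUStatistics.maintainCPMiscTimes(instName,
                GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB,
                System.nanoTime() - t0);
        }
    }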

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
new file mode 100644
index 0000000..45db44c
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.functionobjects.IndexFunction;
+import org.apache.sysml.runtime.functionobjects.ReduceCol;
+import org.apache.sysml.runtime.functionobjects.ReduceRow;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.utils.GPUStatistics;
+
+/**
+ * Implements aggregate unary instructions for CUDA
+ */
+public class AggregateUnaryGPUInstruction extends GPUInstruction {
+  private CPOperand _input1 = null;
+  private CPOperand _output = null;
+
+  public AggregateUnaryGPUInstruction(Operator op, CPOperand in1, CPOperand out,
+                                       String opcode, String istr)
+  {
+    super(op, opcode, istr);
+    _gputype = GPUINSTRUCTION_TYPE.AggregateUnary;
+    _input1 = in1;
+    _output = out;
+  }
+
+  public static AggregateUnaryGPUInstruction parseInstruction(String str )
+          throws DMLRuntimeException
+  {
+    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+    String opcode = parts[0];
+    CPOperand in1 = new CPOperand(parts[1]);
+    CPOperand out = new CPOperand(parts[2]);
+
+    // This follows logic similar to AggregateUnaryCPInstruction.
+    // nrow, ncol & length should either read or refresh metadata
+    Operator aggop = null;
+    if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) {
+      throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!");
+    } else {
+      aggop = InstructionUtils.parseBasicAggregateUnaryOperator(opcode);
+    }
+    return new AggregateUnaryGPUInstruction(aggop, in1, out, opcode, str);
+  }
+
+  @Override
+  public void processInstruction(ExecutionContext ec)
+          throws DMLRuntimeException
+  {
+    GPUStatistics.incrementNoOfExecutedGPUInst();
+
+    String opcode = getOpcode();
+
+    // nrow, ncol & length should either read or refresh metadata
+    if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) {
+      throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!");
+    }
+
+    //get inputs
+    MatrixObject in1 = getMatrixInputForGPUInstruction(ec, _input1.getName());
+
+    int rlen = (int)in1.getNumRows();
+    int clen = (int)in1.getNumColumns();
+
+    IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn;
+    if (indexFunction instanceof ReduceRow){  // COL{SUM, MAX...}
+      ec.setMetaData(_output.getName(), 1, clen);
+    } else if (indexFunction instanceof ReduceCol) { // ROW{SUM, MAX,...}
+      ec.setMetaData(_output.getName(), rlen, 1);
+    }
+
+    LibMatrixCUDA.unaryAggregate(ec, getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr);
+
+    //release inputs/outputs
+    ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+
+    // If the unary aggregate is a row reduction or a column reduction, it results in a vector
+    // which needs to be released. Otherwise a scalar is produced, copied back to the host,
+    // and set in the execution context by invoking setScalarOutput
+    if (indexFunction instanceof ReduceRow || indexFunction instanceof ReduceCol) {
+      ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 56e95b7..cb8c729 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -29,7 +29,7 @@ import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
-import org.apache.sysml.utils.Statistics;
+import org.apache.sysml.utils.GPUStatistics;
 
 public class ConvolutionGPUInstruction extends GPUInstruction 
 {
@@ -140,29 +140,29 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str);
                }
        }
-       
+
        public void processBiasInstruction(ExecutionContext ec) throws DMLRuntimeException {
-               Statistics.incrementNoOfExecutedGPUInst();
-               MatrixObject input = ec.getMatrixInputForGPUInstruction(_input1.getName());
-               MatrixObject bias = ec.getMatrixInputForGPUInstruction(_input2.getName());
+               GPUStatistics.incrementNoOfExecutedGPUInst();
+               MatrixObject input = getMatrixInputForGPUInstruction(ec, _input1.getName());
+               MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input2.getName());
                
                ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
-               MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName());
-               LibMatrixCUDA.biasAdd(input, bias, out);
+               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+               LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias, out);
+               LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias, out);
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
-       
+
        public void processReLUBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException {
-               Statistics.incrementNoOfExecutedGPUInst();
-               MatrixObject input = ec.getMatrixInputForGPUInstruction(_input1.getName());
-               MatrixObject dout = ec.getMatrixInputForGPUInstruction(_input2.getName());
+               GPUStatistics.incrementNoOfExecutedGPUInst();
+               MatrixObject input = getMatrixInputForGPUInstruction(ec, _input1.getName());
+               MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
                
-               MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName());
+               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
                ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
-               LibMatrixCUDA.reluBackward(input, dout, out);
+               LibMatrixCUDA.reluBackward(getExtendedOpcode(), input, dout, out);
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -182,7 +182,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        return;
                }
                
-               Statistics.incrementNoOfExecutedGPUInst();
+               GPUStatistics.incrementNoOfExecutedGPUInst();
                                        
                int pad_h = getScalarInput(ec, _padding, 0);
                int pad_w = getScalarInput(ec, _padding, 1);
@@ -203,8 +203,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
                
                if (instOpcode.equalsIgnoreCase("conv2d")) {
-                       MatrixObject image = ec.getMatrixInputForGPUInstruction(_input1.getName());
-                       MatrixObject filter = ec.getMatrixInputForGPUInstruction(_input2.getName());
+                       MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+                       MatrixObject filter = getMatrixInputForGPUInstruction(ec, _input2.getName());
 
                        if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
                                throw new DMLRuntimeException("Incorrect dimensions for image in conv2d");
@@ -212,13 +212,13 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                                throw new DMLRuntimeException("Incorrect dimensions for filter in conv2d");
                        
                        ec.setMetaData(_output.getName(), N, K * P * Q);
-                       MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName());
-                       LibMatrixCUDA.conv2d(image, filter, out, N, C, H, W,
+                       MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       LibMatrixCUDA.conv2d(getExtendedOpcode(), image, filter, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
-                       MatrixObject image = ec.getMatrixInputForGPUInstruction(_input1.getName());
-                       MatrixObject dout = ec.getMatrixInputForGPUInstruction(_input2.getName());
+                       MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+                       MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
 
                        if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
                                throw new DMLRuntimeException("Incorrect dimensions for image in conv2d_backward_filter");
@@ -227,15 +227,15 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                                                dout.getNumRows() + " != " +  N + " || " + dout.getNumColumns() + " != " + K*P*Q);
                        
                        ec.setMetaData(_output.getName(), K, C * R * S);
-                       MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName());
-                       LibMatrixCUDA.conv2dBackwardFilter(image, dout, out, N, C, H, W,
+                       MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       LibMatrixCUDA.conv2dBackwardFilter(getExtendedOpcode(), image, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                        // TODO: For now always copy the device data to host
                        // ec.gpuCtx.copyDeviceToHost(outputBlock);
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_backward_data")) {
-                       MatrixObject filter = ec.getMatrixInputForGPUInstruction(_input1.getName());
-                       MatrixObject dout = ec.getMatrixInputForGPUInstruction(_input2.getName());
+                       MatrixObject filter = getMatrixInputForGPUInstruction(ec, _input1.getName());
+                       MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
 
                        if(filter.getNumRows() != K || filter.getNumColumns() != C*R*S) 
                                throw new DMLRuntimeException("Incorrect dimensions for filter in convolution_backward_data");
@@ -244,25 +244,25 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                                                dout.getNumRows() + " != " +  N + " || " + dout.getNumColumns() + " != " + K*P*Q);
                        
                        ec.setMetaData(_output.getName(), N, C * H * W);
-                       MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName());
-                       LibMatrixCUDA.conv2dBackwardData(filter, dout, out, N, C, H, W,
+                       MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       LibMatrixCUDA.conv2dBackwardData(getExtendedOpcode(), filter, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling")) {
-                       MatrixObject image = ec.getMatrixInputForGPUInstruction(_input1.getName());
+                       MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
 
                        if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
                                throw new DMLRuntimeException("Incorrect dimensions for image in maxpooling: " + 
                                                image.getNumRows() + " != " +  N + " || " + image.getNumColumns() + " != " + C*H*W);
                        
                        ec.setMetaData(_output.getName(), N, C * P * Q);
-                       MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName());
-                       LibMatrixCUDA.maxpooling(image, out, N, C, H, W,
+                       MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       LibMatrixCUDA.maxpooling(getExtendedOpcode(), image, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
-                       MatrixObject image = ec.getMatrixInputForGPUInstruction(_input1.getName());
-                       MatrixObject dout = ec.getMatrixInputForGPUInstruction(_input2.getName());
+                       MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+                       MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
                        
                        if(dout.getNumRows() != N || dout.getNumColumns() != C*P*Q) 
                                throw new DMLRuntimeException("Incorrect dimensions for dout in maxpooling_backward");
@@ -271,8 +271,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                                                image.getNumRows() + " != " +  N + " || " + image.getNumColumns() + " != " + K*P*Q);
                        
                        ec.setMetaData(_output.getName(), N, C * H * W);
-                       MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName());
-                       LibMatrixCUDA.maxpoolingBackward(image, dout, out, N, C, H, W,
+                       MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       LibMatrixCUDA.maxpoolingBackward(getExtendedOpcode(), image, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 1c91a51..dcb2edc 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -22,15 +22,72 @@ package org.apache.sysml.runtime.instructions.gpu;
 import jcuda.runtime.JCuda;
 import org.apache.sysml.lops.runtime.RunMRJobs;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.GPUInstructionParser;
 import org.apache.sysml.runtime.instructions.Instruction;
+import org.apache.sysml.runtime.matrix.data.Pair;
 import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
 
-public abstract class GPUInstruction extends Instruction 
+public abstract class GPUInstruction extends Instruction
 {
        public enum GPUINSTRUCTION_TYPE { AggregateUnary, AggregateBinary, Convolution, MMTSJ, Reorg, ArithmeticBinary, BuiltinUnary, Builtin };
-       
+
+       // Memory/conversions
+       public final static String MISC_TIMER_HOST_TO_DEVICE =        "H2D"; // time spent in bringing data to gpu (from host)
+       public final static String MISC_TIMER_DEVICE_TO_HOST =        "D2H"; // time spent in bringing data from gpu (to host)
+       public final static String MISC_TIMER_DEVICE_TO_DEVICE =      "D2D"; // time spent in copying data from one region on the device to another
+       public final static String MISC_TIMER_SPARSE_TO_DENSE =       "s2d"; // time spent in converting data from sparse to dense
+       public final static String MISC_TIMER_DENSE_TO_SPARSE =       "d2s"; // time spent in converting data from dense to sparse
+       public final static String MISC_TIMER_CUDA_FREE =             "f";   // time spent in calling cudaFree
+       public final static String MISC_TIMER_ALLOCATE =              "a";   // time spent to allocate memory on gpu
+       public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ao";  // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE)
+
+       // Matmult instructions
+       public final static String MISC_TIMER_SPARSE_ALLOCATE_LIB =             "Msao";  // time spent in allocating for sparse matrix output
+       public final static String MISC_TIMER_DENSE_DOT_LIB =                   "Mddot"; // time spent in dot product of 2 dense vectors
+       public final static String MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB =   "Mdvdm"; // time spent in matrix mult of dense vector and dense matrix
+       public final static String MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB =   "Mdmdv"; // time spent in matrix mult of dense matrix and dense vector
+       public final static String MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB =   "Mdmdm"; // time spent in matrix mult of dense matrices
+       public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB =  "Msmdv"; // time spent in matrix mult of sparse matrix and dense vector
+       public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = "Msmsm"; // time spent in matrix mult of sparse matrices
+       public final static String MISC_TIMER_SYRK_LIB =                        "Msyrk"; // time spent in symmetric rank-k update
+
+       // Other BLAS instructions
+       public final static String MISC_TIMER_DAXPY_LIB = "daxpy"; // time spent in daxpy
+
+       // Transpose
+       public final static String MISC_TIMER_SPARSE_DGEAM_LIB = "sdgeaml"; // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B))
+       public final static String MISC_TIMER_DENSE_DGEAM_LIB =  "ddgeaml"; // time spent in dense transpose (and other ops of type a*op(A) + b*op(B))
+       public final static String MISC_TIMER_TRANSPOSE_LIB =    "dtl";     // time spent on dense transpose, this includes allocation of output
+
+       // Custom kernels
+       public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck";  // time spent in matrix-matrix cellwise operations
+       public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL =           "cask";  // time spent in compareAndSet kernel
+       public final static String MISC_TIMER_EXP_KERNEL =                       "expk";  // time spent in the exp kernel
+       public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL =   "u2lk";  // time spent in the copy_u2l_dense kernel
+       public final static String MISC_TIMER_FILL_KERNEL =                      "fillk"; // time spent in the "fill" kernel
+       public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL =          "msk";   // time spent in the matrix scalar kernel
+       public final static String MISC_TIMER_REDUCE_ALL_KERNEL =                "rallk"; // time spent in reduce all kernel
+       public final static String MISC_TIMER_REDUCE_ROW_KERNEL =                "rrowk"; // time spent in reduce row kernel
+       public final static String MISC_TIMER_REDUCE_COL_KERNEL =                "rcolk"; // time spent in reduce column kernel
+
+       // Deep learning operators
+       public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB =          "nnaf";  // time spent in cudnnActivationForward
+       public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB =         "nncf";  // time spent in cudnnConvolutionForward
+       public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB = "nncbf"; // time spent in cudnnConvolutionBackwardFilter
+       public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB =   "nncbd"; // time spent in cudnnConvolutionBackwardData
+       public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB =          "nnmf";  // time spent in cudnnPoolingForward
+       public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB =         "nnmb";  // time spent in cudnnPoolingBackward
+       public final static String MISC_TIMER_BIAS_ADD_LIB =                    "nnba";  // time spent in bias_add cuda kernel
+       public final static String MISC_TIMER_RELU_BACKWARD_KERNEL =            "nnrbk"; // time spent in relu_backward cuda kernel
+       public final static String MISC_TIMER_RELU_KERNEL =                     "nnrk";  // time spent in the relu kernel
+       public final static String MISC_TIMER_CUDNN_INIT =                      "nni";   // time spent in initializations for cudnn call
+       public final static String MISC_TIMER_CUDNN_CLEANUP =                   "nnc";   // time spent in cleanup for cudnn call
+
+
        protected GPUINSTRUCTION_TYPE _gputype;
        protected Operator _optr;
        
@@ -91,4 +148,34 @@ public abstract class GPUInstruction extends Instruction
        {
                JCuda.cudaDeviceSynchronize();
        }
+
+       /**
+        * Helper method to get the input block (allocated on the GPU)
+        * Also records performance information into {@link GPUStatistics}
+        * @param ec    active {@link ExecutionContext}
+        * @param name  name of input matrix (that the {@link ExecutionContext} is aware of)
+        * @return      the matrix object
+        * @throws DMLRuntimeException if an error occurs
+        */
+       protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, String name) throws DMLRuntimeException {
+               long t0 = System.nanoTime();
+               Pair<MatrixObject, Boolean> mb = ec.getMatrixInputForGPUInstruction(name);
+               if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t0);
+               return mb.getKey();
+       }
+
+       /**
+        * Helper method to get the output block (allocated on the GPU)
+        * Also records performance information into {@link GPUStatistics}
+        * @param ec    active {@link ExecutionContext}
+        * @param name  name of output matrix (that the {@link ExecutionContext} is aware of)
+        * @return      the matrix object
+        * @throws DMLRuntimeException if an error occurs
+        */
+       protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name) throws DMLRuntimeException {
+               long t0 = System.nanoTime();
+               Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name);
+               if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
+               return mb.getKey();
+       }
 }
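
Taken together, the new constants and the two protected helpers define the measurement vocabulary for the rest of this commit: subclasses fetch inputs/outputs through the helpers (which self-report H2D copies and dense-output allocations) and pass getExtendedOpcode() into LibMatrixCUDA so library-side timers land under the same key. A condensed sketch of that recipe (shown outside a GPUInstruction subclass for brevity; setMetaData on the output is omitted, and only calls visible in this diff are used):

    import org.apache.sysml.runtime.DMLRuntimeException;
    import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
    import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
    import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
    import org.apache.sysml.utils.GPUStatistics;

    class TransposeRecipeSketch {
        // Condensed version of the pattern the instruction subclasses in
        // this commit follow: count the instruction, acquire the GPU input,
        // run the library op keyed by the extended opcode, then release.
        static void run(ExecutionContext ec, String extendedOpcode,
                String inVar, String outVar) throws DMLRuntimeException {
            GPUStatistics.incrementNoOfExecutedGPUInst();
            MatrixObject in = ec.getMatrixInputForGPUInstruction(inVar).getKey();
            LibMatrixCUDA.transpose(ec, extendedOpcode, in, outVar);
            ec.releaseMatrixInputForGPUInstruction(inVar);
            ec.releaseMatrixOutputForGPUInstruction(outVar);
        }
    }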

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
index 4c05833..0f2542a 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
@@ -34,7 +34,7 @@ import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.Operator;
-import org.apache.sysml.utils.Statistics;
+import org.apache.sysml.utils.GPUStatistics;
 
 public class MMTSJGPUInstruction extends GPUInstruction
 {
@@ -95,10 +95,10 @@ public class MMTSJGPUInstruction extends GPUInstruction
         public void processInstruction(ExecutionContext ec)
                 throws DMLRuntimeException
         {
-                Statistics.incrementNoOfExecutedGPUInst();
+                GPUStatistics.incrementNoOfExecutedGPUInst();
 
                 //get input
-                MatrixObject mat = ec.getMatrixInputForGPUInstruction(_input.getName());
+                MatrixObject mat = getMatrixInputForGPUInstruction(ec, _input.getName());
                
                 boolean isLeftTransposed = ( _type == MMTSJType.LEFT);
 
@@ -107,7 +107,7 @@ public class MMTSJGPUInstruction extends GPUInstruction
 
                 //execute operations 
                 ec.setMetaData(_output.getName(), rlen, clen);
-                LibMatrixCUDA.matmultTSMM(ec, mat, _output.getName(), isLeftTransposed);
+                LibMatrixCUDA.matmultTSMM(ec, getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
                 
                 ec.releaseMatrixInputForGPUInstruction(_input.getName());
                 ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index a423cdd..2766909 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -22,11 +22,10 @@ package org.apache.sysml.runtime.instructions.gpu;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
-import org.apache.sysml.runtime.instructions.gpu.BuiltinUnaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.Operator;
-import org.apache.sysml.utils.Statistics;
+import org.apache.sysml.utils.GPUStatistics;
 
 public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
        
@@ -37,18 +36,17 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
 
        @Override
        public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
-               Statistics.incrementNoOfExecutedGPUInst();
+               GPUStatistics.incrementNoOfExecutedGPUInst();
                
                String opcode = getOpcode();
-               MatrixObject mat = ec.getMatrixInputForGPUInstruction(_input.getName());
-
+               MatrixObject mat = getMatrixInputForGPUInstruction(ec, _input.getName());
                ec.setMetaData(_output.getName(), mat.getNumRows(), mat.getNumColumns());
 
                if(opcode.equals("sel+")) {
-                       LibMatrixCUDA.relu(ec, mat, _output.getName());
+                       LibMatrixCUDA.relu(ec, getExtendedOpcode(), mat, _output.getName());
 
                } else if (opcode.equals("exp")) {
-                       LibMatrixCUDA.exp(ec, mat, _output.getName());
+                       LibMatrixCUDA.exp(ec, getExtendedOpcode(), mat, _output.getName());
                }
                else {
                        throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
index b7c0e99..2da1aa6 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
@@ -26,7 +26,7 @@ import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.Operator;
-import org.apache.sysml.utils.Statistics;
+import org.apache.sysml.utils.GPUStatistics;
 
 public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUInstruction
 {
@@ -42,10 +42,10 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
        
        @Override
        public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
-               Statistics.incrementNoOfExecutedGPUInst();
+               GPUStatistics.incrementNoOfExecutedGPUInst();
                
-               MatrixObject in1 = ec.getMatrixInputForGPUInstruction(_input1.getName());
-               MatrixObject in2 = ec.getMatrixInputForGPUInstruction(_input2.getName());
+               MatrixObject in1 = getMatrixInputForGPUInstruction(ec, _input1.getName());
+               MatrixObject in2 = getMatrixInputForGPUInstruction(ec, _input2.getName());
                
                //TODO: make hop level changes for this
                boolean isLeftTransposed = false;
@@ -71,7 +71,7 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
                ec.setMetaData(_output.getName(), (int)rlen, (int)clen);
                
                BinaryOperator bop = (BinaryOperator) _optr;
-               LibMatrixCUDA.matrixScalarArithmetic(ec, in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
+               LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
                
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
index cc8ff9f..e6c6b90 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
@@ -28,7 +28,7 @@ import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.Operator;
-import org.apache.sysml.utils.Statistics;
+import org.apache.sysml.utils.GPUStatistics;
 
 public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstruction
 {
@@ -85,10 +85,10 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi
        
        @Override
        public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
-               Statistics.incrementNoOfExecutedGPUInst();
+               GPUStatistics.incrementNoOfExecutedGPUInst();
                
-               MatrixObject in1 = ec.getMatrixInputForGPUInstruction(_input1.getName());
-               MatrixObject in2 = ec.getMatrixInputForGPUInstruction(_input2.getName());
+               MatrixObject in1 = getMatrixInputForGPUInstruction(ec, _input1.getName());
+               MatrixObject in2 = getMatrixInputForGPUInstruction(ec, _input2.getName());
                ScalarObject scalar = ec.getScalarInput(constant.getName(), constant.getValueType(), constant.isLiteral());
                
                long rlen1 = in1.getNumRows();
@@ -103,7 +103,7 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi
 
                ec.setMetaData(_output.getName(), (int)rlen1, (int)clen1);
                
-               LibMatrixCUDA.axpy(ec, in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
+               LibMatrixCUDA.axpy(ec, getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
                
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
index 12deb22..54ba32e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
@@ -28,7 +28,7 @@ import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
-import org.apache.sysml.utils.Statistics;
+import org.apache.sysml.utils.GPUStatistics;
 
 
 public class ReorgGPUInstruction extends GPUInstruction
@@ -72,16 +72,16 @@ public class ReorgGPUInstruction extends GPUInstruction
        public void processInstruction(ExecutionContext ec)
                        throws DMLRuntimeException 
        {
-               Statistics.incrementNoOfExecutedGPUInst();
+               GPUStatistics.incrementNoOfExecutedGPUInst();
                //acquire input
-               MatrixObject mat = ec.getMatrixInputForGPUInstruction(_input.getName());
+               MatrixObject mat = getMatrixInputForGPUInstruction(ec, _input.getName());
 
                int rlen = (int) mat.getNumColumns();
                int clen = (int) mat.getNumRows();
                
                //execute operation
                ec.setMetaData(_output.getName(), rlen, clen);
-               LibMatrixCUDA.transpose(ec, mat, _output.getName());
+               LibMatrixCUDA.transpose(ec, getExtendedOpcode(), mat, _output.getName());
                
                //release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
index 78b480e..44cc6e2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
@@ -28,7 +28,7 @@ import org.apache.sysml.runtime.instructions.cp.ScalarObject;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
-import org.apache.sysml.utils.Statistics;
+import org.apache.sysml.utils.GPUStatistics;
 
 public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUInstruction {
        public ScalarMatrixArithmeticGPUInstruction(Operator op, 
@@ -44,11 +44,11 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
        public void processInstruction(ExecutionContext ec) 
                throws DMLRuntimeException
        {
-               Statistics.incrementNoOfExecutedGPUInst();
+               GPUStatistics.incrementNoOfExecutedGPUInst();
                
                CPOperand mat = ( _input1.getDataType() == DataType.MATRIX ) ? _input1 : _input2;
                CPOperand scalar = ( _input1.getDataType() == DataType.MATRIX ) ? _input2 : _input1;
-               MatrixObject in1 = ec.getMatrixInputForGPUInstruction(mat.getName());
+               MatrixObject in1 = getMatrixInputForGPUInstruction(ec, mat.getName());
                ScalarObject constant = (ScalarObject) ec.getScalarInput(scalar.getName(), scalar.getValueType(), scalar.isLiteral());
                
                boolean isTransposed = false;
@@ -60,7 +60,7 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
                ScalarOperator sc_op = (ScalarOperator) _optr;
                sc_op.setConstant(constant.getDoubleValue());
                
-               LibMatrixCUDA.matrixScalarArithmetic(ec, in1, _output.getName(), isTransposed, sc_op);
+               LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
                
                ec.releaseMatrixInputForGPUInstruction(mat.getName());
         ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
deleted file mode 100644
index 28efe64..0000000
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.runtime.instructions.gpu.context;
-
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
-import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
-import org.apache.sysml.runtime.functionobjects.IndexFunction;
-import org.apache.sysml.runtime.functionobjects.ReduceCol;
-import org.apache.sysml.runtime.functionobjects.ReduceRow;
-import org.apache.sysml.runtime.instructions.InstructionUtils;
-import org.apache.sysml.runtime.instructions.cp.CPOperand;
-import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
-import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
-import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
-import org.apache.sysml.runtime.matrix.operators.Operator;
-import org.apache.sysml.utils.Statistics;
-
-/**
- * Implements aggregate unary instructions for CUDA
- */
-public class AggregateUnaryGPUInstruction extends GPUInstruction {
-  private CPOperand _input1 = null;
-  private CPOperand _output = null;
-
-  public AggregateUnaryGPUInstruction(Operator op, CPOperand in1, CPOperand out,
-                                       String opcode, String istr)
-  {
-    super(op, opcode, istr);
-    _gputype = GPUINSTRUCTION_TYPE.AggregateUnary;
-    _input1 = in1;
-    _output = out;
-  }
-
-  public static AggregateUnaryGPUInstruction parseInstruction(String str )
-          throws DMLRuntimeException
-  {
-    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
-    String opcode = parts[0];
-    CPOperand in1 = new CPOperand(parts[1]);
-    CPOperand out = new CPOperand(parts[2]);
-
-    // This follows logic similar to AggregateUnaryCPInstruction.
-    // nrow, ncol & length should either read or refresh metadata
-    Operator aggop = null;
-    if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) {
-      throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!");
-    } else {
-      aggop = InstructionUtils.parseBasicAggregateUnaryOperator(opcode);
-    }
-    return new AggregateUnaryGPUInstruction(aggop, in1, out, opcode, str);
-  }
-
-  @Override
-  public void processInstruction(ExecutionContext ec)
-          throws DMLRuntimeException
-  {
-    Statistics.incrementNoOfExecutedGPUInst();
-
-    String opcode = getOpcode();
-
-    // nrow, ncol & length should either read or refresh metadata
-    if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) {
-      throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!");
-    }
-
-    //get inputs
-    MatrixObject in1 = ec.getMatrixInputForGPUInstruction(_input1.getName());
-
-    int rlen = (int)in1.getNumRows();
-    int clen = (int)in1.getNumColumns();
-
-    IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn;
-    if (indexFunction instanceof ReduceRow){  // COL{SUM, MAX...}
-      ec.setMetaData(_output.getName(), 1, clen);
-    } else if (indexFunction instanceof ReduceCol) { // ROW{SUM, MAX,...}
-      ec.setMetaData(_output.getName(), rlen, 1);
-    }
-
-    LibMatrixCUDA.unaryAggregate(ec, in1, _output.getName(), (AggregateUnaryOperator)_optr);
-
-    //release inputs/outputs
-    ec.releaseMatrixInputForGPUInstruction(_input1.getName());
-
-    // If the unary aggregate is a row reduction or a column reduction, it results in a vector
-    // which needs to be released. Otherwise a scalar is produced, copied back to the host,
-    // and set in the execution context by invoking setScalarOutput
-    if (indexFunction instanceof ReduceRow || indexFunction instanceof ReduceCol) {
-      ec.releaseMatrixOutputForGPUInstruction(_output.getName());
-    }
-  }
-
-}
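
For reference, the output-shape rule the deleted instruction encoded: a column aggregate (ReduceRow, e.g. colSums) yields a 1 x clen row vector, a row aggregate (ReduceCol, e.g. rowSums) an rlen x 1 column vector, and a full aggregate yields a scalar that is copied back to the host. A condensed sketch of that rule, under the assumption that whatever replaces this class keeps the same semantics:

    import org.apache.sysml.runtime.functionobjects.IndexFunction;
    import org.apache.sysml.runtime.functionobjects.ReduceCol;
    import org.apache.sysml.runtime.functionobjects.ReduceRow;

    class AggShapeSketch {
        // Output dimensions of a unary aggregate over an rlen x clen input:
        // ReduceRow collapses the row dimension (COLSUM-style), ReduceCol
        // collapses the column dimension (ROWSUM-style), and everything else
        // collapses to a 1 x 1 scalar.
        static long[] outputDims(IndexFunction fn, long rlen, long clen) {
            if (fn instanceof ReduceRow) return new long[] {1, clen};
            if (fn instanceof ReduceCol) return new long[] {rlen, 1};
            return new long[] {1, 1};
        }
    }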

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 215b38c..ba605bc 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -22,6 +22,7 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.CacheException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
 import java.util.Collections;
@@ -54,18 +55,25 @@ public abstract class GPUObject
        }
        
        public abstract boolean isAllocated();
-       
-       public abstract void acquireDeviceRead() throws DMLRuntimeException;
+
+       /**
+        * Signal intent that a matrix block will be read (as input) on the GPU
+        * @return      true if a host memory to device memory transfer happened
+        * @throws DMLRuntimeException
+        */
+       public abstract boolean acquireDeviceRead() throws DMLRuntimeException;
        /**
         * To signal intent that a matrix block will be written to on the GPU
+        * @return      true if memory was allocated on the GPU as a result of this call
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public abstract void acquireDeviceModifyDense() throws DMLRuntimeException;
+       public abstract boolean acquireDeviceModifyDense() throws DMLRuntimeException;
        /**
         * To signal intent that a sparse matrix block will be written to on the GPU
+        * @return      true if memory was allocated on the GPU as a result of this call
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public abstract void acquireDeviceModifySparse() throws DMLRuntimeException;
+       public abstract boolean acquireDeviceModifySparse() throws DMLRuntimeException;
        
        /**
         * If memory on GPU has been allocated from elsewhere, this method 
@@ -73,9 +81,14 @@ public abstract class GPUObject
         * @param numBytes number of bytes
         */
        public abstract void setDeviceModify(long numBytes);
-       
-       public abstract void acquireHostRead() throws CacheException;
-       public abstract void acquireHostModify() throws CacheException;
+
+       /**
+        * Signal intent that a block needs to be read on the host
+        * @return true if copied from device to host
+        * @throws CacheException
+        */
+       public abstract boolean acquireHostRead() throws CacheException;
+
        public abstract void releaseInput() throws CacheException;
        public abstract void releaseOutput() throws CacheException;
        
@@ -134,7 +147,7 @@ public abstract class GPUObject
                                throw new DMLRuntimeException("There is not 
enough memory on device for this matrix!");
                        }
 
-                       Statistics.cudaEvictionCount.addAndGet(1);
+                       GPUStatistics.cudaEvictionCount.addAndGet(1);
 
                        synchronized (evictionLock) {
                                Collections.sort(GPUContext.allocatedPointers, new Comparator<GPUObject>() {
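
The acquire* methods now report whether they actually did work: acquireDeviceRead() returns true only when a host-to-device copy happened, the acquireDeviceModify* methods return true only when device memory was newly allocated, and acquireHostRead() returns true only when a device-to-host copy happened. A hypothetical caller, sketching how those return values let transfer cost be charged to the triggering instruction ("h2d" is an illustrative timer key, not one defined in this commit):

    import org.apache.sysml.runtime.DMLRuntimeException;
    import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
    import org.apache.sysml.utils.GPUStatistics;

    class TransferAccountingSketch {
        // Acquire a block for reading on the device and, if that forced a
        // host-to-device transfer, record the elapsed time against the
        // instruction that asked for the block.
        static void readOnDevice(GPUObject obj, String instructionName)
                throws DMLRuntimeException {
            long t0 = System.nanoTime();
            boolean transferred = obj.acquireDeviceRead();
            if (transferred && instructionName != null
                    && GPUStatistics.DISPLAY_STATISTICS)
                GPUStatistics.maintainCPMiscTimes(instructionName, "h2d",
                    System.nanoTime() - t0);
        }
    }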

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
index d118429..c58bfe7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
@@ -26,6 +26,7 @@ import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
 import jcuda.driver.JCudaDriver;
@@ -104,7 +105,7 @@ public class JCudaContext extends GPUContext {
                LOG.info("Active CUDA device number : " + device[0]);
                LOG.info("Max Blocks/Threads/SharedMem : " + maxBlocks + "/" + 
maxThreadsPerBlock + "/" + sharedMemPerBlock);
 
-               Statistics.cudaInitTime = System.nanoTime() - start;
+               GPUStatistics.cudaInitTime = System.nanoTime() - start;
 
                start = System.nanoTime();
                LibMatrixCUDA.cudnnHandle = new cudnnHandle();
@@ -116,7 +117,7 @@ public class JCudaContext extends GPUContext {
                // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
                LibMatrixCUDA.cusparseHandle = new cusparseHandle();
                cusparseCreate(LibMatrixCUDA.cusparseHandle);
-               Statistics.cudaLibrariesInitTime = System.nanoTime() - start;
+               GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
 
                try {
                        LibMatrixCUDA.kernels = new JCudaKernels();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index c5de805..f24c320 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -32,6 +32,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
 
+import jcuda.runtime.JCuda;
 import org.apache.sysml.runtime.DMLRuntimeException;
 
 import jcuda.CudaException;
@@ -165,7 +166,7 @@ public class JCudaKernels {
                                config.gridDimX, config.gridDimY, config.gridDimZ,
                                config.blockDimX, config.blockDimY, config.blockDimZ,
                                config.sharedMemBytes, config.stream, Pointer.to(kernelParams), null));
-               // JCuda.cudaDeviceSynchronize();
+               JCuda.cudaDeviceSynchronize();
        }
        
     public static void checkResult(int cuResult) throws DMLRuntimeException {
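
Enabling the previously commented-out cudaDeviceSynchronize() after each launch matters for the statistics work in this commit: kernel launches are asynchronous, so a timer stopped right after the launch call would measure only the enqueue overhead, not the kernel's execution. A small sketch of the difference (the timeKernel wrapper is illustrative; only JCuda.cudaDeviceSynchronize() is an actual API used here):

    import jcuda.runtime.JCuda;

    class KernelTimingSketch {
        // Returns wall time covering actual kernel execution. Without the
        // synchronize, the launch returns as soon as the kernel is queued,
        // so the measured time would be near zero regardless of the kernel.
        static long timeKernel(Runnable launch) {
            long t0 = System.nanoTime();
            launch.run();                  // enqueue the kernel (asynchronous)
            JCuda.cudaDeviceSynchronize(); // block until the device is idle
            return System.nanoTime() - t0;
        }
    }

The flip side, worth noting, is that synchronizing after every launch forfeits host/device overlap, so this presumably trades some throughput for measurement accuracy.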

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
index b9c9161..d25d387 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -39,12 +39,14 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.CacheException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.SparseBlock;
 import org.apache.sysml.runtime.matrix.data.SparseBlockCOO;
 import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
 import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
+import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
 import jcuda.Pointer;
@@ -173,9 +175,9 @@ public class JCudaObject extends GPUObject {
                        }
                        ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
                        // increment the cudaCount by 1 for the allocation of all 3 arrays
-                       r.val = allocate(getDoubleSizeOf(nnz2), 0);
-                       r.rowPtr = allocate(getIntSizeOf(rows + 1), 0);
-                       r.colInd = allocate(getIntSizeOf(nnz2), 1);
+                       r.val = allocate(null, getDoubleSizeOf(nnz2), 0);
+                       r.rowPtr = allocate(null, getIntSizeOf(rows + 1), 0);
+                       r.colInd = allocate(null, getIntSizeOf(nnz2), 1);
                        return r;
                }
                
@@ -195,8 +197,8 @@ public class JCudaObject extends GPUObject {
                        cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
                        cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
                        cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
-                       Statistics.cudaToDevTime.addAndGet(System.nanoTime()-t0);
-                       Statistics.cudaToDevCount.addAndGet(3);
+                       GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-t0);
+                       GPUStatistics.cudaToDevCount.addAndGet(3);
                }
                
                /**
@@ -214,8 +216,8 @@ public class JCudaObject extends GPUObject {
                        cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
                        cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
                        cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
-                       Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-t0);
-                       Statistics.cudaFromDevCount.addAndGet(3);
+                       GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-t0);
+                       GPUStatistics.cudaFromDevCount.addAndGet(3);
                }
                
                // ==============================================================================================
@@ -248,7 +250,7 @@ public class JCudaObject extends GPUObject {
                        cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
             cudaDeviceSynchronize();
                        // Do not increment the cudaCount of allocations on GPU
-                       C.rowPtr = allocate(getIntSizeOf((long)rowsC+1), 0);
+                       C.rowPtr = allocate(null, getIntSizeOf((long)rowsC+1), 0);
                }
                
                /**
@@ -325,8 +327,8 @@ public class JCudaObject extends GPUObject {
                 */
                private static void step3AllocateValNInd(cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
                        // Increment cudaCount by one when all three arrays of CSR sparse array are allocated
-                       C.val = allocate(getDoubleSizeOf(C.nnz), 0);
-                       C.colInd = allocate(getIntSizeOf(C.nnz), 1);
+                       C.val = allocate(null, getDoubleSizeOf(C.nnz), 0);
+                       C.colInd = allocate(null, getIntSizeOf(C.nnz), 1);
                }
 
                // ==============================================================================================
@@ -456,12 +458,13 @@ public class JCudaObject extends GPUObject {
         * Allocates temporary space on the device.
         * Does not update bookkeeping.
         * The caller is responsible for freeing up after usage.
+        * @param instructionName name of the instruction for which to record per-instruction performance statistics; null to skip recording
         * @param size                          Size of data (in bytes) to allocate
         * @param statsCount    amount to increment the cudaAllocCount by
         * @return jcuda Pointer
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static Pointer allocate(long size, int statsCount) throws DMLRuntimeException{
+       public static Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException{
                synchronized (GPUContext.syncObj) {
                        Pointer A = new Pointer();
                        ensureFreeSpace(size);
@@ -469,23 +472,37 @@ public class JCudaObject extends GPUObject {
                        cudaMalloc(A, size);
                        // Set all elements to 0 since newly allocated space will contain garbage
                        cudaMemset(A, 0, size);
-                       Statistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
-                       Statistics.cudaAllocCount.getAndAdd(statsCount);
+                       GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
+                       GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
+                       if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
+
                        return A;
                }
        }
 
        /**
-        * Convenience method for {@link #allocate(long, int)}, defaults statsCount to 1.
+        * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
         * @param size size of data (in bytes) to allocate
         * @return jcuda pointer
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public static Pointer allocate(long size) throws DMLRuntimeException {
-               return allocate(size, 1);
+               return allocate(null, size, 1);
        }
 
        /**
+        * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1.
+        * @param instructionName name of the instruction for which to record per-instruction performance statistics; null to skip recording
+        * @param size size of data (in bytes) to allocate
+        * @return jcuda pointer
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public static Pointer allocate(String instructionName, long size) throws DMLRuntimeException {
+               return allocate(instructionName, size, 1);
+       }
+
+
+       /**
         * Allocates a sparse and empty {@link JCudaObject}
         * This is the result of operations whose operands are both sparse matrices.
         * 
@@ -528,22 +545,27 @@ public class JCudaObject extends GPUObject {
        }
 
        @Override
-       public synchronized void acquireDeviceRead() throws DMLRuntimeException {
+       public synchronized boolean acquireDeviceRead() throws DMLRuntimeException {
+               boolean transferred = false;
                if(!isAllocated()) {
                        copyFromHostToDevice();
+                       transferred = true;
                } else {
                        numLocks.addAndGet(1);
                }
                if(!isAllocated())
                        throw new DMLRuntimeException("Expected device data to 
be allocated");
+               return transferred;
        }
        
        @Override
-       public synchronized void acquireDeviceModifyDense() throws DMLRuntimeException {
+       public synchronized boolean acquireDeviceModifyDense() throws DMLRuntimeException {
+               boolean allocated = false;
                if(!isAllocated()) {
                        mat.setDirty(true);
                        // Dense block, size = numRows * numCols
                        allocateDenseMatrixOnDevice();
+                       allocated = true;
                        synchronized(evictionLock) {
                                GPUContext.allocatedPointers.add(this);
                        }
@@ -551,14 +573,17 @@ public class JCudaObject extends GPUObject {
                isDeviceCopyModified = true;
                if(!isAllocated()) 
                        throw new DMLRuntimeException("Expected device data to 
be allocated");
+               return allocated;
        }
        
        @Override
-       public synchronized void acquireDeviceModifySparse() throws DMLRuntimeException {
+       public synchronized boolean acquireDeviceModifySparse() throws DMLRuntimeException {
+               boolean allocated = false;
                isInSparseFormat = true;
                if(!isAllocated()) {
                        mat.setDirty(true);
                        allocateSparseMatrixOnDevice();
+                       allocated = true;
                        synchronized(evictionLock) {
                                GPUContext.allocatedPointers.add(this);
                        }
@@ -566,14 +591,17 @@ public class JCudaObject extends GPUObject {
                isDeviceCopyModified = true;
                if(!isAllocated()) 
                        throw new DMLRuntimeException("Expected device data to 
be allocated");
+               return allocated;
        }
        
        @Override
-       public synchronized void acquireHostRead() throws CacheException {
+       public synchronized boolean acquireHostRead() throws CacheException {
+               boolean copied = false;
                if(isAllocated()) {
                        try {
                                if(isDeviceCopyModified) {
                                        copyFromDeviceToHost();
+                                       copied = true;
                                }
                        } catch (DMLRuntimeException e) {
                                throw new CacheException(e);
@@ -582,21 +610,7 @@ public class JCudaObject extends GPUObject {
                else {
                        throw new CacheException("Cannot perform 
acquireHostRead as the GPU data is not allocated:" + mat.getVarName());
                }
-       }
-       
-       @Override
-       public synchronized void acquireHostModify() throws CacheException {
-               if(isAllocated()) {
-                       try {
-                               if(isDeviceCopyModified) {
-                                       throw new DMLRuntimeException("Potential overwrite of GPU data");
-                                       // copyFromDeviceToHost();
-                               }
-                               clearData();
-                       } catch (DMLRuntimeException e) {
-                               throw new CacheException(e);
-                       }
-               }
+               return copied;
        }
        
        /**
@@ -659,8 +673,8 @@ public class JCudaObject extends GPUObject {
                                JCudaContext.deviceMemBytes.addAndGet(-numBytes);
                        }
 
-                       Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
-                       Statistics.cudaAllocCount.addAndGet(1);
+                       GPUStatistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
+                       GPUStatistics.cudaAllocCount.addAndGet(1);
 
                }
        }
@@ -715,17 +729,17 @@ public class JCudaObject extends GPUObject {
        void deallocateMemoryOnDevice(boolean synchronous) {
                if(jcudaDenseMatrixPtr != null) {
                        long start = System.nanoTime();
-                       cudaFreeHelper(jcudaDenseMatrixPtr, synchronous);
+                       cudaFreeHelper(null, jcudaDenseMatrixPtr, synchronous);
                        ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
-                       Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
-                       Statistics.cudaDeAllocCount.addAndGet(1);
+                       GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
+                       GPUStatistics.cudaDeAllocCount.addAndGet(1);
                }
                if (jcudaSparseMatrixPtr != null) {
                        long start = System.nanoTime();
                        jcudaSparseMatrixPtr.deallocate(synchronous);
                        ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
-                       Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
-                       Statistics.cudaDeAllocCount.addAndGet(1);
+                       GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
+                       GPUStatistics.cudaDeAllocCount.addAndGet(1);
                }
                jcudaDenseMatrixPtr = null;
                jcudaSparseMatrixPtr = null;
@@ -785,14 +799,14 @@ public class JCudaObject extends GPUObject {
                                        long t0 = System.nanoTime();
                                        SparseBlockCOO cooBlock = (SparseBlockCOO)block;
                                        csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
-                                       Statistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
-                                       Statistics.cudaSparseConversionCount.incrementAndGet();
+                                       GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+                                       GPUStatistics.cudaSparseConversionCount.incrementAndGet();
                                } else if (block instanceof SparseBlockMCSR) {
                                        long t0 = System.nanoTime();
                                        SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block;
                                        csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
-                                       Statistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
-                                       Statistics.cudaSparseConversionCount.incrementAndGet();
+                                       GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+                                       GPUStatistics.cudaSparseConversionCount.incrementAndGet();
                                } else {
                                        throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
                                }
@@ -830,8 +844,8 @@ public class JCudaObject extends GPUObject {
                
                mat.release();
                
-               Statistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
-               Statistics.cudaToDevCount.addAndGet(1);
+               GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
+               GPUStatistics.cudaToDevCount.addAndGet(1);
        }
        
        public static int toIntExact(long l) throws DMLRuntimeException {
@@ -860,8 +874,8 @@ public class JCudaObject extends GPUObject {
                        mat.acquireModify(tmp);
                        mat.release();
                        
-                       Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
-                       Statistics.cudaFromDevCount.addAndGet(1);
+                       GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
+                       GPUStatistics.cudaFromDevCount.addAndGet(1);
                }
                else if (jcudaSparseMatrixPtr != null){
                        printCaller();
@@ -887,8 +901,8 @@ public class JCudaObject extends GPUObject {
                                MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
                                mat.acquireModify(tmp);
                                mat.release();
-                               Statistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
-                               Statistics.cudaFromDevCount.addAndGet(1);
+                               GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
+                               GPUStatistics.cudaFromDevCount.addAndGet(1);
                        }
                }
                else {
@@ -992,8 +1006,8 @@ public class JCudaObject extends GPUObject {
                setSparseMatrixCudaPointer(columnMajorDenseToRowMajorSparse(cusparseHandle, rows, cols, jcudaDenseMatrixPtr));
                // TODO: What if mat.getNnz() is -1 ?
                numBytes = CSRPointer.estimateSize(mat.getNnz(), rows);
-               Statistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
-               Statistics.cudaDenseToSparseCount.addAndGet(1);
+               GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
+               GPUStatistics.cudaDenseToSparseCount.addAndGet(1);
        }
 
        /**
@@ -1055,16 +1069,27 @@ public class JCudaObject extends GPUObject {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public void sparseToDense() throws DMLRuntimeException {
-               long t0 = System.nanoTime();
+               sparseToDense(null);
+       }
+
+       /**
+        * Convert sparse to dense (performs a transpose; use sparseToColumnMajorDense if the kernel can deal with column-major format)
+        * Also records per-instruction invocations of sparseToDense.
+        * @param instructionName       Name of the instruction for which statistics are recorded in {@link GPUStatistics}
+        * @throws DMLRuntimeException
+        */
+       public void sparseToDense(String instructionName) throws DMLRuntimeException {
+               long start = System.nanoTime();
                if(jcudaSparseMatrixPtr == null || !isAllocated())
                        throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
 
                sparseToColumnMajorDense();
                convertDensePtrFromColMajorToRowMajor();
-               Statistics.cudaSparseToDenseTime.addAndGet(System.nanoTime() - t0);
-               Statistics.cudaSparseToDenseCount.addAndGet(1);
+               long end = System.nanoTime();
+               if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start);
+               GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start);
+               GPUStatistics.cudaSparseToDenseCount.addAndGet(1);
        }
-
        
        /**
         * More efficient method to convert sparse to dense but returns dense in column major format
@@ -1107,8 +1132,8 @@ public class JCudaObject extends GPUObject {
                long t1 = System.nanoTime();
                nnzPerRowPtr = allocate(getIntSizeOf(rows));
                nnzTotalDevHostPtr = allocate(getIntSizeOf(1));
-               Statistics.cudaAllocTime.addAndGet(System.nanoTime() - t1);
-               Statistics.cudaAllocCount.addAndGet(2);         
+               GPUStatistics.cudaAllocTime.addAndGet(System.nanoTime() - t1);
+               GPUStatistics.cudaAllocCount.addAndGet(2);
                
                // Output is in dense vector format, convert it to CSR
                cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, nnzTotalDevHostPtr);
@@ -1117,8 +1142,8 @@ public class JCudaObject extends GPUObject {
                
                long t2 = System.nanoTime();
                cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
-               Statistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
-               Statistics.cudaFromDevCount.addAndGet(2);               
+               GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
+               GPUStatistics.cudaFromDevCount.addAndGet(2);
                
                if (nnzC[0] == -1){
                        throw new DMLRuntimeException("cusparseDnnz did not 
calculate the correct number of nnz from the sparse-matrix vector mulitply on 
the GPU");
@@ -1139,16 +1164,28 @@ public class JCudaObject extends GPUObject {
         * @param toFree {@link Pointer} instance to be freed
         */
        public static void cudaFreeHelper(final Pointer toFree) {
-               cudaFreeHelper(toFree, false);
+               cudaFreeHelper(null, toFree, false);
+       }
+
+       /**
+        * Does asynchronous cudaFree calls
+        * @param instructionName name of the instruction for which to record per-instruction free time; null to skip recording
+        * @param toFree {@link Pointer} instance to be freed
+        */
+       public static void cudaFreeHelper(String instructionName, final Pointer toFree) {
+               cudaFreeHelper(instructionName, toFree, false);
        }
 
        /**
         * Does cudaFree calls, either synchronously or asynchronously
+        * @param instructionName name of the instruction for which to record per-instruction free time; null to skip recording
         * @param toFree {@link Pointer} instance to be freed
         * @param synchronous true if to be done synchronously
         */
        @SuppressWarnings("rawtypes")
-       public static void cudaFreeHelper(final Pointer toFree, boolean synchronous) {
+       public static void cudaFreeHelper(String instructionName, final Pointer toFree, boolean synchronous) {
+               long t0 = 0;
+               if (instructionName != null) t0 = System.nanoTime();
                if (synchronous) {
                        cudaFree(toFree);
                } else {
@@ -1160,6 +1197,7 @@ public class JCudaObject extends GPUObject {
                        });
                        GPUContext.pendingDeallocates.offer(submitted);
                }
+               if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
        }
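
Taken together, the JCudaObject changes give every allocation, free, and sparse-to-dense conversion an optional instruction name. An illustrative use of the widened API (the allocate and cudaFreeHelper signatures are as introduced above; the surrounding scratch-buffer scenario is hypothetical):

    import jcuda.Pointer;
    import org.apache.sysml.runtime.DMLRuntimeException;
    import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;

    class AllocateFreeSketch {
        // Allocates a 1 KB zero-initialized scratch buffer on the device,
        // charging both the allocation and the (asynchronous) free to the
        // given instruction; passing null keeps aggregate-only accounting.
        static void scratch(String instructionName) throws DMLRuntimeException {
            Pointer tmp = JCudaObject.allocate(instructionName, 1024);
            try {
                // ... use the scratch buffer in a kernel or cuSPARSE call ...
            } finally {
                JCudaObject.cudaFreeHelper(instructionName, tmp);
            }
        }
    }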
 
 
