http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 51a0f6b..1511afc 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -44,6 +44,7 @@ import static 
jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
 import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
 import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
@@ -55,23 +56,61 @@ import static 
jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE
 import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
 import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
-import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
 import static 
org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.allocate;
 import static 
org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.cudaFreeHelper;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
-import org.apache.sysml.runtime.functionobjects.*;
+import org.apache.sysml.runtime.functionobjects.And;
+import org.apache.sysml.runtime.functionobjects.Builtin;
+import org.apache.sysml.runtime.functionobjects.CM;
+import org.apache.sysml.runtime.functionobjects.Divide;
+import org.apache.sysml.runtime.functionobjects.Equals;
+import org.apache.sysml.runtime.functionobjects.GreaterThan;
+import org.apache.sysml.runtime.functionobjects.GreaterThanEquals;
+import org.apache.sysml.runtime.functionobjects.IndexFunction;
+import org.apache.sysml.runtime.functionobjects.KahanPlus;
+import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
+import org.apache.sysml.runtime.functionobjects.LessThan;
+import org.apache.sysml.runtime.functionobjects.LessThanEquals;
+import org.apache.sysml.runtime.functionobjects.Mean;
+import org.apache.sysml.runtime.functionobjects.Minus;
+import org.apache.sysml.runtime.functionobjects.Multiply;
+import org.apache.sysml.runtime.functionobjects.Multiply2;
+import org.apache.sysml.runtime.functionobjects.NotEquals;
+import org.apache.sysml.runtime.functionobjects.Or;
+import org.apache.sysml.runtime.functionobjects.Plus;
+import org.apache.sysml.runtime.functionobjects.Power;
+import org.apache.sysml.runtime.functionobjects.Power2;
+import org.apache.sysml.runtime.functionobjects.ReduceAll;
+import org.apache.sysml.runtime.functionobjects.ReduceCol;
+import org.apache.sysml.runtime.functionobjects.ReduceDiag;
+import org.apache.sysml.runtime.functionobjects.ReduceRow;
+import org.apache.sysml.runtime.functionobjects.ValueFunction;
 import org.apache.sysml.runtime.instructions.cp.DoubleObject;
-import org.apache.sysml.runtime.instructions.gpu.context.*;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.JCudaContext;
+import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
+import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;
 import 
org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer;
-import org.apache.sysml.runtime.matrix.operators.*;
+import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
+import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
+import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysml.runtime.matrix.operators.CMOperator;
+import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator;
+import org.apache.sysml.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
+
 import jcuda.Pointer;
 import jcuda.Sizeof;
 import jcuda.jcublas.JCublas2;
@@ -146,6 +185,16 @@ public class LibMatrixCUDA {
                return _WARP_SIZE;
        }
 
+       public static boolean isInSparseFormat(MatrixObject mo) {
+               if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated())
+                       return mo.getGPUObject().isInSparseFormat();
+               return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), 
mo.getNumColumns(), mo.getNnz());
+       }
+
+
+       //********************************************************************/
+       //***************** DEEP LEARNING Operators **************************/
+       //********************************************************************/
 
 
        public static cudnnHandle cudnnHandle;
@@ -157,14 +206,15 @@ public class LibMatrixCUDA {
 
        private static int CONVOLUTION_PREFERENCE = 
cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
 
-       public static void conv2d(MatrixObject image, MatrixObject filter, 
MatrixObject outputBlock, int N, int C, int H, int W,
+       public static void conv2d(String instName, MatrixObject image, 
MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
                                                                                
                                int K, int R, int S, int pad_h, int pad_w, int 
stride_h, int stride_w, int P, int Q)
                                        throws DMLRuntimeException {
+
                if(isInSparseFormat(image)) {
-                       ((JCudaObject)image.getGPUObject()).sparseToDense();
+                       
((JCudaObject)image.getGPUObject()).sparseToDense(instName);
                }
                if(isInSparseFormat(filter)) {
-                       ((JCudaObject)filter.getGPUObject()).sparseToDense();
+                       
((JCudaObject)filter.getGPUObject()).sparseToDense(instName);
                }
 
                cudnnTensorDescriptor srcTensorDesc = null;
@@ -176,11 +226,14 @@ public class LibMatrixCUDA {
                Pointer alpha = null;
                Pointer beta = null;
                try {
+                       long t1=0, t2=0;
                        // Allocate descriptors
+                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
                        srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
                        dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
                        filterDesc = allocateFilterDescriptor(K, C, R, S);
 
+
                        Pointer imagePointer = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
                        Pointer filterPointer = 
((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr;
                        Pointer dstPointer = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
@@ -219,24 +272,28 @@ public class LibMatrixCUDA {
                        else {
                                throw new DMLRuntimeException("Unsupported 
preference criteria for convolution");
                        }
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
                        alpha = pointerTo(1.0);
                        beta = pointerTo(0.0f);
+                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
                        int status = cudnnConvolutionForward(cudnnHandle, alpha,
                                                        srcTensorDesc, 
imagePointer,
                                                        filterDesc, 
filterPointer,
                                                        convDesc, algo, 
workSpace, sizeInBytes, beta,
                                                        dstTensorDesc, 
dstPointer);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionForward: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
                }
                finally {
-
+                       long t3=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
                        if(alpha != null)
-                               cudaFreeHelper(alpha);
+                               cudaFreeHelper(instName, alpha);
                        if(beta != null)
-                               cudaFreeHelper(beta);
+                               cudaFreeHelper(instName, beta);
 
                        if(srcTensorDesc != null)
                                cudnnDestroyTensorDescriptor(srcTensorDesc);
@@ -247,7 +304,8 @@ public class LibMatrixCUDA {
                        if(convDesc != null)
                                cudnnDestroyConvolutionDescriptor(convDesc);
                        if(workSpace != null && sizeInBytes != 0)
-                               cudaFreeHelper(workSpace);
+                               cudaFreeHelper(instName, workSpace);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
                }
        }
 
@@ -295,27 +353,32 @@ public class LibMatrixCUDA {
 
        /**
         * This method computes the backpropagation errors for previous layer 
of relu operation
-        *
+        * @param instName the invoking instruction's name, for recording {@link Statistics}.
         * @param input input image
         * @param dout  next layer error propogation
         * @param outputBlock output
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void reluBackward(MatrixObject input, MatrixObject dout, 
MatrixObject outputBlock) throws DMLRuntimeException {
+       public static void reluBackward(String instName, MatrixObject input, 
MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
                if(isInSparseFormat(input)) {
-                       ((JCudaObject)input.getGPUObject()).sparseToDense();
+                       
((JCudaObject)input.getGPUObject()).sparseToDense(instName);
                }
                if(isInSparseFormat(dout)) {
-                       ((JCudaObject)dout.getGPUObject()).sparseToDense();
+                       
((JCudaObject)dout.getGPUObject()).sparseToDense(instName);
                }
                long rows = input.getNumRows();
                long cols = input.getNumColumns();
                Pointer imagePointer = 
((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr;
                Pointer doutPointer = 
((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr;
                Pointer outputPointer = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
-               kernels.launchKernel("reluBackward",
+
+               long t1=0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               kernels.launchKernel("relu_backward",
                                                
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
                                                imagePointer, doutPointer, 
outputPointer, (int)rows, (int)cols);
+               if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
+
        }
 
        /**
@@ -323,18 +386,18 @@ public class LibMatrixCUDA {
         * ones = matrix(1, rows=1, cols=Hout*Wout)
         * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
         * This operation is often followed by conv2d and hence we have 
introduced bias_add(input, bias) built-in function
-        *
+        * @param instName the invoking instruction's name, for recording {@link Statistics}.
         * @param input input image
         * @param bias bias
         * @param outputBlock output
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void biasAdd(MatrixObject input, MatrixObject bias, 
MatrixObject outputBlock) throws DMLRuntimeException {
+       public static void biasAdd(String instName, MatrixObject input, 
MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
                if(isInSparseFormat(input)) {
-                       ((JCudaObject)input.getGPUObject()).sparseToDense();
+                       
((JCudaObject)input.getGPUObject()).sparseToDense(instName);
                }
                if(isInSparseFormat(bias)) {
-                       ((JCudaObject)bias.getGPUObject()).sparseToDense();
+                       
((JCudaObject)bias.getGPUObject()).sparseToDense(instName);
                }
                long rows = input.getNumRows();
                long cols = input.getNumColumns();
@@ -346,15 +409,18 @@ public class LibMatrixCUDA {
                Pointer imagePointer = 
((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr;
                Pointer biasPointer = 
((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr;
                Pointer outputPointer = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
-               kernels.launchKernel("biasAdd",
+               long t1 = 0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               kernels.launchKernel("bias_add",
                                                
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
                                                imagePointer, biasPointer, 
outputPointer, (int)rows, (int)cols, (int) PQ);
+               if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
 
        }
 
        /**
         * This method computes the backpropogation errors for filter of 
convolution operation
-        *
+        * @param instName the invoking instruction's name, for recording {@link Statistics}.
         * @param image input image
         * @param dout errors from next layer
         * @param outputBlock  output errors
@@ -373,16 +439,17 @@ public class LibMatrixCUDA {
         * @param Q output activation width
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void conv2dBackwardFilter(MatrixObject image, 
MatrixObject dout,
+       public static void conv2dBackwardFilter(String instName, MatrixObject 
image, MatrixObject dout,
                                                                                
                                                                                
        MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
                                                                                
                                                                                
        int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                                                                                
                                                                                
        int Q) throws DMLRuntimeException {
                if(isInSparseFormat(image)) {
-                       ((JCudaObject)image.getGPUObject()).sparseToDense();
+                       
((JCudaObject)image.getGPUObject()).sparseToDense(instName);
                }
                if(isInSparseFormat(dout)) {
-                       ((JCudaObject)dout.getGPUObject()).sparseToDense();
+                       
((JCudaObject)dout.getGPUObject()).sparseToDense(instName);
                }
+
                Pointer alpha = null;
                Pointer beta = null;
                cudnnTensorDescriptor xTensorDesc = null;
@@ -393,6 +460,9 @@ public class LibMatrixCUDA {
                Pointer workSpace = null;
                long sizeInBytes = 0;
                try {
+
+                       long t1=0, t2=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
                        // Allocate descriptors
                        xTensorDesc = allocateTensorDescriptor(N, C, H, W);
                        doutTensorDesc = allocateTensorDescriptor(N, K, P, Q);
@@ -413,21 +483,31 @@ public class LibMatrixCUDA {
 
                        // TODO: Select the best algorithm depending on the 
data and supported CUDA
                        int algo = 
jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+
                        workSpace = new Pointer();
                        
cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle,
                                                        xTensorDesc, 
doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
+                       if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
+                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
                        int status = 
cudnnConvolutionBackwardFilter(cudnnHandle, alpha, xTensorDesc, imagePointer,
                                                        doutTensorDesc, 
doutPointer, convDesc, algo, workSpace, sizeInBytes, beta, dwDesc, dwPointer);
+                       if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
+
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionBackwardFilter: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
                }
                finally {
+                       long t3=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
                        if(alpha != null)
-                               cudaFreeHelper(alpha);
+                               cudaFreeHelper(instName, alpha);
                        if(beta != null)
-                               cudaFreeHelper(beta);
+                               cudaFreeHelper(instName, beta);
+                       if(workSpace != null && sizeInBytes != 0)
+                               cudaFreeHelper(instName, workSpace);
+
                        if(xTensorDesc != null)
                                cudnnDestroyTensorDescriptor(xTensorDesc);
                        if(doutTensorDesc != null)
@@ -437,143 +517,468 @@ public class LibMatrixCUDA {
 
                        if(convDesc != null)
                                cudnnDestroyConvolutionDescriptor(convDesc);
-
-                       if(workSpace != null && sizeInBytes != 0)
-                               cudaFreeHelper(workSpace);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
                }
        }
 
        private static long numDoublesIn2GB = 125000000;
 
-       public static void relu(ExecutionContext ec, MatrixObject in, String 
outputName) throws DMLRuntimeException {
-               if(isInSparseFormat(in)) {
-                       // TODO: FIXME: Implement sparse relu kernel
-                       ((JCudaObject)in.getGPUObject()).sparseToDense();
+       /**
+        * This method computes the backpropogation errors for previous layer 
of convolution operation
+        * @param instName the invoking instruction's name, for recording {@link Statistics}.
+        * @param filter filter used in conv2d
+        * @param dout errors from next layer
+        * @param output  output errors
+        * @param N number of images
+        * @param C number of channels
+        * @param H height
+        * @param W width
+        * @param K number of filters
+        * @param R filter height
+        * @param S filter width
+        * @param pad_h pad height
+        * @param pad_w pad width
+        * @param stride_h stride height
+        * @param stride_w stride width
+        * @param P output activation height
+        * @param Q output activation width
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public static void conv2dBackwardData(String instName, MatrixObject 
filter, MatrixObject dout,
+                                                                               
                                                                                
MatrixObject output, int N, int C, int H, int W, int K, int R,
+                                                                               
                                                                                
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+                                                                               
                                                                                
int Q) throws DMLRuntimeException {
+               if(isInSparseFormat(dout)) {
+                       
((JCudaObject)dout.getGPUObject()).sparseToDense(instName);
+               }
+               if(isInSparseFormat(filter)) {
+                       
((JCudaObject)filter.getGPUObject()).sparseToDense(instName);
                }
-
-               cudnnTensorDescriptor srcTensorDesc = null;
-               cudnnTensorDescriptor dstTensorDesc = null;
                Pointer alpha = null;
                Pointer beta = null;
+               cudnnTensorDescriptor dyDesc = null;
+               cudnnTensorDescriptor dxDesc = null;
+               cudnnFilterDescriptor wDesc = null;
+               cudnnConvolutionDescriptor convDesc = null;
 
+               Pointer workSpace = null;
+               long sizeInBytes = 0;
                try {
-                       alpha = pointerTo(1.0f);
+                       long t1=0, t2=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                       // Allocate descriptors
+                       wDesc = allocateFilterDescriptor(K, C, R, S);
+                       dyDesc = allocateTensorDescriptor(N, K, P, Q);
+                       dxDesc = allocateTensorDescriptor(N, C, H, W);
+
+                       // Allocate data
+                       Pointer w = 
((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer dy = 
((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer dx = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
+
+                       alpha = pointerTo(1.0); // TODO
                        beta = pointerTo(0.0f);
-                       long N = in.getNumRows();
-                       long H = in.getNumColumns();
-                       long W = 1;
-                       Pointer srcData = 
((JCudaObject)in.getGPUObject()).jcudaDenseMatrixPtr;
 
-                       MatrixObject output = ec.getMatrixObject(outputName);
-                       ec.getDenseMatrixOutputForGPUInstruction(outputName);   
// Allocated the dense output matrix
-                       Pointer dstData = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
+                       int padding [] = { pad_h, pad_w };
+                       int strides [] = { stride_h, stride_w };
+                       convDesc = allocateConvolutionDescriptor(padding, 
strides);
+                       long sizeInBytesArray[] = { 0 };
 
-                       if(N*H*W >= numDoublesIn2GB) {
-                               // Invokes relu(double* A,  double* ret, int 
rlen, int clen)
-                               kernels.launchKernel("relu",
-                                                               
ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int) (H*W)),
-                                                               srcData, 
dstData, (int)N, (int) H*W);
-                       }
-                       else {
-                               // Allocate descriptors
-                               srcTensorDesc = 
allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
-                               dstTensorDesc = 
allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
-                cudnnActivationDescriptor activationDescriptor = new 
cudnnActivationDescriptor();
-                               
cudnnCreateActivationDescriptor(activationDescriptor);
-                double dummy = -1;
-                cudnnSetActivationDescriptor(activationDescriptor, 
CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
-                   cudnnActivationForward(cudnnHandle, activationDescriptor,
-                       alpha, srcTensorDesc, srcData, 
-                       beta, dstTensorDesc, dstData);
+                       // TODO: Select the best algorithm depending on the 
data and supported CUDA
+                       int algo = 
jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+                       workSpace = new Pointer();
+                       
cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
+                                                       wDesc, dyDesc, 
convDesc, dxDesc, algo, sizeInBytesArray);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
+                       int status = cudnnConvolutionBackwardData(cudnnHandle, 
alpha, wDesc, w,
+                                                       dyDesc, dy, convDesc, 
algo, workSpace, sizeInBytes, beta, dxDesc, dx);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - 
t2);
+
+                       if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not 
executed cudnnConvolutionBackwardData: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
                }
                finally {
+                       long t3=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
 
                        if(alpha != null)
-                               cudaFreeHelper(alpha);
+                               cudaFreeHelper(instName, alpha);
                        if(beta != null)
-                               cudaFreeHelper(beta);
+                               cudaFreeHelper(instName, beta);
+                       if(workSpace != null && sizeInBytes != 0)
+                               cudaFreeHelper(instName, workSpace);
 
-                       if(srcTensorDesc != null)
-                               cudnnDestroyTensorDescriptor(srcTensorDesc);
-                       if(dstTensorDesc != null)
-                               cudnnDestroyTensorDescriptor(dstTensorDesc);
+                       if(dyDesc != null)
+                               cudnnDestroyTensorDescriptor(dyDesc);
+                       if(dxDesc != null)
+                               cudnnDestroyTensorDescriptor(dxDesc);
+                       if(wDesc != null)
+                               cudnnDestroyFilterDescriptor(wDesc);
+                       if(convDesc != null)
+                               cudnnDestroyConvolutionDescriptor(convDesc);
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
                }
        }
 
        /**
-        * Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting 
cublasDsyrk(...)
-        *
-        * @param ec execution context
-        * @param left input matrix, as in a tsmm expression like A %*% A' or 
A' %*% A, we just need to check whether the left one is transposed or not, I 
named it 'left'
-        * @param outputName output matrix name
-        * @param isLeftTransposed if true, left transposed
+        * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
+        * @param instName the invoking instruction's name, for recording {@link Statistics}.
+        * @param image image as matrix object
+        * @param outputBlock output matrix
+        * @param N                             batch size
+        * @param C                             number of channels
+        * @param H                             height of image
+        * @param W                             width of image
+        * @param K                             number of filters
+        * @param R                             height of filter
+        * @param S                             width of filter
+        * @param pad_h                 vertical padding
+        * @param pad_w                 horizontal padding
+        * @param stride_h              horizontal stride
+        * @param stride_w              vertical stride
+        * @param P                             (H - R + 1 + 2*pad_h)/stride_h
+        * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void matmultTSMM(ExecutionContext ec, MatrixObject left, 
String outputName,
-                                                                               
                                                 boolean isLeftTransposed) 
throws DMLRuntimeException {
-               if(isInSparseFormat(left)) {
-                       // For sparse TSMM, invoke matmult (TODO: possible 
performance improvement)
-                       matmult(ec, left, left, outputName, isLeftTransposed, 
!isLeftTransposed);
-                       return;
+       public static void maxpooling(String instName, MatrixObject image,
+                                                                               
                                                MatrixObject outputBlock, int 
N, int C, int H, int W, int K, int R,
+                                                                               
                                                int S, int pad_h, int pad_w, 
int stride_h, int stride_w, int P,
+                                                                               
                                                int Q) throws 
DMLRuntimeException {
+               if(isInSparseFormat(image)) {
+                       
((JCudaObject)image.getGPUObject()).sparseToDense(instName);
                }
+               Pointer alpha = null;
+               Pointer beta = null;
+               cudnnTensorDescriptor xDesc = null;
+               cudnnTensorDescriptor yDesc = null;
+               cudnnPoolingDescriptor poolingDesc = null;
 
-               // For dense TSMM, exploit cublasDsyrk(...) and call custom 
kernel to flip the matrix
-               MatrixObject output = ec.getMatrixObject(outputName);
-               ec.getDenseMatrixOutputForGPUInstruction(outputName);   // 
Allocated the dense output matrix
-
-               // Since CuBLAS expects inputs in column-major format,
-               // reverse the order of matrix-multiplication and take care of 
dimension mismatch.
-               int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : 
cublasOperation.CUBLAS_OP_T;
-               // Note: the dimensions are swapped
-               int m = (int) (isLeftTransposed ? left.getNumColumns() : 
left.getNumRows());
-               int k = (int) (isLeftTransposed ? left.getNumRows() : 
left.getNumColumns());
+               try {
+                       long t1=0,t2=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                       // Allocate descriptors
+                       yDesc = allocateTensorDescriptor(N, C, P, Q);
+                       xDesc = allocateTensorDescriptor(N, C, H, W);
+                       poolingDesc = allocatePoolingDescriptor(R, S, pad_h, 
pad_w, stride_h, stride_w);
 
-               if(m == -1)
-                       throw new DMLRuntimeException("Incorrect dimensions");
+                       // Allocate data
+                       Pointer x = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer y = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
 
-               double[] alpha = {1.0d};
-               double[] beta = {0.0d};
+                       alpha = pointerTo(1.0);
+                       beta = pointerTo(0.0f);
 
-               int lda = (int) (isLeftTransposed ? m : k);
-               int ldc = m;
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
-               if(!left.getGPUObject().isAllocated())
-                       throw new DMLRuntimeException("Input is not allocated:" 
+ left.getGPUObject().isAllocated());
-               if(!output.getGPUObject().isAllocated())
-                       throw new DMLRuntimeException("Output is not 
allocated:" + output.getGPUObject().isAllocated());
+                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
+                       int status = cudnnPoolingForward(cudnnHandle, 
poolingDesc, alpha, xDesc, x, beta, yDesc, y);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
 
-               Pointer A = 
((JCudaObject)left.getGPUObject()).jcudaDenseMatrixPtr;
-               Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
+                       if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not 
executed cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       }
+               }
+               finally {
+                       long t3=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
+                       if(alpha != null)
+                               cudaFreeHelper(instName, alpha);
+                       if(beta != null)
+                               cudaFreeHelper(instName, beta);
 
-               JCublas2.cublasDsyrk(cublasHandle, 
cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, Pointer.to(alpha), A, lda, 
Pointer.to(beta), C, ldc);
-               copyUpperToLowerTriangle(output);
+                       if(yDesc != null)
+                               cudnnDestroyTensorDescriptor(yDesc);
+                       if(xDesc != null)
+                               cudnnDestroyTensorDescriptor(xDesc);
+                       if(poolingDesc != null)
+                               cudnnDestroyPoolingDescriptor(poolingDesc);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
+               }
        }
 
        /**
-        * Used for all version of TSMM where the result is known to be 
symmetric.
-        * Hence, we compute only the upper triangular matrix and copy this 
partial
-        * result down to lower triangular matrix once.
-        *
-        * @param ret upper triangular matrix
+        * Performs maxpoolingBackward on GPU by exploiting 
cudnnPoolingBackward(...)
+        * This method computes the backpropagation errors for previous layer 
of maxpooling operation
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param image image as matrix object
+        * @param dout                  delta matrix, output of previous layer
+        * @param outputBlock output matrix
+        * @param N                             batch size
+        * @param C                             number of channels
+        * @param H                             height of image
+        * @param W                             width of image
+        * @param K                             number of filters
+        * @param R                             height of filter
+        * @param S                             width of filter
+        * @param pad_h                 vertical padding
+        * @param pad_w                 horizontal padding
+        * @param stride_h              vertical stride
+        * @param stride_w              horizontal stride
+        * @param P                             (H - R + 1 + 2*pad_h)/stride_h
+        * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       private static void copyUpperToLowerTriangle(MatrixObject ret) throws 
DMLRuntimeException {
-               if(isInSparseFormat(ret)) {
-                       throw new DMLRuntimeException("Sparse GPU 
copyUpperToLowerTriangle is not implemented");
+       public static void maxpoolingBackward(String instName, MatrixObject 
image, MatrixObject dout,
+                                                                               
                                                                                
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+                                                                               
                                                                                
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+                                                                               
                                                                                
int Q) throws DMLRuntimeException {
+               if(isInSparseFormat(image)) {
+                       
((JCudaObject)image.getGPUObject()).sparseToDense(instName);
                }
-               if(ret.getNumRows() != ret.getNumColumns()) {
-                       throw new DMLRuntimeException("Only square matrix 
kernel is implemented for copyUpperToLowerTriangle");
+               if(isInSparseFormat(dout)) {
+                       
((JCudaObject)dout.getGPUObject()).sparseToDense(instName);
                }
-               int dim = (int) ret.getNumRows();
-               kernels.launchKernel("copyUpperToLowerTriangleDense",
-                                               
ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
-                                               
((JCudaObject)ret.getGPUObject()).jcudaDenseMatrixPtr, dim, dim*dim);
-       }
-
+               Pointer alpha = null;
+               Pointer beta = null;
+               Pointer y = null;
+               cudnnTensorDescriptor xDesc = null;
+               cudnnTensorDescriptor yDesc = null;
+               cudnnTensorDescriptor dyDesc = null;
+               cudnnTensorDescriptor dxDesc = null;
+               cudnnPoolingDescriptor poolingDesc = null;
 
-       //********************************************************************/
-       //***************** MATRIX MULTIPLY Functions ************************/
+               try {
+                       long t1=0, t2=0, t3=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                       // Allocate descriptors
+                       xDesc = allocateTensorDescriptor(N, C, H, W);
+                       yDesc = allocateTensorDescriptor(N, C, P, Q);
+                       dxDesc = allocateTensorDescriptor(N, C, H, W);
+                       dyDesc = allocateTensorDescriptor(N, C, P, Q);
+
+                       poolingDesc = allocatePoolingDescriptor(R, S, pad_h, 
pad_w, stride_h, stride_w);
+
+                       // Calling PoolForward first, y is one of the inputs 
for poolBackward
+                       // TODO: Remove calling poolForward after necessary 
changes at language level for poolBackward
+                       long numBytes = N*C*P*Q*Sizeof.DOUBLE;
+                       y = allocate(numBytes);
+
+                       // Allocate data
+                       Pointer x = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer dx = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer dy = 
((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr;
+
+                       alpha = pointerTo(1.0);
+                       beta = pointerTo(0.0f);
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
+                       int status = cudnnPoolingForward(cudnnHandle, 
poolingDesc, alpha, xDesc, x, beta, yDesc, y);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
+
+                       if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not 
executed cudnnPoolingForward before cudnnPoolingBackward: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       }
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
+                       status = cudnnPoolingBackward(cudnnHandle, poolingDesc, 
alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
+
+                       if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not 
executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       }
+               }
+               finally {
+                       long t4=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t4 = 
System.nanoTime();
+                       if(alpha != null)
+                               cudaFreeHelper(instName, alpha);
+                       if(beta != null)
+                               cudaFreeHelper(instName, beta);
+                       if(y != null)
+                               cudaFreeHelper(instName, y);
+
+                       if(yDesc != null)
+                               cudnnDestroyTensorDescriptor(yDesc);
+                       if(xDesc != null)
+                               cudnnDestroyTensorDescriptor(xDesc);
+                       if(dyDesc != null)
+                               cudnnDestroyTensorDescriptor(dyDesc);
+                       if(dxDesc != null)
+                               cudnnDestroyTensorDescriptor(dxDesc);
+                       if(poolingDesc != null)
+                               cudnnDestroyPoolingDescriptor(poolingDesc);
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
+               }
+       }
+
+
+       /**
+        * Performs the relu operation on the GPU.
+        * @param ec currently active {@link ExecutionContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in input matrix
+        * @param outputName    name of the output matrix
+        * @throws DMLRuntimeException  if an error occurs
+        */
+       public static void relu(ExecutionContext ec, String instName, 
MatrixObject in, String outputName) throws DMLRuntimeException {
+               if(isInSparseFormat(in)) {
+                       // TODO: FIXME: Implement sparse relu kernel
+                       
((JCudaObject)in.getGPUObject()).sparseToDense(instName);
+               }
+
+               cudnnTensorDescriptor srcTensorDesc = null;
+               cudnnTensorDescriptor dstTensorDesc = null;
+               Pointer alpha = null;
+               Pointer beta = null;
+
+               try {
+                       alpha = pointerTo(1.0f);
+                       beta = pointerTo(0.0f);
+                       long N = in.getNumRows();
+                       long H = in.getNumColumns();
+                       long W = 1;
+                       Pointer srcData = 
((JCudaObject)in.getGPUObject()).jcudaDenseMatrixPtr;
+
+                       MatrixObject output = ec.getMatrixObject(outputName);
+                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+                       Pointer dstData = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
+
+                       long t0=0;
+                       if(N*H*W >= numDoublesIn2GB) {
+                               // Invokes relu(double* A,  double* ret, int 
rlen, int clen)
+                               if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+                               kernels.launchKernel("relu",
+                                                               
ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int) (H*W)),
+                                                               srcData, 
dstData, (int)N, (int) H*W);
+                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
+                       }
+                       else {
+                               // Allocate descriptors
+                               srcTensorDesc = 
allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
+                               dstTensorDesc = 
allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
+                               cudnnActivationDescriptor activationDescriptor 
= new cudnnActivationDescriptor();
+                               
cudnnCreateActivationDescriptor(activationDescriptor);
+                               double dummy = -1;
+                               
cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, 
CUDNN_PROPAGATE_NAN, dummy);
+                               if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+                               cudnnActivationForward(cudnnHandle, 
activationDescriptor,
+                                                               alpha, 
srcTensorDesc, srcData,
+                                                               beta, 
dstTensorDesc, dstData);
+                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
+                       }
+               }
+               finally {
+                       long t1=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                       if(alpha != null)
+                               cudaFreeHelper(instName, alpha);
+                       if(beta != null)
+                               cudaFreeHelper(instName, beta);
+
+                       if(srcTensorDesc != null)
+                               cudnnDestroyTensorDescriptor(srcTensorDesc);
+                       if(dstTensorDesc != null)
+                               cudnnDestroyTensorDescriptor(dstTensorDesc);
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
+               }
+       }
+
+
+
+       //********************************************************************/
+       //************* End of DEEP LEARNING Operators ***********************/
+       //********************************************************************/
+
+
+
+       //********************************************************************/
+       //********** TRANSPOSE SELF MATRIX MULTIPLY Functions ****************/
+       //********************************************************************/
+
+       /**
+        * Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting 
cublasDsyrk(...)
+        *
+        * @param ec execution context
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param left input matrix, as in a tsmm expression like A %*% A' or 
A' %*% A, we just need to check whether the left one is transposed or not, I 
named it 'left'
+        * @param outputName output matrix name
+        * @param isLeftTransposed if true, left transposed
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public static void matmultTSMM(ExecutionContext ec, String instName, 
MatrixObject left, String outputName,
+                                                                               
                                                 boolean isLeftTransposed) 
throws DMLRuntimeException {
+               if(isInSparseFormat(left)) {
+                       // For sparse TSMM, invoke matmult (TODO: possible 
performance improvement)
+                       matmult(ec, instName, left, left, outputName, 
isLeftTransposed, !isLeftTransposed);
+                       return;
+               }
+
+               // For dense TSMM, exploit cublasDsyrk(...) and call custom 
kernel to flip the matrix
+               MatrixObject output = ec.getMatrixObject(outputName);
+               getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+
+               // Since CuBLAS expects inputs in column-major format,
+               // reverse the order of matrix-multiplication and take care of 
dimension mismatch.
+               int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : 
cublasOperation.CUBLAS_OP_T;
+               // Note: the dimensions are swapped
+               int m = (int) (isLeftTransposed ? left.getNumColumns() : 
left.getNumRows());
+               int k = (int) (isLeftTransposed ? left.getNumRows() : 
left.getNumColumns());
+
+               if(m == -1)
+                       throw new DMLRuntimeException("Incorrect dimensions");
+
+               double[] alpha = {1.0d};
+               double[] beta = {0.0d};
+
+               int lda = (int) (isLeftTransposed ? m : k);
+               int ldc = m;
+
+               if(!left.getGPUObject().isAllocated())
+                       throw new DMLRuntimeException("Input is not allocated:" 
+ left.getGPUObject().isAllocated());
+               if(!output.getGPUObject().isAllocated())
+                       throw new DMLRuntimeException("Output is not 
allocated:" + output.getGPUObject().isAllocated());
+
+               Pointer A = 
((JCudaObject)left.getGPUObject()).jcudaDenseMatrixPtr;
+               Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
+
+               long t0=0, t1=0;
+
+               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+               JCublas2.cublasDsyrk(cublasHandle, 
cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, Pointer.to(alpha), A, lda, 
Pointer.to(beta), C, ldc);
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, 
System.nanoTime() - t0);
+
+               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               copyUpperToLowerTriangle(output);
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - 
t1);
+       }
+
+       /**
+        * Used for all version of TSMM where the result is known to be 
symmetric.
+        * Hence, we compute only the upper triangular matrix and copy this 
partial
+        * result down to lower triangular matrix once.
+        *
+        * @param ret upper triangular matrix
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       private static void copyUpperToLowerTriangle(MatrixObject ret) throws 
DMLRuntimeException {
+               if(isInSparseFormat(ret)) {
+                       throw new DMLRuntimeException("Sparse GPU 
copyUpperToLowerTriangle is not implemented");
+               }
+               if(ret.getNumRows() != ret.getNumColumns()) {
+                       throw new DMLRuntimeException("Only square matrix 
kernel is implemented for copyUpperToLowerTriangle");
+               }
+               int dim = (int) ret.getNumRows();
+               kernels.launchKernel("copy_u2l_dense",
+                                               
ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
+                                               
((JCudaObject)ret.getGPUObject()).jcudaDenseMatrixPtr, dim, dim*dim);
+       }
+
+
+
+       //********************************************************************/
+       //******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/
+       //********************************************************************/
+
+       //********************************************************************/
+       //***************** MATRIX MULTIPLY Functions ************************/
        //********************************************************************/
 
        /**
@@ -581,16 +986,17 @@ public class LibMatrixCUDA {
         * Examines sparsity and shapes and routes call to appropriate method
         * from cuBLAS or cuSparse
         * C = op(A) x op(B)
-        * @param ec                                    Current {@link 
ExecutionContext} instance
-        * @param left1                                 Matrix A
-        * @param right1                                Matrix B
-        * @param outputName                    Name of the output matrix C (in 
code generated after LOP layer)
+        * @param ec                                                            
        Current {@link ExecutionContext} instance
+        * @param instName name of the invoking instruction to record {@link 
Statistics}.
+        * @param left1                                                         
Matrix A
+        * @param right1                                                        
Matrix B
+        * @param outputName                                    Name of the 
output matrix C (in code generated after LOP layer)
         * @param isLeftTransposed1             op for A, transposed or not
         * @param isRightTransposed1    op for B, transposed or not
         * @return      output of matrix multiply
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static MatrixObject matmult(ExecutionContext ec, MatrixObject 
left1, MatrixObject right1, String outputName,
+       public static MatrixObject matmult(ExecutionContext ec, String 
instName, MatrixObject left1, MatrixObject right1, String outputName,
                                                                                
                                                                 boolean 
isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
 
                if(!left1.getGPUObject().isAllocated() || 
!right1.getGPUObject().isAllocated())
@@ -603,17 +1009,17 @@ public class LibMatrixCUDA {
 
                if (bothDense) {                // Dense C = Dense A * Dense B
                        // For both dense, do cuBLAS
-                       ec.getDenseMatrixOutputForGPUInstruction(outputName);   
// Allocated the dense output matrix
-                       denseDenseMatmult(output, left1, right1, 
isLeftTransposed1, isRightTransposed1);
+                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+                       denseDenseMatmult(instName, output, left1, right1, 
isLeftTransposed1, isRightTransposed1);
                }
                else if (bothSparse){   // Sparse C = Sparse A * Sparse B
                        ec.allocateGPUMatrixObject(outputName);
-                       bothSparseMatmult(output, left1, right1, 
isLeftTransposed1, isRightTransposed1);
+                       bothSparseMatmult(instName, output, left1, right1, 
isLeftTransposed1, isRightTransposed1);
                }
                else {  // Either of A or B is sparse, Sparse C = Sparse/Dense 
A * Dense/Sparse B
                        // Convert the dense to sparse and use the 
cusparseDcsrgemm routine
                        ec.allocateGPUMatrixObject(outputName);
-                       eitherSparseMatmult(output, left1, right1, 
isLeftTransposed1, isRightTransposed1);
+                       eitherSparseMatmult(instName, output, left1, right1, 
isLeftTransposed1, isRightTransposed1);
                }
 
                return output;
@@ -622,6 +1028,7 @@ public class LibMatrixCUDA {
        /**
         * One of the matrices is sparse, the other dense
         * C = op(A) x op(B)
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param output                                allocated output object 
for C on host to which GPU output will be attached
         * @param left                                  Matrix A on host
         * @param right                                 Matrix B on host
@@ -629,7 +1036,7 @@ public class LibMatrixCUDA {
         * @param isRightTransposed             op for B, transposed or not
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void eitherSparseMatmult(MatrixObject output, 
MatrixObject left, MatrixObject right,
+       protected static void eitherSparseMatmult(String instName, MatrixObject 
output, MatrixObject left, MatrixObject right,
                                                                                
                                                                                
                boolean isLeftTransposed, boolean isRightTransposed) throws 
DMLRuntimeException {
 
                int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -648,10 +1055,10 @@ public class LibMatrixCUDA {
 
                if (left.getGPUObject().isInSparseFormat()) {
                        // Left sparse, right dense
-                       sparseDenseMatmult(output, left, right, 
isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
+                       sparseDenseMatmult(instName, output, left, right, 
isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
                } else {
                        // Left dense, right sparse
-                       denseSparseMatmult(output, right, left, 
isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
+                       denseSparseMatmult(instName, output, right, left, 
isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
                }
        }
 
@@ -659,6 +1066,7 @@ public class LibMatrixCUDA {
         * C = op(A) * op(B) where A is dense and B is sparse
         * If B is ultrasparse, A is converted to a sparse matrix and {@code 
sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, 
CSRPointer)} is invoked
         * otherwise B is converted to a dense matrix and {@code 
denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, 
Pointer)} is invoked.
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param output ?
         * @param right ?
         * @param left ?
@@ -671,7 +1079,7 @@ public class LibMatrixCUDA {
         * @param k ?
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void denseSparseMatmult(MatrixObject output, 
MatrixObject right, MatrixObject left,
+       protected static void denseSparseMatmult(String instName, MatrixObject 
output, MatrixObject right, MatrixObject left,
                                                                                
                                                                                
         boolean isLeftTransposed, boolean isRightTransposed, int transA, int 
transB, int m, int n, int k)
                                        throws DMLRuntimeException {
                // right sparse, left dense
@@ -680,30 +1088,51 @@ public class LibMatrixCUDA {
                if (B.isUltraSparse(k, n)){
                        LOG.debug(" GPU Dense-Sparse Matrix Multiplication 
(Converted to Sparse-Sparse)");
                        // Convert left to CSR and do cuSparse matmul
-                       long t0 = System.nanoTime();
                        int rowsA = (int)left.getNumRows();
                        int colsA = (int)left.getNumColumns();
+
+
+                       long t1=0, t2=0;
+                       long t0 = System.nanoTime();
                        Pointer AT = JCudaObject.transpose(ADense, rowsA, 
colsA, colsA, rowsA);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
                        CSRPointer A = 
JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsA, colsA, AT);
-                       
Statistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
-                       Statistics.cudaSparseConversionCount.addAndGet(1);
-                       sparseSparseMatmult(output, transA, transB, m, n, k, A, 
B);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
+
+                       
GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
+                       GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
+                       sparseSparseMatmult(instName, output, transA, transB, 
m, n, k, A, B);
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
                        A.deallocate();
                        cudaFreeHelper(AT);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
+
                } else {
                        LOG.debug(" GPU Dense-Sparse Matrix Multiplication 
(Converted to Dense-Dense)");
                        // Convert right to dense and do a cuBlas matmul
                        // BDenseTransposed is a column major matrix
                        // Note the arguments to denseDenseMatmult to 
accommodate for this.
+                       long t0=0, t1=0, t2=0;
+                       if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
                        Pointer BDenseTransposed = 
B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, 
(int)right.getNumRows(), (int)right.getNumColumns());
-                       output.getGPUObject().acquireDeviceModifyDense();       
// To allocate the dense matrix
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
+                       
GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
+                       
GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
+
+                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                       boolean allocated = 
output.getGPUObject().acquireDeviceModifyDense();   // To allocate the dense 
matrix
+                       if (allocated) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
                        Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
-                       denseDenseMatmult(C,
+                       denseDenseMatmult(instName, C,
                                                        (int) 
left.getNumRows(), (int) left.getNumColumns(),
                                                        (int) 
right.getNumColumns(), (int) right.getNumRows(),
                                                        isLeftTransposed, 
!isRightTransposed,
                                                        ADense, 
BDenseTransposed);
-                       cudaFreeHelper(BDenseTransposed);
+
+                       cudaFreeHelper(instName, BDenseTransposed);
                }
        }
 
@@ -711,6 +1140,7 @@ public class LibMatrixCUDA {
         * * C = op(A) * op(B) where A is sparse and B is dense
         * If A is ultrasparse, B is converted to a sparse matrix and {@code 
sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, 
CSRPointer)} is invoked
         * otherwise A is converted to a dense matrix and {@code 
denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, 
Pointer)} is invoked.
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param output ?
         * @param left ?
         * @param right ?
@@ -723,7 +1153,7 @@ public class LibMatrixCUDA {
         * @param k ?
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void sparseDenseMatmult(MatrixObject output, 
MatrixObject left, MatrixObject right,
+       protected static void sparseDenseMatmult(String instName, MatrixObject 
output, MatrixObject left, MatrixObject right,
                                                                                
                                                                                
         boolean isLeftTransposed, boolean isRightTransposed, int transA, int 
transB, int m, int n, int k)
                                        throws DMLRuntimeException {
                CSRPointer A = 
((JCudaObject)left.getGPUObject()).jcudaSparseMatrixPtr;
@@ -732,37 +1162,59 @@ public class LibMatrixCUDA {
                if (n == 1){
                        // Sparse Matrix - Dense Vector multiply
                        LOG.debug(" GPU Sparse Matrix - Dense Vector Mutliply");
-                       sparseMatrixDenseVectorMult(output, A, BDense, transA, 
(int)left.getNumRows(), (int)left.getNumColumns());
+                       sparseMatrixDenseVectorMult(instName, output, A, 
BDense, transA, (int)left.getNumRows(), (int)left.getNumColumns());
 
                } else {
+
+                       long t0=0, t1=0, t2=0;
                        // Sparse Matrix Dense Matrix multiply
                        if (A.isUltraSparse(m, k)){
                                LOG.debug(" GPU Sparse-Dense Matrix 
Multiplication (Converted to Sparse-Sparse)");
                                // Convert right to CSR and do cuSparse matmul
-                               long t0 = System.nanoTime();
                                int rowsB = (int)right.getNumRows();
                                int colsB = (int)right.getNumColumns();
+
+                               if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
                                Pointer BT = JCudaObject.transpose(BDense, 
rowsB, colsB, colsB, rowsB);
+                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
+
+                               if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
                                CSRPointer B = 
JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsB, colsB, BT);
-                               
Statistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
-                               
Statistics.cudaSparseConversionCount.addAndGet(1);
-                               sparseSparseMatmult(output, transA, transB, m, 
n, k, A, B);
+                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
+
+                               
GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
+                               
GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
+
+                               sparseSparseMatmult(instName, output, transA, 
transB, m, n, k, A, B);
+
+                               if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
                                B.deallocate();
                                cudaFreeHelper(BT);
+                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
+
                        } else {
                                LOG.debug(" GPU Sparse-Dense Matrix 
Multiplication (Converted to Dense-Dense)");
                                // Convert left to dense and do a cuBlas matmul
                                // ADenseTransposed is a column major matrix
                                // Note the arguments to denseDenseMatmult to 
accommodate for this.
+                               if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
                                Pointer ADenseTransposed = 
A.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, 
(int)left.getNumRows(), (int)left.getNumColumns());
-                               
output.getGPUObject().acquireDeviceModifyDense();       // To allocate the 
dense matrix
+                               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
+                               
GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
+                               
GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
+
+                               if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
+                               boolean allocated = 
output.getGPUObject().acquireDeviceModifyDense();   // To allocate the dense 
matrix
+                               if (allocated) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
+
                                Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
-                               denseDenseMatmult(C,
+                               denseDenseMatmult(instName, C,
                                                                (int) 
left.getNumColumns(), (int) left.getNumRows(),
                                                                (int) 
right.getNumRows(), (int) right.getNumColumns(),
                                                                
!isLeftTransposed, isRightTransposed,
                                                                
ADenseTransposed, BDense);
-                               cudaFreeHelper(ADenseTransposed);
+
+                               cudaFreeHelper(instName, ADenseTransposed);
                        }
                }
        }
@@ -770,6 +1222,7 @@ public class LibMatrixCUDA {
        /**
         * C = op(A) x B
         * A is a sparse matrix, B is a dense vector
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param output        allocated output on the host, to which the GPU 
output C will be attached
         * @param A                     sparse matrix A on the GPU
         * @param B_dense       dense matrix/vector B on the GPU
@@ -778,17 +1231,23 @@ public class LibMatrixCUDA {
         * @param k                     number of cols in A or number of rows 
in B (not op(A) or op(B))
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void sparseMatrixDenseVectorMult(MatrixObject output, 
CSRPointer A, Pointer B_dense, int transA,
+       protected static void sparseMatrixDenseVectorMult(String instName, 
MatrixObject output, CSRPointer A, Pointer B_dense, int transA,
                                                                                
                                                                                
                                                int m, int k) throws 
DMLRuntimeException {
                long size = m * Sizeof.DOUBLE;
                if (transA == CUSPARSE_OPERATION_TRANSPOSE){
                        size = k * Sizeof.DOUBLE;
                }
-               Pointer C_dense = JCudaObject.allocate((int)size);
+               Pointer C_dense = JCudaObject.allocate(instName, (int)size);
+
                double[] alpha = { 1 };
                double[] beta = { 0 };
+
+               long t1=0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                cusparseDcsrmv(cusparseHandle, transA, m, k, (int)A.nnz, 
Pointer.to(alpha), A.descr, A.val, A.rowPtr, A.colInd, B_dense, 
Pointer.to(beta), C_dense);
                cudaDeviceSynchronize();        // Since cusparseDcsrmv is 
asynchronously executed
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - 
t1);
+
                
((JCudaObject)(output.getGPUObject())).setDenseMatrixCudaPointer(C_dense);
                output.getGPUObject().setDeviceModify(size);
        }
@@ -796,14 +1255,16 @@ public class LibMatrixCUDA {
        /**
         * Sparse C = Sparse op(A) * Sparse op(B)
         * Reroutes call to sparse matrix-vector mult if needed
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param output ?
         * @param left ?
         * @param right ?
         * @param isLeftTransposed ?
         * @param isRightTransposed ?
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void bothSparseMatmult(MatrixObject output, 
MatrixObject left, MatrixObject right,
+       protected static void bothSparseMatmult(String instName, MatrixObject 
output, MatrixObject left, MatrixObject right,
                                                                                
                                                                                
        boolean isLeftTransposed, boolean isRightTransposed) throws 
DMLRuntimeException {
 
                int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -825,15 +1286,16 @@ public class LibMatrixCUDA {
                // TODO if (m == 1) {   // Vector-matrix multiplication
 
                if (!isRightTransposed && right.getNumColumns() == 1){  // 
Matrix-Vector multiplication
-                       sparseMatrixVectorMult(output, transA, 
(int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, 
B);
+                       sparseMatrixVectorMult(instName, output, transA, 
(int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, 
B);
                } else {                                                        
                                        // Matrix-Matrix multiplication
-                       sparseSparseMatmult(output, transA, transB, m, n, k, A, 
B);
+                       sparseSparseMatmult(instName, output, transA, transB, 
m, n, k, A, B);
                }
        }
 
        /**
         * Does a sparse matrix-vector multiply.
         * C = op(A) x B, A is a sparse matrix, B is a sparse vector with 
numCols = 1.
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param output        allocated output object C to which the GPU 
output matrix will be attached
         * @param transA        if A is to be transposed or not (the op in 
op(A))
         * @param m                     number of rows in A (not op(A))
@@ -843,16 +1305,20 @@ public class LibMatrixCUDA {
         * @param B                     right sparse vector on GPU
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void sparseMatrixVectorMult(MatrixObject output, int 
transA, int m, int n, int k,
+       protected static void sparseMatrixVectorMult(String instName, 
MatrixObject output, int transA, int m, int n, int k,
                                                                                
                                                                                
                         CSRPointer A, CSRPointer B) throws DMLRuntimeException 
{
                LOG.debug(" GPU Sparse Matrix Sparse Vector Multiply (Converted 
to Sparse Matrix Dense Vector Multiply)");
+               long t0=0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
                Pointer BDenseVector = 
B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, k, 1);
-               sparseMatrixDenseVectorMult(output, A, BDenseVector, transA, m, 
k);
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
+               sparseMatrixDenseVectorMult(instName, output, A, BDenseVector, 
transA, m, k);
        }
 
        /**
         * Does a sparse-sparse Matrix multiply
         * C = op(A) x op(B), A, B are sparse matrices
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param output        allocated output object on host to which the 
GPU output matrix will be attached
         * @param transA        op for A - to be transposed or not
         * @param transB        op for B
@@ -863,25 +1329,32 @@ public class LibMatrixCUDA {
         * @param B                     right sparse matrix on GPU
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void sparseSparseMatmult(MatrixObject output, int 
transA, int transB, int m, int n, int k,
+       protected static void sparseSparseMatmult(String instName, MatrixObject 
output, int transA, int transB, int m, int n, int k,
                                                                                
                                                                                
                CSRPointer A, CSRPointer B) throws DMLRuntimeException {
                LOG.debug(" GPU Sparse-Sparse Matrix Multiply ");
 
+               long t0=0, t1=0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
                CSRPointer C = 
CSRPointer.allocateForMatrixMultiply(cusparseHandle, A, transA, B, transB, m, 
n, k);
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
+
                
((JCudaObject)output.getGPUObject()).setSparseMatrixCudaPointer(C);
                long sizeOfC = CSRPointer.estimateSize(C.nnz, 
output.getNumRows());
                output.getGPUObject().setDeviceModify(sizeOfC);
 
+               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                cusparseDcsrgemm(cusparseHandle, transA, transB, m, n, k,
                                                A.descr, (int)A.nnz, A.val, 
A.rowPtr, A.colInd,
                                                B.descr, (int)B.nnz, B.val, 
B.rowPtr, B.colInd,
                                                C.descr, C.val, C.rowPtr, 
C.colInd);
                cudaDeviceSynchronize();
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - 
t1);
        }
 
        /**
         * Dense dense matrix multiply
         * C = op(A) * op(B), A and B are dense matrices
+        * @param instName name of the invoking instruction to record {@link Statistics}.
         * @param output                                output object C on host 
with GPU data allocated
         * @param left1                                 left matrix A on host 
(in row-major order)
         * @param right1                                right matrix B on host 
(in row-major order)
@@ -889,7 +1362,7 @@ public class LibMatrixCUDA {
         * @param isRightTransposed1    op for B, transposed or not
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void denseDenseMatmult(MatrixObject output, 
MatrixObject left1, MatrixObject right1,
+       protected static void denseDenseMatmult(String instName, MatrixObject 
output, MatrixObject left1, MatrixObject right1,
                                                                                
                                                                                
        boolean isLeftTransposed1, boolean isRightTransposed1) throws 
DMLRuntimeException {
 
                Pointer leftPtr = 
((JCudaObject)left1.getGPUObject()).jcudaDenseMatrixPtr;
@@ -900,7 +1373,7 @@ public class LibMatrixCUDA {
                int rightRows = (int) right1.getNumRows();
                int rightCols = (int) right1.getNumColumns();
                Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
-               denseDenseMatmult(C, leftRows, leftCols, rightRows, rightCols, 
isLeftTransposed1, isRightTransposed1,
+               denseDenseMatmult(instName, C, leftRows, leftCols, rightRows, 
rightCols, isLeftTransposed1, isRightTransposed1,
                                                leftPtr, rightPtr);
        }
 
@@ -912,6 +1385,7 @@ public class LibMatrixCUDA {
         * We do t(B) %*% t(A) to get t(C);
         * If we were to calculate t(t(C), we would get the resultant matrix C, 
but this would be in column-major format.
         * What we really want is t(C). This we already have as the result of 
t(B) %*% t(A).
+        * @param instName name of the invoking instruction to record {@link Statistics}.
         * @param output                        output allocated on GPU in 
column major format
         * @param leftRows1                     number of rows in A
         * @param leftCols1                     number of cols in A
@@ -923,7 +1397,7 @@ public class LibMatrixCUDA {
         * @param rightPtr                      B allocated on the GPU in 
row-major format
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void denseDenseMatmult(Pointer output, int leftRows1, int 
leftCols1, int rightRows1,
+       public static void denseDenseMatmult(String instName, Pointer output, 
int leftRows1, int leftCols1, int rightRows1,
                                                                                
                                                                         int 
rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer 
leftPtr, Pointer rightPtr)
                                        throws DMLRuntimeException {
 
@@ -961,6 +1435,8 @@ public class LibMatrixCUDA {
                int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : 
cublasOperation.CUBLAS_OP_N;
                int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : 
cublasOperation.CUBLAS_OP_N;
 
+               long t0=0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
                Pointer C = output;
                if (m == 1 && n == 1){
                        // Vector product
@@ -972,18 +1448,22 @@ public class LibMatrixCUDA {
                        // The result is copied from the host back to the 
device so that the rest of
                        // infrastructure can treat it uniformly.
                        cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, 
cudaMemcpyHostToDevice);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0);
                } else if (m == 1) {
                        // Vector-matrix multiply
                        LOG.debug(" GPU Dense Vector-Matrix Multiply");
                        transb = isRightTransposed ? 
cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
                        JCublas2.cublasDgemv(cublasHandle, transb, rightRows, 
rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - 
t0);
                } else if (n == 1){
                        // Matrix-vector multiply
                        LOG.debug(" GPU Dense Matrix-Vector Multiply");
                        JCublas2.cublasDgemv(cublasHandle, transa, leftRows, 
leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - 
t0);
                } else {
                        LOG.debug(" GPU Dense-Dense Matrix Multiply ");
                        JCublas2.cublasDgemm(cublasHandle, transa, transb, m, 
n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - 
t0);
                }
        }
 
@@ -992,7 +1472,6 @@ public class LibMatrixCUDA {
        //********************************************************************/
 
 
-
        //********************************************************************/
        //****************  UNARY AGGREGATE Functions ************************/
        //********************************************************************/
@@ -1002,12 +1481,13 @@ public class LibMatrixCUDA {
         * Entry point to perform Unary aggregate operations on the GPU.
         * The execution context object is used to allocate memory for the GPU.
         * @param ec                    Instance of {@link ExecutionContext}, 
from which the output variable will be allocated
+        * @param instName name of the invoking instruction to record {@link Statistics}.
         * @param in1                   input matrix
         * @param output        output matrix/scalar name
         * @param op                    Instance of {@link 
AggregateUnaryOperator} which encapsulates the direction of 
reduction/aggregation and the reduction operation.
         * @throws DMLRuntimeException if {@link DMLRuntimeException} occurs
         */
-       public static void unaryAggregate(ExecutionContext ec, MatrixObject 
in1, String output, AggregateUnaryOperator op)
+       public static void unaryAggregate(ExecutionContext ec, String instName, 
MatrixObject in1, String output, AggregateUnaryOperator op)
                                        throws DMLRuntimeException {
 
                final int REDUCTION_ALL = 1;
@@ -1087,7 +1567,7 @@ public class LibMatrixCUDA {
                if (isSparse){
                        // The strategy for the time being is to convert sparse 
to dense
                        // until a sparse specific kernel is written.
-                       ((JCudaObject)in1.getGPUObject()).sparseToDense();
+                       
((JCudaObject)in1.getGPUObject()).sparseToDense(instName);
                        // long nnz = in1.getNnz();
                        // assert nnz > 0 : "Internal Error - number of non 
zeroes set to " + nnz + " in Aggregate Binary for GPU";
                        // MatrixObject out = 
ec.getSparseMatrixOutputForGPUInstruction(output, nnz);
@@ -1098,28 +1578,29 @@ public class LibMatrixCUDA {
                Pointer out = null;
                if (reductionDirection == REDUCTION_COL || reductionDirection 
== REDUCTION_ROW) {
                        // Matrix output
-                       MatrixObject out1 = 
ec.getDenseMatrixOutputForGPUInstruction(output);
+                       MatrixObject out1 = 
getDenseMatrixOutputForGPUInstruction(ec, instName, output);
                        out = ((JCudaObject) 
out1.getGPUObject()).jcudaDenseMatrixPtr;
                }
 
                Pointer in = 
((JCudaObject)in1.getGPUObject()).jcudaDenseMatrixPtr;
                int size = rlen * clen;
 
+               long t0=0;
                // For scalars, set the scalar output in the Execution Context 
object
                switch (opIndex){
                        case OP_PLUS: {
                                switch(reductionDirection) {
                                        case REDUCTION_ALL : {
-                                               double result = 
reduceAll("reduce_sum", in, size);
+                                               double result = 
reduceAll(instName, "reduce_sum", in, size);
                                                ec.setScalarOutput(output, new 
DoubleObject(result));
                                                break;
                                        }
                                        case REDUCTION_COL : {  // The names 
are a bit misleading, REDUCTION_COL refers to the direction (reduce all 
elements in a column)
-                                               reduceRow("reduce_row_sum", in, 
out, rlen, clen);
+                                               reduceRow(instName, 
"reduce_row_sum", in, out, rlen, clen);
                                                break;
                                        }
                                        case REDUCTION_ROW : {
-                                               reduceCol("reduce_col_sum", in, 
out, rlen, clen);
+                                               reduceCol(instName, 
"reduce_col_sum", in, out, rlen, clen);
                                                break;
                                        }
                                        case REDUCTION_DIAG :
@@ -1129,43 +1610,45 @@ public class LibMatrixCUDA {
                        }
                        case OP_PLUS_SQ : {
                                // Calculate the squares in a temporary object 
tmp
-                               Pointer tmp = JCudaObject.allocate(size * 
Sizeof.DOUBLE);
-                               squareMatrix(in, tmp, rlen, clen);
+                               if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+                               Pointer tmp = JCudaObject.allocate(instName, 
size * Sizeof.DOUBLE);
+
+                               squareMatrix(instName, in, tmp, rlen, clen);
                                // Then do the sum on the temporary object and 
free it
                                switch(reductionDirection) {
                                        case REDUCTION_ALL : {
-                                               double result = 
reduceAll("reduce_sum", tmp, size);
+                                               double result = 
reduceAll(instName, "reduce_sum", tmp, size);
                                                ec.setScalarOutput(output, new 
DoubleObject(result));
                                                break;
                                        }
                                        case REDUCTION_COL : {  // The names 
are a bit misleading, REDUCTION_COL refers to the direction (reduce all 
elements in a column)
-                                               reduceRow("reduce_row_sum", 
tmp, out, rlen, clen);
+                                               reduceRow(instName, 
"reduce_row_sum", tmp, out, rlen, clen);
                                                break;
                                        }
                                        case REDUCTION_ROW : {
-                                               reduceCol("reduce_col_sum", 
tmp, out, rlen, clen);
+                                               reduceCol(instName, 
"reduce_col_sum", tmp, out, rlen, clen);
                                                break;
                                        }
                                        default:
                                                throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
summation squared");
                                }
-                               cudaFreeHelper(tmp);
+                               cudaFreeHelper(instName, tmp);
                                break;
                        }
                        case OP_MEAN:{
                                switch(reductionDirection) {
                                        case REDUCTION_ALL: {
-                                               double result = 
reduceAll("reduce_sum", in, size);
+                                               double result = 
reduceAll(instName, "reduce_sum", in, size);
                                                double mean = result / size;
                                                ec.setScalarOutput(output, new 
DoubleObject(mean));
                                                break;
                                        }
                                        case REDUCTION_COL: {
-                                               reduceRow("reduce_row_mean", 
in, out, rlen, clen);
+                                               reduceRow(instName, 
"reduce_row_mean", in, out, rlen, clen);
                                                break;
                                        }
                                        case REDUCTION_ROW: {
-                                               reduceCol("reduce_col_mean", 
in, out, rlen, clen);
+                                               reduceCol(instName, 
"reduce_col_mean", in, out, rlen, clen);
                                                break;
                                        }
                                        default:
@@ -1176,7 +1659,7 @@ public class LibMatrixCUDA {
                        case OP_MULTIPLY : {
                                switch (reductionDirection) {
                                        case REDUCTION_ALL: {
-                                               double result = 
reduceAll("reduce_prod", in, size);
+                                               double result = 
reduceAll(instName, "reduce_prod", in, size);
                                                ec.setScalarOutput(output, new 
DoubleObject(result));
                                                break;
                                        }
@@ -1188,16 +1671,16 @@ public class LibMatrixCUDA {
                        case OP_MAX :{
                                switch(reductionDirection) {
                                        case REDUCTION_ALL: {
-                                               double result = 
reduceAll("reduce_max", in, size);
+                                               double result = 
reduceAll(instName, "reduce_max", in, size);
                                                ec.setScalarOutput(output, new 
DoubleObject(result));
                                                break;
                                        }
                                        case REDUCTION_COL: {
-                                               reduceRow("reduce_row_max", in, 
out, rlen, clen);
+                                               reduceRow(instName, 
"reduce_row_max", in, out, rlen, clen);
                                                break;
                                        }
                                        case REDUCTION_ROW: {
-                                               reduceCol("reduce_col_max", in, 
out, rlen, clen);
+                                               reduceCol(instName, 
"reduce_col_max", in, out, rlen, clen);
                                                break;
                                        }
                                        default:
@@ -1208,16 +1691,16 @@ public class LibMatrixCUDA {
                        case OP_MIN :{
                                switch(reductionDirection) {
                                        case REDUCTION_ALL: {
-                                               double result = 
reduceAll("reduce_min", in, size);
+                                               double result = 
reduceAll(instName, "reduce_min", in, size);
                                                ec.setScalarOutput(output, new 
DoubleObject(result));
                                                break;
                                        }
                                        case REDUCTION_COL: {
-                                               reduceRow("reduce_row_min", in, 
out, rlen, clen);
+                                               reduceRow(instName, 
"reduce_row_min", in, out, rlen, clen);
                                                break;
                                        }
                                        case REDUCTION_ROW: {
-                                               reduceCol("reduce_col_min", in, 
out, rlen, clen);
+                                               reduceCol(instName, 
"reduce_col_min", in, out, rlen, clen);
                                                break;
                                        }
                                        default:
@@ -1227,66 +1710,69 @@ public class LibMatrixCUDA {
                        }
                        case OP_VARIANCE : {
                                // Temporary GPU array for
-                               Pointer tmp = JCudaObject.allocate(size * 
Sizeof.DOUBLE);
-                               Pointer tmp2 = JCudaObject.allocate(size * 
Sizeof.DOUBLE);
+                               if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+                               Pointer tmp = JCudaObject.allocate(instName, 
size * Sizeof.DOUBLE);
+                               Pointer tmp2 = JCudaObject.allocate(instName, 
size * Sizeof.DOUBLE);
 
                                switch(reductionDirection) {
 
                                        case REDUCTION_ALL: {
-                                               double result = 
reduceAll("reduce_sum", in, size);
+                                               double result = 
reduceAll(instName, "reduce_sum", in, size);
                                                double mean = result / size;
 
                                                // Subtract mean from every 
element in the matrix
                                                ScalarOperator minusOp = new 
RightScalarOperator(Minus.getMinusFnObject(), mean);
-                                               matrixScalarOp(in, mean, rlen, 
clen, tmp, minusOp);
+                                               matrixScalarOp(instName, in, 
mean, rlen, clen, tmp, minusOp);
 
-                                               squareMatrix(tmp, tmp2, rlen, 
clen);
+                                               squareMatrix(instName, tmp, 
tmp2, rlen, clen);
 
-                                               double result2 = 
reduceAll("reduce_sum", tmp2, size);
+                                               double result2 = 
reduceAll(instName, "reduce_sum", tmp2, size);
                                                double variance = result2 / 
(size - 1);
                                                ec.setScalarOutput(output, new 
DoubleObject(variance));
 
                                                break;
                                        }
                                        case REDUCTION_COL: {
-                                               reduceRow("reduce_row_mean", 
in, out, rlen, clen);
+                                               reduceRow(instName, 
"reduce_row_mean", in, out, rlen, clen);
                                                // Subtract the row-wise mean 
from every element in the matrix
                                                BinaryOperator minusOp = new 
BinaryOperator(Minus.getMinusFnObject());
-                                               matrixMatrixOp(in, out, rlen, 
clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
+                                               matrixMatrixOp(instName, in, 
out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, 
minusOp);
 
-                                               squareMatrix(tmp, tmp2, rlen, 
clen);
+                                               squareMatrix(instName, tmp, 
tmp2, rlen, clen);
 
-                                               Pointer tmpRow = 
JCudaObject.allocate(rlen * Sizeof.DOUBLE);
-                                               reduceRow("reduce_row_sum", 
tmp2, tmpRow, rlen, clen);
+                                               Pointer tmpRow = 
JCudaObject.allocate(instName, rlen * Sizeof.DOUBLE);
+                                               reduceRow(instName, 
"reduce_row_sum", tmp2, tmpRow, rlen, clen);
 
                                                ScalarOperator divideOp = new 
RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
-                                               matrixScalarOp(tmpRow, clen - 
1, rlen, clen, out, divideOp);
+                                               matrixScalarOp(instName, 
tmpRow, clen - 1, rlen, clen, out, divideOp);
+
+                                               cudaFreeHelper(instName, 
tmpRow);
 
-                                               cudaFreeHelper(tmpRow);
                                                break;
                                        }
                                        case REDUCTION_ROW: {
-                                               reduceCol("reduce_col_mean", 
in, out, rlen, clen);
+                                               reduceCol(instName, 
"reduce_col_mean", in, out, rlen, clen);
                                                // Subtract the columns-wise 
mean from every element in the matrix
                                                BinaryOperator minusOp = new 
BinaryOperator(Minus.getMinusFnObject());
-                                               matrixMatrixOp(in, out, rlen, 
clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
+                                               matrixMatrixOp(instName, in, 
out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
 
-                                               squareMatrix(tmp, tmp2, rlen, 
clen);
+                                               squareMatrix(instName, tmp, 
tmp2, rlen, clen);
 
-                                               Pointer tmpCol = 
JCudaObject.allocate(clen * Sizeof.DOUBLE);
-                                               reduceCol("reduce_col_sum", 
tmp2, tmpCol, rlen, clen);
+                                               Pointer tmpCol = 
JCudaObject.allocate(instName, clen * Sizeof.DOUBLE);
+                                               reduceCol(instName, 
"reduce_col_sum", tmp2, tmpCol, rlen, clen);
 
                                                ScalarOperator divideOp = new 
RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
-                                               matrixScalarOp(tmpCol, rlen - 
1, rlen, clen, out, divideOp);
+                                               matrixScalarOp(instName, 
tmpCol, rlen - 1, rlen, clen, out, divideOp);
+
+                                               cudaFreeHelper(instName, 
tmpCol);
 
-                                               cudaFreeHelper(tmpCol);
                                                break;
                                        }
                                        default:
                                                throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
variance");
                                }
-                               cudaFreeHelper(tmp);
-                               cudaFreeHelper(tmp2);
+                               cudaFreeHelper(instName, tmp);
+                               cudaFreeHelper(instName, tmp2);
                                break;
                        }
                        case OP_MAXINDEX : {
@@ -1313,15 +1799,16 @@ public class LibMatrixCUDA {
 
        /**
         * Helper method to square a matrix in GPU memory
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param in            input matrix on GPU
         * @param out           output matrix on GPU
         * @param rlen  row length
         * @param clen  column length
         * @throws DMLRuntimeException
         */
-       private static void squareMatrix(Pointer in, Pointer out, int rlen, int 
clen) throws DMLRuntimeException {
+       private static void squareMatrix(String instName, Pointer in, Pointer 
out, int rlen, int clen) throws DMLRuntimeException {
                ScalarOperator power2op = new 
RightScalarOperator(Power.getPowerFnObject(), 2);
-               matrixScalarOp(in, 2, rlen, clen, out, power2op);
+               matrixScalarOp(instName, in, 2, rlen, clen, out, power2op);
        }
 
        /**
@@ -1332,26 +1819,36 @@ public class LibMatrixCUDA {
         * @return      the reduced value
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       private static double reduceAll(String kernelFunction, Pointer in, int 
n) throws DMLRuntimeException {
+       private static double reduceAll(String instName, String kernelFunction, 
Pointer in, int n) throws DMLRuntimeException {
                int[] tmp = getKernelParamsForReduceAll(n);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
-               Pointer tempOut = JCudaObject.allocate(n * Sizeof.DOUBLE);
-               kernels.launchKernel(kernelFunction, new 
ExecutionConfig(blocks, threads, sharedMem),
-                                               in, tempOut, n);
+               Pointer tempOut = JCudaObject.allocate(instName, n * 
Sizeof.DOUBLE);
+
+               long t1=0,t2=0,t3=0;
+
+               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               kernels.launchKernel(kernelFunction, new 
ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
                cudaDeviceSynchronize();
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1);
+
                int s = blocks;
                while (s > 1) {
                        tmp = getKernelParamsForReduceAll(s);
                        blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
+                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
                        kernels.launchKernel(kernelFunction, new 
ExecutionConfig(blocks, threads, sharedMem),
                                                        tempOut, tempOut, s);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
                        s = (s + (threads*2-1)) / (threads*2);
                }
                double[] result = {-1f};
+
+               if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
                cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, 
cudaMemcpyDeviceToHost);
-               cudaFreeHelper(tempOut);
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t3);
 
+               cudaFreeHelper(instName, tempOut);
                return result[0];
        }
 
@@ -1365,12 +1862,17 @@ public class LibMatrixCUDA {
         * @param cols                                          number of 
columns in input matrix
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       private static void reduceRow(String kernelFunction, Pointer in, 
Pointer out, int rows, int cols) throws DMLRuntimeException {
+       private static void reduceRow(String instName, String kernelFunction, 
Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
                int[] tmp = getKernelParamsForReduceByRow(rows, cols);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
+
+               long t0=0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
                kernels.launchKernel(kernelFunction, new 
ExecutionConfig(blocks, threads, sharedMem),
                                                in, out, rows, cols);
                cudaDeviceSynchronize();
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0);
+
        }
 
        /**
@@ -1383,383 +1885,138 @@ public class LibMatrixCUDA {
         * @param cols                                          number of 
columns in input matrix
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       private static void reduceCol(String kernelFunction, Pointer in, 
Pointer out, int rows, int cols) throws DMLRuntimeException {
+       private static void reduceCol(String instName, String kernelFunction, 
Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
                int[] tmp = getKernelParamsForReduceByCol(rows, cols);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
+
+               long t0=0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
                kernels.launchKernel(kernelFunction, new 
ExecutionConfig(blocks, threads, sharedMem),
                                                in, out, rows, cols);
                cudaDeviceSynchronize();
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0);
        }
 
-       /**
-        * Get threads, blocks and shared memory for a reduce all operation
-        * @param n size of input array
-        * @return integer array containing {blocks, threads, shared memory}
-        */
-       private static int[] getKernelParamsForReduceAll(int n) throws 
DMLRuntimeException{
-               final int MAX_THREADS = getMaxThreads();
-               final int MAX_BLOCKS = getMaxBlocks();
-               final int WARP_SIZE = getWarpSize();
-               int threads = (n < MAX_THREADS *2) ? nextPow2((n + 1)/ 2) : 
MAX_THREADS;
-
-               int blocks = (n + (threads * 2 - 1)) / (threads * 2);
-               blocks = Math.min(MAX_BLOCKS, blocks);
-
-               int sharedMemSize = threads * Sizeof.DOUBLE;
-               if (threads <= WARP_SIZE){
-                       sharedMemSize *= 2;
-               }
-               return new int[] {blocks, threads, sharedMemSize};
-       }
-
-       /**
-        * Get threads, blocks and shared memory for a reduce by row operation
-        * @param rows number of rows in input matrix
-        * @param cols number of columns in input matrix
-        * @return integer array containing {blocks, threads, shared memory}
-        */
-       private static int[] getKernelParamsForReduceByRow(int rows, int cols) 
throws DMLRuntimeException {
-               final int WARP_SIZE = getWarpSize();
-               final int MAX_THREADS = getMaxThreads();
-               int threads = (cols < MAX_THREADS *2) ? nextPow2((cols + 1)/ 2) 
: MAX_THREADS;
-               int blocks = rows;
-               int sharedMemSize = threads * Sizeof.DOUBLE;
-               if (threads <= WARP_SIZE){
-                       sharedMemSize *=2;
-               }
-               return new int[] {blocks, threads, sharedMemSize};
-       }
-
-       private static int[] getKernelParamsForReduceByCol(int rows, int cols) 
throws DMLRuntimeException {
-               final int MAX_THREADS = getMaxThreads();
-               final int MAX_BLOCKS = getMaxBlocks();
-               final int WARP_SIZE = getWarpSize();
-               int threads = Math.min(cols, MAX_THREADS);
-               int blocks = Math.min(cols/MAX_THREADS, MAX_BLOCKS);
-               if (cols % MAX_THREADS != 0) blocks++;
-               int sharedMemSize = threads * Sizeof.DOUBLE;
-               if (threads <= WARP_SIZE){
-                       sharedMemSize *=2;
-               }
-               return new int[] {blocks, threads, sharedMemSize};
-       }
-
-
-       private static int nextPow2(int x)
-       {
-               --x;
-               x |= x >> 1;
-               x |= x >> 2;
-               x |= x >> 4;
-               x |= x >> 8;
-               x |= x >> 16;
-               return ++x;
-       }
-
-       //********************************************************************/
-       //****************  END OF UNARY AGGREGATE Functions *****************/
-       //********************************************************************/
-
-
-       /**
-        * This method computes the backpropogation errors for previous layer 
of convolution operation
-        *
-        * @param filter filter used in conv2d
-        * @param dout errors from next layer
-        * @param output  output errors
-        * @param N number of images
-        * @param C number of channels
-        * @param H height
-        * @param W width
-        * @param K number of filters
-        * @param R filter height
-        * @param S filter width
-        * @param pad_h pad height
-        * @param pad_w pad width
-        * @param stride_h stride height
-        * @param stride_w stride width
-        * @param P output activation height
-        * @param Q output activation width
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public static void conv2dBackwardData(MatrixObject filter, MatrixObject 
dout,
-                                                                               
                                                                                
MatrixObject output, int N, int C, int H, int W, int K, int R,
-                                                                               
                                                                                
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-                                                                               
                                                                                
int Q) throws DMLRuntimeException {
-               if(isInSparseFormat(dout)) {
-                       ((JCudaObject)dout.getGPUObject()).sparseToDense();
-               }
-               if(isInSparseFormat(filter)) {
-                       ((JCudaObject)filter.getGPUObject()).sparseToDense();
-               }
-               Pointer alpha = null;
-               Pointer beta = null;
-               cudnnTensorDescriptor dyDesc = null;
-               cudnnTensorDescriptor dxDesc = null;
-               cudnnFilterDescriptor wDesc = null;
-               cudnnConvolutionDescriptor convDesc = null;
-
-               Pointer workSpace = null;
-               long sizeInBytes = 0;
-               try {
-                       // Allocate descriptors
-                       wDesc = allocateFilterDescriptor(K, C, R, S);
-                       dyDesc = allocateTensorDescriptor(N, K, P, Q);
-                       dxDesc = allocateTensorDescriptor(N, C, H, W);
-
-                       // Allocate data
-                       Pointer w = 
((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr;
-                       Pointer dy = 
((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr;
-                       Pointer dx = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
-
-                       alpha = pointerTo(1.0); // TODO
-                       beta = pointerTo(0.0f);
-
-                       int padding [] = { pad_h, pad_w };
-                       int strides [] = { stride_h, stride_w };
-                       convDesc = allocateConvolutionDescriptor(padding, 
strides);
-                       long sizeInBytesArray[] = { 0 };
-
-                       // TODO: Select the best algorithm depending on the 
data and supported CUDA
-                       int algo = 
jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-                       workSpace = new Pointer();
-                       
cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
-                                                       wDesc, dyDesc, 
convDesc, dxDesc, algo, sizeInBytesArray);
-
-                       int status = cudnnConvolutionBackwardData(cudnnHandle, 
alpha, wDesc, w,
-                                                       dyDesc, dy, convDesc, 
algo, workSpace, sizeInBytes, beta, dxDesc, dx);
-                       if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-                               throw new DMLRuntimeException("Could not 
executed cudnnConvolutionBackwardData: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
-                       }
-               }
-               finally {
-                       if(alpha != null)
-                               cudaFreeHelper(alpha);
-                       if(beta != null)
-                               cudaFreeHelper(beta);
-                       if(dyDesc != null)
-                               cudnnDestroyTensorDescriptor(dyDesc);
-                       if(dxDesc != null)
-                               cudnnDestroyTensorDescriptor(dxDesc);
-                       if(wDesc != null)
-                               cudnnDestroyFilterDescriptor(wDesc);
-
-                       if(convDesc != null)
-                               cudnnDestroyConvolutionDescriptor(convDesc);
-
-                       if(workSpace != null && sizeInBytes != 0)
-                               cudaFreeHelper(workSpace);
-               }
-       }
-
-       /**
-        * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
-        * @param image image as matrix object
-        * @param outputBlock output matrix
-        * @param N                             batch size
-        * @param C                             number of channels
-        * @param H                             height of image
-        * @param W                             width of image
-        * @param K                             number of filters
-        * @param R                             height of filter
-        * @param S                             width of filter
-        * @param pad_h                 vertical padding
-        * @param pad_w                 horizontal padding
-        * @param stride_h              horizontal stride
-        * @param stride_w              vertical stride
-        * @param P                             (H - R + 1 + 2*pad_h)/stride_h
-        * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public static void maxpooling(MatrixObject image,
-                                                                               
                                                MatrixObject outputBlock, int 
N, int C, int H, int W, int K, int R,
-                                                                               
                                                int S, int pad_h, int pad_w, 
int stride_h, int stride_w, int P,
-                                                                               
                                                int Q) throws 
DMLRuntimeException {
-               if(isInSparseFormat(image)) {
-                       ((JCuda

<TRUNCATED>

Reply via email to