http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index c363ab1..3c32137 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -19,23 +19,49 @@
 
 package org.apache.sysml.runtime.matrix.data;
 
-import jcuda.Pointer;
-import jcuda.Sizeof;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasFillMode;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcublas.cublasOperation;
-import jcuda.jcudnn.cudnnActivationDescriptor;
-import jcuda.jcudnn.cudnnBatchNormMode;
-import jcuda.jcudnn.cudnnConvolutionDescriptor;
-import jcuda.jcudnn.cudnnConvolutionFwdPreference;
-import jcuda.jcudnn.cudnnFilterDescriptor;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcudnn.cudnnPoolingDescriptor;
-import jcuda.jcudnn.cudnnStatus;
-import jcuda.jcudnn.cudnnTensorDescriptor;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseHandle;
+import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
+import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
+import static jcuda.jcudnn.JCudnn.cudnnActivationForward;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
+import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
+import static jcuda.jcudnn.JCudnn.cudnnCreateActivationDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
+import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
+import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
+import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
+import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
+import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
+import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
@@ -72,10 +98,9 @@ import org.apache.sysml.runtime.instructions.cp.DoubleObject;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
+import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
 import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer;
 import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
 import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
@@ -86,127 +111,123 @@ import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
-import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
-import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcudnn.JCudnn.cudnnActivationForward;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
-import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
-import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
-import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
-import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
-import static jcuda.jcudnn.JCudnn.cudnnCreateActivationDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
-import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
-import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
-import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
-import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
-import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
-import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
-import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
-import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
-import static jcuda.runtime.JCuda.cudaMemcpy;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-import static org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.allocate;
-import static org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.cudaFreeHelper;
+import jcuda.CudaException;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasFillMode;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcublas.cublasOperation;
+import jcuda.jcudnn.cudnnActivationDescriptor;
+import jcuda.jcudnn.cudnnBatchNormMode;
+import jcuda.jcudnn.cudnnConvolutionDescriptor;
+import jcuda.jcudnn.cudnnConvolutionFwdPreference;
+import jcuda.jcudnn.cudnnFilterDescriptor;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcudnn.cudnnPoolingDescriptor;
+import jcuda.jcudnn.cudnnStatus;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
 
-//FIXME move could to respective instructions, this is not a block library
+/**
+ * All CUDA kernels and library calls are redirected through this class
+ * @see GPUContext
+ * @see GPUObject
+ */
 public class LibMatrixCUDA {
 
-       // Assume Compute Capability 3.0
+       private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
+
+       // Assume Compute Capability 3.0
        // MAX BLOCKS is 2^31 - 1 For compute capability > 3.0
        // MAX_THREADS is 1024 For compute capability > 3.0
        private static int _MAX_THREADS = -1;
        private static int _MAX_BLOCKS  = -1;
        private static int _WARP_SIZE   = -1;
 
+       //********************************************************************/
+       //***************************** UTILS ********************************/
+       //********************************************************************/
+
+       /*
+       static GPUContext gCtx() throws DMLRuntimeException {
+                       return GPUContext.gCtx;
+       }
+       */
+
        /**
         * Utility function to get maximum number of threads supported by the active CUDA device.
-        * This is put into a singleton style method because the GPUContext is not fully initialized when
-        * the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
-        * static block in this class, it will access the {@link JCudaContext} instance when it is not
-        * properly initialized. Due to the proper checks in place, a deadlock occurs.
+        * @param gCtx a valid {@link GPUContext}
         * @return max threads
         * @throws DMLRuntimeException if exception occurs
         */
-       static int getMaxThreads() throws DMLRuntimeException{
+       static int getMaxThreads(GPUContext gCtx) throws DMLRuntimeException{
                if (_MAX_THREADS == -1){
-                       _MAX_THREADS = JCudaContext.getMaxThreadsPerBlock();
+                       _MAX_THREADS = gCtx.getMaxThreadsPerBlock();
                }
                return _MAX_THREADS;
        }
 
        /**
         * Utility function to get maximum number of blocks supported by the active CUDA device.
-        * This is put into a singleton style method because the GPUContext is not fully initialized when
-        * the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
-        * static block in this class, it will access the {@link JCudaContext} instance when it is not
-        * properly initialized. Due to the proper checks in place, a deadlock occurs.
+        * @param gCtx a valid {@link GPUContext}
         * @return max blocks
         * @throws DMLRuntimeException if exception occurs
         */
-       static int getMaxBlocks() throws DMLRuntimeException{
+       static int getMaxBlocks(GPUContext gCtx) throws DMLRuntimeException{
                if (_MAX_BLOCKS == -1){
-                       _MAX_BLOCKS = JCudaContext.getMaxBlocks();
+                       _MAX_BLOCKS = gCtx.getMaxBlocks();
                }
                return _MAX_BLOCKS;
        }
 
        /**
         * Utility function to get the warp size supported by the active CUDA device.
-        * This is put into a singleton style method because the GPUContext is not fully initialized when
-        * the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
-        * static block in this class, it will access the {@link JCudaContext} instance when it is not
-        * properly initialized. Due to the proper checks in place, a deadlock occurs.
+        * @param gCtx a valid {@link GPUContext}
         * @return warp size
         * @throws DMLRuntimeException if exception occurs
         */
-       static int getWarpSize() throws DMLRuntimeException {
+       static int getWarpSize(GPUContext gCtx) throws DMLRuntimeException {
                if (_WARP_SIZE == -1) {
-                       _WARP_SIZE = JCudaContext.getWarpSize();
+                       _WARP_SIZE = gCtx.getWarpSize();
                }
                return _WARP_SIZE;
        }
 
-       public static boolean isInSparseFormat(MatrixObject mo) {
-               if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated())
-                       return mo.getGPUObject().isInSparseFormat();
+       public static boolean isInSparseFormat(GPUContext gCtx, MatrixObject mo) {
+               if(mo.getGPUObject(gCtx) != null && mo.getGPUObject(gCtx).isAllocated())
+                       return mo.getGPUObject(gCtx).isSparse();
                return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
        }
 
 
+       private static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{
+               return gCtx.getCusparseHandle();
+       }
+
+       private static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
+               return gCtx.getCublasHandle();
+       }
+
+       private static cudnnHandle getCudnnHandle(GPUContext gCtx) throws DMLRuntimeException {
+               return gCtx.getCudnnHandle();
+       }
+
+       private static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
+               return gCtx.getKernels();
+       }
+
+
        //********************************************************************/
-       //***************** DEEP LEARNING Operators **************************/
+       //************************ End of UTILS ******************************/
        //********************************************************************/
 
+       //********************************************************************/
+       //***************** DEEP LEARNING Operators **************************/
+       //********************************************************************/
 
-       public static cudnnHandle cudnnHandle;
-       public static cublasHandle cublasHandle;
-       public static cusparseHandle cusparseHandle;
-       public static JCudaKernels kernels; // Used to launch custom kernels
 
-       private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
 
        private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
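For readers tracking the refactor: the static cudnnHandle, cublasHandle, cusparseHandle and kernels fields are removed, and every entry point now threads a GPUContext through to the private getters above. A minimal sketch of the new calling pattern (the helper name and body below are illustrative, not part of this patch):

    // Hypothetical call site: handles and kernels are now looked up per
    // GPUContext instead of being read from static fields of LibMatrixCUDA.
    static void exampleCallSite(GPUContext gCtx, MatrixObject mo, String instName) throws DMLRuntimeException {
        cudnnHandle cudnn = getCudnnHandle(gCtx);       // was: LibMatrixCUDA.cudnnHandle
        JCudaKernels kernels = getCudaKernels(gCtx);    // was: LibMatrixCUDA.kernels
        boolean sparse = isInSparseFormat(gCtx, mo);    // helpers now take gCtx as the first argument
    }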
        
@@ -234,7 +255,8 @@ public class LibMatrixCUDA {
        }
        
        /**
-        * Convenience method to get tensor descriptor from underlying JCudaObject
+        * Convenience method to get tensor descriptor from underlying GPUObject
+        * @param gCtx   a valid {@link GPUContext}
         * @param mat matrix object
         * @param N number of images
         * @param C number of channels
@@ -243,39 +265,57 @@ public class LibMatrixCUDA {
         * @return cudnn tensor descriptor
         * @throws DMLRuntimeException if the input descriptor and matrix dimensions don't match
         */
-       private static cudnnTensorDescriptor allocateTensorDescriptor(MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
+       private static cudnnTensorDescriptor allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
                if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) {
                        throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N 
                                        + " || " + mat.getNumColumns() + " != " + (C*H*W));
                }
-               return ((JCudaObject)mat.getGPUObject()).allocateTensorDescriptor(N, C, H, W);
+               return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, W);
+       }
+
+       /**
+        * Convenience method to get tensor descriptor
+        * @param N number of images
+        * @param C number of channels
+        * @param H height
+        * @param W width
+        * @return cudnn tensor descriptor
+        * @throws DMLRuntimeException if error occurs
+        */
+       private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) throws DMLRuntimeException {
+               cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
+               cudnnCreateTensorDescriptor(tensorDescriptor);
+               cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+               return tensorDescriptor;
        }
        
        /**
         * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
+        * @param gCtx a valid {@link GPUContext}
         * @param image input matrix object
         * @param isForCuDNN true if the dense pointer is to be used by a CuDNN kernel
         * @return jcuda pointer
         * @throws DMLRuntimeException if error occurs while sparse to dense conversion
         */
-       private static Pointer getDensePointer(MatrixObject image, boolean isForCuDNN, String instName) throws DMLRuntimeException {
+       private static Pointer getDensePointer(GPUContext gCtx, MatrixObject image, boolean isForCuDNN, String instName) throws DMLRuntimeException {
                if(isForCuDNN && image.getNumRows()*image.getNumColumns() > numDoublesIn2GB) {
                        throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot be greater than 2GB. Hint: try reducing the mini-batch size.");
                }
-               return getDensePointer(image, instName);
+               return getDensePointer(gCtx, image, instName);
        }
        
        /**
         * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
+        * @param gCtx a valid {@link GPUContext}
         * @param image input matrix object
         * @return jcuda pointer
         * @throws DMLRuntimeException if error occurs while sparse to dense conversion
         */
-       private static Pointer getDensePointer(MatrixObject image, String instName) throws DMLRuntimeException {
-               if(isInSparseFormat(image)) {
-                       ((JCudaObject)image.getGPUObject()).sparseToDense(instName);
+       private static Pointer getDensePointer(GPUContext gCtx, MatrixObject image, String instName) throws DMLRuntimeException {
+               if(isInSparseFormat(gCtx, image)) {
+                       image.getGPUObject(gCtx).sparseToDense(instName);
                }
-               return ((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
+               return image.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
        }
        
        /**
@@ -289,34 +329,67 @@ public class LibMatrixCUDA {
                        throw new DMLRuntimeException("Error status returned by 
CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
 
-       public static void conv2dBiasAdd(String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
+       public static void conv2dBiasAdd(GPUContext gCtx, String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
                        int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
                                        throws DMLRuntimeException {
-               conv2d(instName, image, filter, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-               biasAdd(instName, outputBlock, bias, outputBlock);
+               /*
+               int rows = (int) outputBlock.getNumRows();
+               int cols = (int) outputBlock.getNumColumns();
+               long size  = rows * cols * Sizeof.DOUBLE;
+
+               Pointer imagePointer = getDensePointer(image, instName);
+               Pointer biasPointer = getDensePointer(bias, instName);
+               Pointer outputPointer = getDensePointer(outputBlock, instName);
+               Pointer filterPointer = getDensePointer(filter, instName);
+
+               Pointer tmp = allocate(size);
+
+               conv2d(instName, imagePointer, filterPointer, tmp, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+               cudaDeviceSynchronize();
+
+               long k1 = bias.getNumColumns();
+               if(k1 != bias.getNumColumns() || bias.getNumColumns() != 1 || cols % k1 != 0) {
+                       throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
+               }
+               // biasAdd(instName, outputBlock, bias, outputBlock);
+               biasAdd(instName, tmp, biasPointer, outputPointer, rows, cols, (int)k1);
+
+               cudaFreeHelper(tmp);
+               */
+               LOG.trace("GPU : conv2dBiasAdd" + ", GPUContext=" + gCtx);
+               conv2d(gCtx, instName, image, filter, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+               //cudaDeviceSynchronize();
+               biasAdd(gCtx, instName, outputBlock, bias, outputBlock);
        }
        
-       public static void conv2d(String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
+       public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
                                                                                                                                int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
                                        throws DMLRuntimeException {
-               cudnnFilterDescriptor filterDesc = null;
+               Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
+               Pointer filterPointer = getDensePointer(gCtx, filter, true, instName);
+               Pointer dstPointer = getDensePointer(gCtx, outputBlock, true, instName);
+
+               conv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+       }
+
+       public static void conv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N,
+                                                                                                                       int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
+                                       throws DMLRuntimeException {
+        LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
+        cudnnFilterDescriptor filterDesc = null;
                cudnnConvolutionDescriptor convDesc = null;
                Pointer workSpace = null;
                long sizeInBytes = 0;
                try {
-                       long t1=0, t2=0;
+                       long t1 = 0, t2 = 0;
                        // Allocate descriptors
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-                       cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
-                       cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(outputBlock, N, K, P, Q);
+                       cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
+                       cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
                        filterDesc = allocateFilterDescriptor(K, C, R, S);
 
-                       Pointer imagePointer = getDensePointer(image, true, instName);
-                       Pointer filterPointer = getDensePointer(filter, true, instName);
-                       Pointer dstPointer = getDensePointer(outputBlock, true, instName);
-
-                       int padding [] = { pad_h, pad_w };
-                       int strides [] = { stride_h, stride_w };
+                       int padding[] = {pad_h, pad_w};
+                       int strides[] = {stride_h, stride_w};
                        convDesc = allocateConvolutionDescriptor(padding, strides);
 
                        // Select the best algorithm depending on the data and supported CUDA
@@ -324,53 +397,54 @@ public class LibMatrixCUDA {
                        int algo = -1;
                        workSpace = new Pointer();
 
-                       if(CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
+                       if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
                                algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-                       }
-                       else if(CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
-                               int [] algos = {
+                       } else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
+                               int[] algos = {
                                                                jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
                                                                jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
                                                                jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
                                };
                                // TODO: Look into FFt, Winograd, etc
                                // Also ensure that GPU has enough memory to allocate memory
-                               long sizeInBytesArray[] = { 0 };
-                               algo = jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
+                               long sizeInBytesArray[] = {0};
+                               algo = jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
                                                                CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
-                               cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray);
-                               if(sizeInBytesArray[0] != 0)
-                                       workSpace = allocate(sizeInBytesArray[0]);
+                               cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray);
+                               if (sizeInBytesArray[0] != 0)
+                                       workSpace = gCtx.allocate(sizeInBytesArray[0]);
                                sizeInBytes = sizeInBytesArray[0];
-                       }
-                       else if(CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
+                       } else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
                                throw new DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
-                       }
-                       else {
+                       } else {
                                throw new DMLRuntimeException("Unsupported preference criteria for convolution");
                        }
-                       if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+                       if (GPUStatistics.DISPLAY_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
                        if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-                       int status = cudnnConvolutionForward(cudnnHandle, one(),
-                                                       srcTensorDesc, imagePointer,
-                                                       filterDesc, filterPointer,
+                       int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(),
+                                                       srcTensorDesc, image,
+                                                       filterDesc, filter,
                                                        convDesc, algo, workSpace, sizeInBytes, zero(),
-                                                       dstTensorDesc, dstPointer);
-                       if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
-                       if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-                               throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                                                       dstTensorDesc, output);
+                       if (GPUStatistics.DISPLAY_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
+                       if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not execute cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
                        }
-               }
-               finally {
-                       long t3=0;
+               } catch (CudaException e) {
+                       throw new DMLRuntimeException("Error in conv2d in 
GPUContext " + gCtx.toString() + " from Thread " + 
Thread.currentThread().toString(), e);
+               } finally {
+                       long t3 = 0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
-                       if(filterDesc != null)
+                       if (filterDesc != null)
                                cudnnDestroyFilterDescriptor(filterDesc);
-                       if(convDesc != null)
+                       if (convDesc != null)
                                cudnnDestroyConvolutionDescriptor(convDesc);
-                       if(workSpace != null && sizeInBytes != 0)
-                               cudaFreeHelper(instName, workSpace);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
+                       if (workSpace != null && sizeInBytes != 0)
+                               gCtx.cudaFreeHelper(instName, workSpace);
+                       if (GPUStatistics.DISPLAY_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
                }
        }
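For orientation, the P and Q arguments are precomputed by SystemML before reaching this library; for CUDNN_CROSS_CORRELATION with the given padding and strides they follow the usual shape rule, sketched here in plain Java:

    // Output spatial dimensions implied by the conv2d parameters
    // (illustrative; the values arrive precomputed in P and Q).
    int P = (H + 2 * pad_h - R) / stride_h + 1;   // output height
    int Q = (W + 2 * pad_w - S) / stride_w + 1;   // output width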
 
@@ -381,7 +455,7 @@ public class LibMatrixCUDA {
                return convDesc;
        }
 
-       public static  Pointer pointerTo(double value) {
+       private static Pointer pointerTo(double value) {
                return Pointer.to(new double[] { value });
        }
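The one() and zero() scaling factors passed to the cuDNN calls are presumably thin wrappers over pointerTo; a sketch of what such helpers look like (the bodies are an assumption, only the names appear in this diff):

    // Assumed shape of the alpha/beta helpers: cuDNN takes scaling factors
    // by pointer, hence the double[] boxing done by pointerTo.
    private static Pointer one()  { return pointerTo(1.0); }
    private static Pointer zero() { return pointerTo(0.0); }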
 
@@ -411,22 +485,24 @@ public class LibMatrixCUDA {
 
        /**
         * This method computes the backpropagation errors for previous layer of relu operation
+        * @param gCtx a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param input input image
         * @param dout  next layer error propagation
         * @param outputBlock output
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void reluBackward(String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
-               long rows = input.getNumRows();
+       public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
+        LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx);
+        long rows = input.getNumRows();
                long cols = input.getNumColumns();
-               Pointer imagePointer = getDensePointer(input, instName);
-               Pointer doutPointer = getDensePointer(dout, instName);
-               Pointer outputPointer = getDensePointer(outputBlock, instName);
+               Pointer imagePointer = getDensePointer(gCtx, input, instName);
+               Pointer doutPointer = getDensePointer(gCtx, dout, instName);
+               Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
 
                long t1=0;
                if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-               kernels.launchKernel("relu_backward",
+               getCudaKernels(gCtx).launchKernel("relu_backward",
                                                
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
                                                imagePointer, doutPointer, 
outputPointer, (int)rows, (int)cols);
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
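In plain Java terms, the relu_backward kernel gates the incoming error by the sign of the forward input; a reference sketch of the semantics (not the actual CUDA source):

    // Reference semantics of the relu_backward kernel on row-major data:
    // the gradient flows only where the forward input was positive.
    static void reluBackwardReference(double[] input, double[] dout, double[] output) {
        for (int i = 0; i < input.length; i++)
            output[i] = (input[i] > 0) ? dout[i] : 0;
    }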
@@ -438,18 +514,20 @@ public class LibMatrixCUDA {
         * ones = matrix(1, rows=1, cols=Hout*Wout)
         * output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
         * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param input input image
         * @param bias bias
         * @param outputBlock output
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void biasMultiply(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
-               if(isInSparseFormat(input)) {
-                       ((JCudaObject)input.getGPUObject()).sparseToDense(instName);
+       public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
+        LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx);
+        if(isInSparseFormat(gCtx, input)) {
+                       input.getGPUObject(gCtx).sparseToDense(instName);
                }
-               if(isInSparseFormat(bias)) {
-                       ((JCudaObject)bias.getGPUObject()).sparseToDense(instName);
+               if(isInSparseFormat(gCtx, bias)) {
+                       bias.getGPUObject(gCtx).sparseToDense(instName);
                }
                long rows = input.getNumRows();
                long cols = input.getNumColumns();
@@ -458,12 +536,12 @@ public class LibMatrixCUDA {
                if(bias.getNumColumns() != 1 || cols % K != 0) {
                        throw new DMLRuntimeException("Incorrect inputs for 
bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + 
bias.getNumColumns() + "]");
                }
-               Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr;
-               Pointer biasPointer = ((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr;
-               Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
+               Pointer imagePointer = input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+               Pointer biasPointer = bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+               Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
                long t1 = 0;
                if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-               kernels.launchKernel("bias_multiply",
+               getCudaKernels(gCtx).launchKernel("bias_multiply",
                                                
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
                                                imagePointer, biasPointer, 
outputPointer, (int)rows, (int)cols, (int) PQ);
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
@@ -475,32 +553,52 @@ public class LibMatrixCUDA {
         * ones = matrix(1, rows=1, cols=Hout*Wout)
         * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
         * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param input input image
         * @param bias bias
         * @param outputBlock output
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void biasAdd(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
-               long rows = input.getNumRows();
-               long cols = input.getNumColumns();
-               long K = bias.getNumRows();
-               long PQ = cols / K;
+       public static void biasAdd(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
+               Pointer imagePointer = getDensePointer(gCtx, input, instName);
+               Pointer biasPointer = getDensePointer(gCtx, bias, instName);
+               Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
+               int rows = (int) input.getNumRows();
+               int cols = (int) input.getNumColumns();
+               int K = (int) bias.getNumRows();
                if(bias.getNumColumns() != 1 || cols % K != 0) {
                        throw new DMLRuntimeException("Incorrect inputs for 
bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + 
bias.getNumColumns() + "]");
                }
-               Pointer imagePointer = getDensePointer(input, instName);
-               Pointer biasPointer = getDensePointer(bias, instName);
-               Pointer outputPointer = getDensePointer(outputBlock, instName);
+               biasAdd(gCtx, instName, imagePointer, biasPointer, outputPointer, rows, cols, K);
+       }
+
+       /**
+        * Performs the operation corresponding to the DML script:
+        * ones = matrix(1, rows=1, cols=Hout*Wout)
+        * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
+        * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
+        * @param gCtx   a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link Statistics}.
+        * @param image input image
+        * @param bias bias
+        * @param output output
+        * @param rows rows in input image
+        * @param cols cols in input image
+        * @param k rows in bias
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) throws DMLRuntimeException {
+        LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx);
+        int PQ = cols / k;
                long t1 = 0;
                if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-               kernels.launchKernel("bias_add",
-                                               
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
-                                               imagePointer, biasPointer, 
outputPointer, (int)rows, (int)cols, (int) PQ);
+               getCudaKernels(gCtx).launchKernel("bias_add",
+                                               
ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
+                                               image, bias, output, rows, 
cols, PQ);
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
-
        }
-       
+
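A host-side sketch of what the bias_add kernel computes: each row is split into k blocks of width PQ = cols / k, and every element in block b gets bias[b] added (reference semantics only, not the CUDA source):

    // Reference semantics of the bias_add kernel on row-major data:
    // out[i, j] = in[i, j] + bias[j / PQ], where PQ = cols / k.
    static void biasAddReference(double[] in, double[] bias, double[] out, int rows, int cols, int k) {
        int PQ = cols / k;
        for (int i = 0; i < rows; i++)
            for (int j = 0; j < cols; j++)
                out[i * cols + j] = in[i * cols + j] + bias[j / PQ];
    }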
+       private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
                if(scale.getNumRows() != 1 || scale.getNumColumns() != C) {
                        throw new DMLRuntimeException("Incorrect dimensions for scale");
@@ -518,7 +616,7 @@ public class LibMatrixCUDA {
        
        /**
         * Performs the forward BatchNormalization layer computation for inference
-        * 
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName name of the instruction
         * @param image input image
         * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
@@ -529,10 +627,11 @@ public class LibMatrixCUDA {
         * @param epsilon epsilon value used in the batch normalization formula
         * @throws DMLRuntimeException if error occurs
         */
-       public static void batchNormalizationForwardInference(String instName, MatrixObject image, 
+       public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
                        MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, 
                        MatrixObject ret, double epsilon) throws DMLRuntimeException {
-               int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+        LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
+        int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
                
                int N = (int) image.getNumRows();
                int C = (int) scale.getNumColumns();
@@ -540,19 +639,19 @@ public class LibMatrixCUDA {
                validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
                
                // Allocate descriptors
-               cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW, 
+               cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
                                new MatrixObject[] {image},  new MatrixObject[] {ret});
-               cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
+               cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
                
                // Get underlying dense pointer
-               Pointer imagePtr = getDensePointer(image, true, instName);
-               Pointer retPtr = getDensePointer(ret, true, instName);
-               Pointer biasPtr = getDensePointer(bias, true, instName);
-               Pointer scalePtr = getDensePointer(scale, true, instName);
-               Pointer runningMeanPtr = getDensePointer(runningMean, true, instName);
-               Pointer runningVarPtr = getDensePointer(runningVar, true, instName);
+               Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
+               Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
+               Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
+               Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
+               Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
+               Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
                
-               checkStatus(cudnnBatchNormalizationForwardInference(cudnnHandle, mode, one(), zero(),
+               checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(),
                                nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
                        scaleTensorDesc, scalePtr, biasPtr,
                        runningMeanPtr, runningVarPtr, epsilon));
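For reference, the spatial-mode formula cuDNN applies per channel during inference, written as a scalar Java sketch:

    // y = scale * (x - runningMean) / sqrt(runningVar + epsilon) + bias,
    // applied elementwise with per-channel scale/bias/mean/var.
    static double bnInferenceReference(double x, double scale, double bias,
            double mean, double var, double epsilon) {
        return scale * (x - mean) / Math.sqrt(var + epsilon) + bias;
    }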
@@ -560,7 +659,7 @@ public class LibMatrixCUDA {
        
        /**
         * Performs the forward BatchNormalization layer computation for training
-        * 
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName name of the instruction
         * @param image input image
         * @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
@@ -574,10 +673,11 @@ public class LibMatrixCUDA {
         * @param exponentialAverageFactor factor used in the moving average computation
         * @throws DMLRuntimeException if error occurs
         */
-       public static void batchNormalizationForwardTraining(String instName, MatrixObject image, 
+       public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
                        MatrixObject scale,  MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, 
                        MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
-               int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+        LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
+        int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
                
                int N = (int) image.getNumRows();
                int C = (int) scale.getNumColumns();
@@ -585,26 +685,26 @@ public class LibMatrixCUDA {
                validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
                
                // Allocate descriptors
-               cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW, 
+               cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
                                new MatrixObject[] {image},  new MatrixObject[] {ret});
-               cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
+               cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
                
                // Get underlying dense pointer
-               Pointer imagePtr = getDensePointer(image, true, instName);
-               Pointer retPtr = getDensePointer(ret, true, instName);
-               Pointer biasPtr = getDensePointer(bias, true, instName);
-               Pointer scalePtr = getDensePointer(scale, true, instName);
-               Pointer runningMeanPtr = getDensePointer(runningMean, true, instName);
-               Pointer runningVarPtr = getDensePointer(runningVar, true, instName);
+               Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
+               Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
+               Pointer biasPtr = getDensePointer(gCtx, bias, true, instName);
+               Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
+               Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
+               Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
                
                // To allow for copy-on-write
-               Pointer retRunningMeanPtr = getDensePointer(retRunningMean, true, instName);
-               Pointer retRunningVarPtr = getDensePointer(retRunningVar, true, instName);
+               Pointer retRunningMeanPtr = getDensePointer(gCtx, retRunningMean, true, instName);
+               Pointer retRunningVarPtr = getDensePointer(gCtx, retRunningVar, true, instName);
                cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
                cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
                
                // ignoring resultSaveMean and resultSaveVariance as it requires state management
-               checkStatus(cudnnBatchNormalizationForwardTraining(cudnnHandle, mode, one(), zero(),
+               checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(),
                                nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
                        scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
                        retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
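The exponentialAverageFactor argument controls how the mini-batch statistics are folded into the running estimates copied into retRunningMean/retRunningVar above; per cuDNN's documented update rule, sketched here:

    // Running-statistics update applied by cuDNN during forward training:
    // running' = (1 - factor) * running + factor * batchStatistic
    static double updateRunningStat(double running, double batchStat, double factor) {
        return (1 - factor) * running + factor * batchStat;
    }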
@@ -612,7 +712,7 @@ public class LibMatrixCUDA {
        
        /**
         * Convenient utility for batch normalization that returns a NCHW descriptor
-        * 
+        * @param gCtx a valid {@link GPUContext}
         * @param N number of images
         * @param C number of channels
         * @param CHW channels*height*width
@@ -621,7 +721,7 @@ public class LibMatrixCUDA {
         * @return one of the NCHW descriptor
         * @throws DMLRuntimeException if error occurs
         */
-       private static cudnnTensorDescriptor allocateNCHWDescriptors(int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
+       private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext gCtx, int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
                cudnnTensorDescriptor ret  = null; // Return any one
                if(CHW > ((long)Integer.MAX_VALUE)*C) {
                        throw new DMLRuntimeException("image size 
(height*width) should be less than " + Integer.MAX_VALUE);
@@ -629,9 +729,9 @@ public class LibMatrixCUDA {
                cudnnTensorDescriptor knownNCHWdescriptor = null;
                int H = -1; int W = -1;
                for(int i = 0; i < input.length; i++) {
-                       knownNCHWdescriptor = ((JCudaObject)input[i].getGPUObject()).getTensorDescriptor();
+                       knownNCHWdescriptor = input[i].getGPUObject(gCtx).getTensorDescriptor();
                        if(knownNCHWdescriptor != null) {
-                               int [] shape = ((JCudaObject)input[i].getGPUObject()).getTensorShape();
+                               int [] shape = input[i].getGPUObject(gCtx).getTensorShape();
                                if(shape[0] != N || shape[1] != C) {
                                        throw new DMLRuntimeException("Incorrect N and C:" + shape[0]  + " != " + N + " || " + shape[1]  + " != " +  C);
                                }
                                }
@@ -643,10 +743,10 @@ public class LibMatrixCUDA {
                if(knownNCHWdescriptor != null) {
                        // We precisely know N, C, H, W
                        for(int i = 0; i < input.length; i++) {
-                               ret = allocateTensorDescriptor(input[i], N, C, H, W);
+                               ret = allocateTensorDescriptor(gCtx, input[i], N, C, H, W);
                        }
                        for(int i = 0; i < output.length; i++) {
-                               ret = allocateTensorDescriptor(output[i], N, C, H, W);
+                               ret = allocateTensorDescriptor(gCtx, output[i], N, C, H, W);
                        }
                }
                else {
@@ -667,7 +767,7 @@ public class LibMatrixCUDA {
        
        /**
         * This method computes the backpropagation errors for image, scale and bias of batch normalization layer
-        * 
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName name of the instruction
         * @param image input image
         * @param dout input errors of shape C, H, W
@@ -678,30 +778,31 @@ public class LibMatrixCUDA {
         * @param epsilon epsilon value used in the batch normalization formula
         * @throws DMLRuntimeException if error occurs
         */
-       public static void batchNormalizationBackward(String instName, MatrixObject image, MatrixObject dout, 
+       public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
                        MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias,
                        double epsilon) throws DMLRuntimeException {
-               int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+        LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
+        int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
                
                int N = (int) image.getNumRows();
                int C = (int) scale.getNumColumns();
                long CHW = image.getNumColumns();
                
                // Allocate descriptors
-               cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW, 
+               cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
                                new MatrixObject[] {image, dout},  new MatrixObject[] {ret});
-               cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
+               cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
                
                // Get underlying dense pointer
-               Pointer imagePtr = getDensePointer(image, true, instName);
-               Pointer doutPtr = getDensePointer(dout, true, instName);
-               Pointer scalePtr = getDensePointer(scale, true, instName);
-               Pointer retPtr = getDensePointer(ret, true, instName);
-               Pointer retScalePtr = getDensePointer(retScale, true, instName);
-               Pointer retBiasPtr = getDensePointer(retBias, true, instName);
+               Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
+               Pointer doutPtr = getDensePointer(gCtx, dout, true, instName);
+               Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
+               Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
+               Pointer retScalePtr = getDensePointer(gCtx, retScale, true, instName);
+               Pointer retBiasPtr = getDensePointer(gCtx, retBias, true, instName);
                
                // ignoring resultSaveMean and resultSaveVariance as it requires state management
-               checkStatus(cudnnBatchNormalizationBackward(cudnnHandle, mode,  one(), zero(), one(), zero(),
+               checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), mode,  one(), zero(), one(), zero(),
                                nCHWDescriptor,  imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, retPtr,
                                scaleTensorDesc, scalePtr, retScalePtr, retBiasPtr, epsilon, new Pointer(), new Pointer()));
        }
@@ -709,7 +810,7 @@ public class LibMatrixCUDA {
 
        /**
         * This method computes the backpropagation errors for filter of convolution operation
-        * 
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param image input image
         * @param dout errors from next layer
@@ -729,56 +830,60 @@ public class LibMatrixCUDA {
         * @param Q output activation width
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void conv2dBackwardFilter(String instName, MatrixObject image, MatrixObject dout,
+       public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
                                                                                MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
                                                                                int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                                                                                int Q) throws DMLRuntimeException {
-               cudnnFilterDescriptor dwDesc = null;
+        LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
+        cudnnFilterDescriptor dwDesc = null;
                cudnnConvolutionDescriptor convDesc = null;
 
                Pointer workSpace = null;
                long sizeInBytes = 0;
                try {
 
-                       long t1=0, t2=0;
+                       long t1 = 0, t2 = 0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                        // Allocate descriptors
-                       cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
-                       cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(dout, N, K, P, Q);
+                       cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
+                       cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
                        dwDesc = allocateFilterDescriptor(K, C, R, S);
 
                        // Allocate data
-                       Pointer imagePointer = getDensePointer(image, true, instName);
-                       Pointer doutPointer = getDensePointer(dout, true, instName);
-                       Pointer dwPointer = getDensePointer(outputBlock, true, instName);
-                       int padding [] = { pad_h, pad_w };
-                       int strides [] = { stride_h, stride_w };
+                       Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
+                       Pointer doutPointer = getDensePointer(gCtx, dout, true, instName);
+                       Pointer dwPointer = getDensePointer(gCtx, outputBlock, true, instName);
+                       int padding[] = {pad_h, pad_w};
+                       int strides[] = {stride_h, stride_w};
                        convDesc = allocateConvolutionDescriptor(padding, strides);
-                       long sizeInBytesArray[] = { 0 };
+                       long sizeInBytesArray[] = {0};
 
                        // TODO: Select the best algorithm depending on the data and supported CUDA
                        int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
 
                        workSpace = new Pointer();
-                       cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle,
+                       cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx),
                                                        xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
-                       if (GPUStatistics.DISPLAY_STATISTICS)GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+                       if (GPUStatistics.DISPLAY_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-                       int status = cudnnConvolutionBackwardFilter(cudnnHandle, one(), xTensorDesc, imagePointer,
+                       int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer,
                                                        doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
-                       if (GPUStatistics.DISPLAY_STATISTICS)GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
+                       if (GPUStatistics.DISPLAY_STATISTICS)
+                               GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
 
-                       if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                       if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
-               }
-               finally {
+               } catch (CudaException e) {
+                       throw new DMLRuntimeException("Error in conv2dBackwardFilter in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+               } finally {
                        long t3=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
 
                        if(workSpace != null && sizeInBytes != 0)
-                               cudaFreeHelper(instName, workSpace);
+                               gCtx.cudaFreeHelper(instName, workSpace);
                        if(dwDesc != null)
                                cudnnDestroyFilterDescriptor(dwDesc);
 
@@ -792,6 +897,7 @@ public class LibMatrixCUDA {
 
        /**
         * This method computes the backpropagation errors for previous layer of convolution operation
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param filter filter used in conv2d
         * @param dout errors from next layer
@@ -811,11 +917,12 @@ public class LibMatrixCUDA {
         * @param Q output activation width
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void conv2dBackwardData(String instName, MatrixObject filter, MatrixObject dout,
+       public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout,
                                                                                MatrixObject output, int N, int C, int H, int W, int K, int R,
                                                                                int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                                                                                int Q) throws DMLRuntimeException {
-               cudnnFilterDescriptor wDesc = null;
+        LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
+        cudnnFilterDescriptor wDesc = null;
                cudnnConvolutionDescriptor convDesc = null;
 
                Pointer workSpace = null;
@@ -825,13 +932,13 @@ public class LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                        // Allocate descriptors
                        wDesc = allocateFilterDescriptor(K, C, R, S);
-                       cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(dout, N, K, P, Q);
-                       cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(output, N, C, H, W);
+                       cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, K, P, Q);
+                       cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, output, N, C, H, W);
 
                        // Allocate data
-                       Pointer w = getDensePointer(filter, true, instName);
-                       Pointer dy = getDensePointer(dout, true, instName);
-                       Pointer dx = getDensePointer(output, true, instName);
+                       Pointer w = getDensePointer(gCtx, filter, true, instName);
+                       Pointer dy = getDensePointer(gCtx, dout, true, instName);
+                       Pointer dx = getDensePointer(gCtx, output, true, instName);
                        
                        int padding [] = { pad_h, pad_w };
                        int strides [] = { stride_h, stride_w };
@@ -841,25 +948,27 @@ public class LibMatrixCUDA {
                        // TODO: Select the best algorithm depending on the data and supported CUDA
                        int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
                        workSpace = new Pointer();
-                       cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
+                       cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx),
                                                        wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-                       int status = cudnnConvolutionBackwardData(cudnnHandle, one(), wDesc, w,
+                       int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w,
                                                        dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
 
                        if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
+               } catch (CudaException e) {
+                       throw new DMLRuntimeException("Error in conv2dBackwardData in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
                }
                finally {
                        long t3=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
 
                        if(workSpace != null && sizeInBytes != 0)
-                               cudaFreeHelper(instName, workSpace);
+                               gCtx.cudaFreeHelper(instName, workSpace);
                        if(wDesc != null)
                                cudnnDestroyFilterDescriptor(wDesc);
                        if(convDesc != null)
@@ -871,6 +980,7 @@ public class LibMatrixCUDA {
        
        /**
         * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param image image as matrix object
         * @param outputBlock output matrix
@@ -889,37 +999,40 @@ public class LibMatrixCUDA {
         * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void maxpooling(String instName, MatrixObject image,
+       public static void maxpooling(GPUContext gCtx, String instName, MatrixObject image,
                        MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                        int Q) throws DMLRuntimeException {
-               Pointer x = getDensePointer(image, true, instName);
-               cudnnTensorDescriptor xDesc = allocateTensorDescriptor(image, N, C, H, W);
-               performMaxpooling(instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+        Pointer x = getDensePointer(gCtx, image, true, instName);
+               cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
+               performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
        }
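
A hedged sketch of the wrapper above (shapes illustrative): a 3x3 max-pooling window with stride 1 and no padding over 32x32 maps gives P=Q=30 per the formulas in the javadoc:

    // Hypothetical: out is a pre-bound MatrixObject for the pooled result.
    LibMatrixCUDA.maxpooling(gCtx, "maxpool", image, out,
                    64, 16, 32, 32, 16, 3, 3, 0, 0, 1, 1, 30, 30);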
        
-       public static void performMaxpooling(String instName, Pointer x, cudnnTensorDescriptor xDesc,
+       public static void performMaxpooling(GPUContext gCtx, String instName, Pointer x, cudnnTensorDescriptor xDesc,
                        MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                        int Q) throws DMLRuntimeException {
-               Pointer y = getDensePointer(outputBlock, true, instName);
+        LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx);
+        Pointer y = getDensePointer(gCtx, outputBlock, true, instName);
                cudnnPoolingDescriptor poolingDesc = null;
 
                try {
                        long t1=0,t2=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                        // Allocate descriptors
-                       cudnnTensorDescriptor yDesc = allocateTensorDescriptor(outputBlock, N, C, P, Q);
+                       cudnnTensorDescriptor yDesc = allocateTensorDescriptor(gCtx, outputBlock, N, C, P, Q);
                        poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-                       int status = cudnnPoolingForward(cudnnHandle, poolingDesc, one(), xDesc, x, zero(), yDesc, y);
+                       int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
 
                        if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not execute cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
+               } catch (CudaException e) {
+                       throw new DMLRuntimeException("Error in maxpooling in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
                }
                finally {
                        long t3=0;
@@ -932,6 +1045,7 @@ public class LibMatrixCUDA {
        
        /**
         * performs relu followed by maxpooling on GPU by exploiting cudnnPoolingForward(...)
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param image image as matrix object
         * @param outputBlock output matrix
@@ -950,22 +1064,24 @@ public class LibMatrixCUDA {
         * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void reluMaxpooling(String instName, MatrixObject image,
+       public static void reluMaxpooling(GPUContext gCtx, String instName, MatrixObject image,
                        MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                        int Q) throws DMLRuntimeException {
-               cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
+        LOG.trace("GPU : reluMaxpooling" + ", GPUContext=" + gCtx);
+        cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
                long size  = image.getNumRows() * image.getNumColumns() * Sizeof.DOUBLE;
-               Pointer tmp = allocate(size);
-               performCuDNNReLU(instName, image, tmp, srcTensorDesc);
-               cudaDeviceSynchronize(); // It seemed like the cudnn operation in performCuDNNReLU was being done asynchronously, this adds the necessary sync
-               performMaxpooling(instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-               cudaFreeHelper(tmp);
+               Pointer tmp = gCtx.allocate(size);
+               performCuDNNReLU(gCtx, instName, image, tmp, srcTensorDesc);
+               //cudaDeviceSynchronize; // It seemed like the cudnn operation in performCuDNNReLU was being done asynchronously, this adds the necessary sync
+               performMaxpooling(gCtx, instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+               gCtx.cudaFreeHelper(tmp);
        }
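
Since this is just performCuDNNReLU into a temporary device buffer followed by performMaxpooling, the call shape matches maxpooling; a hedged sketch with the same illustrative dimensions:

    // Hypothetical: fuses relu(image) and 3x3/stride-1 max-pooling;
    // the intermediate buffer is allocated and freed through gCtx.
    LibMatrixCUDA.reluMaxpooling(gCtx, "relu_maxpool", image, out,
                    64, 16, 32, 32, 16, 3, 3, 0, 0, 1, 1, 30, 30);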
 
        /**
         * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
         * This method computes the backpropagation errors for previous layer of maxpooling operation
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param image image as matrix object
         * @param dout                  delta matrix, output of previous layer
@@ -985,38 +1101,39 @@ public class LibMatrixCUDA {
         * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void maxpoolingBackward(String instName, MatrixObject image, MatrixObject dout,
+       public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
                                                                                MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
                                                                                int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                                                                                int Q) throws DMLRuntimeException {
-               Pointer y = null;
+        LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
+        Pointer y = null;
                cudnnPoolingDescriptor poolingDesc = null;
 
                try {
                        long t1=0, t2=0, t3=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                        // Allocate descriptors
-                       cudnnTensorDescriptor xDesc = allocateTensorDescriptor(image, N, C, H, W);
-                       cudnnTensorDescriptor yDesc = allocateTensorDescriptor(dout, N, C, P, Q);
-                       cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(outputBlock, N, C, H, W);
-                       cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(dout, N, C, P, Q);
+                       cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
+                       cudnnTensorDescriptor yDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
+                       cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(gCtx, outputBlock, N, C, H, W);
+                       cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(gCtx, dout, N, C, P, Q);
 
                        poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
 
                        // Calling PoolForward first, y is one of the inputs for poolBackward
                        // TODO: Remove calling poolForward after necessary changes at language level for poolBackward
                        long numBytes = N*C*P*Q*Sizeof.DOUBLE;
-                       y = allocate(numBytes);
+                       y = gCtx.allocate(numBytes);
 
                        // Allocate data
-                       Pointer x = getDensePointer(image, true, instName);
-                       Pointer dx = getDensePointer(outputBlock, true, instName);
-                       Pointer dy = getDensePointer(dout, true, instName);
+                       Pointer x = getDensePointer(gCtx, image, true, instName);
+                       Pointer dx = getDensePointer(gCtx, outputBlock, true, instName);
+                       Pointer dy = getDensePointer(gCtx, dout, true, instName);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-                       int status = cudnnPoolingForward(cudnnHandle, poolingDesc, one(), xDesc, x, zero(), yDesc, y);
+                       int status = cudnnPoolingForward(getCudnnHandle(gCtx), poolingDesc, one(), xDesc, x, zero(), yDesc, y);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
 
                        if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
@@ -1024,19 +1141,21 @@ public class LibMatrixCUDA {
                        }
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-                       status = cudnnPoolingBackward(cudnnHandle, poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
+                       status = cudnnPoolingBackward(getCudnnHandle(gCtx), poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
 
                        if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not execute cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
+               } catch (CudaException e) {
+                       throw new DMLRuntimeException("Error in maxpoolingBackward in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
                }
                finally {
                        long t4=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime();
 
                        if(y != null)
-                               cudaFreeHelper(instName, y);
+                               gCtx.cudaFreeHelper(instName, y);
                        if(poolingDesc != null)
                                cudnnDestroyPoolingDescriptor(poolingDesc);
 
@@ -1044,21 +1163,24 @@ public class LibMatrixCUDA {
                }
        }
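
A hedged sketch of the backward call (shapes illustrative); note that, per the TODO in the body, y is recomputed internally with cudnnPoolingForward before cudnnPoolingBackward runs:

    // Hypothetical: dout holds errors w.r.t. the pooled output (N x C*P*Q),
    // dx receives errors w.r.t. the input (N x C*H*W).
    LibMatrixCUDA.maxpoolingBackward(gCtx, "maxpool_bwd", image, dout,
                    dx, 64, 16, 32, 32, 16, 3, 3, 0, 0, 1, 1, 30, 30);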
        
-       private static void performCuDNNReLU(String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
+       private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
                long t0=0;
                try {
-                       cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
+            LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
+            cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
                        
-                       Pointer srcData = getDensePointer(in, true, instName);
+                       Pointer srcData = getDensePointer(gCtx, in, true, instName);
                        cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
                        cudnnCreateActivationDescriptor(activationDescriptor);
                        double dummy = -1;
                        cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
                        if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-                       cudnnActivationForward(cudnnHandle, activationDescriptor,
+                       cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor,
                                                        one(), srcTensorDesc, srcData,
                                                        zero(), dstTensorDesc, dstData);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
+               } catch (CudaException e) {
+                       throw new DMLRuntimeException("Error in performCuDNNReLU in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
                }
                finally {
                        long t1=0;
@@ -1071,30 +1193,34 @@ public class LibMatrixCUDA {
        /**
         * Performs the relu operation on the GPU.
         * @param ec currently active {@link ExecutionContext}
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param in input matrix
         * @param outputName    name of the output matrix
         * @throws DMLRuntimeException  if an error occurs
         */
-       public static void relu(ExecutionContext ec, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
+       public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
+               if (ec.getGPUContext() != gCtx)
+                       throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
                long N = in.getNumRows();
                long CHW = in.getNumColumns();
                MatrixObject output = ec.getMatrixObject(outputName);
                getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);        // Allocated the dense output matrix
                long t0=0;
-               cudnnTensorDescriptor srcTensorDesc = ((JCudaObject)in.getGPUObject()).getTensorDescriptor();
+               cudnnTensorDescriptor srcTensorDesc = in.getGPUObject(gCtx).getTensorDescriptor();
                if(N*CHW >= numDoublesIn2GB ||  srcTensorDesc == null) {
-                       // Invokes relu(double* A,  double* ret, int rlen, int clen)
+            LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
+            // Invokes relu(double* A,  double* ret, int rlen, int clen)
                        if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-                       Pointer dstData = getDensePointer(output, instName);
-                       Pointer srcData = getDensePointer(in, instName); // TODO: FIXME: Add sparse kernel support for relu
-                       kernels.launchKernel("relu",
+                       Pointer dstData = getDensePointer(gCtx, output, instName);
+                       Pointer srcData = getDensePointer(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
+                       getCudaKernels(gCtx).launchKernel("relu",
                                                        ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int)CHW),
                                                        srcData, dstData, (int)N, (int) CHW);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
                }
                else {
-                       performCuDNNReLU(instName, in, getDensePointer(output, true, instName), srcTensorDesc);
+                       performCuDNNReLU(gCtx, instName, in, getDensePointer(gCtx, output, true, instName), srcTensorDesc);
                }
        }
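
The size test above routes large inputs (N*CHW at or above numDoublesIn2GB) and inputs without a cached tensor descriptor to the custom "relu" kernel, and everything else to cuDNN. A hedged caller sketch (variable bindings hypothetical):

    // Hypothetical: "out" was bound as the output variable beforehand.
    GPUContext gCtx = ec.getGPUContext();
    LibMatrixCUDA.relu(ec, gCtx, "relu", in, "out");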
 
@@ -1114,17 +1240,21 @@ public class LibMatrixCUDA {
         * Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...)
         *
         * @param ec execution context
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
         * @param left input matrix, as in a tsmm expression like A %*% A' or A' %*% A, we just need to check whether the left one is transposed or not, I named it 'left'
         * @param outputName output matrix name
         * @param isLeftTransposed if true, left transposed
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void matmultTSMM(ExecutionContext ec, String instName, MatrixObject left, String outputName,
+       public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName,
                                                                                boolean isLeftTransposed) throws DMLRuntimeException {
-               if(isInSparseFormat(left)) {
+               LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx);
+               if (ec.getGPUContext() != gCtx)
+                       throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+               if(isInSparseFormat(gCtx, left)) {
                        // For sparse TSMM, invoke matmult (TODO: possible performance improvement)
-                       matmult(ec, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
+                       matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
                        return;
                }
 
@@ -1145,22 +1275,22 @@ public class LibMatrixCUDA {
                int lda = (int) (isLeftTransposed ? m : k);
                int ldc = m;
 
-               if(!left.getGPUObject().isAllocated())
-                       throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject().isAllocated());
-               if(!output.getGPUObject().isAllocated())
-                       throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject().isAllocated());
+               if(!left.getGPUObject(gCtx).isAllocated())
+                       throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject(gCtx).isAllocated());
+               if(!output.getGPUObject(gCtx).isAllocated())
+                       throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject(gCtx).isAllocated());
 
-               Pointer A = getDensePointer(left, instName);
-               Pointer C = getDensePointer(output, instName);
+               Pointer A = getDensePointer(gCtx, left, instName);
+               Pointer C = getDensePointer(gCtx, output, instName);
 
                long t0=0, t1=0;
 
                if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-               JCublas2.cublasDsyrk(cublasHandle, cublasFillMode.CUBLAS_FILL_MODE_LOWER, transa, m, k, one(), A, lda, zero(), C, ldc);
+               JCublas2.cublasDsyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER, transa, m, k, one(), A, lda, zero(), C, ldc);
                if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0);
 
                if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-               copyUpperToLowerTriangle(instName, output);
+               copyUpperToLowerTriangle(gCtx, instName, output);
                if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - t1);
        }
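
A hedged sketch of the tsmm entry point; with isLeftTransposed=false this computes A %*% t(A), with true it computes t(A) %*% A (the binding of "out" in the ExecutionContext is assumed):

    // Hypothetical: dense A; sparse inputs fall back to matmult above.
    LibMatrixCUDA.matmultTSMM(ec, gCtx, "tsmm", A, "out", false);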
 
@@ -1168,22 +1298,23 @@ public class LibMatrixCUDA {
         * Used for all versions of TSMM where the result is known to be symmetric.
         * Hence, we compute only the upper triangular matrix and copy this partial
         * result down to lower triangular matrix once.
-        *
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName instruction name
         * @param ret upper triangular matrix
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       private static void copyUpperToLowerTriangle(String instName, MatrixObject ret) throws DMLRuntimeException {
-               if(isInSparseFormat(ret)) {
+       private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) throws DMLRuntimeException {
+        LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx);
+        if(isInSparseFormat(gCtx, ret)) {
                        throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented");
                }
                if(ret.getNumRows() != ret.getNumColumns()) {
                        throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle");
                }
                int dim = (int) ret.getNumRows();
-               kernels.launchKernel("copy_u2l_dense",
+               getCudaKernels(gCtx).launchKernel("copy_u2l_dense",
                                                ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
-                                               getDensePointer(ret, instName), dim, dim*dim);
+                                               getDensePointer(gCtx, ret, instName), dim, dim*dim);
        }
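
What the copy_u2l_dense kernel computes, sketched on the host for clarity (this illustrates the index arithmetic only; it is not the kernel's actual CUDA source):

    // Mirror the upper triangle of a dim x dim row-major matrix M
    // into its lower triangle: M[j][i] = M[i][j] for all i < j.
    for (int i = 0; i < dim; i++)
        for (int j = i + 1; j < dim; j++)
            M[j * dim + i] = M[i * dim + j];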
 
 
@@ -1201,40 +1332,43 @@ public class LibMatrixCUDA {
         * Examines sparsity and shapes and routes call to appropriate method
         * from cuBLAS or cuSparse
         * C = op(A) x op(B)
-        * @param ec                                                                    Current {@link ExecutionContext} instance
-        * @param instName name of the invoking instruction to record{@link Statistics}.
-        * @param left1                                                         Matrix A
-        * @param right1                                                        Matrix B
-        * @param outputName                                    Name of the output matrix C (in code generated after LOP layer)
-        * @param isLeftTransposed1             op for A, transposed or not
-        * @param isRightTransposed1    op for B, tranposed or not
+        * @param ec                    Current {@link ExecutionContext} instance
+        * @param gCtx                  a valid {@link GPUContext}
+        * @param instName              name of the invoking instruction to record {@link Statistics}.
+        * @param left                  Matrix A
+        * @param right                 Matrix B
+        * @param outputName            Name of the output matrix C (in code generated after LOP layer)
+        * @param isLeftTransposed      op for A, transposed or not
+        * @param isRightTransposed     op for B, transposed or not
         * @return      output of matrix multiply
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static MatrixObject matmult(ExecutionContext ec, String instName, MatrixObject left1, MatrixObject right1, String outputName,
-                                                                               boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
-
-               if(!left1.getGPUObject().isAllocated() || !right1.getGPUObject().isAllocated())
-                       throw new DMLRuntimeException("One of input is not allocated:" + left1.getGPUObject().isAllocated() + " " + right1.getGPUObject().isAllocated());
+       public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName,
+                                                                               boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+               if (ec.getGPUContext() != gCtx)
+                       throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+               LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
+               if(!left.getGPUObject(gCtx).isAllocated() || !right.getGPUObject(gCtx).isAllocated())
+                       throw new DMLRuntimeException("One of input is not allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + right.getGPUObject(gCtx).isAllocated());
 
-               boolean bothDense = !left1.getGPUObject().isInSparseFormat() && !right1.getGPUObject().isInSparseFormat();
-               boolean bothSparse = left1.getGPUObject().isInSparseFormat() && right1.getGPUObject().isInSparseFormat();
+               boolean bothDense = !left.getGPUObject(gCtx).isSparse() && !right.getGPUObject(gCtx).isSparse();
+               boolean bothSparse = left.getGPUObject(gCtx).isSparse() && right.getGPUObject(gCtx).isSparse();
 
                MatrixObject output = ec.getMatrixObject(outputName);
 
                if (bothDense) {                // Dense C = Dense A * Dense B
                        // For both dense, do cuBLAS
                        getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);        // Allocated the dense output matrix
-                       denseDenseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
+                       denseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
                }
                else if (bothSparse){   // Sparse C = Sparse A * Sparse B
                        ec.allocateGPUMatrixObject(outputName);
-                       bothSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
+                       bothSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
                }
                else {  // Either of A or B is sparse, Sparse C = Sparse/Dense A * Dense/Sparse B
                        // Convert the dense to sparse and use the cusparseDcsrgemm routine
                        ec.allocateGPUMatrixObject(outputName);
-                       eitherSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
+                       eitherSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
                }
 
                return output;
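
The dispatch is purely on device-side storage format: dense x dense goes to cuBLAS, sparse x sparse to cuSparse, and the mixed cases to eitherSparseMatmult. A hedged sketch of the top-level call (variable bindings and instruction name hypothetical):

    // Hypothetical: computes C = A %*% t(B) into the variable "C".
    MatrixObject C = LibMatrixCUDA.matmult(ec, gCtx, "matmult", A, B, "C", false, true);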
@@ -1243,20 +1377,18 @@ public class LibMatrixCUDA {
        /**
         * One of the matrices is sparse, the other dense
         * C = op(A) x op(B)
-        * @param instName the invoking instruction's name for record {@link Statistics}.
-        * @param output                                allocated output object for C on host to which GPU output will be attached
-        * @param left                                  Matrix A on host
-        * @param right                                 Matrix B on host
-        * @param isLeftTransposed              op for A, tranposed or not
-        * @param isRightTransposed             op for B, transposed or not
+        * @param gCtx              a valid {@link GPUContext}
+        * @param instName          the invoking instruction's name for record {@link Statistics}.
+        * @param output            allocated output object for C on host to which GPU output will be attached
+        * @param left              Matrix A on host
+        * @param right             Matrix B on host
+        * @param isLeftTransposed  op for A, transposed or not
+        * @param isRightTransposed op for B, transposed or not
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void eitherSparseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right,
+       private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
                                                                                boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
 
-               int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-               int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-
                int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows());
                int n = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
                int k = (int) (isLeftTransposed ? left.getNumRows() :  left.getNumColumns());
@@ -1268,12 +1400,12 @@ public class LibMatrixCUDA {
                        throw new DMLRuntimeException("Incorrect dimensions");
 
 
-               if (left.getGPUObject().isInSparseFormat()) {
+               if (left.getGPUObject(gCtx).isSparse()) {
                        // Left sparse, right dense
-                       sparseDenseMatmult(instName, output, left, right, isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
+                       sparseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed, m, n, k);
                } else {
                        // Left dense, right sparse
-                       denseSparseMatmult(instName, output, right, left, isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
+                       denseSparseMatmult(gCtx, instName, left, right, output, isLeftTransposed, isRightTransposed, m, n, k);
                }
        }
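
A worked instance of the m/n/k bookkeeping above (shapes hypothetical): a sparse 10000 x 1000 left operand and a dense 1000 x 200 right operand with no transposes give m=10000, n=200, k=1000 and take the sparse-dense branch:

    // For C(m x n) = op(A)(m x k) %*% op(B)(k x n), no transposes:
    int m = 10000;  // left.getNumRows()
    int n = 200;    // right.getNumColumns()
    int k = 1000;   // left.getNumColumns(), must equal right.getNumRows()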
 
@@ -1281,73 +1413,72 @@ public class LibMatrixCUDA {
         * C = op(A) * op(B) where A is dense and B is sparse
         * If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
         * otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
-        * @param output ?
-        * @param right ?
-        * @param left ?
-        * @param isLeftTransposed ?
-        * @param isRightTransposed ?
-        * @param transA ?
-        * @param transB ?
+        * @param left {@link MatrixObject} of A
+        * @param right {@link MatrixObject} of B
+        * @param output {@link MatrixObject} of the output matrix C
+        * @param isLeftTransposed whether matrix A needs to be transposed
+        * @param isRightTransposed whether matrix B needs to be transposed
         * @param m ?
         * @param n ?
         * @param k ?
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void denseSparseMatmult(String instName, MatrixObject output, MatrixObject right, MatrixObject left,
-                                                                                boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k)
+       private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output,
+                                             boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
                                        throws DMLRuntimeException {
                // right sparse, left dense
-               CSRPointer B = ((JCudaObject)right.getGPUObject()).jcudaSparseMatrixPtr;
-               Pointer ADense = getDensePointer(left, instName);
+               CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+               Pointer ADense = getDensePointer(gCtx, left, instName);
                if (B.isUltraSparse(k, n)){
-                       LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Sparse-Sparse)");
-                       // Convert left to CSR and do cuSparse matmul
+            LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M" + ", GPUContext=" + gCtx);
+
+            // Convert left to CSR and do cuSparse matmul
                        int rowsA = (int)left.getNumRows();
                        int colsA = (int)left.getNumColumns();
 
-
                        long t0=0,t1=0, t2=0;
                        if (DMLScript.STATISTICS) t0 = System.nanoTime();
-                       Pointer AT = JCudaObject.transpose(ADense, rowsA, colsA, colsA, rowsA);
+                       Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-                       CSRPointer A = JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsA, colsA, AT);
+                       CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
 
                        if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
                        if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
-                       sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B);
+                       sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
                        A.deallocate();
-                       cudaFreeHelper(AT);
+                       gCtx.cudaFreeHelper(AT);
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
 
                } else {
-                       LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Dense-Dense)");
+            LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
                        // Convert right to dense and do a cuBlas matmul
                        // BDenseTransposed is a column major matrix
                        // Note the arguments to denseDenseMatmult to accommodate for this.
                        long t0=0, t1=0;
                        if (DMLScript.STATISTICS) t0 = System.nanoTime();
-                       Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, (int)right.getNumRows(), (int)right.getNumColumns());
+                       Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns());
                        if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
                        if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
                        if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(1);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-                       boolean allocated = output.getGPUObject().acquireDeviceModifyDense();   // To allocate the dense matrix
+                       boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();       // To allocate the dense matrix
                        if (GPUStatistics.DISPLAY_STATISTICS && allocated) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
-                       Pointer C = getDensePointer(output, instName);
-                       denseDenseMatmult(instName, C,
+                       Pointer C = getDensePointer(gCtx, output, instName);
+                       denseDenseMatmult(gCtx, instName, C,
                                                        (int) left.getNumRows(), (int) left.getNumColumns(),
                                                        (int) right.getNumColumns(), (int) right.getNumRows(),
                                                        isLeftTransposed, !isRightTransposed,
                                                        ADense, BDenseTransposed);
 
-                       cudaFreeHelper(instName, BDenseTransposed);
+                       gCtx.cudaFreeHelper(instName, BDenseTransposed);
        }
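
The branch above trades a dense-to-sparse conversion of A against a sparse-to-dense conversion of B; a hedged restatement of the decision (CSRPointer.isUltraSparse is the test used in this patch, and the comments summarize the two paths):

    if (B.isUltraSparse(k, n)) {
        // Transpose A, convert it to CSR, multiply via cusparseDcsrgemm,
        // then free the temporaries through gCtx.
    } else {
        // Densify B column-major and multiply via the cuBLAS dense path.
    }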
 
@@ -1355,81 +1486,79 @@ public class LibMatrixCUDA {
         * C = op(A) * op(B) where A is sparse and B is dense
         * If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
         * otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
+        * @param gCtx   a valid {@link GPUContext}
         * @param instName the invoking instruction's name for record {@link Statistics}.
-        * @param output ?
-        * @param left ?
-        * @param right ?
-        * @param isLeftTransposed ?
-        * @param isRightTransposed ?
-        * @param transA ?
-        * @param transB ?
+        * @param output the output matrix object
+        * @param left matrix A
+        * @param right matrix B
+        * @param isLeftTransposed if A needs to be transposed
+        * @param isRightTransposed if B needs to be transposed
         * @param m ?
         * @param n ?
         * @param k ?
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       protected static void sparseDenseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-                                                                                boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k)
+       private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
+                                                                                boolean isLeftTransposed, boole

<TRUNCATED>
