http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index d2309b0..708f291 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -18,65 +18,584 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.jcusparse.JCusparse.cusparseCreate;
+import static jcuda.jcusparse.JCusparse.cusparseDestroy;
+import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaGetDeviceCount;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.runtime.JCuda.cudaSetDevice;
+import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.LRUCacheMap;
 
-//FIXME merge JCudaContext into GPUContext as this context is anyway CUDA 
specific
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
 
-public abstract class GPUContext {
+/**
+ * Represents a context per GPU accessible through the same JVM
+ * Each context holds cublas, cusparse, cudnn... handles which are separate 
for each GPU
+ */
+public class GPUContext {
 
-       protected static GPUContext currContext;
-       public static volatile Boolean isGPUContextCreated = false;
+       protected static final Log LOG = 
LogFactory.getLog(GPUContext.class.getName());
 
-       protected GPUContext() {}
+  /** Eviction policies for {@link GPUContext#evict(long)} */
+       public enum EvictionPolicy {
+               LRU, LFU, MIN_EVICT
+       }
 
-       /**
-        * Gets device memory available for SystemML operations
-        * 
-        * @return available memory
-        */
-       public abstract long getAvailableMemory();
+       /** currently employed eviction policy */
+       public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
+
+       /** Map of free blocks allocate on GPU. maps size_of_block -> pointer 
on GPU */
+       private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new 
LRUCacheMap<>();
+
+       /** To record size of allocated blocks */
+       private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>();
+
+  /** active device assigned to this GPUContext instance */
+  private final int deviceNum;
+
+  /** list of allocated {@link GPUObject} instances allocated on {@link 
GPUContext#deviceNum} GPU
+   * These are matrices allocated on the GPU on which rmvar hasn't been called 
yet.
+   * If a {@link GPUObject} has more than one lock on it, it cannot be freed
+   * If it has zero locks on it, it can be freed, but it is preferrable to 
keep it around
+   * so that an extraneous host to dev transfer can be avoided */
+  private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
+
+  /** cudnnHandle specific to the active GPU for this GPUContext */
+  private cudnnHandle cudnnHandle;
+
+  /** cublasHandle specific to the active GPU for this GPUContext */
+  private cublasHandle cublasHandle;
+
+  /** cusparseHandle specific to the active GPU for this GPUContext */
+  private cusparseHandle cusparseHandle;
+
+  /** to launch custom CUDA kernel, specific to the active GPU for this 
GPUContext */
+  private JCudaKernels kernels;
 
        /**
-        * Ensures that all the CUDA cards on the current system are
-        * of the minimum required compute capability.
-        * (The minimum required compute capability is hard coded in {@link 
JCudaContext}.
-        * 
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public abstract void ensureComputeCapability() throws 
DMLRuntimeException;
-       
-       /**
-        * Singleton Factory method for creation of {@link GPUContext}
-        * @return GPU context
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        * The minimum CUDA Compute capability needed for SystemML.
+        * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per 
block are supported.
+        * If SystemML needs to run on an older card, this logic can be 
revisited.
         */
-       public static GPUContext getGPUContext() throws DMLRuntimeException {
-               if(currContext == null && DMLScript.USE_ACCELERATOR) {
-                       synchronized(isGPUContextCreated) {
-                               currContext = new JCudaContext();
-                               currContext.ensureComputeCapability();
-                               OptimizerUtils.GPU_MEMORY_BUDGET = 
currContext.getAvailableMemory();
-                               isGPUContextCreated = true;
-                       }
-               }
-               return currContext;
-       }
-       
-       public static GPUObject createGPUObject(MatrixObject mo) {
-               if(DMLScript.USE_ACCELERATOR) {
-                       synchronized(isGPUContextCreated) {
-                               if(currContext == null)
-                                       throw new RuntimeException("GPUContext 
is not created");
-                               if(currContext instanceof JCudaContext)
-                                       return new JCudaObject(mo);
-                       }
-               }
-               throw new RuntimeException("Cannot create createGPUObject when 
USE_ACCELERATOR is off");
-       }
-       public abstract void destroy() throws DMLRuntimeException;
-       
-       
+       final int MAJOR_REQUIRED = 3;
+       final int MINOR_REQUIRED = 0;
+
+  // Invoke cudaMemGetInfo to get available memory information. Useful if GPU 
is shared among multiple application.
+  public double GPU_MEMORY_UTILIZATION_FACTOR = 
ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+
+  protected GPUContext(int deviceNum) throws DMLRuntimeException {
+    this.deviceNum = deviceNum;
+    cudaSetDevice(deviceNum);
+
+    cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+
+    long free[] = {0};
+    long total[] = {0};
+    cudaMemGetInfo(free, total);
+
+    long start = System.nanoTime();
+    cudnnHandle = new cudnnHandle();
+    cudnnCreate(cudnnHandle);
+    cublasHandle = new cublasHandle();
+    cublasCreate(cublasHandle);
+    // For cublas v2, cublasSetPointerMode tells Cublas whether to expect 
scalar arguments on device or on host
+    // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
+    // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, 
cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
+    cusparseHandle = new cusparseHandle();
+    cusparseCreate(cusparseHandle);
+    kernels = new JCudaKernels(deviceNum);
+
+    GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
+    LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: 
" + (free[0] * (1e-6)) + " MB on " + this);
+
+  }
+
+  public int getDeviceNum() {
+    return deviceNum;
+  }
+
+  /**
+   * Sets the device for the calling thread.
+   * This method must be called after {@link GPUContextPool#getFromPool()}
+   * is called.
+   * If in a multi-threaded env like parfor, this method must be called when 
in the
+   * appropriate thread
+   */
+  public void initializeThread() {
+    cudaSetDevice(deviceNum);
+  }
+
+  @SuppressWarnings("unused")
+  public static int cudaGetDevice() {
+    int[] device = new int[1];
+    JCuda.cudaGetDevice(device);
+    return device[0];
+  }
+
+  /**
+   * Convenience method for {@link #allocate(String, long, int)}, defaults 
statsCount to 1.
+   *
+   * @param size size of data (in bytes) to allocate
+   * @return jcuda pointer
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  public Pointer allocate(long size) throws DMLRuntimeException {
+    return allocate(null, size, 1);
+  }
+
+  /**
+   * Convenience method for {@link #allocate(String, long, int)}, defaults 
statsCount to 1.
+   *
+   * @param instructionName name of instruction for which to record per 
instruction performance statistics, null if don't want to record
+   * @param size            size of data (in bytes) to allocate
+   * @return jcuda pointer
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  public Pointer allocate(String instructionName, long size) throws 
DMLRuntimeException {
+    return allocate(instructionName, size, 1);
+  }
+
+  /**
+   * Allocates temporary space on the device.
+   * Does not update bookkeeping.
+   * The caller is responsible for freeing up after usage.
+   *
+   * @param instructionName name of instruction for which to record per 
instruction performance statistics, null if don't want to record
+   * @param size            Size of data (in bytes) to allocate
+   * @param statsCount      amount to increment the cudaAllocCount by
+   * @return jcuda Pointer
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  public Pointer allocate(String instructionName, long size, int statsCount) 
throws DMLRuntimeException {
+    long t0 = 0, t1 = 0, end = 0;
+    Pointer A;
+    if (freeCUDASpaceMap.containsKey(size)) {
+      LOG.trace("GPU : in allocate from instruction " + instructionName + ", 
found free block of size " + (size / 1024.0) + " Kbytes from previously 
allocated block on " + this);
+      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
+      LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
+      A = freeList.pop();
+      if (freeList.isEmpty())
+        freeCUDASpaceMap.remove(size);
+      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
+    } else {
+      LOG.trace("GPU : in allocate from instruction " + instructionName + ", 
allocating new block of size " + (size / 1024.0) + " Kbytes on " + this);
+      if (DMLScript.STATISTICS) t0 = System.nanoTime();
+      ensureFreeSpace(instructionName, size);
+      A = new Pointer();
+      cudaMalloc(A, size);
+      if (DMLScript.STATISTICS) 
GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
+      if (DMLScript.STATISTICS) 
GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
+      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
+    }
+    // Set all elements to 0 since newly allocated space will contain garbage
+    if (DMLScript.STATISTICS) t1 = System.nanoTime();
+    LOG.trace("GPU : in allocate from instruction " + instructionName + ", 
setting block of size " + (size / 1024.0) + " Kbytes to zero on " + this);
+    cudaMemset(A, 0, size);
+    if (DMLScript.STATISTICS) end = System.nanoTime();
+    if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+      GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
+    if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - 
t1);
+    if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1);
+    cudaBlockSizeMap.put(A, size);
+    return A;
+
+  }
+
+  /**
+   * Does lazy cudaFree calls
+   *
+   * @param toFree {@link Pointer} instance to be freed
+   */
+  public void cudaFreeHelper(final Pointer toFree) {
+    cudaFreeHelper(null, toFree, false);
+  }
+
+  /**
+   * does lazy/eager cudaFree calls
+   *
+   * @param toFree {@link Pointer} instance to be freed
+   * @param eager  true if to be done eagerly
+   */
+  public void cudaFreeHelper(final Pointer toFree, boolean eager) {
+    cudaFreeHelper(null, toFree, eager);
+  }
+
+  /**
+   * Does lazy cudaFree calls
+   *
+   * @param instructionName name of the instruction for which to record per 
instruction free time, null if do not want to record
+   * @param toFree          {@link Pointer} instance to be freed
+   */
+  public void cudaFreeHelper(String instructionName, final Pointer toFree) {
+    cudaFreeHelper(instructionName, toFree, false);
+  }
+
+  /**
+   * Does cudaFree calls, lazily
+   *
+   * @param instructionName name of the instruction for which to record per 
instruction free time, null if do not want to record
+   * @param toFree          {@link Pointer} instance to be freed
+   * @param eager           true if to be done eagerly
+   */
+  public void cudaFreeHelper(String instructionName, final Pointer toFree, 
boolean eager) {
+    long t0 = 0;
+    assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state 
corrupted, cache block size map is not aware of a block it trying to free up";
+    long size = cudaBlockSizeMap.get(toFree);
+    if (eager) {
+      LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for 
instruction " + instructionName + " on " + this);
+      if (DMLScript.STATISTICS) t0 = System.nanoTime();
+      cudaFree(toFree);
+      cudaBlockSizeMap.remove(toFree);
+      if (DMLScript.STATISTICS) 
GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
+      if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.addAndGet(1);
+      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
+    } else {
+      LOG.trace("GPU : lazily freeing cuda memory for instruction " + 
instructionName + " on " + this);
+      LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
+      if (freeList == null) {
+        freeList = new LinkedList<Pointer>();
+        freeCUDASpaceMap.put(size, freeList);
+      }
+      freeList.add(toFree);
+    }
+  }
+
+  /**
+   * Thin wrapper over {@link GPUContext#evict(long)}
+   *
+   * @param size size to check
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  void ensureFreeSpace(long size) throws DMLRuntimeException {
+    ensureFreeSpace(null, size);
+  }
+
+  /**
+   * Thin wrapper over {@link GPUContext#evict(long)}
+   *
+   * @param instructionName instructionName name of the instruction for which 
performance measurements are made
+   * @param size            size to check
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  void ensureFreeSpace(String instructionName, long size) throws 
DMLRuntimeException {
+    if (size >= getAvailableMemory()) {
+      evict(instructionName, size);
+    }
+  }
+
+  /**
+   * Convenience wrapper over {@link GPUContext#evict(String, long)}
+   *
+   * @param GPUSize Desired size to be freed up on the GPU
+   * @throws DMLRuntimeException If no blocks to free up or if not enough 
blocks with zero locks on them.
+   */
+  protected void evict(final long GPUSize) throws DMLRuntimeException {
+    evict(null, GPUSize);
+  }
+
  /**
   * Memory on the GPU is tried to be freed up until either a chunk of needed size is
   * freed up or it fails.
   * First the set of reusable (lazily-freed) blocks is released. If that isn't enough,
   * the set of allocated matrix blocks with zero locks on them is freed up.
   * The process cycles through the sorted list of allocated {@link GPUObject} instances.
   * Sorting is based on number of (read) locks that have been obtained on it (reverse
   * order). It repeatedly frees up blocks on which there are zero locks until the
   * required size has been freed up.
   * // TODO: update it with hybrid policy
   *
   * @param instructionName name of the instruction for which performance measurements are made
   * @param neededSize      desired size to be freed up on the GPU
   * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them.
   */
  protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException {
    LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this);
    GPUStatistics.cudaEvictionCount.addAndGet(1);
    // Phase 1: eagerly release the lazily-freed blocks held in freeCUDASpaceMap,
    // starting from the least-recently-used size class, until enough is free.
    LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap;
    while (lruCacheMap.size() > 0) {
      if (neededSize <= getAvailableMemory())
        break;
      Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
      LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
      Long size = toFreeListPair.getKey();
      // Free one pointer per iteration; drop the size class once its list is empty
      Pointer toFree = toFreeList.pop();
      if (toFreeList.isEmpty())
        lruCacheMap.remove(size);
      cudaFreeHelper(instructionName, toFree, true);
    }

    if (neededSize <= getAvailableMemory())
      return;

    if (allocatedGPUObjects.size() == 0) {
      throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
    }

    // Phase 2: sort allocated objects so that the best eviction candidate ends up
    // at the END of the list (locked objects first, then unlocked per policy).
    // NOTE(review): for two locked objects this comparator returns 0 while ordering
    // locked-vs-unlocked by lock count; it is not obviously transitive, and
    // Collections.sort may throw "Comparison method violates its general contract"
    // for some inputs — worth confirming/hardening.
    Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
      @Override
      public int compare(GPUObject p1, GPUObject p2) {
        long p1Val = p1.readLocks.get();
        long p2Val = p2.readLocks.get();

        if (p1Val > 0 && p2Val > 0) {
          // Both are locked, so don't sort
          return 0;
        } else if (p1Val > 0 || p2Val > 0) {
          // Put the unlocked one to RHS
          return Long.compare(p2Val, p1Val);
        } else {
          // Both are unlocked; order by the configured eviction policy

          if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
            // MIN_EVICT: prefer the smallest block that still satisfies neededSize;
            // if none is large enough, prefer the largest available block.
            long p1Size = 0;
            long p2Size = 0;
            try {
              p1Size = p1.getSizeOnDevice() - neededSize;
              p2Size = p2.getSizeOnDevice() - neededSize;
            } catch (DMLRuntimeException e) {
              // Comparator cannot throw checked exceptions; wrap and rethrow
              throw new RuntimeException(e);
            }

            if (p1Size >= 0 && p2Size >= 0) {
              return Long.compare(p2Size, p1Size);
            } else {
              return Long.compare(p1Size, p2Size);
            }
          } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
            // LRU/LFU both compare the timestamp field (descending), so the
            // least-recently/least-frequently used object lands at the tail
            return Long.compare(p2.timestamp.get(), p1.timestamp.get());
          } else {
            throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
          }
        }
      }
    });

    // Phase 3: evict from the tail of the sorted list until enough memory is free.
    // NOTE(review): this loop assumes clearData(true) removes the object from
    // allocatedGPUObjects (presumably via removeRecordedUsage); if it does not,
    // this loop never terminates — TODO confirm against GPUObject.clearData.
    while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
      GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
      if (toBeRemoved.readLocks.get() > 0) {
        // Tail object is locked => every object is locked; nothing can be evicted
        throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
      }
      if (toBeRemoved.dirty) {
        // Persist device-side updates back to the host before discarding
        toBeRemoved.copyFromDeviceToHost();
      }

      toBeRemoved.clearData(true);
    }
  }
+
+  /**
+   * Whether the GPU associated with this {@link GPUContext} has recorded the 
usage of a certain block
+   *
+   * @param o the block
+   * @return true if present, false otherwise
+   */
+  public boolean isBlockRecorded(GPUObject o) {
+    return allocatedGPUObjects.contains(o);
+  }
+
+  /**
+   * @param o {@link GPUObject} instance to record
+   * @see GPUContext#allocatedGPUObjects
+   * Records the usage of a matrix block
+   */
+  public void recordBlockUsage(GPUObject o) {
+    allocatedGPUObjects.add(o);
+  }
+
+  /**
+   * @param o {@link GPUObject} instance to remove from the list of allocated 
GPU objects
+   * @see GPUContext#allocatedGPUObjects
+   * Records that a block is not used anymore
+   */
+  public void removeRecordedUsage(GPUObject o) {
+    allocatedGPUObjects.remove(o);
+  }
+
+  /**
+   * Gets the available memory on GPU that SystemML can use
+   *
+   * @return the available memory in bytes
+   */
+  public long getAvailableMemory() {
+    long free[] = {0};
+    long total[] = {0};
+    cudaMemGetInfo(free, total);
+    return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
+  }
+
+  /**
+   * Makes sure that GPU that SystemML is trying to use has the minimum 
compute capability needed
+   *
+   * @throws DMLRuntimeException if the compute capability is less than what 
is required
+   */
+  public void ensureComputeCapability() throws DMLRuntimeException {
+    int[] devices = {-1};
+    cudaGetDeviceCount(devices);
+    if (devices[0] == -1) {
+      throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 
devices");
+    }
+    boolean isComputeCapable = true;
+    for (int i = 0; i < devices[0]; i++) {
+      cudaDeviceProp properties = GPUContextPool.getGPUProperties(i);
+      int major = properties.major;
+      int minor = properties.minor;
+      if (major < MAJOR_REQUIRED) {
+        isComputeCapable = false;
+      } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) {
+        isComputeCapable = false;
+      }
+    }
+    if (!isComputeCapable) {
+      throw new DMLRuntimeException("One of the CUDA cards on the system has 
compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED);
+    }
+  }
+
  /**
   * Creates a new {@link GPUObject} for the given {@link MatrixObject},
   * bound to this context's device.
   *
   * @param mo the matrix object to wrap
   * @return a new GPUObject tied to this GPUContext
   */
  public GPUObject createGPUObject(MatrixObject mo) {
    return new GPUObject(this, mo);
  }
+
+  /**
+   * Gets the device properties for the active GPU (set with cudaSetDevice())
+   *
+   * @return the device properties
+   */
+  public cudaDeviceProp getGPUProperties() throws DMLRuntimeException {
+    return GPUContextPool.getGPUProperties(deviceNum);
+  }
+
+  /**
+   * Gets the maximum number of threads per block for "active" GPU
+   *
+   * @return the maximum number of threads per block
+   */
+  public int getMaxThreadsPerBlock() throws DMLRuntimeException {
+    cudaDeviceProp deviceProps = getGPUProperties();
+    return deviceProps.maxThreadsPerBlock;
+  }
+
+  /**
+   * Gets the maximum number of blocks supported by the active cuda device
+   *
+   * @return the maximum number of blocks supported
+   */
+  public int getMaxBlocks() throws DMLRuntimeException {
+    cudaDeviceProp deviceProp = getGPUProperties();
+    return deviceProp.maxGridSize[0];
+  }
+
+  /**
+   * Gets the shared memory per block supported by the active cuda device
+   *
+   * @return the shared memory per block
+   */
+  public long getMaxSharedMemory() throws DMLRuntimeException {
+    cudaDeviceProp deviceProp = getGPUProperties();
+    return deviceProp.sharedMemPerBlock;
+  }
+
+  /**
+   * Gets the warp size supported by the active cuda device
+   *
+   * @return the warp size
+   */
+  public int getWarpSize() throws DMLRuntimeException {
+    cudaDeviceProp deviceProp = getGPUProperties();
+    return deviceProp.warpSize;
+  }
+
  /** @return the cudnn handle bound to this context's GPU (null after {@link #destroy()}) */
  public cudnnHandle getCudnnHandle() {
    return cudnnHandle;
  }

  /** @return the cublas handle bound to this context's GPU (null after {@link #destroy()}) */
  public cublasHandle getCublasHandle() {
    return cublasHandle;
  }

  /** @return the cusparse handle bound to this context's GPU (null after {@link #destroy()}) */
  public cusparseHandle getCusparseHandle() {
    return cusparseHandle;
  }

  /** @return the custom-kernel launcher for this context's GPU */
  public JCudaKernels getKernels() {
    return kernels;
  }
+
+  /**
+   * Destroys this GPUContext object
+   * This method MUST BE called so that the GPU is available to be used again
+   *
+   * @throws DMLRuntimeException if error
+   */
+  public void destroy() throws DMLRuntimeException {
+    LOG.trace("GPU : this context was destroyed, this = " + this.toString());
+    clearMemory();
+    cudnnDestroy(cudnnHandle);
+    cublasDestroy(cublasHandle);
+    cusparseDestroy(cusparseHandle);
+    cudnnHandle = null;
+    cublasHandle = null;
+    cusparseHandle = null;
+
+  }
+
+  /**
+   * Clears all memory used by this {@link GPUContext}
+   * Be careful to ensure that no memory is currently being used before 
invoking this
+   * @throws DMLRuntimeException
+   */
+  public void clearMemory() throws DMLRuntimeException {
+    while (allocatedGPUObjects.isEmpty()) {
+      GPUObject o = allocatedGPUObjects.get(0);
+      o.clearData();
+    }
+    for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
+      for (Pointer p : l) {
+        cudaFreeHelper(p, true);
+      }
+    }
+    cudaBlockSizeMap.clear();
+    freeCUDASpaceMap.clear();
+    allocatedGPUObjects.clear();
+  }
+
+  @Override
+  public String toString() {
+    return "GPUContext{" +
+            "deviceNum=" + deviceNum +
+            '}';
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
new file mode 100644
index 0000000..6452651
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
+import static jcuda.driver.JCudaDriver.cuInit;
+import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
+
+import java.util.LinkedList;
+import java.util.Queue;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.driver.JCudaDriver;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcudnn.JCudnn;
+import jcuda.jcusparse.JCusparse;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
+
+public class GPUContextPool {
+
+  protected static final Log LOG = 
LogFactory.getLog(GPUContextPool.class.getName());
+
+  /** Maximum number of gpus to use, -1 for all */
+  public static int PER_PROCESS_MAX_GPUS = -1;
+
+  /** Whether cuda has been initialized */
+  static boolean initialized = false;
+
+  /** The total number of cuda devices on this machine */
+  static int deviceCount = -1;
+
+  /** Stores the cached deviceProperties */
+  static cudaDeviceProp[] deviceProperties;
+
+  /** Set of free GPUContexts */
+  static Queue<GPUContext> freePool = new LinkedList<>();
+
+  /**
+   * Static initialization of the number of devices
+   * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of error
+   * Initializes the CUDA driver
+   * All these need be done once, and not per GPU
+   */
+  public synchronized static void initializeGPU() throws DMLRuntimeException {
+    GPUContext.LOG.info("Initializing CUDA");
+    long start = System.nanoTime();
+    JCuda.setExceptionsEnabled(true);
+    JCudnn.setExceptionsEnabled(true);
+    JCublas2.setExceptionsEnabled(true);
+    JCusparse.setExceptionsEnabled(true);
+    JCudaDriver.setExceptionsEnabled(true);
+    cuInit(0); // Initialize the driver
+
+    int deviceCountArray[] = {0};
+    cuDeviceGetCount(deviceCountArray);        // Obtain the number of devices
+    deviceCount = deviceCountArray[0];
+    deviceProperties = new cudaDeviceProp[deviceCount];
+
+    if (PER_PROCESS_MAX_GPUS > 0)
+       deviceCount = Math.min(PER_PROCESS_MAX_GPUS, deviceCount);
+
+    // Initialize the list of devices
+    for (int i = 0; i < deviceCount; i++) {
+      cudaDeviceProp properties = new cudaDeviceProp();
+      cudaGetDeviceProperties(properties, i);
+      deviceProperties[i] = properties;
+    }
+
+    // Initialize the pool of GPUContexts
+    for (int i=0; i<deviceCount; i++){
+      GPUContext gCtx = new GPUContext(i);
+      freePool.add(gCtx);
+    }
+
+    GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount);
+    //int[] device = {-1};
+    //cudaGetDevice(device);
+    //cudaDeviceProp prop = getGPUProperties(device[0]);
+    //int maxBlocks = prop.maxGridSize[0];
+    //int maxThreadsPerBlock = prop.maxThreadsPerBlock;
+    //long sharedMemPerBlock = prop.sharedMemPerBlock;
+    //LOG.debug("Active CUDA device number : " + device[0]);
+    //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks 
+ "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
+    initialized = true;
+    GPUStatistics.cudaInitTime = System.nanoTime() - start;
+  }
+
+  /**
+   * Gets an initialized GPUContext from a pool of GPUContexts, each linked to 
a GPU
+   * @return null if not more GPUContexts in pool, a valid GPUContext otherwise
+   */
+  public static synchronized GPUContext getFromPool() throws 
DMLRuntimeException {
+    if (!initialized) initializeGPU();
+    GPUContext gCtx = freePool.poll();
+    LOG.trace("GPU : got GPUContext (" + gCtx + ") from freePool. New sizes - 
FreePool[" + freePool.size() + "]");
+    return gCtx;
+  }
+
  /**
   * Get the number of free GPUContexts currently in the pool.
   *
   * @return number of free GPUContexts
   */
  public static synchronized int getAvailableCount() {
    return freePool.size();
  }
+
+  /**
+   * Gets the device properties
+   * @param device the device number (on a machine with more than 1 GPU)
+   * @return the device properties
+   * @throws DMLRuntimeException if there is problem initializing the 
GPUContexts
+   */
+  static cudaDeviceProp getGPUProperties(int device) throws 
DMLRuntimeException {
+    // do once - initialization of GPU
+    if (!initialized) initializeGPU();
+    return deviceProperties[device];
+  }
+
+  public static int getDeviceCount() throws DMLRuntimeException {
+    if (!initialized) initializeGPU();
+    return deviceCount;
+  }
+
+  /**
+   * Returns a {@link GPUContext} back to the pool of {@link GPUContext}s
+   * @param gCtx the GPUContext instance to return. If null, nothing happens
+   * @throws DMLRuntimeException if error
+   */
+  public static synchronized void returnToPool(GPUContext gCtx) throws 
DMLRuntimeException {
+    if (gCtx == null)
+      return;
+    freePool.add(gCtx);
+    LOG.trace("GPU : returned GPUContext (" + gCtx + ") to freePool. New sizes 
- FreePool[" + freePool.size() + "]");
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index c116475..3a1fafa 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -18,206 +18,799 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
-import jcuda.Pointer;
+import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
+import static jcuda.jcusparse.JCusparse.cusparseDnnz;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.CacheException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlockCOO;
+import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
+import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
 import org.apache.sysml.utils.GPUStatistics;
-import org.apache.sysml.utils.LRUCacheMap;
 
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.LinkedList;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
+import jcuda.Pointer;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseDirection;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+
+/**
+ * Handle to a matrix block on the GPU
+ */
+public class GPUObject {
+
+       private static final Log LOG = 
LogFactory.getLog(GPUObject.class.getName());
+
+       /** GPUContext that owns this GPUObject */
+       private final GPUContext gpuContext;
+
+       /** Pointer to the underlying dense matrix block on GPU */
+       private Pointer jcudaDenseMatrixPtr = null;
+
+    /** Pointer to the underlying sparse matrix block on GPU */
+       private CSRPointer jcudaSparseMatrixPtr = null;
 
-//FIXME merge JCudaObject into GPUObject to avoid unnecessary complexity
-public abstract class GPUObject 
-{
-       public enum EvictionPolicy {
-        LRU, LFU, MIN_EVICT
-    }
-       public static final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
-       protected boolean isDeviceCopyModified = false;
-       protected AtomicInteger numLocks = new AtomicInteger(0);
+       /** An optional tensor descriptor (and shape) that can be set by a 
tensor instruction such as convolution,
+        * maxpooling and exploited by a subsequent non-tensor instruction such 
as relu
+        */
+       private cudnnTensorDescriptor tensorDescriptor = null;
+
+       /** the shape of this tensor, if in fact this is a tensor */
+       private int [] tensorShape = null;
+
+       /** whether the block attached to this {@link GPUContext} is dirty on 
the device and needs to be copied back to host */
+       protected boolean dirty = false;
+
+       /** number of read locks on this object */
+       protected AtomicInteger readLocks = new AtomicInteger(0);
+
+       /** Timestamp, needed by {@link GPUContext#evict(long)} */
        AtomicLong timestamp = new AtomicLong(0);
-       
-       protected boolean isInSparseFormat = false;
+
+       /** Whether this block is in sparse format */
+       protected boolean isSparse = false;
+
+       /** Enclosing {@link MatrixObject} instance */
        protected MatrixObject mat = null;
-       
-       protected GPUObject(MatrixObject mat2)  {
-               this.mat = mat2;
+
	/** Allocates {@code size} bytes of device memory via the owning GPUContext, attributing the allocation to instruction {@code instName} for fine-grained statistics. */
	private Pointer allocate(String instName, long size) throws DMLRuntimeException {
		return getGPUContext().allocate(instName, size);
	}
+
	/** Allocates {@code size} bytes of device memory via the owning GPUContext (no per-instruction attribution). */
	private Pointer allocate(long size) throws DMLRuntimeException {
		return getGPUContext().allocate(size);
	}
+
	/** Frees (or defers freeing of) {@code toFree} via the owning GPUContext. */
	private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
		getGPUContext().cudaFreeHelper(toFree);
	}
+
	/** Frees {@code toFree} via the owning GPUContext; {@code eager} requests an immediate cudaFree rather than lazy reuse. */
	private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
		getGPUContext().cudaFreeHelper(toFree, eager);
	}
+
	/** Frees {@code toFree} via the owning GPUContext, attributing the free to {@code instName}; {@code eager} requests an immediate cudaFree. */
	private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
		getGPUContext().cudaFreeHelper(instName, toFree, eager);
	}
-       
-       public boolean isInSparseFormat() {
-               return isInSparseFormat;
+
	/** @return the {@link GPUContext} that owns this GPUObject (set once in the constructor) */
	private GPUContext getGPUContext() throws DMLRuntimeException {
		return gpuContext;
	}
-       
-       public abstract boolean isAllocated();
 
        /**
-        * Signal intent that a matrix block will be read (as input) on the GPU
-        * @return      true if a host memory to device memory transfer happened
-        * @throws DMLRuntimeException ?
+        * Transposes a dense matrix on the GPU by calling the cublasDgeam 
operation
+        * @param gCtx   a valid {@link GPUContext}
+        * @param densePtr      Pointer to dense matrix on the GPU
+        * @param m                     rows in ouput matrix
+        * @param n                     columns in output matrix
+        * @param lda           rows in input matrix
+        * @param ldc           columns in output matrix
+        * @return                      transposed matrix
+        * @throws DMLRuntimeException if operation failed
         */
-       public abstract boolean acquireDeviceRead() throws DMLRuntimeException;
+       public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int 
m, int n, int lda, int ldc) throws DMLRuntimeException {
+               LOG.trace("GPU : transpose of block of size [" + m + "," + n + 
"]" + ", GPUContext=" + gCtx);
+               Pointer alpha = Pointer.to(new double[]{1.0});
+               Pointer beta = Pointer.to(new double[]{0.0});
+               Pointer A = densePtr;
+               Pointer C = gCtx.allocate(((long)m)*getDoubleSizeOf(n));
+
+               // Transpose the matrix to get a dense matrix
+               JCublas2.cublasDgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, 
CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), lda, C, ldc);
+               return C;
+       }
+
        /**
-        * To signal intent that a matrix block will be written to on the GPU
-        * @return      true if memory was allocated on the GPU as a result of 
this call
+        * Convenience method to convert a CSR matrix to a dense matrix on the 
GPU
+        * Since the allocated matrix is temporary, bookkeeping is not updated.
+        * Also note that the input dense matrix is expected to be in COLUMN 
MAJOR FORMAT
+        * Caller is responsible for deallocating memory on GPU.
+        * @param gCtx   a valid {@link GPUContext}
+        * @param cusparseHandle handle to cusparse library
+        * @param densePtr [in] dense matrix pointer on the GPU in row major
+        * @param rows number of rows
+        * @param cols number of columns
+        * @return CSR (compressed sparse row) pointer
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public abstract boolean acquireDeviceModifyDense() throws 
DMLRuntimeException;
+       public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext 
gCtx, cusparseHandle cusparseHandle, Pointer densePtr, int rows, int cols) 
throws DMLRuntimeException {
+               cusparseMatDescr matDescr = 
CSRPointer.getDefaultCuSparseMatrixDescriptor();
+               Pointer nnzPerRowPtr = null;
+               Pointer nnzTotalDevHostPtr = null;
+
+               gCtx.ensureFreeSpace(getIntSizeOf(rows + 1));
+               nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows));
+               nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1));
+
+               // Output is in dense vector format, convert it to CSR
+               cusparseDnnz(cusparseHandle, 
cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, 
nnzPerRowPtr, nnzTotalDevHostPtr);
+               //cudaDeviceSynchronize();
+               int[] nnzC = {-1};
+
+               long t2=0;
+               if (DMLScript.STATISTICS) t2 = System.nanoTime();
+               cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, 
getIntSizeOf(1), cudaMemcpyDeviceToHost);
+               if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
+               if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevCount.addAndGet(1);
+
+               if (nnzC[0] == -1){
+                       throw new DMLRuntimeException("cusparseDnnz did not 
calculate the correct number of nnz from the sparse-matrix vector mulitply on 
the GPU");
+               }
+
+               LOG.trace("GPU : col-major dense size[" + rows + "," + cols + 
"] to row-major sparse of with nnz = " + nnzC[0] + ", GPUContext=" + gCtx);
+
+               CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows);
+               cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, 
densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, C.colInd);
+               //cudaDeviceSynchronize();
+
+               gCtx.cudaFreeHelper(nnzPerRowPtr);
+               gCtx.cudaFreeHelper(nnzTotalDevHostPtr);
+
+               return C;
+       }
+
+       /**
+        * Gets the double array from GPU memory onto host memory and returns 
string.
+        * @param A Pointer to memory on device (GPU), assumed to point to a 
double array
+        * @param rows rows in matrix A
+        * @param cols columns in matrix A
+        * @return the debug string
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       @SuppressWarnings("unused")
+       public static String debugString(Pointer A, long rows, long cols) 
throws DMLRuntimeException {
+               StringBuffer sb = new StringBuffer();
+               int len = toIntExact(rows * cols);
+               double[] tmp = new double[len];
+               cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), 
cudaMemcpyDeviceToHost);
+               int k = 0;
+               for (int i=0; i<rows; i++){
+                       for (int j=0; j<cols; j++){
+                               sb.append(tmp[k]).append(' ');
+                               k++;
+                       }
+                       sb.append('\n');
+               }
+               return sb.toString();
+       }
+
        /**
-        * To signal intent that a sparse matrix block will be written to on 
the GPU
-        * @return      true if memory was allocated on the GPU as a result of 
this call
+        * Convenience method to directly examine the Sparse matrix on GPU
+        * @return CSR (compressed sparse row) pointer
+        */
+       public CSRPointer getSparseMatrixCudaPointer() {
+               return getJcudaSparseMatrixPtr();
+       }
+
	/**
	 * Convenience method to directly set the sparse matrix on GPU.
	 * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
	 * Needed for operations such as cusparseDcsrgemm that write directly into a CSR pointer.
	 * Any previously set dense pointer is freed, since the two representations are mutually exclusive.
	 * @param sparseMatrixPtr CSR (compressed sparse row) pointer
	 * @throws DMLRuntimeException if freeing the stale dense pointer fails
	 */
	public void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) throws DMLRuntimeException {
		this.jcudaSparseMatrixPtr = sparseMatrixPtr;
		this.isSparse = true;
		if(getJcudaDenseMatrixPtr() != null) {
			// dense and sparse pointers are mutually exclusive: release the dense copy
			cudaFreeHelper(getJcudaDenseMatrixPtr());
			jcudaDenseMatrixPtr = null;
		}
	}
+
	/**
	 * Convenience method to directly set the dense matrix pointer on GPU.
	 * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
	 * Any previously set sparse pointer is deallocated, since the two representations are mutually exclusive.
	 * @param densePtr dense pointer
	 * @throws DMLRuntimeException if deallocating the stale sparse pointer fails
	 */
	public void setDenseMatrixCudaPointer(Pointer densePtr) throws DMLRuntimeException{
		this.jcudaDenseMatrixPtr = densePtr;
		this.isSparse = false;
		if(getJcudaSparseMatrixPtr() != null) {
			// release the stale sparse representation
			getJcudaSparseMatrixPtr().deallocate();
			jcudaSparseMatrixPtr = null;
		}
	}
+
+       /**
+        * Converts this GPUObject from dense to sparse format.
+        *
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public abstract boolean acquireDeviceModifySparse() throws 
DMLRuntimeException;
-       
+       public void denseToSparse() throws DMLRuntimeException {
+               LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" 
+ getGPUContext());
+               long t0=0;
+               if (DMLScript.STATISTICS) t0 = System.nanoTime();
+               cusparseHandle cusparseHandle = 
getGPUContext().getCusparseHandle();
+               if(cusparseHandle == null)
+                       throw new DMLRuntimeException("Expected cusparse to be 
initialized");
+               int rows = toIntExact(mat.getNumRows());
+               int cols = toIntExact(mat.getNumColumns());
+
+               if(getJcudaDenseMatrixPtr() == null || !isAllocated())
+                       throw new DMLRuntimeException("Expected allocated dense 
matrix before denseToSparse() call");
+
+               convertDensePtrFromRowMajorToColumnMajor();
+               
setSparseMatrixCudaPointer(columnMajorDenseToRowMajorSparse(getGPUContext(), 
cusparseHandle, getJcudaDenseMatrixPtr(), rows, cols));
+               // TODO: What if mat.getNnz() is -1 ?
+               if (DMLScript.STATISTICS) 
GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
+               if (DMLScript.STATISTICS) 
GPUStatistics.cudaDenseToSparseCount.addAndGet(1);
+       }
+
        /**
-        * If memory on GPU has been allocated from elsewhere, this method 
-        * updates the internal bookkeeping
-        * @param numBytes number of bytes
+        * Convenience method. Converts Row Major Dense Matrix --> Column Major 
Dense Matrix
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public abstract void setDeviceModify(long numBytes);
+       private void convertDensePtrFromRowMajorToColumnMajor() throws 
DMLRuntimeException {
+               LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + 
", GPUContext=" + getGPUContext());
+               int m = toIntExact(mat.getNumRows());
+               int n = toIntExact(mat.getNumColumns());
+               int lda = n;
+               int ldc = m;
+               if(!isAllocated()) {
+                       throw new DMLRuntimeException("Error in converting row 
major to column major : data is not allocated");
+               }
+
+               Pointer tmp = transpose(getGPUContext(), 
getJcudaDenseMatrixPtr(), m, n, lda, ldc);
+               cudaFreeHelper(getJcudaDenseMatrixPtr());
+               setDenseMatrixCudaPointer(tmp);
+       }
+
+       private void convertDensePtrFromColMajorToRowMajor() throws 
DMLRuntimeException {
+               LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + 
", GPUContext=" + getGPUContext());
+
+               int n = toIntExact(mat.getNumRows());
+               int m = toIntExact(mat.getNumColumns());
+               int lda = n;
+               int ldc = m;
+               if(!isAllocated()) {
+                       throw new DMLRuntimeException("Error in converting 
column major to row major : data is not allocated");
+               }
+
+               Pointer tmp = transpose(getGPUContext(), 
getJcudaDenseMatrixPtr(), m, n, lda, ldc);
+               cudaFreeHelper(getJcudaDenseMatrixPtr());
+               setDenseMatrixCudaPointer(tmp);
+       }
 
        /**
-        * Signal intent that a block needs to be read on the host
-        * @return true if copied from device to host
-        * @throws CacheException ?
+        * Convert sparse to dense (Performs transpose, use 
sparseToColumnMajorDense if the kernel can deal with column major format)
+        *
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public abstract boolean acquireHostRead() throws CacheException;
-
-       public abstract void releaseInput() throws CacheException;
-       public abstract void releaseOutput() throws CacheException;
-       
-       // package-level visibility as these methods are guarded by underlying 
GPUContext
-
-       abstract void allocateDenseMatrixOnDevice() throws DMLRuntimeException;
-       abstract void allocateSparseMatrixOnDevice() throws DMLRuntimeException;
-       abstract void deallocateMemoryOnDevice(boolean eager) throws 
DMLRuntimeException;
-       abstract long getSizeOnDevice() throws DMLRuntimeException;
-       
-       abstract void copyFromHostToDevice() throws DMLRuntimeException;
-       
+       public void sparseToDense() throws DMLRuntimeException {
+               sparseToDense(null);
+       }
+
        /**
-        * Copies a matrix block (dense or sparse) from GPU Memory to Host 
memory.
-        * A {@link MatrixBlock} instance is allocated, data from the GPU is 
copied in,
-        * the current one in Host memory is deallocated by calling 
MatrixObject's acquireHostModify(MatrixBlock) (??? does not exist)
-        * and overwritten with the newly allocated instance.
-        * TODO : re-examine this to avoid spurious allocations of memory for 
optimizations
+        * Convert sparse to dense (Performs transpose, use 
sparseToColumnMajorDense if the kernel can deal with column major format)
+        * Also records per instruction invokation of sparseToDense.
+        * @param instructionName       Name of the instruction for which 
statistics are recorded in {@link GPUStatistics}
+        * @throws DMLRuntimeException ?
+        */
+       public void sparseToDense(String instructionName) throws 
DMLRuntimeException {
+               LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" 
+ getGPUContext());
+               long start=0, end=0;
+               if (DMLScript.STATISTICS) start = System.nanoTime();
+               if(getJcudaSparseMatrixPtr() == null || !isAllocated())
+                       throw new DMLRuntimeException("Expected allocated 
sparse matrix before sparseToDense() call");
+
+               sparseToColumnMajorDense();
+               convertDensePtrFromColMajorToRowMajor();
+               if (DMLScript.STATISTICS) end = System.nanoTime();
+               if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start);
+               if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start);
+               if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseToDenseCount.addAndGet(1);
+       }
+
+       /**
+        * More efficient method to convert sparse to dense but returns dense 
in column major format
+        *
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       abstract void copyFromDeviceToHost() throws DMLRuntimeException; // 
Called by export()
+       public void sparseToColumnMajorDense() throws DMLRuntimeException {
+               LOG.trace("GPU : sparse -> col-major dense on " + this + ", 
GPUContext=" + getGPUContext());
+               if(getJcudaSparseMatrixPtr() == null || !isAllocated())
+                       throw new DMLRuntimeException("Expected allocated 
sparse matrix before sparseToDense() call");
+
+               cusparseHandle cusparseHandle = 
getGPUContext().getCusparseHandle();
+               if(cusparseHandle == null)
+                       throw new DMLRuntimeException("Expected cusparse to be 
initialized");
+               int rows = toIntExact(mat.getNumRows());
+               int cols = toIntExact(mat.getNumColumns());
+               
setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle,
 null, rows, cols));
+       }
+
	/**
	 * Initializes this GPUObject with a {@link MatrixObject} instance which will contain metadata about the enclosing matrix block
	 * @param gCtx the {@link GPUContext} that owns this GPUObject and services its allocations
	 * @param mat2 the matrix object that owns this {@link GPUObject}
	 */
	GPUObject(GPUContext gCtx, MatrixObject mat2)  {
		gpuContext = gCtx;
		this.mat = mat2;
	}
 
	/** @return true if this block is currently held in sparse (CSR) format on the device */
	public boolean isSparse() {
		return isSparse;
	}
 
        /**
-        * Convenience wrapper over {@link GPUObject#evict(String, long)}
-        * @param GPUSize Desired size to be freed up on the GPU
-        * @throws DMLRuntimeException If no blocks to free up or if not enough 
blocks with zero locks on them.
+        * Returns a previously allocated tensor shape or null
+        * @return int array of four elements or null
         */
-       protected static void evict(final long GPUSize) throws 
DMLRuntimeException {
-               evict(null, GPUSize);
+       public int [] getTensorShape() {
+               return tensorShape;
        }
 
        /**
-        * Cycles through the sorted list of allocated {@link GPUObject} 
instances. Sorting is based on
-        * number of (read) locks that have been obtained on it (reverse 
order). It repeatedly frees up 
-        * blocks on which there are zero locks until the required size has 
been freed up.
-        * // TODO: update it with hybrid policy
-        * @param instructionName name of the instruction for which performance 
measurements are made
-        * @param GPUSize Desired size to be freed up on the GPU
-        * @throws DMLRuntimeException If no blocks to free up or if not enough 
blocks with zero locks on them.  
+        * Returns a previously allocated tensor descriptor or null
+        * @return cudnn tensor descriptor
         */
-       protected static void evict(String instructionName, final long GPUSize) 
throws DMLRuntimeException {
-               synchronized (JCudaContext.syncObj) {
-
-                       GPUStatistics.cudaEvictionCount.addAndGet(1);
-                       // Release the set of free blocks maintained in a 
JCudaObject.freeCUDASpaceMap
-                       // to free up space
-                       LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = 
JCudaObject.freeCUDASpaceMap;
-                       while (lruCacheMap.size() > 0) {
-                               if (GPUSize <= getAvailableMemory())
-                                       break;
-                               Map.Entry<Long, LinkedList<Pointer>> 
toFreeListPair = lruCacheMap.removeAndGetLRUEntry();
-                               LinkedList<Pointer> toFreeList = 
toFreeListPair.getValue();
-                               Long size = toFreeListPair.getKey();
-                               Pointer toFree = toFreeList.pop();
-                               if (toFreeList.isEmpty())
-                                       lruCacheMap.remove(size);
-                               JCudaObject.cudaFreeHelper(instructionName, 
toFree, true);
+       public cudnnTensorDescriptor getTensorDescriptor() {
+               return tensorDescriptor;
+       }
+
	/**
	 * Returns a previously allocated or allocates and returns a tensor descriptor
	 * NOTE(review): if a descriptor already exists, the N/C/H/W arguments are silently
	 * ignored and the old shape is kept — confirm callers never request a different
	 * shape for the same object.
	 * @param N number of images
	 * @param C number of channels
	 * @param H height
	 * @param W width
	 * @return cudnn tensor descriptor
	 */
	public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
		LOG.trace("GPU : allocateTensorDescriptor with [N="+N+",C="+C+",H="+H+",W="+W+"] on " + this);
		if(tensorDescriptor == null) {
			// lazily create the descriptor and remember the shape it was created with
			tensorDescriptor = new cudnnTensorDescriptor();
			cudnnCreateTensorDescriptor(tensorDescriptor);
			cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
			tensorShape = new int[4];
			tensorShape[0] = N;
			tensorShape[1] = C;
			tensorShape[2] = H;
			tensorShape[3] = W;
		}
		return tensorDescriptor;
	}
+
	/** @param numElems number of double elements
	 *  @return size in bytes of that many doubles */
	private static long getDoubleSizeOf(long numElems) {
		return numElems * ((long)jcuda.Sizeof.DOUBLE);
	}
+
	/** @param numElems number of int elements
	 *  @return size in bytes of that many ints */
	private static long getIntSizeOf(long numElems) {
		return numElems * ((long)jcuda.Sizeof.INT);
	}
+
+       public boolean isAllocated() {
+               boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || 
getJcudaSparseMatrixPtr() != null);
+               return eitherAllocated;
+       }
+
+       public boolean isInputAllocated() {
+               try {
+                       boolean eitherAllocated = (getJcudaDenseMatrixPtr() != 
null || getJcudaSparseMatrixPtr() != null);
+                       boolean isAllocatedOnThisGPUContext = 
getGPUContext().isBlockRecorded(this);
+                       if (eitherAllocated && !isAllocatedOnThisGPUContext) {
+                               LOG.warn("GPU : A block was allocated but was 
not on this GPUContext, GPUContext=" + getGPUContext());
                        }
+                       return eitherAllocated && isAllocatedOnThisGPUContext;
+               } catch (DMLRuntimeException e){
+                       LOG.info("GPU : System is in an inconsistent state");
+                       throw new RuntimeException(e);
+               }
+       }
+
	/**
	 * Allocates a sparse and empty {@link GPUObject}
	 * This is the result of operations that are both non zero matrices.
	 * A zero-nnz CSR pointer is allocated and a read lock is taken on this object.
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
	public void allocateSparseAndEmpty() throws DMLRuntimeException{
		LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext());
		setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows()));
		addReadLock();
	}
+
	/**
	 * Allocates a dense matrix of size obtained from the attached matrix metadata
	 * and fills it up with a single value
	 * @param v value to fill up the dense matrix
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
	public void allocateAndFillDense(double v) throws DMLRuntimeException {
		LOG.trace("GPU : allocate and fill dense with value " + v + " on " + this + ", GPUContext=" + getGPUContext());
		long rows = mat.getNumRows();
		long cols = mat.getNumColumns();
		// rows*cols must fit in an int because the fill kernel indexes with an int count
		int numElems = toIntExact(rows * cols);
		long size = getDoubleSizeOf(numElems);
		setDenseMatrixCudaPointer(allocate(size));
		addReadLock();
		// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
		getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems);
	}
+
+       /**
+        * If this {@link GPUObject} is sparse and empty
+        * Being allocated is a prerequisite to being sparse and empty.
+        *
+        * @return true if sparse and empty
+        * @throws DMLRuntimeException if error
+        */
+       public boolean isSparseAndEmpty() throws DMLRuntimeException{
+               boolean isSparseAndAllocated = isAllocated()&& 
LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat);
+               boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && 
getJcudaSparseMatrixPtr().nnz == 0;
+               return isEmptyAndSparseAndAllocated;
+       }
+
	/**
	 * Signals intent that a matrix block will be read (as input) on the GPU;
	 * copies the data from host if it is not already allocated on the device.
	 * NOTE(review): the read lock is only taken explicitly on the already-allocated
	 * branch; presumably copyFromHostToDevice() acquires it internally — confirm.
	 * @return true if a host-to-device transfer happened
	 * @throws DMLRuntimeException if device data could not be allocated
	 */
	public boolean acquireDeviceRead() throws DMLRuntimeException {
		LOG.trace("GPU : acquireDeviceRead on " + this);
		boolean transferred = false;
		if(!isAllocated()) {
			LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext());
			copyFromHostToDevice();
			transferred = true;
		} else {
			addReadLock();
		}
		// sanity check: one of the two branches above must have left the block allocated
		if(!isAllocated())
			throw new DMLRuntimeException("Expected device data to be allocated");
		return transferred;
	}
+
	/**
	 * Signals intent that a dense matrix block will be written to on the GPU.
	 * Allocates a dense block on the device if needed and marks this object dirty.
	 * @return true if memory was allocated on the GPU as a result of this call
	 * @throws DMLRuntimeException if device data could not be allocated
	 */
	public boolean acquireDeviceModifyDense() throws DMLRuntimeException {
		LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext());
		boolean allocated = false;
		if(!isAllocated()) {
			// the device copy will diverge from the host copy, so flag the MatrixObject
			mat.setDirty(true);
			LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this);
			// Dense block, size = numRows * numCols
			allocateDenseMatrixOnDevice();
			allocated = true;
			getGPUContext().recordBlockUsage(this);
		}
		// device-side data is about to be modified
		dirty = true;
		if(!isAllocated())
			throw new DMLRuntimeException("Expected device data to be allocated");
		return allocated;
	}
 
-                       if (GPUSize <= getAvailableMemory())
-                               return;
	/**
	 * Signals intent that a sparse matrix block will be written to on the GPU.
	 * Allocates a sparse block on the device if needed and marks this object dirty.
	 * @return true if memory was allocated on the GPU as a result of this call
	 * @throws DMLRuntimeException if device data could not be allocated
	 */
	public boolean acquireDeviceModifySparse() throws DMLRuntimeException {
		LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
		boolean allocated = false;
		isSparse = true;
		if(!isAllocated()) {
			LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
			// the device copy will diverge from the host copy, so flag the MatrixObject
			mat.setDirty(true);
			allocateSparseMatrixOnDevice();
			allocated = true;
			getGPUContext().recordBlockUsage(this);

		}
		// device-side data is about to be modified
		dirty = true;
		if(!isAllocated())
			throw new DMLRuntimeException("Expected device data to be allocated");
		return allocated;
	}
+
	/** Increments the count of read locks held on this object. */
	public void addReadLock() {
		readLocks.addAndGet(1);
	}
 
-                       if (JCudaContext.allocatedPointers.size() == 0) {
-                               throw new DMLRuntimeException("There is not 
enough memory on device for this matrix!");
+       /**
+        * if the data is allocated on the GPU and is dirty, it is copied back 
to the host memory
+        * @return true if a copy to host happened, false otherwise
+        * @throws CacheException
+        */
+       public boolean acquireHostRead() throws CacheException {
+               boolean copied = false;
+               try {
+                       LOG.trace("GPU : acquireDeviceModifySparse on " + this 
+ ", GPUContext=" + getGPUContext());
+                       if (isAllocated() && dirty) {
+                               LOG.trace("GPU : data is dirty on device, 
copying to host, on " + this + ", GPUContext=" + getGPUContext());
+                               copyFromDeviceToHost();
+                               copied = true;
                        }
+               } catch (DMLRuntimeException e) {
+                       throw new CacheException(e);
+               }
+               return copied;
+       }
+
+       /**
+        * Updates the locks depending on the eviction policy selected
+        * @throws DMLRuntimeException if there is no locked GPU Object or if 
could not obtain a {@link GPUContext}
+        */
+       private void updateReleaseLocks() throws DMLRuntimeException {
+               if (readLocks.addAndGet(-1) < 0) {
+                       throw new CacheException("Redundant release of GPU 
object");
+               }
+               LOG.trace("GPU : updateReleaseLocks, new number of read locks 
is " + readLocks.get() + ", on " + this + ", GPUContext=" + getGPUContext());
+               GPUContext.EvictionPolicy evictionPolicy = 
getGPUContext().evictionPolicy;
+               switch (evictionPolicy){
+                       case LRU : timestamp.set(System.nanoTime()); break;
+                       case LFU : timestamp.addAndGet(1); break;
+                       case MIN_EVICT : /* Do Nothing */ break;
+                       default : throw new CacheException("The eviction policy 
is not supported:" + evictionPolicy.name());
+               }
+       }
+
+       /**
+        * Releases input allocated on GPU
+        * @throws DMLRuntimeException if data is not allocated or if there is 
no locked GPU Object or if could not obtain a {@link GPUContext}
+        */
+       public void releaseInput() throws DMLRuntimeException {
+               updateReleaseLocks();
+               if(!isAllocated())
+                       throw new CacheException("Attempting to release an 
input before allocating it");
+       }
+
+       /**
+        * releases output allocated on GPU
+        * @throws DMLRuntimeException if data is not allocated or if there is 
no locked GPU Object or if could not obtain a {@link GPUContext}
+        */
+       public void releaseOutput() throws DMLRuntimeException {
+               updateReleaseLocks();
+               dirty = true;
+               if(!isAllocated())
+                       throw new CacheException("Attempting to release an 
output before allocating it");
+       }
 
-                       synchronized (evictionLock) {
-                               
Collections.sort(JCudaContext.allocatedPointers, new Comparator<GPUObject>() {
-
-                                       @Override
-                                       public int compare(GPUObject p1, 
GPUObject p2) {
-                                               long p1Val = p1.numLocks.get();
-                                               long p2Val = p2.numLocks.get();
-
-                                               if (p1Val > 0 && p2Val > 0) {
-                                                       // Both are locked, so 
don't sort
-                                                       return 0;
-                                               } else if (p1Val > 0 || p2Val > 
0) {
-                                                       // Put the unlocked one 
to RHS
-                                                       return 
Long.compare(p2Val, p1Val);
-                                               } else {
-                                                       // Both are unlocked
-
-                                                       if (evictionPolicy == 
EvictionPolicy.MIN_EVICT) {
-                                                               long p1Size = 0;
-                                                               long p2Size = 0;
-                                                               try {
-                                                                       p1Size 
= p1.getSizeOnDevice() - GPUSize;
-                                                                       p2Size 
= p2.getSizeOnDevice() - GPUSize;
-                                                               } catch 
(DMLRuntimeException e) {
-                                                                       throw 
new RuntimeException(e);
-                                                               }
-
-                                                               if (p1Size >= 0 
&& p2Size >= 0) {
-                                                                       return 
Long.compare(p2Size, p1Size);
-                                                               } else {
-                                                                       return 
Long.compare(p1Size, p2Size);
-                                                               }
-                                                       } else if 
(evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
-                                                               return 
Long.compare(p2.timestamp.get(), p1.timestamp.get());
-                                                       } else {
-                                                               throw new 
RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
-                                                       }
-                                               }
-                                       }
-                               });
-
-                               while (GPUSize > getAvailableMemory() && 
JCudaContext.allocatedPointers.size() > 0) {
-                                       GPUObject toBeRemoved = 
JCudaContext.allocatedPointers.get(JCudaContext.allocatedPointers.size() - 1);
-                                       if (toBeRemoved.numLocks.get() > 0) {
-                                               throw new 
DMLRuntimeException("There is not enough memory on device for this matrix!");
-                                       }
-                                       if (toBeRemoved.isDeviceCopyModified) {
-                                               
toBeRemoved.copyFromDeviceToHost();
-                                       }
-
-                                       toBeRemoved.clearData(true);
+       void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
+               LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", 
GPUContext=" + getGPUContext());
+               assert !isAllocated() : "Internal error - trying to allocated 
dense matrix to a GPUObject that is already allocated";
+               long rows = mat.getNumRows();
+               long cols = mat.getNumColumns();
+               assert rows > 0 : "Internal error - invalid number of rows when 
allocating dense matrix";
+               assert cols > 0 : "Internal error - invalid number of columns 
when allocating dense matrix;";
+               long size = getDoubleSizeOf(rows * cols);
+               Pointer tmp = allocate(size);
+               setDenseMatrixCudaPointer(tmp);
+               addReadLock();
+       }
+
+       void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
+               LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", 
GPUContext=" + getGPUContext());
+               assert !isAllocated() : "Internal error = trying to allocated 
sparse matrix to a GPUObject that is already allocated";
+               long rows = mat.getNumRows();
+               long nnz = mat.getNnz();
+               assert rows > 0 : "Internal error - invalid number of rows when 
allocating a sparse matrix";
+               assert nnz > 0 : "Internal error - invalid number of non zeroes 
when allocating a sparse matrix";
+               CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, 
rows);
+               setSparseMatrixCudaPointer(tmp);
+               addReadLock();
+       }
+
+       void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException 
{
+               LOG.trace("GPU : deallocateMemoryOnDevice, on " + this + ", 
GPUContext=" + getGPUContext());
+               if(getJcudaDenseMatrixPtr() != null) {
+                       cudaFreeHelper(null, getJcudaDenseMatrixPtr(), eager);
+               }
+               if (getJcudaSparseMatrixPtr() != null) {
+                       getJcudaSparseMatrixPtr().deallocate(eager);
+               }
+               jcudaDenseMatrixPtr = null;
+               jcudaSparseMatrixPtr = null;
+               if(tensorDescriptor != null) {
+                       cudnnDestroyTensorDescriptor(tensorDescriptor);
+                       tensorDescriptor = null;
+               }
+               readLocks.set(0);
+       }
+
+       protected long getSizeOnDevice() throws DMLRuntimeException {
+               long GPUSize = 0;
+               long rlen = mat.getNumRows();
+               long clen = mat.getNumColumns();
+               long nnz = mat.getNnz();
+
+               if(LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
+                       GPUSize = CSRPointer.estimateSize(nnz, rlen);
+               }
+               else {
+                       GPUSize = getDoubleSizeOf(rlen * clen);
+               }
+               return GPUSize;
+       }
+
	/**
	 * Copies the host {@link MatrixBlock} of the backing MatrixObject to the device,
	 * allocating a dense or sparse device block to match the host representation.
	 * Sparse host blocks are first converted to CSR (the format cuSparse prefers);
	 * empty sparse blocks are allocated but not copied. Records timing/count
	 * statistics when DMLScript.STATISTICS is enabled.
	 * @throws DMLRuntimeException if the host block is in an inconsistent state or the copy fails
	 */
	void copyFromHostToDevice() throws DMLRuntimeException {
		LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext());
		long start=0;
		if (DMLScript.STATISTICS) start = System.nanoTime();

		// Pin the host block for reading for the duration of the copy (released below).
		MatrixBlock tmp = mat.acquireRead();
		if(tmp.isInSparseFormat()) {

			int rowPtr[] = null;
			int colInd[] = null;
			double[] values = null;

			// Refresh nnz before sizing the device CSR buffers.
			tmp.recomputeNonZeros();
			long nnz = tmp.getNonZeros();
			mat.getMatrixCharacteristics().setNonZeros(nnz);

			SparseBlock block = tmp.getSparseBlock();
			boolean copyToDevice = true;
			if(block == null && tmp.getNonZeros() == 0) {
//				// Allocate empty block --> not necessary
//				// To reproduce this, see org.apache.sysml.test.integration.applications.dml.ID3DMLTest
//				rowPtr = new int[0];
//				colInd = new int[0];
//				values = new double[0];
				// Empty sparse block: allocate on device but skip the host-to-device copy.
				copyToDevice = false;
			}
			else if(block == null && tmp.getNonZeros() != 0) {
				// nnz > 0 with no sparse block is an inconsistent host state.
				throw new DMLRuntimeException("Expected CP sparse block to be not null.");
			}
			else {
				// CSR is the preferred format for cuSparse GEMM
				// Converts MCSR and COO to CSR
				SparseBlockCSR csrBlock = null;
				long t0=0;
				if (block instanceof SparseBlockCSR){
					csrBlock = (SparseBlockCSR)block;
				} else if (block instanceof SparseBlockCOO) {
					// TODO - should we do this on the GPU using cusparse<t>coo2csr() ?
					if (DMLScript.STATISTICS) t0 = System.nanoTime();
					SparseBlockCOO cooBlock = (SparseBlockCOO)block;
					csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet();
				} else if (block instanceof SparseBlockMCSR) {
					if (DMLScript.STATISTICS) t0 = System.nanoTime();
					SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block;
					csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
					if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet();
				} else {
					throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
				}
				rowPtr = csrBlock.rowPointers();
				colInd = csrBlock.indexes();
				values = csrBlock.values();
			}
			allocateSparseMatrixOnDevice();
			getGPUContext().recordBlockUsage(this);

			if(copyToDevice) {
				CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
			}
		}
		else {
			double[] data = tmp.getDenseBlock();

			if( data == null && tmp.getSparseBlock() != null )
				throw new DMLRuntimeException("Incorrect sparsity calculation");
			else if( data==null && tmp.getNonZeros() != 0 )
				throw new DMLRuntimeException("MatrixBlock is not allocated");
			else if( tmp.getNonZeros() == 0 )
				// All-zero dense block: synthesize a zero-filled host buffer to copy.
				data = new double[tmp.getNumRows()*tmp.getNumColumns()];

			// Copy dense block
			allocateDenseMatrixOnDevice();
			getGPUContext().recordBlockUsage(this);

			cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice);
		}

		// Release the read pin taken by acquireRead() above.
		mat.release();

		if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
		if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(1);
	}
+
+       public static int toIntExact(long l) throws DMLRuntimeException {
+               if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
+                       throw new DMLRuntimeException("Cannot be cast to int:" 
+ l);
+               }
+               return (int) l;
        }
 
	/**
	 * Copies the device data (dense or sparse CSR) back into the host
	 * {@link MatrixObject} as a new MatrixBlock and clears the dirty flag.
	 * Exactly one of the dense/sparse device pointers must be allocated.
	 * Records timing/count statistics when DMLScript.STATISTICS is enabled.
	 * @throws DMLRuntimeException if both or neither device pointer is allocated,
	 *         or if the host/device sparsity states disagree
	 */
	protected void copyFromDeviceToHost() throws DMLRuntimeException {
		LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext());
		// Holding both representations at once is an invalid state.
		if (getJcudaDenseMatrixPtr() != null && getJcudaSparseMatrixPtr() != null){
			throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated");
		}

		if(getJcudaDenseMatrixPtr() != null) {
			// Dense path: bulk-copy the device buffer into a fresh dense MatrixBlock.
			long start=0;
			if (DMLScript.STATISTICS) start = System.nanoTime();
			MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
			tmp.allocateDenseBlock();
			double [] data = tmp.getDenseBlock();

			cudaMemcpy(Pointer.to(data), getJcudaDenseMatrixPtr(), getDoubleSizeOf(data.length), cudaMemcpyDeviceToHost);

			// nnz must be recomputed on the host copy before publishing it.
			tmp.recomputeNonZeros();
			mat.acquireModify(tmp);
			mat.release();

			if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
			if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
		}
		else if (getJcudaSparseMatrixPtr() != null){
			if(!LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat))
				throw new DMLRuntimeException("Block not in sparse format on host yet the device sparse matrix pointer is not null");

			if(this.isSparseAndEmpty()){
				// Empty sparse matrix: publish an empty block without any device copy.
				MatrixBlock tmp = new MatrixBlock();    // Empty Block
				mat.acquireModify(tmp);
				mat.release();
			} else {
				// Sparse path: pull the CSR arrays (row pointers, column indices,
				// values) from the device and wrap them in a SparseBlockCSR.
				long start=0;
				if (DMLScript.STATISTICS) start = System.nanoTime();

				int rows = toIntExact(mat.getNumRows());
				int cols = toIntExact(mat.getNumColumns());
				int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
				int[] rowPtr = new int[rows + 1];
				int[] colInd = new int[nnz];
				double[] values = new double[nnz];
				CSRPointer.copyToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd, values);

				SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
				MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
				mat.acquireModify(tmp);
				mat.release();
				if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
				if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1);
			}
		}
		else {
			throw new DMLRuntimeException("Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
		}
		// Host now has the latest data; the device copy is no longer dirty.
		dirty = false;
	}
+
+
	/**
	 * Lazily clears the data associated with this {@link GPUObject} instance:
	 * equivalent to {@code clearData(false)}, i.e. deallocation may be deferred.
	 * @throws DMLRuntimeException if the device memory could not be deallocated
	 */
	public void clearData() throws DMLRuntimeException {
		clearData(false);
	}
 
@@ -226,36 +819,38 @@ public abstract class GPUObject
	 * @param eager true to free the device memory synchronously (eagerly), false to defer the free
	 * @throws DMLRuntimeException if the device memory could not be deallocated
	 */
	public void clearData(boolean eager) throws DMLRuntimeException {
		// Unregister from the owning context's usage tracking before freeing,
		// so the eviction machinery never sees a freed block.
		getGPUContext().removeRecordedUsage(this);
		deallocateMemoryOnDevice(eager);

	}
+
	/**
	 * Pointer to the dense matrix data on the device.
	 * @return the device pointer, or null if no dense block is currently allocated
	 */
	public Pointer getJcudaDenseMatrixPtr() {
		return jcudaDenseMatrixPtr;
	}
+
	/**
	 * Pointer to the sparse (CSR) matrix data on the device.
	 * @return the CSR pointer, or null if no sparse block is currently allocated
	 */
	public CSRPointer getJcudaSparseMatrixPtr() {
		return jcudaSparseMatrixPtr;
	}
-       
-       static Boolean evictionLock = new Boolean(true);
-       
-       protected static long getAvailableMemory() {
-               return GPUContext.currContext.getAvailableMemory();
-       }
-       
-//     // Copying from device -> host occurs here
-//     // Called by MatrixObject's exportData
-//     public void exportData() throws CacheException {
-//             boolean isDeviceCopyModified = mat.getGPUObject() != null && 
mat.getGPUObject().isDeviceCopyModified;
-//             boolean isHostCopyUnavailable = mat.getMatrixBlock() == null || 
-//                             (mat.getMatrixBlock().getDenseBlock() == null 
&& mat.getMatrixBlock().getSparseBlock() == null);
-//             
-//             if(mat.getGPUObject() != null && (isDeviceCopyModified || 
isHostCopyUnavailable)) {
-//                     try {
-//                             mat.getGPUObject().copyFromDeviceToHost();
-//                     } catch (DMLRuntimeException e) {
-//                             throw new CacheException(e);
-//                     }
-//             }
-//     }
+
	/**
	 * Whether this block is dirty on the GPU, i.e. the device copy has been
	 * modified and not yet copied back to the host.
	 * @return true if the device copy is dirty
	 */
	public boolean isDirty() {
		return dirty;
	}
+
+       @Override
+       public String toString() {
+               final StringBuilder sb = new StringBuilder("GPUObject{");
+               sb.append(", 
tensorShape=").append(Arrays.toString(tensorShape));
+               sb.append(", dirty=").append(dirty);
+               sb.append(", readLocks=").append(readLocks);
+               sb.append(", sparse? ").append(isSparse);
+               sb.append(", 
dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
+               sb.append('}');
+               return sb.toString();
+       }
+
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
deleted file mode 100644
index bb73f4b..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.instructions.gpu.context;
-
-import jcuda.driver.JCudaDriver;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.JCudnn;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaDeviceProp;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.conf.ConfigurationManager;
-import org.apache.sysml.conf.DMLConfig;
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
-import org.apache.sysml.utils.GPUStatistics;
-
-import java.util.ArrayList;
-import java.util.concurrent.atomic.AtomicLong;
-
-import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
-import static jcuda.driver.JCudaDriver.cuInit;
-import static jcuda.jcublas.JCublas2.cublasCreate;
-import static jcuda.jcublas.JCublas2.cublasDestroy;
-import static jcuda.jcudnn.JCudnn.cudnnCreate;
-import static jcuda.jcudnn.JCudnn.cudnnDestroy;
-import static jcuda.jcusparse.JCusparse.cusparseCreate;
-import static jcuda.jcusparse.JCusparse.cusparseDestroy;
-import static jcuda.runtime.JCuda.*;
-import static jcuda.runtime.cudaError.cudaSuccess;
-
-
-public class JCudaContext extends GPUContext {
-
-       /** Synchronization object to make sure no allocations happen when 
something is being evicted from memory */
-       public static final Object syncObj = new Object();
-       private static final Log LOG = 
LogFactory.getLog(JCudaContext.class.getName());
-
-       /** Global list of allocated {@link GPUObject} instances. This list 
must be accessed in a synchronized way */
-       public static ArrayList<GPUObject> allocatedPointers = new 
ArrayList<GPUObject>();
-
-       // The minimum CUDA Compute capability needed for SystemML.
-       // After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per 
block are supported.
-       // If SystemML needs to run on an older card, this logic can be 
revisited.
-       final int MAJOR_REQUIRED = 3;
-       final int MINOR_REQUIRED = 0;
-
-       /** The total number of cuda devices on this machine */
-       public static int deviceCount = -1;
-
-       /** enable this to print debug information before code pertaining to 
the GPU is executed  */
-       public static boolean DEBUG = false;
-
-       /** total bytes available on currently active cude device, please be 
careful with its bookkeeping */
-       AtomicLong deviceMemBytes = new AtomicLong(0);
-
-       /** Stores the cached deviceProperties */
-       private static cudaDeviceProp[] deviceProperties;
-
-       // Invoke cudaMemGetInfo to get available memory information. Useful if 
GPU is shared among multiple application.
-       public double GPU_MEMORY_UTILIZATION_FACTOR = 
ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
-       // Whether to invoke cudaMemGetInfo for available memory or rely on 
internal bookkeeping for memory info.
-       public boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = 
ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.REFRESH_AVAILABLE_MEMORY_EVERY_TIME);
-       static {
-               long start = System.nanoTime();
-               JCuda.setExceptionsEnabled(true);
-               JCudnn.setExceptionsEnabled(true);
-               JCublas2.setExceptionsEnabled(true);
-               JCusparse.setExceptionsEnabled(true);
-               JCudaDriver.setExceptionsEnabled(true);
-               cuInit(0); // Initialize the driver
-
-               int deviceCountArray[] = { 0 };
-               cuDeviceGetCount(deviceCountArray);                             
// Obtain the number of devices
-               deviceCount = deviceCountArray[0];
-               deviceProperties = new cudaDeviceProp[deviceCount];
-
-               LOG.info("Total number of GPUs on the machine: " + deviceCount);
-               int maxBlocks = getMaxBlocks();
-               int maxThreadsPerBlock = getMaxThreadsPerBlock();
-               long sharedMemPerBlock = getMaxSharedMemory();
-               int[] device = {-1};
-               cudaGetDevice(device);
-               LOG.info("Active CUDA device number : " + device[0]);
-               LOG.info("Max Blocks/Threads/SharedMem : " + maxBlocks + "/" + 
maxThreadsPerBlock + "/" + sharedMemPerBlock);
-
-               GPUStatistics.cudaInitTime = System.nanoTime() - start;
-       }
-
-       @Override
-       public long getAvailableMemory() {
-               if (REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
-                       long free[] = {0};
-                       long total[] = {0};
-                       if (cudaMemGetInfo(free, total) == cudaSuccess) {
-                               //long totalNumBytes = total[0];
-                               deviceMemBytes.set(free[0]);
-                       } else {
-                               throw new RuntimeException("ERROR: Unable to 
get memory information of the GPU.");
-                       }
-               }
-               return (long) 
(deviceMemBytes.get()*GPU_MEMORY_UTILIZATION_FACTOR);
-       }
-
-       @Override
-       public void ensureComputeCapability() throws DMLRuntimeException {
-               int[] devices =  {-1};
-               cudaGetDeviceCount(devices);
-               if (devices[0] == -1){
-                       throw new DMLRuntimeException("Call to 
cudaGetDeviceCount returned 0 devices");
-               }
-               boolean isComputeCapable = true;
-               for (int i=0; i<devices[0]; i++) {
-                       cudaDeviceProp properties = getGPUProperties(i);
-                       int major = properties.major;
-                       int minor = properties.minor;
-                       if (major < MAJOR_REQUIRED) {
-                               isComputeCapable = false;
-                       } else if (major == MAJOR_REQUIRED && minor < 
MINOR_REQUIRED) {
-                               isComputeCapable = false;
-                       }
-               }
-               if (!isComputeCapable) {
-                       throw new DMLRuntimeException("One of the CUDA cards on 
the system has compute capability lower than " + MAJOR_REQUIRED + "." + 
MINOR_REQUIRED);
-               }
-       }
-
-       /**
-        * Gets the device properties for the active GPU (set with 
cudaSetDevice())
-        * @return the device properties
-        */
-       public static cudaDeviceProp getGPUProperties() {
-               int[] device = {-1};
-               cudaGetDevice(device);  // Get currently active device
-               return getGPUProperties(device[0]);
-       }
-
-       /**
-        * Gets the device properties
-        * @param device the device number (on a machine with more than 1 GPU)
-        * @return the device properties
-        */
-       public static cudaDeviceProp getGPUProperties(int device){
-               if (deviceProperties[device] == null) {
-                       cudaDeviceProp properties = new cudaDeviceProp();
-                       cudaGetDeviceProperties(properties, device);
-                       deviceProperties[device] = properties;
-               }
-               return deviceProperties[device];
-       }
-
-
-       /**
-        * Gets the maximum number of threads per block for "active" GPU
-        * @return the maximum number of threads per block
-        */
-       public static int getMaxThreadsPerBlock() {
-               cudaDeviceProp deviceProps = getGPUProperties();
-               return deviceProps.maxThreadsPerBlock;
-       }
-
-       /**
-        * Gets the maximum number of blocks supported by the active cuda device
-        * @return the maximum number of blocks supported
-        */
-       public static int getMaxBlocks() {
-               cudaDeviceProp deviceProp = getGPUProperties();
-               return deviceProp.maxGridSize[0];
-       }
-
-       /**
-        * Gets the shared memory per block supported by the active cuda device
-        * @return the shared memory per block
-        */
-       public static long getMaxSharedMemory() {
-               cudaDeviceProp deviceProp = getGPUProperties();
-               return deviceProp.sharedMemPerBlock;
-       }
-
-       /**
-        * Gets the warp size supported by the active cuda device
-        * @return the warp size
-        */
-       public static int getWarpSize() {
-               cudaDeviceProp deviceProp = getGPUProperties();
-               return deviceProp.warpSize;
-       }
-
-       /**
-        * Gets the available memory and then adds value to it
-        * @param v the value to add
-        * @return the current available memory before adding value to it
-        */
-       public long getAndAddAvailableMemory(long v){
-               return deviceMemBytes.getAndAdd(v);
-       }
-
-       public JCudaContext() throws DMLRuntimeException {
-               if(isGPUContextCreated) {
-                       // Wait until it is deleted. This case happens during 
multi-threaded testing.
-                       // This also allows for multi-threaded execute calls
-                       long startTime = System.currentTimeMillis();
-                       do {
-                               try {
-                                       Thread.sleep(100);
-                               } catch (InterruptedException e) {}
-                       } while(isGPUContextCreated && 
(System.currentTimeMillis() - startTime) < 60000);
-                       synchronized(isGPUContextCreated) {
-                               if(GPUContext.currContext != null) {
-                                       throw new RuntimeException("Cannot 
create multiple JCudaContext. Waited for 10 min to close previous GPUContext");
-                               }
-                       }
-               }
-               synchronized (isGPUContextCreated){
-                       GPUContext.currContext = this;
-               }
-
-               long free [] = { 0 };
-               long total [] = { 0 };
-               long totalNumBytes = 0;
-               if(cudaMemGetInfo(free, total) == cudaSuccess) {
-                       totalNumBytes = total[0];
-                       deviceMemBytes.set(free[0]);
-               }
-               else {
-                       throw new RuntimeException("ERROR: Unable to get memory 
information of the GPU.");
-               }
-               LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
-               LOG.info("Available GPU memory: " + 
(deviceMemBytes.get()*(1e-6)) + " MB");
-
-               long start = System.nanoTime();
-               LibMatrixCUDA.cudnnHandle = new cudnnHandle();
-               cudnnCreate(LibMatrixCUDA.cudnnHandle);
-               LibMatrixCUDA.cublasHandle = new cublasHandle();
-               cublasCreate(LibMatrixCUDA.cublasHandle);
-               // For cublas v2, cublasSetPointerMode tells Cublas whether to 
expect scalar arguments on device or on host
-               // This applies to arguments like "alpha" in Dgemm, and "y" in 
Ddot.
-               // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, 
cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
-               LibMatrixCUDA.cusparseHandle = new cusparseHandle();
-               cusparseCreate(LibMatrixCUDA.cusparseHandle);
-               try {
-                       LibMatrixCUDA.kernels = new JCudaKernels();
-               } catch (DMLRuntimeException e) {
-                       System.err.println("ERROR - Unable to initialize 
JCudaKernels. System in an inconsistent state");
-                       LibMatrixCUDA.kernels = null;
-               }
-               GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
-       }
-
-       @Override
-       public void destroy() throws DMLRuntimeException {
-               if(currContext != null) {
-                       synchronized(isGPUContextCreated) {
-                               cudnnDestroy(LibMatrixCUDA.cudnnHandle);
-                               cublasDestroy(LibMatrixCUDA.cublasHandle);
-                               cusparseDestroy(LibMatrixCUDA.cusparseHandle);
-                               currContext = null;
-                               isGPUContextCreated = false;
-                       }
-               }
-               else if(LibMatrixCUDA.cudnnHandle != null || 
LibMatrixCUDA.cublasHandle != null) {
-                       throw new DMLRuntimeException("Error while destroying 
the GPUContext");
-               }
-       }
-
-}

Reply via email to