http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 8da67ea..b3c19ef 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -18,14 +18,24 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
-import jcuda.Pointer;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcusolver.cusolverDnHandle;
-import jcuda.jcusolver.cusolverSpHandle;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaDeviceProp;
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
+import static jcuda.jcusparse.JCusparse.cusparseCreate;
+import static jcuda.jcusparse.JCusparse.cusparseDestroy;
+import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaGetDeviceCount;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.runtime.JCuda.cudaSetDevice;
+import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
 
 import java.util.ArrayList;
 import java.util.Collections;
@@ -45,24 +55,14 @@ import 
org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.LRUCacheMap;
 
-import static jcuda.jcublas.JCublas2.cublasCreate;
-import static jcuda.jcublas.JCublas2.cublasDestroy;
-import static jcuda.jcudnn.JCudnn.cudnnCreate;
-import static jcuda.jcudnn.JCudnn.cudnnDestroy;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
-import static jcuda.jcusparse.JCusparse.cusparseCreate;
-import static jcuda.jcusparse.JCusparse.cusparseDestroy;
-import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
-import static jcuda.runtime.JCuda.cudaFree;
-import static jcuda.runtime.JCuda.cudaGetDeviceCount;
-import static jcuda.runtime.JCuda.cudaMalloc;
-import static jcuda.runtime.JCuda.cudaMemGetInfo;
-import static jcuda.runtime.JCuda.cudaMemset;
-import static jcuda.runtime.JCuda.cudaSetDevice;
-import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusolver.cusolverSpHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
 
 /**
  * Represents a context per GPU accessible through the same JVM
@@ -71,606 +71,643 @@ import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
 public class GPUContext {
 
        protected static final Log LOG = 
LogFactory.getLog(GPUContext.class.getName());
+       /**
+        * currently employed eviction policy
+        */
+       public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
+       /**
+        * The minimum CUDA Compute capability needed for SystemML.
+        * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per 
block are supported.
+        * If SystemML needs to run on an older card, this logic can be 
revisited.
+        */
+       final int MAJOR_REQUIRED = 3;
+       final int MINOR_REQUIRED = 0;
+       /**
+        * active device assigned to this GPUContext instance
+        */
+       private final int deviceNum;
+       // Invoke cudaMemGetInfo to get available memory information. Useful if 
GPU is shared among multiple application.
+       public double GPU_MEMORY_UTILIZATION_FACTOR = 
ConfigurationManager.getDMLConfig()
+                       
.getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+       /**
+        * Map of free blocks allocate on GPU. maps size_of_block -> pointer on 
GPU
+        */
+       private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new 
LRUCacheMap<>();
+       /**
+        * To record size of allocated blocks
+        */
+       private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>();
+       /**
+        * list of allocated {@link GPUObject} instances allocated on {@link 
GPUContext#deviceNum} GPU
+        * These are matrices allocated on the GPU on which rmvar hasn't been 
called yet.
+        * If a {@link GPUObject} has more than one lock on it, it cannot be 
freed
+        * If it has zero locks on it, it can be freed, but it is preferrable 
to keep it around
+        * so that an extraneous host to dev transfer can be avoided
+        */
+       private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
+       /**
+        * cudnnHandle for Deep Neural Network operations on the GPU
+        */
+       private cudnnHandle cudnnHandle;
+       /**
+        * cublasHandle for BLAS operations on the GPU
+        */
+       private cublasHandle cublasHandle;
+       /**
+        * cusparseHandle for certain sparse BLAS operations on the GPU
+        */
+       private cusparseHandle cusparseHandle;
+       /**
+        * cusolverDnHandle for invoking solve() function on dense matrices on 
the GPU
+        */
+       private cusolverDnHandle cusolverDnHandle;
+       /**
+        * cusolverSpHandle for invoking solve() function on sparse matrices on 
the GPU
+        */
+       private cusolverSpHandle cusolverSpHandle;
+       /**
+        * to launch custom CUDA kernel, specific to the active GPU for this 
GPUContext
+        */
+       private JCudaKernels kernels;
+
+       protected GPUContext(int deviceNum) throws DMLRuntimeException {
+               this.deviceNum = deviceNum;
+               cudaSetDevice(deviceNum);
+
+               cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+
+               long free[] = { 0 };
+               long total[] = { 0 };
+               cudaMemGetInfo(free, total);
+
+               long start = System.nanoTime();
+               cudnnHandle = new cudnnHandle();
+               cudnnCreate(cudnnHandle);
+               cublasHandle = new cublasHandle();
+               cublasCreate(cublasHandle);
+               // For cublas v2, cublasSetPointerMode tells Cublas whether to 
expect scalar arguments on device or on host
+               // This applies to arguments like "alpha" in Dgemm, and "y" in 
Ddot.
+               // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, 
cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
+               cusparseHandle = new cusparseHandle();
+               cusparseCreate(cusparseHandle);
+
+               cusolverDnHandle = new cusolverDnHandle();
+               cusolverDnCreate(cusolverDnHandle);
+               cusolverSpHandle = new cusolverSpHandle();
+               cusolverSpCreate(cusolverSpHandle);
+
+               kernels = new JCudaKernels(deviceNum);
+
+               GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
+               LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, 
Available: " + (free[0] * (1e-6)) + " MB on "
+                               + this);
 
-  /** Eviction policies for {@link GPUContext#evict(long)} */
-       public enum EvictionPolicy {
-               LRU, LFU, MIN_EVICT
        }
 
-       /** currently employed eviction policy */
-       public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
+       public static int cudaGetDevice() {
+               int[] device = new int[1];
+               JCuda.cudaGetDevice(device);
+               return device[0];
+       }
 
-       /** Map of free blocks allocate on GPU. maps size_of_block -> pointer 
on GPU */
-       private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new 
LRUCacheMap<>();
+       public int getDeviceNum() {
+               return deviceNum;
+       }
 
-       /** To record size of allocated blocks */
-       private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>();
+       /**
+        * Sets the device for the calling thread.
+        * This method must be called after
+        * {@link 
org.apache.sysml.runtime.controlprogram.context.ExecutionContext#getGPUContext(int)}
+        * If in a multi-threaded env like parfor, this method must be called 
when in the
+        * appropriate thread
+        */
+       public void initializeThread() {
+               cudaSetDevice(deviceNum);
+       }
+
+       /**
+        * Convenience method for {@link #allocate(String, long, int)}, 
defaults statsCount to 1.
+        *
+        * @param size size of data (in bytes) to allocate
+        * @return jcuda pointer
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public Pointer allocate(long size) throws DMLRuntimeException {
+               return allocate(null, size, 1);
+       }
+
+       /**
+        * Convenience method for {@link #allocate(String, long, int)}, 
defaults statsCount to 1.
+        *
+        * @param instructionName name of instruction for which to record per 
instruction performance statistics, null if don't want to record
+        * @param size            size of data (in bytes) to allocate
+        * @return jcuda pointer
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public Pointer allocate(String instructionName, long size) throws 
DMLRuntimeException {
+               return allocate(instructionName, size, 1);
+       }
 
-  /** active device assigned to this GPUContext instance */
-  private final int deviceNum;
+       /**
+        * Allocates temporary space on the device.
+        * Does not update bookkeeping.
+        * The caller is responsible for freeing up after usage.
+        *
+        * @param instructionName name of instruction for which to record per 
instruction performance statistics, null if don't want to record
+        * @param size            Size of data (in bytes) to allocate
+        * @param statsCount      amount to increment the cudaAllocCount by
+        * @return jcuda Pointer
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public Pointer allocate(String instructionName, long size, int 
statsCount) throws DMLRuntimeException {
+               long t0 = 0, t1 = 0, end = 0;
+               Pointer A;
+               if (freeCUDASpaceMap.containsKey(size)) {
+                       LOG.trace("GPU : in allocate from instruction " + 
instructionName + ", found free block of size " + (size
+                                       / 1024.0) + " Kbytes from previously 
allocated block on " + this);
+                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                               t0 = System.nanoTime();
+                       LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
+                       A = freeList.pop();
+                       if (freeList.isEmpty())
+                               freeCUDASpaceMap.remove(size);
+                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                               GPUStatistics
+                                               
.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, 
System.nanoTime() - t0);
+               } else {
+                       LOG.trace(
+                                       "GPU : in allocate from instruction " + 
instructionName + ", allocating new block of size " + (size
+                                                       / 1024.0) + " Kbytes on 
" + this);
+                       if (DMLScript.STATISTICS)
+                               t0 = System.nanoTime();
+                       ensureFreeSpace(instructionName, size);
+                       A = new Pointer();
+                       cudaMalloc(A, size);
+                       if (DMLScript.STATISTICS)
+                               
GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
+                       if (DMLScript.STATISTICS)
+                               
GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
+                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                               
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_ALLOCATE,
+                                               System.nanoTime() - t0);
+               }
+               // Set all elements to 0 since newly allocated space will 
contain garbage
+               if (DMLScript.STATISTICS)
+                       t1 = System.nanoTime();
+               LOG.trace("GPU : in allocate from instruction " + 
instructionName + ", setting block of size " + (size / 1024.0)
+                               + " Kbytes to zero on " + this);
+               cudaMemset(A, 0, size);
+               if (DMLScript.STATISTICS)
+                       end = System.nanoTime();
+               if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+                       GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
+               if (DMLScript.STATISTICS)
+                       GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
+               if (DMLScript.STATISTICS)
+                       GPUStatistics.cudaMemSet0Count.getAndAdd(1);
+               cudaBlockSizeMap.put(A, size);
+               return A;
 
-  /** list of allocated {@link GPUObject} instances allocated on {@link 
GPUContext#deviceNum} GPU
-   * These are matrices allocated on the GPU on which rmvar hasn't been called 
yet.
-   * If a {@link GPUObject} has more than one lock on it, it cannot be freed
-   * If it has zero locks on it, it can be freed, but it is preferrable to 
keep it around
-   * so that an extraneous host to dev transfer can be avoided */
-  private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
+       }
 
-  /** cudnnHandle for Deep Neural Network operations on the GPU */
-  private cudnnHandle cudnnHandle;
+       /**
+        * Does lazy cudaFree calls
+        *
+        * @param toFree {@link Pointer} instance to be freed
+        */
+       public void cudaFreeHelper(final Pointer toFree) {
+               cudaFreeHelper(null, toFree, false);
+       }
 
-  /** cublasHandle for BLAS operations on the GPU */
-  private cublasHandle cublasHandle;
+       /**
+        * does lazy/eager cudaFree calls
+        *
+        * @param toFree {@link Pointer} instance to be freed
+        * @param eager  true if to be done eagerly
+        */
+       public void cudaFreeHelper(final Pointer toFree, boolean eager) {
+               cudaFreeHelper(null, toFree, eager);
+       }
 
-  /** cusparseHandle for certain sparse BLAS operations on the GPU */
-  private cusparseHandle cusparseHandle;
+       /**
+        * Does lazy cudaFree calls
+        *
+        * @param instructionName name of the instruction for which to record 
per instruction free time, null if do not want to record
+        * @param toFree          {@link Pointer} instance to be freed
+        */
+       public void cudaFreeHelper(String instructionName, final Pointer 
toFree) {
+               cudaFreeHelper(instructionName, toFree, false);
+       }
 
-  /** cusolverDnHandle for invoking solve() function on dense matrices on the 
GPU */
-  private cusolverDnHandle cusolverDnHandle;
+       /**
+        * Does cudaFree calls, lazily
+        *
+        * @param instructionName name of the instruction for which to record 
per instruction free time, null if do not want to record
+        * @param toFree          {@link Pointer} instance to be freed
+        * @param eager           true if to be done eagerly
+        */
+       public void cudaFreeHelper(String instructionName, final Pointer 
toFree, boolean eager) {
+               Pointer dummy = new Pointer();
+               if (toFree == dummy) // trying to free a null pointer
+                       return;
+               long t0 = 0;
+               assert cudaBlockSizeMap.containsKey(
+                               toFree) : "ERROR : Internal state corrupted, 
cache block size map is not aware of a block it trying to free up";
+               long size = cudaBlockSizeMap.get(toFree);
+               if (eager) {
+                       LOG.trace("GPU : eagerly freeing cuda memory [ " + 
toFree + " ] for instruction " + instructionName + " on "
+                                       + this);
+                       if (DMLScript.STATISTICS)
+                               t0 = System.nanoTime();
+                       cudaFree(toFree);
+                       cudaBlockSizeMap.remove(toFree);
+                       if (DMLScript.STATISTICS)
+                               
GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
+                       if (DMLScript.STATISTICS)
+                               GPUStatistics.cudaDeAllocCount.addAndGet(1);
+                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                               
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE,
+                                               System.nanoTime() - t0);
+               } else {
+                       LOG.trace("GPU : lazily freeing cuda memory for 
instruction " + instructionName + " on " + this);
+                       LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
+                       if (freeList == null) {
+                               freeList = new LinkedList<Pointer>();
+                               freeCUDASpaceMap.put(size, freeList);
+                       }
+                       if (freeList.contains(toFree))
+                               throw new RuntimeException("GPU : Internal 
state corrupted, double free");
+                       freeList.add(toFree);
+               }
+       }
 
-  /** cusolverSpHandle for invoking solve() function on sparse matrices on the 
GPU */
-  private cusolverSpHandle cusolverSpHandle;
+       /**
+        * Thin wrapper over {@link GPUContext#evict(long)}
+        *
+        * @param size size to check
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       void ensureFreeSpace(long size) throws DMLRuntimeException {
+               ensureFreeSpace(null, size);
+       }
 
-  /** to launch custom CUDA kernel, specific to the active GPU for this 
GPUContext */
-  private JCudaKernels kernels;
+       /**
+        * Thin wrapper over {@link GPUContext#evict(long)}
+        *
+        * @param instructionName instructionName name of the instruction for 
which performance measurements are made
+        * @param size            size to check
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       void ensureFreeSpace(String instructionName, long size) throws 
DMLRuntimeException {
+               if (size >= getAvailableMemory()) {
+                       evict(instructionName, size);
+               }
+       }
 
        /**
-        * The minimum CUDA Compute capability needed for SystemML.
-        * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per 
block are supported.
-        * If SystemML needs to run on an older card, this logic can be 
revisited.
+        * Convenience wrapper over {@link GPUContext#evict(String, long)}
+        *
+        * @param GPUSize Desired size to be freed up on the GPU
+        * @throws DMLRuntimeException If no blocks to free up or if not enough 
blocks with zero locks on them.
         */
-       final int MAJOR_REQUIRED = 3;
-       final int MINOR_REQUIRED = 0;
+       protected void evict(final long GPUSize) throws DMLRuntimeException {
+               evict(null, GPUSize);
+       }
+
+       /**
+        * Memory on the GPU is tried to be freed up until either a chunk of 
needed size is freed up
+        * or it fails.
+        * First the set of reusable blocks is freed up. If that isn't enough, 
the set of allocated matrix
+        * blocks with zero locks on them is freed up.
+        * The process cycles through the sorted list of allocated {@link 
GPUObject} instances. Sorting is based on
+        * number of (read) locks that have been obtained on it (reverse 
order). It repeatedly frees up
+        * blocks on which there are zero locks until the required size has 
been freed up.
+        * // TODO: update it with hybrid policy
+        *
+        * @param instructionName name of the instruction for which performance 
measurements are made
+        * @param neededSize      desired size to be freed up on the GPU
+        * @throws DMLRuntimeException If no reusable memory blocks to free up 
or if not enough matrix blocks with zero locks on them.
+        */
+       protected void evict(String instructionName, final long neededSize) 
throws DMLRuntimeException {
+               LOG.trace("GPU : evict called from " + instructionName + " for 
size " + neededSize + " on " + this);
+               GPUStatistics.cudaEvictionCount.addAndGet(1);
+               // Release the set of free blocks maintained in a 
GPUObject.freeCUDASpaceMap
+               // to free up space
+               LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = 
freeCUDASpaceMap;
+               while (lruCacheMap.size() > 0) {
+                       if (neededSize <= getAvailableMemory())
+                               break;
+                       Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = 
lruCacheMap.removeAndGetLRUEntry();
+                       LinkedList<Pointer> toFreeList = 
toFreeListPair.getValue();
+                       Long size = toFreeListPair.getKey();
+                       Pointer toFree = toFreeList.pop();
+                       if (toFreeList.isEmpty())
+                               lruCacheMap.remove(size);
+                       cudaFreeHelper(instructionName, toFree, true);
+               }
+
+               if (neededSize <= getAvailableMemory())
+                       return;
+
+               if (allocatedGPUObjects.size() == 0) {
+                       throw new DMLRuntimeException(
+                                       "There is not enough memory on device 
for this matrix, request (" + neededSize + ")");
+               }
+
+               Collections.sort(allocatedGPUObjects, new 
Comparator<GPUObject>() {
+                       @Override
+                       public int compare(GPUObject p1, GPUObject p2) {
+                               long p1Val = p1.locks.get();
+                               long p2Val = p2.locks.get();
+
+                               if (p1Val > 0 && p2Val > 0) {
+                                       // Both are locked, so don't sort
+                                       return 0;
+                               } else if (p1Val > 0 || p2Val > 0) {
+                                       // Put the unlocked one to RHS
+                                       return Long.compare(p2Val, p1Val);
+                               } else {
+                                       // Both are unlocked
+
+                                       if (evictionPolicy == 
EvictionPolicy.MIN_EVICT) {
+                                               long p1Size = 0;
+                                               long p2Size = 0;
+                                               try {
+                                                       p1Size = 
p1.getSizeOnDevice() - neededSize;
+                                                       p2Size = 
p2.getSizeOnDevice() - neededSize;
+                                               } catch (DMLRuntimeException e) 
{
+                                                       throw new 
RuntimeException(e);
+                                               }
+
+                                               if (p1Size >= 0 && p2Size >= 0) 
{
+                                                       return 
Long.compare(p2Size, p1Size);
+                                               } else {
+                                                       return 
Long.compare(p1Size, p2Size);
+                                               }
+                                       } else if (evictionPolicy == 
EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
+                                               return 
Long.compare(p2.timestamp.get(), p1.timestamp.get());
+                                       } else {
+                                               throw new 
RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
+                                       }
+                               }
+                       }
+               });
+
+               while (neededSize > getAvailableMemory() && 
allocatedGPUObjects.size() > 0) {
+                       GPUObject toBeRemoved = 
allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
+                       if (toBeRemoved.locks.get() > 0) {
+                               throw new DMLRuntimeException(
+                                               "There is not enough memory on 
device for this matrix, request (" + neededSize + ")");
+                       }
+                       if (toBeRemoved.dirty) {
+                               toBeRemoved.copyFromDeviceToHost();
+                       }
+
+                       toBeRemoved.clearData(true);
+               }
+       }
+
+       /**
+        * Whether the GPU associated with this {@link GPUContext} has recorded 
the usage of a certain block
+        *
+        * @param o the block
+        * @return true if present, false otherwise
+        */
+       public boolean isBlockRecorded(GPUObject o) {
+               return allocatedGPUObjects.contains(o);
+       }
+
+       /**
+        * @param o {@link GPUObject} instance to record
+        * @see GPUContext#allocatedGPUObjects
+        * Records the usage of a matrix block
+        */
+       public void recordBlockUsage(GPUObject o) {
+               allocatedGPUObjects.add(o);
+       }
 
-  // Invoke cudaMemGetInfo to get available memory information. Useful if GPU 
is shared among multiple application.
-  public double GPU_MEMORY_UTILIZATION_FACTOR = 
ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
-
-  protected GPUContext(int deviceNum) throws DMLRuntimeException {
-    this.deviceNum = deviceNum;
-    cudaSetDevice(deviceNum);
-
-    cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-
-    long free[] = {0};
-    long total[] = {0};
-    cudaMemGetInfo(free, total);
-
-    long start = System.nanoTime();
-    cudnnHandle = new cudnnHandle();
-    cudnnCreate(cudnnHandle);
-    cublasHandle = new cublasHandle();
-    cublasCreate(cublasHandle);
-    // For cublas v2, cublasSetPointerMode tells Cublas whether to expect 
scalar arguments on device or on host
-    // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
-    // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, 
cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
-    cusparseHandle = new cusparseHandle();
-    cusparseCreate(cusparseHandle);
-
-    cusolverDnHandle = new cusolverDnHandle();
-    cusolverDnCreate(cusolverDnHandle);
-    cusolverSpHandle = new cusolverSpHandle();
-    cusolverSpCreate(cusolverSpHandle);
-
-    kernels = new JCudaKernels(deviceNum);
-
-    GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
-    LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: 
" + (free[0] * (1e-6)) + " MB on " + this);
-
-  }
-
-  public int getDeviceNum() {
-    return deviceNum;
-  }
-
-  /**
-   * Sets the device for the calling thread.
-   * This method must be called after {@link GPUContextPool#getFromPool()}
-   * is called.
-   * If in a multi-threaded env like parfor, this method must be called when 
in the
-   * appropriate thread
-   */
-  public void initializeThread() {
-    cudaSetDevice(deviceNum);
-  }
-
-  public static int cudaGetDevice() {
-    int[] device = new int[1];
-    JCuda.cudaGetDevice(device);
-    return device[0];
-  }
-
-  /**
-   * Convenience method for {@link #allocate(String, long, int)}, defaults 
statsCount to 1.
-   *
-   * @param size size of data (in bytes) to allocate
-   * @return jcuda pointer
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  public Pointer allocate(long size) throws DMLRuntimeException {
-    return allocate(null, size, 1);
-  }
-
-  /**
-   * Convenience method for {@link #allocate(String, long, int)}, defaults 
statsCount to 1.
-   *
-   * @param instructionName name of instruction for which to record per 
instruction performance statistics, null if don't want to record
-   * @param size            size of data (in bytes) to allocate
-   * @return jcuda pointer
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  public Pointer allocate(String instructionName, long size) throws 
DMLRuntimeException {
-    return allocate(instructionName, size, 1);
-  }
-
-  /**
-   * Allocates temporary space on the device.
-   * Does not update bookkeeping.
-   * The caller is responsible for freeing up after usage.
-   *
-   * @param instructionName name of instruction for which to record per 
instruction performance statistics, null if don't want to record
-   * @param size            Size of data (in bytes) to allocate
-   * @param statsCount      amount to increment the cudaAllocCount by
-   * @return jcuda Pointer
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  public Pointer allocate(String instructionName, long size, int statsCount) 
throws DMLRuntimeException {
-    long t0 = 0, t1 = 0, end = 0;
-    Pointer A;
-    if (freeCUDASpaceMap.containsKey(size)) {
-      LOG.trace("GPU : in allocate from instruction " + instructionName + ", 
found free block of size " + (size / 1024.0) + " Kbytes from previously 
allocated block on " + this);
-      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
-      LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
-      A = freeList.pop();
-      if (freeList.isEmpty())
-        freeCUDASpaceMap.remove(size);
-      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
-        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
-    } else {
-      LOG.trace("GPU : in allocate from instruction " + instructionName + ", 
allocating new block of size " + (size / 1024.0) + " Kbytes on " + this);
-      if (DMLScript.STATISTICS) t0 = System.nanoTime();
-      ensureFreeSpace(instructionName, size);
-      A = new Pointer();
-      cudaMalloc(A, size);
-      if (DMLScript.STATISTICS) 
GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
-      if (DMLScript.STATISTICS) 
GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
-      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
-        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
-    }
-    // Set all elements to 0 since newly allocated space will contain garbage
-    if (DMLScript.STATISTICS) t1 = System.nanoTime();
-    LOG.trace("GPU : in allocate from instruction " + instructionName + ", 
setting block of size " + (size / 1024.0) + " Kbytes to zero on " + this);
-    cudaMemset(A, 0, size);
-    if (DMLScript.STATISTICS) end = System.nanoTime();
-    if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
-      GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - 
t1);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1);
-    cudaBlockSizeMap.put(A, size);
-    return A;
-
-  }
-
-  /**
-   * Does lazy cudaFree calls
-   *
-   * @param toFree {@link Pointer} instance to be freed
-   */
-  public void cudaFreeHelper(final Pointer toFree) {
-    cudaFreeHelper(null, toFree, false);
-  }
-
-  /**
-   * does lazy/eager cudaFree calls
-   *
-   * @param toFree {@link Pointer} instance to be freed
-   * @param eager  true if to be done eagerly
-   */
-  public void cudaFreeHelper(final Pointer toFree, boolean eager) {
-    cudaFreeHelper(null, toFree, eager);
-  }
-
-  /**
-   * Does lazy cudaFree calls
-   *
-   * @param instructionName name of the instruction for which to record per 
instruction free time, null if do not want to record
-   * @param toFree          {@link Pointer} instance to be freed
-   */
-  public void cudaFreeHelper(String instructionName, final Pointer toFree) {
-    cudaFreeHelper(instructionName, toFree, false);
-  }
-
-  /**
-   * Does cudaFree calls, lazily
-   *
-   * @param instructionName name of the instruction for which to record per 
instruction free time, null if do not want to record
-   * @param toFree          {@link Pointer} instance to be freed
-   * @param eager           true if to be done eagerly
-   */
-  public void cudaFreeHelper(String instructionName, final Pointer toFree, 
boolean eager) {
-       Pointer dummy = new Pointer();
-       if (toFree == dummy) // trying to free a null pointer
-               return;
-    long t0 = 0;
-    assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state 
corrupted, cache block size map is not aware of a block it trying to free up";
-    long size = cudaBlockSizeMap.get(toFree);
-    if (eager) {
-      LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for 
instruction " + instructionName + " on " + this);
-      if (DMLScript.STATISTICS) t0 = System.nanoTime();
-      cudaFree(toFree);
-      cudaBlockSizeMap.remove(toFree);
-      if (DMLScript.STATISTICS) 
GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
-      if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.addAndGet(1);
-      if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
-        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
-    } else {
-      LOG.trace("GPU : lazily freeing cuda memory for instruction " + 
instructionName + " on " + this);
-      LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
-      if (freeList == null) {
-        freeList = new LinkedList<Pointer>();
-        freeCUDASpaceMap.put(size, freeList);
-      }
-      if (freeList.contains(toFree))
-        throw new RuntimeException("GPU : Internal state corrupted, double 
free");
-      freeList.add(toFree);
-    }
-  }
-
-  /**
-   * Thin wrapper over {@link GPUContext#evict(long)}
-   *
-   * @param size size to check
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  void ensureFreeSpace(long size) throws DMLRuntimeException {
-    ensureFreeSpace(null, size);
-  }
-
-  /**
-   * Thin wrapper over {@link GPUContext#evict(long)}
-   *
-   * @param instructionName instructionName name of the instruction for which 
performance measurements are made
-   * @param size            size to check
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  void ensureFreeSpace(String instructionName, long size) throws 
DMLRuntimeException {
-    if (size >= getAvailableMemory()) {
-      evict(instructionName, size);
-    }
-  }
-
-  /**
-   * Convenience wrapper over {@link GPUContext#evict(String, long)}
-   *
-   * @param GPUSize Desired size to be freed up on the GPU
-   * @throws DMLRuntimeException If no blocks to free up or if not enough 
blocks with zero locks on them.
-   */
-  protected void evict(final long GPUSize) throws DMLRuntimeException {
-    evict(null, GPUSize);
-  }
-
-  /**
-   * Memory on the GPU is tried to be freed up until either a chunk of needed 
size is freed up
-   * or it fails.
-   * First the set of reusable blocks is freed up. If that isn't enough, the 
set of allocated matrix
-   * blocks with zero locks on them is freed up.
-   * The process cycles through the sorted list of allocated {@link GPUObject} 
instances. Sorting is based on
-   * number of (read) locks that have been obtained on it (reverse order). It 
repeatedly frees up
-   * blocks on which there are zero locks until the required size has been 
freed up.
-   * // TODO: update it with hybrid policy
-   *
-   * @param instructionName name of the instruction for which performance 
measurements are made
-   * @param neededSize      desired size to be freed up on the GPU
-   * @throws DMLRuntimeException If no reusable memory blocks to free up or if 
not enough matrix blocks with zero locks on them.
-   */
-  protected void evict(String instructionName, final long neededSize) throws 
DMLRuntimeException {
-    LOG.trace("GPU : evict called from " + instructionName + " for size " + 
neededSize + " on " + this);
-    GPUStatistics.cudaEvictionCount.addAndGet(1);
-    // Release the set of free blocks maintained in a 
GPUObject.freeCUDASpaceMap
-    // to free up space
-    LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap;
-    while (lruCacheMap.size() > 0) {
-      if (neededSize <= getAvailableMemory())
-        break;
-      Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = 
lruCacheMap.removeAndGetLRUEntry();
-      LinkedList<Pointer> toFreeList = toFreeListPair.getValue();
-      Long size = toFreeListPair.getKey();
-      Pointer toFree = toFreeList.pop();
-      if (toFreeList.isEmpty())
-        lruCacheMap.remove(size);
-      cudaFreeHelper(instructionName, toFree, true);
-    }
-
-    if (neededSize <= getAvailableMemory())
-      return;
-
-    if (allocatedGPUObjects.size() == 0) {
-      throw new DMLRuntimeException("There is not enough memory on device for 
this matrix, request (" + neededSize + ")");
-    }
-
-    Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
-      @Override
-      public int compare(GPUObject p1, GPUObject p2) {
-        long p1Val = p1.locks.get();
-        long p2Val = p2.locks.get();
-
-        if (p1Val > 0 && p2Val > 0) {
-          // Both are locked, so don't sort
-          return 0;
-        } else if (p1Val > 0 || p2Val > 0) {
-          // Put the unlocked one to RHS
-          return Long.compare(p2Val, p1Val);
-        } else {
-          // Both are unlocked
-
-          if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
-            long p1Size = 0;
-            long p2Size = 0;
-            try {
-              p1Size = p1.getSizeOnDevice() - neededSize;
-              p2Size = p2.getSizeOnDevice() - neededSize;
-            } catch (DMLRuntimeException e) {
-              throw new RuntimeException(e);
-            }
-
-            if (p1Size >= 0 && p2Size >= 0) {
-              return Long.compare(p2Size, p1Size);
-            } else {
-              return Long.compare(p1Size, p2Size);
-            }
-          } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == 
EvictionPolicy.LFU) {
-            return Long.compare(p2.timestamp.get(), p1.timestamp.get());
-          } else {
-            throw new RuntimeException("Unsupported eviction policy:" + 
evictionPolicy.name());
-          }
-        }
-      }
-    });
-
-    while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 
0) {
-      GPUObject toBeRemoved = 
allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
-      if (toBeRemoved.locks.get() > 0) {
-        throw new DMLRuntimeException("There is not enough memory on device 
for this matrix, request (" + neededSize + ")");
-      }
-      if (toBeRemoved.dirty) {
-        toBeRemoved.copyFromDeviceToHost();
-      }
-
-      toBeRemoved.clearData(true);
-    }
-  }
-
-  /**
-   * Whether the GPU associated with this {@link GPUContext} has recorded the 
usage of a certain block
-   *
-   * @param o the block
-   * @return true if present, false otherwise
-   */
-  public boolean isBlockRecorded(GPUObject o) {
-    return allocatedGPUObjects.contains(o);
-  }
-
-  /**
-   * @param o {@link GPUObject} instance to record
-   * @see GPUContext#allocatedGPUObjects
-   * Records the usage of a matrix block
-   */
-  public void recordBlockUsage(GPUObject o) {
-    allocatedGPUObjects.add(o);
-  }
-
-  /**
-   * @param o {@link GPUObject} instance to remove from the list of allocated 
GPU objects
-   * @see GPUContext#allocatedGPUObjects
-   * Records that a block is not used anymore
-   */
-  public void removeRecordedUsage(GPUObject o) {
-    allocatedGPUObjects.remove(o);
-  }
-
-  /**
-   * Gets the available memory on GPU that SystemML can use
-   *
-   * @return the available memory in bytes
-   */
-  public long getAvailableMemory() {
-    long free[] = {0};
-    long total[] = {0};
-    cudaMemGetInfo(free, total);
-    return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
-  }
-
-  /**
-   * Makes sure that GPU that SystemML is trying to use has the minimum 
compute capability needed
-   *
-   * @throws DMLRuntimeException if the compute capability is less than what 
is required
-   */
-  public void ensureComputeCapability() throws DMLRuntimeException {
-    int[] devices = {-1};
-    cudaGetDeviceCount(devices);
-    if (devices[0] == -1) {
-      throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 
devices");
-    }
-    boolean isComputeCapable = true;
-    for (int i = 0; i < devices[0]; i++) {
-      cudaDeviceProp properties = GPUContextPool.getGPUProperties(i);
-      int major = properties.major;
-      int minor = properties.minor;
-      if (major < MAJOR_REQUIRED) {
-        isComputeCapable = false;
-      } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) {
-        isComputeCapable = false;
-      }
-    }
-    if (!isComputeCapable) {
-      throw new DMLRuntimeException("One of the CUDA cards on the system has 
compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED);
-    }
-  }
-
-  public GPUObject createGPUObject(MatrixObject mo) {
-    return new GPUObject(this, mo);
-  }
-
-  /**
-   * Gets the device properties for the active GPU (set with cudaSetDevice())
-   *
-   * @return the device properties
-   * @throws DMLRuntimeException ?
-   */
-  public cudaDeviceProp getGPUProperties() throws DMLRuntimeException {
-    return GPUContextPool.getGPUProperties(deviceNum);
-  }
-
-  /**
-   * Gets the maximum number of threads per block for "active" GPU
-   *
-   * @return the maximum number of threads per block
-   * @throws DMLRuntimeException ?
-   */
-  public int getMaxThreadsPerBlock() throws DMLRuntimeException {
-    cudaDeviceProp deviceProps = getGPUProperties();
-    return deviceProps.maxThreadsPerBlock;
-  }
-
-  /**
-   * Gets the maximum number of blocks supported by the active cuda device
-   *
-   * @return the maximum number of blocks supported
-   * @throws DMLRuntimeException ?
-   */
-  public int getMaxBlocks() throws DMLRuntimeException {
-    cudaDeviceProp deviceProp = getGPUProperties();
-    return deviceProp.maxGridSize[0];
-  }
-
-  /**
-   * Gets the shared memory per block supported by the active cuda device
-   *
-   * @return the shared memory per block
-   * @throws DMLRuntimeException ?
-   */
-  public long getMaxSharedMemory() throws DMLRuntimeException {
-    cudaDeviceProp deviceProp = getGPUProperties();
-    return deviceProp.sharedMemPerBlock;
-  }
-
-  /**
-   * Gets the warp size supported by the active cuda device
-   *
-   * @return the warp size
-   * @throws DMLRuntimeException ?
-   */
-  public int getWarpSize() throws DMLRuntimeException {
-    cudaDeviceProp deviceProp = getGPUProperties();
-    return deviceProp.warpSize;
-  }
-
-  public cudnnHandle getCudnnHandle() {
-    return cudnnHandle;
-  }
-
-  public cublasHandle getCublasHandle() {
-    return cublasHandle;
-  }
-
-  public cusparseHandle getCusparseHandle() {
-    return cusparseHandle;
-  }
-
-  public cusolverDnHandle getCusolverDnHandle() {
-    return cusolverDnHandle;
-  }
-
-  public cusolverSpHandle getCusolverSpHandle() {
-    return cusolverSpHandle;
-  }
-
-  public JCudaKernels getKernels() {
-    return kernels;
-  }
-
-  /**
-   * Destroys this GPUContext object
-   *
-   * @throws DMLRuntimeException if error
-   */
-  public void destroy() throws DMLRuntimeException {
-    LOG.trace("GPU : this context was destroyed, this = " + this.toString());
-    clearMemory();
-    cudnnDestroy(cudnnHandle);
-    cublasDestroy(cublasHandle);
-    cusparseDestroy(cusparseHandle);
-    cusolverDnDestroy(cusolverDnHandle);
-    cusolverSpDestroy(cusolverSpHandle);
-    cudnnHandle = null;
-    cublasHandle = null;
-    cusparseHandle = null;
-
-  }
-
-  /**
-   * Clears all memory used by this {@link GPUContext}
-   * Be careful to ensure that no memory is currently being used in the 
temporary memory before invoking this
-   * If memory is being used between MLContext invocations, they are pointed 
to by a {@link GPUObject} instance
-   * which would be part of the {@link MatrixObject}. The cleanup of that 
{@link MatrixObject} instance will
-   * cause the memory associated with that block on the GPU to be freed up.
-   * @throws DMLRuntimeException ?
-   */
-  public void clearMemory() throws DMLRuntimeException {
-    clearTemporaryMemory();
-    while (!allocatedGPUObjects.isEmpty()) {
-      GPUObject o = allocatedGPUObjects.get(0);
-      if (o.isDirty()){
-        LOG.warn("Attempted to free GPU Memory when a block[" + o + "] is 
still on GPU memory, copying it back to host.");
-        o.acquireHostRead();
-      }
-      o.clearData(true);
-    }
-    allocatedGPUObjects.clear();
-  }
-
-  /**
-   * Clears up the memory used to optimize cudaMalloc/cudaFree calls
-   */
-  public void clearTemporaryMemory() {
-    // To record the cuda block sizes needed by allocatedGPUObjects, others 
are cleared up.
-    HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>();
-         for (GPUObject o : allocatedGPUObjects) {
-                 if (o.isSparse()) {
-                         CSRPointer p = o.getSparseMatrixCudaPointer();
-                         if (p.rowPtr != null && 
cudaBlockSizeMap.containsKey(p.rowPtr)) {
-                                 tmpCudaBlockSizeMap.put(p.rowPtr, 
cudaBlockSizeMap.get(p.rowPtr));
-                         }
-                         if (p.colInd != null && 
cudaBlockSizeMap.containsKey(p.colInd)) {
-                                 tmpCudaBlockSizeMap.put(p.colInd, 
cudaBlockSizeMap.get(p.colInd));
-                         }
-                         if (p.val != null && 
cudaBlockSizeMap.containsKey(p.val)) {
-                                 tmpCudaBlockSizeMap.put(p.val, 
cudaBlockSizeMap.get(p.val));
-                         }
-
-                 } else {
-                         Pointer p = o.getJcudaDenseMatrixPtr();
-                         tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p));
-                 }
-         }
-
-    // garbage collect all temporarily allocated spaces
-    for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
-      for (Pointer p : l) {
-        cudaFreeHelper(p, true);
-      }
-    }
-    cudaBlockSizeMap.clear();
-    freeCUDASpaceMap.clear();
-
-    // Restore only those entries for which there are still blocks on the GPU
-    cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap);
-  }
-
-  @Override
-  public String toString() {
-    return "GPUContext{" +
-            "deviceNum=" + deviceNum +
-            '}';
-  }
+       /**
+        * @param o {@link GPUObject} instance to remove from the list of 
allocated GPU objects
+        * @see GPUContext#allocatedGPUObjects
+        * Records that a block is not used anymore
+        */
+       public void removeRecordedUsage(GPUObject o) {
+               allocatedGPUObjects.remove(o);
+       }
+
+       /**
+        * Gets the available memory on GPU that SystemML can use
+        *
+        * @return the available memory in bytes
+        */
+       public long getAvailableMemory() {
+               long free[] = { 0 };
+               long total[] = { 0 };
+               cudaMemGetInfo(free, total);
+               return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
+       }
+
+       /**
+        * Makes sure that GPU that SystemML is trying to use has the minimum 
compute capability needed
+        *
+        * @throws DMLRuntimeException if the compute capability is less than 
what is required
+        */
+       public void ensureComputeCapability() throws DMLRuntimeException {
+               int[] devices = { -1 };
+               cudaGetDeviceCount(devices);
+               if (devices[0] == -1) {
+                       throw new DMLRuntimeException("Call to 
cudaGetDeviceCount returned 0 devices");
+               }
+               boolean isComputeCapable = true;
+               for (int i = 0; i < devices[0]; i++) {
+                       cudaDeviceProp properties = 
GPUContextPool.getGPUProperties(i);
+                       int major = properties.major;
+                       int minor = properties.minor;
+                       if (major < MAJOR_REQUIRED) {
+                               isComputeCapable = false;
+                       } else if (major == MAJOR_REQUIRED && minor < 
MINOR_REQUIRED) {
+                               isComputeCapable = false;
+                       }
+               }
+               if (!isComputeCapable) {
+                       throw new DMLRuntimeException(
+                                       "One of the CUDA cards on the system 
has compute capability lower than " + MAJOR_REQUIRED + "."
+                                                       + MINOR_REQUIRED);
+               }
+       }
+
+       public GPUObject createGPUObject(MatrixObject mo) {
+               return new GPUObject(this, mo);
+       }
+
+       /**
+        * Gets the device properties for the active GPU (set with 
cudaSetDevice())
+        *
+        * @return the device properties
+        * @throws DMLRuntimeException ?
+        */
+       public cudaDeviceProp getGPUProperties() throws DMLRuntimeException {
+               return GPUContextPool.getGPUProperties(deviceNum);
+       }
+
+       /**
+        * Gets the maximum number of threads per block for "active" GPU
+        *
+        * @return the maximum number of threads per block
+        * @throws DMLRuntimeException ?
+        */
+       public int getMaxThreadsPerBlock() throws DMLRuntimeException {
+               cudaDeviceProp deviceProps = getGPUProperties();
+               return deviceProps.maxThreadsPerBlock;
+       }
+
+       /**
+        * Gets the maximum number of blocks supported by the active cuda device
+        *
+        * @return the maximum number of blocks supported
+        * @throws DMLRuntimeException ?
+        */
+       public int getMaxBlocks() throws DMLRuntimeException {
+               cudaDeviceProp deviceProp = getGPUProperties();
+               return deviceProp.maxGridSize[0];
+       }
+
+       /**
+        * Gets the shared memory per block supported by the active cuda device
+        *
+        * @return the shared memory per block
+        * @throws DMLRuntimeException ?
+        */
+       public long getMaxSharedMemory() throws DMLRuntimeException {
+               cudaDeviceProp deviceProp = getGPUProperties();
+               return deviceProp.sharedMemPerBlock;
+       }
+
+       /**
+        * Gets the warp size supported by the active cuda device
+        *
+        * @return the warp size
+        * @throws DMLRuntimeException ?
+        */
+       public int getWarpSize() throws DMLRuntimeException {
+               cudaDeviceProp deviceProp = getGPUProperties();
+               return deviceProp.warpSize;
+       }
+
+       public cudnnHandle getCudnnHandle() {
+               return cudnnHandle;
+       }
+
+       public cublasHandle getCublasHandle() {
+               return cublasHandle;
+       }
+
+       public cusparseHandle getCusparseHandle() {
+               return cusparseHandle;
+       }
+
+       public cusolverDnHandle getCusolverDnHandle() {
+               return cusolverDnHandle;
+       }
+
+       public cusolverSpHandle getCusolverSpHandle() {
+               return cusolverSpHandle;
+       }
+
+       public JCudaKernels getKernels() {
+               return kernels;
+       }
+
+       /**
+        * Destroys this GPUContext object
+        *
+        * @throws DMLRuntimeException if error
+        */
+       public void destroy() throws DMLRuntimeException {
+               LOG.trace("GPU : this context was destroyed, this = " + 
this.toString());
+               clearMemory();
+               cudnnDestroy(cudnnHandle);
+               cublasDestroy(cublasHandle);
+               cusparseDestroy(cusparseHandle);
+               cusolverDnDestroy(cusolverDnHandle);
+               cusolverSpDestroy(cusolverSpHandle);
+               cudnnHandle = null;
+               cublasHandle = null;
+               cusparseHandle = null;
+
+       }
+
+       /**
+        * Clears all memory used by this {@link GPUContext}
+        * Be careful to ensure that no memory is currently being used in the 
temporary memory before invoking this
+        * If memory is being used between MLContext invocations, they are 
pointed to by a {@link GPUObject} instance
+        * which would be part of the {@link MatrixObject}. The cleanup of that 
{@link MatrixObject} instance will
+        * cause the memory associated with that block on the GPU to be freed 
up.
+        *
+        * @throws DMLRuntimeException ?
+        */
+       public void clearMemory() throws DMLRuntimeException {
+               clearTemporaryMemory();
+               while (!allocatedGPUObjects.isEmpty()) {
+                       GPUObject o = allocatedGPUObjects.get(0);
+                       if (o.isDirty()) {
+                               LOG.warn("Attempted to free GPU Memory when a 
block[" + o
+                                               + "] is still on GPU memory, 
copying it back to host.");
+                               o.acquireHostRead();
+                       }
+                       o.clearData(true);
+               }
+               allocatedGPUObjects.clear();
+       }
+
+       /**
+        * Clears up the memory used to optimize cudaMalloc/cudaFree calls
+        */
+       public void clearTemporaryMemory() {
+               // To record the cuda block sizes needed by 
allocatedGPUObjects, others are cleared up.
+               HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>();
+               for (GPUObject o : allocatedGPUObjects) {
+                       if (o.isSparse()) {
+                               CSRPointer p = o.getSparseMatrixCudaPointer();
+                               if (p.rowPtr != null && 
cudaBlockSizeMap.containsKey(p.rowPtr)) {
+                                       tmpCudaBlockSizeMap.put(p.rowPtr, 
cudaBlockSizeMap.get(p.rowPtr));
+                               }
+                               if (p.colInd != null && 
cudaBlockSizeMap.containsKey(p.colInd)) {
+                                       tmpCudaBlockSizeMap.put(p.colInd, 
cudaBlockSizeMap.get(p.colInd));
+                               }
+                               if (p.val != null && 
cudaBlockSizeMap.containsKey(p.val)) {
+                                       tmpCudaBlockSizeMap.put(p.val, 
cudaBlockSizeMap.get(p.val));
+                               }
+
+                       } else {
+                               Pointer p = o.getJcudaDenseMatrixPtr();
+                               tmpCudaBlockSizeMap.put(p, 
cudaBlockSizeMap.get(p));
+                       }
+               }
+
+               // garbage collect all temporarily allocated spaces
+               for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
+                       for (Pointer p : l) {
+                               cudaFreeHelper(p, true);
+                       }
+               }
+               cudaBlockSizeMap.clear();
+               freeCUDASpaceMap.clear();
+
+               // Restore only those entries for which there are still blocks 
on the GPU
+               cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap);
+       }
+
+       @Override
+       public String toString() {
+               return "GPUContext{" + "deviceNum=" + deviceNum + '}';
+       }
+
+       /**
+        * Eviction policies for {@link GPUContext#evict(long)}
+        */
+       public enum EvictionPolicy {
+               LRU, LFU, MIN_EVICT
+       }
 
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
index 1d0b5c8..ac1c059 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
@@ -23,7 +23,7 @@ import static jcuda.driver.JCudaDriver.cuInit;
 import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
 
 import java.util.LinkedList;
-import java.util.Queue;
+import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -39,122 +39,152 @@ import jcuda.runtime.cudaDeviceProp;
 
 public class GPUContextPool {
 
-  protected static final Log LOG = 
LogFactory.getLog(GPUContextPool.class.getName());
-
-  /** Maximum number of gpus to use, -1 for all */
-  public static int PER_PROCESS_MAX_GPUS = -1;
-
-  /** Whether cuda has been initialized */
-  static boolean initialized = false;
-
-  /** The total number of cuda devices on this machine */
-  static int deviceCount = -1;
-
-  /** Stores the cached deviceProperties */
-  static cudaDeviceProp[] deviceProperties;
-
-  /** Set of free GPUContexts */
-  static Queue<GPUContext> freePool = new LinkedList<>();
-
-  /**
-   * Static initialization of the number of devices
-   * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of error
-   * Initializes the CUDA driver
-   * All these need be done once, and not per GPU
-   * @throws DMLRuntimeException ?
-   */
-  public synchronized static void initializeGPU() throws DMLRuntimeException {
-    GPUContext.LOG.info("Initializing CUDA");
-    long start = System.nanoTime();
-    JCuda.setExceptionsEnabled(true);
-    JCudnn.setExceptionsEnabled(true);
-    JCublas2.setExceptionsEnabled(true);
-    JCusparse.setExceptionsEnabled(true);
-    JCudaDriver.setExceptionsEnabled(true);
-    cuInit(0); // Initialize the driver
-
-    int deviceCountArray[] = {0};
-    cuDeviceGetCount(deviceCountArray);        // Obtain the number of devices
-    deviceCount = deviceCountArray[0];
-    deviceProperties = new cudaDeviceProp[deviceCount];
-
-    if (PER_PROCESS_MAX_GPUS > 0)
-       deviceCount = Math.min(PER_PROCESS_MAX_GPUS, deviceCount);
-
-    // Initialize the list of devices
-    for (int i = 0; i < deviceCount; i++) {
-      cudaDeviceProp properties = new cudaDeviceProp();
-      cudaGetDeviceProperties(properties, i);
-      deviceProperties[i] = properties;
-    }
-
-    // Initialize the pool of GPUContexts
-    for (int i=0; i<deviceCount; i++){
-      GPUContext gCtx = new GPUContext(i);
-      freePool.add(gCtx);
-    }
-
-    GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount);
-    //int[] device = {-1};
-    //cudaGetDevice(device);
-    //cudaDeviceProp prop = getGPUProperties(device[0]);
-    //int maxBlocks = prop.maxGridSize[0];
-    //int maxThreadsPerBlock = prop.maxThreadsPerBlock;
-    //long sharedMemPerBlock = prop.sharedMemPerBlock;
-    //LOG.debug("Active CUDA device number : " + device[0]);
-    //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks 
+ "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
-    initialized = true;
-    GPUStatistics.cudaInitTime = System.nanoTime() - start;
-  }
-
-  /**
-   * Gets an initialized GPUContext from a pool of GPUContexts, each linked to 
a GPU
-   * @return null if not more GPUContexts in pool, a valid GPUContext otherwise
-   * @throws DMLRuntimeException ?
-   */
-  public static synchronized GPUContext getFromPool() throws 
DMLRuntimeException {
-    if (!initialized) initializeGPU();
-    GPUContext gCtx = freePool.poll();
-    LOG.trace("GPU : got GPUContext (" + gCtx + ") from freePool. New sizes - 
FreePool[" + freePool.size() + "]");
-    return gCtx;
-  }
-
-  /**
-   * Get the number of free GPUContexts
-   * @return number of free GPUContexts
-   */
-  public static synchronized int getAvailableCount() {
-    return freePool.size();
-  }
-
-  /**
-   * Gets the device properties
-   * @param device the device number (on a machine with more than 1 GPU)
-   * @return the device properties
-   * @throws DMLRuntimeException if there is problem initializing the 
GPUContexts
-   */
-  static cudaDeviceProp getGPUProperties(int device) throws 
DMLRuntimeException {
-    // do once - initialization of GPU
-    if (!initialized) initializeGPU();
-    return deviceProperties[device];
-  }
-
-  public static int getDeviceCount() throws DMLRuntimeException {
-    if (!initialized) initializeGPU();
-    return deviceCount;
-  }
-
-  /**
-   * Returns a {@link GPUContext} back to the pool of {@link GPUContext}s
-   * @param gCtx the GPUContext instance to return. If null, nothing happens
-   * @throws DMLRuntimeException if error
-   */
-  public static synchronized void returnToPool(GPUContext gCtx) throws 
DMLRuntimeException {
-    if (gCtx == null)
-      return;
-    freePool.add(gCtx);
-    LOG.trace("GPU : returned GPUContext (" + gCtx + ") to freePool. New sizes 
- FreePool[" + freePool.size() + "]");
-
-  }
+       protected static final Log LOG = 
LogFactory.getLog(GPUContextPool.class.getName());
+
+       /**
+        * Maximum number of gpus to use, -1 for all
+        */
+       public static int PER_PROCESS_MAX_GPUS = -1;
+
+       /**
+        * Whether cuda has been initialized
+        */
+       static boolean initialized = false;
+
+       /**
+        * The total number of cuda devices on this machine
+        */
+       static int deviceCount = -1;
+
+       /**
+        * Stores the cached deviceProperties
+        */
+       static cudaDeviceProp[] deviceProperties;
+
+       /**
+        * The list of all GPUContexts on this machine, handed out as a unit by reserveAllGPUContexts()
+        */
+       static List<GPUContext> pool = new LinkedList<>();
+
+       /**
+        * Whether the pool of GPUs is reserved or not
+        */
+       static boolean reserved = false;
+
+       /**
+        * Static initialization of the number of devices
+        * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of 
error
+        * Initializes the CUDA driver
+        * All these need to be done once, and not per GPU
+        *
+        * @throws DMLRuntimeException if initialization of the CUDA driver or device query fails
+        */
+       public synchronized static void initializeGPU() throws 
DMLRuntimeException {
+               GPUContext.LOG.info("Initializing CUDA");
+               long start = System.nanoTime();
+               JCuda.setExceptionsEnabled(true);
+               JCudnn.setExceptionsEnabled(true);
+               JCublas2.setExceptionsEnabled(true);
+               JCusparse.setExceptionsEnabled(true);
+               JCudaDriver.setExceptionsEnabled(true);
+               cuInit(0); // Initialize the driver
+
+               int deviceCountArray[] = { 0 };
+               cuDeviceGetCount(deviceCountArray);        // Obtain the number 
of devices
+               deviceCount = deviceCountArray[0];
+               deviceProperties = new cudaDeviceProp[deviceCount];
+
+               if (PER_PROCESS_MAX_GPUS > 0)
+                       deviceCount = Math.min(PER_PROCESS_MAX_GPUS, 
deviceCount);
+
+               // Initialize the list of devices
+               for (int i = 0; i < deviceCount; i++) {
+                       cudaDeviceProp properties = new cudaDeviceProp();
+                       cudaGetDeviceProperties(properties, i);
+                       deviceProperties[i] = properties;
+               }
+
+               // Initialize the pool of GPUContexts
+               for (int i = 0; i < deviceCount; i++) {
+                       GPUContext gCtx = new GPUContext(i);
+                       pool.add(gCtx);
+               }
+
+               GPUContext.LOG.info("Total number of GPUs on the machine: " + 
deviceCount);
+               //int[] device = {-1};
+               //cudaGetDevice(device);
+               //cudaDeviceProp prop = getGPUProperties(device[0]);
+               //int maxBlocks = prop.maxGridSize[0];
+               //int maxThreadsPerBlock = prop.maxThreadsPerBlock;
+               //long sharedMemPerBlock = prop.sharedMemPerBlock;
+               //LOG.debug("Active CUDA device number : " + device[0]);
+               //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + 
maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
+               initialized = true;
+               GPUStatistics.cudaInitTime = System.nanoTime() - start;
+       }
+
+       /**
+        * Reserves and gets an initialized list of GPUContexts
+        *
+        * @return the list of all GPUContexts on this machine (never null; the pool is
+        populated during initialization)
+        * @throws DMLRuntimeException if the GPUs are already reserved, or if GPU initialization fails
+        */
+       public static synchronized List<GPUContext> reserveAllGPUContexts() 
throws DMLRuntimeException {
+               if (reserved)
+                       throw new DMLRuntimeException("Trying to re-reserve 
GPUs");
+               if (!initialized)
+                       initializeGPU();
+               reserved = true;
+               LOG.trace("GPU : Reserved all GPUs");
+               return pool;
+       }
+
+       /**
+        * Get the number of free GPUContexts
+        *
+        * @return number of free GPUContexts
+        */
+       public static synchronized int getAvailableCount() {
+               return pool.size();
+       }
+
+       /**
+        * Gets the device properties
+        *
+        * @param device the device number (on a machine with more than 1 GPU)
+        * @return the device properties
+        * @throws DMLRuntimeException if there is problem initializing the 
GPUContexts
+        */
+       static cudaDeviceProp getGPUProperties(int device) throws 
DMLRuntimeException {
+               // do once - initialization of GPU
+               if (!initialized)
+                       initializeGPU();
+               return deviceProperties[device];
+       }
+
+       /**
+        * Number of available devices on this machine
+        *
+        * @return number of available GPUs on this machine
+        * @throws DMLRuntimeException if error
+        */
+       public static int getDeviceCount() throws DMLRuntimeException {
+               if (!initialized)
+                       initializeGPU();
+               return deviceCount;
+       }
+
+       /**
+        * Unreserves all GPUContexts
+        *
+        * @throws DMLRuntimeException if error
+        */
+       public static synchronized void freeAllGPUContexts() throws 
DMLRuntimeException {
+               if (!reserved)
+                       throw new DMLRuntimeException("Trying to free 
unreserved GPUs");
+               reserved = false;
+               LOG.trace("GPU : Unreserved all GPUs");
+
+       }
 
 }

Reply via email to