http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index 8da67ea..b3c19ef 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -18,14 +18,24 @@ */ package org.apache.sysml.runtime.instructions.gpu.context; -import jcuda.Pointer; -import jcuda.jcublas.cublasHandle; -import jcuda.jcudnn.cudnnHandle; -import jcuda.jcusolver.cusolverDnHandle; -import jcuda.jcusolver.cusolverSpHandle; -import jcuda.jcusparse.cusparseHandle; -import jcuda.runtime.JCuda; -import jcuda.runtime.cudaDeviceProp; +import static jcuda.jcublas.JCublas2.cublasCreate; +import static jcuda.jcublas.JCublas2.cublasDestroy; +import static jcuda.jcudnn.JCudnn.cudnnCreate; +import static jcuda.jcudnn.JCudnn.cudnnDestroy; +import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate; +import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy; +import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate; +import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy; +import static jcuda.jcusparse.JCusparse.cusparseCreate; +import static jcuda.jcusparse.JCusparse.cusparseDestroy; +import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync; +import static jcuda.runtime.JCuda.cudaFree; +import static jcuda.runtime.JCuda.cudaGetDeviceCount; +import static jcuda.runtime.JCuda.cudaMalloc; +import static jcuda.runtime.JCuda.cudaMemGetInfo; +import static jcuda.runtime.JCuda.cudaMemset; +import static jcuda.runtime.JCuda.cudaSetDevice; +import static jcuda.runtime.JCuda.cudaSetDeviceFlags; import java.util.ArrayList; import java.util.Collections; @@ -45,24 +55,14 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.LRUCacheMap; -import static jcuda.jcublas.JCublas2.cublasCreate; -import static jcuda.jcublas.JCublas2.cublasDestroy; -import static jcuda.jcudnn.JCudnn.cudnnCreate; -import static jcuda.jcudnn.JCudnn.cudnnDestroy; -import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate; -import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy; -import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate; -import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy; -import static jcuda.jcusparse.JCusparse.cusparseCreate; -import static jcuda.jcusparse.JCusparse.cusparseDestroy; -import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync; -import static jcuda.runtime.JCuda.cudaFree; -import static jcuda.runtime.JCuda.cudaGetDeviceCount; -import static jcuda.runtime.JCuda.cudaMalloc; -import static jcuda.runtime.JCuda.cudaMemGetInfo; -import static jcuda.runtime.JCuda.cudaMemset; -import static jcuda.runtime.JCuda.cudaSetDevice; -import static jcuda.runtime.JCuda.cudaSetDeviceFlags; +import jcuda.Pointer; +import jcuda.jcublas.cublasHandle; +import jcuda.jcudnn.cudnnHandle; +import jcuda.jcusolver.cusolverDnHandle; +import jcuda.jcusolver.cusolverSpHandle; +import jcuda.jcusparse.cusparseHandle; +import jcuda.runtime.JCuda; +import jcuda.runtime.cudaDeviceProp; /** * Represents a context per GPU accessible through the same JVM @@ -71,606 +71,643 @@ import static jcuda.runtime.JCuda.cudaSetDeviceFlags; public class GPUContext { protected static final Log LOG = LogFactory.getLog(GPUContext.class.getName()); + /** + * currently employed eviction policy + */ + public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU; + /** + * The minimum CUDA Compute capability needed for SystemML. + * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported. + * If SystemML needs to run on an older card, this logic can be revisited. + */ + final int MAJOR_REQUIRED = 3; + final int MINOR_REQUIRED = 0; + /** + * active device assigned to this GPUContext instance + */ + private final int deviceNum; + // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application. + public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig() + .getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); + /** + * Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU + */ + private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap<>(); + /** + * To record size of allocated blocks + */ + private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>(); + /** + * list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU + * These are matrices allocated on the GPU on which rmvar hasn't been called yet. + * If a {@link GPUObject} has more than one lock on it, it cannot be freed + * If it has zero locks on it, it can be freed, but it is preferrable to keep it around + * so that an extraneous host to dev transfer can be avoided + */ + private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>(); + /** + * cudnnHandle for Deep Neural Network operations on the GPU + */ + private cudnnHandle cudnnHandle; + /** + * cublasHandle for BLAS operations on the GPU + */ + private cublasHandle cublasHandle; + /** + * cusparseHandle for certain sparse BLAS operations on the GPU + */ + private cusparseHandle cusparseHandle; + /** + * cusolverDnHandle for invoking solve() function on dense matrices on the GPU + */ + private cusolverDnHandle cusolverDnHandle; + /** + * cusolverSpHandle for invoking solve() function on sparse matrices on the GPU + */ + private cusolverSpHandle cusolverSpHandle; + /** + * to launch custom CUDA kernel, specific to the active GPU for this GPUContext + */ + private JCudaKernels kernels; + + protected GPUContext(int deviceNum) throws DMLRuntimeException { + this.deviceNum = deviceNum; + cudaSetDevice(deviceNum); + + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + + long free[] = { 0 }; + long total[] = { 0 }; + cudaMemGetInfo(free, total); + + long start = System.nanoTime(); + cudnnHandle = new cudnnHandle(); + cudnnCreate(cudnnHandle); + cublasHandle = new cublasHandle(); + cublasCreate(cublasHandle); + // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host + // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot. + // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE); + cusparseHandle = new cusparseHandle(); + cusparseCreate(cusparseHandle); + + cusolverDnHandle = new cusolverDnHandle(); + cusolverDnCreate(cusolverDnHandle); + cusolverSpHandle = new cusolverSpHandle(); + cusolverSpCreate(cusolverSpHandle); + + kernels = new JCudaKernels(deviceNum); + + GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start; + LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " + + this); - /** Eviction policies for {@link GPUContext#evict(long)} */ - public enum EvictionPolicy { - LRU, LFU, MIN_EVICT } - /** currently employed eviction policy */ - public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU; + public static int cudaGetDevice() { + int[] device = new int[1]; + JCuda.cudaGetDevice(device); + return device[0]; + } - /** Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU */ - private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap<>(); + public int getDeviceNum() { + return deviceNum; + } - /** To record size of allocated blocks */ - private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>(); + /** + * Sets the device for the calling thread. + * This method must be called after + * {@link org.apache.sysml.runtime.controlprogram.context.ExecutionContext#getGPUContext(int)} + * If in a multi-threaded env like parfor, this method must be called when in the + * appropriate thread + */ + public void initializeThread() { + cudaSetDevice(deviceNum); + } + + /** + * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. + * + * @param size size of data (in bytes) to allocate + * @return jcuda pointer + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public Pointer allocate(long size) throws DMLRuntimeException { + return allocate(null, size, 1); + } + + /** + * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. + * + * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record + * @param size size of data (in bytes) to allocate + * @return jcuda pointer + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public Pointer allocate(String instructionName, long size) throws DMLRuntimeException { + return allocate(instructionName, size, 1); + } - /** active device assigned to this GPUContext instance */ - private final int deviceNum; + /** + * Allocates temporary space on the device. + * Does not update bookkeeping. + * The caller is responsible for freeing up after usage. + * + * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record + * @param size Size of data (in bytes) to allocate + * @param statsCount amount to increment the cudaAllocCount by + * @return jcuda Pointer + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException { + long t0 = 0, t1 = 0, end = 0; + Pointer A; + if (freeCUDASpaceMap.containsKey(size)) { + LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size + / 1024.0) + " Kbytes from previously allocated block on " + this); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + t0 = System.nanoTime(); + LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); + A = freeList.pop(); + if (freeList.isEmpty()) + freeCUDASpaceMap.remove(size); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics + .maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0); + } else { + LOG.trace( + "GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size + / 1024.0) + " Kbytes on " + this); + if (DMLScript.STATISTICS) + t0 = System.nanoTime(); + ensureFreeSpace(instructionName, size); + A = new Pointer(); + cudaMalloc(A, size); + if (DMLScript.STATISTICS) + GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0); + if (DMLScript.STATISTICS) + GPUStatistics.cudaAllocCount.getAndAdd(statsCount); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, + System.nanoTime() - t0); + } + // Set all elements to 0 since newly allocated space will contain garbage + if (DMLScript.STATISTICS) + t1 = System.nanoTime(); + LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of size " + (size / 1024.0) + + " Kbytes to zero on " + this); + cudaMemset(A, 0, size); + if (DMLScript.STATISTICS) + end = System.nanoTime(); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1); + if (DMLScript.STATISTICS) + GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1); + if (DMLScript.STATISTICS) + GPUStatistics.cudaMemSet0Count.getAndAdd(1); + cudaBlockSizeMap.put(A, size); + return A; - /** list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU - * These are matrices allocated on the GPU on which rmvar hasn't been called yet. - * If a {@link GPUObject} has more than one lock on it, it cannot be freed - * If it has zero locks on it, it can be freed, but it is preferrable to keep it around - * so that an extraneous host to dev transfer can be avoided */ - private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>(); + } - /** cudnnHandle for Deep Neural Network operations on the GPU */ - private cudnnHandle cudnnHandle; + /** + * Does lazy cudaFree calls + * + * @param toFree {@link Pointer} instance to be freed + */ + public void cudaFreeHelper(final Pointer toFree) { + cudaFreeHelper(null, toFree, false); + } - /** cublasHandle for BLAS operations on the GPU */ - private cublasHandle cublasHandle; + /** + * does lazy/eager cudaFree calls + * + * @param toFree {@link Pointer} instance to be freed + * @param eager true if to be done eagerly + */ + public void cudaFreeHelper(final Pointer toFree, boolean eager) { + cudaFreeHelper(null, toFree, eager); + } - /** cusparseHandle for certain sparse BLAS operations on the GPU */ - private cusparseHandle cusparseHandle; + /** + * Does lazy cudaFree calls + * + * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record + * @param toFree {@link Pointer} instance to be freed + */ + public void cudaFreeHelper(String instructionName, final Pointer toFree) { + cudaFreeHelper(instructionName, toFree, false); + } - /** cusolverDnHandle for invoking solve() function on dense matrices on the GPU */ - private cusolverDnHandle cusolverDnHandle; + /** + * Does cudaFree calls, lazily + * + * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record + * @param toFree {@link Pointer} instance to be freed + * @param eager true if to be done eagerly + */ + public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) { + Pointer dummy = new Pointer(); + if (toFree == dummy) // trying to free a null pointer + return; + long t0 = 0; + assert cudaBlockSizeMap.containsKey( + toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"; + long size = cudaBlockSizeMap.get(toFree); + if (eager) { + LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " + + this); + if (DMLScript.STATISTICS) + t0 = System.nanoTime(); + cudaFree(toFree); + cudaBlockSizeMap.remove(toFree); + if (DMLScript.STATISTICS) + GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0); + if (DMLScript.STATISTICS) + GPUStatistics.cudaDeAllocCount.addAndGet(1); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, + System.nanoTime() - t0); + } else { + LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this); + LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); + if (freeList == null) { + freeList = new LinkedList<Pointer>(); + freeCUDASpaceMap.put(size, freeList); + } + if (freeList.contains(toFree)) + throw new RuntimeException("GPU : Internal state corrupted, double free"); + freeList.add(toFree); + } + } - /** cusolverSpHandle for invoking solve() function on sparse matrices on the GPU */ - private cusolverSpHandle cusolverSpHandle; + /** + * Thin wrapper over {@link GPUContext#evict(long)} + * + * @param size size to check + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + void ensureFreeSpace(long size) throws DMLRuntimeException { + ensureFreeSpace(null, size); + } - /** to launch custom CUDA kernel, specific to the active GPU for this GPUContext */ - private JCudaKernels kernels; + /** + * Thin wrapper over {@link GPUContext#evict(long)} + * + * @param instructionName instructionName name of the instruction for which performance measurements are made + * @param size size to check + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException { + if (size >= getAvailableMemory()) { + evict(instructionName, size); + } + } /** - * The minimum CUDA Compute capability needed for SystemML. - * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported. - * If SystemML needs to run on an older card, this logic can be revisited. + * Convenience wrapper over {@link GPUContext#evict(String, long)} + * + * @param GPUSize Desired size to be freed up on the GPU + * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them. */ - final int MAJOR_REQUIRED = 3; - final int MINOR_REQUIRED = 0; + protected void evict(final long GPUSize) throws DMLRuntimeException { + evict(null, GPUSize); + } + + /** + * Memory on the GPU is tried to be freed up until either a chunk of needed size is freed up + * or it fails. + * First the set of reusable blocks is freed up. If that isn't enough, the set of allocated matrix + * blocks with zero locks on them is freed up. + * The process cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on + * number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up + * blocks on which there are zero locks until the required size has been freed up. + * // TODO: update it with hybrid policy + * + * @param instructionName name of the instruction for which performance measurements are made + * @param neededSize desired size to be freed up on the GPU + * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them. + */ + protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException { + LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this); + GPUStatistics.cudaEvictionCount.addAndGet(1); + // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap + // to free up space + LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap; + while (lruCacheMap.size() > 0) { + if (neededSize <= getAvailableMemory()) + break; + Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry(); + LinkedList<Pointer> toFreeList = toFreeListPair.getValue(); + Long size = toFreeListPair.getKey(); + Pointer toFree = toFreeList.pop(); + if (toFreeList.isEmpty()) + lruCacheMap.remove(size); + cudaFreeHelper(instructionName, toFree, true); + } + + if (neededSize <= getAvailableMemory()) + return; + + if (allocatedGPUObjects.size() == 0) { + throw new DMLRuntimeException( + "There is not enough memory on device for this matrix, request (" + neededSize + ")"); + } + + Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() { + @Override + public int compare(GPUObject p1, GPUObject p2) { + long p1Val = p1.locks.get(); + long p2Val = p2.locks.get(); + + if (p1Val > 0 && p2Val > 0) { + // Both are locked, so don't sort + return 0; + } else if (p1Val > 0 || p2Val > 0) { + // Put the unlocked one to RHS + return Long.compare(p2Val, p1Val); + } else { + // Both are unlocked + + if (evictionPolicy == EvictionPolicy.MIN_EVICT) { + long p1Size = 0; + long p2Size = 0; + try { + p1Size = p1.getSizeOnDevice() - neededSize; + p2Size = p2.getSizeOnDevice() - neededSize; + } catch (DMLRuntimeException e) { + throw new RuntimeException(e); + } + + if (p1Size >= 0 && p2Size >= 0) { + return Long.compare(p2Size, p1Size); + } else { + return Long.compare(p1Size, p2Size); + } + } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) { + return Long.compare(p2.timestamp.get(), p1.timestamp.get()); + } else { + throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name()); + } + } + } + }); + + while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) { + GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1); + if (toBeRemoved.locks.get() > 0) { + throw new DMLRuntimeException( + "There is not enough memory on device for this matrix, request (" + neededSize + ")"); + } + if (toBeRemoved.dirty) { + toBeRemoved.copyFromDeviceToHost(); + } + + toBeRemoved.clearData(true); + } + } + + /** + * Whether the GPU associated with this {@link GPUContext} has recorded the usage of a certain block + * + * @param o the block + * @return true if present, false otherwise + */ + public boolean isBlockRecorded(GPUObject o) { + return allocatedGPUObjects.contains(o); + } + + /** + * @param o {@link GPUObject} instance to record + * @see GPUContext#allocatedGPUObjects + * Records the usage of a matrix block + */ + public void recordBlockUsage(GPUObject o) { + allocatedGPUObjects.add(o); + } - // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application. - public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); - - protected GPUContext(int deviceNum) throws DMLRuntimeException { - this.deviceNum = deviceNum; - cudaSetDevice(deviceNum); - - cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); - - long free[] = {0}; - long total[] = {0}; - cudaMemGetInfo(free, total); - - long start = System.nanoTime(); - cudnnHandle = new cudnnHandle(); - cudnnCreate(cudnnHandle); - cublasHandle = new cublasHandle(); - cublasCreate(cublasHandle); - // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host - // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot. - // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE); - cusparseHandle = new cusparseHandle(); - cusparseCreate(cusparseHandle); - - cusolverDnHandle = new cusolverDnHandle(); - cusolverDnCreate(cusolverDnHandle); - cusolverSpHandle = new cusolverSpHandle(); - cusolverSpCreate(cusolverSpHandle); - - kernels = new JCudaKernels(deviceNum); - - GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start; - LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " + this); - - } - - public int getDeviceNum() { - return deviceNum; - } - - /** - * Sets the device for the calling thread. - * This method must be called after {@link GPUContextPool#getFromPool()} - * is called. - * If in a multi-threaded env like parfor, this method must be called when in the - * appropriate thread - */ - public void initializeThread() { - cudaSetDevice(deviceNum); - } - - public static int cudaGetDevice() { - int[] device = new int[1]; - JCuda.cudaGetDevice(device); - return device[0]; - } - - /** - * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. - * - * @param size size of data (in bytes) to allocate - * @return jcuda pointer - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public Pointer allocate(long size) throws DMLRuntimeException { - return allocate(null, size, 1); - } - - /** - * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. - * - * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record - * @param size size of data (in bytes) to allocate - * @return jcuda pointer - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public Pointer allocate(String instructionName, long size) throws DMLRuntimeException { - return allocate(instructionName, size, 1); - } - - /** - * Allocates temporary space on the device. - * Does not update bookkeeping. - * The caller is responsible for freeing up after usage. - * - * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record - * @param size Size of data (in bytes) to allocate - * @param statsCount amount to increment the cudaAllocCount by - * @return jcuda Pointer - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException { - long t0 = 0, t1 = 0, end = 0; - Pointer A; - if (freeCUDASpaceMap.containsKey(size)) { - LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size / 1024.0) + " Kbytes from previously allocated block on " + this); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); - LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); - A = freeList.pop(); - if (freeList.isEmpty()) - freeCUDASpaceMap.remove(size); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) - GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0); - } else { - LOG.trace("GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size / 1024.0) + " Kbytes on " + this); - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - ensureFreeSpace(instructionName, size); - A = new Pointer(); - cudaMalloc(A, size); - if (DMLScript.STATISTICS) GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaAllocCount.getAndAdd(statsCount); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) - GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0); - } - // Set all elements to 0 since newly allocated space will contain garbage - if (DMLScript.STATISTICS) t1 = System.nanoTime(); - LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of size " + (size / 1024.0) + " Kbytes to zero on " + this); - cudaMemset(A, 0, size); - if (DMLScript.STATISTICS) end = System.nanoTime(); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) - GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1); - if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1); - if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1); - cudaBlockSizeMap.put(A, size); - return A; - - } - - /** - * Does lazy cudaFree calls - * - * @param toFree {@link Pointer} instance to be freed - */ - public void cudaFreeHelper(final Pointer toFree) { - cudaFreeHelper(null, toFree, false); - } - - /** - * does lazy/eager cudaFree calls - * - * @param toFree {@link Pointer} instance to be freed - * @param eager true if to be done eagerly - */ - public void cudaFreeHelper(final Pointer toFree, boolean eager) { - cudaFreeHelper(null, toFree, eager); - } - - /** - * Does lazy cudaFree calls - * - * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record - * @param toFree {@link Pointer} instance to be freed - */ - public void cudaFreeHelper(String instructionName, final Pointer toFree) { - cudaFreeHelper(instructionName, toFree, false); - } - - /** - * Does cudaFree calls, lazily - * - * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record - * @param toFree {@link Pointer} instance to be freed - * @param eager true if to be done eagerly - */ - public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) { - Pointer dummy = new Pointer(); - if (toFree == dummy) // trying to free a null pointer - return; - long t0 = 0; - assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"; - long size = cudaBlockSizeMap.get(toFree); - if (eager) { - LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " + this); - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - cudaFree(toFree); - cudaBlockSizeMap.remove(toFree); - if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.addAndGet(1); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) - GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0); - } else { - LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this); - LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); - if (freeList == null) { - freeList = new LinkedList<Pointer>(); - freeCUDASpaceMap.put(size, freeList); - } - if (freeList.contains(toFree)) - throw new RuntimeException("GPU : Internal state corrupted, double free"); - freeList.add(toFree); - } - } - - /** - * Thin wrapper over {@link GPUContext#evict(long)} - * - * @param size size to check - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - void ensureFreeSpace(long size) throws DMLRuntimeException { - ensureFreeSpace(null, size); - } - - /** - * Thin wrapper over {@link GPUContext#evict(long)} - * - * @param instructionName instructionName name of the instruction for which performance measurements are made - * @param size size to check - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException { - if (size >= getAvailableMemory()) { - evict(instructionName, size); - } - } - - /** - * Convenience wrapper over {@link GPUContext#evict(String, long)} - * - * @param GPUSize Desired size to be freed up on the GPU - * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them. - */ - protected void evict(final long GPUSize) throws DMLRuntimeException { - evict(null, GPUSize); - } - - /** - * Memory on the GPU is tried to be freed up until either a chunk of needed size is freed up - * or it fails. - * First the set of reusable blocks is freed up. If that isn't enough, the set of allocated matrix - * blocks with zero locks on them is freed up. - * The process cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on - * number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up - * blocks on which there are zero locks until the required size has been freed up. - * // TODO: update it with hybrid policy - * - * @param instructionName name of the instruction for which performance measurements are made - * @param neededSize desired size to be freed up on the GPU - * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them. - */ - protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException { - LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this); - GPUStatistics.cudaEvictionCount.addAndGet(1); - // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap - // to free up space - LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap; - while (lruCacheMap.size() > 0) { - if (neededSize <= getAvailableMemory()) - break; - Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry(); - LinkedList<Pointer> toFreeList = toFreeListPair.getValue(); - Long size = toFreeListPair.getKey(); - Pointer toFree = toFreeList.pop(); - if (toFreeList.isEmpty()) - lruCacheMap.remove(size); - cudaFreeHelper(instructionName, toFree, true); - } - - if (neededSize <= getAvailableMemory()) - return; - - if (allocatedGPUObjects.size() == 0) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")"); - } - - Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() { - @Override - public int compare(GPUObject p1, GPUObject p2) { - long p1Val = p1.locks.get(); - long p2Val = p2.locks.get(); - - if (p1Val > 0 && p2Val > 0) { - // Both are locked, so don't sort - return 0; - } else if (p1Val > 0 || p2Val > 0) { - // Put the unlocked one to RHS - return Long.compare(p2Val, p1Val); - } else { - // Both are unlocked - - if (evictionPolicy == EvictionPolicy.MIN_EVICT) { - long p1Size = 0; - long p2Size = 0; - try { - p1Size = p1.getSizeOnDevice() - neededSize; - p2Size = p2.getSizeOnDevice() - neededSize; - } catch (DMLRuntimeException e) { - throw new RuntimeException(e); - } - - if (p1Size >= 0 && p2Size >= 0) { - return Long.compare(p2Size, p1Size); - } else { - return Long.compare(p1Size, p2Size); - } - } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) { - return Long.compare(p2.timestamp.get(), p1.timestamp.get()); - } else { - throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name()); - } - } - } - }); - - while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) { - GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1); - if (toBeRemoved.locks.get() > 0) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")"); - } - if (toBeRemoved.dirty) { - toBeRemoved.copyFromDeviceToHost(); - } - - toBeRemoved.clearData(true); - } - } - - /** - * Whether the GPU associated with this {@link GPUContext} has recorded the usage of a certain block - * - * @param o the block - * @return true if present, false otherwise - */ - public boolean isBlockRecorded(GPUObject o) { - return allocatedGPUObjects.contains(o); - } - - /** - * @param o {@link GPUObject} instance to record - * @see GPUContext#allocatedGPUObjects - * Records the usage of a matrix block - */ - public void recordBlockUsage(GPUObject o) { - allocatedGPUObjects.add(o); - } - - /** - * @param o {@link GPUObject} instance to remove from the list of allocated GPU objects - * @see GPUContext#allocatedGPUObjects - * Records that a block is not used anymore - */ - public void removeRecordedUsage(GPUObject o) { - allocatedGPUObjects.remove(o); - } - - /** - * Gets the available memory on GPU that SystemML can use - * - * @return the available memory in bytes - */ - public long getAvailableMemory() { - long free[] = {0}; - long total[] = {0}; - cudaMemGetInfo(free, total); - return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR); - } - - /** - * Makes sure that GPU that SystemML is trying to use has the minimum compute capability needed - * - * @throws DMLRuntimeException if the compute capability is less than what is required - */ - public void ensureComputeCapability() throws DMLRuntimeException { - int[] devices = {-1}; - cudaGetDeviceCount(devices); - if (devices[0] == -1) { - throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices"); - } - boolean isComputeCapable = true; - for (int i = 0; i < devices[0]; i++) { - cudaDeviceProp properties = GPUContextPool.getGPUProperties(i); - int major = properties.major; - int minor = properties.minor; - if (major < MAJOR_REQUIRED) { - isComputeCapable = false; - } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) { - isComputeCapable = false; - } - } - if (!isComputeCapable) { - throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED); - } - } - - public GPUObject createGPUObject(MatrixObject mo) { - return new GPUObject(this, mo); - } - - /** - * Gets the device properties for the active GPU (set with cudaSetDevice()) - * - * @return the device properties - * @throws DMLRuntimeException ? - */ - public cudaDeviceProp getGPUProperties() throws DMLRuntimeException { - return GPUContextPool.getGPUProperties(deviceNum); - } - - /** - * Gets the maximum number of threads per block for "active" GPU - * - * @return the maximum number of threads per block - * @throws DMLRuntimeException ? - */ - public int getMaxThreadsPerBlock() throws DMLRuntimeException { - cudaDeviceProp deviceProps = getGPUProperties(); - return deviceProps.maxThreadsPerBlock; - } - - /** - * Gets the maximum number of blocks supported by the active cuda device - * - * @return the maximum number of blocks supported - * @throws DMLRuntimeException ? - */ - public int getMaxBlocks() throws DMLRuntimeException { - cudaDeviceProp deviceProp = getGPUProperties(); - return deviceProp.maxGridSize[0]; - } - - /** - * Gets the shared memory per block supported by the active cuda device - * - * @return the shared memory per block - * @throws DMLRuntimeException ? - */ - public long getMaxSharedMemory() throws DMLRuntimeException { - cudaDeviceProp deviceProp = getGPUProperties(); - return deviceProp.sharedMemPerBlock; - } - - /** - * Gets the warp size supported by the active cuda device - * - * @return the warp size - * @throws DMLRuntimeException ? - */ - public int getWarpSize() throws DMLRuntimeException { - cudaDeviceProp deviceProp = getGPUProperties(); - return deviceProp.warpSize; - } - - public cudnnHandle getCudnnHandle() { - return cudnnHandle; - } - - public cublasHandle getCublasHandle() { - return cublasHandle; - } - - public cusparseHandle getCusparseHandle() { - return cusparseHandle; - } - - public cusolverDnHandle getCusolverDnHandle() { - return cusolverDnHandle; - } - - public cusolverSpHandle getCusolverSpHandle() { - return cusolverSpHandle; - } - - public JCudaKernels getKernels() { - return kernels; - } - - /** - * Destroys this GPUContext object - * - * @throws DMLRuntimeException if error - */ - public void destroy() throws DMLRuntimeException { - LOG.trace("GPU : this context was destroyed, this = " + this.toString()); - clearMemory(); - cudnnDestroy(cudnnHandle); - cublasDestroy(cublasHandle); - cusparseDestroy(cusparseHandle); - cusolverDnDestroy(cusolverDnHandle); - cusolverSpDestroy(cusolverSpHandle); - cudnnHandle = null; - cublasHandle = null; - cusparseHandle = null; - - } - - /** - * Clears all memory used by this {@link GPUContext} - * Be careful to ensure that no memory is currently being used in the temporary memory before invoking this - * If memory is being used between MLContext invocations, they are pointed to by a {@link GPUObject} instance - * which would be part of the {@link MatrixObject}. The cleanup of that {@link MatrixObject} instance will - * cause the memory associated with that block on the GPU to be freed up. - * @throws DMLRuntimeException ? - */ - public void clearMemory() throws DMLRuntimeException { - clearTemporaryMemory(); - while (!allocatedGPUObjects.isEmpty()) { - GPUObject o = allocatedGPUObjects.get(0); - if (o.isDirty()){ - LOG.warn("Attempted to free GPU Memory when a block[" + o + "] is still on GPU memory, copying it back to host."); - o.acquireHostRead(); - } - o.clearData(true); - } - allocatedGPUObjects.clear(); - } - - /** - * Clears up the memory used to optimize cudaMalloc/cudaFree calls - */ - public void clearTemporaryMemory() { - // To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up. - HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>(); - for (GPUObject o : allocatedGPUObjects) { - if (o.isSparse()) { - CSRPointer p = o.getSparseMatrixCudaPointer(); - if (p.rowPtr != null && cudaBlockSizeMap.containsKey(p.rowPtr)) { - tmpCudaBlockSizeMap.put(p.rowPtr, cudaBlockSizeMap.get(p.rowPtr)); - } - if (p.colInd != null && cudaBlockSizeMap.containsKey(p.colInd)) { - tmpCudaBlockSizeMap.put(p.colInd, cudaBlockSizeMap.get(p.colInd)); - } - if (p.val != null && cudaBlockSizeMap.containsKey(p.val)) { - tmpCudaBlockSizeMap.put(p.val, cudaBlockSizeMap.get(p.val)); - } - - } else { - Pointer p = o.getJcudaDenseMatrixPtr(); - tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p)); - } - } - - // garbage collect all temporarily allocated spaces - for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) { - for (Pointer p : l) { - cudaFreeHelper(p, true); - } - } - cudaBlockSizeMap.clear(); - freeCUDASpaceMap.clear(); - - // Restore only those entries for which there are still blocks on the GPU - cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap); - } - - @Override - public String toString() { - return "GPUContext{" + - "deviceNum=" + deviceNum + - '}'; - } + /** + * @param o {@link GPUObject} instance to remove from the list of allocated GPU objects + * @see GPUContext#allocatedGPUObjects + * Records that a block is not used anymore + */ + public void removeRecordedUsage(GPUObject o) { + allocatedGPUObjects.remove(o); + } + + /** + * Gets the available memory on GPU that SystemML can use + * + * @return the available memory in bytes + */ + public long getAvailableMemory() { + long free[] = { 0 }; + long total[] = { 0 }; + cudaMemGetInfo(free, total); + return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR); + } + + /** + * Makes sure that GPU that SystemML is trying to use has the minimum compute capability needed + * + * @throws DMLRuntimeException if the compute capability is less than what is required + */ + public void ensureComputeCapability() throws DMLRuntimeException { + int[] devices = { -1 }; + cudaGetDeviceCount(devices); + if (devices[0] == -1) { + throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices"); + } + boolean isComputeCapable = true; + for (int i = 0; i < devices[0]; i++) { + cudaDeviceProp properties = GPUContextPool.getGPUProperties(i); + int major = properties.major; + int minor = properties.minor; + if (major < MAJOR_REQUIRED) { + isComputeCapable = false; + } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) { + isComputeCapable = false; + } + } + if (!isComputeCapable) { + throw new DMLRuntimeException( + "One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + + MINOR_REQUIRED); + } + } + + public GPUObject createGPUObject(MatrixObject mo) { + return new GPUObject(this, mo); + } + + /** + * Gets the device properties for the active GPU (set with cudaSetDevice()) + * + * @return the device properties + * @throws DMLRuntimeException ? + */ + public cudaDeviceProp getGPUProperties() throws DMLRuntimeException { + return GPUContextPool.getGPUProperties(deviceNum); + } + + /** + * Gets the maximum number of threads per block for "active" GPU + * + * @return the maximum number of threads per block + * @throws DMLRuntimeException ? + */ + public int getMaxThreadsPerBlock() throws DMLRuntimeException { + cudaDeviceProp deviceProps = getGPUProperties(); + return deviceProps.maxThreadsPerBlock; + } + + /** + * Gets the maximum number of blocks supported by the active cuda device + * + * @return the maximum number of blocks supported + * @throws DMLRuntimeException ? + */ + public int getMaxBlocks() throws DMLRuntimeException { + cudaDeviceProp deviceProp = getGPUProperties(); + return deviceProp.maxGridSize[0]; + } + + /** + * Gets the shared memory per block supported by the active cuda device + * + * @return the shared memory per block + * @throws DMLRuntimeException ? + */ + public long getMaxSharedMemory() throws DMLRuntimeException { + cudaDeviceProp deviceProp = getGPUProperties(); + return deviceProp.sharedMemPerBlock; + } + + /** + * Gets the warp size supported by the active cuda device + * + * @return the warp size + * @throws DMLRuntimeException ? + */ + public int getWarpSize() throws DMLRuntimeException { + cudaDeviceProp deviceProp = getGPUProperties(); + return deviceProp.warpSize; + } + + public cudnnHandle getCudnnHandle() { + return cudnnHandle; + } + + public cublasHandle getCublasHandle() { + return cublasHandle; + } + + public cusparseHandle getCusparseHandle() { + return cusparseHandle; + } + + public cusolverDnHandle getCusolverDnHandle() { + return cusolverDnHandle; + } + + public cusolverSpHandle getCusolverSpHandle() { + return cusolverSpHandle; + } + + public JCudaKernels getKernels() { + return kernels; + } + + /** + * Destroys this GPUContext object + * + * @throws DMLRuntimeException if error + */ + public void destroy() throws DMLRuntimeException { + LOG.trace("GPU : this context was destroyed, this = " + this.toString()); + clearMemory(); + cudnnDestroy(cudnnHandle); + cublasDestroy(cublasHandle); + cusparseDestroy(cusparseHandle); + cusolverDnDestroy(cusolverDnHandle); + cusolverSpDestroy(cusolverSpHandle); + cudnnHandle = null; + cublasHandle = null; + cusparseHandle = null; + + } + + /** + * Clears all memory used by this {@link GPUContext} + * Be careful to ensure that no memory is currently being used in the temporary memory before invoking this + * If memory is being used between MLContext invocations, they are pointed to by a {@link GPUObject} instance + * which would be part of the {@link MatrixObject}. The cleanup of that {@link MatrixObject} instance will + * cause the memory associated with that block on the GPU to be freed up. + * + * @throws DMLRuntimeException ? + */ + public void clearMemory() throws DMLRuntimeException { + clearTemporaryMemory(); + while (!allocatedGPUObjects.isEmpty()) { + GPUObject o = allocatedGPUObjects.get(0); + if (o.isDirty()) { + LOG.warn("Attempted to free GPU Memory when a block[" + o + + "] is still on GPU memory, copying it back to host."); + o.acquireHostRead(); + } + o.clearData(true); + } + allocatedGPUObjects.clear(); + } + + /** + * Clears up the memory used to optimize cudaMalloc/cudaFree calls + */ + public void clearTemporaryMemory() { + // To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up. + HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>(); + for (GPUObject o : allocatedGPUObjects) { + if (o.isSparse()) { + CSRPointer p = o.getSparseMatrixCudaPointer(); + if (p.rowPtr != null && cudaBlockSizeMap.containsKey(p.rowPtr)) { + tmpCudaBlockSizeMap.put(p.rowPtr, cudaBlockSizeMap.get(p.rowPtr)); + } + if (p.colInd != null && cudaBlockSizeMap.containsKey(p.colInd)) { + tmpCudaBlockSizeMap.put(p.colInd, cudaBlockSizeMap.get(p.colInd)); + } + if (p.val != null && cudaBlockSizeMap.containsKey(p.val)) { + tmpCudaBlockSizeMap.put(p.val, cudaBlockSizeMap.get(p.val)); + } + + } else { + Pointer p = o.getJcudaDenseMatrixPtr(); + tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p)); + } + } + + // garbage collect all temporarily allocated spaces + for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) { + for (Pointer p : l) { + cudaFreeHelper(p, true); + } + } + cudaBlockSizeMap.clear(); + freeCUDASpaceMap.clear(); + + // Restore only those entries for which there are still blocks on the GPU + cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap); + } + + @Override + public String toString() { + return "GPUContext{" + "deviceNum=" + deviceNum + '}'; + } + + /** + * Eviction policies for {@link GPUContext#evict(long)} + */ + public enum EvictionPolicy { + LRU, LFU, MIN_EVICT + } }
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java index 1d0b5c8..ac1c059 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java @@ -23,7 +23,7 @@ import static jcuda.driver.JCudaDriver.cuInit; import static jcuda.runtime.JCuda.cudaGetDeviceProperties; import java.util.LinkedList; -import java.util.Queue; +import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -39,122 +39,152 @@ import jcuda.runtime.cudaDeviceProp; public class GPUContextPool { - protected static final Log LOG = LogFactory.getLog(GPUContextPool.class.getName()); - - /** Maximum number of gpus to use, -1 for all */ - public static int PER_PROCESS_MAX_GPUS = -1; - - /** Whether cuda has been initialized */ - static boolean initialized = false; - - /** The total number of cuda devices on this machine */ - static int deviceCount = -1; - - /** Stores the cached deviceProperties */ - static cudaDeviceProp[] deviceProperties; - - /** Set of free GPUContexts */ - static Queue<GPUContext> freePool = new LinkedList<>(); - - /** - * Static initialization of the number of devices - * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of error - * Initializes the CUDA driver - * All these need be done once, and not per GPU - * @throws DMLRuntimeException ? - */ - public synchronized static void initializeGPU() throws DMLRuntimeException { - GPUContext.LOG.info("Initializing CUDA"); - long start = System.nanoTime(); - JCuda.setExceptionsEnabled(true); - JCudnn.setExceptionsEnabled(true); - JCublas2.setExceptionsEnabled(true); - JCusparse.setExceptionsEnabled(true); - JCudaDriver.setExceptionsEnabled(true); - cuInit(0); // Initialize the driver - - int deviceCountArray[] = {0}; - cuDeviceGetCount(deviceCountArray); // Obtain the number of devices - deviceCount = deviceCountArray[0]; - deviceProperties = new cudaDeviceProp[deviceCount]; - - if (PER_PROCESS_MAX_GPUS > 0) - deviceCount = Math.min(PER_PROCESS_MAX_GPUS, deviceCount); - - // Initialize the list of devices - for (int i = 0; i < deviceCount; i++) { - cudaDeviceProp properties = new cudaDeviceProp(); - cudaGetDeviceProperties(properties, i); - deviceProperties[i] = properties; - } - - // Initialize the pool of GPUContexts - for (int i=0; i<deviceCount; i++){ - GPUContext gCtx = new GPUContext(i); - freePool.add(gCtx); - } - - GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount); - //int[] device = {-1}; - //cudaGetDevice(device); - //cudaDeviceProp prop = getGPUProperties(device[0]); - //int maxBlocks = prop.maxGridSize[0]; - //int maxThreadsPerBlock = prop.maxThreadsPerBlock; - //long sharedMemPerBlock = prop.sharedMemPerBlock; - //LOG.debug("Active CUDA device number : " + device[0]); - //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock); - initialized = true; - GPUStatistics.cudaInitTime = System.nanoTime() - start; - } - - /** - * Gets an initialized GPUContext from a pool of GPUContexts, each linked to a GPU - * @return null if not more GPUContexts in pool, a valid GPUContext otherwise - * @throws DMLRuntimeException ? - */ - public static synchronized GPUContext getFromPool() throws DMLRuntimeException { - if (!initialized) initializeGPU(); - GPUContext gCtx = freePool.poll(); - LOG.trace("GPU : got GPUContext (" + gCtx + ") from freePool. New sizes - FreePool[" + freePool.size() + "]"); - return gCtx; - } - - /** - * Get the number of free GPUContexts - * @return number of free GPUContexts - */ - public static synchronized int getAvailableCount() { - return freePool.size(); - } - - /** - * Gets the device properties - * @param device the device number (on a machine with more than 1 GPU) - * @return the device properties - * @throws DMLRuntimeException if there is problem initializing the GPUContexts - */ - static cudaDeviceProp getGPUProperties(int device) throws DMLRuntimeException { - // do once - initialization of GPU - if (!initialized) initializeGPU(); - return deviceProperties[device]; - } - - public static int getDeviceCount() throws DMLRuntimeException { - if (!initialized) initializeGPU(); - return deviceCount; - } - - /** - * Returns a {@link GPUContext} back to the pool of {@link GPUContext}s - * @param gCtx the GPUContext instance to return. If null, nothing happens - * @throws DMLRuntimeException if error - */ - public static synchronized void returnToPool(GPUContext gCtx) throws DMLRuntimeException { - if (gCtx == null) - return; - freePool.add(gCtx); - LOG.trace("GPU : returned GPUContext (" + gCtx + ") to freePool. New sizes - FreePool[" + freePool.size() + "]"); - - } + protected static final Log LOG = LogFactory.getLog(GPUContextPool.class.getName()); + + /** + * Maximum number of gpus to use, -1 for all + */ + public static int PER_PROCESS_MAX_GPUS = -1; + + /** + * Whether cuda has been initialized + */ + static boolean initialized = false; + + /** + * The total number of cuda devices on this machine + */ + static int deviceCount = -1; + + /** + * Stores the cached deviceProperties + */ + static cudaDeviceProp[] deviceProperties; + + /** + * Set of free GPUContexts + */ + static List<GPUContext> pool = new LinkedList<>(); + + /** + * Whether the pool of GPUs is reserved or not + */ + static boolean reserved = false; + + /** + * Static initialization of the number of devices + * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of error + * Initializes the CUDA driver + * All these need be done once, and not per GPU + * + * @throws DMLRuntimeException ? + */ + public synchronized static void initializeGPU() throws DMLRuntimeException { + GPUContext.LOG.info("Initializing CUDA"); + long start = System.nanoTime(); + JCuda.setExceptionsEnabled(true); + JCudnn.setExceptionsEnabled(true); + JCublas2.setExceptionsEnabled(true); + JCusparse.setExceptionsEnabled(true); + JCudaDriver.setExceptionsEnabled(true); + cuInit(0); // Initialize the driver + + int deviceCountArray[] = { 0 }; + cuDeviceGetCount(deviceCountArray); // Obtain the number of devices + deviceCount = deviceCountArray[0]; + deviceProperties = new cudaDeviceProp[deviceCount]; + + if (PER_PROCESS_MAX_GPUS > 0) + deviceCount = Math.min(PER_PROCESS_MAX_GPUS, deviceCount); + + // Initialize the list of devices + for (int i = 0; i < deviceCount; i++) { + cudaDeviceProp properties = new cudaDeviceProp(); + cudaGetDeviceProperties(properties, i); + deviceProperties[i] = properties; + } + + // Initialize the pool of GPUContexts + for (int i = 0; i < deviceCount; i++) { + GPUContext gCtx = new GPUContext(i); + pool.add(gCtx); + } + + GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount); + //int[] device = {-1}; + //cudaGetDevice(device); + //cudaDeviceProp prop = getGPUProperties(device[0]); + //int maxBlocks = prop.maxGridSize[0]; + //int maxThreadsPerBlock = prop.maxThreadsPerBlock; + //long sharedMemPerBlock = prop.sharedMemPerBlock; + //LOG.debug("Active CUDA device number : " + device[0]); + //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock); + initialized = true; + GPUStatistics.cudaInitTime = System.nanoTime() - start; + } + + /** + * Reserves and gets an initialized list of GPUContexts + * + * @return null if no GPUContexts in pool, otherwise a valid list of GPUContext + * @throws DMLRuntimeException ? + */ + public static synchronized List<GPUContext> reserveAllGPUContexts() throws DMLRuntimeException { + if (reserved) + throw new DMLRuntimeException("Trying to re-reserve GPUs"); + if (!initialized) + initializeGPU(); + reserved = true; + LOG.trace("GPU : Reserved all GPUs"); + return pool; + } + + /** + * Get the number of free GPUContexts + * + * @return number of free GPUContexts + */ + public static synchronized int getAvailableCount() { + return pool.size(); + } + + /** + * Gets the device properties + * + * @param device the device number (on a machine with more than 1 GPU) + * @return the device properties + * @throws DMLRuntimeException if there is problem initializing the GPUContexts + */ + static cudaDeviceProp getGPUProperties(int device) throws DMLRuntimeException { + // do once - initialization of GPU + if (!initialized) + initializeGPU(); + return deviceProperties[device]; + } + + /** + * Number of available devices on this machine + * + * @return number of available GPUs on this machine + * @throws DMLRuntimeException if error + */ + public static int getDeviceCount() throws DMLRuntimeException { + if (!initialized) + initializeGPU(); + return deviceCount; + } + + /** + * Unreserves all GPUContexts + * + * @throws DMLRuntimeException if error + */ + public static synchronized void freeAllGPUContexts() throws DMLRuntimeException { + if (!reserved) + throw new DMLRuntimeException("Trying to free unreserved GPUs"); + reserved = false; + LOG.trace("GPU : Unreserved all GPUs"); + + } }