http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index d2309b0..708f291 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -18,65 +18,584 @@ */ package org.apache.sysml.runtime.instructions.gpu.context; +import static jcuda.jcublas.JCublas2.cublasCreate; +import static jcuda.jcublas.JCublas2.cublasDestroy; +import static jcuda.jcudnn.JCudnn.cudnnCreate; +import static jcuda.jcudnn.JCudnn.cudnnDestroy; +import static jcuda.jcusparse.JCusparse.cusparseCreate; +import static jcuda.jcusparse.JCusparse.cusparseDestroy; +import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync; +import static jcuda.runtime.JCuda.cudaFree; +import static jcuda.runtime.JCuda.cudaGetDeviceCount; +import static jcuda.runtime.JCuda.cudaMalloc; +import static jcuda.runtime.JCuda.cudaMemGetInfo; +import static jcuda.runtime.JCuda.cudaMemset; +import static jcuda.runtime.JCuda.cudaSetDevice; +import static jcuda.runtime.JCuda.cudaSetDeviceFlags; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; -import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.conf.ConfigurationManager; +import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import 
org.apache.sysml.runtime.instructions.gpu.GPUInstruction; +import org.apache.sysml.utils.GPUStatistics; +import org.apache.sysml.utils.LRUCacheMap; -//FIXME merge JCudaContext into GPUContext as this context is anyway CUDA specific +import jcuda.Pointer; +import jcuda.jcublas.cublasHandle; +import jcuda.jcudnn.cudnnHandle; +import jcuda.jcusparse.cusparseHandle; +import jcuda.runtime.JCuda; +import jcuda.runtime.cudaDeviceProp; -public abstract class GPUContext { +/** + * Represents a context per GPU accessible through the same JVM + * Each context holds cublas, cusparse, cudnn... handles which are separate for each GPU + */ +public class GPUContext { - protected static GPUContext currContext; - public static volatile Boolean isGPUContextCreated = false; + protected static final Log LOG = LogFactory.getLog(GPUContext.class.getName()); - protected GPUContext() {} + /** Eviction policies for {@link GPUContext#evict(long)} */ + public enum EvictionPolicy { + LRU, LFU, MIN_EVICT + } - /** - * Gets device memory available for SystemML operations - * - * @return available memory - */ - public abstract long getAvailableMemory(); + /** currently employed eviction policy */ + public final EvictionPolicy evictionPolicy = EvictionPolicy.LRU; + + /** Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU */ + private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap<>(); + + /** To record size of allocated blocks */ + private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>(); + + /** active device assigned to this GPUContext instance */ + private final int deviceNum; + + /** list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU + * These are matrices allocated on the GPU on which rmvar hasn't been called yet. 
+ * If a {@link GPUObject} has more than one lock on it, it cannot be freed + * If it has zero locks on it, it can be freed, but it is preferrable to keep it around + * so that an extraneous host to dev transfer can be avoided */ + private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>(); + + /** cudnnHandle specific to the active GPU for this GPUContext */ + private cudnnHandle cudnnHandle; + + /** cublasHandle specific to the active GPU for this GPUContext */ + private cublasHandle cublasHandle; + + /** cusparseHandle specific to the active GPU for this GPUContext */ + private cusparseHandle cusparseHandle; + + /** to launch custom CUDA kernel, specific to the active GPU for this GPUContext */ + private JCudaKernels kernels; /** - * Ensures that all the CUDA cards on the current system are - * of the minimum required compute capability. - * (The minimum required compute capability is hard coded in {@link JCudaContext}. - * - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public abstract void ensureComputeCapability() throws DMLRuntimeException; - - /** - * Singleton Factory method for creation of {@link GPUContext} - * @return GPU context - * @throws DMLRuntimeException if DMLRuntimeException occurs + * The minimum CUDA Compute capability needed for SystemML. + * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported. + * If SystemML needs to run on an older card, this logic can be revisited. 
*/ - public static GPUContext getGPUContext() throws DMLRuntimeException { - if(currContext == null && DMLScript.USE_ACCELERATOR) { - synchronized(isGPUContextCreated) { - currContext = new JCudaContext(); - currContext.ensureComputeCapability(); - OptimizerUtils.GPU_MEMORY_BUDGET = currContext.getAvailableMemory(); - isGPUContextCreated = true; - } - } - return currContext; - } - - public static GPUObject createGPUObject(MatrixObject mo) { - if(DMLScript.USE_ACCELERATOR) { - synchronized(isGPUContextCreated) { - if(currContext == null) - throw new RuntimeException("GPUContext is not created"); - if(currContext instanceof JCudaContext) - return new JCudaObject(mo); - } - } - throw new RuntimeException("Cannot create createGPUObject when USE_ACCELERATOR is off"); - } - public abstract void destroy() throws DMLRuntimeException; - - + final int MAJOR_REQUIRED = 3; + final int MINOR_REQUIRED = 0; + + // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application. + public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); + + protected GPUContext(int deviceNum) throws DMLRuntimeException { + this.deviceNum = deviceNum; + cudaSetDevice(deviceNum); + + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + + long free[] = {0}; + long total[] = {0}; + cudaMemGetInfo(free, total); + + long start = System.nanoTime(); + cudnnHandle = new cudnnHandle(); + cudnnCreate(cudnnHandle); + cublasHandle = new cublasHandle(); + cublasCreate(cublasHandle); + // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host + // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot. 
+ // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE); + cusparseHandle = new cusparseHandle(); + cusparseCreate(cusparseHandle); + kernels = new JCudaKernels(deviceNum); + + GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start; + LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " + this); + + } + + public int getDeviceNum() { + return deviceNum; + } + + /** + * Sets the device for the calling thread. + * This method must be called after {@link GPUContextPool#getFromPool()} + * is called. + * If in a multi-threaded env like parfor, this method must be called when in the + * appropriate thread + */ + public void initializeThread() { + cudaSetDevice(deviceNum); + } + + @SuppressWarnings("unused") + public static int cudaGetDevice() { + int[] device = new int[1]; + JCuda.cudaGetDevice(device); + return device[0]; + } + + /** + * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. + * + * @param size size of data (in bytes) to allocate + * @return jcuda pointer + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public Pointer allocate(long size) throws DMLRuntimeException { + return allocate(null, size, 1); + } + + /** + * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. + * + * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record + * @param size size of data (in bytes) to allocate + * @return jcuda pointer + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public Pointer allocate(String instructionName, long size) throws DMLRuntimeException { + return allocate(instructionName, size, 1); + } + + /** + * Allocates temporary space on the device. + * Does not update bookkeeping. + * The caller is responsible for freeing up after usage. 
+ * + * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record + * @param size Size of data (in bytes) to allocate + * @param statsCount amount to increment the cudaAllocCount by + * @return jcuda Pointer + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException { + long t0 = 0, t1 = 0, end = 0; + Pointer A; + if (freeCUDASpaceMap.containsKey(size)) { + LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size / 1024.0) + " Kbytes from previously allocated block on " + this); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); + A = freeList.pop(); + if (freeList.isEmpty()) + freeCUDASpaceMap.remove(size); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0); + } else { + LOG.trace("GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size / 1024.0) + " Kbytes on " + this); + if (DMLScript.STATISTICS) t0 = System.nanoTime(); + ensureFreeSpace(instructionName, size); + A = new Pointer(); + cudaMalloc(A, size); + if (DMLScript.STATISTICS) GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaAllocCount.getAndAdd(statsCount); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0); + } + // Set all elements to 0 since newly allocated space will contain garbage + if (DMLScript.STATISTICS) t1 = System.nanoTime(); + LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of 
size " + (size / 1024.0) + " Kbytes to zero on " + this); + cudaMemset(A, 0, size); + if (DMLScript.STATISTICS) end = System.nanoTime(); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1); + if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1); + if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1); + cudaBlockSizeMap.put(A, size); + return A; + + } + + /** + * Does lazy cudaFree calls + * + * @param toFree {@link Pointer} instance to be freed + */ + public void cudaFreeHelper(final Pointer toFree) { + cudaFreeHelper(null, toFree, false); + } + + /** + * does lazy/eager cudaFree calls + * + * @param toFree {@link Pointer} instance to be freed + * @param eager true if to be done eagerly + */ + public void cudaFreeHelper(final Pointer toFree, boolean eager) { + cudaFreeHelper(null, toFree, eager); + } + + /** + * Does lazy cudaFree calls + * + * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record + * @param toFree {@link Pointer} instance to be freed + */ + public void cudaFreeHelper(String instructionName, final Pointer toFree) { + cudaFreeHelper(instructionName, toFree, false); + } + + /** + * Does cudaFree calls, lazily + * + * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record + * @param toFree {@link Pointer} instance to be freed + * @param eager true if to be done eagerly + */ + public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) { + long t0 = 0; + assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"; + long size = cudaBlockSizeMap.get(toFree); + if (eager) { + LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for 
instruction " + instructionName + " on " + this); + if (DMLScript.STATISTICS) t0 = System.nanoTime(); + cudaFree(toFree); + cudaBlockSizeMap.remove(toFree); + if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.addAndGet(1); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0); + } else { + LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this); + LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); + if (freeList == null) { + freeList = new LinkedList<Pointer>(); + freeCUDASpaceMap.put(size, freeList); + } + freeList.add(toFree); + } + } + + /** + * Thin wrapper over {@link GPUContext#evict(long)} + * + * @param size size to check + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + void ensureFreeSpace(long size) throws DMLRuntimeException { + ensureFreeSpace(null, size); + } + + /** + * Thin wrapper over {@link GPUContext#evict(long)} + * + * @param instructionName instructionName name of the instruction for which performance measurements are made + * @param size size to check + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException { + if (size >= getAvailableMemory()) { + evict(instructionName, size); + } + } + + /** + * Convenience wrapper over {@link GPUContext#evict(String, long)} + * + * @param GPUSize Desired size to be freed up on the GPU + * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them. + */ + protected void evict(final long GPUSize) throws DMLRuntimeException { + evict(null, GPUSize); + } + + /** + * Memory on the GPU is tried to be freed up until either a chunk of needed size is freed up + * or it fails. 
+ * First the set of reusable blocks is freed up. If that isn't enough, the set of allocated matrix + * blocks with zero locks on them is freed up. + * The process cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on + * number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up + * blocks on which there are zero locks until the required size has been freed up. + * // TODO: update it with hybrid policy + * + * @param instructionName name of the instruction for which performance measurements are made + * @param neededSize desired size to be freed up on the GPU + * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them. + */ + protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException { + LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this); + GPUStatistics.cudaEvictionCount.addAndGet(1); + // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap + // to free up space + LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap; + while (lruCacheMap.size() > 0) { + if (neededSize <= getAvailableMemory()) + break; + Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry(); + LinkedList<Pointer> toFreeList = toFreeListPair.getValue(); + Long size = toFreeListPair.getKey(); + Pointer toFree = toFreeList.pop(); + if (toFreeList.isEmpty()) + lruCacheMap.remove(size); + cudaFreeHelper(instructionName, toFree, true); + } + + if (neededSize <= getAvailableMemory()) + return; + + if (allocatedGPUObjects.size() == 0) { + throw new DMLRuntimeException("There is not enough memory on device for this matrix!"); + } + + Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() { + @Override + public int compare(GPUObject p1, GPUObject p2) { + long p1Val = p1.readLocks.get(); + long p2Val = 
p2.readLocks.get(); + + if (p1Val > 0 && p2Val > 0) { + // Both are locked, so don't sort + return 0; + } else if (p1Val > 0 || p2Val > 0) { + // Put the unlocked one to RHS + return Long.compare(p2Val, p1Val); + } else { + // Both are unlocked + + if (evictionPolicy == EvictionPolicy.MIN_EVICT) { + long p1Size = 0; + long p2Size = 0; + try { + p1Size = p1.getSizeOnDevice() - neededSize; + p2Size = p2.getSizeOnDevice() - neededSize; + } catch (DMLRuntimeException e) { + throw new RuntimeException(e); + } + + if (p1Size >= 0 && p2Size >= 0) { + return Long.compare(p2Size, p1Size); + } else { + return Long.compare(p1Size, p2Size); + } + } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) { + return Long.compare(p2.timestamp.get(), p1.timestamp.get()); + } else { + throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name()); + } + } + } + }); + + while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) { + GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1); + if (toBeRemoved.readLocks.get() > 0) { + throw new DMLRuntimeException("There is not enough memory on device for this matrix!"); + } + if (toBeRemoved.dirty) { + toBeRemoved.copyFromDeviceToHost(); + } + + toBeRemoved.clearData(true); + } + } + + /** + * Whether the GPU associated with this {@link GPUContext} has recorded the usage of a certain block + * + * @param o the block + * @return true if present, false otherwise + */ + public boolean isBlockRecorded(GPUObject o) { + return allocatedGPUObjects.contains(o); + } + + /** + * @param o {@link GPUObject} instance to record + * @see GPUContext#allocatedGPUObjects + * Records the usage of a matrix block + */ + public void recordBlockUsage(GPUObject o) { + allocatedGPUObjects.add(o); + } + + /** + * @param o {@link GPUObject} instance to remove from the list of allocated GPU objects + * @see GPUContext#allocatedGPUObjects + * Records that a block is not used 
anymore + */ + public void removeRecordedUsage(GPUObject o) { + allocatedGPUObjects.remove(o); + } + + /** + * Gets the available memory on GPU that SystemML can use + * + * @return the available memory in bytes + */ + public long getAvailableMemory() { + long free[] = {0}; + long total[] = {0}; + cudaMemGetInfo(free, total); + return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR); + } + + /** + * Makes sure that GPU that SystemML is trying to use has the minimum compute capability needed + * + * @throws DMLRuntimeException if the compute capability is less than what is required + */ + public void ensureComputeCapability() throws DMLRuntimeException { + int[] devices = {-1}; + cudaGetDeviceCount(devices); + if (devices[0] == -1) { + throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices"); + } + boolean isComputeCapable = true; + for (int i = 0; i < devices[0]; i++) { + cudaDeviceProp properties = GPUContextPool.getGPUProperties(i); + int major = properties.major; + int minor = properties.minor; + if (major < MAJOR_REQUIRED) { + isComputeCapable = false; + } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) { + isComputeCapable = false; + } + } + if (!isComputeCapable) { + throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." 
+ MINOR_REQUIRED); + } + } + + public GPUObject createGPUObject(MatrixObject mo) { + return new GPUObject(this, mo); + } + + /** + * Gets the device properties for the active GPU (set with cudaSetDevice()) + * + * @return the device properties + */ + public cudaDeviceProp getGPUProperties() throws DMLRuntimeException { + return GPUContextPool.getGPUProperties(deviceNum); + } + + /** + * Gets the maximum number of threads per block for "active" GPU + * + * @return the maximum number of threads per block + */ + public int getMaxThreadsPerBlock() throws DMLRuntimeException { + cudaDeviceProp deviceProps = getGPUProperties(); + return deviceProps.maxThreadsPerBlock; + } + + /** + * Gets the maximum number of blocks supported by the active cuda device + * + * @return the maximum number of blocks supported + */ + public int getMaxBlocks() throws DMLRuntimeException { + cudaDeviceProp deviceProp = getGPUProperties(); + return deviceProp.maxGridSize[0]; + } + + /** + * Gets the shared memory per block supported by the active cuda device + * + * @return the shared memory per block + */ + public long getMaxSharedMemory() throws DMLRuntimeException { + cudaDeviceProp deviceProp = getGPUProperties(); + return deviceProp.sharedMemPerBlock; + } + + /** + * Gets the warp size supported by the active cuda device + * + * @return the warp size + */ + public int getWarpSize() throws DMLRuntimeException { + cudaDeviceProp deviceProp = getGPUProperties(); + return deviceProp.warpSize; + } + + public cudnnHandle getCudnnHandle() { + return cudnnHandle; + } + + public cublasHandle getCublasHandle() { + return cublasHandle; + } + + public cusparseHandle getCusparseHandle() { + return cusparseHandle; + } + + public JCudaKernels getKernels() { + return kernels; + } + + /** + * Destroys this GPUContext object + * This method MUST BE called so that the GPU is available to be used again + * + * @throws DMLRuntimeException if error + */ + public void destroy() throws DMLRuntimeException { + 
LOG.trace("GPU : this context was destroyed, this = " + this.toString()); + clearMemory(); + cudnnDestroy(cudnnHandle); + cublasDestroy(cublasHandle); + cusparseDestroy(cusparseHandle); + cudnnHandle = null; + cublasHandle = null; + cusparseHandle = null; + + } + + /** + * Clears all memory used by this {@link GPUContext} + * Be careful to ensure that no memory is currently being used before invoking this + * @throws DMLRuntimeException + */ + public void clearMemory() throws DMLRuntimeException { + while (allocatedGPUObjects.isEmpty()) { + GPUObject o = allocatedGPUObjects.get(0); + o.clearData(); + } + for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) { + for (Pointer p : l) { + cudaFreeHelper(p, true); + } + } + cudaBlockSizeMap.clear(); + freeCUDASpaceMap.clear(); + allocatedGPUObjects.clear(); + } + + @Override + public String toString() { + return "GPUContext{" + + "deviceNum=" + deviceNum + + '}'; + } + }
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java new file mode 100644 index 0000000..6452651 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.sysml.runtime.instructions.gpu.context; + +import static jcuda.driver.JCudaDriver.cuDeviceGetCount; +import static jcuda.driver.JCudaDriver.cuInit; +import static jcuda.runtime.JCuda.cudaGetDeviceProperties; + +import java.util.LinkedList; +import java.util.Queue; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.utils.GPUStatistics; + +import jcuda.driver.JCudaDriver; +import jcuda.jcublas.JCublas2; +import jcuda.jcudnn.JCudnn; +import jcuda.jcusparse.JCusparse; +import jcuda.runtime.JCuda; +import jcuda.runtime.cudaDeviceProp; + +public class GPUContextPool { + + protected static final Log LOG = LogFactory.getLog(GPUContextPool.class.getName()); + + /** Maximum number of gpus to use, -1 for all */ + public static int PER_PROCESS_MAX_GPUS = -1; + + /** Whether cuda has been initialized */ + static boolean initialized = false; + + /** The total number of cuda devices on this machine */ + static int deviceCount = -1; + + /** Stores the cached deviceProperties */ + static cudaDeviceProp[] deviceProperties; + + /** Set of free GPUContexts */ + static Queue<GPUContext> freePool = new LinkedList<>(); + + /** + * Static initialization of the number of devices + * Also sets behaviour for J{Cuda, Cudnn, Cublas, Cusparse} in case of error + * Initializes the CUDA driver + * All these need be done once, and not per GPU + */ + public synchronized static void initializeGPU() throws DMLRuntimeException { + GPUContext.LOG.info("Initializing CUDA"); + long start = System.nanoTime(); + JCuda.setExceptionsEnabled(true); + JCudnn.setExceptionsEnabled(true); + JCublas2.setExceptionsEnabled(true); + JCusparse.setExceptionsEnabled(true); + JCudaDriver.setExceptionsEnabled(true); + cuInit(0); // Initialize the driver + + int deviceCountArray[] = {0}; + cuDeviceGetCount(deviceCountArray); // Obtain the number of devices + deviceCount = 
deviceCountArray[0]; + deviceProperties = new cudaDeviceProp[deviceCount]; + + if (PER_PROCESS_MAX_GPUS > 0) + deviceCount = Math.min(PER_PROCESS_MAX_GPUS, deviceCount); + + // Initialize the list of devices + for (int i = 0; i < deviceCount; i++) { + cudaDeviceProp properties = new cudaDeviceProp(); + cudaGetDeviceProperties(properties, i); + deviceProperties[i] = properties; + } + + // Initialize the pool of GPUContexts + for (int i=0; i<deviceCount; i++){ + GPUContext gCtx = new GPUContext(i); + freePool.add(gCtx); + } + + GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount); + //int[] device = {-1}; + //cudaGetDevice(device); + //cudaDeviceProp prop = getGPUProperties(device[0]); + //int maxBlocks = prop.maxGridSize[0]; + //int maxThreadsPerBlock = prop.maxThreadsPerBlock; + //long sharedMemPerBlock = prop.sharedMemPerBlock; + //LOG.debug("Active CUDA device number : " + device[0]); + //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock); + initialized = true; + GPUStatistics.cudaInitTime = System.nanoTime() - start; + } + + /** + * Gets an initialized GPUContext from a pool of GPUContexts, each linked to a GPU + * @return null if not more GPUContexts in pool, a valid GPUContext otherwise + */ + public static synchronized GPUContext getFromPool() throws DMLRuntimeException { + if (!initialized) initializeGPU(); + GPUContext gCtx = freePool.poll(); + LOG.trace("GPU : got GPUContext (" + gCtx + ") from freePool. 
New sizes - FreePool[" + freePool.size() + "]"); + return gCtx; + } + + /** + * Get the number of free GPUContexts + * @return number of free GPUContexts + */ + public static synchronized int getAvailableCount() { + return freePool.size(); + } + + /** + * Gets the device properties + * @param device the device number (on a machine with more than 1 GPU) + * @return the device properties + * @throws DMLRuntimeException if there is problem initializing the GPUContexts + */ + static cudaDeviceProp getGPUProperties(int device) throws DMLRuntimeException { + // do once - initialization of GPU + if (!initialized) initializeGPU(); + return deviceProperties[device]; + } + + public static int getDeviceCount() throws DMLRuntimeException { + if (!initialized) initializeGPU(); + return deviceCount; + } + + /** + * Returns a {@link GPUContext} back to the pool of {@link GPUContext}s + * @param gCtx the GPUContext instance to return. If null, nothing happens + * @throws DMLRuntimeException if error + */ + public static synchronized void returnToPool(GPUContext gCtx) throws DMLRuntimeException { + if (gCtx == null) + return; + freePool.add(gCtx); + LOG.trace("GPU : returned GPUContext (" + gCtx + ") to freePool. 
New sizes - FreePool[" + freePool.size() + "]"); + + } + +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index c116475..3a1fafa 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -18,206 +18,799 @@ */ package org.apache.sysml.runtime.instructions.gpu.context; -import jcuda.Pointer; +import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T; +import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor; +import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor; +import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor; +import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE; +import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW; +import static jcuda.jcusparse.JCusparse.cusparseDdense2csr; +import static jcuda.jcusparse.JCusparse.cusparseDnnz; +import static jcuda.runtime.JCuda.cudaMemcpy; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; + +import java.util.Arrays; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.CacheException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; +import 
org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.runtime.matrix.data.SparseBlock; +import org.apache.sysml.runtime.matrix.data.SparseBlockCOO; +import org.apache.sysml.runtime.matrix.data.SparseBlockCSR; +import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR; import org.apache.sysml.utils.GPUStatistics; -import org.apache.sysml.utils.LRUCacheMap; -import java.util.Collections; -import java.util.Comparator; -import java.util.LinkedList; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; +import jcuda.Pointer; +import jcuda.jcublas.JCublas2; +import jcuda.jcudnn.cudnnTensorDescriptor; +import jcuda.jcusparse.JCusparse; +import jcuda.jcusparse.cusparseDirection; +import jcuda.jcusparse.cusparseHandle; +import jcuda.jcusparse.cusparseMatDescr; + +/** + * Handle to a matrix block on the GPU + */ +public class GPUObject { + + private static final Log LOG = LogFactory.getLog(GPUObject.class.getName()); + + /** GPUContext that owns this GPUObject */ + private final GPUContext gpuContext; + + /** Pointer to the underlying dense matrix block on GPU */ + private Pointer jcudaDenseMatrixPtr = null; + + /** Pointer to the underlying sparse matrix block on GPU */ + private CSRPointer jcudaSparseMatrixPtr = null; -//FIXME merge JCudaObject into GPUObject to avoid unnecessary complexity -public abstract class GPUObject -{ - public enum EvictionPolicy { - LRU, LFU, MIN_EVICT - } - public static final EvictionPolicy evictionPolicy = EvictionPolicy.LRU; - protected boolean isDeviceCopyModified = false; - protected AtomicInteger numLocks = new AtomicInteger(0); + /** An optional tensor descriptor (and shape) that can be set by a tensor instruction such as convolution, + * maxpooling and exploited by a subsequent non-tensor instruction such as relu + */ + private cudnnTensorDescriptor tensorDescriptor = null; + + /** the shape 
of this tensor, if in fact this is a tensor */ + private int [] tensorShape = null; + + /** whether the block attached to this {@link GPUContext} is dirty on the device and needs to be copied back to host */ + protected boolean dirty = false; + + /** number of read locks on this object */ + protected AtomicInteger readLocks = new AtomicInteger(0); + + /** Timestamp, needed by {@link GPUContext#evict(long)} */ AtomicLong timestamp = new AtomicLong(0); - - protected boolean isInSparseFormat = false; + + /** Whether this block is in sparse format */ + protected boolean isSparse = false; + + /** Enclosing {@link MatrixObject} instance */ protected MatrixObject mat = null; - - protected GPUObject(MatrixObject mat2) { - this.mat = mat2; + + private Pointer allocate(String instName, long size) throws DMLRuntimeException { + return getGPUContext().allocate(instName, size); + } + + private Pointer allocate(long size) throws DMLRuntimeException { + return getGPUContext().allocate(size); + } + + private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException { + getGPUContext().cudaFreeHelper(toFree); + } + + private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException { + getGPUContext().cudaFreeHelper(toFree, eager); + } + + private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException { + getGPUContext().cudaFreeHelper(instName, toFree, eager); } - - public boolean isInSparseFormat() { - return isInSparseFormat; + + private GPUContext getGPUContext() throws DMLRuntimeException { + return gpuContext; } - - public abstract boolean isAllocated(); /** - * Signal intent that a matrix block will be read (as input) on the GPU - * @return true if a host memory to device memory transfer happened - * @throws DMLRuntimeException ? 
+ * Transposes a dense matrix on the GPU by calling the cublasDgeam operation + * @param gCtx a valid {@link GPUContext} + * @param densePtr Pointer to dense matrix on the GPU + * @param m rows in output matrix + * @param n columns in output matrix + * @param lda rows in input matrix + * @param ldc columns in output matrix + * @return transposed matrix + * @throws DMLRuntimeException if operation failed */ - public abstract boolean acquireDeviceRead() throws DMLRuntimeException; + public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException { + LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx); + Pointer alpha = Pointer.to(new double[]{1.0}); + Pointer beta = Pointer.to(new double[]{0.0}); + Pointer A = densePtr; + Pointer C = gCtx.allocate(((long)m)*getDoubleSizeOf(n)); + + // Transpose the matrix to get a dense matrix + JCublas2.cublasDgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), lda, C, ldc); + return C; + } + /** - * To signal intent that a matrix block will be written to on the GPU - * @return true if memory was allocated on the GPU as a result of this call + * Convenience method to convert a column major dense matrix to a row major CSR (sparse) matrix on the GPU + * Since the allocated matrix is temporary, bookkeeping is not updated. + * Also note that the input dense matrix is expected to be in COLUMN MAJOR FORMAT + * Caller is responsible for deallocating memory on GPU. 
+ * @param gCtx a valid {@link GPUContext} + * @param cusparseHandle handle to cusparse library + * @param densePtr [in] dense matrix pointer on the GPU in column major + * @param rows number of rows + * @param cols number of columns + * @return CSR (compressed sparse row) pointer * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public abstract boolean acquireDeviceModifyDense() throws DMLRuntimeException; + public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext gCtx, cusparseHandle cusparseHandle, Pointer densePtr, int rows, int cols) throws DMLRuntimeException { + cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor(); + Pointer nnzPerRowPtr = null; + Pointer nnzTotalDevHostPtr = null; + + gCtx.ensureFreeSpace(getIntSizeOf(rows + 1)); + nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows)); + nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1)); + + // Output is in dense vector format, convert it to CSR + cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, nnzTotalDevHostPtr); + //cudaDeviceSynchronize(); + int[] nnzC = {-1}; + + long t2=0; + if (DMLScript.STATISTICS) t2 = System.nanoTime(); + cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); + if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2); + if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1); + + if (nnzC[0] == -1){ + throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector mulitply on the GPU"); + } + + LOG.trace("GPU : col-major dense size[" + rows + "," + cols + "] to row-major sparse of with nnz = " + nnzC[0] + ", GPUContext=" + gCtx); + + CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows); + cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, C.colInd); + 
//cudaDeviceSynchronize(); + + gCtx.cudaFreeHelper(nnzPerRowPtr); + gCtx.cudaFreeHelper(nnzTotalDevHostPtr); + + return C; + } + + /** + * Gets the double array from GPU memory onto host memory and returns string. + * @param A Pointer to memory on device (GPU), assumed to point to a double array + * @param rows rows in matrix A + * @param cols columns in matrix A + * @return the debug string + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + @SuppressWarnings("unused") + public static String debugString(Pointer A, long rows, long cols) throws DMLRuntimeException { + StringBuffer sb = new StringBuffer(); + int len = toIntExact(rows * cols); + double[] tmp = new double[len]; + cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), cudaMemcpyDeviceToHost); + int k = 0; + for (int i=0; i<rows; i++){ + for (int j=0; j<cols; j++){ + sb.append(tmp[k]).append(' '); + k++; + } + sb.append('\n'); + } + return sb.toString(); + } + /** - * To signal intent that a sparse matrix block will be written to on the GPU - * @return true if memory was allocated on the GPU as a result of this call + * Convenience method to directly examine the Sparse matrix on GPU + * @return CSR (compressed sparse row) pointer + */ + public CSRPointer getSparseMatrixCudaPointer() { + return getJcudaSparseMatrixPtr(); + } + + /** + * Convenience method to directly set the sparse matrix on GPU + * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing. 
+ * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)} + * @param sparseMatrixPtr CSR (compressed sparse row) pointer + */ + public void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) throws DMLRuntimeException { + this.jcudaSparseMatrixPtr = sparseMatrixPtr; + this.isSparse = true; + if(getJcudaDenseMatrixPtr() != null) { + cudaFreeHelper(getJcudaDenseMatrixPtr()); + jcudaDenseMatrixPtr = null; + } + } + + /** + * Convenience method to directly set the dense matrix pointer on GPU + * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing. + * + * @param densePtr dense pointer + */ + public void setDenseMatrixCudaPointer(Pointer densePtr) throws DMLRuntimeException{ + this.jcudaDenseMatrixPtr = densePtr; + this.isSparse = false; + if(getJcudaSparseMatrixPtr() != null) { + getJcudaSparseMatrixPtr().deallocate(); + jcudaSparseMatrixPtr = null; + } + } + + /** + * Converts this GPUObject from dense to sparse format. 
+ * * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public abstract boolean acquireDeviceModifySparse() throws DMLRuntimeException; - + public void denseToSparse() throws DMLRuntimeException { + LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" + getGPUContext()); + long t0=0; + if (DMLScript.STATISTICS) t0 = System.nanoTime(); + cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle(); + if(cusparseHandle == null) + throw new DMLRuntimeException("Expected cusparse to be initialized"); + int rows = toIntExact(mat.getNumRows()); + int cols = toIntExact(mat.getNumColumns()); + + if(getJcudaDenseMatrixPtr() == null || !isAllocated()) + throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call"); + + convertDensePtrFromRowMajorToColumnMajor(); + setSparseMatrixCudaPointer(columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, getJcudaDenseMatrixPtr(), rows, cols)); + // TODO: What if mat.getNnz() is -1 ? + if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.addAndGet(1); + } + /** - * If memory on GPU has been allocated from elsewhere, this method - * updates the internal bookkeeping - * @param numBytes number of bytes + * Convenience method. 
Converts Row Major Dense Matrix --> Column Major Dense Matrix + * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public abstract void setDeviceModify(long numBytes); + private void convertDensePtrFromRowMajorToColumnMajor() throws DMLRuntimeException { + LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext()); + int m = toIntExact(mat.getNumRows()); + int n = toIntExact(mat.getNumColumns()); + int lda = n; + int ldc = m; + if(!isAllocated()) { + throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated"); + } + + Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc); + cudaFreeHelper(getJcudaDenseMatrixPtr()); + setDenseMatrixCudaPointer(tmp); + } + + private void convertDensePtrFromColMajorToRowMajor() throws DMLRuntimeException { + LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext()); + + int n = toIntExact(mat.getNumRows()); + int m = toIntExact(mat.getNumColumns()); + int lda = n; + int ldc = m; + if(!isAllocated()) { + throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated"); + } + + Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc); + cudaFreeHelper(getJcudaDenseMatrixPtr()); + setDenseMatrixCudaPointer(tmp); + } /** - * Signal intent that a block needs to be read on the host - * @return true if copied from device to host - * @throws CacheException ? 
+ * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format) + * + * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public abstract boolean acquireHostRead() throws CacheException; - - public abstract void releaseInput() throws CacheException; - public abstract void releaseOutput() throws CacheException; - - // package-level visibility as these methods are guarded by underlying GPUContext - - abstract void allocateDenseMatrixOnDevice() throws DMLRuntimeException; - abstract void allocateSparseMatrixOnDevice() throws DMLRuntimeException; - abstract void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException; - abstract long getSizeOnDevice() throws DMLRuntimeException; - - abstract void copyFromHostToDevice() throws DMLRuntimeException; - + public void sparseToDense() throws DMLRuntimeException { + sparseToDense(null); + } + /** - * Copies a matrix block (dense or sparse) from GPU Memory to Host memory. - * A {@link MatrixBlock} instance is allocated, data from the GPU is copied in, - * the current one in Host memory is deallocated by calling MatrixObject's acquireHostModify(MatrixBlock) (??? does not exist) - * and overwritten with the newly allocated instance. - * TODO : re-examine this to avoid spurious allocations of memory for optimizations + * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format) + * Also records per instruction invocation of sparseToDense. + * @param instructionName Name of the instruction for which statistics are recorded in {@link GPUStatistics} + * @throws DMLRuntimeException if the sparse matrix is not allocated on the device
+ */ + public void sparseToDense(String instructionName) throws DMLRuntimeException { + LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" + getGPUContext()); + long start=0, end=0; + if (DMLScript.STATISTICS) start = System.nanoTime(); + if(getJcudaSparseMatrixPtr() == null || !isAllocated()) + throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call"); + + sparseToColumnMajorDense(); + convertDensePtrFromColMajorToRowMajor(); + if (DMLScript.STATISTICS) end = System.nanoTime(); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.addAndGet(1); + } + + /** + * More efficient method to convert sparse to dense but returns dense in column major format + * * @throws DMLRuntimeException if DMLRuntimeException occurs */ - abstract void copyFromDeviceToHost() throws DMLRuntimeException; // Called by export() + public void sparseToColumnMajorDense() throws DMLRuntimeException { + LOG.trace("GPU : sparse -> col-major dense on " + this + ", GPUContext=" + getGPUContext()); + if(getJcudaSparseMatrixPtr() == null || !isAllocated()) + throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call"); + + cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle(); + if(cusparseHandle == null) + throw new DMLRuntimeException("Expected cusparse to be initialized"); + int rows = toIntExact(mat.getNumRows()); + int cols = toIntExact(mat.getNumColumns()); + setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols)); + } + + /** + * Initializes this GPUObject with a {@link MatrixObject} instance which will contain metadata about the enclosing matrix block + * @param mat2 the 
matrix block that owns this {@link GPUObject} + */ + GPUObject(GPUContext gCtx, MatrixObject mat2) { + gpuContext = gCtx; + this.mat = mat2; + } + public boolean isSparse() { + return isSparse; + } /** - * Convenience wrapper over {@link GPUObject#evict(String, long)} - * @param GPUSize Desired size to be freed up on the GPU - * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them. + * Returns a previously allocated tensor shape or null + * @return int array of four elements or null */ - protected static void evict(final long GPUSize) throws DMLRuntimeException { - evict(null, GPUSize); + public int [] getTensorShape() { + return tensorShape; } /** - * Cycles through the sorted list of allocated {@link GPUObject} instances. Sorting is based on - * number of (read) locks that have been obtained on it (reverse order). It repeatedly frees up - * blocks on which there are zero locks until the required size has been freed up. - * // TODO: update it with hybrid policy - * @param instructionName name of the instruction for which performance measurements are made - * @param GPUSize Desired size to be freed up on the GPU - * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them. 
+ * Returns a previously allocated tensor descriptor or null + * @return cudnn tensor descriptor */ - protected static void evict(String instructionName, final long GPUSize) throws DMLRuntimeException { - synchronized (JCudaContext.syncObj) { - - GPUStatistics.cudaEvictionCount.addAndGet(1); - // Release the set of free blocks maintained in a JCudaObject.freeCUDASpaceMap - // to free up space - LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = JCudaObject.freeCUDASpaceMap; - while (lruCacheMap.size() > 0) { - if (GPUSize <= getAvailableMemory()) - break; - Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry(); - LinkedList<Pointer> toFreeList = toFreeListPair.getValue(); - Long size = toFreeListPair.getKey(); - Pointer toFree = toFreeList.pop(); - if (toFreeList.isEmpty()) - lruCacheMap.remove(size); - JCudaObject.cudaFreeHelper(instructionName, toFree, true); + public cudnnTensorDescriptor getTensorDescriptor() { + return tensorDescriptor; + } + + /** + * Returns a previously allocated or allocates and returns a tensor descriptor + * @param N number of images + * @param C number of channels + * @param H height + * @param W width + * @return cudnn tensor descriptor + */ + public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) { + LOG.trace("GPU : allocateTensorDescriptor with [N="+N+",C="+C+",H="+H+",W="+W+"] on " + this); + if(tensorDescriptor == null) { + tensorDescriptor = new cudnnTensorDescriptor(); + cudnnCreateTensorDescriptor(tensorDescriptor); + cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W); + tensorShape = new int[4]; + tensorShape[0] = N; + tensorShape[1] = C; + tensorShape[2] = H; + tensorShape[3] = W; + } + return tensorDescriptor; + } + + private static long getDoubleSizeOf(long numElems) { + return numElems * ((long)jcuda.Sizeof.DOUBLE); + } + + private static long getIntSizeOf(long numElems) { + return numElems * 
((long)jcuda.Sizeof.INT); + } + + public boolean isAllocated() { + boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || getJcudaSparseMatrixPtr() != null); + return eitherAllocated; + } + + public boolean isInputAllocated() { + try { + boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || getJcudaSparseMatrixPtr() != null); + boolean isAllocatedOnThisGPUContext = getGPUContext().isBlockRecorded(this); + if (eitherAllocated && !isAllocatedOnThisGPUContext) { + LOG.warn("GPU : A block was allocated but was not on this GPUContext, GPUContext=" + getGPUContext()); } + return eitherAllocated && isAllocatedOnThisGPUContext; + } catch (DMLRuntimeException e){ + LOG.info("GPU : System is in an inconsistent state"); + throw new RuntimeException(e); + } + } + + /** + * Allocates a sparse and empty {@link GPUObject} + * This is the result of operations that are both non zero matrices. + * + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public void allocateSparseAndEmpty() throws DMLRuntimeException{ + LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext()); + setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows())); + addReadLock(); + } + + /** + * Allocates a dense matrix of size obtained from the attached matrix metadata + * and fills it up with a single value + * + * @param v value to fill up the dense matrix + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public void allocateAndFillDense(double v) throws DMLRuntimeException { + LOG.trace("GPU : allocate and fill dense with value " + v + " on " + this + ", GPUContext=" + getGPUContext()); + long rows = mat.getNumRows(); + long cols = mat.getNumColumns(); + int numElems = toIntExact(rows * cols); + long size = getDoubleSizeOf(numElems); + setDenseMatrixCudaPointer(allocate(size)); + addReadLock(); + // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and 
fills it with value "v" + getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems); + } + + /** + * If this {@link GPUObject} is sparse and empty + * Being allocated is a prerequisite to being sparse and empty. + * + * @return true if sparse and empty + * @throws DMLRuntimeException if error + */ + public boolean isSparseAndEmpty() throws DMLRuntimeException{ + boolean isSparseAndAllocated = isAllocated()&& LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat); + boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && getJcudaSparseMatrixPtr().nnz == 0; + return isEmptyAndSparseAndAllocated; + } + + public boolean acquireDeviceRead() throws DMLRuntimeException { + LOG.trace("GPU : acquireDeviceRead on " + this); + boolean transferred = false; + if(!isAllocated()) { + LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext()); + copyFromHostToDevice(); + transferred = true; + } else { + addReadLock(); + } + if(!isAllocated()) + throw new DMLRuntimeException("Expected device data to be allocated"); + return transferred; + } + + public boolean acquireDeviceModifyDense() throws DMLRuntimeException { + LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext()); + boolean allocated = false; + if(!isAllocated()) { + mat.setDirty(true); + LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this); + // Dense block, size = numRows * numCols + allocateDenseMatrixOnDevice(); + allocated = true; + getGPUContext().recordBlockUsage(this); + } + dirty = true; + if(!isAllocated()) + throw new DMLRuntimeException("Expected device data to be allocated"); + return allocated; + } - if (GPUSize <= getAvailableMemory()) - return; + public boolean acquireDeviceModifySparse() throws DMLRuntimeException { + LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", 
GPUContext=" + getGPUContext()); + boolean allocated = false; + isSparse = true; + if(!isAllocated()) { + LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this); + mat.setDirty(true); + allocateSparseMatrixOnDevice(); + allocated = true; + getGPUContext().recordBlockUsage(this); + + } + dirty = true; + if(!isAllocated()) + throw new DMLRuntimeException("Expected device data to be allocated"); + return allocated; + } + + public void addReadLock() { + readLocks.addAndGet(1); + } - if (JCudaContext.allocatedPointers.size() == 0) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix!"); + /** + * if the data is allocated on the GPU and is dirty, it is copied back to the host memory + * @return true if a copy to host happened, false otherwise + * @throws CacheException + */ + public boolean acquireHostRead() throws CacheException { + boolean copied = false; + try { + LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext()); + if (isAllocated() && dirty) { + LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext=" + getGPUContext()); + copyFromDeviceToHost(); + copied = true; } + } catch (DMLRuntimeException e) { + throw new CacheException(e); + } + return copied; + } + + /** + * Updates the locks depending on the eviction policy selected + * @throws DMLRuntimeException if there is no locked GPU Object or if could not obtain a {@link GPUContext} + */ + private void updateReleaseLocks() throws DMLRuntimeException { + if (readLocks.addAndGet(-1) < 0) { + throw new CacheException("Redundant release of GPU object"); + } + LOG.trace("GPU : updateReleaseLocks, new number of read locks is " + readLocks.get() + ", on " + this + ", GPUContext=" + getGPUContext()); + GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy; + switch (evictionPolicy){ + case LRU : timestamp.set(System.nanoTime()); break; + case LFU : timestamp.addAndGet(1); 
break; + case MIN_EVICT : /* Do Nothing */ break; + default : throw new CacheException("The eviction policy is not supported:" + evictionPolicy.name()); + } + } + + /** + * Releases input allocated on GPU + * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext} + */ + public void releaseInput() throws DMLRuntimeException { + updateReleaseLocks(); + if(!isAllocated()) + throw new CacheException("Attempting to release an input before allocating it"); + } + + /** + * releases output allocated on GPU + * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext} + */ + public void releaseOutput() throws DMLRuntimeException { + updateReleaseLocks(); + dirty = true; + if(!isAllocated()) + throw new CacheException("Attempting to release an output before allocating it"); + } - synchronized (evictionLock) { - Collections.sort(JCudaContext.allocatedPointers, new Comparator<GPUObject>() { - - @Override - public int compare(GPUObject p1, GPUObject p2) { - long p1Val = p1.numLocks.get(); - long p2Val = p2.numLocks.get(); - - if (p1Val > 0 && p2Val > 0) { - // Both are locked, so don't sort - return 0; - } else if (p1Val > 0 || p2Val > 0) { - // Put the unlocked one to RHS - return Long.compare(p2Val, p1Val); - } else { - // Both are unlocked - - if (evictionPolicy == EvictionPolicy.MIN_EVICT) { - long p1Size = 0; - long p2Size = 0; - try { - p1Size = p1.getSizeOnDevice() - GPUSize; - p2Size = p2.getSizeOnDevice() - GPUSize; - } catch (DMLRuntimeException e) { - throw new RuntimeException(e); - } - - if (p1Size >= 0 && p2Size >= 0) { - return Long.compare(p2Size, p1Size); - } else { - return Long.compare(p1Size, p2Size); - } - } else if (evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) { - return Long.compare(p2.timestamp.get(), p1.timestamp.get()); - } else { - throw new 
RuntimeException("Unsupported eviction policy:" + evictionPolicy.name()); - } - } - } - }); - - while (GPUSize > getAvailableMemory() && JCudaContext.allocatedPointers.size() > 0) { - GPUObject toBeRemoved = JCudaContext.allocatedPointers.get(JCudaContext.allocatedPointers.size() - 1); - if (toBeRemoved.numLocks.get() > 0) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix!"); - } - if (toBeRemoved.isDeviceCopyModified) { - toBeRemoved.copyFromDeviceToHost(); - } - - toBeRemoved.clearData(true); + void allocateDenseMatrixOnDevice() throws DMLRuntimeException { + LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); + assert !isAllocated() : "Internal error - trying to allocated dense matrix to a GPUObject that is already allocated"; + long rows = mat.getNumRows(); + long cols = mat.getNumColumns(); + assert rows > 0 : "Internal error - invalid number of rows when allocating dense matrix"; + assert cols > 0 : "Internal error - invalid number of columns when allocating dense matrix;"; + long size = getDoubleSizeOf(rows * cols); + Pointer tmp = allocate(size); + setDenseMatrixCudaPointer(tmp); + addReadLock(); + } + + void allocateSparseMatrixOnDevice() throws DMLRuntimeException { + LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); + assert !isAllocated() : "Internal error = trying to allocated sparse matrix to a GPUObject that is already allocated"; + long rows = mat.getNumRows(); + long nnz = mat.getNnz(); + assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix"; + assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix"; + CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows); + setSparseMatrixCudaPointer(tmp); + addReadLock(); + } + + void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException { + LOG.trace("GPU : 
deallocateMemoryOnDevice, on " + this + ", GPUContext=" + getGPUContext()); + if(getJcudaDenseMatrixPtr() != null) { + cudaFreeHelper(null, getJcudaDenseMatrixPtr(), eager); + } + if (getJcudaSparseMatrixPtr() != null) { + getJcudaSparseMatrixPtr().deallocate(eager); + } + jcudaDenseMatrixPtr = null; + jcudaSparseMatrixPtr = null; + if(tensorDescriptor != null) { + cudnnDestroyTensorDescriptor(tensorDescriptor); + tensorDescriptor = null; + } + readLocks.set(0); + } + + protected long getSizeOnDevice() throws DMLRuntimeException { + long GPUSize = 0; + long rlen = mat.getNumRows(); + long clen = mat.getNumColumns(); + long nnz = mat.getNnz(); + + if(LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) { + GPUSize = CSRPointer.estimateSize(nnz, rlen); + } + else { + GPUSize = getDoubleSizeOf(rlen * clen); + } + return GPUSize; + } + + void copyFromHostToDevice() throws DMLRuntimeException { + LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext()); + long start=0; + if (DMLScript.STATISTICS) start = System.nanoTime(); + + MatrixBlock tmp = mat.acquireRead(); + if(tmp.isInSparseFormat()) { + + int rowPtr[] = null; + int colInd[] = null; + double[] values = null; + + tmp.recomputeNonZeros(); + long nnz = tmp.getNonZeros(); + mat.getMatrixCharacteristics().setNonZeros(nnz); + + SparseBlock block = tmp.getSparseBlock(); + boolean copyToDevice = true; + if(block == null && tmp.getNonZeros() == 0) { +// // Allocate empty block --> not necessary +// // To reproduce this, see org.apache.sysml.test.integration.applications.dml.ID3DMLTest +// rowPtr = new int[0]; +// colInd = new int[0]; +// values = new double[0]; + copyToDevice = false; + } + else if(block == null && tmp.getNonZeros() != 0) { + throw new DMLRuntimeException("Expected CP sparse block to be not null."); + } + else { + // CSR is the preferred format for cuSparse GEMM + // Converts MCSR and COO to CSR + SparseBlockCSR csrBlock = null; + long t0=0; + if (block instanceof 
SparseBlockCSR){ + csrBlock = (SparseBlockCSR)block; + } else if (block instanceof SparseBlockCOO) { + // TODO - should we do this on the GPU using cusparse<t>coo2csr() ? + if (DMLScript.STATISTICS) t0 = System.nanoTime(); + SparseBlockCOO cooBlock = (SparseBlockCOO)block; + csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values()); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet(); + } else if (block instanceof SparseBlockMCSR) { + if (DMLScript.STATISTICS) t0 = System.nanoTime(); + SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block; + csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size())); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet(); + } else { + throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations"); } + rowPtr = csrBlock.rowPointers(); + colInd = csrBlock.indexes(); + values = csrBlock.values(); + } + allocateSparseMatrixOnDevice(); + getGPUContext().recordBlockUsage(this); + + if(copyToDevice) { + CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values); } } + else { + double[] data = tmp.getDenseBlock(); + + if( data == null && tmp.getSparseBlock() != null ) + throw new DMLRuntimeException("Incorrect sparsity calculation"); + else if( data==null && tmp.getNonZeros() != 0 ) + throw new DMLRuntimeException("MatrixBlock is not allocated"); + else if( tmp.getNonZeros() == 0 ) + data = new double[tmp.getNumRows()*tmp.getNumColumns()]; + + // Copy dense block + allocateDenseMatrixOnDevice(); + getGPUContext().recordBlockUsage(this); + + cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), 
getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice); + } + + mat.release(); + + if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-start); + if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(1); + } + + public static int toIntExact(long l) throws DMLRuntimeException { + if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) { + throw new DMLRuntimeException("Cannot be cast to int:" + l); + } + return (int) l; } + protected void copyFromDeviceToHost() throws DMLRuntimeException { + LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext()); + if (getJcudaDenseMatrixPtr() != null && getJcudaSparseMatrixPtr() != null){ + throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated"); + } + + if(getJcudaDenseMatrixPtr() != null) { + long start=0; + if (DMLScript.STATISTICS) start = System.nanoTime(); + MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false); + tmp.allocateDenseBlock(); + double [] data = tmp.getDenseBlock(); + + cudaMemcpy(Pointer.to(data), getJcudaDenseMatrixPtr(), getDoubleSizeOf(data.length), cudaMemcpyDeviceToHost); + + tmp.recomputeNonZeros(); + mat.acquireModify(tmp); + mat.release(); + + if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-start); + if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1); + } + else if (getJcudaSparseMatrixPtr() != null){ + if(!LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) + throw new DMLRuntimeException("Block not in sparse format on host yet the device sparse matrix pointer is not null"); + + if(this.isSparseAndEmpty()){ + MatrixBlock tmp = new MatrixBlock(); // Empty Block + mat.acquireModify(tmp); + mat.release(); + } else { + long start=0; + if (DMLScript.STATISTICS) start = System.nanoTime(); + + int rows = toIntExact(mat.getNumRows()); + int cols = 
toIntExact(mat.getNumColumns()); + int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz); + int[] rowPtr = new int[rows + 1]; + int[] colInd = new int[nnz]; + double[] values = new double[nnz]; + CSRPointer.copyToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd, values); + + SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz); + MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock); + mat.acquireModify(tmp); + mat.release(); + if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start); + if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1); + } + } + else { + throw new DMLRuntimeException("Cannot copy from device to host as JCuda dense/sparse pointer is not allocated"); + } + dirty = false; + } + + /** * lazily clears the data associated with this {@link GPUObject} instance * @throws CacheException ? */ - public void clearData() throws CacheException { + public void clearData() throws DMLRuntimeException { clearData(false); } @@ -226,36 +819,38 @@ public abstract class GPUObject * @param eager whether to be done synchronously or asynchronously * @throws CacheException ? 
*/ - public void clearData(boolean eager) throws CacheException { - synchronized(evictionLock) { - JCudaContext.allocatedPointers.remove(this); - } - try { - deallocateMemoryOnDevice(eager); - } catch (DMLRuntimeException e) { - throw new CacheException(e); - } + public void clearData(boolean eager) throws DMLRuntimeException { + getGPUContext().removeRecordedUsage(this); + deallocateMemoryOnDevice(eager); + + } + + /** Pointer to dense matrix */ + public Pointer getJcudaDenseMatrixPtr() { + return jcudaDenseMatrixPtr; + } + + /** Pointer to sparse matrix */ + public CSRPointer getJcudaSparseMatrixPtr() { + return jcudaSparseMatrixPtr; } - - static Boolean evictionLock = new Boolean(true); - - protected static long getAvailableMemory() { - return GPUContext.currContext.getAvailableMemory(); - } - -// // Copying from device -> host occurs here -// // Called by MatrixObject's exportData -// public void exportData() throws CacheException { -// boolean isDeviceCopyModified = mat.getGPUObject() != null && mat.getGPUObject().isDeviceCopyModified; -// boolean isHostCopyUnavailable = mat.getMatrixBlock() == null || -// (mat.getMatrixBlock().getDenseBlock() == null && mat.getMatrixBlock().getSparseBlock() == null); -// -// if(mat.getGPUObject() != null && (isDeviceCopyModified || isHostCopyUnavailable)) { -// try { -// mat.getGPUObject().copyFromDeviceToHost(); -// } catch (DMLRuntimeException e) { -// throw new CacheException(e); -// } -// } -// } + + /** Whether this block is dirty on the GPU */ + public boolean isDirty() { + return dirty; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("GPUObject{"); + sb.append(", tensorShape=").append(Arrays.toString(tensorShape)); + sb.append(", dirty=").append(dirty); + sb.append(", readLocks=").append(readLocks); + sb.append(", sparse? 
").append(isSparse); + sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]"); + sb.append('}'); + return sb.toString(); + } + + } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java deleted file mode 100644 index bb73f4b..0000000 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.sysml.runtime.instructions.gpu.context; - -import jcuda.driver.JCudaDriver; -import jcuda.jcublas.JCublas2; -import jcuda.jcublas.cublasHandle; -import jcuda.jcudnn.JCudnn; -import jcuda.jcudnn.cudnnHandle; -import jcuda.jcusparse.JCusparse; -import jcuda.jcusparse.cusparseHandle; -import jcuda.runtime.JCuda; -import jcuda.runtime.cudaDeviceProp; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.sysml.conf.ConfigurationManager; -import org.apache.sysml.conf.DMLConfig; -import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; -import org.apache.sysml.utils.GPUStatistics; - -import java.util.ArrayList; -import java.util.concurrent.atomic.AtomicLong; - -import static jcuda.driver.JCudaDriver.cuDeviceGetCount; -import static jcuda.driver.JCudaDriver.cuInit; -import static jcuda.jcublas.JCublas2.cublasCreate; -import static jcuda.jcublas.JCublas2.cublasDestroy; -import static jcuda.jcudnn.JCudnn.cudnnCreate; -import static jcuda.jcudnn.JCudnn.cudnnDestroy; -import static jcuda.jcusparse.JCusparse.cusparseCreate; -import static jcuda.jcusparse.JCusparse.cusparseDestroy; -import static jcuda.runtime.JCuda.*; -import static jcuda.runtime.cudaError.cudaSuccess; - - -public class JCudaContext extends GPUContext { - - /** Synchronization object to make sure no allocations happen when something is being evicted from memory */ - public static final Object syncObj = new Object(); - private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName()); - - /** Global list of allocated {@link GPUObject} instances. This list must be accessed in a synchronized way */ - public static ArrayList<GPUObject> allocatedPointers = new ArrayList<GPUObject>(); - - // The minimum CUDA Compute capability needed for SystemML. - // After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported. 
- // If SystemML needs to run on an older card, this logic can be revisited. - final int MAJOR_REQUIRED = 3; - final int MINOR_REQUIRED = 0; - - /** The total number of cuda devices on this machine */ - public static int deviceCount = -1; - - /** enable this to print debug information before code pertaining to the GPU is executed */ - public static boolean DEBUG = false; - - /** total bytes available on currently active cude device, please be careful with its bookkeeping */ - AtomicLong deviceMemBytes = new AtomicLong(0); - - /** Stores the cached deviceProperties */ - private static cudaDeviceProp[] deviceProperties; - - // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application. - public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); - // Whether to invoke cudaMemGetInfo for available memory or rely on internal bookkeeping for memory info. - public boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.REFRESH_AVAILABLE_MEMORY_EVERY_TIME); - static { - long start = System.nanoTime(); - JCuda.setExceptionsEnabled(true); - JCudnn.setExceptionsEnabled(true); - JCublas2.setExceptionsEnabled(true); - JCusparse.setExceptionsEnabled(true); - JCudaDriver.setExceptionsEnabled(true); - cuInit(0); // Initialize the driver - - int deviceCountArray[] = { 0 }; - cuDeviceGetCount(deviceCountArray); // Obtain the number of devices - deviceCount = deviceCountArray[0]; - deviceProperties = new cudaDeviceProp[deviceCount]; - - LOG.info("Total number of GPUs on the machine: " + deviceCount); - int maxBlocks = getMaxBlocks(); - int maxThreadsPerBlock = getMaxThreadsPerBlock(); - long sharedMemPerBlock = getMaxSharedMemory(); - int[] device = {-1}; - cudaGetDevice(device); - LOG.info("Active CUDA device number : " + device[0]); - LOG.info("Max Blocks/Threads/SharedMem : " + maxBlocks + "/" + 
maxThreadsPerBlock + "/" + sharedMemPerBlock); - - GPUStatistics.cudaInitTime = System.nanoTime() - start; - } - - @Override - public long getAvailableMemory() { - if (REFRESH_AVAILABLE_MEMORY_EVERY_TIME) { - long free[] = {0}; - long total[] = {0}; - if (cudaMemGetInfo(free, total) == cudaSuccess) { - //long totalNumBytes = total[0]; - deviceMemBytes.set(free[0]); - } else { - throw new RuntimeException("ERROR: Unable to get memory information of the GPU."); - } - } - return (long) (deviceMemBytes.get()*GPU_MEMORY_UTILIZATION_FACTOR); - } - - @Override - public void ensureComputeCapability() throws DMLRuntimeException { - int[] devices = {-1}; - cudaGetDeviceCount(devices); - if (devices[0] == -1){ - throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices"); - } - boolean isComputeCapable = true; - for (int i=0; i<devices[0]; i++) { - cudaDeviceProp properties = getGPUProperties(i); - int major = properties.major; - int minor = properties.minor; - if (major < MAJOR_REQUIRED) { - isComputeCapable = false; - } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) { - isComputeCapable = false; - } - } - if (!isComputeCapable) { - throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." 
+ MINOR_REQUIRED); - } - } - - /** - * Gets the device properties for the active GPU (set with cudaSetDevice()) - * @return the device properties - */ - public static cudaDeviceProp getGPUProperties() { - int[] device = {-1}; - cudaGetDevice(device); // Get currently active device - return getGPUProperties(device[0]); - } - - /** - * Gets the device properties - * @param device the device number (on a machine with more than 1 GPU) - * @return the device properties - */ - public static cudaDeviceProp getGPUProperties(int device){ - if (deviceProperties[device] == null) { - cudaDeviceProp properties = new cudaDeviceProp(); - cudaGetDeviceProperties(properties, device); - deviceProperties[device] = properties; - } - return deviceProperties[device]; - } - - - /** - * Gets the maximum number of threads per block for "active" GPU - * @return the maximum number of threads per block - */ - public static int getMaxThreadsPerBlock() { - cudaDeviceProp deviceProps = getGPUProperties(); - return deviceProps.maxThreadsPerBlock; - } - - /** - * Gets the maximum number of blocks supported by the active cuda device - * @return the maximum number of blocks supported - */ - public static int getMaxBlocks() { - cudaDeviceProp deviceProp = getGPUProperties(); - return deviceProp.maxGridSize[0]; - } - - /** - * Gets the shared memory per block supported by the active cuda device - * @return the shared memory per block - */ - public static long getMaxSharedMemory() { - cudaDeviceProp deviceProp = getGPUProperties(); - return deviceProp.sharedMemPerBlock; - } - - /** - * Gets the warp size supported by the active cuda device - * @return the warp size - */ - public static int getWarpSize() { - cudaDeviceProp deviceProp = getGPUProperties(); - return deviceProp.warpSize; - } - - /** - * Gets the available memory and then adds value to it - * @param v the value to add - * @return the current available memory before adding value to it - */ - public long getAndAddAvailableMemory(long v){ - 
return deviceMemBytes.getAndAdd(v); - } - - public JCudaContext() throws DMLRuntimeException { - if(isGPUContextCreated) { - // Wait until it is deleted. This case happens during multi-threaded testing. - // This also allows for multi-threaded execute calls - long startTime = System.currentTimeMillis(); - do { - try { - Thread.sleep(100); - } catch (InterruptedException e) {} - } while(isGPUContextCreated && (System.currentTimeMillis() - startTime) < 60000); - synchronized(isGPUContextCreated) { - if(GPUContext.currContext != null) { - throw new RuntimeException("Cannot create multiple JCudaContext. Waited for 10 min to close previous GPUContext"); - } - } - } - synchronized (isGPUContextCreated){ - GPUContext.currContext = this; - } - - long free [] = { 0 }; - long total [] = { 0 }; - long totalNumBytes = 0; - if(cudaMemGetInfo(free, total) == cudaSuccess) { - totalNumBytes = total[0]; - deviceMemBytes.set(free[0]); - } - else { - throw new RuntimeException("ERROR: Unable to get memory information of the GPU."); - } - LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB"); - LOG.info("Available GPU memory: " + (deviceMemBytes.get()*(1e-6)) + " MB"); - - long start = System.nanoTime(); - LibMatrixCUDA.cudnnHandle = new cudnnHandle(); - cudnnCreate(LibMatrixCUDA.cudnnHandle); - LibMatrixCUDA.cublasHandle = new cublasHandle(); - cublasCreate(LibMatrixCUDA.cublasHandle); - // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host - // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot. - // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE); - LibMatrixCUDA.cusparseHandle = new cusparseHandle(); - cusparseCreate(LibMatrixCUDA.cusparseHandle); - try { - LibMatrixCUDA.kernels = new JCudaKernels(); - } catch (DMLRuntimeException e) { - System.err.println("ERROR - Unable to initialize JCudaKernels. 
System in an inconsistent state"); - LibMatrixCUDA.kernels = null; - } - GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start; - } - - @Override - public void destroy() throws DMLRuntimeException { - if(currContext != null) { - synchronized(isGPUContextCreated) { - cudnnDestroy(LibMatrixCUDA.cudnnHandle); - cublasDestroy(LibMatrixCUDA.cublasHandle); - cusparseDestroy(LibMatrixCUDA.cusparseHandle); - currContext = null; - isGPUContextCreated = false; - } - } - else if(LibMatrixCUDA.cudnnHandle != null || LibMatrixCUDA.cublasHandle != null) { - throw new DMLRuntimeException("Error while destroying the GPUContext"); - } - } - -}
