Repository: systemml Updated Branches: refs/heads/master 87d7fee73 -> 9b270d61a
[SYSTEMML-445] Added two-step strategy to deal with potential fragmentation on GPU Closes #669. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/9b270d61 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/9b270d61 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/9b270d61 Branch: refs/heads/master Commit: 9b270d61a16cab35b4cb66bbae36f09c5d738289 Parents: 87d7fee Author: Niketan Pansare <[email protected]> Authored: Mon Jan 29 14:56:42 2018 -0800 Committer: Niketan Pansare <[email protected]> Committed: Mon Jan 29 14:56:42 2018 -0800 ---------------------------------------------------------------------- .../instructions/gpu/context/GPUContext.java | 96 ++++++++++++++++---- .../org/apache/sysml/utils/GPUStatistics.java | 10 +- 2 files changed, 85 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/9b270d61/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index 4b47117..311e2a7 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -292,6 +292,9 @@ public class GPUContext { public Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException { long t0 = 0, t1 = 0, end = 0; Pointer A; + if(size < 0) { + throw new DMLRuntimeException("Cannot allocate memory of size " + size); + } if (freeCUDASpaceMap.containsKey(size)) { if (LOG.isTraceEnabled()) { LOG.trace( @@ -321,7 +324,41 @@ public class GPUContext { t0 = System.nanoTime(); ensureFreeSpace(instructionName, size); A = new Pointer(); - cudaMalloc(A, size); + try { + cudaMalloc(A, size); + } catch(jcuda.CudaException e) { + if(!DMLScript.EAGER_CUDA_FREE) { + // Strategy to avoid memory allocation due to potential fragmentation (a rare event): + // Step 1. First clear up lazy matrices and try cudaMalloc again. + // Step 2. Even if the issue persists, then evict all the allocated GPU objects and and try cudaMalloc again. + // After Step 2, SystemML will hold no pointers on GPU and the hope is that cudaMalloc will start afresh + // by allocating objects sequentially with no holes. + + // Step 1: + LOG.debug("Eagerly deallocating rmvar-ed matrices to avoid memory allocation error due to potential fragmentation."); + clearFreeCUDASpaceMap(instructionName, -1); + try { + cudaMalloc(A, size); + } catch(jcuda.CudaException e1) { + // Step 2: + GPUStatistics.cudaForcedClearUnpinnedMatCount.add(1); + LOG.warn("Eagerly deallocating unpinned matrices to avoid memory allocation error due to potential fragmentation. " + + "If you see this warning often, we recommend that you set systemml.gpu.eager.cudaFree configuration property to true"); + for(GPUObject toBeRemoved : allocatedGPUObjects) { + if (!toBeRemoved.isLocked()) { + if (toBeRemoved.dirty) { + toBeRemoved.copyFromDeviceToHost(instructionName, true); + } + toBeRemoved.clearData(true); + } + } + cudaMalloc(A, size); + } + } + else { + throw new DMLRuntimeException("Unable to allocate memory of size " + size + " using cudaMalloc", e); + } + } if (DMLScript.STATISTICS) GPUStatistics.cudaAllocTime.add(System.nanoTime() - t0); if (DMLScript.STATISTICS) @@ -464,6 +501,44 @@ public class GPUContext { protected void evict(final long GPUSize) throws DMLRuntimeException { evict(null, GPUSize); } + + /** + * Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap to free up space + * + * @param instructionName name of the instruction for which performance measurements are made + * @param neededSize desired size to be freed up on the GPU (-1 if we want to eagerly free up all the blocks) + * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them. + */ + protected void clearFreeCUDASpaceMap(String instructionName, final long neededSize) throws DMLRuntimeException { + if(neededSize < 0) { + GPUStatistics.cudaForcedClearLazyFreedMatCount.add(1); + while(freeCUDASpaceMap.size() > 0) { + Entry<Long, Set<Pointer>> toFreeListPair = freeCUDASpaceMap.removeAndGetLRUEntry(); + freeCUDASpaceMap.remove(toFreeListPair.getKey()); + for(Pointer toFree : toFreeListPair.getValue()) { + cudaFreeHelper(instructionName, toFree, true); + } + } + } + else { + LRUCacheMap<Long, Set<Pointer>> lruCacheMap = freeCUDASpaceMap; + while (lruCacheMap.size() > 0) { + if (neededSize <= getAvailableMemory()) + break; + Map.Entry<Long, Set<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry(); + Set<Pointer> toFreeList = toFreeListPair.getValue(); + Long size = toFreeListPair.getKey(); + + Iterator<Pointer> it = toFreeList.iterator(); // at this point, freeList should have at least one element + Pointer toFree = it.next(); + it.remove(); + + if (toFreeList.isEmpty()) + lruCacheMap.remove(size); + cudaFreeHelper(instructionName, toFree, true); + } + } + } /** * Memory on the GPU is tried to be freed up until either a chunk of needed size is freed up @@ -487,25 +562,8 @@ public class GPUContext { if (LOG.isDebugEnabled()) { printMemoryInfo("EVICTION_CUDA_FREE_SPACE"); } - - // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap - // to free up space - LRUCacheMap<Long, Set<Pointer>> lruCacheMap = freeCUDASpaceMap; - while (lruCacheMap.size() > 0) { - if (neededSize <= getAvailableMemory()) - break; - Map.Entry<Long, Set<Pointer>> toFreeListPair = lruCacheMap.removeAndGetLRUEntry(); - Set<Pointer> toFreeList = toFreeListPair.getValue(); - Long size = toFreeListPair.getKey(); - - Iterator<Pointer> it = toFreeList.iterator(); // at this point, freeList should have at least one element - Pointer toFree = it.next(); - it.remove(); - if (toFreeList.isEmpty()) - lruCacheMap.remove(size); - cudaFreeHelper(instructionName, toFree, true); - } + clearFreeCUDASpaceMap(instructionName, neededSize); if (neededSize <= getAvailableMemory()) return; http://git-wip-us.apache.org/repos/asf/systemml/blob/9b270d61/src/main/java/org/apache/sysml/utils/GPUStatistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java index 33ab953..f671d36 100644 --- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java +++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java @@ -57,6 +57,8 @@ public class GPUStatistics { public static LongAdder cudaToDevCount = new LongAdder(); public static LongAdder cudaFromDevCount = new LongAdder(); public static LongAdder cudaEvictionCount = new LongAdder(); + public static LongAdder cudaForcedClearLazyFreedMatCount = new LongAdder(); + public static LongAdder cudaForcedClearUnpinnedMatCount = new LongAdder(); // Per instruction miscellaneous timers. // Used to record events in a CP Heavy Hitter instruction and @@ -89,6 +91,8 @@ public class GPUStatistics { cudaToDevCount.reset(); cudaFromDevCount.reset(); cudaEvictionCount.reset(); + cudaForcedClearLazyFreedMatCount.reset(); + cudaForcedClearUnpinnedMatCount.reset(); resetMiscTimers(); } @@ -193,14 +197,16 @@ public class GPUStatistics { + String.format("%.3f", cudaMemSet0Time.longValue()*1e-9) + "/" + String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "/" + String.format("%.3f", cudaFromDevTime.longValue()*1e-9) + " sec.\n"); - sb.append("GPU mem tx count (alloc/dealloc/set0/toDev/fromDev/evict):\t" + sb.append("GPU mem tx count (alloc/dealloc/set0/toDev/fromDev/evict/forcedEvict(lazy/unpinned)):\t" + cudaAllocCount.longValue() + "/" + cudaDeAllocCount.longValue() + "/" + cudaMemSet0Count.longValue() + "/" + cudaSparseConversionCount.longValue() + "/" + cudaToDevCount.longValue() + "/" + cudaFromDevCount.longValue() + "/" - + cudaEvictionCount.longValue() + ".\n"); + + cudaEvictionCount.longValue() + "/(" + + cudaForcedClearLazyFreedMatCount.longValue() + "/" + + cudaForcedClearUnpinnedMatCount.longValue() + ").\n"); sb.append("GPU conversion time (sparseConv/sp2dense/dense2sp):\t" + String.format("%.3f", cudaSparseConversionTime.longValue()*1e-9) + "/" + String.format("%.3f", cudaSparseToDenseTime.longValue()*1e-9) + "/"
