systemml git commit: [SYSTEMML-445] Added two-step strategy to deal with potential fragmentation on GPU

niketanpansare Mon, 29 Jan 2018 14:59:33 -0800

Repository: systemml
Updated Branches:
  refs/heads/master 87d7fee73 -> 9b270d61a



[SYSTEMML-445] Added two-step strategy to deal with potential fragmentation on 
GPU

Closes #669.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/9b270d61
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/9b270d61
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/9b270d61

Branch: refs/heads/master
Commit: 9b270d61a16cab35b4cb66bbae36f09c5d738289
Parents: 87d7fee
Author: Niketan Pansare <[email protected]>
Authored: Mon Jan 29 14:56:42 2018 -0800
Committer: Niketan Pansare <[email protected]>
Committed: Mon Jan 29 14:56:42 2018 -0800

----------------------------------------------------------------------
 .../instructions/gpu/context/GPUContext.java    | 96 ++++++++++++++++----
 .../org/apache/sysml/utils/GPUStatistics.java   | 10 +-
 2 files changed, 85 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/9b270d61/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 4b47117..311e2a7 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -292,6 +292,9 @@ public class GPUContext {
        public Pointer allocate(String instructionName, long size, int 
statsCount) throws DMLRuntimeException {
                long t0 = 0, t1 = 0, end = 0;
                Pointer A;
+               if(size < 0) {
+                       throw new DMLRuntimeException("Cannot allocate memory 
of size " + size);
+               }
                if (freeCUDASpaceMap.containsKey(size)) {
                        if (LOG.isTraceEnabled()) {
                                LOG.trace(
@@ -321,7 +324,41 @@ public class GPUContext {
                                t0 = System.nanoTime();
                        ensureFreeSpace(instructionName, size);
                        A = new Pointer();
-                       cudaMalloc(A, size);
+                       try {
+                               cudaMalloc(A, size);
+                       } catch(jcuda.CudaException e) {
+                               if(!DMLScript.EAGER_CUDA_FREE) {
+                                       // Strategy to avoid memory allocation 
due to potential fragmentation (a rare event):
+                                       // Step 1. First clear up lazy matrices 
and try cudaMalloc again.
+                                       // Step 2. Even if the issue persists, 
then evict all the allocated GPU objects and and try cudaMalloc again.
+                                       // After Step 2, SystemML will hold no 
pointers on GPU and the hope is that cudaMalloc will start afresh 
+                                       // by allocating objects sequentially 
with no holes.
+                                       
+                                       // Step 1:
+                                       LOG.debug("Eagerly deallocating 
rmvar-ed matrices to avoid memory allocation error due to potential 
fragmentation.");
+                                       clearFreeCUDASpaceMap(instructionName, 
-1);
+                                       try {
+                                               cudaMalloc(A, size);
+                                       } catch(jcuda.CudaException e1) {
+                                               // Step 2:
+                                               
GPUStatistics.cudaForcedClearUnpinnedMatCount.add(1);
+                                               LOG.warn("Eagerly deallocating 
unpinned matrices to avoid memory allocation error due to potential 
fragmentation. "
+                                                               + "If you see 
this warning often, we recommend that you set systemml.gpu.eager.cudaFree 
configuration property to true");
+                                               for(GPUObject toBeRemoved : 
allocatedGPUObjects) {
+                                                       if 
(!toBeRemoved.isLocked()) {
+                                                               if 
(toBeRemoved.dirty) {
+                                                                       
toBeRemoved.copyFromDeviceToHost(instructionName, true);
+                                                               }
+                                                               
toBeRemoved.clearData(true);
+                                                       }
+                                               }
+                                               cudaMalloc(A, size);
+                                       }
+                               }
+                               else {
+                                       throw new DMLRuntimeException("Unable 
to allocate memory of size " + size + " using cudaMalloc", e);
+                               }
+                       }
                        if (DMLScript.STATISTICS)
                                
GPUStatistics.cudaAllocTime.add(System.nanoTime() - t0);
                        if (DMLScript.STATISTICS)
@@ -464,6 +501,44 @@ public class GPUContext {
        protected void evict(final long GPUSize) throws DMLRuntimeException {
                evict(null, GPUSize);
        }
+       
+       /**
+        * Release the set of free blocks maintained in a 
GPUObject.freeCUDASpaceMap to free up space
+        * 
+        * @param instructionName name of the instruction for which performance 
measurements are made
+        * @param neededSize      desired size to be freed up on the GPU (-1 if 
we want to eagerly free up all the blocks)
+        * @throws DMLRuntimeException If no reusable memory blocks to free up 
or if not enough matrix blocks with zero locks on them.
+        */
+       protected void clearFreeCUDASpaceMap(String instructionName,  final 
long neededSize) throws DMLRuntimeException {
+               if(neededSize < 0) {
+                       GPUStatistics.cudaForcedClearLazyFreedMatCount.add(1);
+                       while(freeCUDASpaceMap.size() > 0) {
+                               Entry<Long, Set<Pointer>> toFreeListPair = 
freeCUDASpaceMap.removeAndGetLRUEntry();
+                               
freeCUDASpaceMap.remove(toFreeListPair.getKey());
+                               for(Pointer toFree : toFreeListPair.getValue()) 
{
+                                       cudaFreeHelper(instructionName, toFree, 
true);
+                               }
+                       }
+               }
+               else {
+                       LRUCacheMap<Long, Set<Pointer>> lruCacheMap = 
freeCUDASpaceMap;
+                       while (lruCacheMap.size() > 0) {
+                               if (neededSize <= getAvailableMemory())
+                                       break;
+                               Map.Entry<Long, Set<Pointer>> toFreeListPair = 
lruCacheMap.removeAndGetLRUEntry();
+                               Set<Pointer> toFreeList = 
toFreeListPair.getValue();
+                               Long size = toFreeListPair.getKey();
+       
+                               Iterator<Pointer> it = toFreeList.iterator(); 
// at this point, freeList should have at least one element
+                               Pointer toFree = it.next();
+                               it.remove();
+       
+                               if (toFreeList.isEmpty())
+                                       lruCacheMap.remove(size);
+                               cudaFreeHelper(instructionName, toFree, true);
+                       }
+               }
+       }
 
        /**
         * Memory on the GPU is tried to be freed up until either a chunk of 
needed size is freed up
@@ -487,25 +562,8 @@ public class GPUContext {
                if (LOG.isDebugEnabled()) {
                        printMemoryInfo("EVICTION_CUDA_FREE_SPACE");
                }
-               
-               // Release the set of free blocks maintained in a 
GPUObject.freeCUDASpaceMap
-               // to free up space
-               LRUCacheMap<Long, Set<Pointer>> lruCacheMap = freeCUDASpaceMap;
-               while (lruCacheMap.size() > 0) {
-                       if (neededSize <= getAvailableMemory())
-                               break;
-                       Map.Entry<Long, Set<Pointer>> toFreeListPair = 
lruCacheMap.removeAndGetLRUEntry();
-                       Set<Pointer> toFreeList = toFreeListPair.getValue();
-                       Long size = toFreeListPair.getKey();
-
-                       Iterator<Pointer> it = toFreeList.iterator(); // at 
this point, freeList should have at least one element
-                       Pointer toFree = it.next();
-                       it.remove();
 
-                       if (toFreeList.isEmpty())
-                               lruCacheMap.remove(size);
-                       cudaFreeHelper(instructionName, toFree, true);
-               }
+               clearFreeCUDASpaceMap(instructionName,  neededSize);
 
                if (neededSize <= getAvailableMemory())
                        return;

http://git-wip-us.apache.org/repos/asf/systemml/blob/9b270d61/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java 
b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index 33ab953..f671d36 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -57,6 +57,8 @@ public class GPUStatistics {
        public static LongAdder cudaToDevCount = new LongAdder();
        public static LongAdder cudaFromDevCount = new LongAdder();
        public static LongAdder cudaEvictionCount = new LongAdder();
+       public static LongAdder cudaForcedClearLazyFreedMatCount = new 
LongAdder();
+       public static LongAdder cudaForcedClearUnpinnedMatCount = new 
LongAdder();
 
        // Per instruction miscellaneous timers.
        // Used to record events in a CP Heavy Hitter instruction and
@@ -89,6 +91,8 @@ public class GPUStatistics {
                cudaToDevCount.reset();
                cudaFromDevCount.reset();
                cudaEvictionCount.reset();
+               cudaForcedClearLazyFreedMatCount.reset();
+               cudaForcedClearUnpinnedMatCount.reset();
                resetMiscTimers();
        }
 
@@ -193,14 +197,16 @@ public class GPUStatistics {
                                + String.format("%.3f", 
cudaMemSet0Time.longValue()*1e-9) + "/"
                                + String.format("%.3f", 
cudaToDevTime.longValue()*1e-9) + "/"
                                + String.format("%.3f", 
cudaFromDevTime.longValue()*1e-9)  + " sec.\n");
-               sb.append("GPU mem tx count 
(alloc/dealloc/set0/toDev/fromDev/evict):\t"
+               sb.append("GPU mem tx count 
(alloc/dealloc/set0/toDev/fromDev/evict/forcedEvict(lazy/unpinned)):\t"
                                + cudaAllocCount.longValue() + "/"
                                + cudaDeAllocCount.longValue() + "/"
                                + cudaMemSet0Count.longValue() + "/"
                                + cudaSparseConversionCount.longValue() + "/"
                                + cudaToDevCount.longValue() + "/"
                                + cudaFromDevCount.longValue() + "/"
-                               + cudaEvictionCount.longValue() + ".\n");
+                               + cudaEvictionCount.longValue() + "/("
+                               + cudaForcedClearLazyFreedMatCount.longValue() 
+ "/"
+                               + cudaForcedClearUnpinnedMatCount.longValue() + 
").\n");
                sb.append("GPU conversion time  
(sparseConv/sp2dense/dense2sp):\t"
                                + String.format("%.3f", 
cudaSparseConversionTime.longValue()*1e-9) + "/"
                                + String.format("%.3f", 
cudaSparseToDenseTime.longValue()*1e-9) + "/"

systemml git commit: [SYSTEMML-445] Added two-step strategy to deal with potential fragmentation on GPU

Reply via email to