[systemds] branch main updated: [SYSTEMDS-3510] Size-oriented free lists in GPU lineage cache

arnabp20 Tue, 28 Mar 2023 05:21:57 -0700

This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 138638b1bf [SYSTEMDS-3510] Size-oriented free lists in GPU lineage 
cache
138638b1bf is described below

commit 138638b1bfadd2cae44dec73aa7cd7da735fbed0
Author: Arnab Phani <[email protected]>
AuthorDate: Tue Mar 28 14:20:33 2023 +0200

    [SYSTEMDS-3510] Size-oriented free lists in GPU lineage cache
    
    This patch splits the weighted queue of the free pointers in the GPU
    cache into multiple free lists, one for each allocated size. The entries
    in the free lists are ordered by a scoring function of compute time.
    When malloc is called, we recycle one free pointer with less reuse
    potential. We also extended the statistics to report recycling count.
    
    Closes #1797
---
 .../gpu/context/GPUMemoryEviction.java             |  4 +-
 .../instructions/gpu/context/GPUMemoryManager.java | 69 +++++++++--------
 .../runtime/lineage/LineageCacheStatistics.java    | 22 ++++++
 .../runtime/lineage/LineageGPUCacheEviction.java   | 88 ++++++++++++++++------
 .../java/org/apache/sysds/utils/Statistics.java    |  1 +
 5 files changed, 122 insertions(+), 62 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
index 3676d337ab..3a55fd906b 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
@@ -135,13 +135,13 @@ public class GPUMemoryEviction implements Runnable
                                
LineageCacheStatistics.incrementGpuAsyncEvicts();
                        }
                        count++;
-               }*/
+               }
 
                // Add the locked entries back to the eviction queue
                if (!lockedOrLiveEntries.isEmpty())
                        
LineageGPUCacheEviction.addEntryList(lockedOrLiveEntries);
                
                if (DMLScript.STATISTICS) //TODO: dedicated statistics for 
lineage
-                       GPUStatistics.cudaEvictTime.add(System.nanoTime() - t0);
+                       GPUStatistics.cudaEvictTime.add(System.nanoTime() - 
t0); */
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 4b0a67cbb4..526b477a4f 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -290,37 +290,17 @@ public class GPUMemoryManager {
                        }
                }
                
-               // Step 5: Evict gpu intermediates from lineage cache
-               // This can create holes. However, evicting rmVarpending 
objects might right away make the required space
-               // TODO: Size dependent eviction logic (CostNSize is one)
+               // Step 5.1: Recycle, delete or evict gpu intermediates from 
lineage cache
                if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
-                       long currentAvailableMemory = 
allocator.getAvailableMemory();
-                       List<LineageCacheEntry> lockedAndLiveList = new 
ArrayList<>();
                        long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
-                       while (A == null && 
!LineageGPUCacheEviction.isGPUCacheEmpty()) {
-                               LineageCacheEntry le = 
LineageGPUCacheEviction.pollFirstEntry();
-
-                               // First remove the gpuobj chains that don't 
contain any live and dirty objects.
-                               // TODO: Handle dirty objects separately. Copy 
them back to the host
-
-                               // Check and continue if the pointer is live
-                               // Note: all locked entries are live
-                               Pointer ptr = le.getGPUPointer();
-                               if 
(LineageGPUCacheEviction.probeLiveCachedPointers(ptr)) {
-                                       lockedAndLiveList.add(le);
-                                       continue;
-                               }
-                               currentAvailableMemory += 
getSizeAllocatedGPUPointer(ptr);
-
-                               if (!LineageCacheConfig.GPU2HOSTEVICTION) {
+                       // Recycle a cached pointer if exactly matches the 
required size
+                       LineageCacheEntry le = 
LineageGPUCacheEviction.pollFirstFreeEntry(size);
+                       if (le != null) {
+                               if(!LineageCacheConfig.GPU2HOSTEVICTION) {
+                                       A = le.getGPUPointer(); //recycle
                                        
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, false);
-                                       // Recycle the pointer if matches the 
required size
-                                       if (getSizeAllocatedGPUPointer(ptr) == 
size) {
-                                               A = ptr;
-                                               continue;
-                                       }
-                                       else
-                                               free(opcode, ptr, true);
+                                       if (DMLScript.STATISTICS)
+                                               
LineageCacheStatistics.incrementGpuRecycle();
                                }
                                /*else {
                                        // Copy from device cache to CPU 
lineage cache if not already copied
@@ -328,18 +308,35 @@ public class GPUMemoryManager {
                                        if(DMLScript.STATISTICS)
                                                
LineageCacheStatistics.incrementGpuSyncEvicts();
                                }*/
-
-                               if(currentAvailableMemory >= size)
-                                       // This doesn't guarantee allocation 
due to fragmented freed memory
-                                       A = cudaMallocNoWarn(tmpA, size, null); 
                        }
+                       // TODO: Handle live (dirty) objects separately. Copy 
them back to the host
 
-                       // Add the locked entries back to the eviction queue
-                       if (!lockedAndLiveList.isEmpty())
-                               
LineageGPUCacheEviction.addEntryList(lockedAndLiveList);
                        if (DMLScript.STATISTICS)
                                
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
+               }
 
+               // Step 5.2: Use a non-exact sized pointer
+               if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
+                       long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
+                       long freedSize = 0;
+                       while (A == null && 
!LineageGPUCacheEviction.isGPUCacheFreeQEmpty()) {
+                               // Deallocate a non-exact matched entry from 
the cached free lists
+                               LineageCacheEntry le = 
LineageGPUCacheEviction.pollFistFreeNotExact(size);
+                               if(le != null) {
+                                       freedSize += 
getSizeAllocatedGPUPointer(le.getGPUPointer());
+                                       
if(!LineageCacheConfig.GPU2HOSTEVICTION) {
+                                               
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, false);
+                                               
guardedCudaFree(le.getGPUPointer()); //free
+                                               if (DMLScript.STATISTICS)
+                                                       
LineageCacheStatistics.incrementGpuDel();
+                                       }
+                                       // TODO: else evict to the host cache
+                                       if (freedSize > size)
+                                               A = cudaMallocNoWarn(tmpA, 
size, "recycle non-exact match of lineage cache");
+                               }
+                       }
+                       if (DMLScript.STATISTICS)
+                               
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
                        if (A == null)
                                LOG.warn("cudaMalloc failed after Lineage GPU 
cache eviction.");
                }
@@ -497,8 +494,10 @@ public class GPUMemoryManager {
         * @throws DMLRuntimeException if error occurs
         */
        public void free(String opcode, Pointer toFree, boolean eager) throws 
DMLRuntimeException {
+               // Do not deallocate if the pointer is cached.
                if (!LineageCacheConfig.ReuseCacheType.isNone()
                        && 
LineageGPUCacheEviction.probeLiveCachedPointers(toFree)) {
+                       // Move the pointer to the free list inside lineage 
cache
                        LineageGPUCacheEviction.decrementLiveCount(toFree);
                        return;
                }
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
index 8ae0831f41..182e04cfb6 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
@@ -45,6 +45,8 @@ public class LineageCacheStatistics {
        private static final LongAdder _numHitsGpu      = new LongAdder();
        private static final LongAdder _numAsyncEvictGpu= new LongAdder();
        private static final LongAdder _numSyncEvictGpu = new LongAdder();
+       private static final LongAdder _numRecycleGpu   = new LongAdder();
+       private static final LongAdder _numDelGpu       = new LongAdder();
        private static final LongAdder _evtimeGpu       = new LongAdder();
        // Below entries are specific to Spark instructions
        private static final LongAdder _numHitsRdd      = new LongAdder();
@@ -70,6 +72,8 @@ public class LineageCacheStatistics {
                _numHitsGpu.reset();
                _numAsyncEvictGpu.reset();
                _numSyncEvictGpu.reset();
+               _numRecycleGpu.reset();
+               _numDelGpu.reset();
                _numHitsRdd.reset();
                _numHitsSparkActions.reset();
                _numHitsRddPersist.reset();
@@ -206,6 +210,16 @@ public class LineageCacheStatistics {
                _numSyncEvictGpu.increment();
        }
 
+       public static void incrementGpuRecycle() {
+               // Number of gpu cached pointers recycled
+               _numRecycleGpu.increment();
+       }
+
+       public static void incrementGpuDel() {
+               // Number of gpu cached pointers deleted to make space
+               _numDelGpu.increment();
+       }
+
        public static void incrementEvictTimeGpu(long delta) {
                // Total time spent on evicting from GPU to main memory or 
deleting from GPU lineage cache
                _evtimeGpu.add(delta);
@@ -288,6 +302,14 @@ public class LineageCacheStatistics {
                return sb.toString();
        }
 
+       public static String displayGpuPointerStats() {
+               StringBuilder sb = new StringBuilder();
+               sb.append(_numRecycleGpu.longValue());
+               sb.append("/");
+               sb.append(_numDelGpu.longValue());
+               return sb.toString();
+       }
+
        public static String displayGpuEvictTime() {
                StringBuilder sb = new StringBuilder();
                sb.append(String.format("%.3f", 
((double)_evtimeGpu.longValue())/1000000000)); //in sec
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index ed0d85a712..b427f81097 100644
--- 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -19,8 +19,11 @@
 
 package org.apache.sysds.runtime.lineage;
 
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.concurrent.ExecutorService;
@@ -40,15 +43,19 @@ public class LineageGPUCacheEviction
        public static ExecutorService gpuEvictionThread = null;
 
        // Weighted queue of freed pointers.
-       private static TreeSet<LineageCacheEntry> weightedQueue = new 
TreeSet<>(LineageCacheConfig.LineageCacheComparator);
+       private static HashMap<Long, TreeSet<LineageCacheEntry>> freeQueues = 
new HashMap<>();
+
+       // Pointers and live counts associated
        private static HashMap<Pointer, Integer> livePointers = new HashMap<>();
+
+       // All cached pointers mapped to the corresponding lineage cache entries
        private static HashMap<Pointer, LineageCacheEntry> GPUCacheEntries = 
new HashMap<>();
 
        protected static void resetEviction() {
                _currentCacheSize = 0;
                gpuEvictionThread = null;
                //LineageCacheConfig.CONCURRENTGPUEVICTION = false;
-               weightedQueue.clear();
+               freeQueues.clear();
                livePointers.clear();
                GPUCacheEntries.clear();
        }
@@ -66,15 +73,19 @@ public class LineageGPUCacheEviction
        }
 
        protected static void incrementLiveCount(Pointer ptr) {
-               //TODO: move from free list to live list
+               // Move from free list (if exists) to live list
                if(livePointers.merge(ptr, 1, Integer::sum) == 1)
-                       weightedQueue.remove(GPUCacheEntries.get(ptr));
+                       
freeQueues.get(getPointerSize(ptr)).remove(GPUCacheEntries.get(ptr));
        }
 
        public static void decrementLiveCount(Pointer ptr) {
-               // Decrement and remove if the live counte becomes 0
-               if(livePointers.compute(ptr, (k, v) -> v==1 ? null : v-1) == 
null)
-                       weightedQueue.add(GPUCacheEntries.get(ptr));
+               // Decrement and move to the free list if the live count 
becomes 0
+               if(livePointers.compute(ptr, (k, v) -> v==1 ? null : v-1) == 
null) {
+                       long size = getPointerSize(ptr);
+                       if (!freeQueues.containsKey(size))
+                               freeQueues.put(size, new 
TreeSet<>(LineageCacheConfig.LineageCacheComparator));
+                       freeQueues.get(size).add(GPUCacheEntries.get(ptr));
+               }
        }
 
        public static boolean probeLiveCachedPointers(Pointer ptr) {
@@ -122,29 +133,55 @@ public class LineageGPUCacheEviction
 
                // TODO: Separate removelist, starttimestamp, score and weights 
from CPU cache
                entry.computeScore(LineageCacheEviction._removelist);
-               //weightedQueue.add(entry);
+               // The pointer must be live at this moment
                livePointers.put(entry.getGPUPointer(), 1);
                GPUCacheEntries.put(entry.getGPUPointer(), entry);
        }
        
        public static boolean isGPUCacheEmpty() {
-               return weightedQueue.isEmpty();
+               return (freeQueues.isEmpty() && livePointers.isEmpty());
        }
 
-       public static LineageCacheEntry pollFirstEntry() {
-               return weightedQueue.pollFirst();
+       public static boolean isGPUCacheFreeQEmpty() {
+               return freeQueues.isEmpty();
        }
 
-       public static LineageCacheEntry peekFirstEntry() {
-               return weightedQueue.first();
+       // Remove and return the cached free pointer with exact size
+       public static LineageCacheEntry pollFirstFreeEntry(long size) {
+               TreeSet<LineageCacheEntry> freeList = freeQueues.get(size);
+               if (freeList != null && freeList.isEmpty())
+                       freeQueues.remove(size); //remove if empty
+
+               // Poll the first pointer from the queue
+               if (freeList != null && !freeList.isEmpty())
+                       return freeList.pollFirst();
+               return null;
        }
-       
-       public static void removeEntry(LineageCacheEntry e) {
-               weightedQueue.remove(e);
+
+       // Remove and return the minimum non-exact sized pointer.
+       // If no bigger sized pointer available, return one from the highest 
sized list
+       public static LineageCacheEntry pollFistFreeNotExact(long size) {
+               // Assuming no exact match
+               List<Long> sortedSizes = new ArrayList<>(freeQueues.keySet());
+               // If the asked size is bigger than all, return a pointer of 
the highest size available
+               long maxSize = sortedSizes.get(sortedSizes.size()-1);
+               if (size > maxSize)
+                       return pollFirstFreeEntry(maxSize);
+               // Return a pointer of the next biggest size
+               for (long fSize : sortedSizes) {
+                       if (fSize >= size)
+                               return pollFirstFreeEntry(fSize);
+               }
+               return null;
        }
 
-       public static void addEntryList(List<LineageCacheEntry> entryList) {
-               weightedQueue.addAll(entryList);
+       public static LineageCacheEntry peekFirstFreeEntry(long size) {
+               return freeQueues.get(size).first();
+       }
+       
+       public static void removeFreeEntry(LineageCacheEntry e) {
+               long size = getPointerSize(e.getGPUPointer());
+               freeQueues.get(size).remove(e);
        }
 
        //---------------- CACHE SPACE MANAGEMENT METHODS -----------------//
@@ -165,22 +202,23 @@ public class LineageGPUCacheEviction
        }
 
        public static int numPointersCached() {
-               return livePointers.size() + weightedQueue.size();
+               return livePointers.size() + 
freeQueues.values().stream().mapToInt(TreeSet::size).sum();
        }
 
        public static long totalMemoryCached() {
                long totLive = livePointers.keySet().stream()
                        .mapToLong(ptr -> 
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(ptr)).sum();
-               long totFree = weightedQueue.stream()
-                       .mapToLong(en -> 
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(en.getGPUPointer())).sum();
+               long totFree = 0;
+               for (Map.Entry<Long, TreeSet<LineageCacheEntry>> entry : 
freeQueues.entrySet())
+                       totFree += entry.getKey() * entry.getValue().size();
                return totLive + totFree;
        }
 
        public static Set<Pointer> getAllCachedPointers() {
-               //livePointers.keySet() + weightedQueue.stream().map()
-               Set<Pointer> cachedPointers = weightedQueue.stream()
-                       .map(LineageCacheEntry::getGPUPointer)
-                       .collect(Collectors.toSet());
+               Set<Pointer> cachedPointers = new HashSet<>();
+               for (Map.Entry<Long, TreeSet<LineageCacheEntry>> entry : 
freeQueues.entrySet())
+                       cachedPointers.addAll(entry.getValue().stream()
+                               
.map(LineageCacheEntry::getGPUPointer).collect(Collectors.toSet()));
                cachedPointers.addAll(livePointers.keySet());
                return cachedPointers;
        }
diff --git a/src/main/java/org/apache/sysds/utils/Statistics.java 
b/src/main/java/org/apache/sysds/utils/Statistics.java
index 89d8088734..02d72fad08 100644
--- a/src/main/java/org/apache/sysds/utils/Statistics.java
+++ b/src/main/java/org/apache/sysds/utils/Statistics.java
@@ -639,6 +639,7 @@ public class Statistics
                                sb.append("LinCache hits (Mem/FS/Del): \t" + 
LineageCacheStatistics.displayHits() + ".\n");
                                sb.append("LinCache MultiLevel (Ins/SB/Fn):" + 
LineageCacheStatistics.displayMultiLevelHits() + ".\n");
                                sb.append("LinCache GPU (Hit/Async/Sync): \t" + 
LineageCacheStatistics.displayGpuStats() + ".\n");
+                               sb.append("LinCache GPU (Recyc/Del): \t" + 
LineageCacheStatistics.displayGpuPointerStats() + ".\n");
                                sb.append("LinCache GPU evict time: \t" + 
LineageCacheStatistics.displayGpuEvictTime() + " sec.\n");
                                sb.append("LinCache Spark (Col/Loc/Dist): \t" + 
LineageCacheStatistics.displaySparkStats() + ".\n");
                                sb.append("LinCache writes (Mem/FS/Del): \t" + 
LineageCacheStatistics.displayWtrites() + ".\n");

[systemds] branch main updated: [SYSTEMDS-3510] Size-oriented free lists in GPU lineage cache

Reply via email to