This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 138638b1bf [SYSTEMDS-3510] Size-oriented free lists in GPU lineage
cache
138638b1bf is described below
commit 138638b1bfadd2cae44dec73aa7cd7da735fbed0
Author: Arnab Phani <[email protected]>
AuthorDate: Tue Mar 28 14:20:33 2023 +0200
[SYSTEMDS-3510] Size-oriented free lists in GPU lineage cache
This patch splits the weighted queue of the free pointers in the GPU
cache into multiple free lists, one for each allocated size. The entries
in the free lists are ordered by a scoring function of compute time.
When malloc is called, we recycle one free pointer with less reuse
potential. We also extended the statistics to report recycling count.
Closes #1797
---
.../gpu/context/GPUMemoryEviction.java | 4 +-
.../instructions/gpu/context/GPUMemoryManager.java | 69 +++++++++--------
.../runtime/lineage/LineageCacheStatistics.java | 22 ++++++
.../runtime/lineage/LineageGPUCacheEviction.java | 88 ++++++++++++++++------
.../java/org/apache/sysds/utils/Statistics.java | 1 +
5 files changed, 122 insertions(+), 62 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
index 3676d337ab..3a55fd906b 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
@@ -135,13 +135,13 @@ public class GPUMemoryEviction implements Runnable
LineageCacheStatistics.incrementGpuAsyncEvicts();
}
count++;
- }*/
+ }
// Add the locked entries back to the eviction queue
if (!lockedOrLiveEntries.isEmpty())
LineageGPUCacheEviction.addEntryList(lockedOrLiveEntries);
if (DMLScript.STATISTICS) //TODO: dedicated statistics for
lineage
- GPUStatistics.cudaEvictTime.add(System.nanoTime() - t0);
+ GPUStatistics.cudaEvictTime.add(System.nanoTime() -
t0); */
}
}
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 4b0a67cbb4..526b477a4f 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -290,37 +290,17 @@ public class GPUMemoryManager {
}
}
- // Step 5: Evict gpu intermediates from lineage cache
- // This can create holes. However, evicting rmVarpending
objects might right away make the required space
- // TODO: Size dependent eviction logic (CostNSize is one)
+ // Step 5.1: Recycle, delete or evict gpu intermediates from
lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
- long currentAvailableMemory =
allocator.getAvailableMemory();
- List<LineageCacheEntry> lockedAndLiveList = new
ArrayList<>();
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
- while (A == null &&
!LineageGPUCacheEviction.isGPUCacheEmpty()) {
- LineageCacheEntry le =
LineageGPUCacheEviction.pollFirstEntry();
-
- // First remove the gpuobj chains that don't
contain any live and dirty objects.
- // TODO: Handle dirty objects separately. Copy
them back to the host
-
- // Check and continue if the pointer is live
- // Note: all locked entries are live
- Pointer ptr = le.getGPUPointer();
- if
(LineageGPUCacheEviction.probeLiveCachedPointers(ptr)) {
- lockedAndLiveList.add(le);
- continue;
- }
- currentAvailableMemory +=
getSizeAllocatedGPUPointer(ptr);
-
- if (!LineageCacheConfig.GPU2HOSTEVICTION) {
+ // Recycle a cached pointer if exactly matches the
required size
+ LineageCacheEntry le =
LineageGPUCacheEviction.pollFirstFreeEntry(size);
+ if (le != null) {
+ if(!LineageCacheConfig.GPU2HOSTEVICTION) {
+ A = le.getGPUPointer(); //recycle
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, false);
- // Recycle the pointer if matches the
required size
- if (getSizeAllocatedGPUPointer(ptr) ==
size) {
- A = ptr;
- continue;
- }
- else
- free(opcode, ptr, true);
+ if (DMLScript.STATISTICS)
+
LineageCacheStatistics.incrementGpuRecycle();
}
/*else {
// Copy from device cache to CPU
lineage cache if not already copied
@@ -328,18 +308,35 @@ public class GPUMemoryManager {
if(DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuSyncEvicts();
}*/
-
- if(currentAvailableMemory >= size)
- // This doesn't guarantee allocation
due to fragmented freed memory
- A = cudaMallocNoWarn(tmpA, size, null);
}
+ // TODO: Handle live (dirty) objects separately. Copy
them back to the host
- // Add the locked entries back to the eviction queue
- if (!lockedAndLiveList.isEmpty())
-
LineageGPUCacheEviction.addEntryList(lockedAndLiveList);
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
+ }
+ // Step 5.2: Use a non-exact sized pointer
+ if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
+ long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+ long freedSize = 0;
+ while (A == null &&
!LineageGPUCacheEviction.isGPUCacheFreeQEmpty()) {
+ // Deallocate a non-exact matched entry from
the cached free lists
+ LineageCacheEntry le =
LineageGPUCacheEviction.pollFistFreeNotExact(size);
+ if(le != null) {
+ freedSize +=
getSizeAllocatedGPUPointer(le.getGPUPointer());
+
if(!LineageCacheConfig.GPU2HOSTEVICTION) {
+
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, false);
+
guardedCudaFree(le.getGPUPointer()); //free
+ if (DMLScript.STATISTICS)
+
LineageCacheStatistics.incrementGpuDel();
+ }
+ // TODO: else evict to the host cache
+ if (freedSize > size)
+ A = cudaMallocNoWarn(tmpA,
size, "recycle non-exact match of lineage cache");
+ }
+ }
+ if (DMLScript.STATISTICS)
+
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
if (A == null)
LOG.warn("cudaMalloc failed after Lineage GPU
cache eviction.");
}
@@ -497,8 +494,10 @@ public class GPUMemoryManager {
* @throws DMLRuntimeException if error occurs
*/
public void free(String opcode, Pointer toFree, boolean eager) throws
DMLRuntimeException {
+ // Do not deallocate if the pointer is cached.
if (!LineageCacheConfig.ReuseCacheType.isNone()
&&
LineageGPUCacheEviction.probeLiveCachedPointers(toFree)) {
+ // Move the pointer to the free list inside lineage
cache
LineageGPUCacheEviction.decrementLiveCount(toFree);
return;
}
diff --git
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
index 8ae0831f41..182e04cfb6 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
@@ -45,6 +45,8 @@ public class LineageCacheStatistics {
private static final LongAdder _numHitsGpu = new LongAdder();
private static final LongAdder _numAsyncEvictGpu= new LongAdder();
private static final LongAdder _numSyncEvictGpu = new LongAdder();
+ private static final LongAdder _numRecycleGpu = new LongAdder();
+ private static final LongAdder _numDelGpu = new LongAdder();
private static final LongAdder _evtimeGpu = new LongAdder();
// Below entries are specific to Spark instructions
private static final LongAdder _numHitsRdd = new LongAdder();
@@ -70,6 +72,8 @@ public class LineageCacheStatistics {
_numHitsGpu.reset();
_numAsyncEvictGpu.reset();
_numSyncEvictGpu.reset();
+ _numRecycleGpu.reset();
+ _numDelGpu.reset();
_numHitsRdd.reset();
_numHitsSparkActions.reset();
_numHitsRddPersist.reset();
@@ -206,6 +210,16 @@ public class LineageCacheStatistics {
_numSyncEvictGpu.increment();
}
+ public static void incrementGpuRecycle() {
+ // Number of gpu cached pointers recycled
+ _numRecycleGpu.increment();
+ }
+
+ public static void incrementGpuDel() {
+ // Number of gpu cached pointers deleted to make space
+ _numDelGpu.increment();
+ }
+
public static void incrementEvictTimeGpu(long delta) {
// Total time spent on evicting from GPU to main memory or
deleting from GPU lineage cache
_evtimeGpu.add(delta);
@@ -288,6 +302,14 @@ public class LineageCacheStatistics {
return sb.toString();
}
+ public static String displayGpuPointerStats() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(_numRecycleGpu.longValue());
+ sb.append("/");
+ sb.append(_numDelGpu.longValue());
+ return sb.toString();
+ }
+
public static String displayGpuEvictTime() {
StringBuilder sb = new StringBuilder();
sb.append(String.format("%.3f",
((double)_evtimeGpu.longValue())/1000000000)); //in sec
diff --git
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index ed0d85a712..b427f81097 100644
---
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -19,8 +19,11 @@
package org.apache.sysds.runtime.lineage;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
@@ -40,15 +43,19 @@ public class LineageGPUCacheEviction
public static ExecutorService gpuEvictionThread = null;
// Weighted queue of freed pointers.
- private static TreeSet<LineageCacheEntry> weightedQueue = new
TreeSet<>(LineageCacheConfig.LineageCacheComparator);
+ private static HashMap<Long, TreeSet<LineageCacheEntry>> freeQueues =
new HashMap<>();
+
+ // Pointers and live counts associated
private static HashMap<Pointer, Integer> livePointers = new HashMap<>();
+
+ // All cached pointers mapped to the corresponding lineage cache entries
private static HashMap<Pointer, LineageCacheEntry> GPUCacheEntries =
new HashMap<>();
protected static void resetEviction() {
_currentCacheSize = 0;
gpuEvictionThread = null;
//LineageCacheConfig.CONCURRENTGPUEVICTION = false;
- weightedQueue.clear();
+ freeQueues.clear();
livePointers.clear();
GPUCacheEntries.clear();
}
@@ -66,15 +73,19 @@ public class LineageGPUCacheEviction
}
protected static void incrementLiveCount(Pointer ptr) {
- //TODO: move from free list to live list
+ // Move from free list (if exists) to live list
if(livePointers.merge(ptr, 1, Integer::sum) == 1)
- weightedQueue.remove(GPUCacheEntries.get(ptr));
+
freeQueues.get(getPointerSize(ptr)).remove(GPUCacheEntries.get(ptr));
}
public static void decrementLiveCount(Pointer ptr) {
- // Decrement and remove if the live counte becomes 0
- if(livePointers.compute(ptr, (k, v) -> v==1 ? null : v-1) ==
null)
- weightedQueue.add(GPUCacheEntries.get(ptr));
+ // Decrement and move to the free list if the live count
becomes 0
+ if(livePointers.compute(ptr, (k, v) -> v==1 ? null : v-1) ==
null) {
+ long size = getPointerSize(ptr);
+ if (!freeQueues.containsKey(size))
+ freeQueues.put(size, new
TreeSet<>(LineageCacheConfig.LineageCacheComparator));
+ freeQueues.get(size).add(GPUCacheEntries.get(ptr));
+ }
}
public static boolean probeLiveCachedPointers(Pointer ptr) {
@@ -122,29 +133,55 @@ public class LineageGPUCacheEviction
// TODO: Separate removelist, starttimestamp, score and weights
from CPU cache
entry.computeScore(LineageCacheEviction._removelist);
- //weightedQueue.add(entry);
+ // The pointer must be live at this moment
livePointers.put(entry.getGPUPointer(), 1);
GPUCacheEntries.put(entry.getGPUPointer(), entry);
}
public static boolean isGPUCacheEmpty() {
- return weightedQueue.isEmpty();
+ return (freeQueues.isEmpty() && livePointers.isEmpty());
}
- public static LineageCacheEntry pollFirstEntry() {
- return weightedQueue.pollFirst();
+ public static boolean isGPUCacheFreeQEmpty() {
+ return freeQueues.isEmpty();
}
- public static LineageCacheEntry peekFirstEntry() {
- return weightedQueue.first();
+ // Remove and return the cached free pointer with exact size
+ public static LineageCacheEntry pollFirstFreeEntry(long size) {
+ TreeSet<LineageCacheEntry> freeList = freeQueues.get(size);
+ if (freeList != null && freeList.isEmpty())
+ freeQueues.remove(size); //remove if empty
+
+ // Poll the first pointer from the queue
+ if (freeList != null && !freeList.isEmpty())
+ return freeList.pollFirst();
+ return null;
}
-
- public static void removeEntry(LineageCacheEntry e) {
- weightedQueue.remove(e);
+
+ // Remove and return the minimum non-exact sized pointer.
+ // If no bigger sized pointer available, return one from the highest
sized list
+ public static LineageCacheEntry pollFistFreeNotExact(long size) {
+ // Assuming no exact match
+ List<Long> sortedSizes = new ArrayList<>(freeQueues.keySet());
+ // If the asked size is bigger than all, return a pointer of
the highest size available
+ long maxSize = sortedSizes.get(sortedSizes.size()-1);
+ if (size > maxSize)
+ return pollFirstFreeEntry(maxSize);
+ // Return a pointer of the next biggest size
+ for (long fSize : sortedSizes) {
+ if (fSize >= size)
+ return pollFirstFreeEntry(fSize);
+ }
+ return null;
}
- public static void addEntryList(List<LineageCacheEntry> entryList) {
- weightedQueue.addAll(entryList);
+ public static LineageCacheEntry peekFirstFreeEntry(long size) {
+ return freeQueues.get(size).first();
+ }
+
+ public static void removeFreeEntry(LineageCacheEntry e) {
+ long size = getPointerSize(e.getGPUPointer());
+ freeQueues.get(size).remove(e);
}
//---------------- CACHE SPACE MANAGEMENT METHODS -----------------//
@@ -165,22 +202,23 @@ public class LineageGPUCacheEviction
}
public static int numPointersCached() {
- return livePointers.size() + weightedQueue.size();
+ return livePointers.size() +
freeQueues.values().stream().mapToInt(TreeSet::size).sum();
}
public static long totalMemoryCached() {
long totLive = livePointers.keySet().stream()
.mapToLong(ptr ->
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(ptr)).sum();
- long totFree = weightedQueue.stream()
- .mapToLong(en ->
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(en.getGPUPointer())).sum();
+ long totFree = 0;
+ for (Map.Entry<Long, TreeSet<LineageCacheEntry>> entry :
freeQueues.entrySet())
+ totFree += entry.getKey() * entry.getValue().size();
return totLive + totFree;
}
public static Set<Pointer> getAllCachedPointers() {
- //livePointers.keySet() + weightedQueue.stream().map()
- Set<Pointer> cachedPointers = weightedQueue.stream()
- .map(LineageCacheEntry::getGPUPointer)
- .collect(Collectors.toSet());
+ Set<Pointer> cachedPointers = new HashSet<>();
+ for (Map.Entry<Long, TreeSet<LineageCacheEntry>> entry :
freeQueues.entrySet())
+ cachedPointers.addAll(entry.getValue().stream()
+
.map(LineageCacheEntry::getGPUPointer).collect(Collectors.toSet()));
cachedPointers.addAll(livePointers.keySet());
return cachedPointers;
}
diff --git a/src/main/java/org/apache/sysds/utils/Statistics.java
b/src/main/java/org/apache/sysds/utils/Statistics.java
index 89d8088734..02d72fad08 100644
--- a/src/main/java/org/apache/sysds/utils/Statistics.java
+++ b/src/main/java/org/apache/sysds/utils/Statistics.java
@@ -639,6 +639,7 @@ public class Statistics
sb.append("LinCache hits (Mem/FS/Del): \t" +
LineageCacheStatistics.displayHits() + ".\n");
sb.append("LinCache MultiLevel (Ins/SB/Fn):" +
LineageCacheStatistics.displayMultiLevelHits() + ".\n");
sb.append("LinCache GPU (Hit/Async/Sync): \t" +
LineageCacheStatistics.displayGpuStats() + ".\n");
+ sb.append("LinCache GPU (Recyc/Del): \t" +
LineageCacheStatistics.displayGpuPointerStats() + ".\n");
sb.append("LinCache GPU evict time: \t" +
LineageCacheStatistics.displayGpuEvictTime() + " sec.\n");
sb.append("LinCache Spark (Col/Loc/Dist): \t" +
LineageCacheStatistics.displaySparkStats() + ".\n");
sb.append("LinCache writes (Mem/FS/Del): \t" +
LineageCacheStatistics.displayWtrites() + ".\n");