This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new ca72e4b03f [SYSTEMDS-2947] Re-enable lineage cache eviction from GPU
to host
ca72e4b03f is described below
commit ca72e4b03fc36dc7faf1f206e9261f02198ef8b0
Author: Arnab Phani <[email protected]>
AuthorDate: Thu Apr 6 15:25:24 2023 +0200
[SYSTEMDS-2947] Re-enable lineage cache eviction from GPU to host
This patch adds code to be able to copy a cached pointer to a cached
matrix block. The plan is to conditionally evict cached entries
to host based of the score (compute time and #hits, #misses) while
recycling. Currently, the eviction is disabled as we do not have
a way to measure the elapsed time of the GPU kernels due to their
asynchronous nature.
Closes #1802
---
.../instructions/gpu/context/CSRPointer.java | 11 +--
.../instructions/gpu/context/GPUMemoryManager.java | 24 ++++--
.../instructions/gpu/context/GPUObject.java | 42 +----------
.../sysds/runtime/lineage/LineageCacheConfig.java | 2 +-
.../sysds/runtime/lineage/LineageCacheEntry.java | 11 +++
.../runtime/lineage/LineageGPUCacheEviction.java | 88 +++++++++++++++-------
.../test/functions/lineage/GPUFullReuseTest.java | 4 +-
7 files changed, 101 insertions(+), 81 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index 23ae4d1648..dc3f41ed28 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -178,9 +178,6 @@ public class CSRPointer {
*/
public static void copyToDevice(GPUContext gCtx, CSRPointer dest, int
rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
CSRPointer r = dest;
- long t0 = 0;
- if (DMLScript.STATISTICS)
- t0 = System.nanoTime();
r.nnz = nnz;
if(rows < 0) throw new DMLRuntimeException("Incorrect input
parameter: rows=" + rows);
if(nnz < 0) throw new DMLRuntimeException("Incorrect input
parameter: nnz=" + nnz);
@@ -190,10 +187,10 @@ public class CSRPointer {
LibMatrixCUDA.cudaSupportFunctions.hostToDevice(gCtx, values,
r.val, null);
cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows +
1), cudaMemcpyHostToDevice);
cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz),
cudaMemcpyHostToDevice);
- if (DMLScript.STATISTICS)
- GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0);
- if (DMLScript.STATISTICS)
- GPUStatistics.cudaToDevCount.add(3);
+ //if (DMLScript.STATISTICS)
+ // GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0);
+ //if (DMLScript.STATISTICS)
+ // GPUStatistics.cudaToDevCount.add(3);
}
/**
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index a52c9eb6a9..fc249f03c3 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -297,16 +297,19 @@ public class GPUMemoryManager {
if (le != null) {
if(!LineageCacheConfig.GPU2HOSTEVICTION) {
A = le.getGPUPointer(); //recycle
-
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, false);
+
LineageGPUCacheEviction.removeFromDeviceCache(le, le.getGPUPointer(), true);
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuRecycle();
}
- /*else {
- // Copy from device cache to CPU
lineage cache if not already copied
-
LineageGPUCacheEviction.copyToHostCache(le, opcode, copied);
+ else {
+ // Copy from device cache to CPU
lineage cache
+ // TODO: Copy conditionally (if score >
theta)
+ Pointer copiedPtr =
LineageGPUCacheEviction.copyToHostCache(le);
+
LineageGPUCacheEviction.removeFromDeviceCache(le, copiedPtr, false);
+ A = copiedPtr;
if(DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuSyncEvicts();
- }*/
+ }
}
// TODO: Handle live (dirty) objects separately. Copy
them back to the host
@@ -324,12 +327,19 @@ public class GPUMemoryManager {
if(le != null) {
freedSize +=
getSizeAllocatedGPUPointer(le.getGPUPointer());
if(!LineageCacheConfig.GPU2HOSTEVICTION) {
-
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, false);
+
LineageGPUCacheEviction.removeFromDeviceCache(le, le.getGPUPointer(), true);
guardedCudaFree(le.getGPUPointer()); //free
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuDel();
}
- // TODO: else evict to the host cache
+ else {
+ // Copy from device cache to
CPU lineage cache
+ Pointer copiedPtr =
LineageGPUCacheEviction.copyToHostCache(le);
+
LineageGPUCacheEviction.removeFromDeviceCache(le, copiedPtr, false);
+ guardedCudaFree(copiedPtr);
//free
+ if(DMLScript.STATISTICS)
+
LineageCacheStatistics.incrementGpuSyncEvicts();
+ }
if (freedSize > size)
A = cudaMallocNoWarn(tmpA,
size, "recycle non-exact match of lineage cache");
// Else, deallocate another free
pointer. We are calling pollFistFreeNotExact with
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index 0cb793949a..5c4967cda9 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -886,6 +886,8 @@ public class GPUObject {
if (copyToDevice) {
CSRPointer.copyToDevice(getGPUContext(),
getJcudaSparseMatrixPtr(),
tmp.getNumRows(), tmp.getNonZeros(),
rowPtr, colInd, values);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaToDevCount.add(3);
}
} else {
double[] data = tmp.getDenseBlockValues();
@@ -906,6 +908,8 @@ public class GPUObject {
// Copy dense block
// H2D now only measures the time taken to do
LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data,
getDensePointer(), opcode);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaToDevCount.add(1);
}
}
@@ -913,8 +917,6 @@ public class GPUObject {
if (DMLScript.STATISTICS)
GPUStatistics.cudaToDevTime.add(System.nanoTime() -
start);
- if (DMLScript.STATISTICS)
- GPUStatistics.cudaToDevCount.add(1);
}
public static int toIntExact(long l) {
@@ -1012,42 +1014,6 @@ public class GPUObject {
dirty = false;
}
- // Copy and convert to a MatrixBlock, and return
- public MatrixBlock evictFromDeviceToHostMB(String instName, boolean
eagerDelete) throws DMLRuntimeException {
- if(LOG.isTraceEnabled()) {
- LOG.trace("GPU : copyFromDeviceToHost, on " + this + ",
GPUContext=" + getGPUContext());
- }
- MatrixBlock tmp = null;
- if (!isDensePointerNull()) {
- tmp = new MatrixBlock(toIntExact(mat.getNumRows()),
toIntExact(mat.getNumColumns()), false);
- tmp.allocateDenseBlock();
-
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
- getDensePointer(),
tmp.getDenseBlockValues(), instName, true);
- //if(eagerDelete)
- // clearData(instName, true);
- tmp.recomputeNonZeros();
- } else {
- int rows = toIntExact(mat.getNumRows());
- int cols = toIntExact(mat.getNumColumns());
- int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
- double[] values = new double[nnz];
-
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
getJcudaSparseMatrixPtr().val, values, instName, true);
- int[] rowPtr = new int[rows + 1];
- int[] colInd = new int[nnz];
- CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(),
rows, nnz, rowPtr, colInd);
- //if(eagerDelete)
- // clearData(instName, true);
- SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr,
colInd, values, nnz);
- tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
- }
- //mat.acquireModify(tmp);
- //mat.release();
- //dirty = false;
- //isLineageCached = false;
- return tmp;
- }
-
-
/**
* Clears the data associated with this {@link GPUObject} instance
*
diff --git
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 5bb4d97853..a483b6c21b 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -106,7 +106,7 @@ public class LineageCacheConfig
public static double FSREAD_SPARSE = 400;
public static double FSWRITE_DENSE = 450;
public static double FSWRITE_SPARSE = 225;
- public static double D2HCOPY = 1500;
+ public static double D2HCOPYBANDWIDTH = 1500; //MB/sec
public static double D2HMAXBANDWIDTH = 8192;
private enum CachedItemHead {
diff --git
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
index 0d89dc396d..f4d3e24982 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
@@ -183,6 +183,12 @@ public class LineageCacheEntry {
return _gpuPointer!= null;
}
+ public synchronized boolean isDensePointer() {
+ if (!isGPUObject())
+ return false;
+ return _gpuPointer.isDensepointer();
+ }
+
public boolean isSerializedBytes() {
return _dt.isUnknown() &&
_key.getOpcode().equals(LineageItemUtils.SERIALIZATION_OPCODE);
}
@@ -343,5 +349,10 @@ public class LineageCacheEntry {
protected DataCharacteristics getDataCharacteristics() {
return _metadata.getDataCharacteristics();
}
+
+ protected boolean isDensepointer() {
+ return true;
+ // TODO: Support sparse pointer caching
+ }
}
}
diff --git
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index 9135565bd9..04ddfed198 100644
---
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -33,6 +33,11 @@ import jcuda.Pointer;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
+import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.meta.DataCharacteristics;
+
+import static
org.apache.sysds.runtime.instructions.gpu.context.GPUObject.toIntExact;
public class LineageGPUCacheEviction
{
@@ -115,13 +120,12 @@ public class LineageGPUCacheEviction
double sizeMB = sizeByte / (1024*1024);
double newTSpeed = sizeMB / copyTime; //bandwidth (MB/sec) +
java overhead
- // FIXME: A D2H copy lazily executes previous kernels
if (newTSpeed > LineageCacheConfig.D2HMAXBANDWIDTH)
return; //filter out errorneous measurements (~
>8GB/sec)
// Perform exponential smoothing.
double smFactor = 0.5; //smoothing factor
- LineageCacheConfig.D2HCOPY = (smFactor * newTSpeed) +
((1-smFactor) * LineageCacheConfig.D2HCOPY);
- //System.out.println("size_t: "+sizeMB+ " speed_t: "+newTSpeed
+ " estimate_t+1: "+LineageCacheConfig.D2HCOPY);
+ LineageCacheConfig.D2HCOPYBANDWIDTH = (smFactor * newTSpeed) +
((1-smFactor) * LineageCacheConfig.D2HCOPYBANDWIDTH);
+ //System.out.println("size_t: "+sizeMB+ " speed_t: "+newTSpeed
+ " estimate_t+1: "+LineageCacheConfig.D2HCOPYBANDWIDTH);
}
//--------------- CACHE MAINTENANCE & LOOKUP FUNCTIONS --------------//
@@ -224,37 +228,67 @@ public class LineageGPUCacheEviction
cachedPointers.addAll(livePointers.keySet());
return cachedPointers;
}
-
- /*public static void copyToHostCache(LineageCacheEntry entry, String
instName, boolean alreadyCopied) {
- // TODO: move to the shadow buffer. Convert to double precision
only when reused.
+
+ // Copy an intermediate from GPU cache to host cache
+ // TODO: move to the shadow buffer. Convert to double precision only
when reused.
+ public static Pointer copyToHostCache(LineageCacheEntry entry) {
+ // Memcopy from the GPU pointer to a matrix block
long t0 = System.nanoTime();
- MatrixBlock mb = alreadyCopied ?
entry._gpuObject.getMatrixObject().acquireReadAndRelease()
- :
entry._gpuObject.evictFromDeviceToHostMB(instName, false);
+ MatrixBlock mb = pointerToMatrixBlock(entry);
long t1 = System.nanoTime();
-
adjustD2HTransferSpeed(((double)entry._gpuObject.getSizeOnDevice()),
((double)(t1-t0))/1000000000);
+ // Adjust the estimated D2H bandwidth
+ adjustD2HTransferSpeed(((double)entry.getSize()),
((double)(t1-t0))/1000000000);
+ Pointer ptr = entry.getGPUPointer();
long size = mb.getInMemorySize();
- // make space in the host memory for the data TODO: synchronize
- if (!LineageCacheEviction.isBelowThreshold(size)) {
- synchronized (LineageCache.getLineageCache()) {
-
LineageCacheEviction.makeSpace(LineageCache.getLineageCache(), size);
+ synchronized(LineageCache.getLineageCache()) {
+ // Make space in the host cache for the data
+ if(!LineageCacheEviction.isBelowThreshold(size)) {
+ synchronized(LineageCache.getLineageCache()) {
+
LineageCacheEviction.makeSpace(LineageCache.getLineageCache(), size);
+ }
}
+ LineageCacheEviction.updateSize(size, true);
+ // Place the data and set gpu object to null in the
cache entry
+ entry.setValue(mb);
+ // Maintain order for eviction of host cache.
+ LineageCacheEviction.addEntry(entry);
}
- // FIXME: updateSize outside of synchronized is problematic,
but eliminates waiting for background eviction
- LineageCacheEviction.updateSize(size, true);
- // place the data and set gpu object to null in the cache entry
- entry.setValue(mb);
- // maintain order for eviction of host cache. FIXME: synchronize
- LineageCacheEviction.addEntry(entry);
- // manage space in gpu cache
- updateSize(size, false);
- }*/
+ return ptr;
+ }
+
+ private static MatrixBlock pointerToMatrixBlock(LineageCacheEntry le) {
+ MatrixBlock ret = null;
+ DataCharacteristics dc = le.getDataCharacteristics();
+ if (le.isDensePointer()) {
+ ret = new MatrixBlock(toIntExact(dc.getRows()),
toIntExact(dc.getCols()), false);
+ ret.allocateDenseBlock();
+ // copy to the host
+
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
+ le.getGPUPointer(), ret.getDenseBlockValues(),
null, true);
+ ret.recomputeNonZeros();
+ } /*else {
+ int rows = toIntExact(dc.getRows());
+ int cols = toIntExact(dc.getCols());
+ int nnz = toIntExact(le.getGPUPointer().nnz);
+ double[] values = new double[nnz];
+
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
le.getGPUPointer().val, values, null, true);
+ int[] rowPtr = new int[rows + 1];
+ int[] colInd = new int[nnz];
+ CSRPointer.copyPtrToHost(le.getGPUPointer(), rows, nnz,
rowPtr, colInd);
+ SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr,
colInd, values, nnz);
+ ret = new MatrixBlock(rows, cols, nnz, sparseBlock);
+ }*/
+ //mat.acquireModify(tmp);
+ //mat.release();
+ return ret;
+ }
- public static void removeFromDeviceCache(LineageCacheEntry entry,
String instName, boolean alreadyCopied) {
- //long size = entry.getGPUObject().getSizeOnDevice();
- long size =
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(entry.getGPUPointer());
- LineageCache.removeEntry(entry._key);
+ public static void removeFromDeviceCache(LineageCacheEntry entry,
Pointer ptr, boolean removeFromCache) {
+ long size =
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(ptr);
+ if (removeFromCache)
+ LineageCache.removeEntry(entry._key);
updateSize(size, false);
- GPUCacheEntries.remove(entry.getGPUPointer());
+ GPUCacheEntries.remove(ptr);
}
}
\ No newline at end of file
diff --git
a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
index 1fcd0766cf..1a0665c187 100644
---
a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
@@ -84,6 +84,7 @@ public class GPUFullReuseTest extends AutomatedTestBase{
AutomatedTestBase.TEST_GPU = true; //adds '-gpu'
List<String> proArgs = new ArrayList<>();
+ proArgs.add("-explain");
proArgs.add("-stats");
proArgs.add("-args");
proArgs.add(output("R"));
@@ -94,7 +95,8 @@ public class GPUFullReuseTest extends AutomatedTestBase{
//run the test
runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
HashMap<MatrixValue.CellIndex, Double> R_orig =
readDMLMatrixFromOutputDir("R");
-
+
+ proArgs.clear();
proArgs.add("-stats");
proArgs.add("-lineage");
proArgs.add(LineageCacheConfig.ReuseCacheType.REUSE_MULTILEVEL.name().toLowerCase());