This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 832acb04bd [SYSTEMDS-3510] Cache data characteristics with GPU pointer
832acb04bd is described below
commit 832acb04bd5d72a9275472211d278bf58a714ce3
Author: Arnab Phani <[email protected]>
AuthorDate: Thu Mar 30 15:19:17 2023 +0200
[SYSTEMDS-3510] Cache data characteristics with GPU pointer
This patch enables caching the metadata along with the GPU pointers
in the lineage cache. The data characteristics are required to evict
cached entries from the device cache to the host cache. Also, sometime
the output dimensions are not known before execution.
Closes #1798
---
.../instructions/gpu/context/GPUContext.java | 1 -
.../instructions/gpu/context/GPUMemoryManager.java | 2 +
.../instructions/gpu/context/GPUObject.java | 8 ++-
.../apache/sysds/runtime/lineage/LineageCache.java | 41 +++++++-----
.../sysds/runtime/lineage/LineageCacheEntry.java | 77 ++++++++++++++++++----
5 files changed, 97 insertions(+), 32 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
index 3ffbedea3f..913eebbf80 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
@@ -37,7 +37,6 @@ import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
-import org.apache.sysds.runtime.lineage.LineageCacheConfig;
import org.apache.sysds.utils.GPUStatistics;
import jcuda.Pointer;
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 526b477a4f..57636726a2 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -333,6 +333,8 @@ public class GPUMemoryManager {
// TODO: else evict to the host cache
if (freedSize > size)
A = cudaMallocNoWarn(tmpA,
size, "recycle non-exact match of lineage cache");
+ // Else, deallocate another free
pointer. We are calling pollFistFreeNotExact with
+ // the same size (not with
freedSize-size) to reduce potentials for creating holes
}
}
if (DMLScript.STATISTICS)
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index 6dccee983d..0cb793949a 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -101,7 +101,7 @@ public class GPUObject {
* Shadow buffer instance
*/
final ShadowBuffer shadowBuffer;
-
+
//
----------------------------------------------------------------------
// Methods used to access, set and check jcudaDenseMatrixPtr
@@ -791,6 +791,7 @@ public class GPUObject {
setSparseMatrixCudaPointer(tmp);
}
+ // Method to find the estimated size of this GPU Object in the device
public long getSizeOnDevice() {
long GPUSize = 0;
long rlen = mat.getNumRows();
@@ -805,6 +806,11 @@ public class GPUObject {
return GPUSize;
}
+ // Method to find the allocated size of this GPU Object in the device
+ public long getAllocatedSize() {
+ return
gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(getDensePointer());
+ }
+
void copyFromHostToDevice(String opcode) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : copyFromHostToDevice, on " + this + ",
GPUContext=" + getGPUContext());
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index c38132a5a3..eea3ea003d 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -19,7 +19,6 @@
package org.apache.sysds.runtime.lineage;
-import jcuda.Pointer;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
@@ -104,14 +103,15 @@ public class LineageCache
LineageCacheEntry e = null;
boolean reuseAll = true;
synchronized( _cache ) {
- //try to reuse full or partial intermediates
+ //try to reuse full or partial intermediates
(CPU and FED only)
for (MutablePair<LineageItem,LineageCacheEntry>
item : liList) {
if
(LineageCacheConfig.getCacheType().isFullReuse())
e =
LineageCache.probe(item.getKey()) ? getIntern(item.getKey()) : null;
//TODO need to also move execution of
compensation plan out of here
//(create lazily evaluated entry)
if (e == null &&
LineageCacheConfig.getCacheType().isPartialReuse()
- && !(inst instanceof
ComputationSPInstruction))
+ && !(inst instanceof
ComputationSPInstruction)
+ && !(DMLScript.USE_ACCELERATOR))
if(
LineageRewriteReuse.executeRewrites(inst, ec) )
e =
getIntern(item.getKey());
reuseAll &= (e != null);
@@ -151,13 +151,14 @@ public class LineageCache
((SparkExecutionContext)
ec).setRDDHandleForVariable(outName, rdd);
}
else { //TODO handle locks on gpu
objects
- //shallow copy the cached
GPUObj to the output MatrixObject
//Create a GPUObject with the
cached pointer
GPUObject gpuObj = new
GPUObject(ec.getGPUContext(0),
ec.getMatrixObject(outName), e.getGPUPointer());
ec.getMatrixObject(outName).setGPUObject(ec.getGPUContext(0), gpuObj);
//Set dirty to true, so that it
is later copied to the host for write
ec.getMatrixObject(outName).getGPUObject(ec.getGPUContext(0)).setDirty(true);
+ //Set the cached data
characteristics to the output matrix object
+
ec.getMatrixObject(outName).updateDataCharacteristics(e.getDataCharacteristics());
//Increment the live count for
this pointer
LineageGPUCacheEviction.incrementLiveCount(e.getGPUPointer());
}
@@ -479,7 +480,7 @@ public class LineageCache
if (LineageCacheConfig.isReusable(inst, ec) ) {
//if (!isMarkedForCaching(inst, ec)) return;
List<Pair<LineageItem, Data>> liData = null;
- Pointer gpuPtr = null;
+ GPUObject liGPUObj= null;
LineageItem instLI = ((LineageTraceable)
inst).getLineageItem(ec).getValue();
if (inst instanceof MultiReturnBuiltinCPInstruction) {
liData = new ArrayList<>();
@@ -494,13 +495,13 @@ public class LineageCache
else if (inst instanceof GPUInstruction) {
// TODO: gpu multiretrun instructions
Data gpudata = ec.getVariable(((GPUInstruction)
inst)._output);
- gpuPtr = gpudata instanceof MatrixObject ?
+ liGPUObj = gpudata instanceof MatrixObject ?
ec.getMatrixObject(((GPUInstruction)inst)._output).
-
getGPUObject(ec.getGPUContext(0)).getDensePointer() : null;
+
getGPUObject(ec.getGPUContext(0)) : null;
// Scalar gpu intermediates is always copied
back to host.
// No need to cache the GPUobj for scalar
intermediates.
- if (gpuPtr == null)
+ if (liGPUObj == null)
liData = Arrays.asList(Pair.of(instLI,
ec.getVariable(((GPUInstruction)inst)._output)));
}
else if (inst instanceof ComputationSPInstruction
@@ -517,10 +518,10 @@ public class LineageCache
else if (inst instanceof
ComputationSPInstruction) //collects or prefetches
liData = Arrays.asList(Pair.of(instLI,
ec.getVariable(((ComputationSPInstruction) inst).output)));
- if (gpuPtr == null)
+ if (liGPUObj == null)
putValueCPU(inst, liData, computetime);
else
- putValueGPU(gpuPtr, instLI, computetime);
+ putValueGPU(liGPUObj, instLI, computetime);
}
}
@@ -594,14 +595,20 @@ public class LineageCache
}
}
- private static void putValueGPU(Pointer gpuPtr, LineageItem instLI,
long computetime) {
+ private static void putValueGPU(GPUObject gpuObj, LineageItem instLI,
long computetime) {
synchronized( _cache ) {
LineageCacheEntry centry = _cache.get(instLI);
+ // TODO: Cache sparse pointers
+ if (gpuObj.isSparse()) {
+ removePlaceholder(instLI);
+ return;
+ }
// Update the total size of lineage cached gpu objects
// The eviction is handled by the unified gpu memory
manager
-
LineageGPUCacheEviction.updateSize(LineageGPUCacheEviction.getPointerSize(gpuPtr),
true);
+
LineageGPUCacheEviction.updateSize(gpuObj.getAllocatedSize(), true);
// Set the GPUOject in the cache
- centry.setGPUValue(gpuPtr, computetime);
+ centry.setGPUValue(gpuObj.getDensePointer(),
gpuObj.getAllocatedSize(),
+ gpuObj.getMatrixObject().getMetaData(),
computetime);
// Maintain order for eviction
LineageGPUCacheEviction.addEntry(centry);
}
@@ -908,10 +915,10 @@ public class LineageCache
LineageCacheEntry oe = getIntern(probeItem);
LineageCacheEntry e = _cache.get(item);
boolean exists = !e.isNullVal();
- if (oe.isMatrixValue())
- e.setValue(oe.getMBValue(), computetime);
- else
- e.setValue(oe.getSOValue(), computetime);
+ e.copyValueFrom(oe, computetime);
+ if (e.isNullVal())
+ throw new DMLRuntimeException("Lineage Cache:
Original item is empty: "+oe._key);
+
e._origItem = probeItem;
// Add itself as original item to navigate the list.
oe._origItem = probeItem;
diff --git
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
index e0b595e29f..2526fb60b5 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
@@ -25,10 +25,11 @@ import jcuda.Pointer;
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysds.runtime.instructions.spark.data.RDDObject;
import org.apache.sysds.runtime.lineage.LineageCacheConfig.LineageCacheStatus;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.meta.DataCharacteristics;
+import org.apache.sysds.runtime.meta.MetaData;
public class LineageCacheEntry {
protected final LineageItem _key;
@@ -43,7 +44,7 @@ public class LineageCacheEntry {
protected LineageItem _origItem;
private String _outfile = null;
protected double score;
- protected Pointer _gpuPointer;
+ protected GPUPointer _gpuPointer;
protected RDDObject _rddObject;
@@ -124,6 +125,21 @@ public class LineageCacheEntry {
}
}
+ public synchronized Pointer getGPUPointer() {
+ try {
+ //wait until other thread completes operation
+ //in order to avoid redundant computation
+ while(_status == LineageCacheStatus.EMPTY) {
+ wait();
+ }
+ //comes here if data is placed or the entry is removed
by the running thread
+ return _gpuPointer.getPointer();
+ }
+ catch( InterruptedException ex ) {
+ throw new DMLRuntimeException(ex);
+ }
+ }
+
public synchronized LineageCacheStatus getCacheStatus() {
return _status;
}
@@ -142,8 +158,8 @@ public class LineageCacheEntry {
size += _MBval.getInMemorySize();
if (_SOval != null)
size += _SOval.getSize();
- if (_gpuPointer!= null)
- size +=
LineageGPUCacheEviction.getPointerSize(_gpuPointer);
+ if (_gpuPointer != null)
+ size += _gpuPointer.getPointerSize();
return size;
}
@@ -164,7 +180,7 @@ public class LineageCacheEntry {
}
public boolean isGPUObject() {
- return _gpuPointer != null;
+ return _gpuPointer!= null;
}
public boolean isSerializedBytes() {
@@ -173,7 +189,7 @@ public class LineageCacheEntry {
public synchronized void setValue(MatrixBlock val, long computetime) {
_MBval = val;
- _gpuPointer = null; //Matrix block and gpu pointer cannot
coexist
+ _gpuPointer = null; //Matrix block and gpu object cannot
coexist
_computeTime = computetime;
_status = isNullVal() ? LineageCacheStatus.EMPTY :
LineageCacheStatus.CACHED;
//resume all threads waiting for val
@@ -186,15 +202,15 @@ public class LineageCacheEntry {
public synchronized void setValue(ScalarObject val, long computetime) {
_SOval = val;
- _gpuPointer = null; //scalar and gpu pointer cannot coexist
+ _gpuPointer = null; //scalar and gpu object cannot coexist
_computeTime = computetime;
_status = isNullVal() ? LineageCacheStatus.EMPTY :
LineageCacheStatus.CACHED;
//resume all threads waiting for val
notifyAll();
}
- public synchronized void setGPUValue(Pointer ptr, long computetime) {
- _gpuPointer = ptr;
+ public synchronized void setGPUValue(Pointer ptr, long size, MetaData
md, long computetime) {
+ _gpuPointer = new GPUPointer(ptr, size, md);
_computeTime = computetime;
_status = isNullVal() ? LineageCacheStatus.EMPTY :
LineageCacheStatus.GPUCACHED;
//resume all threads waiting for val
@@ -216,11 +232,22 @@ public class LineageCacheEntry {
// resume all threads waiting for val
notifyAll();
}
-
- public synchronized Pointer getGPUPointer() {
- return _gpuPointer;
+
+ public synchronized void copyValueFrom(LineageCacheEntry src, long
computetime) {
+ _MBval = src._MBval;
+ _SOval = src._SOval;
+ _gpuPointer = src._gpuPointer;
+ _rddObject = src._rddObject;
+ _computeTime = src._computeTime;
+ _status = isNullVal() ? LineageCacheStatus.EMPTY :
LineageCacheStatus.CACHED;
+ // resume all threads waiting for val
+ notifyAll();
}
-
+
+ public synchronized DataCharacteristics getDataCharacteristics() {
+ return _gpuPointer.getDataCharacteristics();
+ }
+
protected synchronized void setNullValues() {
_MBval = null;
_SOval = null;
@@ -293,4 +320,28 @@ public class LineageCacheEntry {
// Generate scores
score = w1*(((double)_computeTime)/getSize()) +
w2*getTimestamp() + w3*(((double)1)/getDagHeight());
}
+
+ static class GPUPointer {
+ private Pointer _pointer;
+ private long _allocatedSize; //bytes
+ private MetaData _metadata;
+
+ public GPUPointer(Pointer pointer, long size, MetaData
metadata) {
+ _pointer = pointer;
+ _allocatedSize = size;
+ _metadata = metadata;
+ }
+
+ protected long getPointerSize() {
+ return _allocatedSize;
+ }
+
+ protected Pointer getPointer() {
+ return _pointer;
+ }
+
+ protected DataCharacteristics getDataCharacteristics() {
+ return _metadata.getDataCharacteristics();
+ }
+ }
}