[systemds] branch main updated: [SYSTEMDS-3510] Cache data characteristics with GPU pointer

arnabp20 Thu, 30 Mar 2023 06:19:50 -0700

This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 832acb04bd [SYSTEMDS-3510] Cache data characteristics with GPU pointer
832acb04bd is described below

commit 832acb04bd5d72a9275472211d278bf58a714ce3
Author: Arnab Phani <[email protected]>
AuthorDate: Thu Mar 30 15:19:17 2023 +0200

    [SYSTEMDS-3510] Cache data characteristics with GPU pointer
    
    This patch enables caching the metadata along with the GPU pointers
    in the lineage cache. The data characteristics are required to evict
    cached entries from the device cache to the host cache. Also, sometime
    the output dimensions are not known before execution.
    
    Closes #1798
---
 .../instructions/gpu/context/GPUContext.java       |  1 -
 .../instructions/gpu/context/GPUMemoryManager.java |  2 +
 .../instructions/gpu/context/GPUObject.java        |  8 ++-
 .../apache/sysds/runtime/lineage/LineageCache.java | 41 +++++++-----
 .../sysds/runtime/lineage/LineageCacheEntry.java   | 77 ++++++++++++++++++----
 5 files changed, 97 insertions(+), 32 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
index 3ffbedea3f..913eebbf80 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
@@ -37,7 +37,6 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
-import org.apache.sysds.runtime.lineage.LineageCacheConfig;
 import org.apache.sysds.utils.GPUStatistics;
 
 import jcuda.Pointer;
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 526b477a4f..57636726a2 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -333,6 +333,8 @@ public class GPUMemoryManager {
                                        // TODO: else evict to the host cache
                                        if (freedSize > size)
                                                A = cudaMallocNoWarn(tmpA, 
size, "recycle non-exact match of lineage cache");
+                                       // Else, deallocate another free 
pointer. We are calling pollFistFreeNotExact with
+                                       // the same size (not with 
freedSize-size) to reduce potentials for creating holes
                                }
                        }
                        if (DMLScript.STATISTICS)
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index 6dccee983d..0cb793949a 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -101,7 +101,7 @@ public class GPUObject {
         * Shadow buffer instance
         */
        final ShadowBuffer shadowBuffer;
-       
+
        // 
----------------------------------------------------------------------
        // Methods used to access, set and check jcudaDenseMatrixPtr
        
@@ -791,6 +791,7 @@ public class GPUObject {
                setSparseMatrixCudaPointer(tmp);
        }
 
+       // Method to find the estimated size of this GPU Object in the device
        public long getSizeOnDevice() {
                long GPUSize = 0;
                long rlen = mat.getNumRows();
@@ -805,6 +806,11 @@ public class GPUObject {
                return GPUSize;
        }
 
+       // Method to find the allocated size of this GPU Object in the device
+       public long getAllocatedSize() {
+               return 
gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(getDensePointer());
+       }
+
        void copyFromHostToDevice(String opcode) {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : copyFromHostToDevice, on " + this + ", 
GPUContext=" + getGPUContext());
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index c38132a5a3..eea3ea003d 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -19,7 +19,6 @@
 
 package org.apache.sysds.runtime.lineage;
 
-import jcuda.Pointer;
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.tuple.MutablePair;
 import org.apache.commons.lang3.tuple.Pair;
@@ -104,14 +103,15 @@ public class LineageCache
                        LineageCacheEntry e = null;
                        boolean reuseAll = true;
                        synchronized( _cache ) {
-                               //try to reuse full or partial intermediates
+                               //try to reuse full or partial intermediates 
(CPU and FED only)
                                for (MutablePair<LineageItem,LineageCacheEntry> 
item : liList) {
                                        if 
(LineageCacheConfig.getCacheType().isFullReuse())
                                                e = 
LineageCache.probe(item.getKey()) ? getIntern(item.getKey()) : null;
                                        //TODO need to also move execution of 
compensation plan out of here
                                        //(create lazily evaluated entry)
                                        if (e == null && 
LineageCacheConfig.getCacheType().isPartialReuse()
-                                               && !(inst instanceof 
ComputationSPInstruction))
+                                               && !(inst instanceof 
ComputationSPInstruction)
+                                               && !(DMLScript.USE_ACCELERATOR))
                                                if( 
LineageRewriteReuse.executeRewrites(inst, ec) )
                                                        e = 
getIntern(item.getKey());
                                        reuseAll &= (e != null);
@@ -151,13 +151,14 @@ public class LineageCache
                                                ((SparkExecutionContext) 
ec).setRDDHandleForVariable(outName, rdd);
                                        }
                                        else { //TODO handle locks on gpu 
objects
-                                               //shallow copy the cached 
GPUObj to the output MatrixObject
                                                //Create a GPUObject with the 
cached pointer
                                                GPUObject gpuObj = new 
GPUObject(ec.getGPUContext(0),
                                                        
ec.getMatrixObject(outName), e.getGPUPointer());
                                                
ec.getMatrixObject(outName).setGPUObject(ec.getGPUContext(0), gpuObj);
                                                //Set dirty to true, so that it 
is later copied to the host for write
                                                
ec.getMatrixObject(outName).getGPUObject(ec.getGPUContext(0)).setDirty(true);
+                                               //Set the cached data 
characteristics to the output matrix object
+                                               
ec.getMatrixObject(outName).updateDataCharacteristics(e.getDataCharacteristics());
                                                //Increment the live count for 
this pointer
                                                
LineageGPUCacheEviction.incrementLiveCount(e.getGPUPointer());
                                        }
@@ -479,7 +480,7 @@ public class LineageCache
                if (LineageCacheConfig.isReusable(inst, ec) ) {
                        //if (!isMarkedForCaching(inst, ec)) return;
                        List<Pair<LineageItem, Data>> liData = null;
-                       Pointer gpuPtr = null;
+                       GPUObject liGPUObj= null;
                        LineageItem instLI = ((LineageTraceable) 
inst).getLineageItem(ec).getValue();
                        if (inst instanceof MultiReturnBuiltinCPInstruction) {
                                liData = new ArrayList<>();
@@ -494,13 +495,13 @@ public class LineageCache
                        else if (inst instanceof GPUInstruction) {
                                // TODO: gpu multiretrun instructions
                                Data gpudata = ec.getVariable(((GPUInstruction) 
inst)._output);
-                               gpuPtr = gpudata instanceof MatrixObject ?
+                               liGPUObj = gpudata instanceof MatrixObject ?
                                                
ec.getMatrixObject(((GPUInstruction)inst)._output).
-                                                       
getGPUObject(ec.getGPUContext(0)).getDensePointer() : null;
+                                                       
getGPUObject(ec.getGPUContext(0)) : null;
 
                                // Scalar gpu intermediates is always copied 
back to host. 
                                // No need to cache the GPUobj for scalar 
intermediates.
-                               if (gpuPtr == null)
+                               if (liGPUObj == null)
                                        liData = Arrays.asList(Pair.of(instLI, 
ec.getVariable(((GPUInstruction)inst)._output)));
                        }
                        else if (inst instanceof ComputationSPInstruction
@@ -517,10 +518,10 @@ public class LineageCache
                                else if (inst instanceof 
ComputationSPInstruction) //collects or prefetches
                                        liData = Arrays.asList(Pair.of(instLI, 
ec.getVariable(((ComputationSPInstruction) inst).output)));
 
-                       if (gpuPtr == null)
+                       if (liGPUObj == null)
                                putValueCPU(inst, liData, computetime);
                        else
-                               putValueGPU(gpuPtr, instLI, computetime);
+                               putValueGPU(liGPUObj, instLI, computetime);
                }
        }
        
@@ -594,14 +595,20 @@ public class LineageCache
                }
        }
        
-       private static void putValueGPU(Pointer gpuPtr, LineageItem instLI, 
long computetime) {
+       private static void putValueGPU(GPUObject gpuObj, LineageItem instLI, 
long computetime) {
                synchronized( _cache ) {
                        LineageCacheEntry centry = _cache.get(instLI);
+                       // TODO: Cache sparse pointers
+                       if (gpuObj.isSparse()) {
+                               removePlaceholder(instLI);
+                               return;
+                       }
                        // Update the total size of lineage cached gpu objects
                        // The eviction is handled by the unified gpu memory 
manager
-                       
LineageGPUCacheEviction.updateSize(LineageGPUCacheEviction.getPointerSize(gpuPtr),
 true);
+                       
LineageGPUCacheEviction.updateSize(gpuObj.getAllocatedSize(), true);
                        // Set the GPUOject in the cache
-                       centry.setGPUValue(gpuPtr, computetime);
+                       centry.setGPUValue(gpuObj.getDensePointer(), 
gpuObj.getAllocatedSize(),
+                               gpuObj.getMatrixObject().getMetaData(), 
computetime);
                        // Maintain order for eviction
                        LineageGPUCacheEviction.addEntry(centry);
                }
@@ -908,10 +915,10 @@ public class LineageCache
                        LineageCacheEntry oe = getIntern(probeItem);
                        LineageCacheEntry e = _cache.get(item);
                        boolean exists = !e.isNullVal();
-                       if (oe.isMatrixValue())
-                               e.setValue(oe.getMBValue(), computetime);
-                       else
-                               e.setValue(oe.getSOValue(), computetime);
+                       e.copyValueFrom(oe, computetime);
+                       if (e.isNullVal())
+                               throw new DMLRuntimeException("Lineage Cache: 
Original item is empty: "+oe._key);
+
                        e._origItem = probeItem; 
                        // Add itself as original item to navigate the list.
                        oe._origItem = probeItem;
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
index e0b595e29f..2526fb60b5 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
@@ -25,10 +25,11 @@ import jcuda.Pointer;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.instructions.spark.data.RDDObject;
 import org.apache.sysds.runtime.lineage.LineageCacheConfig.LineageCacheStatus;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.meta.DataCharacteristics;
+import org.apache.sysds.runtime.meta.MetaData;
 
 public class LineageCacheEntry {
        protected final LineageItem _key;
@@ -43,7 +44,7 @@ public class LineageCacheEntry {
        protected LineageItem _origItem;
        private String _outfile = null;
        protected double score;
-       protected Pointer _gpuPointer;
+       protected GPUPointer _gpuPointer;
 
        protected RDDObject _rddObject;
        
@@ -124,6 +125,21 @@ public class LineageCacheEntry {
                }
        }
 
+       public synchronized Pointer getGPUPointer() {
+               try {
+                       //wait until other thread completes operation
+                       //in order to avoid redundant computation
+                       while(_status == LineageCacheStatus.EMPTY) {
+                               wait();
+                       }
+                       //comes here if data is placed or the entry is removed 
by the running thread
+                       return _gpuPointer.getPointer();
+               }
+               catch( InterruptedException ex ) {
+                       throw new DMLRuntimeException(ex);
+               }
+       }
+
        public synchronized LineageCacheStatus getCacheStatus() {
                return _status;
        }
@@ -142,8 +158,8 @@ public class LineageCacheEntry {
                        size += _MBval.getInMemorySize();
                if (_SOval != null)
                        size += _SOval.getSize();
-               if (_gpuPointer!= null)
-                       size += 
LineageGPUCacheEviction.getPointerSize(_gpuPointer);
+               if (_gpuPointer != null)
+                       size += _gpuPointer.getPointerSize();
                return size;
        }
        
@@ -164,7 +180,7 @@ public class LineageCacheEntry {
        }
 
        public boolean isGPUObject() {
-               return _gpuPointer != null;
+               return _gpuPointer!= null;
        }
 
        public boolean isSerializedBytes() {
@@ -173,7 +189,7 @@ public class LineageCacheEntry {
 
        public synchronized void setValue(MatrixBlock val, long computetime) {
                _MBval = val;
-               _gpuPointer = null;  //Matrix block and gpu pointer cannot 
coexist
+               _gpuPointer = null;  //Matrix block and gpu object cannot 
coexist
                _computeTime = computetime;
                _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.CACHED;
                //resume all threads waiting for val
@@ -186,15 +202,15 @@ public class LineageCacheEntry {
 
        public synchronized void setValue(ScalarObject val, long computetime) {
                _SOval = val;
-               _gpuPointer = null;  //scalar and gpu pointer cannot coexist
+               _gpuPointer = null;  //scalar and gpu object cannot coexist
                _computeTime = computetime;
                _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.CACHED;
                //resume all threads waiting for val
                notifyAll();
        }
        
-       public synchronized void setGPUValue(Pointer ptr, long computetime) {
-               _gpuPointer = ptr;
+       public synchronized void setGPUValue(Pointer ptr, long size, MetaData 
md, long computetime) {
+               _gpuPointer = new GPUPointer(ptr, size, md);
                _computeTime = computetime;
                _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.GPUCACHED;
                //resume all threads waiting for val
@@ -216,11 +232,22 @@ public class LineageCacheEntry {
                // resume all threads waiting for val
                notifyAll();
        }
-       
-       public synchronized Pointer getGPUPointer() {
-               return _gpuPointer;
+
+       public synchronized void copyValueFrom(LineageCacheEntry src, long 
computetime) {
+               _MBval = src._MBval;
+               _SOval = src._SOval;
+               _gpuPointer = src._gpuPointer;
+               _rddObject = src._rddObject;
+               _computeTime = src._computeTime;
+               _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.CACHED;
+               // resume all threads waiting for val
+               notifyAll();
        }
-       
+
+       public synchronized DataCharacteristics getDataCharacteristics() {
+               return _gpuPointer.getDataCharacteristics();
+       }
+
        protected synchronized void setNullValues() {
                _MBval = null;
                _SOval = null;
@@ -293,4 +320,28 @@ public class LineageCacheEntry {
                // Generate scores
                score = w1*(((double)_computeTime)/getSize()) + 
w2*getTimestamp() + w3*(((double)1)/getDagHeight());
        }
+
+       static class GPUPointer {
+               private Pointer _pointer;
+               private long _allocatedSize; //bytes
+               private MetaData _metadata;
+
+               public GPUPointer(Pointer pointer, long size, MetaData 
metadata) {
+                       _pointer = pointer;
+                       _allocatedSize = size;
+                       _metadata = metadata;
+               }
+
+               protected long getPointerSize() {
+                       return _allocatedSize;
+               }
+
+               protected Pointer getPointer() {
+                       return _pointer;
+               }
+
+               protected DataCharacteristics getDataCharacteristics() {
+                       return _metadata.getDataCharacteristics();
+               }
+       }
 }

[systemds] branch main updated: [SYSTEMDS-3510] Cache data characteristics with GPU pointer

Reply via email to