This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new ca72e4b03f [SYSTEMDS-2947] Re-enable lineage cache eviction from GPU 
to host
ca72e4b03f is described below

commit ca72e4b03fc36dc7faf1f206e9261f02198ef8b0
Author: Arnab Phani <[email protected]>
AuthorDate: Thu Apr 6 15:25:24 2023 +0200

    [SYSTEMDS-2947] Re-enable lineage cache eviction from GPU to host
    
    This patch adds code to be able to copy a cached pointer to a cached
    matrix block. The plan is to conditionally evict cached entries
    to host based of the score (compute time and #hits, #misses) while
    recycling. Currently, the eviction is disabled as we do not have
    a way to measure the elapsed time of the GPU kernels due to their
    asynchronous nature.
    
    Closes #1802
---
 .../instructions/gpu/context/CSRPointer.java       | 11 +--
 .../instructions/gpu/context/GPUMemoryManager.java | 24 ++++--
 .../instructions/gpu/context/GPUObject.java        | 42 +----------
 .../sysds/runtime/lineage/LineageCacheConfig.java  |  2 +-
 .../sysds/runtime/lineage/LineageCacheEntry.java   | 11 +++
 .../runtime/lineage/LineageGPUCacheEviction.java   | 88 +++++++++++++++-------
 .../test/functions/lineage/GPUFullReuseTest.java   |  4 +-
 7 files changed, 101 insertions(+), 81 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index 23ae4d1648..dc3f41ed28 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -178,9 +178,6 @@ public class CSRPointer {
         */
        public static void copyToDevice(GPUContext gCtx, CSRPointer dest, int 
rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
                CSRPointer r = dest;
-               long t0 = 0;
-               if (DMLScript.STATISTICS)
-                       t0 = System.nanoTime();
                r.nnz = nnz;
                if(rows < 0) throw new DMLRuntimeException("Incorrect input 
parameter: rows=" + rows);
                if(nnz < 0) throw new DMLRuntimeException("Incorrect input 
parameter: nnz=" + nnz);
@@ -190,10 +187,10 @@ public class CSRPointer {
                LibMatrixCUDA.cudaSupportFunctions.hostToDevice(gCtx, values, 
r.val, null);
                cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 
1), cudaMemcpyHostToDevice);
                cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), 
cudaMemcpyHostToDevice);
-               if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0);
-               if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaToDevCount.add(3);
+               //if (DMLScript.STATISTICS)
+               //      GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0);
+               //if (DMLScript.STATISTICS)
+               //      GPUStatistics.cudaToDevCount.add(3);
        }
        
        /**
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index a52c9eb6a9..fc249f03c3 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -297,16 +297,19 @@ public class GPUMemoryManager {
                        if (le != null) {
                                if(!LineageCacheConfig.GPU2HOSTEVICTION) {
                                        A = le.getGPUPointer(); //recycle
-                                       
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, false);
+                                       
LineageGPUCacheEviction.removeFromDeviceCache(le, le.getGPUPointer(), true);
                                        if (DMLScript.STATISTICS)
                                                
LineageCacheStatistics.incrementGpuRecycle();
                                }
-                               /*else {
-                                       // Copy from device cache to CPU 
lineage cache if not already copied
-                                       
LineageGPUCacheEviction.copyToHostCache(le, opcode, copied);
+                               else {
+                                       // Copy from device cache to CPU 
lineage cache
+                                       // TODO: Copy conditionally (if score > 
theta)
+                                       Pointer copiedPtr = 
LineageGPUCacheEviction.copyToHostCache(le);
+                                       
LineageGPUCacheEviction.removeFromDeviceCache(le, copiedPtr, false);
+                                       A = copiedPtr;
                                        if(DMLScript.STATISTICS)
                                                
LineageCacheStatistics.incrementGpuSyncEvicts();
-                               }*/
+                               }
                        }
                        // TODO: Handle live (dirty) objects separately. Copy 
them back to the host
 
@@ -324,12 +327,19 @@ public class GPUMemoryManager {
                                if(le != null) {
                                        freedSize += 
getSizeAllocatedGPUPointer(le.getGPUPointer());
                                        
if(!LineageCacheConfig.GPU2HOSTEVICTION) {
-                                               
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, false);
+                                               
LineageGPUCacheEviction.removeFromDeviceCache(le, le.getGPUPointer(), true);
                                                
guardedCudaFree(le.getGPUPointer()); //free
                                                if (DMLScript.STATISTICS)
                                                        
LineageCacheStatistics.incrementGpuDel();
                                        }
-                                       // TODO: else evict to the host cache
+                                       else {
+                                               // Copy from device cache to 
CPU lineage cache
+                                               Pointer copiedPtr = 
LineageGPUCacheEviction.copyToHostCache(le);
+                                               
LineageGPUCacheEviction.removeFromDeviceCache(le, copiedPtr, false);
+                                               guardedCudaFree(copiedPtr); 
//free
+                                               if(DMLScript.STATISTICS)
+                                                       
LineageCacheStatistics.incrementGpuSyncEvicts();
+                                       }
                                        if (freedSize > size)
                                                A = cudaMallocNoWarn(tmpA, 
size, "recycle non-exact match of lineage cache");
                                        // Else, deallocate another free 
pointer. We are calling pollFistFreeNotExact with
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index 0cb793949a..5c4967cda9 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -886,6 +886,8 @@ public class GPUObject {
                        if (copyToDevice) {
                                CSRPointer.copyToDevice(getGPUContext(), 
getJcudaSparseMatrixPtr(),
                                        tmp.getNumRows(), tmp.getNonZeros(), 
rowPtr, colInd, values);
+                               if (DMLScript.STATISTICS)
+                                       GPUStatistics.cudaToDevCount.add(3);
                        }
                } else {
                        double[] data = tmp.getDenseBlockValues();
@@ -906,6 +908,8 @@ public class GPUObject {
                                // Copy dense block
                                // H2D now only measures the time taken to do 
                                
LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, 
getDensePointer(), opcode);
+                               if (DMLScript.STATISTICS)
+                                       GPUStatistics.cudaToDevCount.add(1);
                        }
                }
 
@@ -913,8 +917,6 @@ public class GPUObject {
 
                if (DMLScript.STATISTICS)
                        GPUStatistics.cudaToDevTime.add(System.nanoTime() - 
start);
-               if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaToDevCount.add(1);
        }
 
        public static int toIntExact(long l) {
@@ -1012,42 +1014,6 @@ public class GPUObject {
                dirty = false;
        }
        
-       // Copy and convert to a MatrixBlock, and return
-       public MatrixBlock evictFromDeviceToHostMB(String instName, boolean 
eagerDelete) throws DMLRuntimeException {
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", 
GPUContext=" + getGPUContext());
-               }
-               MatrixBlock tmp = null;
-               if (!isDensePointerNull()) {
-                       tmp = new MatrixBlock(toIntExact(mat.getNumRows()), 
toIntExact(mat.getNumColumns()), false);
-                       tmp.allocateDenseBlock();
-                       
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
-                                               getDensePointer(), 
tmp.getDenseBlockValues(), instName, true);
-                       //if(eagerDelete)
-                       //      clearData(instName, true);
-                       tmp.recomputeNonZeros();
-               } else {
-                       int rows = toIntExact(mat.getNumRows());
-                       int cols = toIntExact(mat.getNumColumns());
-                       int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
-                       double[] values = new double[nnz];
-                       
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), 
getJcudaSparseMatrixPtr().val, values, instName, true);
-                       int[] rowPtr = new int[rows + 1];
-                       int[] colInd = new int[nnz];
-                       CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), 
rows, nnz, rowPtr, colInd);
-                       //if(eagerDelete)
-                       //      clearData(instName, true);
-                       SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, 
colInd, values, nnz);
-                       tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
-               }
-               //mat.acquireModify(tmp);
-               //mat.release();
-               //dirty = false;
-               //isLineageCached = false;
-               return tmp;
-       }
-
-
        /**
         * Clears the data associated with this {@link GPUObject} instance
         *
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 5bb4d97853..a483b6c21b 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -106,7 +106,7 @@ public class LineageCacheConfig
        public static double FSREAD_SPARSE = 400;
        public static double FSWRITE_DENSE = 450;
        public static double FSWRITE_SPARSE = 225;
-       public static double D2HCOPY = 1500;
+       public static double D2HCOPYBANDWIDTH = 1500; //MB/sec
        public static double D2HMAXBANDWIDTH = 8192;
        
        private enum CachedItemHead {
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
index 0d89dc396d..f4d3e24982 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
@@ -183,6 +183,12 @@ public class LineageCacheEntry {
                return _gpuPointer!= null;
        }
 
+       public synchronized boolean isDensePointer() {
+               if (!isGPUObject())
+                       return false;
+               return _gpuPointer.isDensepointer();
+       }
+
        public boolean isSerializedBytes() {
                return _dt.isUnknown() && 
_key.getOpcode().equals(LineageItemUtils.SERIALIZATION_OPCODE);
        }
@@ -343,5 +349,10 @@ public class LineageCacheEntry {
                protected DataCharacteristics getDataCharacteristics() {
                        return _metadata.getDataCharacteristics();
                }
+
+               protected boolean isDensepointer() {
+                       return true;
+                       // TODO: Support sparse pointer caching
+               }
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index 9135565bd9..04ddfed198 100644
--- 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -33,6 +33,11 @@ import jcuda.Pointer;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
+import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.meta.DataCharacteristics;
+
+import static 
org.apache.sysds.runtime.instructions.gpu.context.GPUObject.toIntExact;
 
 public class LineageGPUCacheEviction 
 {
@@ -115,13 +120,12 @@ public class LineageGPUCacheEviction
                double sizeMB = sizeByte / (1024*1024);
                double newTSpeed = sizeMB / copyTime;  //bandwidth (MB/sec) + 
java overhead
 
-               // FIXME: A D2H copy lazily executes previous kernels
                if (newTSpeed > LineageCacheConfig.D2HMAXBANDWIDTH)
                        return;  //filter out errorneous measurements (~ 
>8GB/sec)
                // Perform exponential smoothing.
                double smFactor = 0.5;  //smoothing factor
-               LineageCacheConfig.D2HCOPY = (smFactor * newTSpeed) + 
((1-smFactor) * LineageCacheConfig.D2HCOPY);
-               //System.out.println("size_t: "+sizeMB+ " speed_t: "+newTSpeed 
+ " estimate_t+1: "+LineageCacheConfig.D2HCOPY);
+               LineageCacheConfig.D2HCOPYBANDWIDTH = (smFactor * newTSpeed) + 
((1-smFactor) * LineageCacheConfig.D2HCOPYBANDWIDTH);
+               //System.out.println("size_t: "+sizeMB+ " speed_t: "+newTSpeed 
+ " estimate_t+1: "+LineageCacheConfig.D2HCOPYBANDWIDTH);
        }
 
        //--------------- CACHE MAINTENANCE & LOOKUP FUNCTIONS --------------//
@@ -224,37 +228,67 @@ public class LineageGPUCacheEviction
                cachedPointers.addAll(livePointers.keySet());
                return cachedPointers;
        }
-       
-       /*public static void copyToHostCache(LineageCacheEntry entry, String 
instName, boolean alreadyCopied) {
-               // TODO: move to the shadow buffer. Convert to double precision 
only when reused.
+
+       // Copy an intermediate from GPU cache to host cache
+       // TODO: move to the shadow buffer. Convert to double precision only 
when reused.
+       public static Pointer copyToHostCache(LineageCacheEntry entry) {
+               // Memcopy from the GPU pointer to a matrix block
                long t0 = System.nanoTime();
-               MatrixBlock mb = alreadyCopied ? 
entry._gpuObject.getMatrixObject().acquireReadAndRelease()
-                               : 
entry._gpuObject.evictFromDeviceToHostMB(instName, false);
+               MatrixBlock mb = pointerToMatrixBlock(entry);
                long t1 = System.nanoTime();
-               
adjustD2HTransferSpeed(((double)entry._gpuObject.getSizeOnDevice()), 
((double)(t1-t0))/1000000000);
+               // Adjust the estimated D2H bandwidth
+               adjustD2HTransferSpeed(((double)entry.getSize()), 
((double)(t1-t0))/1000000000);
+               Pointer ptr = entry.getGPUPointer();
                long size = mb.getInMemorySize();
-               // make space in the host memory for the data TODO: synchronize
-               if (!LineageCacheEviction.isBelowThreshold(size)) {
-                       synchronized (LineageCache.getLineageCache()) {
-                               
LineageCacheEviction.makeSpace(LineageCache.getLineageCache(), size);
+               synchronized(LineageCache.getLineageCache()) {
+                       // Make space in the host cache for the data
+                       if(!LineageCacheEviction.isBelowThreshold(size)) {
+                               synchronized(LineageCache.getLineageCache()) {
+                                       
LineageCacheEviction.makeSpace(LineageCache.getLineageCache(), size);
+                               }
                        }
+                       LineageCacheEviction.updateSize(size, true);
+                       // Place the data and set gpu object to null in the 
cache entry
+                       entry.setValue(mb);
+                       // Maintain order for eviction of host cache.
+                       LineageCacheEviction.addEntry(entry);
                }
-               // FIXME: updateSize outside of synchronized is problematic, 
but eliminates waiting for background eviction
-               LineageCacheEviction.updateSize(size, true);
-               // place the data and set gpu object to null in the cache entry
-               entry.setValue(mb);
-               // maintain order for eviction of host cache. FIXME: synchronize
-               LineageCacheEviction.addEntry(entry);
-               // manage space in gpu cache
-               updateSize(size, false);
-       }*/
+               return ptr;
+       }
+
+       private static MatrixBlock pointerToMatrixBlock(LineageCacheEntry le) {
+               MatrixBlock ret = null;
+               DataCharacteristics dc = le.getDataCharacteristics();
+               if (le.isDensePointer()) {
+                       ret = new MatrixBlock(toIntExact(dc.getRows()), 
toIntExact(dc.getCols()), false);
+                       ret.allocateDenseBlock();
+                       // copy to the host
+                       
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
+                               le.getGPUPointer(), ret.getDenseBlockValues(), 
null, true);
+                       ret.recomputeNonZeros();
+               } /*else {
+                       int rows = toIntExact(dc.getRows());
+                       int cols = toIntExact(dc.getCols());
+                       int nnz = toIntExact(le.getGPUPointer().nnz);
+                       double[] values = new double[nnz];
+                       
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), 
le.getGPUPointer().val, values, null, true);
+                       int[] rowPtr = new int[rows + 1];
+                       int[] colInd = new int[nnz];
+                       CSRPointer.copyPtrToHost(le.getGPUPointer(), rows, nnz, 
rowPtr, colInd);
+                       SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, 
colInd, values, nnz);
+                       ret = new MatrixBlock(rows, cols, nnz, sparseBlock);
+               }*/
+               //mat.acquireModify(tmp);
+               //mat.release();
+               return ret;
+       }
 
-       public static void removeFromDeviceCache(LineageCacheEntry entry, 
String instName, boolean alreadyCopied) {
-               //long size = entry.getGPUObject().getSizeOnDevice();
-               long size = 
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(entry.getGPUPointer());
-               LineageCache.removeEntry(entry._key);
+       public static void removeFromDeviceCache(LineageCacheEntry entry, 
Pointer ptr, boolean removeFromCache) {
+               long size = 
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(ptr);
+               if (removeFromCache)
+                       LineageCache.removeEntry(entry._key);
                updateSize(size, false);
-               GPUCacheEntries.remove(entry.getGPUPointer());
+               GPUCacheEntries.remove(ptr);
        }
 
 }
\ No newline at end of file
diff --git 
a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java 
b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
index 1fcd0766cf..1a0665c187 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
@@ -84,6 +84,7 @@ public class GPUFullReuseTest extends AutomatedTestBase{
 
                AutomatedTestBase.TEST_GPU = true;  //adds '-gpu'
                List<String> proArgs = new ArrayList<>();
+               proArgs.add("-explain");
                proArgs.add("-stats");
                proArgs.add("-args");
                proArgs.add(output("R"));
@@ -94,7 +95,8 @@ public class GPUFullReuseTest extends AutomatedTestBase{
                //run the test
                runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
                HashMap<MatrixValue.CellIndex, Double> R_orig = 
readDMLMatrixFromOutputDir("R");
-               
+
+               proArgs.clear();
                proArgs.add("-stats");
                proArgs.add("-lineage");
                
proArgs.add(LineageCacheConfig.ReuseCacheType.REUSE_MULTILEVEL.name().toLowerCase());

Reply via email to