systemml git commit: [SYSTEMML-445] Added memory stats for GPU allocation/eviction

niketanpansare Thu, 20 Sep 2018 15:00:44 -0700

Repository: systemml
Updated Branches:
  refs/heads/master 69624850e -> f46279a17



[SYSTEMML-445] Added memory stats for GPU allocation/eviction

- Also, reverted the shadow buffer to the original implementation as we are 
getting OOM for lstm scripts. This likely has to do with pessimistic GC.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f46279a1
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f46279a1
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f46279a1

Branch: refs/heads/master
Commit: f46279a17031d3f8827923f6eddd614c3eac77d3
Parents: 6962485
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Thu Sep 20 14:56:51 2018 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Thu Sep 20 14:56:51 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   8 +-
 .../gpu/context/GPUMemoryManager.java           |  61 ++++----
 .../instructions/gpu/context/GPUObject.java     |  18 +--
 .../instructions/gpu/context/ShadowBuffer.java  | 154 +++++--------------
 .../org/apache/sysml/utils/GPUStatistics.java   |  29 ++++
 .../apache/sysml/utils/PersistentLRUCache.java  |   8 +-
 6 files changed, 108 insertions(+), 170 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template 
b/conf/SystemML-config.xml.template
index 3925c4e..7b535c9 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -105,11 +105,9 @@
    <!-- Advanced optimization: fraction of driver memory to use for caching 
(default: 0.15) -->
    <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
    
-   <!-- Advanced optimization: maximum fraction of driver memory to use for 
GPU shadow buffer. 
-   Shadow buffer is cleared eagerly on garbage collection to avoid OOM and is 
backed by org.apache.sysml.utils.PersistentLRUCache.
-   Setting this to zero disables shadow buffering. If you intend to train 
network larger than GPU memory size, 
-   consider using large driver memory and setting this to a value greater than 
0. -->
-   
<sysml.gpu.eviction.shadow.bufferSize>0.5</sysml.gpu.eviction.shadow.bufferSize>
+   <!-- Advanced optimization: fraction of driver memory to use for GPU shadow 
buffer. This optimization is ignored for double precision. 
+   By default, it is disabled (hence set to 0.0). If you intend to train 
network larger than GPU memory size, consider using single precision and 
setting this to 0.1. -->
+   
<sysml.gpu.eviction.shadow.bufferSize>0.0</sysml.gpu.eviction.shadow.bufferSize>
    
    <!-- Fraction of available GPU memory to use. This is similar to 
TensorFlow's per_process_gpu_memory_fraction configuration property. (default: 
0.9) -->
    <sysml.gpu.memory.util.factor>0.9</sysml.gpu.memory.util.factor>

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index 033051a..57b76f6 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -191,7 +191,7 @@ public class GPUMemoryManager {
                                GPUStatistics.cudaAllocCount.increment();
                        }
                        if(printDebugMessage != null && (PRINT_GPU_MEMORY_INFO 
|| LOG.isTraceEnabled()) )  {
-                               LOG.info("Success: " + printDebugMessage + ":" 
+ byteCountToDisplaySize(size));
+                               LOG.info("Success: " + printDebugMessage + ":" 
+ GPUStatistics.byteCountToDisplaySize(size));
                        }
                        return A;
                } catch(jcuda.CudaException e) {
@@ -203,7 +203,7 @@ public class GPUMemoryManager {
                                GPUStatistics.cudaAllocCount.increment();
                        }
                        if(printDebugMessage != null && (PRINT_GPU_MEMORY_INFO 
|| LOG.isTraceEnabled()) )  {
-                               LOG.info("Failed: " + printDebugMessage + ":" + 
byteCountToDisplaySize(size));
+                               LOG.info("Failed: " + printDebugMessage + ":" + 
GPUStatistics.byteCountToDisplaySize(size));
                                LOG.info("GPU Memory info " + printDebugMessage 
+ ":" + toString());
                        }
                        return null;
@@ -224,28 +224,15 @@ public class GPUMemoryManager {
                        return "->" + stackTrace[index].getClassName() + "." + 
stackTrace[index].getMethodName() + "(" + stackTrace[index].getFileName() + ":" 
+ stackTrace[index].getLineNumber() + ")";
        }
        
-       /**
-        * Pretty printing utility to print bytes
-        * 
-        * @param numBytes number of bytes
-        * @return a human-readable display value
-        */
-       private String byteCountToDisplaySize(long numBytes) {
-               // return 
org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs 
rounding
-           if (numBytes < 1024) { 
-               return numBytes + " bytes";
-           }
-           else {
-                   int exp = (int) (Math.log(numBytes) / 6.931471805599453);
-                   return String.format("%.3f %sB", ((double)numBytes) / 
Math.pow(1024, exp), "KMGTP".charAt(exp-1));
-           }
-       }
        
        public boolean canAllocateWithoutEviction(String opcode, long size) {
                return lazyCudaFreeMemoryManager.contains(opcode, size) || 
allocator.canAllocate(size) ||
                        
lazyCudaFreeMemoryManager.containsRmvarPointerMinSize(opcode, size) ;
        }
        
+       long peakSize = 0;
+       long currentSize = 0;
+       
        /**
         * Allocate pointer of the given size in bytes.
         * 
@@ -255,12 +242,19 @@ public class GPUMemoryManager {
         */
        public Pointer malloc(String opcode, long size) {
                if(size <= 0) {
-                       throw new DMLRuntimeException("Cannot allocate memory 
of size " + byteCountToDisplaySize(size));
+                       throw new DMLRuntimeException("Cannot allocate memory 
of size " + GPUStatistics.byteCountToDisplaySize(size));
                }
                if(DEBUG_MEMORY_LEAK) {
                        LOG.info("GPU Memory info during malloc:" + toString());
                }
                
+               if(ConfigurationManager.isStatistics()) {
+                       GPUStatistics.cudaAllocAggSize.add(size);
+                       currentSize += size;
+                       peakSize = Math.max(currentSize, peakSize);
+                       GPUStatistics.cudaAllocPeakSize.set(peakSize);
+               }
+               
                // Step 1: First try reusing exact match in rmvarGPUPointers to 
avoid holes in the GPU memory
                Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, 
size);
                
@@ -358,7 +352,7 @@ public class GPUMemoryManager {
                }
                
                if(A == null) {
-                       throw new DMLRuntimeException("There is not enough 
memory on device for this matrix, requested = " + byteCountToDisplaySize(size) 
+ ". \n "
+                       throw new DMLRuntimeException("There is not enough 
memory on device for this matrix, requested = " + 
GPUStatistics.byteCountToDisplaySize(size) + ". \n "
                                        + toString());
                }
                
@@ -377,6 +371,10 @@ public class GPUMemoryManager {
                boolean eagerDelete = true;
                if(gpuObj.isDirty()) {
                        // Eviction
+                       if(ConfigurationManager.isStatistics()) {
+                               long size = gpuObj.getSizeOnDevice();
+                               GPUStatistics.cudaEvictAggSize.add(size);
+                       }
                        gpuObj.copyFromDeviceToHost(opcode, true, eagerDelete);
                }
                else {
@@ -416,7 +414,7 @@ public class GPUMemoryManager {
                if(allPointers.containsKey(toFree)) {
                        long size = allPointers.get(toFree).getSizeInBytes();
                        if(LOG.isTraceEnabled()) {
-                               LOG.trace("Free-ing up the pointer of size " +  
byteCountToDisplaySize(size));
+                               LOG.trace("Free-ing up the pointer of size " +  
GPUStatistics.byteCountToDisplaySize(size));
                        }
                        allPointers.remove(toFree);
                        lazyCudaFreeMemoryManager.removeIfPresent(size, toFree);
@@ -441,6 +439,10 @@ public class GPUMemoryManager {
        public void free(String opcode, Pointer toFree, boolean eager) throws 
DMLRuntimeException {
                if(LOG.isTraceEnabled())
                        LOG.trace("Free-ing the pointer with eager=" + eager);
+               long size = allPointers.get(toFree).getSizeInBytes();
+               if(ConfigurationManager.isStatistics()) {
+                       currentSize -= size;
+               }
                if (eager) {
                        long t0 = ConfigurationManager.isStatistics() ? 
System.nanoTime() : 0;
                        guardedCudaFree(toFree);
@@ -451,7 +453,6 @@ public class GPUMemoryManager {
                                LOG.info("GPU memory info before failure:" + 
toString());
                                throw new RuntimeException("ERROR : Internal 
state corrupted, cache block size map is not aware of a block it trying to free 
up");
                        }
-                       long size = allPointers.get(toFree).getSizeInBytes();
                        lazyCudaFreeMemoryManager.add(size, toFree);
                }
        }
@@ -604,24 +605,24 @@ public class GPUMemoryManager {
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "", 
                                "Num Objects", "Num Pointers", "Size"));
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked 
Dirty GPU objects", 
-                               numUnlockedDirtyGPUObjects, 
numUnlockedDirtyPointers, 
byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects)));
+                               numUnlockedDirtyGPUObjects, 
numUnlockedDirtyPointers, 
GPUStatistics.byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects)));
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked 
NonDirty GPU objects", 
-                               numUnlockedNonDirtyGPUObjects, 
numUnlockedNonDirtyPointers, 
byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects)));
+                               numUnlockedNonDirtyGPUObjects, 
numUnlockedNonDirtyPointers, 
GPUStatistics.byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects)));
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked GPU 
objects", 
-                               numLockedGPUObjects, numLockedPointers, 
byteCountToDisplaySize(sizeOfLockedGPUObjects)));
+                               numLockedGPUObjects, numLockedPointers, 
GPUStatistics.byteCountToDisplaySize(sizeOfLockedGPUObjects)));
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Cached 
rmvar-ed pointers", 
-                               "-", 
lazyCudaFreeMemoryManager.getNumPointers(), 
byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated())));
+                               "-", 
lazyCudaFreeMemoryManager.getNumPointers(), 
GPUStatistics.byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated())));
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", 
"Non-matrix/non-cached pointers", 
-                               "-", potentiallyLeakyPointers.size(), 
byteCountToDisplaySize(totalSizePotentiallyLeakyPointers)));
+                               "-", potentiallyLeakyPointers.size(), 
GPUStatistics.byteCountToDisplaySize(totalSizePotentiallyLeakyPointers)));
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "All 
pointers", 
-                               "-", allPointers.size(), 
byteCountToDisplaySize(totalMemoryAllocated)));
+                               "-", allPointers.size(), 
GPUStatistics.byteCountToDisplaySize(totalMemoryAllocated)));
                long free[] = { 0 };
                long total[] = { 0 };
                cudaMemGetInfo(free, total);
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Free mem 
(from cudaMemGetInfo)", 
-                               "-", "-", byteCountToDisplaySize(free[0])));
+                               "-", "-", 
GPUStatistics.byteCountToDisplaySize(free[0])));
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Total mem 
(from cudaMemGetInfo)", 
-                               "-", "-", byteCountToDisplaySize(total[0])));
+                               "-", "-", 
GPUStatistics.byteCountToDisplaySize(total[0])));
                
ret.append("====================================================\n");
                return ret.toString();
        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 43e2727..72d3170 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -111,11 +111,7 @@ public class GPUObject {
         */
        public Pointer getDensePointer() {
                if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && 
getJcudaSparseMatrixPtr() == null) {
-                       try {
-                               shadowBuffer.moveToDevice();
-                       } catch (IOException e) {
-                               throw new DMLRuntimeException("Error moving the 
data from shadow buffer to the device", e);
-                       }
+                       shadowBuffer.moveToDevice();
                }
                return jcudaDenseMatrixPtr;
        }
@@ -939,21 +935,13 @@ public class GPUObject {
                        else {
                                // If already copied to shadow buffer as part 
of previous eviction and this is not an eviction (i.e. bufferpool call for 
subsequent CP/Spark instruction),
                                // then copy from shadow buffer to MatrixObject.
-                               try {
-                                       shadowBuffer.moveToHost();
-                               } catch (IOException e) {
-                                       throw new DMLRuntimeException("Error 
moving the data from shadow buffer to the host memory", e);
-                               }
+                               shadowBuffer.moveToHost();
                                return;
                        }
                }
                else if(shadowBuffer.isEligibleForBuffering(isEviction, 
eagerDelete)) {
                        // Perform shadow buffering if (1) single precision, 
(2) during eviction, (3) for dense matrices, and (4) if the given matrix can 
fit into the shadow buffer. 
-                       try {
-                               shadowBuffer.moveFromDevice(instName);
-                       } catch (IOException e) {
-                               throw new DMLRuntimeException("Error moving the 
data from the device to the shadow buffer", e);
-                       }
+                       shadowBuffer.moveFromDevice(instName);
                        return;
                }
                else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == 
null) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
index 4c534a0..88ea972 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
@@ -20,65 +20,41 @@ package org.apache.sysml.runtime.instructions.gpu.context;
 
 import static jcuda.runtime.JCuda.cudaMemcpy;
 
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.concurrent.atomic.AtomicLong;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
-import org.apache.sysml.runtime.DMLRuntimeException;
 import 
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.utils.GPUStatistics;
-import org.apache.sysml.utils.PersistentLRUCache;
 
 import jcuda.Pointer;
+import jcuda.Sizeof;
 
-/**
- * Shadow buffer is a temporary staging area used during eviction.
- * It is eagerly deleted and backed using the local filesystem in case of 
Garbage Collection
- * or if the staging memory size exceeds the user-specified size.
- * This is needed to respect SystemML's memory estimates, while still allowing
- * for caching in case of GPU plans.
- */
 public class ShadowBuffer {
        private static final Log LOG = 
LogFactory.getLog(ShadowBuffer.class.getName());
-       private static PersistentLRUCache CACHE;
-       private static AtomicLong UNIQUE_ID = new AtomicLong();
-       private static long EVICTION_SHADOW_BUFFER_MAX_BYTES; 
-       final GPUObject gpuObj;
-       boolean isBuffered = false;
-       String fileName;
        
-       public static boolean isEnabled() {
-               if(CACHE == null && EVICTION_SHADOW_BUFFER_MAX_BYTES >= 0) {
+       GPUObject gpuObj;
+       float[] shadowPointer = null;
+       private static boolean _warnedAboutShadowBuffer = false;
+       private static long EVICTION_SHADOW_BUFFER_CURR_BYTES = 0;
+       private static long EVICTION_SHADOW_BUFFER_MAX_BYTES;
+       static {
+               if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) {
+                       EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;
+               }
+               else {
                        double shadowBufferSize = 
ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
-                       if(shadowBufferSize <= 0) {
-                               EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor 
optimization to avoid unnecessary invoking configuration manager.
-                       }
-                       else {
-                               if(shadowBufferSize > 1) 
-                                       throw new RuntimeException("Incorrect 
value (" + shadowBufferSize + ") for the configuration:" + 
DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
-                               EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) 
(((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
-                               try {
-                                       CACHE = new 
PersistentLRUCache(EVICTION_SHADOW_BUFFER_MAX_BYTES);
-                               } catch(IOException e) {
-                                       LOG.warn("Unable to create a temporary 
directory for shadow buffering on the local filesystem; disabling shadow 
buffering:" + e.getMessage());
-                                       EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; 
// Minor optimization to avoid checking for file permission.
-                               }
-                       }
+                       if(shadowBufferSize < 0 || shadowBufferSize > 1) 
+                               throw new RuntimeException("Incorrect value (" 
+ shadowBufferSize + ") for the configuration:" + 
DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
+                       EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) 
(((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
                }
-               return CACHE != null;
        }
        
        public ShadowBuffer(GPUObject gpuObj) {
-               if(isEnabled())
-                       fileName = "shadow_" + UNIQUE_ID.incrementAndGet();
                this.gpuObj = gpuObj;
-               
        }
        
        /**
@@ -87,39 +63,19 @@ public class ShadowBuffer {
         * @return true if the gpu object is shadow buffered
         */
        public boolean isBuffered() {
-               return isBuffered;
-       }
-       
-       private static long getSizeOfDataType(long numElems) {
-               return numElems * ((long) LibMatrixCUDA.sizeOfDataType);
+               return shadowPointer != null;
        }
        
        /**
         * Move the data from GPU to shadow buffer 
         * @param instName name of the instruction
-        * @throws IOException if error 
-        * @throws FileNotFoundException  if error
         */
-       public void moveFromDevice(String instName) throws 
FileNotFoundException, IOException {
+       public void moveFromDevice(String instName) {
                long start = ConfigurationManager.isStatistics() ? 
System.nanoTime() : 0;
                int numElems = 
GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
-       
-               if(isDoublePrecision()) {
-                       double [] shadowPointer = new double[numElems];
-                       cudaMemcpy(Pointer.to(shadowPointer), 
gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
-                       CACHE.put(fileName, shadowPointer);
-                       isBuffered = true;
-               }
-               else if(isSinglePrecision()) {
-                       float [] shadowPointer = new float[numElems];
-                       cudaMemcpy(Pointer.to(shadowPointer), 
gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
-                       CACHE.put(fileName, shadowPointer);
-                       isBuffered = true;
-               }
-               else {
-                       throw new DMLRuntimeException("Unsupported datatype");
-               }
-               
+               shadowPointer = new float[numElems];
+               EVICTION_SHADOW_BUFFER_CURR_BYTES += 
getSizeOfFloat(shadowPointer.length);
+               cudaMemcpy(Pointer.to(shadowPointer), 
gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
                gpuObj.getGPUContext().cudaFreeHelper(instName, 
gpuObj.jcudaDenseMatrixPtr, true);
                gpuObj.jcudaDenseMatrixPtr = null;
                if (ConfigurationManager.isStatistics()) {
@@ -131,36 +87,24 @@ public class ShadowBuffer {
                }
        }
        
-
-       private static boolean isDoublePrecision() {
-               return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.DOUBLE;
+       private long getSizeOfFloat(long numElems) {
+               return numElems*Sizeof.FLOAT;
        }
        
-       private static boolean isSinglePrecision() {
-               return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT;
+       private long getSizeOfDataType(long numElems) {
+               return numElems*LibMatrixCUDA.sizeOfDataType;
        }
        
        /**
         * Move the data from shadow buffer to Matrix object
-        * @throws IOException if error 
-        * @throws FileNotFoundException if error
         */
-       public void moveToHost() throws FileNotFoundException, IOException {
+       public void moveToHost() {
                long start = ConfigurationManager.isStatistics() ? 
System.nanoTime() : 0;
                MatrixBlock tmp = new 
MatrixBlock(GPUObject.toIntExact(gpuObj.mat.getNumRows()), 
GPUObject.toIntExact(gpuObj.mat.getNumColumns()), false);
                tmp.allocateDenseBlock();
                double [] tmpArr = tmp.getDenseBlockValues();
-               if(isDoublePrecision()) {
-                       System.arraycopy(CACHE.getAsDoubleArray(fileName), 0, 
tmpArr, 0, tmpArr.length);
-               }
-               else if(isSinglePrecision()) {
-                       float [] shadowPointer = 
CACHE.getAsFloatArray(fileName);
-                       for(int i = 0; i < shadowPointer.length; i++) {
-                               tmpArr[i] = shadowPointer[i];
-                       }
-               }
-               else {
-                       throw new DMLRuntimeException("Unsupported datatype");
+               for(int i = 0; i < shadowPointer.length; i++) {
+                       tmpArr[i] = shadowPointer[i];
                }
                gpuObj.mat.acquireModify(tmp);
                gpuObj.mat.release();
@@ -178,28 +122,12 @@ public class ShadowBuffer {
        
        /**
         * Move the data from shadow buffer to GPU
-        * @throws IOException if error
-        * @throws FileNotFoundException if error
         */
-       public void moveToDevice() throws FileNotFoundException, IOException {
+       public void moveToDevice() {
                long start = ConfigurationManager.isStatistics() ? 
System.nanoTime() : 0;
-               int length; Pointer shadowDevicePointer;
-               if(isDoublePrecision()) {
-                       double [] shadowPointer = 
CACHE.getAsDoubleArray(fileName);
-                       length = shadowPointer.length;
-                       shadowDevicePointer = Pointer.to(shadowPointer);
-               }
-               else if(isSinglePrecision()) {
-                       float [] shadowPointer = 
CACHE.getAsFloatArray(fileName);
-                       length = shadowPointer.length;
-                       shadowDevicePointer = Pointer.to(shadowPointer);
-               }
-               else {
-                       throw new DMLRuntimeException("Unsupported datatype");
-               }
-               long numBytes = getSizeOfDataType(length);
+               long numBytes = getSizeOfDataType(shadowPointer.length);
                gpuObj.jcudaDenseMatrixPtr = 
gpuObj.getGPUContext().allocate(null, numBytes);
-               cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, shadowDevicePointer, 
numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
+               cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, 
Pointer.to(shadowPointer), numBytes, 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
                clearShadowPointer();
                if (ConfigurationManager.isStatistics()) {
                        long totalTime = System.nanoTime() - start;
@@ -216,14 +144,14 @@ public class ShadowBuffer {
         * @return true if the given GPU object is eligible to be shadow 
buffered
         */
        public boolean isEligibleForBuffering(boolean isEviction, boolean 
eagerDelete) {
-               if(isEnabled() && isEviction && eagerDelete && 
!gpuObj.isDensePointerNull()) {
-                       long numBytes = 
getSizeOfDataType(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
-                       if(EVICTION_SHADOW_BUFFER_MAX_BYTES <= numBytes) {
-                               return false; // Don't attempt to cache very 
large GPU objects.
-                       }
-                       else {
-                               return true; // Dense GPU objects is eligible 
for shadow buffering when called during eviction and is being eagerly deleted.
+               if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && 
isEviction && eagerDelete && !gpuObj.isDensePointerNull()) {
+                       long numBytes = 
getSizeOfFloat(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
+                       boolean ret = EVICTION_SHADOW_BUFFER_CURR_BYTES + 
numBytes <= EVICTION_SHADOW_BUFFER_MAX_BYTES;
+                       if(!ret && !_warnedAboutShadowBuffer) {
+                               LOG.warn("Shadow buffer is full, so using CP 
bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize.");
+                               _warnedAboutShadowBuffer = true;
                        }
+                       return ret;
                }
                else {
                        return false;
@@ -234,9 +162,9 @@ public class ShadowBuffer {
         * Removes the content from shadow buffer
         */
        public void clearShadowPointer() {
-               if(CACHE.containsKey(fileName)) {
-                       CACHE.remove(fileName);
-                       isBuffered = false;
+               if(shadowPointer != null) {
+                       EVICTION_SHADOW_BUFFER_CURR_BYTES -= 
getSizeOfFloat(shadowPointer.length);
                }
+               shadowPointer = null;
        }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java 
b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index e748057..541850d 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -26,6 +26,7 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.LongAdder;
 
 import org.apache.sysml.conf.ConfigurationManager;
@@ -78,6 +79,10 @@ public class GPUStatistics {
        public static LongAdder cudaAllocSuccessCount = new LongAdder();
        public static LongAdder cudaAllocFailedCount = new LongAdder();
        public static LongAdder cudaAllocReuseCount = new LongAdder();
+       
+       public static LongAdder cudaAllocAggSize = new LongAdder();
+       public static AtomicLong cudaAllocPeakSize = new AtomicLong();
+       public static LongAdder cudaEvictAggSize = new LongAdder();
 
        // Per instruction miscellaneous timers.
        // Used to record events in a CP Heavy Hitter instruction and
@@ -116,6 +121,9 @@ public class GPUStatistics {
                cudaDouble2FloatCount.reset();
                cudaForcedClearLazyFreedEvictTime.reset();
                cudaForcedClearUnpinnedEvictTime.reset();
+               cudaAllocAggSize.reset();
+               cudaAllocPeakSize.set(0);
+               cudaEvictAggSize.reset();
                cudaAllocCount.reset();
                cudaDeAllocCount.reset();
                cudaToDevCount.reset();
@@ -218,6 +226,23 @@ public class GPUStatistics {
                }
                return sb.toString();
        }
+       
+       /**
+        * Pretty printing utility to print bytes
+        * 
+        * @param numBytes number of bytes
+        * @return a human-readable display value
+        */
+       public static String byteCountToDisplaySize(long numBytes) {
+               // return 
org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs 
rounding
+           if (numBytes < 1024) { 
+               return numBytes + " bytes";
+           }
+           else {
+                   int exp = (int) (Math.log(numBytes) / 6.931471805599453);
+                   return String.format("%.3f %sB", ((double)numBytes) / 
Math.pow(1024, exp), "KMGTP".charAt(exp-1));
+           }
+       }
 
        /**
         * Used to print out cuda timers & counters
@@ -242,6 +267,10 @@ public class GPUStatistics {
                                + cudaAllocReuseCount.longValue() +") / "
                                + cudaDeAllocCount.longValue() + " / "
                                + cudaMemSet0Count.longValue() + ".\n");
+               sb.append("GPU mem size (alloc (peak) / evict):\t"
+                               + 
byteCountToDisplaySize(cudaAllocAggSize.longValue()) + "("
+                               + 
byteCountToDisplaySize(cudaAllocPeakSize.longValue()) + ") / "
+                               + 
byteCountToDisplaySize(cudaEvictAggSize.longValue()) + ".\n");
                sb.append("GPU mem tx time  (toDev(d2f/s2d) / fromDev(f2d/s2h) 
/ evict(d2s/size)):\t"
                                + String.format("%.3f", 
cudaToDevTime.longValue()*1e-9) + "("
                                + String.format("%.3f", 
cudaDouble2FloatTime.longValue()*1e-9)+ "/"

http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java 
b/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
index 71a1e28..d9d9337 100644
--- a/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
+++ b/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java
@@ -519,10 +519,6 @@ class ValueWrapper {
        long _clen;
        long _nnz;
        
-       // This is only used in write-mode until the writing to the disk is 
completed.
-       // It also prevents the _softRef from being garbage collected while it 
is written.
-       volatile DataWrapper _strongRef; 
-       
        ValueWrapper(DataWrapper data, boolean isInReadOnlyMode) {
                _lock = new Object();
                _isInReadOnlyMode = isInReadOnlyMode;
@@ -530,12 +526,10 @@ class ValueWrapper {
                if(!_isInReadOnlyMode && !isDummyValue) {
                        // Aggressive write to disk when the cache is used in 
the write-mode.
                        // This avoids the need to depend on finalize to 
perform writing.
-                       _strongRef = data;
                        Thread t = new Thread() {
                            public void run() {
                                try {
-                                               _strongRef.write(true);
-                                               _strongRef = null; // Reset the 
strong reference after aggresive writing
+                                       data.write(true);
                                        } catch (IOException e) {
                                                throw new 
DMLRuntimeException("Error occured while aggressively writing the value to 
disk.", e);
                                        }

systemml git commit: [SYSTEMML-445] Added memory stats for GPU allocation/eviction

Reply via email to