Repository: systemml Updated Branches: refs/heads/master 69624850e -> f46279a17
[SYSTEMML-445] Added memory stats for GPU allocation/eviction - Also, reverted the shadow buffer to the original implementation as we are getting OOM for lstm scripts. This likely has to do with pessimistic GC. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f46279a1 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f46279a1 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f46279a1 Branch: refs/heads/master Commit: f46279a17031d3f8827923f6eddd614c3eac77d3 Parents: 6962485 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Thu Sep 20 14:56:51 2018 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Thu Sep 20 14:56:51 2018 -0700 ---------------------------------------------------------------------- conf/SystemML-config.xml.template | 8 +- .../gpu/context/GPUMemoryManager.java | 61 ++++---- .../instructions/gpu/context/GPUObject.java | 18 +-- .../instructions/gpu/context/ShadowBuffer.java | 154 +++++-------------- .../org/apache/sysml/utils/GPUStatistics.java | 29 ++++ .../apache/sysml/utils/PersistentLRUCache.java | 8 +- 6 files changed, 108 insertions(+), 170 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/conf/SystemML-config.xml.template ---------------------------------------------------------------------- diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template index 3925c4e..7b535c9 100644 --- a/conf/SystemML-config.xml.template +++ b/conf/SystemML-config.xml.template @@ -105,11 +105,9 @@ <!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) --> <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize> - <!-- Advanced optimization: maximum fraction of driver memory to use for GPU shadow buffer. - Shadow buffer is cleared eagerly on garbage collection to avoid OOM and is backed by org.apache.sysml.utils.PersistentLRUCache. - Setting this to zero disables shadow buffering. If you intend to train network larger than GPU memory size, - consider using large driver memory and setting this to a value greater than 0. --> - <sysml.gpu.eviction.shadow.bufferSize>0.5</sysml.gpu.eviction.shadow.bufferSize> + <!-- Advanced optimization: fraction of driver memory to use for GPU shadow buffer. This optimization is ignored for double precision. + By default, it is disabled (hence set to 0.0). If you intend to train network larger than GPU memory size, consider using single precision and setting this to 0.1. --> + <sysml.gpu.eviction.shadow.bufferSize>0.0</sysml.gpu.eviction.shadow.bufferSize> <!-- Fraction of available GPU memory to use. This is similar to TensorFlow's per_process_gpu_memory_fraction configuration property. (default: 0.9) --> <sysml.gpu.memory.util.factor>0.9</sysml.gpu.memory.util.factor> http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java index 033051a..57b76f6 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java @@ -191,7 +191,7 @@ public class GPUMemoryManager { GPUStatistics.cudaAllocCount.increment(); } if(printDebugMessage != null && (PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) ) { - LOG.info("Success: " + printDebugMessage + ":" + byteCountToDisplaySize(size)); + LOG.info("Success: " + printDebugMessage + ":" + GPUStatistics.byteCountToDisplaySize(size)); } return A; } catch(jcuda.CudaException e) { @@ -203,7 +203,7 @@ public class GPUMemoryManager { GPUStatistics.cudaAllocCount.increment(); } if(printDebugMessage != null && (PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) ) { - LOG.info("Failed: " + printDebugMessage + ":" + byteCountToDisplaySize(size)); + LOG.info("Failed: " + printDebugMessage + ":" + GPUStatistics.byteCountToDisplaySize(size)); LOG.info("GPU Memory info " + printDebugMessage + ":" + toString()); } return null; @@ -224,28 +224,15 @@ public class GPUMemoryManager { return "->" + stackTrace[index].getClassName() + "." + stackTrace[index].getMethodName() + "(" + stackTrace[index].getFileName() + ":" + stackTrace[index].getLineNumber() + ")"; } - /** - * Pretty printing utility to print bytes - * - * @param numBytes number of bytes - * @return a human-readable display value - */ - private String byteCountToDisplaySize(long numBytes) { - // return org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs rounding - if (numBytes < 1024) { - return numBytes + " bytes"; - } - else { - int exp = (int) (Math.log(numBytes) / 6.931471805599453); - return String.format("%.3f %sB", ((double)numBytes) / Math.pow(1024, exp), "KMGTP".charAt(exp-1)); - } - } public boolean canAllocateWithoutEviction(String opcode, long size) { return lazyCudaFreeMemoryManager.contains(opcode, size) || allocator.canAllocate(size) || lazyCudaFreeMemoryManager.containsRmvarPointerMinSize(opcode, size) ; } + long peakSize = 0; + long currentSize = 0; + /** * Allocate pointer of the given size in bytes. * @@ -255,12 +242,19 @@ public class GPUMemoryManager { */ public Pointer malloc(String opcode, long size) { if(size <= 0) { - throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size)); + throw new DMLRuntimeException("Cannot allocate memory of size " + GPUStatistics.byteCountToDisplaySize(size)); } if(DEBUG_MEMORY_LEAK) { LOG.info("GPU Memory info during malloc:" + toString()); } + if(ConfigurationManager.isStatistics()) { + GPUStatistics.cudaAllocAggSize.add(size); + currentSize += size; + peakSize = Math.max(currentSize, peakSize); + GPUStatistics.cudaAllocPeakSize.set(peakSize); + } + // Step 1: First try reusing exact match in rmvarGPUPointers to avoid holes in the GPU memory Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, size); @@ -358,7 +352,7 @@ public class GPUMemoryManager { } if(A == null) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + byteCountToDisplaySize(size) + ". \n " + throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + GPUStatistics.byteCountToDisplaySize(size) + ". \n " + toString()); } @@ -377,6 +371,10 @@ public class GPUMemoryManager { boolean eagerDelete = true; if(gpuObj.isDirty()) { // Eviction + if(ConfigurationManager.isStatistics()) { + long size = gpuObj.getSizeOnDevice(); + GPUStatistics.cudaEvictAggSize.add(size); + } gpuObj.copyFromDeviceToHost(opcode, true, eagerDelete); } else { @@ -416,7 +414,7 @@ public class GPUMemoryManager { if(allPointers.containsKey(toFree)) { long size = allPointers.get(toFree).getSizeInBytes(); if(LOG.isTraceEnabled()) { - LOG.trace("Free-ing up the pointer of size " + byteCountToDisplaySize(size)); + LOG.trace("Free-ing up the pointer of size " + GPUStatistics.byteCountToDisplaySize(size)); } allPointers.remove(toFree); lazyCudaFreeMemoryManager.removeIfPresent(size, toFree); @@ -441,6 +439,10 @@ public class GPUMemoryManager { public void free(String opcode, Pointer toFree, boolean eager) throws DMLRuntimeException { if(LOG.isTraceEnabled()) LOG.trace("Free-ing the pointer with eager=" + eager); + long size = allPointers.get(toFree).getSizeInBytes(); + if(ConfigurationManager.isStatistics()) { + currentSize -= size; + } if (eager) { long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; guardedCudaFree(toFree); @@ -451,7 +453,6 @@ public class GPUMemoryManager { LOG.info("GPU memory info before failure:" + toString()); throw new RuntimeException("ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"); } - long size = allPointers.get(toFree).getSizeInBytes(); lazyCudaFreeMemoryManager.add(size, toFree); } } @@ -604,24 +605,24 @@ public class GPUMemoryManager { ret.append(String.format("%-35s%-15s%-15s%-15s\n", "", "Num Objects", "Num Pointers", "Size")); ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked Dirty GPU objects", - numUnlockedDirtyGPUObjects, numUnlockedDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects))); + numUnlockedDirtyGPUObjects, numUnlockedDirtyPointers, GPUStatistics.byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects))); ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked NonDirty GPU objects", - numUnlockedNonDirtyGPUObjects, numUnlockedNonDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects))); + numUnlockedNonDirtyGPUObjects, numUnlockedNonDirtyPointers, GPUStatistics.byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects))); ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked GPU objects", - numLockedGPUObjects, numLockedPointers, byteCountToDisplaySize(sizeOfLockedGPUObjects))); + numLockedGPUObjects, numLockedPointers, GPUStatistics.byteCountToDisplaySize(sizeOfLockedGPUObjects))); ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Cached rmvar-ed pointers", - "-", lazyCudaFreeMemoryManager.getNumPointers(), byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated()))); + "-", lazyCudaFreeMemoryManager.getNumPointers(), GPUStatistics.byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated()))); ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Non-matrix/non-cached pointers", - "-", potentiallyLeakyPointers.size(), byteCountToDisplaySize(totalSizePotentiallyLeakyPointers))); + "-", potentiallyLeakyPointers.size(), GPUStatistics.byteCountToDisplaySize(totalSizePotentiallyLeakyPointers))); ret.append(String.format("%-35s%-15s%-15s%-15s\n", "All pointers", - "-", allPointers.size(), byteCountToDisplaySize(totalMemoryAllocated))); + "-", allPointers.size(), GPUStatistics.byteCountToDisplaySize(totalMemoryAllocated))); long free[] = { 0 }; long total[] = { 0 }; cudaMemGetInfo(free, total); ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Free mem (from cudaMemGetInfo)", - "-", "-", byteCountToDisplaySize(free[0]))); + "-", "-", GPUStatistics.byteCountToDisplaySize(free[0]))); ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Total mem (from cudaMemGetInfo)", - "-", "-", byteCountToDisplaySize(total[0]))); + "-", "-", GPUStatistics.byteCountToDisplaySize(total[0]))); ret.append("====================================================\n"); return ret.toString(); } http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 43e2727..72d3170 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -111,11 +111,7 @@ public class GPUObject { */ public Pointer getDensePointer() { if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && getJcudaSparseMatrixPtr() == null) { - try { - shadowBuffer.moveToDevice(); - } catch (IOException e) { - throw new DMLRuntimeException("Error moving the data from shadow buffer to the device", e); - } + shadowBuffer.moveToDevice(); } return jcudaDenseMatrixPtr; } @@ -939,21 +935,13 @@ public class GPUObject { else { // If already copied to shadow buffer as part of previous eviction and this is not an eviction (i.e. bufferpool call for subsequent CP/Spark instruction), // then copy from shadow buffer to MatrixObject. - try { - shadowBuffer.moveToHost(); - } catch (IOException e) { - throw new DMLRuntimeException("Error moving the data from shadow buffer to the host memory", e); - } + shadowBuffer.moveToHost(); return; } } else if(shadowBuffer.isEligibleForBuffering(isEviction, eagerDelete)) { // Perform shadow buffering if (1) single precision, (2) during eviction, (3) for dense matrices, and (4) if the given matrix can fit into the shadow buffer. - try { - shadowBuffer.moveFromDevice(instName); - } catch (IOException e) { - throw new DMLRuntimeException("Error moving the data from the device to the shadow buffer", e); - } + shadowBuffer.moveFromDevice(instName); return; } else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) { http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java index 4c534a0..88ea972 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java @@ -20,65 +20,41 @@ package org.apache.sysml.runtime.instructions.gpu.context; import static jcuda.runtime.JCuda.cudaMemcpy; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.concurrent.atomic.AtomicLong; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; -import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.utils.GPUStatistics; -import org.apache.sysml.utils.PersistentLRUCache; import jcuda.Pointer; +import jcuda.Sizeof; -/** - * Shadow buffer is a temporary staging area used during eviction. - * It is eagerly deleted and backed using the local filesystem in case of Garbage Collection - * or if the staging memory size exceeds the user-specified size. - * This is needed to respect SystemML's memory estimates, while still allowing - * for caching in case of GPU plans. - */ public class ShadowBuffer { private static final Log LOG = LogFactory.getLog(ShadowBuffer.class.getName()); - private static PersistentLRUCache CACHE; - private static AtomicLong UNIQUE_ID = new AtomicLong(); - private static long EVICTION_SHADOW_BUFFER_MAX_BYTES; - final GPUObject gpuObj; - boolean isBuffered = false; - String fileName; - public static boolean isEnabled() { - if(CACHE == null && EVICTION_SHADOW_BUFFER_MAX_BYTES >= 0) { + GPUObject gpuObj; + float[] shadowPointer = null; + private static boolean _warnedAboutShadowBuffer = false; + private static long EVICTION_SHADOW_BUFFER_CURR_BYTES = 0; + private static long EVICTION_SHADOW_BUFFER_MAX_BYTES; + static { + if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) { + EVICTION_SHADOW_BUFFER_MAX_BYTES = 0; + } + else { double shadowBufferSize = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE); - if(shadowBufferSize <= 0) { - EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor optimization to avoid unnecessary invoking configuration manager. - } - else { - if(shadowBufferSize > 1) - throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE); - EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize); - try { - CACHE = new PersistentLRUCache(EVICTION_SHADOW_BUFFER_MAX_BYTES); - } catch(IOException e) { - LOG.warn("Unable to create a temporary directory for shadow buffering on the local filesystem; disabling shadow buffering:" + e.getMessage()); - EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor optimization to avoid checking for file permission. - } - } + if(shadowBufferSize < 0 || shadowBufferSize > 1) + throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE); + EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize); } - return CACHE != null; } public ShadowBuffer(GPUObject gpuObj) { - if(isEnabled()) - fileName = "shadow_" + UNIQUE_ID.incrementAndGet(); this.gpuObj = gpuObj; - } /** @@ -87,39 +63,19 @@ public class ShadowBuffer { * @return true if the gpu object is shadow buffered */ public boolean isBuffered() { - return isBuffered; - } - - private static long getSizeOfDataType(long numElems) { - return numElems * ((long) LibMatrixCUDA.sizeOfDataType); + return shadowPointer != null; } /** * Move the data from GPU to shadow buffer * @param instName name of the instruction - * @throws IOException if error - * @throws FileNotFoundException if error */ - public void moveFromDevice(String instName) throws FileNotFoundException, IOException { + public void moveFromDevice(String instName) { long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; int numElems = GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns()); - - if(isDoublePrecision()) { - double [] shadowPointer = new double[numElems]; - cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost); - CACHE.put(fileName, shadowPointer); - isBuffered = true; - } - else if(isSinglePrecision()) { - float [] shadowPointer = new float[numElems]; - cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost); - CACHE.put(fileName, shadowPointer); - isBuffered = true; - } - else { - throw new DMLRuntimeException("Unsupported datatype"); - } - + shadowPointer = new float[numElems]; + EVICTION_SHADOW_BUFFER_CURR_BYTES += getSizeOfFloat(shadowPointer.length); + cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost); gpuObj.getGPUContext().cudaFreeHelper(instName, gpuObj.jcudaDenseMatrixPtr, true); gpuObj.jcudaDenseMatrixPtr = null; if (ConfigurationManager.isStatistics()) { @@ -131,36 +87,24 @@ public class ShadowBuffer { } } - - private static boolean isDoublePrecision() { - return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.DOUBLE; + private long getSizeOfFloat(long numElems) { + return numElems*Sizeof.FLOAT; } - private static boolean isSinglePrecision() { - return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT; + private long getSizeOfDataType(long numElems) { + return numElems*LibMatrixCUDA.sizeOfDataType; } /** * Move the data from shadow buffer to Matrix object - * @throws IOException if error - * @throws FileNotFoundException if error */ - public void moveToHost() throws FileNotFoundException, IOException { + public void moveToHost() { long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; MatrixBlock tmp = new MatrixBlock(GPUObject.toIntExact(gpuObj.mat.getNumRows()), GPUObject.toIntExact(gpuObj.mat.getNumColumns()), false); tmp.allocateDenseBlock(); double [] tmpArr = tmp.getDenseBlockValues(); - if(isDoublePrecision()) { - System.arraycopy(CACHE.getAsDoubleArray(fileName), 0, tmpArr, 0, tmpArr.length); - } - else if(isSinglePrecision()) { - float [] shadowPointer = CACHE.getAsFloatArray(fileName); - for(int i = 0; i < shadowPointer.length; i++) { - tmpArr[i] = shadowPointer[i]; - } - } - else { - throw new DMLRuntimeException("Unsupported datatype"); + for(int i = 0; i < shadowPointer.length; i++) { + tmpArr[i] = shadowPointer[i]; } gpuObj.mat.acquireModify(tmp); gpuObj.mat.release(); @@ -178,28 +122,12 @@ public class ShadowBuffer { /** * Move the data from shadow buffer to GPU - * @throws IOException if error - * @throws FileNotFoundException if error */ - public void moveToDevice() throws FileNotFoundException, IOException { + public void moveToDevice() { long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; - int length; Pointer shadowDevicePointer; - if(isDoublePrecision()) { - double [] shadowPointer = CACHE.getAsDoubleArray(fileName); - length = shadowPointer.length; - shadowDevicePointer = Pointer.to(shadowPointer); - } - else if(isSinglePrecision()) { - float [] shadowPointer = CACHE.getAsFloatArray(fileName); - length = shadowPointer.length; - shadowDevicePointer = Pointer.to(shadowPointer); - } - else { - throw new DMLRuntimeException("Unsupported datatype"); - } - long numBytes = getSizeOfDataType(length); + long numBytes = getSizeOfDataType(shadowPointer.length); gpuObj.jcudaDenseMatrixPtr = gpuObj.getGPUContext().allocate(null, numBytes); - cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, shadowDevicePointer, numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice); + cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, Pointer.to(shadowPointer), numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice); clearShadowPointer(); if (ConfigurationManager.isStatistics()) { long totalTime = System.nanoTime() - start; @@ -216,14 +144,14 @@ public class ShadowBuffer { * @return true if the given GPU object is eligible to be shadow buffered */ public boolean isEligibleForBuffering(boolean isEviction, boolean eagerDelete) { - if(isEnabled() && isEviction && eagerDelete && !gpuObj.isDensePointerNull()) { - long numBytes = getSizeOfDataType(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns()); - if(EVICTION_SHADOW_BUFFER_MAX_BYTES <= numBytes) { - return false; // Don't attempt to cache very large GPU objects. - } - else { - return true; // Dense GPU objects is eligible for shadow buffering when called during eviction and is being eagerly deleted. + if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && isEviction && eagerDelete && !gpuObj.isDensePointerNull()) { + long numBytes = getSizeOfFloat(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns()); + boolean ret = EVICTION_SHADOW_BUFFER_CURR_BYTES + numBytes <= EVICTION_SHADOW_BUFFER_MAX_BYTES; + if(!ret && !_warnedAboutShadowBuffer) { + LOG.warn("Shadow buffer is full, so using CP bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize."); + _warnedAboutShadowBuffer = true; } + return ret; } else { return false; @@ -234,9 +162,9 @@ public class ShadowBuffer { * Removes the content from shadow buffer */ public void clearShadowPointer() { - if(CACHE.containsKey(fileName)) { - CACHE.remove(fileName); - isBuffered = false; + if(shadowPointer != null) { + EVICTION_SHADOW_BUFFER_CURR_BYTES -= getSizeOfFloat(shadowPointer.length); } + shadowPointer = null; } -} +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/utils/GPUStatistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java index e748057..541850d 100644 --- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java +++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java @@ -26,6 +26,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.LongAdder; import org.apache.sysml.conf.ConfigurationManager; @@ -78,6 +79,10 @@ public class GPUStatistics { public static LongAdder cudaAllocSuccessCount = new LongAdder(); public static LongAdder cudaAllocFailedCount = new LongAdder(); public static LongAdder cudaAllocReuseCount = new LongAdder(); + + public static LongAdder cudaAllocAggSize = new LongAdder(); + public static AtomicLong cudaAllocPeakSize = new AtomicLong(); + public static LongAdder cudaEvictAggSize = new LongAdder(); // Per instruction miscellaneous timers. // Used to record events in a CP Heavy Hitter instruction and @@ -116,6 +121,9 @@ public class GPUStatistics { cudaDouble2FloatCount.reset(); cudaForcedClearLazyFreedEvictTime.reset(); cudaForcedClearUnpinnedEvictTime.reset(); + cudaAllocAggSize.reset(); + cudaAllocPeakSize.set(0); + cudaEvictAggSize.reset(); cudaAllocCount.reset(); cudaDeAllocCount.reset(); cudaToDevCount.reset(); @@ -218,6 +226,23 @@ public class GPUStatistics { } return sb.toString(); } + + /** + * Pretty printing utility to print bytes + * + * @param numBytes number of bytes + * @return a human-readable display value + */ + public static String byteCountToDisplaySize(long numBytes) { + // return org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs rounding + if (numBytes < 1024) { + return numBytes + " bytes"; + } + else { + int exp = (int) (Math.log(numBytes) / 6.931471805599453); + return String.format("%.3f %sB", ((double)numBytes) / Math.pow(1024, exp), "KMGTP".charAt(exp-1)); + } + } /** * Used to print out cuda timers & counters @@ -242,6 +267,10 @@ public class GPUStatistics { + cudaAllocReuseCount.longValue() +") / " + cudaDeAllocCount.longValue() + " / " + cudaMemSet0Count.longValue() + ".\n"); + sb.append("GPU mem size (alloc (peak) / evict):\t" + + byteCountToDisplaySize(cudaAllocAggSize.longValue()) + "(" + + byteCountToDisplaySize(cudaAllocPeakSize.longValue()) + ") / " + + byteCountToDisplaySize(cudaEvictAggSize.longValue()) + ".\n"); sb.append("GPU mem tx time (toDev(d2f/s2d) / fromDev(f2d/s2h) / evict(d2s/size)):\t" + String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "(" + String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9)+ "/" http://git-wip-us.apache.org/repos/asf/systemml/blob/f46279a1/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java b/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java index 71a1e28..d9d9337 100644 --- a/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java +++ b/src/main/java/org/apache/sysml/utils/PersistentLRUCache.java @@ -519,10 +519,6 @@ class ValueWrapper { long _clen; long _nnz; - // This is only used in write-mode until the writing to the disk is completed. - // It also prevents the _softRef from being garbage collected while it is written. - volatile DataWrapper _strongRef; - ValueWrapper(DataWrapper data, boolean isInReadOnlyMode) { _lock = new Object(); _isInReadOnlyMode = isInReadOnlyMode; @@ -530,12 +526,10 @@ class ValueWrapper { if(!_isInReadOnlyMode && !isDummyValue) { // Aggressive write to disk when the cache is used in the write-mode. // This avoids the need to depend on finalize to perform writing. - _strongRef = data; Thread t = new Thread() { public void run() { try { - _strongRef.write(true); - _strongRef = null; // Reset the strong reference after aggresive writing + data.write(true); } catch (IOException e) { throw new DMLRuntimeException("Error occured while aggressively writing the value to disk.", e); }