Repository: systemml Updated Branches: refs/heads/master aed66df13 -> bdf27084b
[SYSTEMML-445] Bugfix and GPU shadow buffer for single precision

- Added shadow-buffer logic that uses CPU memory as temporary storage for evicted matrices. This avoids the unnecessary overhead of float-to-double conversion, nnz computation, potential serialization, etc. The shadow buffer is disabled by default and can be enabled via the configuration property sysml.gpu.eviction.shadow.bufferSize (a usage sketch is appended at the end of this message).
- Fixed a bug that occurs when data generated by an MLContext with the GPU disabled is consumed by an MLContext with the GPU enabled. Without this fix, a null pointer exception is thrown because the GPU pointer data structure of the data produced by the first MLContext is never initialized.
- Added additional GPU memory-related statistics.

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/bdf27084
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/bdf27084
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/bdf27084

Branch: refs/heads/master
Commit: bdf27084bd115c69a00a521a77c66a62d8b657b7
Parents: aed66df
Author: Niketan Pansare <[email protected]>
Authored: Thu Aug 2 11:45:55 2018 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Thu Aug 2 11:51:39 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template | 4 +
 .../java/org/apache/sysml/api/DMLScript.java | 2 +
 .../apache/sysml/api/ScriptExecutorUtils.java | 21 ++
 .../java/org/apache/sysml/conf/DMLConfig.java | 4 +-
 .../controlprogram/caching/CacheableData.java | 9 +-
 .../context/GPULazyCudaFreeMemoryManager.java | 32 ++-
 .../gpu/context/GPUMatrixMemoryManager.java | 44 +---
 .../gpu/context/GPUMemoryManager.java | 231 +++++++------------
 .../instructions/gpu/context/GPUObject.java | 110 +++++++--
 .../org/apache/sysml/utils/GPUStatistics.java | 87 ++++---
 10 files changed, 307 insertions(+), 237 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 05d6a1a..033aadb 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -104,4 +104,8 @@
 <!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) -->
 <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
+
+ <!-- Advanced optimization: fraction of driver memory to use for GPU shadow buffer. This optimization is ignored for double precision.
+ By default, it is disabled (hence set to 0.0).
If you intend to train network larger than GPU memory size, consider using single precision and setting this to 0.1 --> + <sysml.gpu.eviction.shadow.bufferSize>0.0</sysml.gpu.eviction.shadow.bufferSize> </root> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/api/DMLScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java index 50a23aa..bfc9da5 100644 --- a/src/main/java/org/apache/sysml/api/DMLScript.java +++ b/src/main/java/org/apache/sysml/api/DMLScript.java @@ -122,6 +122,8 @@ public class DMLScript public static String FLOATING_POINT_PRECISION = "double"; // data type to use internally public static EvictionPolicy GPU_EVICTION_POLICY = EvictionPolicy.ALIGN_MEMORY; // currently employed GPU eviction policy public static boolean PRINT_GPU_MEMORY_INFO = false; // whether to print GPU memory-related information + public static long EVICTION_SHADOW_BUFFER_MAX_BYTES = 0; // maximum number of bytes to use for shadow buffer + public static long EVICTION_SHADOW_BUFFER_CURR_BYTES = 0; // number of bytes to use for shadow buffer /** * Global variable indicating the script type (DML or PYDML). Can be used http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java index 0b4c7ab..13d0c78 100644 --- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java +++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java @@ -31,6 +31,7 @@ import org.apache.sysml.runtime.controlprogram.Program; import org.apache.sysml.runtime.controlprogram.caching.CacheableData; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; @@ -80,6 +81,8 @@ public class ScriptExecutorUtils { DMLScript.PRINT_GPU_MEMORY_INFO = dmlconf.getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO); DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU); CacheableData.CACHING_BUFFER_SIZE = dmlconf.getDoubleValue(DMLConfig.CACHING_BUFFER_SIZE); + if(CacheableData.CACHING_BUFFER_SIZE < 0 || CacheableData.CACHING_BUFFER_SIZE > 1) + throw new RuntimeException("Incorrect value (" + CacheableData.CACHING_BUFFER_SIZE + ") for the configuration " + DMLConfig.CACHING_BUFFER_SIZE); DMLScript.EAGER_CUDA_FREE = dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE); DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN); NativeHelper.initialize(dmlconf.getTextValue(DMLConfig.NATIVE_BLAS_DIR), dmlconf.getTextValue(DMLConfig.NATIVE_BLAS).trim()); @@ -87,7 +90,25 @@ public class ScriptExecutorUtils { if(DMLScript.USE_ACCELERATOR) { DMLScript.FLOATING_POINT_PRECISION = dmlconf.getTextValue(DMLConfig.FLOATING_POINT_PRECISION); org.apache.sysml.runtime.matrix.data.LibMatrixCUDA.resetFloatingPointPrecision(); + if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) { + 
DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = 0; + } + else { + double shadowBufferSize = dmlconf.getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE); + if(shadowBufferSize < 0 || shadowBufferSize > 1) + throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration " + DMLConfig.EVICTION_SHADOW_BUFFERSIZE); + DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize); + if(DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES > 0 && + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES > DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES) { + // This will be printed in a very rare situation when: + // 1. There is a memory leak which leads to non-cleared shadow buffer OR + // 2. MLContext is registering to bunch of outputs that are all part of shadow buffer + System.out.println("WARN: Cannot use the shadow buffer due to potentially cached GPU objects. Current shadow buffer size (in bytes):" + + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + " > Max shadow buffer size (in bytes):" + DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES); + } + } } + boolean exceptionThrown = false; http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/conf/DMLConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index 9f08c3c..4aad400 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -94,6 +94,7 @@ public class DMLConfig public static final String GPU_MEMORY_UTILIZATION_FACTOR = "sysml.gpu.memory.util.factor"; public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo"; + public static final String EVICTION_SHADOW_BUFFERSIZE = "sysml.gpu.eviction.shadow.bufferSize"; // supported prefixes for custom map/reduce configurations public static final String PREFIX_MAPRED = "mapred"; @@ -136,6 +137,7 @@ public class DMLConfig _defaultVals.put(NATIVE_BLAS_DIR, "none" ); _defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" ); _defaultVals.put(PRINT_GPU_MEMORY_INFO, "false" ); + _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE, "0.0" ); _defaultVals.put(STATS_MAX_WRAP_LEN, "30" ); _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); _defaultVals.put(AVAILABLE_GPUS, "-1"); @@ -426,7 +428,7 @@ public class DMLConfig COMPRESSED_LINALG, CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE, - AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY + AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, EVICTION_SHADOW_BUFFERSIZE }; StringBuilder sb = new StringBuilder(); http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java index 0265c33..b2a78d4 100644 --- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java @@ -410,7 +410,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data getCache(); //call acquireHostRead if gpuHandle is set as well as is allocated - if( DMLScript.USE_ACCELERATOR ) { + if( DMLScript.USE_ACCELERATOR && _gpuObjects != null ) { boolean copiedFromGPU = false; for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) { GPUObject gObj = kv.getValue(); @@ -617,11 +617,12 @@ public abstract class CacheableData<T extends CacheBlock> extends Data _rddHandle.setBackReference(null); if( _bcHandle != null ) _bcHandle.setBackReference(null); - if( _gpuObjects != null ) + if( _gpuObjects != null ) { for (GPUObject gObj : _gpuObjects.values()) if (gObj != null) gObj.clearData(null, DMLScript.EAGER_CUDA_FREE); - + } + // change object state EMPTY setDirty(false); setEmpty(); @@ -684,7 +685,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat); - if( DMLScript.USE_ACCELERATOR ) { + if( DMLScript.USE_ACCELERATOR && _gpuObjects != null ) { boolean copiedFromGPU = false; for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) { GPUObject gObj = kv.getValue(); http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java index c90beef..b619fa9 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java @@ -29,6 +29,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; +import org.apache.sysml.utils.GPUStatistics; import jcuda.Pointer; @@ -55,7 +56,16 @@ public class GPULazyCudaFreeMemoryManager { if (rmvarGPUPointers.containsKey(size)) { if(LOG.isTraceEnabled()) LOG.trace("Getting rmvar-ed pointers for size:" + size); + boolean measureTime = opcode != null && DMLScript.FINEGRAINED_STATISTICS; + long t0 = measureTime ? 
System.nanoTime() : 0; Pointer A = remove(rmvarGPUPointers, size); // remove from rmvarGPUPointers as you are not calling cudaFree + long totalTime = System.nanoTime() - t0; + if(DMLScript.STATISTICS) { + GPUStatistics.cudaAllocReuseCount.increment(); + } + if(measureTime) { + GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_REUSE, totalTime); + } return A; } else { @@ -63,6 +73,18 @@ public class GPULazyCudaFreeMemoryManager { } } + /** + * Convenient method to add misc timers + * + * @param opcode opcode + * @param instructionLevelTimer member of GPUInstruction + * @param startTime start time + */ + void addMiscTime(String opcode, String instructionLevelTimer, long startTime) { + if (opcode != null && DMLScript.FINEGRAINED_STATISTICS) + GPUStatistics.maintainCPMiscTimes(opcode, instructionLevelTimer, System.nanoTime() - startTime); + } + public Set<Pointer> getAllPointers() { return rmvarGPUPointers.values().stream().flatMap(ptrs -> ptrs.stream()).collect(Collectors.toSet()); } @@ -82,9 +104,15 @@ public class GPULazyCudaFreeMemoryManager { Optional<Long> toClear = rmvarGPUPointers.entrySet().stream().filter(e -> e.getValue().size() > 0).map(e -> e.getKey()) .filter(size -> size >= minSize).min((s1, s2) -> s1 < s2 ? -1 : 1); if(toClear.isPresent()) { - long t0 = opcode != null && DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + boolean measureTime = opcode != null && DMLScript.FINEGRAINED_STATISTICS; + long t0 = measureTime ? System.nanoTime() : 0; Pointer A = remove(rmvarGPUPointers, toClear.get()); // remove from rmvarGPUPointers as you are not calling cudaFree - gpuManager.addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0); + if(measureTime) { + gpuManager.addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0); + } + if(DMLScript.STATISTICS) { + GPUStatistics.cudaAllocReuseCount.increment(); + } return A; } return null; http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java index 610df23..cbb8d4e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java @@ -18,9 +18,7 @@ */ package org.apache.sysml.runtime.instructions.gpu.context; -import java.util.Comparator; import java.util.HashSet; -import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -54,7 +52,10 @@ public class GPUMatrixMemoryManager { long getWorstCaseContiguousMemorySize(GPUObject gpuObj) { long ret = 0; if(!gpuObj.isDensePointerNull()) { - ret = gpuManager.allPointers.get(gpuObj.getDensePointer()).getSizeInBytes(); + if(gpuObj.shadowPointer == null) + ret = gpuManager.allPointers.get(gpuObj.getDensePointer()).getSizeInBytes(); + else + ret = 0; // evicted hence no contiguous memory on GPU } else if(gpuObj.getJcudaSparseMatrixPtr() != null) { CSRPointer sparsePtr = gpuObj.getJcudaSparseMatrixPtr(); @@ -81,6 +82,7 @@ public class GPUMatrixMemoryManager { LOG.warn("Matrix allocated in both dense and sparse format"); } if(!gObj.isDensePointerNull()) { + // && gObj.evictedDenseArr == null - Ignore evicted array ret.add(gObj.getDensePointer()); } 
if(gObj.getSparseMatrixCudaPointer() != null) { @@ -107,16 +109,6 @@ public class GPUMatrixMemoryManager { HashSet<GPUObject> gpuObjects = new HashSet<>(); /** - * Get GPUObjects from the first memory sections "Matrix Memory" - * @param locked return locked GPU objects if true - * @param dirty return dirty GPU objects if true - * @return set of GPU Objects - */ - Set<GPUObject> getGPUObjects(boolean locked, boolean dirty) { - return gpuObjects.stream().filter(gObj -> gObj.isLocked() == locked && gObj.isDirty() == dirty).collect(Collectors.toSet()); - } - - /** * Return all pointers in the first section * @return all pointers in this section */ @@ -135,32 +127,6 @@ public class GPUMatrixMemoryManager { } /** - * Clear the memory of the gpu object that matches the provided parameters - * - * @param locked is locked - * @param dirty is dirty - * @param minSize of atleast given size - * @param comparator sorting comparator in case there are more than one gpu object that matches above parameters - * @param opcode instruction code - * @return true if a gpu object satisfies the above condition else false - * @throws DMLRuntimeException if error occurs - */ - boolean clear(boolean locked, boolean dirty, long minSize, Comparator<GPUObject> comparator, String opcode) throws DMLRuntimeException { - Optional<GPUObject> toClear = getGPUObjects(locked, dirty).stream() - .filter(gObj -> getWorstCaseContiguousMemorySize(gObj) >= minSize) - .max(comparator); - if(toClear.isPresent()) { - GPUObject gObj = toClear.get(); - if(gObj.dirty) - gObj.copyFromDeviceToHost(opcode, true, true); // Perform eviction if dirty - else - gObj.clearData(opcode, true); - gpuObjects.remove(gObj); - } - return toClear.isPresent(); - } - - /** * Clear all unlocked gpu objects * * @param opcode instruction code http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java index 45611a4..acfba66 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java @@ -29,6 +29,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map.Entry; +import java.util.Optional; import java.util.Set; import java.util.concurrent.atomic.LongAdder; import java.util.stream.Collectors; @@ -159,14 +160,37 @@ public class GPUMemoryManager { * * @param A pointer * @param size size in bytes + * @param printDebugMessage debug message * @return allocated pointer */ - private Pointer cudaMallocNoWarn(Pointer A, long size) { + private Pointer cudaMallocNoWarn(Pointer A, long size, String printDebugMessage) { + long t0 = DMLScript.STATISTICS ? 
System.nanoTime() : 0; try { cudaMalloc(A, size); allPointers.put(A, new PointerInfo(size)); + if(DMLScript.STATISTICS) { + long totalTime = System.nanoTime() - t0; + GPUStatistics.cudaAllocSuccessTime.add(totalTime); + GPUStatistics.cudaAllocSuccessCount.increment(); + GPUStatistics.cudaAllocTime.add(totalTime); + GPUStatistics.cudaAllocCount.increment(); + } + if(printDebugMessage != null && (DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) ) { + LOG.info("Success: " + printDebugMessage + ":" + byteCountToDisplaySize(size)); + } return A; } catch(jcuda.CudaException e) { + if(DMLScript.STATISTICS) { + long totalTime = System.nanoTime() - t0; + GPUStatistics.cudaAllocFailedTime.add(System.nanoTime() - t0); + GPUStatistics.cudaAllocFailedCount.increment(); + GPUStatistics.cudaAllocTime.add(totalTime); + GPUStatistics.cudaAllocCount.increment(); + } + if(printDebugMessage != null && (DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) ) { + LOG.info("Failed: " + printDebugMessage + ":" + byteCountToDisplaySize(size)); + LOG.info("GPU Memory info " + printDebugMessage + ":" + toString()); + } return null; } } @@ -218,180 +242,88 @@ public class GPUMemoryManager { LOG.info("GPU Memory info during malloc:" + toString()); } - long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; - long mallocStart = 0; // Step 1: First try reusing exact match in rmvarGPUPointers to avoid holes in the GPU memory Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, size); - if(A != null) - addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0); Pointer tmpA = (A == null) ? new Pointer() : null; // Step 2: Allocate a new pointer in the GPU memory (since memory is available) + // Step 3 has potential to create holes as well as limit future reuse, hence perform this step before step 3. if(A == null && size <= getAvailableMemory()) { - mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0; - A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues - addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart); - if(LOG.isTraceEnabled()) { - if(A == null) - LOG.trace("Couldnot allocate a new pointer in the GPU memory:" + byteCountToDisplaySize(size)); - else - LOG.trace("Allocated a new pointer in the GPU memory:" + byteCountToDisplaySize(size)); - } + // This can fail in case of fragmented memory, so don't issue any warning + A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer"); } - // Reusing one rmvar-ed pointer (Step 3) is preferred to reusing multiple pointers as the latter may not be contiguously allocated. - // (Step 4 or using any other policy that doesnot take memory into account). - // Step 3: Try reusing non-exact match entry of rmvarGPUPointers if(A == null) { A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size); if(A != null) { guardedCudaFree(A); - mallocStart = DMLScript.STATISTICS ? 
System.nanoTime() : 0; - A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues - addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart); - if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { - if(A == null) - LOG.info("Couldnot reuse non-exact match of rmvarGPUPointers:" + byteCountToDisplaySize(size)); - else { - LOG.info("Reuses a non-exact match from rmvarGPUPointers:" + byteCountToDisplaySize(size)); - LOG.info("GPU Memory info after reusing a non-exact match from rmvarGPUPointers:" + toString()); - } - } + A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers"); + if(A == null) + LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers."); } } - // Step 3.b: An optimization missing so as not to over-engineer malloc: - // Try to find minimal number of contiguously allocated pointer. - + // Step 4: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU // Evictions of matrix blocks are expensive (as they might lead them to be written to disk in case of smaller CPU budget) // than doing cuda free/malloc/memset. So, rmvar-ing every blocks (step 4) is preferred to eviction (step 5). - - // Step 4: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU if(A == null) { lazyCudaFreeMemoryManager.clearAll(); if(size <= getAvailableMemory()) { - A = cudaMallocNoWarn(tmpA, size); - if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { - if(A == null) - LOG.info("Couldnot allocate a new pointer in the GPU memory after eager free:" + byteCountToDisplaySize(size)); - else { - LOG.info("Allocated a new pointer in the GPU memory after eager free:" + byteCountToDisplaySize(size)); - LOG.info("GPU Memory info after allocating new pointer post lazyCudaFreeMemoryManager.clearAll():" + toString()); - } - } + // This can fail in case of fragmented memory, so don't issue any warning + A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free"); } } - addMiscTime(opcode, GPUStatistics.cudaAllocTime, GPUStatistics.cudaAllocCount, GPUInstruction.MISC_TIMER_ALLOCATE, t0); - - // Step 5: Try eviction based on the given policy + // Step 5: Try eviction/clearing exactly one with size restriction if(A == null) { - t0 = DMLScript.STATISTICS ? 
System.nanoTime() : 0; - - // First, clear unlocked non-dirty matrices greater than or equal to size - // Comparator clears the largest matrix to avoid future evictions - boolean success = matrixMemoryManager.clear(false, false, size, SIMPLE_COMPARATOR_SORT_BY_SIZE, opcode); - if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { - if(success) { - LOG.info("Cleared an unlocked non-dirty matrix greater than or equal to " + byteCountToDisplaySize(size)); - LOG.info("GPU Memory info after clearing an unlocked non-dirty matrix:" + toString()); - } - else - LOG.info("No unlocked non-dirty matrix greater than or equal to " + byteCountToDisplaySize(size) + " found for clearing."); - } - if(!success) { - // First, clear unlocked dirty matrices greater than or equal to size using the eviction policy - // Comparator clears the largest matrix to avoid future evictions - if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { - LOG.info("GPU Memory info before eviction:" + toString()); - } - success = matrixMemoryManager.clear(false, true, size, new EvictionPolicyBasedComparator(size), opcode); - // JCuda.cudaDeviceSynchronize(); - if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { - if(success) { - LOG.info("Evicted an unlocked dirty matrix greater than or equal to " + byteCountToDisplaySize(size)); - LOG.info("GPU Memory info after evicting an unlocked dirty matrix:" + toString()); - } - else - LOG.info("No unlocked dirty matrix greater than or equal to " + byteCountToDisplaySize(size) + " found for evicted."); - } - - if(!success) { - // Minor optimization: clear all unlocked non-dirty matrices before attempting eviction - // Delete all non-dirty - List<GPUObject> unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream() - .filter(gpuObj -> !gpuObj.isLocked() && !gpuObj.isDirty()).collect(Collectors.toList()); - matrixMemoryManager.gpuObjects.removeAll(unlockedGPUObjects); - for(GPUObject toBeRemoved : unlockedGPUObjects) { - toBeRemoved.clearData(opcode, true); - } - if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { - LOG.info("GPU Memory info after clearing all unlocked non-dirty matrices:" + toString()); - } - mallocStart = DMLScript.STATISTICS ? 
System.nanoTime() : 0; - A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues - addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart); - - // --------------------------------------------------------------- - // Evict unlocked GPU objects one-by-one and try malloc - unlockedGPUObjects = null; - if(A == null) { - unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream() - .filter(gpuObj -> !gpuObj.isLocked() && gpuObj.isDirty()).collect(Collectors.toList()); - Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size)); - while(A == null && unlockedGPUObjects.size() > 0) { - if(DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.ALIGN_MEMORY) { - // TODO: Optimize later using sliding window - // Evict as many sequential dense objects from back of the queue as possible - long neededSize = size; - while(neededSize >= 0 && unlockedGPUObjects.size() > 0) { - GPUObject gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1); - neededSize -= matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj); - gpuObj.copyFromDeviceToHost(opcode, true, true); - } - } - else { - GPUObject gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1); - gpuObj.copyFromDeviceToHost(opcode, true, true); - } - mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0; - A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues - addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart); - } - if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { - // greater than or equal to " + byteCountToDisplaySize(size) - LOG.info("GPU Memory info after eviction:" + toString()); - } - if(unlockedGPUObjects != null && unlockedGPUObjects.size() == 0) { - LOG.warn("Evicted all unlocked matrices"); - } - } - + long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; + Optional<GPUObject> sizeBasedUnlockedGPUObjects = matrixMemoryManager.gpuObjects.stream() + .filter(gpuObj -> !gpuObj.isLocked() && matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size) + .min((o1, o2) -> worstCaseContiguousMemorySizeCompare(o1, o2)); + if(sizeBasedUnlockedGPUObjects.isPresent()) { + evictOrClear(sizeBasedUnlockedGPUObjects.get(), opcode); + A = cudaMallocNoWarn(tmpA, size, null); + if(A == null) + LOG.warn("cudaMalloc failed after clearing/evicting based on size."); + if(DMLScript.STATISTICS) { + long totalTime = System.nanoTime() - t0; + GPUStatistics.cudaEvictTime.add(totalTime); + GPUStatistics.cudaEvictSizeTime.add(totalTime); + GPUStatistics.cudaEvictCount.increment(); + GPUStatistics.cudaEvictSizeCount.increment(); } - // --------------------------------------------------------------- - } - addMiscTime(opcode, GPUStatistics.cudaEvictTime, GPUStatistics.cudaEvictionCount, GPUInstruction.MISC_TIMER_EVICT, t0); - if(A == null) { - A = cudaMallocNoWarn(tmpA, size); // if the matrix is not allocated via eviction } - if(A == null) { - LOG.warn("cudaMalloc failed immediately after cudaMemGetInfo reported that memory of size " - + byteCountToDisplaySize(size) + " is available. 
" - + "This usually happens if there are external programs trying to grab on to memory in parallel or there is potential fragmentation."); + } + + // Step 6: Try eviction/clearing one-by-one based on the given policy without size restriction + if(A == null) { + long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; + // --------------------------------------------------------------- + // Evict unlocked GPU objects one-by-one and try malloc + List<GPUObject> unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream() + .filter(gpuObj -> !gpuObj.isLocked()).collect(Collectors.toList()); + Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size)); + while(A == null && unlockedGPUObjects.size() > 0) { + evictOrClear(unlockedGPUObjects.remove(unlockedGPUObjects.size()-1), opcode); + A = cudaMallocNoWarn(tmpA, size, null); + if(DMLScript.STATISTICS) + GPUStatistics.cudaEvictCount.increment(); } - else if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { - LOG.info("Malloc after eviction/clearing is successful."); + if(DMLScript.STATISTICS) { + long totalTime = System.nanoTime() - t0; + GPUStatistics.cudaEvictTime.add(totalTime); } } - // Step 6: Handle defragmentation + + // Step 7: Handle defragmentation if(A == null) { LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ..."); LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString()); matrixMemoryManager.clearAllUnlocked(opcode); LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString()); - A = cudaMallocNoWarn(tmpA, size); + A = cudaMallocNoWarn(tmpA, size, null); } if(A == null) { @@ -399,12 +331,29 @@ public class GPUMemoryManager { + toString()); } - t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; + long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; cudaMemset(A, 0, size); addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0); return A; } + private int worstCaseContiguousMemorySizeCompare(GPUObject o1, GPUObject o2) { + long ret = matrixMemoryManager.getWorstCaseContiguousMemorySize(o1) - matrixMemoryManager.getWorstCaseContiguousMemorySize(o2); + return ret < 0 ? -1 : (ret == 0 ? 0 : 1); + } + + private void evictOrClear(GPUObject gpuObj, String opcode) { + boolean eagerDelete = true; + if(gpuObj.isDirty()) { + // Eviction + gpuObj.copyFromDeviceToHost(opcode, true, eagerDelete); + } + else { + // Clear without copying + gpuObj.clearData(opcode, eagerDelete); + } + } + // --------------- Developer Utilities to debug potential memory leaks ------------------------ private void printPointers(Set<Pointer> pointers, StringBuilder sb) { HashMap<String, Integer> frequency = new HashMap<>(); @@ -657,8 +606,6 @@ public class GPUMemoryManager { return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR); } - private static Comparator<GPUObject> SIMPLE_COMPARATOR_SORT_BY_SIZE = (o1, o2) -> o1.getSizeOnDevice() < o2.getSizeOnDevice() ? 
-1 : 1; - private static class CustomPointer extends Pointer { public CustomPointer(Pointer p) { super(p); http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 6125d15..26cbd97 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -43,6 +43,7 @@ import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR; import org.apache.sysml.utils.GPUStatistics; import jcuda.Pointer; +import jcuda.Sizeof; import jcuda.jcusparse.cusparseDirection; import jcuda.jcusparse.cusparseHandle; import jcuda.jcusparse.cusparseMatDescr; @@ -99,6 +100,18 @@ public class GPUObject { */ protected MatrixObject mat = null; + float[] shadowPointer = null; + private static boolean _warnedAboutShadowBuffer = false; + public boolean canFitIntoShadowBuffer() { + int numBytes = toIntExact(mat.getNumRows()*mat.getNumColumns())*Sizeof.FLOAT; + boolean ret = DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + numBytes <= DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES; + if(!ret && !_warnedAboutShadowBuffer) { + LOG.warn("Shadow buffer is full, so using CP bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize."); + _warnedAboutShadowBuffer = true; + } + return ret; + } + // ---------------------------------------------------------------------- // Methods used to access, set and check jcudaDenseMatrixPtr @@ -108,6 +121,12 @@ public class GPUObject { * @return a pointer to the dense matrix */ public Pointer getDensePointer() { + if(jcudaDenseMatrixPtr == null && shadowPointer != null && getJcudaSparseMatrixPtr() == null) { + long numBytes = shadowPointer.length*LibMatrixCUDA.sizeOfDataType; + jcudaDenseMatrixPtr = gpuContext.allocate(null, numBytes); + cudaMemcpy(jcudaDenseMatrixPtr, Pointer.to(shadowPointer), numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice); + clearShadowPointer(); + } return jcudaDenseMatrixPtr; } @@ -125,9 +144,21 @@ public class GPUObject { */ public void clearDensePointer() { jcudaDenseMatrixPtr = null; + clearShadowPointer(); } /** + * Removes shadow pointer + */ + public void clearShadowPointer() { + if(shadowPointer != null) { + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES -= shadowPointer.length*Sizeof.FLOAT; + } + shadowPointer = null; + } + + + /** * Convenience method to directly set the dense matrix pointer on GPU * * @param densePtr dense pointer @@ -249,15 +280,7 @@ public class GPUObject { //cudaDeviceSynchronize(); int[] nnzC = { -1 }; - long t2 = 0; - if (DMLScript.STATISTICS) - t2 = System.nanoTime(); cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); - if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t2); - if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevCount.add(1); - if (nnzC[0] == -1) { throw new DMLRuntimeException( "cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector mulitply on the GPU"); @@ -299,7 +322,7 @@ public class GPUObject { } this.jcudaSparseMatrixPtr = sparseMatrixPtr; this.isSparse = true; - if (!isDensePointerNull()) { + if (!isDensePointerNull() && shadowPointer == null) { 
cudaFreeHelper(getDensePointer()); clearDensePointer(); } @@ -321,7 +344,7 @@ public class GPUObject { int rows = toIntExact(mat.getNumRows()); int cols = toIntExact(mat.getNumColumns()); - if (isDensePointerNull() || !isAllocated()) + if ((isDensePointerNull() && shadowPointer == null) || !isAllocated()) throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call"); denseRowMajorToColumnMajor(); @@ -454,7 +477,7 @@ public class GPUObject { } public boolean isAllocated() { - boolean eitherAllocated = (!isDensePointerNull() || getJcudaSparseMatrixPtr() != null); + boolean eitherAllocated = shadowPointer != null || !isDensePointerNull() || getJcudaSparseMatrixPtr() != null; return eitherAllocated; } @@ -916,7 +939,55 @@ public class GPUObject { if(LOG.isTraceEnabled()) { LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext()); } - if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) { + if(shadowPointer != null) { + if(isEviction) { + // If already copied to shadow buffer as part of previous eviction, do nothing. + return; + } + else { + // If already copied to shadow buffer as part of previous eviction and this is not an eviction (i.e. bufferpool call for subsequent CP/Spark instruction), + // then copy from shadow buffer to MatrixObject. + long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false); + tmp.allocateDenseBlock(); + double [] tmpArr = tmp.getDenseBlockValues(); + for(int i = 0; i < shadowPointer.length; i++) { + tmpArr[i] = shadowPointer[i]; + } + mat.acquireModify(tmp); + mat.release(); + clearShadowPointer(); + dirty = false; + if (DMLScript.STATISTICS) { + long totalTime = System.nanoTime() - start; + GPUStatistics.cudaFromShadowToHostTime.add(totalTime); + GPUStatistics.cudaFromShadowToHostCount.increment(); + // Part of dev -> host, not eviction + GPUStatistics.cudaFromDevTime.add(totalTime); + GPUStatistics.cudaFromDevCount.increment(); + } + return; + } + } + else if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && isEviction && eagerDelete && !isDensePointerNull() && canFitIntoShadowBuffer()) { + // Perform shadow buffering if (1) single precision, (2) during eviction, (3) for dense matrices, and (4) if the given matrix can fit into the shadow buffer. + long start = DMLScript.STATISTICS ? 
System.nanoTime() : 0; + int numElems = toIntExact(mat.getNumRows()*mat.getNumColumns()); + shadowPointer = new float[numElems]; + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES += shadowPointer.length*Sizeof.FLOAT; + cudaMemcpy(Pointer.to(shadowPointer), jcudaDenseMatrixPtr, numElems*LibMatrixCUDA.sizeOfDataType, jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost); + getGPUContext().cudaFreeHelper(instName, jcudaDenseMatrixPtr, eagerDelete); + jcudaDenseMatrixPtr = null; + if (DMLScript.STATISTICS) { + // Eviction time measure in malloc + long totalTime = System.nanoTime() - start; + GPUStatistics.cudaFromDevToShadowTime.add(totalTime); + GPUStatistics.cudaFromDevToShadowCount.increment(); + + } + return; + } + else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) { throw new DMLRuntimeException( "Cannot copy from device to host as JCuda dense/sparse pointer is not allocated"); } @@ -939,13 +1010,10 @@ public class GPUObject { tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false); tmp.allocateDenseBlock(); LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), - getDensePointer(), tmp.getDenseBlockValues(), instName, isEviction); - // int nnz = LibMatrixCUDA.computeNNZ(getGPUContext(), getJcudaDenseMatrixPtr(), toIntExact(mat.getNumRows()*mat.getNumColumns())); - // tmp.setNonZeros(nnz); + getDensePointer(), tmp.getDenseBlockValues(), instName, isEviction); if(eagerDelete) clearData(instName, true); - // tmp.recomputeNonZeros(); - tmp.setNonZeros(-1); + tmp.recomputeNonZeros(); } else { int rows = toIntExact(mat.getNumRows()); int cols = toIntExact(mat.getNumColumns()); @@ -962,10 +1030,11 @@ public class GPUObject { } mat.acquireModify(tmp); mat.release(); - if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start); - if (DMLScript.STATISTICS) { + if (DMLScript.STATISTICS && !isEviction) { + // Eviction time measure in malloc + long totalTime = System.nanoTime() - start; int count = !isDensePointerNull() ? 
1 : 3; + GPUStatistics.cudaFromDevTime.add(totalTime); GPUStatistics.cudaFromDevCount.add(count); } dirty = false; @@ -990,6 +1059,7 @@ public class GPUObject { getJcudaSparseMatrixPtr().deallocate(eager); } clearDensePointer(); + clearShadowPointer(); jcudaSparseMatrixPtr = null; resetReadWriteLock(); getGPUContext().getMemoryManager().removeGPUObject(this); http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/utils/GPUStatistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java index 18270cc..12abf21 100644 --- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java +++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java @@ -36,7 +36,7 @@ import org.apache.sysml.api.DMLScript; */ public class GPUStatistics { private static int iNoOfExecutedGPUInst = 0; - + public static long cudaInitTime = 0; public static long cudaLibrariesInitTime = 0; public static LongAdder cudaSparseToDenseTime = new LongAdder(); // time spent in converting sparse matrix block to dense @@ -47,12 +47,16 @@ public class GPUStatistics { public static LongAdder cudaSparseConversionCount = new LongAdder(); public static LongAdder cudaAllocTime = new LongAdder(); // time spent in allocating memory on the GPU + public static LongAdder cudaAllocSuccessTime = new LongAdder(); // time spent in successful allocation + public static LongAdder cudaAllocFailedTime = new LongAdder(); // time spent in unsuccessful allocation public static LongAdder cudaDeAllocTime = new LongAdder(); // time spent in deallocating memory on the GPU public static LongAdder cudaMemSet0Time = new LongAdder(); // time spent in setting memory to 0 on the GPU (part of reusing and for new allocates) public static LongAdder cudaToDevTime = new LongAdder(); // time spent in copying data from host (CPU) to device (GPU) memory public static LongAdder cudaFromDevTime = new LongAdder(); // time spent in copying data from device to host + public static LongAdder cudaFromShadowToHostTime = new LongAdder(); // time spent in copying data from shadow to host + public static LongAdder cudaFromDevToShadowTime = new LongAdder(); // time spent in copying data from device to shadow public static LongAdder cudaEvictTime = new LongAdder(); // time spent in eviction - public static LongAdder cudaEvictMallocTime = new LongAdder(); // time spent in eviction + public static LongAdder cudaEvictSizeTime = new LongAdder(); // time spent in eviction public static LongAdder cudaFloat2DoubleTime = new LongAdder(); // time spent in converting float to double during eviction public static LongAdder cudaDouble2FloatTime = new LongAdder(); // time spent in converting double to float during eviction public static LongAdder cudaEvictMemcpyTime = new LongAdder(); // time spent in cudaMemcpy kernel during eviction @@ -63,10 +67,15 @@ public class GPUStatistics { public static LongAdder cudaMemSet0Count = new LongAdder(); public static LongAdder cudaToDevCount = new LongAdder(); public static LongAdder cudaFromDevCount = new LongAdder(); - public static LongAdder cudaEvictionCount = new LongAdder(); + public static LongAdder cudaFromShadowToHostCount = new LongAdder(); + public static LongAdder cudaFromDevToShadowCount = new LongAdder(); + public static LongAdder cudaEvictCount = new LongAdder(); + public static LongAdder cudaEvictSizeCount = new LongAdder(); public static LongAdder 
cudaFloat2DoubleCount = new LongAdder(); public static LongAdder cudaDouble2FloatCount = new LongAdder(); - public static LongAdder cudaEvictionMallocCount = new LongAdder(); + public static LongAdder cudaAllocSuccessCount = new LongAdder(); + public static LongAdder cudaAllocFailedCount = new LongAdder(); + public static LongAdder cudaAllocReuseCount = new LongAdder(); // Per instruction miscellaneous timers. // Used to record events in a CP Heavy Hitter instruction and @@ -94,8 +103,10 @@ public class GPUStatistics { cudaMemSet0Count.reset(); cudaToDevTime.reset(); cudaFromDevTime.reset(); + cudaFromShadowToHostTime.reset(); + cudaFromDevToShadowTime.reset(); cudaEvictTime.reset(); - cudaEvictMallocTime.reset(); + cudaEvictSizeTime.reset(); cudaFloat2DoubleTime.reset(); cudaDouble2FloatTime.reset(); cudaFloat2DoubleCount.reset(); @@ -106,8 +117,15 @@ public class GPUStatistics { cudaDeAllocCount.reset(); cudaToDevCount.reset(); cudaFromDevCount.reset(); - cudaEvictionCount.reset(); - cudaEvictionMallocCount.reset(); + cudaFromShadowToHostCount.reset(); + cudaFromDevToShadowCount.reset(); + cudaEvictCount.reset(); + cudaEvictSizeCount.reset(); + cudaAllocSuccessTime.reset(); + cudaAllocFailedTime.reset(); + cudaAllocSuccessCount.reset(); + cudaAllocFailedCount.reset(); + cudaAllocReuseCount.reset(); resetMiscTimers(); } @@ -206,34 +224,45 @@ public class GPUStatistics { sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/" + String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n"); sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n"); - sb.append("GPU mem tx time (alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t" - + String.format("%.3f", cudaAllocTime.longValue()*1e-9) + "/" - + String.format("%.3f", cudaDeAllocTime.longValue()*1e-9) + "/" - + String.format("%.3f", cudaMemSet0Time.longValue()*1e-9) + "/" + // cudaSparseConversionCount + sb.append("GPU mem alloc time (alloc(success/fail) / dealloc / set0):\t" + + String.format("%.3f", cudaAllocTime.longValue()*1e-9) + "(" + + String.format("%.3f", cudaAllocSuccessTime.longValue()*1e-9) + "/" + + String.format("%.3f", cudaAllocFailedTime.longValue()*1e-9) + ") / " + + String.format("%.3f", cudaDeAllocTime.longValue()*1e-9) + " / " + + String.format("%.3f", cudaMemSet0Time.longValue()*1e-9) + " sec.\n"); + sb.append("GPU mem alloc count (alloc(success/fail/reuse) / dealloc / set0):\t" + + cudaAllocCount.longValue() + "(" + + cudaAllocSuccessCount.longValue() + "/" + + cudaAllocFailedCount.longValue() + "/" + + + cudaAllocReuseCount.longValue() +") / " + + cudaDeAllocCount.longValue() + " / " + + cudaMemSet0Count.longValue() + ".\n"); + sb.append("GPU mem tx time (toDev(d2f) / fromDev(f2d/s2h) / evict(d2s/size)):\t" + String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "(" - + String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9)+ ")/" + + String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9)+ ") / " + String.format("%.3f", cudaFromDevTime.longValue()*1e-9) + "(" - + String.format("%.3f", cudaFloat2DoubleTime.longValue()*1e-9) + ")/" + + String.format("%.3f", cudaFloat2DoubleTime.longValue()*1e-9) + "/" + + String.format("%.3f", cudaFromShadowToHostTime.longValue()*1e-9) + ") / " + String.format("%.3f", cudaEvictTime.longValue()*1e-9) + "(" - + String.format("%.3f", cudaEvictMallocTime.longValue()*1e-9) + ") sec.\n"); - sb.append("GPU mem tx count (alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t" - + cudaAllocCount.longValue() + "/" 
- + cudaDeAllocCount.longValue() + "/" - + cudaMemSet0Count.longValue() + "/" - + cudaSparseConversionCount.longValue() + "/" + + String.format("%.3f", cudaFromDevToShadowTime.longValue()*1e-9) + "/" + + String.format("%.3f", cudaEvictSizeTime.longValue()*1e-9) + ") sec.\n"); + sb.append("GPU mem tx count (toDev(d2f) / fromDev(f2d/s2h) / evict(d2s/size)):\t" + cudaToDevCount.longValue() + "(" - + cudaDouble2FloatCount.longValue() + ")/" + + cudaDouble2FloatCount.longValue() + ") / " + cudaFromDevCount.longValue() + "(" - + cudaFloat2DoubleCount.longValue() + ")/" - + cudaEvictionCount.longValue() + "(" - + cudaEvictionMallocCount.longValue() + ").\n"); - sb.append("GPU conversion time (sparseConv/sp2dense/dense2sp):\t" - + String.format("%.3f", cudaSparseConversionTime.longValue()*1e-9) + "/" - + String.format("%.3f", cudaSparseToDenseTime.longValue()*1e-9) + "/" + + cudaFloat2DoubleCount.longValue() + "/" + + cudaFromShadowToHostCount.longValue() + ") / " + + cudaEvictCount.longValue() + "(" + + cudaFromDevToShadowCount.longValue() + "/" + + + cudaEvictSizeCount.longValue() + ").\n"); + sb.append("GPU conversion time (sparseConv / sp2dense / dense2sp):\t" + + String.format("%.3f", cudaSparseConversionTime.longValue()*1e-9) + " / " + + String.format("%.3f", cudaSparseToDenseTime.longValue()*1e-9) + " / " + String.format("%.3f", cudaDenseToSparseTime.longValue()*1e-9) + " sec.\n"); - sb.append("GPU conversion count (sparseConv/sp2dense/dense2sp):\t" - + cudaSparseConversionCount.longValue() + "/" - + cudaSparseToDenseCount.longValue() + "/" + sb.append("GPU conversion count (sparseConv / sp2dense / dense2sp):\t" + + cudaSparseConversionCount.longValue() + " / " + + cudaSparseToDenseCount.longValue() + " / " + cudaDenseToSparseCount.longValue() + ".\n"); return sb.toString();

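----------------------------------------------------------------------
A minimal usage sketch for the new property, assuming a running SparkSession named spark and the standard Java MLContext entry points (MLContext, setGPU, setStatistics, setConfigProperty, ScriptFactory.dml); the property keys sysml.floating.point.precision and sysml.gpu.eviction.shadow.bufferSize are the ones defined in DMLConfig above. Since the shadow buffer is ignored for double precision, the precision is switched to single before the buffer size is set.

import org.apache.spark.sql.SparkSession;
import org.apache.sysml.api.mlcontext.MLContext;
import org.apache.sysml.api.mlcontext.Script;
import org.apache.sysml.api.mlcontext.ScriptFactory;

public class ShadowBufferExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("ShadowBufferExample").getOrCreate();
    MLContext ml = new MLContext(spark);
    ml.setGPU(true);        // run supported instructions on the GPU
    ml.setStatistics(true); // print the GPU memory statistics extended by this commit
    // The shadow buffer is ignored for double precision, so switch to single precision first.
    ml.setConfigProperty("sysml.floating.point.precision", "single");
    // Reserve 10% of driver memory for the shadow buffer (0.0, the default, disables it).
    ml.setConfigProperty("sysml.gpu.eviction.shadow.bufferSize", "0.1");
    Script script = ScriptFactory.dml("X = rand(rows=1000, cols=1000); print(sum(X %*% t(X)));");
    ml.execute(script);
    spark.stop();
  }
}

Whether the shadow buffer was actually used can be read off the d2s/s2h entries of the "GPU mem tx" lines in the statistics output (see the GPUStatistics changes above).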