Repository: systemml Updated Branches: refs/heads/master 11f0291d7 -> a11933002
[SYSTEMML-445] Refactored the shadow buffer and added documentation for newly added features - Refactored the shadow buffer logic from GPUObject to ShadowBuffer class for maintenance. - Added an additional timer to measure shadow buffer time. - Updated the gpu documentation Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a1193300 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a1193300 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a1193300 Branch: refs/heads/master Commit: a11933002bfa8ba4d3e50b16f69c60bb36a270f6 Parents: 11f0291 Author: Niketan Pansare <[email protected]> Authored: Mon Aug 6 09:40:08 2018 -0700 Committer: Niketan Pansare <[email protected]> Committed: Mon Aug 6 09:42:45 2018 -0700 ---------------------------------------------------------------------- conf/SystemML-config.xml.template | 2 +- docs/gpu.md | 28 +++- .../java/org/apache/sysml/api/DMLScript.java | 2 +- .../java/org/apache/sysml/conf/DMLConfig.java | 2 +- .../gpu/context/GPUMatrixMemoryManager.java | 2 +- .../instructions/gpu/context/GPUObject.java | 89 +++-------- .../instructions/gpu/context/ShadowBuffer.java | 146 +++++++++++++++++++ .../org/apache/sysml/utils/GPUStatistics.java | 14 +- 8 files changed, 206 insertions(+), 79 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/conf/SystemML-config.xml.template ---------------------------------------------------------------------- diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template index 3ce88c1..d773f79 100644 --- a/conf/SystemML-config.xml.template +++ b/conf/SystemML-config.xml.template @@ -97,7 +97,7 @@ <sysml.floating.point.precision>double</sysml.floating.point.precision> <!-- the eviction policy for the GPU bufferpool. Supported values are lru, mru, lfu, min_evict, align_memory --> - <sysml.gpu.eviction.policy>align_memory</sysml.gpu.eviction.policy> + <sysml.gpu.eviction.policy>min_evict</sysml.gpu.eviction.policy> <!-- maximum wrap length for instruction and miscellaneous timer column of statistics --> <sysml.stats.maxWrapLength>30</sysml.stats.maxWrapLength> http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/docs/gpu.md ---------------------------------------------------------------------- diff --git a/docs/gpu.md b/docs/gpu.md index e9d7bca..5e13e60 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -91,4 +91,30 @@ cd gcc-5.3.0 num_cores=`grep -c ^processor /proc/cpuinfo` make -j $num_cores sudo make install -``` \ No newline at end of file +``` + +# Advanced Configuration + +## Using single precision + +By default, SystemML uses double precision to store its matrices in the GPU memory. +To use single precision, the user needs to set the configuration property 'sysml.floating.point.precision' +to 'single'. However, with exception of BLAS operations, SystemML always performs all CPU operations +in double precision. + +## Training very deep network + +### Shadow buffer +To train very deep network with double precision, no additional configurations are necessary. +But to train very deep network with single precision, the user can speed up the eviction by +using shadow buffer. The fraction of the driver memory to be allocated to the shadow buffer can +be set by using the configuration property 'sysml.gpu.eviction.shadow.bufferSize'. +In the current version, the shadow buffer is currently not guarded by SystemML +and can potentially lead to OOM if the network is deep as well as wide. + +### Unified memory allocator + +By default, SystemML uses CUDA's memory allocator and performs on-demand eviction +using the eviction policy set by the configuration property 'sysml.gpu.eviction.policy'. +To use CUDA's unified memory allocator that performs page-level eviction instead, +please set the configuration property 'sysml.gpu.memory.allocator' to 'unified_memory'. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/api/DMLScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java index d9413a8..9a6bb9b 100644 --- a/src/main/java/org/apache/sysml/api/DMLScript.java +++ b/src/main/java/org/apache/sysml/api/DMLScript.java @@ -121,7 +121,7 @@ public class DMLScript public static ExplainType EXPLAIN = DMLOptions.defaultOptions.explainType; // explain type public static String DML_FILE_PATH_ANTLR_PARSER = DMLOptions.defaultOptions.filePath; // filename of dml/pydml script public static String FLOATING_POINT_PRECISION = "double"; // data type to use internally - public static EvictionPolicy GPU_EVICTION_POLICY = EvictionPolicy.ALIGN_MEMORY; // currently employed GPU eviction policy + public static EvictionPolicy GPU_EVICTION_POLICY = EvictionPolicy.MIN_EVICT; // currently employed GPU eviction policy public static boolean PRINT_GPU_MEMORY_INFO = false; // whether to print GPU memory-related information public static long EVICTION_SHADOW_BUFFER_MAX_BYTES = 0; // maximum number of bytes to use for shadow buffer public static long EVICTION_SHADOW_BUFFER_CURR_BYTES = 0; // number of bytes to use for shadow buffer http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/conf/DMLConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index 1333075..5b30609 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -143,7 +143,7 @@ public class DMLConfig _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); _defaultVals.put(GPU_MEMORY_ALLOCATOR, "cuda"); _defaultVals.put(AVAILABLE_GPUS, "-1"); - _defaultVals.put(GPU_EVICTION_POLICY, "align_memory"); + _defaultVals.put(GPU_EVICTION_POLICY, "min_evict"); _defaultVals.put(SYNCHRONIZE_GPU, "false" ); _defaultVals.put(CACHING_BUFFER_SIZE, "0.15" ); _defaultVals.put(EAGER_CUDA_FREE, "false" ); http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java index cbb8d4e..457968b 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java @@ -52,7 +52,7 @@ public class GPUMatrixMemoryManager { long getWorstCaseContiguousMemorySize(GPUObject gpuObj) { long ret = 0; if(!gpuObj.isDensePointerNull()) { - if(gpuObj.shadowPointer == null) + if(!gpuObj.shadowBuffer.isBuffered()) ret = gpuManager.allPointers.get(gpuObj.getDensePointer()).getSizeInBytes(); else ret = 0; // evicted hence no contiguous memory on GPU http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 26cbd97..a783138 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -43,7 +43,6 @@ import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR; import org.apache.sysml.utils.GPUStatistics; import jcuda.Pointer; -import jcuda.Sizeof; import jcuda.jcusparse.cusparseDirection; import jcuda.jcusparse.cusparseHandle; import jcuda.jcusparse.cusparseMatDescr; @@ -63,7 +62,7 @@ public class GPUObject { /** * Pointer to the underlying dense matrix block on GPU */ - private Pointer jcudaDenseMatrixPtr = null; + Pointer jcudaDenseMatrixPtr = null; /** * Pointer to the underlying sparse matrix block on GPU @@ -98,19 +97,12 @@ public class GPUObject { /** * Enclosing {@link MatrixObject} instance */ - protected MatrixObject mat = null; + MatrixObject mat = null; - float[] shadowPointer = null; - private static boolean _warnedAboutShadowBuffer = false; - public boolean canFitIntoShadowBuffer() { - int numBytes = toIntExact(mat.getNumRows()*mat.getNumColumns())*Sizeof.FLOAT; - boolean ret = DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + numBytes <= DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES; - if(!ret && !_warnedAboutShadowBuffer) { - LOG.warn("Shadow buffer is full, so using CP bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize."); - _warnedAboutShadowBuffer = true; - } - return ret; - } + /** + * Shadow buffer instance + */ + final ShadowBuffer shadowBuffer; // ---------------------------------------------------------------------- // Methods used to access, set and check jcudaDenseMatrixPtr @@ -121,11 +113,8 @@ public class GPUObject { * @return a pointer to the dense matrix */ public Pointer getDensePointer() { - if(jcudaDenseMatrixPtr == null && shadowPointer != null && getJcudaSparseMatrixPtr() == null) { - long numBytes = shadowPointer.length*LibMatrixCUDA.sizeOfDataType; - jcudaDenseMatrixPtr = gpuContext.allocate(null, numBytes); - cudaMemcpy(jcudaDenseMatrixPtr, Pointer.to(shadowPointer), numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice); - clearShadowPointer(); + if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && getJcudaSparseMatrixPtr() == null) { + shadowBuffer.moveToDevice(); } return jcudaDenseMatrixPtr; } @@ -144,17 +133,7 @@ public class GPUObject { */ public void clearDensePointer() { jcudaDenseMatrixPtr = null; - clearShadowPointer(); - } - - /** - * Removes shadow pointer - */ - public void clearShadowPointer() { - if(shadowPointer != null) { - DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES -= shadowPointer.length*Sizeof.FLOAT; - } - shadowPointer = null; + shadowBuffer.clearShadowPointer(); } @@ -221,7 +200,7 @@ public class GPUObject { getGPUContext().cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE); } - private GPUContext getGPUContext() { + GPUContext getGPUContext() { return gpuContext; } @@ -322,7 +301,7 @@ public class GPUObject { } this.jcudaSparseMatrixPtr = sparseMatrixPtr; this.isSparse = true; - if (!isDensePointerNull() && shadowPointer == null) { + if (!isDensePointerNull() && !shadowBuffer.isBuffered()) { cudaFreeHelper(getDensePointer()); clearDensePointer(); } @@ -344,7 +323,7 @@ public class GPUObject { int rows = toIntExact(mat.getNumRows()); int cols = toIntExact(mat.getNumColumns()); - if ((isDensePointerNull() && shadowPointer == null) || !isAllocated()) + if ((isDensePointerNull() && !shadowBuffer.isBuffered()) || !isAllocated()) throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call"); denseRowMajorToColumnMajor(); @@ -462,6 +441,7 @@ public class GPUObject { GPUObject(GPUContext gCtx, MatrixObject mat2) { gpuContext = gCtx; this.mat = mat2; + this.shadowBuffer = new ShadowBuffer(this); } public boolean isSparse() { @@ -477,7 +457,7 @@ public class GPUObject { } public boolean isAllocated() { - boolean eitherAllocated = shadowPointer != null || !isDensePointerNull() || getJcudaSparseMatrixPtr() != null; + boolean eitherAllocated = shadowBuffer.isBuffered() || !isDensePointerNull() || getJcudaSparseMatrixPtr() != null; return eitherAllocated; } @@ -939,7 +919,7 @@ public class GPUObject { if(LOG.isTraceEnabled()) { LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext()); } - if(shadowPointer != null) { + if(shadowBuffer.isBuffered()) { if(isEviction) { // If already copied to shadow buffer as part of previous eviction, do nothing. return; @@ -947,44 +927,13 @@ public class GPUObject { else { // If already copied to shadow buffer as part of previous eviction and this is not an eviction (i.e. bufferpool call for subsequent CP/Spark instruction), // then copy from shadow buffer to MatrixObject. - long start = DMLScript.STATISTICS ? System.nanoTime() : 0; - MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false); - tmp.allocateDenseBlock(); - double [] tmpArr = tmp.getDenseBlockValues(); - for(int i = 0; i < shadowPointer.length; i++) { - tmpArr[i] = shadowPointer[i]; - } - mat.acquireModify(tmp); - mat.release(); - clearShadowPointer(); - dirty = false; - if (DMLScript.STATISTICS) { - long totalTime = System.nanoTime() - start; - GPUStatistics.cudaFromShadowToHostTime.add(totalTime); - GPUStatistics.cudaFromShadowToHostCount.increment(); - // Part of dev -> host, not eviction - GPUStatistics.cudaFromDevTime.add(totalTime); - GPUStatistics.cudaFromDevCount.increment(); - } + shadowBuffer.moveToHost(); return; } } - else if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && isEviction && eagerDelete && !isDensePointerNull() && canFitIntoShadowBuffer()) { + else if(shadowBuffer.isEligibleForBuffering(isEviction, eagerDelete)) { // Perform shadow buffering if (1) single precision, (2) during eviction, (3) for dense matrices, and (4) if the given matrix can fit into the shadow buffer. - long start = DMLScript.STATISTICS ? System.nanoTime() : 0; - int numElems = toIntExact(mat.getNumRows()*mat.getNumColumns()); - shadowPointer = new float[numElems]; - DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES += shadowPointer.length*Sizeof.FLOAT; - cudaMemcpy(Pointer.to(shadowPointer), jcudaDenseMatrixPtr, numElems*LibMatrixCUDA.sizeOfDataType, jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost); - getGPUContext().cudaFreeHelper(instName, jcudaDenseMatrixPtr, eagerDelete); - jcudaDenseMatrixPtr = null; - if (DMLScript.STATISTICS) { - // Eviction time measure in malloc - long totalTime = System.nanoTime() - start; - GPUStatistics.cudaFromDevToShadowTime.add(totalTime); - GPUStatistics.cudaFromDevToShadowCount.increment(); - - } + shadowBuffer.moveFromDevice(instName); return; } else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) { @@ -1059,7 +1008,7 @@ public class GPUObject { getJcudaSparseMatrixPtr().deallocate(eager); } clearDensePointer(); - clearShadowPointer(); + shadowBuffer.clearShadowPointer(); jcudaSparseMatrixPtr = null; resetReadWriteLock(); getGPUContext().getMemoryManager().removeGPUObject(this); http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java new file mode 100644 index 0000000..27a6256 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.instructions.gpu.context; + +import static jcuda.runtime.JCuda.cudaMemcpy; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.utils.GPUStatistics; + +import jcuda.Pointer; +import jcuda.Sizeof; + +public class ShadowBuffer { + private static final Log LOG = LogFactory.getLog(ShadowBuffer.class.getName()); + + GPUObject gpuObj; + float[] shadowPointer = null; + private static boolean _warnedAboutShadowBuffer = false; + + public ShadowBuffer(GPUObject gpuObj) { + this.gpuObj = gpuObj; + } + + /** + * Check if the gpu object is shadow buffered + * + * @return true if the gpu object is shadow buffered + */ + public boolean isBuffered() { + return shadowPointer != null; + } + + /** + * Move the data from GPU to shadow buffer + * @param instName name of the instruction + */ + public void moveFromDevice(String instName) { + long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + int numElems = GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns()); + shadowPointer = new float[numElems]; + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES += shadowPointer.length*Sizeof.FLOAT; + cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, numElems*LibMatrixCUDA.sizeOfDataType, jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost); + gpuObj.getGPUContext().cudaFreeHelper(instName, gpuObj.jcudaDenseMatrixPtr, true); + gpuObj.jcudaDenseMatrixPtr = null; + if (DMLScript.STATISTICS) { + // Eviction time measure in malloc + long totalTime = System.nanoTime() - start; + GPUStatistics.cudaFromDevToShadowTime.add(totalTime); + GPUStatistics.cudaFromDevToShadowCount.increment(); + + } + } + + /** + * Move the data from shadow buffer to Matrix object + */ + public void moveToHost() { + long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + MatrixBlock tmp = new MatrixBlock(GPUObject.toIntExact(gpuObj.mat.getNumRows()), GPUObject.toIntExact(gpuObj.mat.getNumColumns()), false); + tmp.allocateDenseBlock(); + double [] tmpArr = tmp.getDenseBlockValues(); + for(int i = 0; i < shadowPointer.length; i++) { + tmpArr[i] = shadowPointer[i]; + } + gpuObj.mat.acquireModify(tmp); + gpuObj.mat.release(); + clearShadowPointer(); + gpuObj.dirty = false; + if (DMLScript.STATISTICS) { + long totalTime = System.nanoTime() - start; + GPUStatistics.cudaFromShadowToHostTime.add(totalTime); + GPUStatistics.cudaFromShadowToHostCount.increment(); + // Part of dev -> host, not eviction + GPUStatistics.cudaFromDevTime.add(totalTime); + GPUStatistics.cudaFromDevCount.increment(); + } + } + + /** + * Move the data from shadow buffer to GPU + */ + public void moveToDevice() { + long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + long numBytes = shadowPointer.length*LibMatrixCUDA.sizeOfDataType; + gpuObj.jcudaDenseMatrixPtr = gpuObj.getGPUContext().allocate(null, numBytes); + cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, Pointer.to(shadowPointer), numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice); + clearShadowPointer(); + if (DMLScript.STATISTICS) { + long totalTime = System.nanoTime() - start; + GPUStatistics.cudaFromShadowToDevTime.add(totalTime); + GPUStatistics.cudaFromShadowToDevCount.increment(); + } + } + + /** + * Checks if the GPU object is eligible for shadow buffering + * + * @param isEviction true if this method is called during eviction + * @param eagerDelete true if the data on device has to be eagerly deleted + * @return true if the given GPU object is eligible to be shadow buffered + */ + public boolean isEligibleForBuffering(boolean isEviction, boolean eagerDelete) { + if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && isEviction && eagerDelete && !gpuObj.isDensePointerNull()) { + int numBytes = GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns())*Sizeof.FLOAT; + boolean ret = DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + numBytes <= DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES; + if(!ret && !_warnedAboutShadowBuffer) { + LOG.warn("Shadow buffer is full, so using CP bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize."); + _warnedAboutShadowBuffer = true; + } + return ret; + } + else { + return false; + } + } + + /** + * Removes the content from shadow buffer + */ + public void clearShadowPointer() { + if(shadowPointer != null) { + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES -= shadowPointer.length*Sizeof.FLOAT; + } + shadowPointer = null; + } +} http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/utils/GPUStatistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java index 12abf21..fcbc4c4 100644 --- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java +++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java @@ -54,6 +54,7 @@ public class GPUStatistics { public static LongAdder cudaToDevTime = new LongAdder(); // time spent in copying data from host (CPU) to device (GPU) memory public static LongAdder cudaFromDevTime = new LongAdder(); // time spent in copying data from device to host public static LongAdder cudaFromShadowToHostTime = new LongAdder(); // time spent in copying data from shadow to host + public static LongAdder cudaFromShadowToDevTime = new LongAdder(); // time spent in copying data from shadow to host public static LongAdder cudaFromDevToShadowTime = new LongAdder(); // time spent in copying data from device to shadow public static LongAdder cudaEvictTime = new LongAdder(); // time spent in eviction public static LongAdder cudaEvictSizeTime = new LongAdder(); // time spent in eviction @@ -68,6 +69,7 @@ public class GPUStatistics { public static LongAdder cudaToDevCount = new LongAdder(); public static LongAdder cudaFromDevCount = new LongAdder(); public static LongAdder cudaFromShadowToHostCount = new LongAdder(); + public static LongAdder cudaFromShadowToDevCount = new LongAdder(); public static LongAdder cudaFromDevToShadowCount = new LongAdder(); public static LongAdder cudaEvictCount = new LongAdder(); public static LongAdder cudaEvictSizeCount = new LongAdder(); @@ -104,6 +106,7 @@ public class GPUStatistics { cudaToDevTime.reset(); cudaFromDevTime.reset(); cudaFromShadowToHostTime.reset(); + cudaFromShadowToDevTime.reset(); cudaFromDevToShadowTime.reset(); cudaEvictTime.reset(); cudaEvictSizeTime.reset(); @@ -118,6 +121,7 @@ public class GPUStatistics { cudaToDevCount.reset(); cudaFromDevCount.reset(); cudaFromShadowToHostCount.reset(); + cudaFromShadowToDevCount.reset(); cudaFromDevToShadowCount.reset(); cudaEvictCount.reset(); cudaEvictSizeCount.reset(); @@ -238,18 +242,20 @@ public class GPUStatistics { + cudaAllocReuseCount.longValue() +") / " + cudaDeAllocCount.longValue() + " / " + cudaMemSet0Count.longValue() + ".\n"); - sb.append("GPU mem tx time (toDev(d2f) / fromDev(f2d/s2h) / evict(d2s/size)):\t" + sb.append("GPU mem tx time (toDev(d2f/s2d) / fromDev(f2d/s2h) / evict(d2s/size)):\t" + String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "(" - + String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9)+ ") / " + + String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9)+ "/" + + String.format("%.3f", cudaFromShadowToDevTime.longValue()*1e-9) + ") / " + String.format("%.3f", cudaFromDevTime.longValue()*1e-9) + "(" + String.format("%.3f", cudaFloat2DoubleTime.longValue()*1e-9) + "/" + String.format("%.3f", cudaFromShadowToHostTime.longValue()*1e-9) + ") / " + String.format("%.3f", cudaEvictTime.longValue()*1e-9) + "(" + String.format("%.3f", cudaFromDevToShadowTime.longValue()*1e-9) + "/" + String.format("%.3f", cudaEvictSizeTime.longValue()*1e-9) + ") sec.\n"); - sb.append("GPU mem tx count (toDev(d2f) / fromDev(f2d/s2h) / evict(d2s/size)):\t" + sb.append("GPU mem tx count (toDev(d2f/s2d) / fromDev(f2d/s2h) / evict(d2s/size)):\t" + cudaToDevCount.longValue() + "(" - + cudaDouble2FloatCount.longValue() + ") / " + + cudaDouble2FloatCount.longValue() + "/" + + cudaFromShadowToDevCount.longValue() + ") / " + cudaFromDevCount.longValue() + "(" + cudaFloat2DoubleCount.longValue() + "/" + cudaFromShadowToHostCount.longValue() + ") / "
