systemml git commit: [SYSTEMML-445] Refactored the shadow buffer and added documentation for newly added features

niketanpansare Mon, 06 Aug 2018 09:45:31 -0700

Repository: systemml
Updated Branches:
  refs/heads/master 11f0291d7 -> a11933002



[SYSTEMML-445] Refactored the shadow buffer and added documentation for newly 
added features

- Refactored the shadow buffer logic from GPUObject to ShadowBuffer class for 
maintenance.
- Added an additional timer to measure shadow buffer time.
- Updated the gpu documentation


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a1193300
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a1193300
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a1193300

Branch: refs/heads/master
Commit: a11933002bfa8ba4d3e50b16f69c60bb36a270f6
Parents: 11f0291
Author: Niketan Pansare <[email protected]>
Authored: Mon Aug 6 09:40:08 2018 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Mon Aug 6 09:42:45 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   2 +-
 docs/gpu.md                                     |  28 +++-
 .../java/org/apache/sysml/api/DMLScript.java    |   2 +-
 .../java/org/apache/sysml/conf/DMLConfig.java   |   2 +-
 .../gpu/context/GPUMatrixMemoryManager.java     |   2 +-
 .../instructions/gpu/context/GPUObject.java     |  89 +++--------
 .../instructions/gpu/context/ShadowBuffer.java  | 146 +++++++++++++++++++
 .../org/apache/sysml/utils/GPUStatistics.java   |  14 +-
 8 files changed, 206 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template 
b/conf/SystemML-config.xml.template
index 3ce88c1..d773f79 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -97,7 +97,7 @@
     <sysml.floating.point.precision>double</sysml.floating.point.precision>
     
     <!-- the eviction policy for the GPU bufferpool. Supported values are lru, 
mru, lfu, min_evict, align_memory -->
-    <sysml.gpu.eviction.policy>align_memory</sysml.gpu.eviction.policy>
+    <sysml.gpu.eviction.policy>min_evict</sysml.gpu.eviction.policy>
     
    <!-- maximum wrap length for instruction and miscellaneous timer column of 
statistics -->
    <sysml.stats.maxWrapLength>30</sysml.stats.maxWrapLength>

http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/docs/gpu.md
----------------------------------------------------------------------
diff --git a/docs/gpu.md b/docs/gpu.md
index e9d7bca..5e13e60 100644
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -91,4 +91,30 @@ cd gcc-5.3.0
 num_cores=`grep -c ^processor /proc/cpuinfo`
 make -j $num_cores
 sudo make install
-```
\ No newline at end of file
+```
+
+# Advanced Configuration
+
+## Using single precision
+
+By default, SystemML uses double precision to store its matrices in the GPU 
memory.
+To use single precision, the user needs to set the configuration property 
'sysml.floating.point.precision'
+to 'single'. However, with exception of BLAS operations, SystemML always 
performs all CPU operations
+in double precision.
+
+## Training very deep network
+
+### Shadow buffer
+To train very deep network with double precision, no additional configurations 
are necessary.
+But to train very deep network with single precision, the user can speed up 
the eviction by 
+using shadow buffer. The fraction of the driver memory to be allocated to the 
shadow buffer can  
+be set by using the configuration property 
'sysml.gpu.eviction.shadow.bufferSize'.
+In the current version, the shadow buffer is currently not guarded by SystemML
+and can potentially lead to OOM if the network is deep as well as wide.
+
+### Unified memory allocator
+
+By default, SystemML uses CUDA's memory allocator and performs on-demand 
eviction
+using the eviction policy set by the configuration property 
'sysml.gpu.eviction.policy'.
+To use CUDA's unified memory allocator that performs page-level eviction 
instead,
+please set the configuration property 'sysml.gpu.memory.allocator' to 
'unified_memory'.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java 
b/src/main/java/org/apache/sysml/api/DMLScript.java
index d9413a8..9a6bb9b 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -121,7 +121,7 @@ public class DMLScript
        public static ExplainType       EXPLAIN             = 
DMLOptions.defaultOptions.explainType; // explain type
        public static String            DML_FILE_PATH_ANTLR_PARSER = 
DMLOptions.defaultOptions.filePath; // filename of dml/pydml script
        public static String            FLOATING_POINT_PRECISION = "double";    
                     // data type to use internally
-       public static EvictionPolicy    GPU_EVICTION_POLICY = 
EvictionPolicy.ALIGN_MEMORY;           // currently employed GPU eviction policy
+       public static EvictionPolicy    GPU_EVICTION_POLICY = 
EvictionPolicy.MIN_EVICT;                 // currently employed GPU eviction 
policy
        public static boolean           PRINT_GPU_MEMORY_INFO = false;          
                     // whether to print GPU memory-related information
        public static long              EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;   
                      // maximum number of bytes to use for shadow buffer
        public static long              EVICTION_SHADOW_BUFFER_CURR_BYTES = 0;  
                      // number of bytes to use for shadow buffer

http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java 
b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 1333075..5b30609 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -143,7 +143,7 @@ public class DMLConfig
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
                _defaultVals.put(GPU_MEMORY_ALLOCATOR,   "cuda");
                _defaultVals.put(AVAILABLE_GPUS,         "-1");
-               _defaultVals.put(GPU_EVICTION_POLICY,    "align_memory");
+               _defaultVals.put(GPU_EVICTION_POLICY,    "min_evict");
                _defaultVals.put(SYNCHRONIZE_GPU,        "false" );
                _defaultVals.put(CACHING_BUFFER_SIZE,    "0.15" );
                _defaultVals.put(EAGER_CUDA_FREE,        "false" );

http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
index cbb8d4e..457968b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
@@ -52,7 +52,7 @@ public class GPUMatrixMemoryManager {
        long getWorstCaseContiguousMemorySize(GPUObject gpuObj) {
                long ret = 0;
                if(!gpuObj.isDensePointerNull()) {
-                       if(gpuObj.shadowPointer == null)
+                       if(!gpuObj.shadowBuffer.isBuffered())
                                ret = 
gpuManager.allPointers.get(gpuObj.getDensePointer()).getSizeInBytes();
                        else
                                ret = 0; // evicted hence no contiguous memory 
on GPU

http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 26cbd97..a783138 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -43,7 +43,6 @@ import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
 import org.apache.sysml.utils.GPUStatistics;
 
 import jcuda.Pointer;
-import jcuda.Sizeof;
 import jcuda.jcusparse.cusparseDirection;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
@@ -63,7 +62,7 @@ public class GPUObject {
        /**
         * Pointer to the underlying dense matrix block on GPU
         */
-       private Pointer jcudaDenseMatrixPtr = null;
+       Pointer jcudaDenseMatrixPtr = null;
 
        /**
         * Pointer to the underlying sparse matrix block on GPU
@@ -98,19 +97,12 @@ public class GPUObject {
        /**
         * Enclosing {@link MatrixObject} instance
         */
-       protected MatrixObject mat = null;
+       MatrixObject mat = null;
        
-       float[] shadowPointer = null;
-       private static boolean _warnedAboutShadowBuffer = false;
-       public boolean canFitIntoShadowBuffer() {
-               int numBytes = 
toIntExact(mat.getNumRows()*mat.getNumColumns())*Sizeof.FLOAT;
-               boolean ret = DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + 
numBytes <= DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES;
-               if(!ret && !_warnedAboutShadowBuffer) {
-                       LOG.warn("Shadow buffer is full, so using CP bufferpool 
instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize.");
-                       _warnedAboutShadowBuffer = true;
-               }
-               return ret;
-       }
+       /**
+        * Shadow buffer instance
+        */
+       final ShadowBuffer shadowBuffer;
        
        // 
----------------------------------------------------------------------
        // Methods used to access, set and check jcudaDenseMatrixPtr
@@ -121,11 +113,8 @@ public class GPUObject {
         * @return a pointer to the dense matrix
         */
        public Pointer getDensePointer() {
-               if(jcudaDenseMatrixPtr == null && shadowPointer != null && 
getJcudaSparseMatrixPtr() == null) {
-                       long numBytes = 
shadowPointer.length*LibMatrixCUDA.sizeOfDataType;
-                       jcudaDenseMatrixPtr = gpuContext.allocate(null, 
numBytes);
-                       cudaMemcpy(jcudaDenseMatrixPtr, 
Pointer.to(shadowPointer), numBytes, 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
-                       clearShadowPointer();
+               if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && 
getJcudaSparseMatrixPtr() == null) {
+                       shadowBuffer.moveToDevice();
                }
                return jcudaDenseMatrixPtr;
        }
@@ -144,17 +133,7 @@ public class GPUObject {
         */
        public void clearDensePointer() {
                jcudaDenseMatrixPtr = null;
-               clearShadowPointer();
-       }
-       
-       /**
-        * Removes shadow pointer
-        */
-       public void clearShadowPointer() {
-               if(shadowPointer != null) {
-                       DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES -= 
shadowPointer.length*Sizeof.FLOAT;
-               }
-               shadowPointer = null;
+               shadowBuffer.clearShadowPointer();
        }
        
        
@@ -221,7 +200,7 @@ public class GPUObject {
                getGPUContext().cudaFreeHelper(null, toFree, 
DMLScript.EAGER_CUDA_FREE);
        }
 
-       private GPUContext getGPUContext() {
+       GPUContext getGPUContext() {
                return gpuContext;
        }
 
@@ -322,7 +301,7 @@ public class GPUObject {
                }
                this.jcudaSparseMatrixPtr = sparseMatrixPtr;
                this.isSparse = true;
-               if (!isDensePointerNull() && shadowPointer == null) {
+               if (!isDensePointerNull() && !shadowBuffer.isBuffered()) {
                        cudaFreeHelper(getDensePointer());
                        clearDensePointer();
                }
@@ -344,7 +323,7 @@ public class GPUObject {
                int rows = toIntExact(mat.getNumRows());
                int cols = toIntExact(mat.getNumColumns());
 
-               if ((isDensePointerNull() && shadowPointer == null) || 
!isAllocated())
+               if ((isDensePointerNull() && !shadowBuffer.isBuffered()) || 
!isAllocated())
                        throw new DMLRuntimeException("Expected allocated dense 
matrix before denseToSparse() call");
 
                denseRowMajorToColumnMajor();
@@ -462,6 +441,7 @@ public class GPUObject {
        GPUObject(GPUContext gCtx, MatrixObject mat2) {
                gpuContext = gCtx;
                this.mat = mat2;
+               this.shadowBuffer = new ShadowBuffer(this);
        }
 
        public boolean isSparse() {
@@ -477,7 +457,7 @@ public class GPUObject {
        }
 
        public boolean isAllocated() {
-               boolean eitherAllocated = shadowPointer != null || 
!isDensePointerNull() || getJcudaSparseMatrixPtr() != null;
+               boolean eitherAllocated = shadowBuffer.isBuffered() || 
!isDensePointerNull() || getJcudaSparseMatrixPtr() != null;
                return eitherAllocated;
        }
 
@@ -939,7 +919,7 @@ public class GPUObject {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", 
GPUContext=" + getGPUContext());
                }
-               if(shadowPointer != null) {
+               if(shadowBuffer.isBuffered()) {
                        if(isEviction) {
                                // If already copied to shadow buffer as part 
of previous eviction, do nothing.
                                return;
@@ -947,44 +927,13 @@ public class GPUObject {
                        else {
                                // If already copied to shadow buffer as part 
of previous eviction and this is not an eviction (i.e. bufferpool call for 
subsequent CP/Spark instruction),
                                // then copy from shadow buffer to MatrixObject.
-                               long start = DMLScript.STATISTICS ? 
System.nanoTime() : 0;
-                               MatrixBlock tmp = new 
MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), 
false);
-                               tmp.allocateDenseBlock();
-                               double [] tmpArr = tmp.getDenseBlockValues();
-                               for(int i = 0; i < shadowPointer.length; i++) {
-                                       tmpArr[i] = shadowPointer[i];
-                               }
-                               mat.acquireModify(tmp);
-                               mat.release();
-                               clearShadowPointer();
-                               dirty = false;
-                               if (DMLScript.STATISTICS) {
-                                       long totalTime = System.nanoTime() - 
start;
-                                       
GPUStatistics.cudaFromShadowToHostTime.add(totalTime);
-                                       
GPUStatistics.cudaFromShadowToHostCount.increment();
-                                       // Part of dev -> host, not eviction
-                                       
GPUStatistics.cudaFromDevTime.add(totalTime);
-                                       
GPUStatistics.cudaFromDevCount.increment();
-                               }
+                               shadowBuffer.moveToHost();
                                return;
                        }
                }
-               else if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && 
isEviction && eagerDelete && !isDensePointerNull() && canFitIntoShadowBuffer()) 
{
+               else if(shadowBuffer.isEligibleForBuffering(isEviction, 
eagerDelete)) {
                        // Perform shadow buffering if (1) single precision, 
(2) during eviction, (3) for dense matrices, and (4) if the given matrix can 
fit into the shadow buffer. 
-                       long start = DMLScript.STATISTICS ? System.nanoTime() : 
0;
-                       int numElems = 
toIntExact(mat.getNumRows()*mat.getNumColumns());
-                       shadowPointer = new float[numElems];
-                       DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES += 
shadowPointer.length*Sizeof.FLOAT;
-                       cudaMemcpy(Pointer.to(shadowPointer), 
jcudaDenseMatrixPtr, numElems*LibMatrixCUDA.sizeOfDataType, 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
-                       getGPUContext().cudaFreeHelper(instName, 
jcudaDenseMatrixPtr, eagerDelete);
-                       jcudaDenseMatrixPtr = null;
-                       if (DMLScript.STATISTICS) {
-                               // Eviction time measure in malloc
-                               long totalTime = System.nanoTime() - start;
-                               
GPUStatistics.cudaFromDevToShadowTime.add(totalTime);
-                               
GPUStatistics.cudaFromDevToShadowCount.increment();
-                               
-                       }
+                       shadowBuffer.moveFromDevice(instName);
                        return;
                }
                else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == 
null) {
@@ -1059,7 +1008,7 @@ public class GPUObject {
                        getJcudaSparseMatrixPtr().deallocate(eager);
                }
                clearDensePointer();
-               clearShadowPointer();
+               shadowBuffer.clearShadowPointer();
                jcudaSparseMatrixPtr = null;
                resetReadWriteLock();
                getGPUContext().getMemoryManager().removeGPUObject(this);

http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
new file mode 100644
index 0000000..27a6256
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import static jcuda.runtime.JCuda.cudaMemcpy;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+import jcuda.Sizeof;
+
+public class ShadowBuffer {
+       private static final Log LOG = 
LogFactory.getLog(ShadowBuffer.class.getName());
+       
+       GPUObject gpuObj;
+       float[] shadowPointer = null;
+       private static boolean _warnedAboutShadowBuffer = false;
+       
+       public ShadowBuffer(GPUObject gpuObj) {
+               this.gpuObj = gpuObj;
+       }
+       
+       /**
+        * Check if the gpu object is shadow buffered
+        * 
+        * @return true if the gpu object is shadow buffered
+        */
+       public boolean isBuffered() {
+               return shadowPointer != null;
+       }
+       
+       /**
+        * Move the data from GPU to shadow buffer 
+        * @param instName name of the instruction
+        */
+       public void moveFromDevice(String instName) {
+               long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
+               int numElems = 
GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
+               shadowPointer = new float[numElems];
+               DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES += 
shadowPointer.length*Sizeof.FLOAT;
+               cudaMemcpy(Pointer.to(shadowPointer), 
gpuObj.jcudaDenseMatrixPtr, numElems*LibMatrixCUDA.sizeOfDataType, 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
+               gpuObj.getGPUContext().cudaFreeHelper(instName, 
gpuObj.jcudaDenseMatrixPtr, true);
+               gpuObj.jcudaDenseMatrixPtr = null;
+               if (DMLScript.STATISTICS) {
+                       // Eviction time measure in malloc
+                       long totalTime = System.nanoTime() - start;
+                       GPUStatistics.cudaFromDevToShadowTime.add(totalTime);
+                       GPUStatistics.cudaFromDevToShadowCount.increment();
+                       
+               }
+       }
+       
+       /**
+        * Move the data from shadow buffer to Matrix object
+        */
+       public void moveToHost() {
+               long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
+               MatrixBlock tmp = new 
MatrixBlock(GPUObject.toIntExact(gpuObj.mat.getNumRows()), 
GPUObject.toIntExact(gpuObj.mat.getNumColumns()), false);
+               tmp.allocateDenseBlock();
+               double [] tmpArr = tmp.getDenseBlockValues();
+               for(int i = 0; i < shadowPointer.length; i++) {
+                       tmpArr[i] = shadowPointer[i];
+               }
+               gpuObj.mat.acquireModify(tmp);
+               gpuObj.mat.release();
+               clearShadowPointer();
+               gpuObj.dirty = false;
+               if (DMLScript.STATISTICS) {
+                       long totalTime = System.nanoTime() - start;
+                       GPUStatistics.cudaFromShadowToHostTime.add(totalTime);
+                       GPUStatistics.cudaFromShadowToHostCount.increment();
+                       // Part of dev -> host, not eviction
+                       GPUStatistics.cudaFromDevTime.add(totalTime);
+                       GPUStatistics.cudaFromDevCount.increment();
+               }
+       }
+       
+       /**
+        * Move the data from shadow buffer to GPU
+        */
+       public void moveToDevice() {
+               long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
+               long numBytes = 
shadowPointer.length*LibMatrixCUDA.sizeOfDataType;
+               gpuObj.jcudaDenseMatrixPtr = 
gpuObj.getGPUContext().allocate(null, numBytes);
+               cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, 
Pointer.to(shadowPointer), numBytes, 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
+               clearShadowPointer();
+               if (DMLScript.STATISTICS) {
+                       long totalTime = System.nanoTime() - start;
+                       GPUStatistics.cudaFromShadowToDevTime.add(totalTime);
+                       GPUStatistics.cudaFromShadowToDevCount.increment();
+               }
+       }
+       
+       /**
+        * Checks if the GPU object is eligible for shadow buffering
+        * 
+        * @param isEviction true if this method is called during eviction
+        * @param eagerDelete true if the data on device has to be eagerly 
deleted
+        * @return true if the given GPU object is eligible to be shadow 
buffered
+        */
+       public boolean isEligibleForBuffering(boolean isEviction, boolean 
eagerDelete) {
+               if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && 
isEviction && eagerDelete && !gpuObj.isDensePointerNull()) {
+                       int numBytes = 
GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns())*Sizeof.FLOAT;
+                       boolean ret = 
DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + numBytes <= 
DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES;
+                       if(!ret && !_warnedAboutShadowBuffer) {
+                               LOG.warn("Shadow buffer is full, so using CP 
bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize.");
+                               _warnedAboutShadowBuffer = true;
+                       }
+                       return ret;
+               }
+               else {
+                       return false;
+               }
+       }
+       
+       /**
+        * Removes the content from shadow buffer
+        */
+       public void clearShadowPointer() {
+               if(shadowPointer != null) {
+                       DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES -= 
shadowPointer.length*Sizeof.FLOAT;
+               }
+               shadowPointer = null;
+       }
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/a1193300/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java 
b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index 12abf21..fcbc4c4 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -54,6 +54,7 @@ public class GPUStatistics {
        public static LongAdder cudaToDevTime = new LongAdder();             // 
time spent in copying data from host (CPU) to device (GPU) memory
        public static LongAdder cudaFromDevTime = new LongAdder();           // 
time spent in copying data from device to host
        public static LongAdder cudaFromShadowToHostTime = new LongAdder();  // 
time spent in copying data from shadow to host
+       public static LongAdder cudaFromShadowToDevTime = new LongAdder();  // 
time spent in copying data from shadow to host
        public static LongAdder cudaFromDevToShadowTime = new LongAdder();  // 
time spent in copying data from device to shadow
        public static LongAdder cudaEvictTime = new LongAdder();                
 // time spent in eviction
        public static LongAdder cudaEvictSizeTime = new LongAdder();         // 
time spent in eviction
@@ -68,6 +69,7 @@ public class GPUStatistics {
        public static LongAdder cudaToDevCount = new LongAdder();
        public static LongAdder cudaFromDevCount = new LongAdder();
        public static LongAdder cudaFromShadowToHostCount = new LongAdder();
+       public static LongAdder cudaFromShadowToDevCount = new LongAdder();
        public static LongAdder cudaFromDevToShadowCount = new LongAdder();
        public static LongAdder cudaEvictCount = new LongAdder();
        public static LongAdder cudaEvictSizeCount = new LongAdder();
@@ -104,6 +106,7 @@ public class GPUStatistics {
                cudaToDevTime.reset();
                cudaFromDevTime.reset();
                cudaFromShadowToHostTime.reset();
+               cudaFromShadowToDevTime.reset();
                cudaFromDevToShadowTime.reset();
                cudaEvictTime.reset();
                cudaEvictSizeTime.reset();
@@ -118,6 +121,7 @@ public class GPUStatistics {
                cudaToDevCount.reset();
                cudaFromDevCount.reset();
                cudaFromShadowToHostCount.reset();
+               cudaFromShadowToDevCount.reset();
                cudaFromDevToShadowCount.reset();
                cudaEvictCount.reset();
                cudaEvictSizeCount.reset();
@@ -238,18 +242,20 @@ public class GPUStatistics {
                                + cudaAllocReuseCount.longValue() +") / "
                                + cudaDeAllocCount.longValue() + " / "
                                + cudaMemSet0Count.longValue() + ".\n");
-               sb.append("GPU mem tx time  (toDev(d2f) / fromDev(f2d/s2h) / 
evict(d2s/size)):\t"
+               sb.append("GPU mem tx time  (toDev(d2f/s2d) / fromDev(f2d/s2h) 
/ evict(d2s/size)):\t"
                                + String.format("%.3f", 
cudaToDevTime.longValue()*1e-9) + "("
-                               + String.format("%.3f", 
cudaDouble2FloatTime.longValue()*1e-9)+ ") / "
+                               + String.format("%.3f", 
cudaDouble2FloatTime.longValue()*1e-9)+ "/"
+                               + String.format("%.3f", 
cudaFromShadowToDevTime.longValue()*1e-9) + ") / "
                                + String.format("%.3f", 
cudaFromDevTime.longValue()*1e-9) + "("
                                + String.format("%.3f", 
cudaFloat2DoubleTime.longValue()*1e-9) + "/"
                                + String.format("%.3f", 
cudaFromShadowToHostTime.longValue()*1e-9) + ") / "
                                + String.format("%.3f", 
cudaEvictTime.longValue()*1e-9) + "("
                                + String.format("%.3f", 
cudaFromDevToShadowTime.longValue()*1e-9) + "/"
                                + String.format("%.3f", 
cudaEvictSizeTime.longValue()*1e-9) + ") sec.\n");
-               sb.append("GPU mem tx count (toDev(d2f) / fromDev(f2d/s2h) / 
evict(d2s/size)):\t"
+               sb.append("GPU mem tx count (toDev(d2f/s2d) / fromDev(f2d/s2h) 
/ evict(d2s/size)):\t"
                                + cudaToDevCount.longValue() + "("
-                               + cudaDouble2FloatCount.longValue() + ") / "
+                               + cudaDouble2FloatCount.longValue() + "/" 
+                               + cudaFromShadowToDevCount.longValue() + ") / "
                                + cudaFromDevCount.longValue() + "("
                                + cudaFloat2DoubleCount.longValue() + "/"
                                + cudaFromShadowToHostCount.longValue() + ") / "

systemml git commit: [SYSTEMML-445] Refactored the shadow buffer and added documentation for newly added features

Reply via email to