Repository: systemml
Updated Branches:
  refs/heads/master aed66df13 -> bdf27084b


[SYSTEMML-445] Bugfix and GPU shadow buffer for single precision

- Added shadow buffer logic that uses CPU memory as temporary storage for
evicted matrices. This avoids the unnecessary overhead of float-to-double
conversion, nnz computation, potential serialization, etc. By default, this is
turned off and can be enabled via the configuration property
sysml.gpu.eviction.shadow.bufferSize (see the configuration sketch below).
- Bugfix for the case where data generated by an MLContext with GPU disabled
is consumed by an MLContext with GPU enabled. Without this bugfix, we throw a
NullPointerException because the GPU pointer data structure of the data
created by the first MLContext is never initialized.
- Added additional GPU memory-related statistics.
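
A minimal configuration sketch for enabling the shadow buffer programmatically.
Only the two property names come from this patch; the surrounding MLContext
calls (setGPU, setStatistics, setConfigProperty, execute) are assumed to behave
as in current SystemML releases, and the 0.1 fraction is just an example:

    import org.apache.spark.sql.SparkSession;
    import org.apache.sysml.api.mlcontext.MLContext;

    public class ShadowBufferConfigExample {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                .appName("ShadowBufferConfigExample").getOrCreate();
            MLContext ml = new MLContext(spark);
            ml.setGPU(true);
            ml.setStatistics(true);
            // Shadow buffering only applies to single precision.
            ml.setConfigProperty("sysml.floating.point.precision", "single");
            // Reserve up to 10% of the driver memory for the GPU shadow buffer.
            ml.setConfigProperty("sysml.gpu.eviction.shadow.bufferSize", "0.1");
            // ... build a Script and call ml.execute(script) as usual ...
        }
    }

The same two properties can instead be set in SystemML-config.xml (see the
template change below).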


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/bdf27084
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/bdf27084
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/bdf27084

Branch: refs/heads/master
Commit: bdf27084bd115c69a00a521a77c66a62d8b657b7
Parents: aed66df
Author: Niketan Pansare <[email protected]>
Authored: Thu Aug 2 11:45:55 2018 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Thu Aug 2 11:51:39 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   4 +
 .../java/org/apache/sysml/api/DMLScript.java    |   2 +
 .../apache/sysml/api/ScriptExecutorUtils.java   |  21 ++
 .../java/org/apache/sysml/conf/DMLConfig.java   |   4 +-
 .../controlprogram/caching/CacheableData.java   |   9 +-
 .../context/GPULazyCudaFreeMemoryManager.java   |  32 ++-
 .../gpu/context/GPUMatrixMemoryManager.java     |  44 +---
 .../gpu/context/GPUMemoryManager.java           | 231 +++++++------------
 .../instructions/gpu/context/GPUObject.java     | 110 +++++++--
 .../org/apache/sysml/utils/GPUStatistics.java   |  87 ++++---
 10 files changed, 307 insertions(+), 237 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template 
b/conf/SystemML-config.xml.template
index 05d6a1a..033aadb 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -104,4 +104,8 @@
    
   <!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) -->
   <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
+   
+   <!-- Advanced optimization: fraction of driver memory to use for the GPU shadow buffer. This optimization is ignored for double precision. 
+   By default, it is disabled (hence set to 0.0). If you intend to train a network larger than the GPU memory, consider using single precision and setting this to 0.1 -->
+   <sysml.gpu.eviction.shadow.bufferSize>0.0</sysml.gpu.eviction.shadow.bufferSize>
 </root>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java 
b/src/main/java/org/apache/sysml/api/DMLScript.java
index 50a23aa..bfc9da5 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -122,6 +122,8 @@ public class DMLScript
        public static String            FLOATING_POINT_PRECISION = "double";                        // data type to use internally
        public static EvictionPolicy    GPU_EVICTION_POLICY = EvictionPolicy.ALIGN_MEMORY;          // currently employed GPU eviction policy
        public static boolean           PRINT_GPU_MEMORY_INFO = false;                               // whether to print GPU memory-related information
+       public static long              EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;                        // maximum number of bytes to use for the shadow buffer
+       public static long              EVICTION_SHADOW_BUFFER_CURR_BYTES = 0;                       // number of bytes currently used by the shadow buffer
 
        /**
         * Global variable indicating the script type (DML or PYDML). Can be 
used

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java 
b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index 0b4c7ab..13d0c78 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -31,6 +31,7 @@ import org.apache.sysml.runtime.controlprogram.Program;
 import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import 
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.instructions.cp.Data;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
@@ -80,6 +81,8 @@ public class ScriptExecutorUtils {
                DMLScript.PRINT_GPU_MEMORY_INFO = 
dmlconf.getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
                DMLScript.SYNCHRONIZE_GPU = 
dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU);
                CacheableData.CACHING_BUFFER_SIZE = 
dmlconf.getDoubleValue(DMLConfig.CACHING_BUFFER_SIZE);
+               if(CacheableData.CACHING_BUFFER_SIZE < 0 || 
CacheableData.CACHING_BUFFER_SIZE > 1) 
+                       throw new RuntimeException("Incorrect value (" + 
CacheableData.CACHING_BUFFER_SIZE + ") for the configuration " + 
DMLConfig.CACHING_BUFFER_SIZE);
                DMLScript.EAGER_CUDA_FREE = 
dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE);
                DMLScript.STATISTICS_MAX_WRAP_LEN = 
dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN);          
                
NativeHelper.initialize(dmlconf.getTextValue(DMLConfig.NATIVE_BLAS_DIR), 
dmlconf.getTextValue(DMLConfig.NATIVE_BLAS).trim());
@@ -87,7 +90,25 @@ public class ScriptExecutorUtils {
                if(DMLScript.USE_ACCELERATOR) {
                        DMLScript.FLOATING_POINT_PRECISION = 
dmlconf.getTextValue(DMLConfig.FLOATING_POINT_PRECISION);
                        
org.apache.sysml.runtime.matrix.data.LibMatrixCUDA.resetFloatingPointPrecision();
+                       if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) 
{
+                               DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;
+                       }
+                       else {
+                               double shadowBufferSize = 
dmlconf.getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
+                               if(shadowBufferSize < 0 || shadowBufferSize > 
1) 
+                                       throw new RuntimeException("Incorrect 
value (" + shadowBufferSize + ") for the configuration " + 
DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
+                               DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = 
(long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
+                               if(DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES > 
0 && 
+                                               
DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES > 
DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES) {
+                                       // This will be printed in a very rare situation, when either:
+                                       // 1. there is a memory leak which leads to a non-cleared shadow buffer, OR
+                                       // 2. MLContext registers a bunch of outputs that are all part of the shadow buffer.
+                                       System.out.println("WARN: Cannot use the shadow buffer due to potentially cached GPU objects. Current shadow buffer size (in bytes):" 
+                                               + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + " > Max shadow buffer size (in bytes):" + DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES);
+                               }
+                       }
                }
+               
 
                boolean exceptionThrown = false;
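
For intuition, the cap computed above is simply a fraction of the driver's
maximum heap. A tiny worked example with hypothetical numbers (a 10 GB driver
heap as reported by InfrastructureAnalyzer.getLocalMaxMemory() and the
suggested 0.1 fraction); the class and variable names are illustrative only:

    public class ShadowBufferSizing {
        public static void main(String[] args) {
            long localMaxMemory = 10L * 1024 * 1024 * 1024;   // assume a 10 GB driver heap
            double shadowBufferFraction = 0.1;                // configured fraction
            long maxShadowBufferBytes = (long) (localMaxMemory * shadowBufferFraction);
            System.out.println(maxShadowBufferBytes);         // 1073741824, i.e. a 1 GB cap
        }
    }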
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java 
b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 9f08c3c..4aad400 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -94,6 +94,7 @@ public class DMLConfig
        public static final String GPU_MEMORY_UTILIZATION_FACTOR = 
"sysml.gpu.memory.util.factor";
        public static final String FLOATING_POINT_PRECISION = 
"sysml.floating.point.precision"; // String to specify the datatype to use 
internally: supported values are double, single
        public static final String PRINT_GPU_MEMORY_INFO = 
"sysml.gpu.print.memoryInfo";
+       public static final String EVICTION_SHADOW_BUFFERSIZE = 
"sysml.gpu.eviction.shadow.bufferSize";
 
        // supported prefixes for custom map/reduce configurations
        public static final String PREFIX_MAPRED = "mapred";
@@ -136,6 +137,7 @@ public class DMLConfig
                _defaultVals.put(NATIVE_BLAS_DIR,        "none" );
                _defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" );
                _defaultVals.put(PRINT_GPU_MEMORY_INFO,  "false" );
+               _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE,  "0.0" );
                _defaultVals.put(STATS_MAX_WRAP_LEN,     "30" );
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
                _defaultVals.put(AVAILABLE_GPUS,         "-1");
@@ -426,7 +428,7 @@ public class DMLConfig
                                COMPRESSED_LINALG, 
                                CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, 
CODEGEN_PLANCACHE, CODEGEN_LITERALS,
                                EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, 
PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE,
-                               AVAILABLE_GPUS, SYNCHRONIZE_GPU, 
EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY
+                               AVAILABLE_GPUS, SYNCHRONIZE_GPU, 
EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, 
EVICTION_SHADOW_BUFFERSIZE
                }; 
                
                StringBuilder sb = new StringBuilder();

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 0265c33..b2a78d4 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -410,7 +410,7 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                        getCache();
                
                //call acquireHostRead if gpuHandle is set as well as is 
allocated
-               if( DMLScript.USE_ACCELERATOR ) {
+               if( DMLScript.USE_ACCELERATOR && _gpuObjects != null ) {
                        boolean copiedFromGPU = false;
                        for (Map.Entry<GPUContext, GPUObject> kv : 
_gpuObjects.entrySet()) {
                                GPUObject gObj = kv.getValue();
@@ -617,11 +617,12 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                        _rddHandle.setBackReference(null);
                if( _bcHandle != null )
                        _bcHandle.setBackReference(null);
-               if( _gpuObjects != null )
+               if( _gpuObjects != null ) {
                        for (GPUObject gObj : _gpuObjects.values())
                                if (gObj != null)
                                        gObj.clearData(null, 
DMLScript.EAGER_CUDA_FREE);
-
+               }
+               
                // change object state EMPTY
                setDirty(false);
                setEmpty();
@@ -684,7 +685,7 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
 
                LOG.trace("Exporting " + this.getDebugName() + " to " + fName + 
" in format " + outputFormat);
                
-               if( DMLScript.USE_ACCELERATOR ) {
+               if( DMLScript.USE_ACCELERATOR && _gpuObjects != null ) {
                        boolean copiedFromGPU = false;
                        for (Map.Entry<GPUContext, GPUObject> kv : 
_gpuObjects.entrySet()) {
                                GPUObject gObj = kv.getValue();

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
index c90beef..b619fa9 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
@@ -29,6 +29,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.utils.GPUStatistics;
 
 import jcuda.Pointer;
 
@@ -55,7 +56,16 @@ public class GPULazyCudaFreeMemoryManager {
                if (rmvarGPUPointers.containsKey(size)) {
                        if(LOG.isTraceEnabled())
                                LOG.trace("Getting rmvar-ed pointers for size:" 
+ size);
+                       boolean measureTime = opcode != null && 
DMLScript.FINEGRAINED_STATISTICS; 
+                       long t0 = measureTime ? System.nanoTime() : 0;
                        Pointer A = remove(rmvarGPUPointers, size); // remove 
from rmvarGPUPointers as you are not calling cudaFree
+                       long totalTime = System.nanoTime() - t0;
+                       if(DMLScript.STATISTICS) {
+                               GPUStatistics.cudaAllocReuseCount.increment();
+                       }
+                       if(measureTime) {
+                               GPUStatistics.maintainCPMiscTimes(opcode, 
GPUInstruction.MISC_TIMER_REUSE, totalTime);
+                       }
                        return A;
                }
                else {
@@ -63,6 +73,18 @@ public class GPULazyCudaFreeMemoryManager {
                }
        }
        
+       /**
+        * Convenient method to add misc timers
+        * 
+        * @param opcode opcode
+        * @param instructionLevelTimer member of GPUInstruction
+        * @param startTime start time
+        */
+       void addMiscTime(String opcode, String instructionLevelTimer, long 
startTime) {
+               if (opcode != null && DMLScript.FINEGRAINED_STATISTICS)
+                       GPUStatistics.maintainCPMiscTimes(opcode, 
instructionLevelTimer, System.nanoTime() - startTime);
+       }
+       
        public Set<Pointer> getAllPointers() {
                return rmvarGPUPointers.values().stream().flatMap(ptrs -> 
ptrs.stream()).collect(Collectors.toSet());
        }
@@ -82,9 +104,15 @@ public class GPULazyCudaFreeMemoryManager {
                Optional<Long> toClear = 
rmvarGPUPointers.entrySet().stream().filter(e -> e.getValue().size() > 0).map(e 
-> e.getKey())
                                .filter(size -> size >= minSize).min((s1, s2) 
-> s1 < s2 ? -1 : 1);
                if(toClear.isPresent()) {
-                       long t0 = opcode != null && 
DMLScript.FINEGRAINED_STATISTICS ?  System.nanoTime() : 0;
+                       boolean measureTime = opcode != null && 
DMLScript.FINEGRAINED_STATISTICS;
+                       long t0 = measureTime ?  System.nanoTime() : 0;
                        Pointer A = remove(rmvarGPUPointers, toClear.get()); // 
remove from rmvarGPUPointers as you are not calling cudaFree
-                       gpuManager.addMiscTime(opcode, 
GPUInstruction.MISC_TIMER_REUSE, t0);
+                       if(measureTime) {
+                               gpuManager.addMiscTime(opcode, 
GPUInstruction.MISC_TIMER_REUSE, t0);
+                       }
+                       if(DMLScript.STATISTICS) {
+                               GPUStatistics.cudaAllocReuseCount.increment();
+                       }
                        return A;
                }
                return null;

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
index 610df23..cbb8d4e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
@@ -18,9 +18,7 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
-import java.util.Comparator;
 import java.util.HashSet;
-import java.util.Optional;
 import java.util.Set;
 import java.util.stream.Collectors;
 
@@ -54,7 +52,10 @@ public class GPUMatrixMemoryManager {
        long getWorstCaseContiguousMemorySize(GPUObject gpuObj) {
                long ret = 0;
                if(!gpuObj.isDensePointerNull()) {
-                       ret = 
gpuManager.allPointers.get(gpuObj.getDensePointer()).getSizeInBytes();
+                       if(gpuObj.shadowPointer == null)
+                               ret = 
gpuManager.allPointers.get(gpuObj.getDensePointer()).getSizeInBytes();
+                       else
+                               ret = 0; // evicted hence no contiguous memory 
on GPU
                }
                else if(gpuObj.getJcudaSparseMatrixPtr() != null) {
                        CSRPointer sparsePtr = gpuObj.getJcudaSparseMatrixPtr();
@@ -81,6 +82,7 @@ public class GPUMatrixMemoryManager {
                        LOG.warn("Matrix allocated in both dense and sparse 
format");
                }
                if(!gObj.isDensePointerNull()) {
+                       // && gObj.evictedDenseArr == null - Ignore evicted 
array
                        ret.add(gObj.getDensePointer());
                }
                if(gObj.getSparseMatrixCudaPointer() != null) {
@@ -107,16 +109,6 @@ public class GPUMatrixMemoryManager {
        HashSet<GPUObject> gpuObjects = new HashSet<>();
        
        /**
-        * Get GPUObjects from the first memory sections "Matrix Memory"
-        * @param locked return locked GPU objects if true
-        * @param dirty return dirty GPU objects if true
-        * @return set of GPU Objects
-        */
-       Set<GPUObject> getGPUObjects(boolean locked, boolean dirty) {
-               return gpuObjects.stream().filter(gObj -> gObj.isLocked() == 
locked && gObj.isDirty() == dirty).collect(Collectors.toSet());
-       }
-       
-       /**
         * Return all pointers in the first section
         * @return all pointers in this section
         */
@@ -135,32 +127,6 @@ public class GPUMatrixMemoryManager {
        }
        
        /**
-        * Clear the memory of the gpu object that matches the provided 
parameters
-        * 
-        * @param locked is locked
-        * @param dirty is dirty
-        * @param minSize of atleast given size
-        * @param comparator sorting comparator in case there are more than one 
gpu object that matches above parameters
-        * @param opcode instruction code
-        * @return true if a gpu object satisfies the above condition else false
-        * @throws DMLRuntimeException if error occurs
-        */
-       boolean clear(boolean locked, boolean dirty, long minSize, 
Comparator<GPUObject> comparator, String opcode) throws DMLRuntimeException {
-               Optional<GPUObject> toClear = getGPUObjects(locked, 
dirty).stream()
-                               .filter(gObj -> 
getWorstCaseContiguousMemorySize(gObj) >= minSize)
-                                       .max(comparator);
-               if(toClear.isPresent()) {
-                       GPUObject gObj = toClear.get();
-                       if(gObj.dirty) 
-                               gObj.copyFromDeviceToHost(opcode, true, true); 
// Perform eviction if dirty
-                       else
-                               gObj.clearData(opcode, true);
-                       gpuObjects.remove(gObj);
-               }
-               return toClear.isPresent();
-       }
-       
-       /**
         * Clear all unlocked gpu objects
         * 
         * @param opcode instruction code

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index 45611a4..acfba66 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -29,6 +29,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map.Entry;
+import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.atomic.LongAdder;
 import java.util.stream.Collectors;
@@ -159,14 +160,37 @@ public class GPUMemoryManager {
         * 
         * @param A pointer
         * @param size size in bytes
+        * @param printDebugMessage debug message
         * @return allocated pointer
         */
-       private Pointer cudaMallocNoWarn(Pointer A, long size) {
+       private Pointer cudaMallocNoWarn(Pointer A, long size, String 
printDebugMessage) {
+               long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
                try {
                        cudaMalloc(A, size);
                        allPointers.put(A, new PointerInfo(size));
+                       if(DMLScript.STATISTICS) {
+                               long totalTime = System.nanoTime() - t0;
+                               
GPUStatistics.cudaAllocSuccessTime.add(totalTime);
+                               GPUStatistics.cudaAllocSuccessCount.increment();
+                               GPUStatistics.cudaAllocTime.add(totalTime);
+                               GPUStatistics.cudaAllocCount.increment();
+                       }
+                       if(printDebugMessage != null && 
(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) )  {
+                               LOG.info("Success: " + printDebugMessage + ":" 
+ byteCountToDisplaySize(size));
+                       }
                        return A;
                } catch(jcuda.CudaException e) {
+                       if(DMLScript.STATISTICS) {
+                               long totalTime = System.nanoTime() - t0;
+                               
GPUStatistics.cudaAllocFailedTime.add(System.nanoTime() - t0);
+                               GPUStatistics.cudaAllocFailedCount.increment();
+                               GPUStatistics.cudaAllocTime.add(totalTime);
+                               GPUStatistics.cudaAllocCount.increment();
+                       }
+                       if(printDebugMessage != null && 
(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) )  {
+                               LOG.info("Failed: " + printDebugMessage + ":" + 
byteCountToDisplaySize(size));
+                               LOG.info("GPU Memory info " + printDebugMessage 
+ ":" + toString());
+                       }
                        return null;
                }
        }
@@ -218,180 +242,88 @@ public class GPUMemoryManager {
                        LOG.info("GPU Memory info during malloc:" + toString());
                }
                
-               long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-               long mallocStart = 0;
                // Step 1: First try reusing exact match in rmvarGPUPointers to 
avoid holes in the GPU memory
                Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, 
size);
-               if(A != null)
-                       addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, 
t0);
                
                Pointer tmpA = (A == null) ? new Pointer() : null;
                // Step 2: Allocate a new pointer in the GPU memory (since 
memory is available)
+               // Step 3 has potential to create holes as well as limit future 
reuse, hence perform this step before step 3.
                if(A == null && size <= getAvailableMemory()) {
-                       mallocStart = DMLScript.STATISTICS ? System.nanoTime() 
: 0;
-                       A = cudaMallocNoWarn(tmpA, size); // Try malloc rather 
than check available memory to avoid fragmentation related issues
-                       addMiscTime(null, GPUStatistics.cudaEvictMallocTime, 
GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, 
mallocStart);
-                       if(LOG.isTraceEnabled()) {
-                               if(A == null)
-                                       LOG.trace("Couldnot allocate a new 
pointer in the GPU memory:" + byteCountToDisplaySize(size));
-                               else
-                                       LOG.trace("Allocated a new pointer in 
the GPU memory:" + byteCountToDisplaySize(size));
-                       }
+                       // This can fail in case of fragmented memory, so don't 
issue any warning
+                       A = cudaMallocNoWarn(tmpA, size, "allocate a new 
pointer");
                }
                
-               // Reusing one rmvar-ed pointer (Step 3) is preferred to 
reusing multiple pointers as the latter may not be contiguously allocated.
-               // (Step 4 or using any other policy that doesnot take memory 
into account).
-               
                // Step 3: Try reusing non-exact match entry of rmvarGPUPointers
                if(A == null) { 
                        A = 
lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
                        if(A != null) {
                                guardedCudaFree(A);
-                               mallocStart = DMLScript.STATISTICS ? 
System.nanoTime() : 0;
-                               A = cudaMallocNoWarn(tmpA, size); // Try malloc 
rather than check available memory to avoid fragmentation related issues
-                               addMiscTime(null, 
GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, 
GPUInstruction.MISC_TIMER_EVICT, mallocStart);
-                               if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
-                                       if(A == null)
-                                               LOG.info("Couldnot reuse 
non-exact match of rmvarGPUPointers:" + byteCountToDisplaySize(size));
-                                       else {
-                                               LOG.info("Reuses a non-exact 
match from rmvarGPUPointers:" + byteCountToDisplaySize(size));
-                                               LOG.info("GPU Memory info after 
reusing a non-exact match from rmvarGPUPointers:" + toString());
-                                       }
-                               }
+                               A = cudaMallocNoWarn(tmpA, size, "reuse 
non-exact match of rmvarGPUPointers"); 
+                               if(A == null)
+                                       LOG.warn("cudaMalloc failed after 
clearing one of rmvarGPUPointers.");
                        }
                }
                
-               // Step 3.b: An optimization missing so as not to over-engineer 
malloc:
-               // Try to find minimal number of contiguously allocated pointer.
-               
+               // Step 4: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
                // Evictions of matrix blocks are expensive (as they might lead them to be written to disk in case of smaller CPU budget) 
                // than doing cuda free/malloc/memset. So, rmvar-ing every blocks (step 4) is preferred to eviction (step 5).
-               
-               // Step 4: Eagerly free-up rmvarGPUPointers and check if memory 
is available on GPU
                if(A == null) {
                        lazyCudaFreeMemoryManager.clearAll();
                        if(size <= getAvailableMemory()) {
-                               A = cudaMallocNoWarn(tmpA, size);
-                               if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
-                                       if(A == null)
-                                               LOG.info("Couldnot allocate a 
new pointer in the GPU memory after eager free:" + 
byteCountToDisplaySize(size));
-                                       else {
-                                               LOG.info("Allocated a new 
pointer in the GPU memory after eager free:" + byteCountToDisplaySize(size));
-                                               LOG.info("GPU Memory info after 
allocating new pointer post lazyCudaFreeMemoryManager.clearAll():" + 
toString());
-                                       }
-                               }
+                               // This can fail in case of fragmented memory, 
so don't issue any warning
+                               A = cudaMallocNoWarn(tmpA, size, "allocate a 
new pointer after eager free");
                        }
                }
                
-               addMiscTime(opcode, GPUStatistics.cudaAllocTime, 
GPUStatistics.cudaAllocCount, GPUInstruction.MISC_TIMER_ALLOCATE, t0);
-               
-               // Step 5: Try eviction based on the given policy
+               // Step 5: Try eviction/clearing exactly one unlocked matrix that satisfies the size restriction
                if(A == null) {
-                       t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-                       
-                       // First, clear unlocked non-dirty matrices greater 
than or equal to size
-                       // Comparator clears the largest matrix to avoid future 
evictions
-                       boolean success = matrixMemoryManager.clear(false, 
false, size, SIMPLE_COMPARATOR_SORT_BY_SIZE, opcode);
-                       if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
-                               if(success) {
-                                       LOG.info("Cleared an unlocked non-dirty 
matrix greater than or equal to " + byteCountToDisplaySize(size));
-                                       LOG.info("GPU Memory info after 
clearing an unlocked non-dirty matrix:" + toString());
-                               }
-                               else
-                                       LOG.info("No unlocked non-dirty matrix 
greater than or equal to " + byteCountToDisplaySize(size) + " found for 
clearing.");
-                       }
-                       if(!success) {
-                               // First, clear unlocked dirty matrices greater 
than or equal to size using the eviction policy
-                               // Comparator clears the largest matrix to 
avoid future evictions
-                               if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
-                                       LOG.info("GPU Memory info before 
eviction:" + toString());
-                               }
-                               success = matrixMemoryManager.clear(false, 
true, size, new EvictionPolicyBasedComparator(size), opcode);
-                               // JCuda.cudaDeviceSynchronize();
-                               if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
-                                       if(success) {
-                                               LOG.info("Evicted an unlocked 
dirty matrix greater than or equal to " + byteCountToDisplaySize(size));
-                                               LOG.info("GPU Memory info after 
evicting an unlocked dirty matrix:" + toString());
-                                       }
-                                       else
-                                               LOG.info("No unlocked dirty 
matrix greater than or equal to " + byteCountToDisplaySize(size) + " found for 
evicted.");
-                               }
-                               
-                               if(!success) {
-                                       // Minor optimization: clear all 
unlocked non-dirty matrices before attempting eviction 
-                                       // Delete all non-dirty
-                                       List<GPUObject> unlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
-                                                                               
                .filter(gpuObj -> !gpuObj.isLocked() && 
!gpuObj.isDirty()).collect(Collectors.toList());
-                                       
matrixMemoryManager.gpuObjects.removeAll(unlockedGPUObjects);
-                                       for(GPUObject toBeRemoved : 
unlockedGPUObjects) {
-                                               toBeRemoved.clearData(opcode, 
true);
-                                       }
-                                       if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
-                                               LOG.info("GPU Memory info after 
clearing all unlocked non-dirty matrices:" + toString());
-                                       }
-                                       mallocStart = DMLScript.STATISTICS ? 
System.nanoTime() : 0;
-                                       A = cudaMallocNoWarn(tmpA, size); // 
Try malloc rather than check available memory to avoid fragmentation related 
issues
-                                       addMiscTime(null, 
GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, 
GPUInstruction.MISC_TIMER_EVICT, mallocStart);
-                               
-                                       // 
---------------------------------------------------------------
-                                       // Evict unlocked GPU objects 
one-by-one and try malloc
-                                       unlockedGPUObjects = null;
-                                       if(A == null) {
-                                               unlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
-                                                               .filter(gpuObj 
-> !gpuObj.isLocked() && gpuObj.isDirty()).collect(Collectors.toList());
-                                               
Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size));
-                                               while(A == null && 
unlockedGPUObjects.size() > 0) {
-                                                       
if(DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.ALIGN_MEMORY) {
-                                                               // TODO: 
Optimize later using sliding window
-                                                               // Evict as 
many sequential dense objects from back of the queue as possible
-                                                               long neededSize 
= size;
-                                                               
while(neededSize >= 0 && unlockedGPUObjects.size() > 0) {
-                                                                       
GPUObject gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
-                                                                       
neededSize -= matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj);
-                                                                       
gpuObj.copyFromDeviceToHost(opcode, true, true);
-                                                               }
-                                                       }
-                                                       else {
-                                                               GPUObject 
gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
-                                                               
gpuObj.copyFromDeviceToHost(opcode, true, true);
-                                                       }
-                                                       mallocStart = 
DMLScript.STATISTICS ? System.nanoTime() : 0;
-                                                       A = 
cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory 
to avoid fragmentation related issues
-                                                       addMiscTime(null, 
GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, 
GPUInstruction.MISC_TIMER_EVICT, mallocStart);
-                                               }
-                                               
if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
-                                                       // greater than or 
equal to " + byteCountToDisplaySize(size)
-                                                       LOG.info("GPU Memory 
info after eviction:" + toString());
-                                               }
-                                               if(unlockedGPUObjects != null 
&& unlockedGPUObjects.size() == 0) {
-                                                       LOG.warn("Evicted all 
unlocked matrices");
-                                               }
-                                       }
-                                       
+                       long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
+                       Optional<GPUObject> sizeBasedUnlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
+                                               .filter(gpuObj -> 
!gpuObj.isLocked() && 
matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
+                                               .min((o1, o2) -> 
worstCaseContiguousMemorySizeCompare(o1, o2));
+                       if(sizeBasedUnlockedGPUObjects.isPresent()) {
+                               evictOrClear(sizeBasedUnlockedGPUObjects.get(), 
opcode);
+                               A = cudaMallocNoWarn(tmpA, size, null);
+                               if(A == null)
+                                       LOG.warn("cudaMalloc failed after 
clearing/evicting based on size.");
+                               if(DMLScript.STATISTICS) {
+                                       long totalTime = System.nanoTime() - t0;
+                                       
GPUStatistics.cudaEvictTime.add(totalTime);
+                                       
GPUStatistics.cudaEvictSizeTime.add(totalTime);
+                                       
GPUStatistics.cudaEvictCount.increment();
+                                       
GPUStatistics.cudaEvictSizeCount.increment();
                                }
-                               // 
---------------------------------------------------------------
-                       }
-                       addMiscTime(opcode, GPUStatistics.cudaEvictTime, 
GPUStatistics.cudaEvictionCount, GPUInstruction.MISC_TIMER_EVICT, t0);
-                       if(A == null) {
-                               A = cudaMallocNoWarn(tmpA, size); // if the 
matrix is not allocated via eviction
                        }
-                       if(A == null) {
-                               LOG.warn("cudaMalloc failed immediately after 
cudaMemGetInfo reported that memory of size " 
-                                               + byteCountToDisplaySize(size) 
+ " is available. "
-                                               + "This usually happens if 
there are external programs trying to grab on to memory in parallel or there is 
potential fragmentation.");
+               }
+               
+               // Step 6: Try eviction/clearing one-by-one based on the given policy without size restriction
+               if(A == null) {
+                       long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
+                       // 
---------------------------------------------------------------
+                       // Evict unlocked GPU objects one-by-one and try malloc
+                       List<GPUObject> unlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
+                                               .filter(gpuObj -> 
!gpuObj.isLocked()).collect(Collectors.toList());
+                       Collections.sort(unlockedGPUObjects, new 
EvictionPolicyBasedComparator(size));
+                       while(A == null && unlockedGPUObjects.size() > 0) {
+                               
evictOrClear(unlockedGPUObjects.remove(unlockedGPUObjects.size()-1), opcode);
+                               A = cudaMallocNoWarn(tmpA, size, null);
+                               if(DMLScript.STATISTICS) 
+                                       
GPUStatistics.cudaEvictCount.increment();
                        }
-                       else if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
-                               LOG.info("Malloc after eviction/clearing is 
successful.");
+                       if(DMLScript.STATISTICS) {
+                               long totalTime = System.nanoTime() - t0;
+                               GPUStatistics.cudaEvictTime.add(totalTime);
                        }
                }
                
-               // Step 6: Handle defragmentation
+               
+               // Step 7: Handle defragmentation
                if(A == null) {
                        LOG.warn("Potential fragmentation of the GPU memory. 
Forcibly evicting all ...");
                        LOG.info("Before clearAllUnlocked, GPU Memory info:" + 
toString());
                        matrixMemoryManager.clearAllUnlocked(opcode);
                        LOG.info("GPU Memory info after evicting all unlocked 
matrices:" + toString());
-                       A = cudaMallocNoWarn(tmpA, size);
+                       A = cudaMallocNoWarn(tmpA, size, null);
                }
                
                if(A == null) {
@@ -399,12 +331,29 @@ public class GPUMemoryManager {
                                        + toString());
                }
                
-               t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+               long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
                cudaMemset(A, 0, size);
                addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, 
GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
                return A;
        }
        
+       private int worstCaseContiguousMemorySizeCompare(GPUObject o1, 
GPUObject o2) {
+               long ret = 
matrixMemoryManager.getWorstCaseContiguousMemorySize(o1) - 
matrixMemoryManager.getWorstCaseContiguousMemorySize(o2);
+               return ret < 0 ? -1 : (ret == 0 ? 0 : 1);
+       }
+       
+       private void evictOrClear(GPUObject gpuObj, String opcode) {
+               boolean eagerDelete = true;
+               if(gpuObj.isDirty()) {
+                       // Eviction
+                       gpuObj.copyFromDeviceToHost(opcode, true, eagerDelete);
+               }
+               else {
+                       // Clear without copying
+                       gpuObj.clearData(opcode, eagerDelete);
+               }
+       }
+       
        // --------------- Developer Utilities to debug potential memory leaks 
------------------------
        private void printPointers(Set<Pointer> pointers, StringBuilder sb) {
                HashMap<String, Integer> frequency = new HashMap<>();
@@ -657,8 +606,6 @@ public class GPUMemoryManager {
                return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
        }
        
-       private static Comparator<GPUObject> SIMPLE_COMPARATOR_SORT_BY_SIZE = 
(o1, o2) -> o1.getSizeOnDevice() < o2.getSizeOnDevice() ? -1 : 1;
-       
        private static class CustomPointer extends Pointer {
                public CustomPointer(Pointer p) {
                        super(p);
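
The restructured malloc above now walks a fixed seven-step cascade: exact
rmvar reuse, direct cudaMalloc, non-exact rmvar reuse, eager free, size-based
evict/clear, policy-based evict/clear, and finally clear-all. The following
self-contained sketch only illustrates the "first successful step wins"
pattern used there; the step suppliers and names are placeholders, not
SystemML code:

    import java.util.Arrays;
    import java.util.List;
    import java.util.function.Supplier;

    /** Hypothetical illustration of the step-wise fallback in GPUMemoryManager.malloc. */
    public class AllocationCascadeSketch {

        // Try each allocation strategy in order, from cheapest to most expensive,
        // and stop at the first one that yields a non-null result.
        static <T> T firstSuccessful(List<Supplier<T>> steps) {
            for (Supplier<T> step : steps) {
                T result = step.get();
                if (result != null)
                    return result;
            }
            return null;
        }

        public static void main(String[] args) {
            // Dummy steps standing in for: (1) exact rmvar reuse, (2) direct cudaMalloc,
            // (3) non-exact rmvar reuse, (4) eager free, (5) size-based evict/clear,
            // (6) policy-based evict/clear, (7) clear all unlocked matrices.
            List<Supplier<String>> steps = Arrays.asList(
                () -> null,                  // step 1 fails in this toy run
                () -> null,                  // step 2 fails in this toy run
                () -> "pointer-from-step-3"  // step 3 succeeds
            );
            System.out.println(firstSuccessful(steps)); // prints: pointer-from-step-3
        }
    }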

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 6125d15..26cbd97 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -43,6 +43,7 @@ import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
 import org.apache.sysml.utils.GPUStatistics;
 
 import jcuda.Pointer;
+import jcuda.Sizeof;
 import jcuda.jcusparse.cusparseDirection;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
@@ -99,6 +100,18 @@ public class GPUObject {
         */
        protected MatrixObject mat = null;
        
+       float[] shadowPointer = null;
+       private static boolean _warnedAboutShadowBuffer = false;
+       public boolean canFitIntoShadowBuffer() {
+               int numBytes = 
toIntExact(mat.getNumRows()*mat.getNumColumns())*Sizeof.FLOAT;
+               boolean ret = DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + 
numBytes <= DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES;
+               if(!ret && !_warnedAboutShadowBuffer) {
+                       LOG.warn("Shadow buffer is full, so using CP bufferpool 
instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize.");
+                       _warnedAboutShadowBuffer = true;
+               }
+               return ret;
+       }
+       
        // 
----------------------------------------------------------------------
        // Methods used to access, set and check jcudaDenseMatrixPtr
        
@@ -108,6 +121,12 @@ public class GPUObject {
         * @return a pointer to the dense matrix
         */
        public Pointer getDensePointer() {
+               if(jcudaDenseMatrixPtr == null && shadowPointer != null && 
getJcudaSparseMatrixPtr() == null) {
+                       long numBytes = 
shadowPointer.length*LibMatrixCUDA.sizeOfDataType;
+                       jcudaDenseMatrixPtr = gpuContext.allocate(null, 
numBytes);
+                       cudaMemcpy(jcudaDenseMatrixPtr, 
Pointer.to(shadowPointer), numBytes, 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
+                       clearShadowPointer();
+               }
                return jcudaDenseMatrixPtr;
        }
        
@@ -125,9 +144,21 @@ public class GPUObject {
         */
        public void clearDensePointer() {
                jcudaDenseMatrixPtr = null;
+               clearShadowPointer();
        }
        
        /**
+        * Removes shadow pointer
+        */
+       public void clearShadowPointer() {
+               if(shadowPointer != null) {
+                       DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES -= 
shadowPointer.length*Sizeof.FLOAT;
+               }
+               shadowPointer = null;
+       }
+       
+       
+       /**
         * Convenience method to directly set the dense matrix pointer on GPU
         *
         * @param densePtr dense pointer
@@ -249,15 +280,7 @@ public class GPUObject {
                //cudaDeviceSynchronize();
                int[] nnzC = { -1 };
 
-               long t2 = 0;
-               if (DMLScript.STATISTICS)
-                       t2 = System.nanoTime();
                cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, 
getIntSizeOf(1), cudaMemcpyDeviceToHost);
-               if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaFromDevTime.add(System.nanoTime() - 
t2);
-               if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaFromDevCount.add(1);
-
                if (nnzC[0] == -1) {
                        throw new DMLRuntimeException(
                                        "cusparseDnnz did not calculate the 
correct number of nnz from the sparse-matrix vector mulitply on the GPU");
@@ -299,7 +322,7 @@ public class GPUObject {
                }
                this.jcudaSparseMatrixPtr = sparseMatrixPtr;
                this.isSparse = true;
-               if (!isDensePointerNull()) {
+               if (!isDensePointerNull() && shadowPointer == null) {
                        cudaFreeHelper(getDensePointer());
                        clearDensePointer();
                }
@@ -321,7 +344,7 @@ public class GPUObject {
                int rows = toIntExact(mat.getNumRows());
                int cols = toIntExact(mat.getNumColumns());
 
-               if (isDensePointerNull() || !isAllocated())
+               if ((isDensePointerNull() && shadowPointer == null) || 
!isAllocated())
                        throw new DMLRuntimeException("Expected allocated dense 
matrix before denseToSparse() call");
 
                denseRowMajorToColumnMajor();
@@ -454,7 +477,7 @@ public class GPUObject {
        }
 
        public boolean isAllocated() {
-               boolean eitherAllocated = (!isDensePointerNull() || 
getJcudaSparseMatrixPtr() != null);
+               boolean eitherAllocated = shadowPointer != null || 
!isDensePointerNull() || getJcudaSparseMatrixPtr() != null;
                return eitherAllocated;
        }
 
@@ -916,7 +939,55 @@ public class GPUObject {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", 
GPUContext=" + getGPUContext());
                }
-               if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) {
+               if(shadowPointer != null) {
+                       if(isEviction) {
+                               // If already copied to shadow buffer as part of previous eviction, do nothing.
+                               return;
+                       }
+                       else {
+                               // If already copied to shadow buffer as part of previous eviction and this is not an eviction (i.e. bufferpool call for subsequent CP/Spark instruction),
+                               // then copy from shadow buffer to MatrixObject.
+                               long start = DMLScript.STATISTICS ? 
System.nanoTime() : 0;
+                               MatrixBlock tmp = new 
MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), 
false);
+                               tmp.allocateDenseBlock();
+                               double [] tmpArr = tmp.getDenseBlockValues();
+                               for(int i = 0; i < shadowPointer.length; i++) {
+                                       tmpArr[i] = shadowPointer[i];
+                               }
+                               mat.acquireModify(tmp);
+                               mat.release();
+                               clearShadowPointer();
+                               dirty = false;
+                               if (DMLScript.STATISTICS) {
+                                       long totalTime = System.nanoTime() - 
start;
+                                       
GPUStatistics.cudaFromShadowToHostTime.add(totalTime);
+                                       
GPUStatistics.cudaFromShadowToHostCount.increment();
+                                       // Part of dev -> host, not eviction
+                                       
GPUStatistics.cudaFromDevTime.add(totalTime);
+                                       
GPUStatistics.cudaFromDevCount.increment();
+                               }
+                               return;
+                       }
+               }
+               else if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && 
isEviction && eagerDelete && !isDensePointerNull() && canFitIntoShadowBuffer()) 
{
+                       // Perform shadow buffering if (1) single precision, (2) during eviction, (3) for dense matrices, and (4) if the given matrix can fit into the shadow buffer. 
+                       long start = DMLScript.STATISTICS ? System.nanoTime() : 
0;
+                       int numElems = 
toIntExact(mat.getNumRows()*mat.getNumColumns());
+                       shadowPointer = new float[numElems];
+                       DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES += 
shadowPointer.length*Sizeof.FLOAT;
+                       cudaMemcpy(Pointer.to(shadowPointer), 
jcudaDenseMatrixPtr, numElems*LibMatrixCUDA.sizeOfDataType, 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
+                       getGPUContext().cudaFreeHelper(instName, 
jcudaDenseMatrixPtr, eagerDelete);
+                       jcudaDenseMatrixPtr = null;
+                       if (DMLScript.STATISTICS) {
+                               // Eviction time measure in malloc
+                               long totalTime = System.nanoTime() - start;
+                               
GPUStatistics.cudaFromDevToShadowTime.add(totalTime);
+                               
GPUStatistics.cudaFromDevToShadowCount.increment();
+                               
+                       }
+                       return;
+               }
+               else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == 
null) {
                        throw new DMLRuntimeException(
                                        "Cannot copy from device to host as 
JCuda dense/sparse pointer is not allocated");
                }
@@ -939,13 +1010,10 @@ public class GPUObject {
                        tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
                        tmp.allocateDenseBlock();
                        LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
-                                               getDensePointer(), tmp.getDenseBlockValues(), instName, isEviction); 
-                       // int nnz = LibMatrixCUDA.computeNNZ(getGPUContext(), getJcudaDenseMatrixPtr(), toIntExact(mat.getNumRows()*mat.getNumColumns()));
-                       // tmp.setNonZeros(nnz);
+                                               getDensePointer(), tmp.getDenseBlockValues(), instName, isEviction);
                        if(eagerDelete)
                                clearData(instName, true);
-                       // tmp.recomputeNonZeros();
-                       tmp.setNonZeros(-1);
+                       tmp.recomputeNonZeros();
                } else {
                        int rows = toIntExact(mat.getNumRows());
                        int cols = toIntExact(mat.getNumColumns());
@@ -962,10 +1030,11 @@ public class GPUObject {
                }
                mat.acquireModify(tmp);
                mat.release();
-               if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
-               if (DMLScript.STATISTICS) {
+               if (DMLScript.STATISTICS && !isEviction) {
+                       // Eviction time is measured in malloc
+                       long totalTime = System.nanoTime() - start;
                        int count = !isDensePointerNull() ? 1 : 3;
+                       GPUStatistics.cudaFromDevTime.add(totalTime);
                        GPUStatistics.cudaFromDevCount.add(count);
                }
                dirty = false;
@@ -990,6 +1059,7 @@ public class GPUObject {
                        getJcudaSparseMatrixPtr().deallocate(eager);
                }
                clearDensePointer();
+               clearShadowPointer();
                jcudaSparseMatrixPtr = null;
                resetReadWriteLock();
                getGPUContext().getMemoryManager().removeGPUObject(this);
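
The cudaFromShadowToHostTime/Count adders introduced in GPUStatistics below imply a complementary restore path that widens the float[] shadow copy back into the double[] dense block of the host MatrixBlock. A minimal sketch of that idea follows; the helper name copyShadowToHost is a hypothetical placeholder, since the actual restore method is not part of this hunk.

    // Illustrative sketch only: restoring an evicted matrix from the host-side shadow buffer.
    // The widening loop performs the FP32 -> FP64 conversion on the CPU instead of the GPU.
    public class ShadowRestoreSketch {
        static double[] copyShadowToHost(float[] shadowPointer) {
            double[] denseBlock = new double[shadowPointer.length];
            for (int i = 0; i < shadowPointer.length; i++) {
                denseBlock[i] = shadowPointer[i]; // widen each FP32 value to FP64
            }
            return denseBlock;
        }
    }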

http://git-wip-us.apache.org/repos/asf/systemml/blob/bdf27084/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index 18270cc..12abf21 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -36,7 +36,7 @@ import org.apache.sysml.api.DMLScript;
  */
 public class GPUStatistics {
        private static int iNoOfExecutedGPUInst = 0;
-
+       
        public static long cudaInitTime = 0;
        public static long cudaLibrariesInitTime = 0;
        public static LongAdder cudaSparseToDenseTime = new LongAdder();        // time spent in converting sparse matrix block to dense
@@ -47,12 +47,16 @@ public class GPUStatistics {
        public static LongAdder cudaSparseConversionCount = new LongAdder();
 
        public static LongAdder cudaAllocTime = new LongAdder();             // time spent in allocating memory on the GPU
+       public static LongAdder cudaAllocSuccessTime = new LongAdder();      // time spent in successful allocation
+       public static LongAdder cudaAllocFailedTime = new LongAdder();       // time spent in unsuccessful allocation
        public static LongAdder cudaDeAllocTime = new LongAdder();           // time spent in deallocating memory on the GPU
        public static LongAdder cudaMemSet0Time = new LongAdder();           // time spent in setting memory to 0 on the GPU (part of reusing and for new allocates)
        public static LongAdder cudaToDevTime = new LongAdder();             // time spent in copying data from host (CPU) to device (GPU) memory
        public static LongAdder cudaFromDevTime = new LongAdder();           // time spent in copying data from device to host
+       public static LongAdder cudaFromShadowToHostTime = new LongAdder();  // time spent in copying data from shadow to host
+       public static LongAdder cudaFromDevToShadowTime = new LongAdder();   // time spent in copying data from device to shadow
        public static LongAdder cudaEvictTime = new LongAdder();             // time spent in eviction
-       public static LongAdder cudaEvictMallocTime = new LongAdder();       // time spent in eviction
+       public static LongAdder cudaEvictSizeTime = new LongAdder();         // time spent in eviction
        public static LongAdder cudaFloat2DoubleTime = new LongAdder();      // time spent in converting float to double during eviction
        public static LongAdder cudaDouble2FloatTime = new LongAdder();      // time spent in converting double to float during eviction
        public static LongAdder cudaEvictMemcpyTime = new LongAdder();       // time spent in cudaMemcpy kernel during eviction
@@ -63,10 +67,15 @@ public class GPUStatistics {
        public static LongAdder cudaMemSet0Count = new LongAdder();
        public static LongAdder cudaToDevCount = new LongAdder();
        public static LongAdder cudaFromDevCount = new LongAdder();
-       public static LongAdder cudaEvictionCount = new LongAdder();
+       public static LongAdder cudaFromShadowToHostCount = new LongAdder();
+       public static LongAdder cudaFromDevToShadowCount = new LongAdder();
+       public static LongAdder cudaEvictCount = new LongAdder();
+       public static LongAdder cudaEvictSizeCount = new LongAdder();
        public static LongAdder cudaFloat2DoubleCount = new LongAdder();
        public static LongAdder cudaDouble2FloatCount = new LongAdder();
-       public static LongAdder cudaEvictionMallocCount = new LongAdder();
+       public static LongAdder cudaAllocSuccessCount = new LongAdder();
+       public static LongAdder cudaAllocFailedCount = new LongAdder();
+       public static LongAdder cudaAllocReuseCount = new LongAdder();
 
        // Per instruction miscellaneous timers.
        // Used to record events in a CP Heavy Hitter instruction and
@@ -94,8 +103,10 @@ public class GPUStatistics {
                cudaMemSet0Count.reset();
                cudaToDevTime.reset();
                cudaFromDevTime.reset();
+               cudaFromShadowToHostTime.reset();
+               cudaFromDevToShadowTime.reset();
                cudaEvictTime.reset();
-               cudaEvictMallocTime.reset();
+               cudaEvictSizeTime.reset();
                cudaFloat2DoubleTime.reset();
                cudaDouble2FloatTime.reset();
                cudaFloat2DoubleCount.reset();
@@ -106,8 +117,15 @@ public class GPUStatistics {
                cudaDeAllocCount.reset();
                cudaToDevCount.reset();
                cudaFromDevCount.reset();
-               cudaEvictionCount.reset();
-               cudaEvictionMallocCount.reset();
+               cudaFromShadowToHostCount.reset();
+               cudaFromDevToShadowCount.reset();
+               cudaEvictCount.reset();
+               cudaEvictSizeCount.reset();
+               cudaAllocSuccessTime.reset();
+               cudaAllocFailedTime.reset();
+               cudaAllocSuccessCount.reset();
+               cudaAllocFailedCount.reset();
+               cudaAllocReuseCount.reset();
                resetMiscTimers();
        }
 
@@ -206,34 +224,45 @@ public class GPUStatistics {
                sb.append("CUDA/CuLibraries init time:\t" + 
String.format("%.3f", cudaInitTime*1e-9) + "/"
                                + String.format("%.3f", 
cudaLibrariesInitTime*1e-9) + " sec.\n");
                sb.append("Number of executed GPU inst:\t" + 
getNoOfExecutedGPUInst() + ".\n");
-               sb.append("GPU mem tx time  
(alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t"
-                               + String.format("%.3f", 
cudaAllocTime.longValue()*1e-9) + "/"
-                               + String.format("%.3f", 
cudaDeAllocTime.longValue()*1e-9) + "/"
-                               + String.format("%.3f", 
cudaMemSet0Time.longValue()*1e-9) + "/"
+               // cudaSparseConversionCount
+               sb.append("GPU mem alloc time  (alloc(success/fail) / dealloc / 
set0):\t"
+                               + String.format("%.3f", 
cudaAllocTime.longValue()*1e-9) + "("
+                               + String.format("%.3f", 
cudaAllocSuccessTime.longValue()*1e-9) + "/"
+                               + String.format("%.3f", 
cudaAllocFailedTime.longValue()*1e-9) + ") / "
+                               + String.format("%.3f", 
cudaDeAllocTime.longValue()*1e-9) + " / "
+                               + String.format("%.3f", 
cudaMemSet0Time.longValue()*1e-9) + " sec.\n");
+               sb.append("GPU mem alloc count (alloc(success/fail/reuse) / 
dealloc / set0):\t"
+                               + cudaAllocCount.longValue() + "("
+                               + cudaAllocSuccessCount.longValue() + "/"
+                               + cudaAllocFailedCount.longValue() + "/" +
+                               + cudaAllocReuseCount.longValue() +") / "
+                               + cudaDeAllocCount.longValue() + " / "
+                               + cudaMemSet0Count.longValue() + ".\n");
+               sb.append("GPU mem tx time  (toDev(d2f) / fromDev(f2d/s2h) / 
evict(d2s/size)):\t"
                                + String.format("%.3f", 
cudaToDevTime.longValue()*1e-9) + "("
-                               + String.format("%.3f", 
cudaDouble2FloatTime.longValue()*1e-9)+ ")/"
+                               + String.format("%.3f", 
cudaDouble2FloatTime.longValue()*1e-9)+ ") / "
                                + String.format("%.3f", 
cudaFromDevTime.longValue()*1e-9) + "("
-                               + String.format("%.3f", 
cudaFloat2DoubleTime.longValue()*1e-9) + ")/"
+                               + String.format("%.3f", 
cudaFloat2DoubleTime.longValue()*1e-9) + "/"
+                               + String.format("%.3f", 
cudaFromShadowToHostTime.longValue()*1e-9) + ") / "
                                + String.format("%.3f", 
cudaEvictTime.longValue()*1e-9) + "("
-                               + String.format("%.3f", 
cudaEvictMallocTime.longValue()*1e-9) + ") sec.\n");
-               sb.append("GPU mem tx count 
(alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t"
-                               + cudaAllocCount.longValue() + "/"
-                               + cudaDeAllocCount.longValue() + "/"
-                               + cudaMemSet0Count.longValue() + "/"
-                               + cudaSparseConversionCount.longValue() + "/"
+                               + String.format("%.3f", 
cudaFromDevToShadowTime.longValue()*1e-9) + "/"
+                               + String.format("%.3f", 
cudaEvictSizeTime.longValue()*1e-9) + ") sec.\n");
+               sb.append("GPU mem tx count (toDev(d2f) / fromDev(f2d/s2h) / 
evict(d2s/size)):\t"
                                + cudaToDevCount.longValue() + "("
-                               + cudaDouble2FloatCount.longValue() + ")/"
+                               + cudaDouble2FloatCount.longValue() + ") / "
                                + cudaFromDevCount.longValue() + "("
-                               + cudaFloat2DoubleCount.longValue() + ")/"
-                               + cudaEvictionCount.longValue() + "("
-                               + cudaEvictionMallocCount.longValue() + ").\n");
-               sb.append("GPU conversion time  
(sparseConv/sp2dense/dense2sp):\t"
-                               + String.format("%.3f", 
cudaSparseConversionTime.longValue()*1e-9) + "/"
-                               + String.format("%.3f", 
cudaSparseToDenseTime.longValue()*1e-9) + "/"
+                               + cudaFloat2DoubleCount.longValue() + "/"
+                               + cudaFromShadowToHostCount.longValue() + ") / "
+                               + cudaEvictCount.longValue() + "("
+                               + cudaFromDevToShadowCount.longValue() + "/" + 
+                               + cudaEvictSizeCount.longValue() + ").\n");
+               sb.append("GPU conversion time  (sparseConv / sp2dense / 
dense2sp):\t"
+                               + String.format("%.3f", 
cudaSparseConversionTime.longValue()*1e-9) + " / "
+                               + String.format("%.3f", 
cudaSparseToDenseTime.longValue()*1e-9) + " / "
                                + String.format("%.3f", 
cudaDenseToSparseTime.longValue()*1e-9) + " sec.\n");
-               sb.append("GPU conversion count 
(sparseConv/sp2dense/dense2sp):\t"
-                               + cudaSparseConversionCount.longValue() + "/"
-                               + cudaSparseToDenseCount.longValue() + "/"
+               sb.append("GPU conversion count (sparseConv / sp2dense / 
dense2sp):\t"
+                               + cudaSparseConversionCount.longValue() + " / "
+                               + cudaSparseToDenseCount.longValue() + " / "
                                + cudaDenseToSparseCount.longValue() + ".\n");
 
                return sb.toString();
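
To make the new alloc(success/fail) breakdown concrete, the following is a minimal sketch of how LongAdder counters of this kind are typically updated around an allocation attempt; timedAllocate and its BooleanSupplier hook are assumptions for illustration, not the actual GPUMemoryManager API.

    import java.util.concurrent.atomic.LongAdder;
    import java.util.function.BooleanSupplier;

    // Illustrative sketch only: splitting allocation statistics into success/failure buckets.
    public class AllocStatsSketch {
        static final LongAdder cudaAllocSuccessTime  = new LongAdder();
        static final LongAdder cudaAllocFailedTime   = new LongAdder();
        static final LongAdder cudaAllocSuccessCount = new LongAdder();
        static final LongAdder cudaAllocFailedCount  = new LongAdder();

        // 'tryAllocate' stands in for the real cudaMalloc call; it is a hypothetical hook.
        static boolean timedAllocate(BooleanSupplier tryAllocate) {
            long start = System.nanoTime();
            boolean ok = tryAllocate.getAsBoolean();
            long elapsed = System.nanoTime() - start;
            if (ok) {
                cudaAllocSuccessTime.add(elapsed);
                cudaAllocSuccessCount.increment();
            } else {
                cudaAllocFailedTime.add(elapsed);
                cudaAllocFailedCount.increment();
            }
            return ok;
        }
    }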
