http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index 07e766a..c737e92 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -112,6 +112,7 @@ public class DMLScript
                LFU,                            // Evict the least frequently used GPUObject.
                MIN_EVICT,
                MRU,                            // http://www.vldb.org/conf/1985/P127.PDF
+               ALIGN_MEMORY
                // TODO:
                // ARC, // https://dbs.uni-leipzig.de/file/ARC.pdf
                // LOOP_AWARE           // different policies for operations in for/while/parfor loop vs out-side the loop
@@ -172,7 +173,8 @@ public class DMLScript
        public static ExplainType       EXPLAIN             = DMLOptions.defaultOptions.explainType; // explain type
        public static String            DML_FILE_PATH_ANTLR_PARSER = DMLOptions.defaultOptions.filePath; // filename of dml/pydml script
        public static String            FLOATING_POINT_PRECISION = "double";                            // data type to use internally
-       public static EvictionPolicy    GPU_EVICTION_POLICY = EvictionPolicy.LRU;                       // currently employed GPU eviction policy
+       public static EvictionPolicy    GPU_EVICTION_POLICY = EvictionPolicy.ALIGN_MEMORY;              // currently employed GPU eviction policy
+       public static boolean           PRINT_GPU_MEMORY_INFO = false;                                  // whether to print GPU memory-related information
 
        /**
         * Global variable indicating the script type (DML or PYDML). Can be used

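The first hunk above adds ALIGN_MEMORY to the EvictionPolicy enum and switches the default policy to it. Below is a minimal standalone sketch (not SystemML code; the enum is a local copy of the constants visible in the hunk) of how a lower-case configuration value such as "align_memory" maps onto such an enum:

    // Hypothetical demo class; only the enum constants are taken from the diff.
    public class EvictionPolicyDemo {
        enum EvictionPolicy { LRU, LFU, MIN_EVICT, MRU, ALIGN_MEMORY }

        static EvictionPolicy parse(String configValue) {
            // Config values are lower-case ("align_memory"); enum constants are upper-case.
            return EvictionPolicy.valueOf(configValue.trim().toUpperCase());
        }

        public static void main(String[] args) {
            System.out.println(parse("align_memory")); // prints ALIGN_MEMORY
        }
    }
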
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index d587f48..2d913b6 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -27,6 +27,7 @@ import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.codegen.SpoofCompiler;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.Program;
+import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
@@ -70,7 +71,9 @@ public class ScriptExecutorUtils {
                // Whether extra statistics useful for developers and others interested
                // in digging into performance problems are recorded and displayed
                DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS);
+               DMLScript.PRINT_GPU_MEMORY_INFO = dmlconf.getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
                DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU);
+               CacheableData.CACHING_BUFFER_SIZE = dmlconf.getDoubleValue(DMLConfig.CACHING_BUFFER_SIZE);
                DMLScript.EAGER_CUDA_FREE = dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE);
                DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN);
                NativeHelper.initialize(dmlconf.getTextValue(DMLConfig.NATIVE_BLAS_DIR), dmlconf.getTextValue(DMLConfig.NATIVE_BLAS).trim());

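The hunk above copies the two new options into static flags once per script execution. A standalone sketch of that read-and-copy step, using java.util.Properties in place of DMLConfig (the property keys are the ones introduced in DMLConfig.java below; the Properties-based reader itself is only an illustration, not SystemML API):

    import java.util.Properties;

    public class GpuConfigDemo {
        public static void main(String[] args) {
            // Stand-in for the parsed SystemML configuration file.
            Properties conf = new Properties();
            conf.setProperty("sysml.gpu.print.memoryInfo", "true");
            conf.setProperty("sysml.caching.bufferSize", "0.15");

            // Copy into flags, falling back to defaults when a key is absent.
            boolean printGpuMemoryInfo = Boolean.parseBoolean(conf.getProperty("sysml.gpu.print.memoryInfo", "false"));
            double cachingBufferSize = Double.parseDouble(conf.getProperty("sysml.caching.bufferSize", "0.15"));

            System.out.println(printGpuMemoryInfo + " " + cachingBufferSize); // true 0.15
        }
    }
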
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index fb0237f..7279c57 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -81,6 +81,7 @@ public class DMLConfig
        public static final String CODEGEN_OPTIMIZER    = "sysml.codegen.optimizer"; //see SpoofCompiler.PlanSelector
        public static final String CODEGEN_PLANCACHE    = "sysml.codegen.plancache"; //boolean
        public static final String CODEGEN_LITERALS     = "sysml.codegen.literals"; //1..heuristic, 2..always
+       public static final String CACHING_BUFFER_SIZE  = "sysml.caching.bufferSize"; //double: default:0.15
        
        public static final String EXTRA_FINEGRAINED_STATS = "sysml.stats.finegrained"; //boolean
        public static final String STATS_MAX_WRAP_LEN   = "sysml.stats.maxWrapLength"; //int
@@ -92,7 +93,8 @@ public class DMLConfig
        // to handle the tradeoff on calling cudaMemGetInfo too often.
        public static final String GPU_MEMORY_UTILIZATION_FACTOR = "sysml.gpu.memory.util.factor";
        public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single
-
+       public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo";
+       
        // supported prefixes for custom map/reduce configurations
        public static final String PREFIX_MAPRED = "mapred";
        public static final String PREFIX_MAPREDUCE = "mapreduce";
@@ -136,7 +138,9 @@ public class DMLConfig
                _defaultVals.put(STATS_MAX_WRAP_LEN,     "30" );
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
                _defaultVals.put(AVAILABLE_GPUS,         "-1");
-               _defaultVals.put(GPU_EVICTION_POLICY,    "lru");
+               _defaultVals.put(GPU_EVICTION_POLICY,    "align_memory");
+               _defaultVals.put(SYNCHRONIZE_GPU,        "false" );
+               _defaultVals.put(CACHING_BUFFER_SIZE,    "0.15" );
                _defaultVals.put(SYNCHRONIZE_GPU,        "true" );
                _defaultVals.put(EAGER_CUDA_FREE,        "false" );
                _defaultVals.put(FLOATING_POINT_PRECISION, "double" );
@@ -420,7 +424,7 @@ public class DMLConfig
                                CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR,
                                COMPRESSED_LINALG, 
                                CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
-                               EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN,
+                               EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE,
                                AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY
                }; 
                

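DMLConfig pairs each new key with an entry in its defaults map, so a user configuration only has to mention the options it wants to change. A standalone sketch of that defaults-plus-override lookup, restricted to the keys touched by this commit (the "false" default for sysml.gpu.print.memoryInfo is an assumption mirroring DMLScript.PRINT_GPU_MEMORY_INFO = false, not a value shown in the hunk):

    import java.util.HashMap;
    import java.util.Map;

    public class DefaultsDemo {
        public static void main(String[] args) {
            // Built-in defaults, as _defaultVals is populated above.
            Map<String, String> defaults = new HashMap<>();
            defaults.put("sysml.caching.bufferSize", "0.15");
            defaults.put("sysml.gpu.print.memoryInfo", "false"); // assumed default

            // User configuration overrides individual entries.
            Map<String, String> userConf = new HashMap<>();
            userConf.put("sysml.gpu.print.memoryInfo", "true");

            String bufferSize = userConf.getOrDefault("sysml.caching.bufferSize", defaults.get("sysml.caching.bufferSize"));
            String printInfo = userConf.getOrDefault("sysml.gpu.print.memoryInfo", defaults.get("sysml.gpu.print.memoryInfo"));
            System.out.println(bufferSize + " " + printInfo); // 0.15 true
        }
    }
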
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 73f6c08..0bb2a67 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -77,7 +77,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
        // global constant configuration parameters
        public static final long    CACHING_THRESHOLD = (long)Math.max(4*1024, //obj not s.t. caching
                1e-5 * InfrastructureAnalyzer.getLocalMaxMemory());       //if below threshold [in bytes]
-       public static final double  CACHING_BUFFER_SIZE = 0.15; 
+       public static double  CACHING_BUFFER_SIZE = 0.15; 
        public static final RPolicy CACHING_BUFFER_POLICY = RPolicy.FIFO; 
        public static final boolean CACHING_BUFFER_PAGECACHE = false; 
        public static final boolean CACHING_WRITE_CACHE_ON_READ = false;        
@@ -607,7 +607,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
                if( _gpuObjects != null )
                        for (GPUObject gObj : _gpuObjects.values())
                                if (gObj != null)
-                                       gObj.clearData();
+                                       gObj.clearData(null, DMLScript.EAGER_CUDA_FREE);
 
                // change object state EMPTY
                setDirty(false);

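Dropping the final modifier lets the 0.15 default above be overwritten from sysml.caching.bufferSize at startup. A standalone sketch of what the fraction means, assuming the CP write buffer is sized as CACHING_BUFFER_SIZE times the maximum JVM heap (the exact formula inside SystemML may differ):

    public class BufferLimitDemo {
        // Mutable so a configured value can replace the default, as in the diff above.
        public static double CACHING_BUFFER_SIZE = 0.15;

        public static void main(String[] args) {
            CACHING_BUFFER_SIZE = 0.25; // e.g. value read from sysml.caching.bufferSize
            long maxHeap = Runtime.getRuntime().maxMemory();
            long bufferLimit = (long) (CACHING_BUFFER_SIZE * maxHeap);
            System.out.println("Write-buffer limit: " + bufferLimit + " of " + maxHeap + " bytes");
        }
    }
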
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index bdc5b21..7eb4033 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -416,10 +416,6 @@ public class CSRPointer {
                return getGPUContext().allocate(size);
        }
 
-       private void cudaFreeHelper(Pointer toFree, boolean eager) {
-               getGPUContext().cudaFreeHelper(toFree, eager);
-       }
-
        private GPUContext getGPUContext() {
                return gpuContext;
        }
@@ -494,17 +490,20 @@ public class CSRPointer {
         */
        public void deallocate(boolean eager) {
                if (nnz > 0) {
-                       cudaFreeHelper(val, eager);
-                       cudaFreeHelper(rowPtr, eager);
-                       cudaFreeHelper(colInd, eager);
-                       val = null;
-                       rowPtr = null;
-                       colInd = null;
+                       if (val != null)
+                               getGPUContext().cudaFreeHelper(null, val, eager);
+                       if (rowPtr != null)
+                               getGPUContext().cudaFreeHelper(null, rowPtr, eager);
+                       if (colInd != null)
+                               getGPUContext().cudaFreeHelper(null, colInd, eager);
                }
+               val = null;
+               rowPtr = null;
+               colInd = null;
        }
 
        @Override
        public String toString() {
                return "CSRPointer{" + "nnz=" + nnz + '}';
        }
-}
+}
\ No newline at end of file

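The reworked deallocate() frees each CSR component only if it is non-null but always drops the references afterwards, so a repeated call cannot free the same device pointer twice. A standalone sketch of that pattern, with a placeholder Handle type instead of jcuda.Pointer and cudaFreeHelper:

    public class SafeFreeDemo {
        static class Handle {
            boolean freed = false;
            void free() {
                if (freed) throw new IllegalStateException("double free");
                freed = true;
            }
        }

        Handle val, rowPtr, colInd;
        long nnz;

        void deallocate() {
            if (nnz > 0) {
                if (val != null) val.free();
                if (rowPtr != null) rowPtr.free();
                if (colInd != null) colInd.free();
            }
            // Null the fields unconditionally so repeated calls are harmless.
            val = null; rowPtr = null; colInd = null;
        }

        public static void main(String[] args) {
            SafeFreeDemo p = new SafeFreeDemo();
            p.nnz = 3;
            p.val = new Handle(); p.rowPtr = new Handle(); p.colInd = new Handle();
            p.deallocate();
            p.deallocate(); // second call is a no-op
        }
    }
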
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 7f8f106..2d3918c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -37,6 +37,7 @@ import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.utils.GPUStatistics;
+
 import jcuda.Pointer;
 import jcuda.jcublas.cublasHandle;
 import jcuda.jcudnn.cudnnHandle;
@@ -63,6 +64,7 @@ public class GPUContext {
         * active device assigned to this GPUContext instance
         */
        private final int deviceNum;
+       
        /**
         * cudnnHandle for Deep Neural Network operations on the GPU
         */
@@ -130,9 +132,11 @@ public class GPUContext {
                }
        }
 
-       private void initializeCudaLibraryHandles() {
-               deleteCudaLibraryHandles();
-
+       private void initializeCudaLibraryHandles() throws DMLRuntimeException {
+               // We don't need to explicitly delete the handles if we are planning to create them again.
+               // This has a huge performance impact on scripts that have a large number of layers (i.e. FunctionCallCP), for example ResNet.
+               // If this is absolutely required for parfor, please add an appropriate safeguard for non-parfor scripts.
+               // deleteCudaLibraryHandles();
                if (cudnnHandle == null) {
                        cudnnHandle = new cudnnHandle();
                        cudnnCreate(cudnnHandle);
@@ -150,11 +154,6 @@ public class GPUContext {
                        cusparseHandle = new cusparseHandle();
                        cusparseCreate(cusparseHandle);
                }
-
-               if (cusolverDnHandle == null) {
-                       cusolverDnHandle = new cusolverDnHandle();
-                       cusolverDnCreate(cusolverDnHandle);
-               }
                
                if (kernels == null) {
                        kernels = new JCudaKernels();
@@ -204,36 +203,6 @@ public class GPUContext {
                return memoryManager.malloc(instructionName, size);
        }
 
-
-       /**
-        * Does lazy cudaFree calls.
-        *
-        * @param toFree {@link Pointer} instance to be freed
-        */
-       public void cudaFreeHelper(final Pointer toFree) {
-               cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE);
-       }
-
-       /**
-        * Does lazy/eager cudaFree calls.
-        *
-        * @param toFree {@link Pointer} instance to be freed
-        * @param eager  true if to be done eagerly
-        */
-       public void cudaFreeHelper(final Pointer toFree, boolean eager) {
-               cudaFreeHelper(null, toFree, eager);
-       }
-
-       /**
-        * Does lazy cudaFree calls.
-        *
-        * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
-        * @param toFree          {@link Pointer} instance to be freed
-        */
-       public void cudaFreeHelper(String instructionName, final Pointer toFree) {
-               cudaFreeHelper(instructionName, toFree, DMLScript.EAGER_CUDA_FREE);
-       }
-
        /**
         * Does cudaFree calls, lazily.
         *
@@ -290,7 +259,7 @@ public class GPUContext {
         */
        public GPUObject createGPUObject(MatrixObject mo) {
                GPUObject ret = new GPUObject(this, mo);
-               getMemoryManager().addGPUObject(ret);
+               getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(ret);
                return ret;
        }
 
@@ -376,6 +345,15 @@ public class GPUContext {
         * @return cusolverDnHandle for current thread
         */
        public cusolverDnHandle getCusolverDnHandle() {
+               if (cusolverDnHandle == null) {
+                       synchronized(this) {
+                               if (cusolverDnHandle == null) {
+                                       // Since the cusolverDnHandle is rarely used and occupies unnecessary memory, it is only initialized when needed.
+                                       cusolverDnHandle = new cusolverDnHandle();
+                                       cusolverDnCreate(cusolverDnHandle);
+                               }
+                       }
+               }
                return cusolverDnHandle;
        }
 
@@ -442,4 +420,4 @@ public class GPUContext {
        public String toString() {
                return "GPUContext{" + "deviceNum=" + deviceNum + '}';
        }
-}
+}
\ No newline at end of file

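getCusolverDnHandle() above now creates the cuSOLVER handle lazily with a double-checked lock, since most scripts never use it. A standalone sketch of the same idiom with a placeholder Handle type (the volatile modifier is the textbook form of this pattern and is an addition in this sketch, not something shown in the diff):

    public class LazyHandleDemo {
        static class Handle { }

        private volatile Handle handle;

        Handle getHandle() {
            if (handle == null) {              // fast path: no locking once initialized
                synchronized (this) {
                    if (handle == null) {      // second check under the lock
                        handle = new Handle(); // expensive creation happens at most once
                    }
                }
            }
            return handle;
        }

        public static void main(String[] args) {
            LazyHandleDemo ctx = new LazyHandleDemo();
            System.out.println(ctx.getHandle() == ctx.getHandle()); // true
        }
    }
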
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
new file mode 100644
index 0000000..830f7df
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.stream.Collectors;
+import java.util.Optional;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+
+import jcuda.Pointer;
+
+public class GPULazyCudaFreeMemoryManager {
+       protected static final Log LOG = LogFactory.getLog(GPULazyCudaFreeMemoryManager.class.getName());
+       GPUMemoryManager gpuManager;
+       public GPULazyCudaFreeMemoryManager(GPUMemoryManager gpuManager) {
+               this.gpuManager = gpuManager;
+       }
+
+       /**
+        * Map of free blocks allocated on GPU. Maps size_of_block -> pointers on GPU
+        */
+       private HashMap<Long, Set<Pointer>> rmvarGPUPointers = new HashMap<Long, Set<Pointer>>();
+       
+       /**
+        * Get any pointer of the given size from rmvar-ed pointers (applicable if eager cudaFree is set to false)
+        * 
+        * @param opcode opcode
+        * @param size size in bytes
+        * @return pointer
+        */
+       public Pointer getRmvarPointer(String opcode, long size) {
+               if (rmvarGPUPointers.containsKey(size)) {
+                       if(LOG.isTraceEnabled())
+                               LOG.trace("Getting rmvar-ed pointers for size:" + size);
+                       Pointer A = remove(rmvarGPUPointers, size); // remove from rmvarGPUPointers as you are not calling cudaFree
+                       return A;
+               }
+               else {
+                       return null;
+               }
+       }
+       
+       public Set<Pointer> getAllPointers() {
+               return rmvarGPUPointers.values().stream().flatMap(ptrs -> ptrs.stream()).collect(Collectors.toSet());
+       }
+       
+       public void clearAll() {
+               Set<Pointer> toFree = new HashSet<Pointer>();
+               for(Set<Pointer> ptrs : rmvarGPUPointers.values()) {
+                       toFree.addAll(ptrs);
+               }
+               rmvarGPUPointers.clear();
+               for(Pointer ptr : toFree) {
+                       gpuManager.guardedCudaFree(ptr);
+               }
+       }
+       
+       public Pointer getRmvarPointerMinSize(String opcode, long minSize) throws DMLRuntimeException {
+               Optional<Long> toClear = rmvarGPUPointers.entrySet().stream().filter(e -> e.getValue().size() > 0).map(e -> e.getKey())
+                               .filter(size -> size >= minSize).min((s1, s2) -> s1 < s2 ? -1 : 1);
+               if(toClear.isPresent()) {
+                       long t0 = opcode != null && DMLScript.FINEGRAINED_STATISTICS ?  System.nanoTime() : 0;
+                       Pointer A = remove(rmvarGPUPointers, toClear.get()); // remove from rmvarGPUPointers as you are not calling cudaFree
+                       gpuManager.addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0);
+                       return A;
+               }
+               return null;
+       }
+       
+       
+       /**
+        * Remove any pointer in the given hashmap
+        * 
+        * @param hm hashmap of size, pointers
+        * @param size size in bytes
+        * @return the pointer that was removed
+        */
+       private Pointer remove(HashMap<Long, Set<Pointer>> hm, long size) {
+               Pointer A = hm.get(size).iterator().next();
+               remove(hm, size, A);
+               return A;
+       }
+       
+       /**
+        * Remove a specific pointer in the given hashmap
+        * 
+        * @param hm hashmap of size, pointers
+        * @param size size in bytes
+        * @param ptr pointer to be removed
+        */
+       private void remove(HashMap<Long, Set<Pointer>> hm, long size, Pointer ptr) {
+               hm.get(size).remove(ptr);
+               if (hm.get(size).isEmpty())
+                       hm.remove(size);
+       }
+       
+       /**
+        * Return the total memory in bytes used by this memory manager
+        * @return number of bytes
+        */
+       public long getTotalMemoryAllocated() {
+               long rmvarMemoryAllocated = 0;
+               for(long numBytes : rmvarGPUPointers.keySet()) {
+                       rmvarMemoryAllocated += numBytes;
+               }
+               return rmvarMemoryAllocated;
+       }
+       
+       /**
+        * Get total number of rmvared pointers
+        * 
+        * @return number of pointers
+        */
+       public int getNumPointers() {
+               return rmvarGPUPointers.size();
+       }
+       
+       /**
+        * Add a pointer to the rmvar-ed list
+        * @param size size of the pointer
+        * @param toFree pointer
+        */
+       public void add(long size, Pointer toFree) {
+               Set<Pointer> freeList = rmvarGPUPointers.get(size);
+               if (freeList == null) {
+                       freeList = new HashSet<Pointer>();
+                       rmvarGPUPointers.put(size, freeList);
+               }
+               if (freeList.contains(toFree))
+                       throw new RuntimeException("GPU : Internal state corrupted, double free");
+               freeList.add(toFree);
+       }
+       
+       /**
+        * Remove a specific pointer if present in the internal hashmap
+        * 
+        * @param size size in bytes
+        * @param ptr pointer to be removed
+        */
+       public void removeIfPresent(long size, Pointer ptr) {
+               if(rmvarGPUPointers.containsKey(size) && rmvarGPUPointers.get(size).contains(ptr)) {
+                       rmvarGPUPointers.get(size).remove(ptr);
+                       if (rmvarGPUPointers.get(size).isEmpty())
+                               rmvarGPUPointers.remove(size);
+               }
+       }
+       
+}
\ No newline at end of file

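GPULazyCudaFreeMemoryManager parks rmvar-ed device pointers in a size -> set-of-pointers map so a later allocation of the same size can skip cudaMalloc entirely. A standalone sketch of that free-list idea, with long values standing in for device pointers:

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    public class FreeListDemo {
        private final Map<Long, Set<Long>> freeBySize = new HashMap<>();

        // Called instead of an eager free: remember the buffer under its size.
        void release(long size, long ptr) {
            freeBySize.computeIfAbsent(size, k -> new HashSet<>()).add(ptr);
        }

        // Called on allocation: reuse a cached buffer of exactly this size, if any.
        Long acquire(long size) {
            Set<Long> bucket = freeBySize.get(size);
            if (bucket == null) return null;
            Long ptr = bucket.iterator().next();
            bucket.remove(ptr);
            if (bucket.isEmpty()) freeBySize.remove(size);
            return ptr;
        }

        public static void main(String[] args) {
            FreeListDemo cache = new FreeListDemo();
            cache.release(4096, 0xBEEFL);
            System.out.println(cache.acquire(4096)); // 48879 (reused)
            System.out.println(cache.acquire(4096)); // null (bucket drained)
        }
    }
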
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
new file mode 100644
index 0000000..066905b
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import jcuda.Pointer;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+
+public class GPUMatrixMemoryManager {
+       protected static final Log LOG = LogFactory.getLog(GPUMatrixMemoryManager.class.getName());
+       GPUMemoryManager gpuManager;
+       public GPUMatrixMemoryManager(GPUMemoryManager gpuManager) {
+               this.gpuManager = gpuManager;
+       }
+       
+       /**
+        * Adds the GPU object to the memory manager
+        * 
+        * @param gpuObj the handle to the GPU object
+        */
+       void addGPUObject(GPUObject gpuObj) {
+               gpuObjects.add(gpuObj);
+       }
+       
+       /**
+        * Returns worst-case contiguous memory size
+        * @param gpuObj gpu object
+        * @return memory size in bytes
+        */
+       long getWorstCaseContiguousMemorySize(GPUObject gpuObj) {
+               long ret = 0;
+               if(!gpuObj.isDensePointerNull()) {
+                       ret = gpuManager.allPointers.get(gpuObj.getDensePointer()).getSizeInBytes();
+               }
+               else if(gpuObj.getJcudaSparseMatrixPtr() != null) {
+                       CSRPointer sparsePtr = gpuObj.getJcudaSparseMatrixPtr();
+                       if(sparsePtr.nnz > 0) {
+                               if(sparsePtr.rowPtr != null)
+                                       ret = Math.max(ret, gpuManager.allPointers.get(sparsePtr.rowPtr).getSizeInBytes());
+                               if(sparsePtr.colInd != null)
+                                       ret = Math.max(ret, gpuManager.allPointers.get(sparsePtr.colInd).getSizeInBytes());
+                               if(sparsePtr.val != null)
+                                       ret = Math.max(ret, gpuManager.allPointers.get(sparsePtr.val).getSizeInBytes());
+                       }
+               }
+               return ret;
+       }
+       
+       /**
+        * Get list of all Pointers in a GPUObject 
+        * @param gObj gpu object 
+        * @return set of pointers
+        */
+       Set<Pointer> getPointers(GPUObject gObj) {
+               Set<Pointer> ret = new HashSet<>();
+               if(!gObj.isDensePointerNull() && gObj.getSparseMatrixCudaPointer() != null) {
+                       LOG.warn("Matrix allocated in both dense and sparse format");
+               }
+               if(!gObj.isDensePointerNull()) {
+                       ret.add(gObj.getDensePointer());
+               }
+               if(gObj.getSparseMatrixCudaPointer() != null) {
+                       CSRPointer sparsePtr = gObj.getSparseMatrixCudaPointer();
+                       if(sparsePtr != null) {
+                               if(sparsePtr.rowPtr != null)
+                                       ret.add(sparsePtr.rowPtr);
+                               else if(sparsePtr.colInd != null)
+                                       ret.add(sparsePtr.colInd);
+                               else if(sparsePtr.val != null)
+                                       ret.add(sparsePtr.val);
+                       }
+               }
+               return ret;
+       }
+       
+       /**
+        * list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU
+        * These are matrices allocated on the GPU on which rmvar hasn't been called yet.
+        * If a {@link GPUObject} has more than one lock on it, it cannot be freed
+        * If it has zero locks on it, it can be freed, but it is preferable to keep it around
+        * so that an extraneous host to dev transfer can be avoided
+        */
+       HashSet<GPUObject> gpuObjects = new HashSet<>();
+       
+       /**
+        * Get GPUObjects from the first memory sections "Matrix Memory"
+        * @param locked return locked GPU objects if true
+        * @param dirty return dirty GPU objects if true
+        * @return set of GPU Objects
+        */
+       Set<GPUObject> getGPUObjects(boolean locked, boolean dirty) {
+               return gpuObjects.stream().filter(gObj -> gObj.isLocked() == locked && gObj.isDirty() == dirty).collect(Collectors.toSet());
+       }
+       
+       /**
+        * Return all pointers in the first section
+        * @return all pointers in this section
+        */
+       Set<Pointer> getPointers() {
+               return gpuObjects.stream().flatMap(gObj -> getPointers(gObj).stream()).collect(Collectors.toSet());
+       }
+       
+       /**
+        * Get pointers from the first memory sections "Matrix Memory"
+        * @param locked return locked pointers if true
+        * @param dirty return dirty pointers if true
+        * @return set of pointers
+        */
+       Set<Pointer> getPointers(boolean locked, boolean dirty) {
+               return gpuObjects.stream().filter(gObj -> gObj.isLocked() == locked && gObj.isDirty() == dirty).flatMap(gObj -> getPointers(gObj).stream()).collect(Collectors.toSet());
+       }
+       
+       /**
+        * Clear the memory of the gpu object that matches the provided parameters
+        * 
+        * @param locked is locked
+        * @param dirty is dirty
+        * @param minSize of at least the given size
+        * @param comparator sorting comparator in case more than one gpu object matches the above parameters
+        * @param opcode instruction code
+        * @return true if a gpu object satisfies the above condition else false
+        * @throws DMLRuntimeException if error occurs
+        */
+       boolean clear(boolean locked, boolean dirty, long minSize, Comparator<GPUObject> comparator, String opcode) throws DMLRuntimeException {
+               Optional<GPUObject> toClear = getGPUObjects(locked, dirty).stream()
+                               .filter(gObj -> getWorstCaseContiguousMemorySize(gObj) >= minSize)
+                                       .max(comparator);
+               if(toClear.isPresent()) {
+                       GPUObject gObj = toClear.get();
+                       if(gObj.dirty) 
+                               gObj.copyFromDeviceToHost(opcode, true, true); // Perform eviction if dirty
+                       else
+                               gObj.clearData(opcode, true);
+                       gpuObjects.remove(gObj);
+               }
+               return toClear.isPresent();
+       }
+       
+       /**
+        * Clear all unlocked gpu objects
+        * 
+        * @param opcode instruction code
+        * @throws DMLRuntimeException if error
+        */
+       void clearAllUnlocked(String opcode) throws DMLRuntimeException {
+               Set<GPUObject> unlockedGPUObjects = gpuObjects.stream()
+                               .filter(gpuObj -> !gpuObj.isLocked()).collect(Collectors.toSet());
+               if(unlockedGPUObjects.size() > 0) {
+                       if(LOG.isWarnEnabled())
+                               LOG.warn("Clearing all unlocked matrices (count=" + unlockedGPUObjects.size() + ").");
+                       for(GPUObject toBeRemoved : unlockedGPUObjects) {
+                               if(toBeRemoved.dirty)
+                                       toBeRemoved.copyFromDeviceToHost(opcode, true, true);
+                               else
+                                       toBeRemoved.clearData(opcode, true);
+                       }
+                       gpuObjects.removeAll(unlockedGPUObjects);
+               }
+       }
+}
\ No newline at end of file

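GPUMatrixMemoryManager.clear() picks its victim with a stream filter plus a comparator: among unlocked objects of at least the requested size, the comparator decides which single object to clear. A standalone sketch of that selection step (Obj is a stand-in for GPUObject; choosing the largest candidate is one possible comparator, echoing the sort-by-size comparator used in GPUMemoryManager below):

    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.List;
    import java.util.Optional;

    public class EvictionPickDemo {
        static class Obj {
            final String name; final boolean locked; final long size;
            Obj(String name, boolean locked, long size) { this.name = name; this.locked = locked; this.size = size; }
        }

        public static void main(String[] args) {
            List<Obj> objects = Arrays.asList(
                new Obj("A", true,  1 << 20),
                new Obj("B", false, 2 << 20),
                new Obj("C", false, 8 << 20));
            long minSize = 1 << 20;

            // Filter to unlocked candidates that are big enough, then let the comparator pick one.
            Optional<Obj> toClear = objects.stream()
                .filter(o -> !o.locked && o.size >= minSize)
                .max(Comparator.comparingLong(o -> o.size));

            toClear.ifPresent(o -> System.out.println("evict " + o.name)); // evict C
        }
    }
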
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index e2012a7..411606d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -23,13 +23,14 @@ import static jcuda.runtime.JCuda.cudaMalloc;
 import static jcuda.runtime.JCuda.cudaMemGetInfo;
 import static jcuda.runtime.JCuda.cudaMemset;
 
-import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.concurrent.atomic.LongAdder;
+import java.util.stream.Collectors;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -42,7 +43,6 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.utils.GPUStatistics;
 
 import jcuda.Pointer;
-
 /**
  * - All cudaFree and cudaMalloc in SystemML should go through this class to avoid OOM or incorrect results.
  * - This class can be refactored in future to accept a chunk of memory ahead of time rather than while execution. This will only throw memory-related errors during startup.
@@ -50,67 +50,87 @@ import jcuda.Pointer;
 public class GPUMemoryManager {
        protected static final Log LOG = LogFactory.getLog(GPUMemoryManager.class.getName());
        
-       // If the available free size is less than this factor, GPUMemoryManager will warn users of multiple programs grabbing onto GPU memory.
-       // This often happens if user tries to use both TF and SystemML, and TF grabs onto 90% of the memory ahead of time.
-       private static final double WARN_UTILIZATION_FACTOR = 0.7;
+       /*****************************************************************************************/
+       // GPU Memory is divided into three major sections:
+       // 1. Matrix Memory: Memory allocated to matrices in SystemML and addressable by GPUObjects.
+       // This memory section is divided into three minor sections:
+       // 1.1 Locked Matrix Memory
+       // 1.2 UnLocked + Non-Dirty Matrix Memory
+       // 1.3 UnLocked + Dirty Matrix Memory
+       // To get the GPUObjects/Pointers in this section, please use getGPUObjects and getPointers methods of GPUMatrixMemoryManager.
+       // To clear GPUObjects/Pointers in this section, please use clear and clearAll methods of GPUMatrixMemoryManager.
+       // Both these methods allow to get/clear unlocked/locked and dirty/non-dirty objects of a certain size.
+       GPUMatrixMemoryManager matrixMemoryManager;
+       public GPUMatrixMemoryManager getGPUMatrixMemoryManager() {
+               return matrixMemoryManager;
+       }
        
-       // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
-       public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig()
-                       .getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+       // 2. Rmvar-ed pointers: If sysml.gpu.eager.cudaFree is set to false,
+       // then this manager caches pointers of the GPUObject on which rmvar instruction has been executed for future reuse.
+       // We observe 2-3x improvement with this approach and hence recommend to set this flag to false.
+       GPULazyCudaFreeMemoryManager lazyCudaFreeMemoryManager;
+       public GPULazyCudaFreeMemoryManager getGPULazyCudaFreeMemoryManager() {
+               return lazyCudaFreeMemoryManager;
+       }
        
-       /**
-        * Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU
-        */
-       private HashMap<Long, Set<Pointer>> rmvarGPUPointers = new HashMap<Long, Set<Pointer>>();
+       // 3. Non-matrix locked pointers: Other pointers (required for execution of an instruction that are not memory). For example: workspace
+       // These pointers are not explicitly tracked by a memory manager but one can get them by using getNonMatrixLockedPointers
+       private Set<Pointer> getNonMatrixLockedPointers() {
+               Set<Pointer> managedPointers = matrixMemoryManager.getPointers();
+               managedPointers.addAll(lazyCudaFreeMemoryManager.getAllPointers());
+               return nonIn(allPointers.keySet(), managedPointers);
+       }
        
-       /**
-        * list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU
-        * These are matrices allocated on the GPU on which rmvar hasn't been called yet.
-        * If a {@link GPUObject} has more than one lock on it, it cannot be freed
-        * If it has zero locks on it, it can be freed, but it is preferrable to keep it around
-        * so that an extraneous host to dev transfer can be avoided
-        */
-       private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
        
        /**
-        * To record size of allocated blocks
+        * To record size of all allocated pointers allocated by above memory managers
         */
-       private HashMap<Pointer, Long> allocatedGPUPointers = new HashMap<>();
-       
-       /**
-        * Adds the GPU object to the memory manager
-        * 
-        * @param gpuObj the handle to the GPU object
-        */
-       public void addGPUObject(GPUObject gpuObj) {
-               allocatedGPUObjects.add(gpuObj);
-       }
-       
-       /**
-        * Removes the GPU object from the memory manager
-        * 
-        * @param gpuObj the handle to the GPU object
-        */
-       public void removeGPUObject(GPUObject gpuObj) {
-               if(LOG.isDebugEnabled())
-                       LOG.debug("Removing the GPU object: " + gpuObj);
-               allocatedGPUObjects.removeIf(a -> a.equals(gpuObj));
-       }
+       HashMap<Pointer, PointerInfo> allPointers = new HashMap<>();
        
+       /*****************************************************************************************/
        
+
        /**
         * Get size of allocated GPU Pointer
         * @param ptr pointer to get size of
         * @return either the size or -1 if no such pointer exists
         */
        public long getSizeAllocatedGPUPointer(Pointer ptr) {
-               if(allocatedGPUPointers.containsKey(ptr)) {
-                       return allocatedGPUPointers.get(ptr);
+               if(allPointers.containsKey(ptr)) {
+                       return allPointers.get(ptr).getSizeInBytes();
                }
                return -1;
        }
        
+       /**
+        * Utility to debug memory leaks
+        */
+       static class PointerInfo {
+               private long sizeInBytes;
+               private StackTraceElement[] stackTraceElements;
+               public PointerInfo(long sizeInBytes) {
+                       if(DMLScript.PRINT_GPU_MEMORY_INFO) {
+                               this.stackTraceElements = Thread.currentThread().getStackTrace();
+                       }
+                       this.sizeInBytes = sizeInBytes;
+               }
+               public long getSizeInBytes() {
+                       return sizeInBytes;
+               }
+       }
+       
+       // If the available free size is less than this factor, GPUMemoryManager will warn users of multiple programs grabbing onto GPU memory.
+       // This often happens if user tries to use both TF and SystemML, and TF grabs onto 90% of the memory ahead of time.
+       private static final double WARN_UTILIZATION_FACTOR = 0.7;
+       
+       // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
+       public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig()
+                       .getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+       
+       
        public GPUMemoryManager(GPUContext gpuCtx) {
+               matrixMemoryManager = new GPUMatrixMemoryManager(this);
+               lazyCudaFreeMemoryManager = new GPULazyCudaFreeMemoryManager(this);
                long free[] = { 0 };
                long total[] = { 0 };
                cudaMemGetInfo(free, total);
@@ -135,19 +155,48 @@ public class GPUMemoryManager {
         * @param size size in bytes
         * @return allocated pointer
         */
-       private Pointer cudaMallocWarnIfFails(Pointer A, long size) {
+       private Pointer cudaMallocNoWarn(Pointer A, long size) {
                try {
                        cudaMalloc(A, size);
-                       allocatedGPUPointers.put(A, size);
+                       allPointers.put(A, new PointerInfo(size));
                        return A;
                } catch(jcuda.CudaException e) {
-                       LOG.warn("cudaMalloc failed immediately after cudaMemGetInfo reported that memory of size " + size + " is available. "
-                                       + "This usually happens if there are external programs trying to grab on to memory in parallel.");
                        return null;
                }
        }
        
        /**
+        * Pretty printing utility to debug OOM error
+        * 
+        * @param stackTrace stack trace
+        * @param index call depth
+        * @return pretty printed string
+        */
+       private String getCallerInfo(StackTraceElement [] stackTrace, int index) {
+               if(stackTrace.length <= index)
+                       return "->";
+               else
+                       return "->" + stackTrace[index].getClassName() + "." + stackTrace[index].getMethodName() + "(" + stackTrace[index].getFileName() + ":" + stackTrace[index].getLineNumber() + ")";
+       }
+       
+       /**
+        * Pretty printing utility to print bytes
+        * 
+        * @param numBytes number of bytes
+        * @return a human-readable display value
+        */
+       private String byteCountToDisplaySize(long numBytes) {
+               // return org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs rounding
+           if (numBytes < 1024) { 
+               return numBytes + " bytes";
+           }
+           else {
+                   int exp = (int) (Math.log(numBytes) / 6.931471805599453);
+                   return String.format("%.3f %sB", ((double)numBytes) / Math.pow(1024, exp), "KMGTP".charAt(exp-1));
+           }
+       }
+       
+       /**
         * Allocate pointer of the given size in bytes.
         * 
         * @param opcode instruction name
@@ -156,20 +205,26 @@ public class GPUMemoryManager {
         */
        public Pointer malloc(String opcode, long size) {
                if(size < 0) {
-                       throw new DMLRuntimeException("Cannot allocate memory of size " + size);
+                       throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size));
                }
                long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+               long mallocStart = 0;
                // Step 1: First try reusing exact match in rmvarGPUPointers to avoid holes in the GPU memory
-               Pointer A = getRmvarPointer(opcode, size);
+               Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, size);
+               if(A != null)
+                       addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0);
                
+               Pointer tmpA = (A == null) ? new Pointer() : null;
                // Step 2: Allocate a new pointer in the GPU memory (since memory is available)
                if(A == null && size <= getAvailableMemory()) {
-                       A = cudaMallocWarnIfFails(new Pointer(), size);
+                       mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0;
+                       A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues
+                       addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart);
                        if(LOG.isTraceEnabled()) {
                                if(A == null)
-                                       LOG.trace("Couldnot allocate a new pointer in the GPU memory:" + size);
+                                       LOG.trace("Couldnot allocate a new pointer in the GPU memory:" + byteCountToDisplaySize(size));
                                else
-                                       LOG.trace("Allocated a new pointer in the GPU memory:" + size);
+                                       LOG.trace("Allocated a new pointer in the GPU memory:" + byteCountToDisplaySize(size));
                        }
                }
                
@@ -178,21 +233,19 @@ public class GPUMemoryManager {
                
                // Step 3: Try reusing non-exact match entry of rmvarGPUPointers
                if(A == null) { 
-                       // Find minimum key that is greater than size
-                       long key = Long.MAX_VALUE;
-                       for(Long k : rmvarGPUPointers.keySet()) {
-                               key = k > size ? Math.min(key, k) : key;
-                       }
-                       if(key != Long.MAX_VALUE) {
-                               A = getRmvarPointer(opcode, key);
-                               // To avoid potential for holes in the GPU memory
+                       A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
+                       if(A != null) {
                                guardedCudaFree(A);
-                               A = cudaMallocWarnIfFails(new Pointer(), size);
-                               if(LOG.isTraceEnabled()) {
+                               mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0;
+                               A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues
+                               addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart);
+                               if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
                                        if(A == null)
-                                               LOG.trace("Couldnot reuse non-exact match of rmvarGPUPointers:" + size);
-                                       else
-                                               LOG.trace("Reuses a non-exact match from rmvarGPUPointers:" + size);
+                                               LOG.info("Couldnot reuse non-exact match of rmvarGPUPointers:" + byteCountToDisplaySize(size));
+                                       else {
+                                               LOG.info("Reuses a non-exact match from rmvarGPUPointers:" + byteCountToDisplaySize(size));
+                                               LOG.info("GPU Memory info after reusing a non-exact match from rmvarGPUPointers:" + toString());
+                                       }
+                                       }
                                }
                        }
                }
@@ -205,20 +258,16 @@ public class GPUMemoryManager {
                
                // Step 4: Eagerly free-up rmvarGPUPointers and check if memory 
is available on GPU
                if(A == null) {
-                       Set<Pointer> toFree = new HashSet<Pointer>();
-                       for(Set<Pointer> ptrs : rmvarGPUPointers.values()) {
-                               toFree.addAll(ptrs);
-                       }
-                       for(Pointer ptr : toFree) {
-                               guardedCudaFree(ptr);
-                       }
+                       lazyCudaFreeMemoryManager.clearAll();
                        if(size <= getAvailableMemory()) {
-                               A = cudaMallocWarnIfFails(new Pointer(), size);
-                               if(LOG.isTraceEnabled()) {
+                               A = cudaMallocNoWarn(tmpA, size);
+                               if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
                                        if(A == null)
-                                               LOG.trace("Couldnot allocate a new pointer in the GPU memory after eager free:" + size);
-                                       else
-                                               LOG.trace("Allocated a new pointer in the GPU memory after eager free:" + size);
+                                               LOG.info("Couldnot allocate a new pointer in the GPU memory after eager free:" + byteCountToDisplaySize(size));
+                                       else {
+                                               LOG.info("Allocated a new pointer in the GPU memory after eager free:" + byteCountToDisplaySize(size));
+                                               LOG.info("GPU Memory info after allocating new pointer post lazyCudaFreeMemoryManager.clearAll():" + toString());
+                                       }
+                                       }
                                }
                        }
                }
@@ -228,36 +277,114 @@ public class GPUMemoryManager {
                // Step 5: Try eviction based on the given policy
                if(A == null) {
                        t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-                       // Sort based on the eviction policy
-                       Collections.sort(allocatedGPUObjects, new 
GPUComparator(size));
-                       while (size > getAvailableMemory() && 
allocatedGPUObjects.size() > 0) {
-                               GPUObject toBeRemoved = 
allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
-                               if (toBeRemoved.isLocked()) {
-                                       // All remaining blocks will also be 
locked
-                                       break;
-                               }
-                               else {
-                                       // Perform eviction
-                                       if (toBeRemoved.dirty) {
-                                               
toBeRemoved.copyFromDeviceToHost(opcode, true);
-                                       }
-                                       toBeRemoved.clearData(true);
+                       
+                       // First, clear unlocked non-dirty matrices greater 
than or equal to size
+                       // Comparator clears the largest matrix to avoid future 
evictions
+                       boolean success = matrixMemoryManager.clear(false, 
false, size, SIMPLE_COMPARATOR_SORT_BY_SIZE, opcode);
+                       if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
+                               if(success) {
+                                       LOG.info("Cleared an unlocked non-dirty 
matrix greater than or equal to " + byteCountToDisplaySize(size));
+                                       LOG.info("GPU Memory info after 
clearing an unlocked non-dirty matrix:" + toString());
                                }
+                               else
+                                       LOG.info("No unlocked non-dirty matrix 
greater than or equal to " + byteCountToDisplaySize(size) + " found for 
clearing.");
                        }
-                       addMiscTime(opcode, GPUStatistics.cudaEvictionCount, 
GPUStatistics.cudaEvictTime, GPUInstruction.MISC_TIMER_EVICT, t0);
-                       if(size <= getAvailableMemory()) {
-                               A = cudaMallocWarnIfFails(new Pointer(), size);
-                               if(LOG.isTraceEnabled()) {
-                                       if(A == null)
-                                               LOG.trace("Couldnot allocate a 
new pointer in the GPU memory after eviction:" + size);
+                       if(!success) {
+                               // First, clear unlocked dirty matrices greater 
than or equal to size using the eviction policy
+                               // Comparator clears the largest matrix to 
avoid future evictions
+                               if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
+                                       LOG.info("GPU Memory info before 
eviction:" + toString());
+                               }
+                               success = matrixMemoryManager.clear(false, 
true, size, new EvictionPolicyBasedComparator(size), opcode);
+                               // JCuda.cudaDeviceSynchronize();
+                               if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
+                                       if(success) {
+                                               LOG.info("Evicted an unlocked 
dirty matrix greater than or equal to " + byteCountToDisplaySize(size));
+                                               LOG.info("GPU Memory info after 
evicting an unlocked dirty matrix:" + toString());
+                                       }
                                        else
-                                               LOG.trace("Allocated a new 
pointer in the GPU memory after eviction:" + size);
+                                               LOG.info("No unlocked dirty 
matrix greater than or equal to " + byteCountToDisplaySize(size) + " found for 
evicted.");
                                }
+                               
+                               if(!success) {
+                                       // Minor optimization: clear all 
unlocked non-dirty matrices before attempting eviction 
+                                       // Delete all non-dirty
+                                       List<GPUObject> unlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
+                                                                               
                .filter(gpuObj -> !gpuObj.isLocked() && 
!gpuObj.isDirty()).collect(Collectors.toList());
+                                       
matrixMemoryManager.gpuObjects.removeAll(unlockedGPUObjects);
+                                       for(GPUObject toBeRemoved : 
unlockedGPUObjects) {
+                                               toBeRemoved.clearData(opcode, 
true);
+                                       }
+                                       if(DMLScript.PRINT_GPU_MEMORY_INFO || 
LOG.isTraceEnabled()) {
+                                               LOG.info("GPU Memory info after 
clearing all unlocked non-dirty matrices:" + toString());
+                                       }
+                                       mallocStart = DMLScript.STATISTICS ? 
System.nanoTime() : 0;
+                                       A = cudaMallocNoWarn(tmpA, size); // 
Try malloc rather than check available memory to avoid fragmentation related 
issues
+                                       addMiscTime(null, 
GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, 
GPUInstruction.MISC_TIMER_EVICT, mallocStart);
+                               
+					// ---------------------------------------------------------------
+					// Evict unlocked GPU objects one-by-one and try malloc
+					unlockedGPUObjects = null;
+					if(A == null) {
+						unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream()
+								.filter(gpuObj -> !gpuObj.isLocked() && gpuObj.isDirty()).collect(Collectors.toList());
+						Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size));
+						while(A == null && unlockedGPUObjects.size() > 0) {
+							if(DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.ALIGN_MEMORY) {
+								// TODO: Optimize later using sliding window
+								// Evict as many sequential dense objects from back of the queue as possible
+								long neededSize = size;
+								while(neededSize >= 0 && unlockedGPUObjects.size() > 0) {
+									GPUObject gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
+									neededSize -= matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj);
+									gpuObj.copyFromDeviceToHost(opcode, true, true);
+								}
+							}
+							else {
+								GPUObject gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
+								gpuObj.copyFromDeviceToHost(opcode, true, true);
+							}
+							mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0;
+							A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues
+							addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart);
+						}
+						if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
+							// greater than or equal to " + byteCountToDisplaySize(size)
+							LOG.info("GPU Memory info after eviction:" + toString());
+						}
+						if(unlockedGPUObjects != null && unlockedGPUObjects.size() == 0) {
+							LOG.warn("Evicted all unlocked matrices");
+						}
+					}
+					
+				}
+				// ---------------------------------------------------------------
+			}
+			addMiscTime(opcode, GPUStatistics.cudaEvictTime, GPUStatistics.cudaEvictionCount, GPUInstruction.MISC_TIMER_EVICT, t0);
+			if(A == null) {
+				A = cudaMallocNoWarn(tmpA, size); // if the matrix is not allocated via eviction
 			}
+			if(A == null) {
+				LOG.warn("cudaMalloc failed immediately after cudaMemGetInfo reported that memory of size " 
+						+ byteCountToDisplaySize(size) + " is available. "
+						+ "This usually happens if there are external programs trying to grab on to memory in parallel or there is potential fragmentation.");
+			}
+			else if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) {
+				LOG.info("Malloc after eviction/clearing is successful.");
+			}
+		}
+		
+		// Step 6: Handle defragmentation
+		if(A == null) {
+			LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
+			LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString());
+			matrixMemoryManager.clearAllUnlocked(opcode);
+			LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString());
+			A = cudaMallocNoWarn(tmpA, size);
 		}
 		
 		if(A == null) {
-			throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + size + "). "
+			throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + byteCountToDisplaySize(size) + ". \n "
 					+ toString());
 		}
                
@@ -267,26 +394,50 @@ public class GPUMemoryManager {
                return A;
        }
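
To make the overall control flow above easier to follow, here is a minimal, standalone sketch of the staged allocation strategy (try malloc, evict by policy, clear non-dirty data, evict one-by-one, then clear everything unlocked). All class, interface and variable names below are illustrative stand-ins, not SystemML APIs.

import java.util.concurrent.atomic.AtomicLong;
import java.util.function.LongFunction;
import java.util.function.LongSupplier;

public class StagedAllocSketch {

	// Returns a non-null handle on success, or null on failure (like cudaMallocNoWarn in the patch).
	static Long allocateWithEviction(long size, LongFunction<Long> tryAlloc,
			Runnable evictByPolicy, Runnable clearNonDirty,
			Runnable evictOne, LongSupplier evictionCandidates,
			Runnable clearAllUnlocked) {
		Long a = tryAlloc.apply(size);                                        // Step 1: plain malloc
		if (a == null) { evictByPolicy.run();    a = tryAlloc.apply(size); }  // Step 2: policy-based bulk eviction
		if (a == null) { clearNonDirty.run();    a = tryAlloc.apply(size); }  // Step 3: drop unlocked non-dirty data
		while (a == null && evictionCandidates.getAsLong() > 0) {             // Step 4: evict unlocked dirty objects one-by-one
			evictOne.run();
			a = tryAlloc.apply(size);
		}
		if (a == null) { clearAllUnlocked.run(); a = tryAlloc.apply(size); }  // Step 5: defragmentation fallback
		return a;                                                             // caller throws if still null
	}

	public static void main(String[] args) {
		// Toy device with 1000 "bytes" free; each eviction step frees 400 more.
		AtomicLong free = new AtomicLong(1000);
		AtomicLong candidates = new AtomicLong(3);
		LongFunction<Long> tryAlloc = size -> free.get() >= size ? Long.valueOf(size) : null;
		Runnable freeSome = () -> free.addAndGet(400);
		Runnable evictOne = () -> { free.addAndGet(400); candidates.decrementAndGet(); };
		Long handle = allocateWithEviction(2000, tryAlloc, freeSome, freeSome, evictOne, candidates::get, freeSome);
		System.out.println("allocation " + (handle != null ? "succeeded" : "failed"));
	}
}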
        
+	// --------------- Developer Utilities to debug potential memory leaks ------------------------
+	@SuppressWarnings("unused")
+	private void printPointers(List<PointerInfo> pointers) {
+		for(PointerInfo ptrInfo : pointers) {
+			System.out.println(">>" + 
+					// getCallerInfo(ptrInfo.stackTraceElements, 5) + getCallerInfo(ptrInfo.stackTraceElements, 6) + getCallerInfo(ptrInfo.stackTraceElements, 7) +
+					getCallerInfo(ptrInfo.stackTraceElements, 8) + getCallerInfo(ptrInfo.stackTraceElements, 9) + getCallerInfo(ptrInfo.stackTraceElements, 10));
+		}
+	}
+	
+	@SuppressWarnings("unused")
+	private void printPointers(Set<Pointer> pointers, StringBuilder sb) {
+		for(Pointer ptr : pointers) {
+			PointerInfo ptrInfo = allPointers.get(ptr);
+			sb.append(">>");
+			// getCallerInfo(ptrInfo.stackTraceElements, 5) + getCallerInfo(ptrInfo.stackTraceElements, 6) + getCallerInfo(ptrInfo.stackTraceElements, 7) +
+			sb.append(getCallerInfo(ptrInfo.stackTraceElements, 8));
+			sb.append(getCallerInfo(ptrInfo.stackTraceElements, 9));
+			sb.append(getCallerInfo(ptrInfo.stackTraceElements, 10));
+			sb.append("\n");
+		}
+	}
+	// --------------------------------------------------------------------------------------------
+
 	/**
 	 * Note: This method should not be called from an iterator as it removes entries from allocatedGPUPointers and rmvarGPUPointers
 	 * 
 	 * @param toFree pointer to call cudaFree method on
 	 */
-	private void guardedCudaFree(Pointer toFree) {
-		if (toFree != new Pointer()) {
-			if(allocatedGPUPointers.containsKey(toFree)) {
-				Long size = allocatedGPUPointers.remove(toFree);
-				if(rmvarGPUPointers.containsKey(size) && rmvarGPUPointers.get(size).contains(toFree)) {
-					remove(rmvarGPUPointers, size, toFree);
-				}
-				if(LOG.isDebugEnabled())
-					LOG.debug("Free-ing up the pointer: " + toFree);
-				cudaFree(toFree);
-			}
-			else {
-				throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
+	void guardedCudaFree(Pointer toFree) {
+		if(allPointers.containsKey(toFree)) {
+			long size = allPointers.get(toFree).getSizeInBytes();
+			if(LOG.isTraceEnabled()) {
+				LOG.trace("Free-ing up the pointer of size " + byteCountToDisplaySize(size));
 			}
+			allPointers.remove(toFree);
+			lazyCudaFreeMemoryManager.removeIfPresent(size, toFree);
+			cudaFree(toFree);
+			// JCuda.cudaDeviceSynchronize(); // Force a device synchronize after free-ing the pointer for debugging
 		}
+		else {
+			throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
+		}
+
 	}
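
The guarded free above only ever frees pointers that the manager has on record, and it purges its bookkeeping before the native free. A minimal sketch of that pattern, with hypothetical names standing in for the memory manager and for cudaFree:

import java.util.HashMap;
import java.util.Map;

public class GuardedFreeSketch<P> {
	// Stand-in for the native free call (cudaFree in the patch).
	public interface NativeFree<T> { void free(T ptr); }

	private final Map<P, Long> trackedSizes = new HashMap<>();
	private final NativeFree<P> nativeFree;

	public GuardedFreeSketch(NativeFree<P> nativeFree) { this.nativeFree = nativeFree; }

	public void record(P ptr, long sizeInBytes) { trackedSizes.put(ptr, sizeInBytes); }

	public void guardedFree(P ptr) {
		Long size = trackedSizes.remove(ptr);   // purge bookkeeping before the native free
		if (size == null)
			throw new RuntimeException("Attempting to free an unaccounted pointer: " + ptr);
		nativeFree.free(ptr);                   // only tracked pointers ever reach the native free
	}

	public static void main(String[] args) {
		GuardedFreeSketch<String> mgr = new GuardedFreeSketch<>(p -> System.out.println("freed " + p));
		mgr.record("ptrA", 1024);
		mgr.guardedFree("ptrA");                // ok
		try { mgr.guardedFree("ptrB"); }        // unaccounted pointer fails fast
		catch (RuntimeException e) { System.out.println(e.getMessage()); }
	}
}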
        
        /**
@@ -295,88 +446,62 @@ public class GPUMemoryManager {
         * @param opcode instruction name
         * @param toFree pointer to free
         * @param eager whether to deallocate eagerly
+        * @throws DMLRuntimeException if error occurs
         */
-	public void free(String opcode, Pointer toFree, boolean eager) {
-		Pointer dummy = new Pointer();
-		if (toFree == dummy) { // trying to free a null pointer
-			return;
-		}
+	public void free(String opcode, Pointer toFree, boolean eager) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled())
+			LOG.trace("Free-ing the pointer with eager=" + eager);
 		if (eager) {
 			long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 			guardedCudaFree(toFree);
 			addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
 		}
 		else {
-			if (!allocatedGPUPointers.containsKey(toFree))
+			if (!allPointers.containsKey(toFree)) {
+				LOG.info("GPU memory info before failure:" + toString());
 				throw new RuntimeException("ERROR : Internal state corrupted, cache block size map is not aware of a block it is trying to free up");
-			long size = allocatedGPUPointers.get(toFree);
-			Set<Pointer> freeList = rmvarGPUPointers.get(size);
-			if (freeList == null) {
-				freeList = new HashSet<Pointer>();
-				rmvarGPUPointers.put(size, freeList);
 			}
-			if (freeList.contains(toFree))
-				throw new RuntimeException("GPU : Internal state corrupted, double free");
-			freeList.add(toFree);
+			long size = allPointers.get(toFree).getSizeInBytes();
+			lazyCudaFreeMemoryManager.add(size, toFree);
 		}
 	}
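
The non-eager branch above parks the pointer in the lazy-free manager instead of calling cudaFree. Below is a sketch of such a size-keyed cache, modelled on the rmvar-pointer bookkeeping that this patch removes further down (getRmvarPointer/remove); the class and method names are hypothetical stand-ins for LazyCudaFreeMemoryManager.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class LazyFreeCacheSketch<P> {
	private final Map<Long, Set<P>> rmvarPointers = new HashMap<>();

	// Park a pointer of the given size instead of freeing it eagerly.
	public void add(long size, P ptr) {
		Set<P> bucket = rmvarPointers.computeIfAbsent(size, k -> new HashSet<>());
		if (!bucket.add(ptr))
			throw new RuntimeException("Internal state corrupted, double free");
	}

	// Reuse a parked pointer of exactly this size, if any.
	public P getRmvarPointer(long size) {
		Set<P> bucket = rmvarPointers.get(size);
		if (bucket == null) return null;
		P ptr = bucket.iterator().next();
		bucket.remove(ptr);
		if (bucket.isEmpty()) rmvarPointers.remove(size);
		return ptr;
	}

	// Drop a parked pointer when it is actually freed (see guardedCudaFree above).
	public void removeIfPresent(long size, P ptr) {
		Set<P> bucket = rmvarPointers.get(size);
		if (bucket != null && bucket.remove(ptr) && bucket.isEmpty())
			rmvarPointers.remove(size);
	}

	public static void main(String[] args) {
		LazyFreeCacheSketch<String> cache = new LazyFreeCacheSketch<>();
		cache.add(1024, "ptrA");                         // lazily "freed"
		System.out.println(cache.getRmvarPointer(1024)); // reused: ptrA
		System.out.println(cache.getRmvarPointer(1024)); // nothing left: null
	}
}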
        
        /**
+        * Removes the GPU object from the memory manager
+        * 
+        * @param gpuObj the handle to the GPU object
+        */
+       public void removeGPUObject(GPUObject gpuObj) {
+               if(LOG.isDebugEnabled())
+                       LOG.debug("Removing the GPU object: " + gpuObj);
+               matrixMemoryManager.gpuObjects.removeIf(a -> a.equals(gpuObj));
+       }
+
+       
+       /**
         * Clear the allocated GPU objects
         */
        public void clearMemory() {
                // First deallocate all the GPU objects
-               for(GPUObject gpuObj : allocatedGPUObjects) {
+               for(GPUObject gpuObj : matrixMemoryManager.gpuObjects) {
                        if(gpuObj.isDirty()) {
-				LOG.debug("Attempted to free GPU Memory when a block[" + gpuObj + "] is still on GPU memory, copying it back to host.");
-				gpuObj.acquireHostRead(null);
+				if(LOG.isDebugEnabled())
+					LOG.debug("Attempted to free GPU Memory when a block[" + gpuObj + "] is still on GPU memory, copying it back to host.");
+				gpuObj.copyFromDeviceToHost(null, true, true);
 			}
-			gpuObj.clearData(true);
+			else
+				gpuObj.clearData(null, true);
 		}
-		allocatedGPUObjects.clear();
+		matrixMemoryManager.gpuObjects.clear();
 		
 		// Then clean up remaining allocated GPU pointers 
-		Set<Pointer> remainingPtr = new HashSet<>(allocatedGPUPointers.keySet());
+		Set<Pointer> remainingPtr = new HashSet<>(allPointers.keySet());
 		for(Pointer toFree : remainingPtr) {
 			guardedCudaFree(toFree); // cleans up allocatedGPUPointers and rmvarGPUPointers as well
                }
+               allPointers.clear();
        }
-       
-       /**
-        * Get all pointers withing allocatedGPUObjects such that GPUObject is 
in dirty state
-        * 
-        * @return set of pointers
-        */
-       private HashSet<Pointer> getDirtyPointers() {
-               HashSet<Pointer> nonTemporaryPointers = new HashSet<Pointer>();
-               for (GPUObject o : allocatedGPUObjects) {
-                       if(o.isDirty()) {
-                               if (o.isSparse()) {
-                                       CSRPointer p = 
o.getSparseMatrixCudaPointer();
-                                       if (p == null)
-                                               throw new 
RuntimeException("CSRPointer is null in clearTemporaryMemory");
-                                       if (p.rowPtr != null) {
-                                               
nonTemporaryPointers.add(p.rowPtr);
-                                       }
-                                       if (p.colInd != null) {
-                                               
nonTemporaryPointers.add(p.colInd);
-                                       }
-                                       if (p.val != null) {
-                                               nonTemporaryPointers.add(p.val);
-                                       }
-
-                               } else {
-                                       Pointer p = o.getJcudaDenseMatrixPtr();
-                                       if (p == null)
-                                               throw new 
RuntimeException("Pointer is null in clearTemporaryMemory");
-                                       nonTemporaryPointers.add(p);
-                               }
-                       }
-               }
                
-               return nonTemporaryPointers;
-       }
-       
        /**
 	 * Performs a set-difference (non-in) operation, i.e., returns the elements of the first set that are not contained in the second
         * 
@@ -399,7 +524,8 @@ public class GPUMemoryManager {
         */
        public void clearTemporaryMemory() {
 		// To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up.
-		Set<Pointer> temporaryPointers = nonIn(allocatedGPUPointers.keySet(), getDirtyPointers());
+		Set<Pointer> unlockedDirtyPointers = matrixMemoryManager.getPointers(false, true);
+		Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), unlockedDirtyPointers);
                for(Pointer tmpPtr : temporaryPointers) {
                        guardedCudaFree(tmpPtr);
                }
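
clearTemporaryMemory above frees every tracked pointer that is not backing an unlocked dirty matrix, i.e. a plain set difference over the pointer sets. A small standalone sketch of that nonIn idea, with string stand-ins for pointers:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class TemporaryPointerSketch {
	// Set difference: everything in 'all' that is not in 'keep'.
	static <P> Set<P> nonIn(Set<P> all, Set<P> keep) {
		Set<P> result = new HashSet<>(all);
		result.removeAll(keep);
		return result;
	}

	public static void main(String[] args) {
		Set<String> allPointers = new HashSet<>(Arrays.asList("p1", "p2", "p3"));
		Set<String> dirtyMatrixPointers = new HashSet<>(Arrays.asList("p2"));
		// p1 and p3 are temporary and would be freed; p2 backs a dirty matrix and is kept.
		System.out.println(nonIn(allPointers, dirtyMatrixPointers));
	}
}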
@@ -431,86 +557,84 @@ public class GPUMemoryManager {
         * @param instructionLevelTimer member of GPUInstruction
         * @param startTime start time
         */
-	private void addMiscTime(String opcode, String instructionLevelTimer, long startTime) {
+	void addMiscTime(String opcode, String instructionLevelTimer, long startTime) {
 		if (opcode != null && DMLScript.FINEGRAINED_STATISTICS)
 			GPUStatistics.maintainCPMiscTimes(opcode, instructionLevelTimer, System.nanoTime() - startTime);
        }
        
-       /**
-        * Get any pointer of the given size from rmvar-ed pointers (applicable 
if eager cudaFree is set to false)
-        * 
-        * @param opcode opcode
-        * @param size size in bytes
-        * @return pointer
-        */
-       private Pointer getRmvarPointer(String opcode, long size) {
-               if (rmvarGPUPointers.containsKey(size)) {
-                       if(LOG.isTraceEnabled())
-                               LOG.trace("Getting rmvar-ed pointers for size:" 
+ size);
-                       long t0 = opcode != null && 
DMLScript.FINEGRAINED_STATISTICS ?  System.nanoTime() : 0;
-                       Pointer A = remove(rmvarGPUPointers, size); // remove 
from rmvarGPUPointers as you are not calling cudaFree
-                       addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, 
t0);
-                       return A;
-               }
-               else {
-                       return null;
-               }
-       }
-       
-       /**
-        * Remove any pointer in the given hashmap
-        * 
-        * @param hm hashmap of size, pointers
-        * @param size size in bytes
-        * @return the pointer that was removed
-        */
-       private Pointer remove(HashMap<Long, Set<Pointer>> hm, long size) {
-               Pointer A = hm.get(size).iterator().next();
-               remove(hm, size, A);
-               return A;
-       }
-       
-       /**
-        * Remove a specific pointer in the given hashmap
-        * 
-        * @param hm hashmap of size, pointers
-        * @param size size in bytes
-        * @param ptr pointer to be removed
-        */
-       private void remove(HashMap<Long, Set<Pointer>> hm, long size, Pointer 
ptr) {
-               hm.get(size).remove(ptr);
-               if (hm.get(size).isEmpty())
-                       hm.remove(size);
-       }
-       
        
        /**
         * Print debugging information
         */
        public String toString() {
-		long sizeOfLockedGPUObjects = 0; long numLockedGPUObjects = 0;
-		long sizeOfUnlockedGPUObjects = 0; long numUnlockedGPUObjects = 0;
-		for(GPUObject gpuObj : allocatedGPUObjects) {
-			try {
-				if(gpuObj.isLocked()) {
-					numLockedGPUObjects++;
-					sizeOfLockedGPUObjects += gpuObj.getSizeOnDevice();
+		long sizeOfLockedGPUObjects = 0; int numLockedGPUObjects = 0; int numLockedPointers = 0;
+		long sizeOfUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyPointers = 0;
+		long sizeOfUnlockedNonDirtyGPUObjects = 0; int numUnlockedNonDirtyGPUObjects = 0; int numUnlockedNonDirtyPointers = 0;
+		for(GPUObject gpuObj : matrixMemoryManager.gpuObjects) {
+			if(gpuObj.isLocked()) {
+				numLockedGPUObjects++;
+				sizeOfLockedGPUObjects += gpuObj.getSizeOnDevice();
+				numLockedPointers += matrixMemoryManager.getPointers(gpuObj).size();
+			}
+			else {
+				if(gpuObj.isDirty()) {
+					numUnlockedDirtyGPUObjects++;
+					sizeOfUnlockedDirtyGPUObjects += gpuObj.getSizeOnDevice();
+					numUnlockedDirtyPointers += matrixMemoryManager.getPointers(gpuObj).size();
 				}
 				else {
-					numUnlockedGPUObjects++;
-					sizeOfUnlockedGPUObjects += gpuObj.getSizeOnDevice();
+					numUnlockedNonDirtyGPUObjects++;
+					sizeOfUnlockedNonDirtyGPUObjects += gpuObj.getSizeOnDevice();
+					numUnlockedNonDirtyPointers += matrixMemoryManager.getPointers(gpuObj).size();
 				}
-			} catch (DMLRuntimeException e) {
-				throw new RuntimeException(e);
 			}
 		}
+		
+		
 		long totalMemoryAllocated = 0;
-		for(Long numBytes : allocatedGPUPointers.values()) {
-			totalMemoryAllocated += numBytes;
+		for(PointerInfo ptrInfo : allPointers.values()) {
+			totalMemoryAllocated += ptrInfo.getSizeInBytes();
+		}
+		
+		
+		Set<Pointer> potentiallyLeakyPointers = getNonMatrixLockedPointers();
+		List<Long> sizePotentiallyLeakyPointers = potentiallyLeakyPointers.stream().
+				map(ptr -> allPointers.get(ptr).sizeInBytes).collect(Collectors.toList());
+		long totalSizePotentiallyLeakyPointers = 0;
+		for(long size : sizePotentiallyLeakyPointers) {
+			totalSizePotentiallyLeakyPointers += size;
 		}
-		return "Num of GPU objects: [unlocked:" + numUnlockedGPUObjects + ", locked:" + numLockedGPUObjects + "]. "
-				+ "Size of GPU objects in bytes: [unlocked:" + sizeOfUnlockedGPUObjects + ", locked:" + sizeOfLockedGPUObjects + "]. "
-				+ "Total memory allocated by the current GPU context in bytes:" + totalMemoryAllocated;
+		StringBuilder ret = new StringBuilder();
+		//if(DMLScript.PRINT_GPU_MEMORY_INFO) {
+		//	if(potentiallyLeakyPointers.size() > 0) {
+		//		ret.append("Non-matrix pointers were allocated by:\n");
+		//		printPointers(potentiallyLeakyPointers, ret);
+		//	}
+		//}
+		ret.append("\n====================================================\n");
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "", 
+				"Num Objects", "Num Pointers", "Size"));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked Dirty GPU objects", 
+				numUnlockedDirtyGPUObjects, numUnlockedDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects)));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked NonDirty GPU objects", 
+				numUnlockedNonDirtyGPUObjects, numUnlockedNonDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects)));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked GPU objects", 
+				numLockedGPUObjects, numLockedPointers, byteCountToDisplaySize(sizeOfLockedGPUObjects)));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Cached rmvar-ed pointers", 
+				"-", lazyCudaFreeMemoryManager.getNumPointers(), byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated())));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Non-matrix/non-cached pointers", 
+				"-", potentiallyLeakyPointers.size(), byteCountToDisplaySize(totalSizePotentiallyLeakyPointers)));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "All pointers", 
+				"-", allPointers.size(), byteCountToDisplaySize(totalMemoryAllocated)));
+		long free[] = { 0 };
+		long total[] = { 0 };
+		cudaMemGetInfo(free, total);
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Free mem (from cudaMemGetInfo)", 
+				"-", "-", byteCountToDisplaySize(free[0])));
+		ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Total mem (from cudaMemGetInfo)", 
+				"-", "-", byteCountToDisplaySize(total[0])));
+		ret.append("====================================================\n");
+		return ret.toString();
        }
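
The summary produced by toString() above relies on fixed-width format specifiers (%-35s, %-15s) so the rows line up in the log. A tiny standalone illustration, with made-up numbers in place of the real counters:

public class MemoryTableSketch {
	public static void main(String[] args) {
		StringBuilder sb = new StringBuilder();
		// Same column layout as the patch: label, object count, pointer count, size.
		sb.append(String.format("%-35s%-15s%-15s%-15s\n", "", "Num Objects", "Num Pointers", "Size"));
		sb.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked Dirty GPU objects", 2, 5, "1 GB"));
		sb.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked GPU objects", 1, 3, "256 MB"));
		System.out.print(sb);
	}
}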
        
        /**
@@ -525,14 +649,39 @@ public class GPUMemoryManager {
                return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR);
        }
        
+	private static Comparator<GPUObject> SIMPLE_COMPARATOR_SORT_BY_SIZE = (o1, o2) -> o1.getSizeOnDevice() < o2.getSizeOnDevice() ? -1 : 1;
+       
+       private static class CustomPointer extends Pointer {
+               public CustomPointer(Pointer p) {
+                       super(p);
+               }
+               
+               @Override
+               public long getNativePointer() {
+                       return super.getNativePointer();
+               }
+               
+       }
        /**
         * Class that governs the eviction policy
         */
-       public static class GPUComparator implements Comparator<GPUObject> {
+	public static class EvictionPolicyBasedComparator implements Comparator<GPUObject> {
                private long neededSize;
-               public GPUComparator(long neededSize) {
+               public EvictionPolicyBasedComparator(long neededSize) {
                        this.neededSize = neededSize;
                }
+               
+               private int minEvictCompare(GPUObject p1, GPUObject p2) {
+                       long p1Size = p1.getSizeOnDevice() - neededSize;
+                       long p2Size = p2.getSizeOnDevice() - neededSize;
+
+                       if (p1Size >= 0 && p2Size >= 0) {
+                               return Long.compare(p2Size, p1Size);
+                       } else {
+                               return Long.compare(p1Size, p2Size);
+                       }
+               }
+               
                @Override
                public int compare(GPUObject p1, GPUObject p2) {
                        if (p1.isLocked() && p2.isLocked()) {
@@ -548,25 +697,32 @@ public class GPUMemoryManager {
                                return 1;
                        } else {
                                // Both are unlocked
-				if (DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.MIN_EVICT) {
-					long p1Size = 0;
-					long p2Size = 0;
-					try {
-						p1Size = p1.getSizeOnDevice() - neededSize;
-						p2Size = p2.getSizeOnDevice() - neededSize;
-					} catch (DMLRuntimeException e) {
-						throw new RuntimeException(e);
+				if (DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.ALIGN_MEMORY) {
+					if(!p1.isDensePointerNull() && !p2.isDensePointerNull()) {
+						long p1Ptr = new CustomPointer(p1.getDensePointer()).getNativePointer();
+						long p2Ptr = new CustomPointer(p2.getDensePointer()).getNativePointer();
+						
+						if(p1Ptr <= p2Ptr)
+							return -1;
+						else
+							return 1;
 					}
-
-					if (p1Size >= 0 && p2Size >= 0) {
-						return Long.compare(p2Size, p1Size);
-					} else {
-						return Long.compare(p1Size, p2Size);
+					else if(p1.isDensePointerNull() && !p2.isDensePointerNull()) {
+						return -1;
+					}
+					else if(!p1.isDensePointerNull() && p2.isDensePointerNull()) {
+						return 1;
 					}
+					else {
+						return minEvictCompare(p1, p2);
+					}
+				}
+				else if (DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.MIN_EVICT) {
+					return minEvictCompare(p1, p2);
 				} else {
 					return Long.compare(p2.timestamp.get(), p1.timestamp.get());
 				}
                        }
                }
        }
-}
+}
\ No newline at end of file
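
For the ALIGN_MEMORY branch of the comparator above, unlocked dense objects are ordered by their native device address (via CustomPointer.getNativePointer), which appears intended to let eviction walk physically adjacent buffers and open up one contiguous region. A simplified, standalone sketch of that ordering; DeviceBuffer and the addresses below are made up for illustration:

import java.util.Arrays;
import java.util.Comparator;

public class AlignMemorySketch {
	// Simplified stand-in for a GPUObject's dense allocation.
	static final class DeviceBuffer {
		final long nativeAddress;   // what CustomPointer.getNativePointer() exposes in the patch
		final long sizeInBytes;
		DeviceBuffer(long nativeAddress, long sizeInBytes) {
			this.nativeAddress = nativeAddress;
			this.sizeInBytes = sizeInBytes;
		}
	}

	public static void main(String[] args) {
		DeviceBuffer[] unlocked = {
			new DeviceBuffer(0x7000_2000L, 512),
			new DeviceBuffer(0x7000_0000L, 1024),
			new DeviceBuffer(0x7000_1000L, 256)
		};
		// Sort by device address; evicting from one end then frees neighbouring blocks.
		Arrays.sort(unlocked, Comparator.comparingLong((DeviceBuffer b) -> b.nativeAddress));
		for (DeviceBuffer b : unlocked)
			System.out.printf("addr=0x%x size=%d%n", b.nativeAddress, b.sizeInBytes);
	}
}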

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 613e16f..bf44895 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -23,6 +23,7 @@ import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.JCuda.cudaMemset;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.LongAdder;
 
@@ -97,6 +98,56 @@ public class GPUObject {
         * Enclosing {@link MatrixObject} instance
         */
        protected MatrixObject mat = null;
+       
+	// ----------------------------------------------------------------------
+       // Methods used to access, set and check jcudaDenseMatrixPtr
+       
+       /**
+        * Pointer to dense matrix
+        *
+        * @return a pointer to the dense matrix
+        */
+       public Pointer getDensePointer() {
+               return jcudaDenseMatrixPtr;
+       }
+       
+       /**
+        * Checks if the dense pointer is null
+        * 
+        * @return if the state of dense pointer is null
+        */
+       public boolean isDensePointerNull() {
+               return jcudaDenseMatrixPtr == null;
+       }
+       
+       /**
+        * Removes the dense pointer and potential soft reference
+        */
+       public void clearDensePointer() {
+               jcudaDenseMatrixPtr = null;
+       }
+       
+       /**
+        * Convenience method to directly set the dense matrix pointer on GPU
+        *
+        * @param densePtr dense pointer
+        */
+       public void setDensePointer(Pointer densePtr) {
+               if (!this.isDensePointerNull()) {
+			throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
+		}
+		this.jcudaDenseMatrixPtr = densePtr;
+		this.isSparse = false;
+		if(LOG.isDebugEnabled()) {
+			LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
+               }
+               if (getJcudaSparseMatrixPtr() != null) {
+                       getJcudaSparseMatrixPtr().deallocate();
+                       jcudaSparseMatrixPtr = null;
+               }
+       }
+	// ----------------------------------------------------------------------
+
 
        @Override
        public Object clone() {
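
The accessors above centralize an invariant of GPUObject: a dense and a sparse device pointer are never held at the same time, and overwriting an existing dense pointer is treated as a leak. A minimal sketch of that invariant with stand-in types (the real setDensePointer also deallocates the sparse pointer rather than just dropping the reference):

public class DenseSparseInvariantSketch<P> {
	private P densePtr;
	private P sparsePtr;

	public void setDensePointer(P ptr) {
		if (densePtr != null)
			throw new IllegalStateException("dense pointer already allocated; would leak GPU memory");
		densePtr = ptr;
		sparsePtr = null;   // the two representations are mutually exclusive
	}

	public void setSparsePointer(P ptr) {
		sparsePtr = ptr;
		densePtr = null;    // dropping the dense representation keeps the invariant
	}

	public boolean isDensePointerNull() { return densePtr == null; }
	public void clearDensePointer() { densePtr = null; }

	public static void main(String[] args) {
		DenseSparseInvariantSketch<String> obj = new DenseSparseInvariantSketch<>();
		obj.setDensePointer("devicePtr1");
		System.out.println("dense allocated: " + !obj.isDensePointerNull());
		obj.setSparsePointer("csrPtr1");
		System.out.println("dense after sparse set: " + obj.isDensePointerNull());
	}
}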
@@ -111,12 +162,12 @@ public class GPUObject {
                that.isSparse = me.isSparse;
 
                try {
-                       if (me.jcudaDenseMatrixPtr != null) {
+                       if (!me.isDensePointerNull()) {
                                long rows = me.mat.getNumRows();
                                long cols = me.mat.getNumColumns();
                                long size = rows * cols * 
LibMatrixCUDA.sizeOfDataType;
-                               that.jcudaDenseMatrixPtr = allocate(size);
-                               cudaMemcpy(that.jcudaDenseMatrixPtr, 
me.jcudaDenseMatrixPtr, size, cudaMemcpyDeviceToDevice);
+                               that.setDensePointer(allocate(size));
+                               cudaMemcpy(that.getDensePointer(), 
me.getDensePointer(), size, cudaMemcpyDeviceToDevice);
                        }
 
                        if (me.getJcudaSparseMatrixPtr() != null) {
@@ -135,12 +186,8 @@ public class GPUObject {
                return getGPUContext().allocate(size);
        }
 
-       private void cudaFreeHelper(Pointer toFree) {
-               getGPUContext().cudaFreeHelper(toFree);
-       }
-
-       private void cudaFreeHelper(String instName, Pointer toFree, boolean 
eager) {
-               getGPUContext().cudaFreeHelper(instName, toFree, eager);
+       private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
+               getGPUContext().cudaFreeHelper(null, toFree, 
DMLScript.EAGER_CUDA_FREE);
        }
 
        private GPUContext getGPUContext() {
@@ -172,6 +219,7 @@ public class GPUObject {
                                lda, C, ldc);
                return C;
        }
+       
 
        /**
         * Convenience method to convert a CSR matrix to a dense matrix on the 
GPU
@@ -224,8 +272,8 @@ public class GPUObject {
                                C.colInd);
                //cudaDeviceSynchronize();
 
-               gCtx.cudaFreeHelper(nnzPerRowPtr);
-               gCtx.cudaFreeHelper(nnzTotalDevHostPtr);
+               gCtx.cudaFreeHelper(null, nnzPerRowPtr, 
DMLScript.EAGER_CUDA_FREE);
+               gCtx.cudaFreeHelper(null, nnzTotalDevHostPtr, 
DMLScript.EAGER_CUDA_FREE);
 
                return C;
        }
@@ -251,32 +299,12 @@ public class GPUObject {
                }
                this.jcudaSparseMatrixPtr = sparseMatrixPtr;
                this.isSparse = true;
-               if (getJcudaDenseMatrixPtr() != null) {
-                       cudaFreeHelper(getJcudaDenseMatrixPtr());
-                       jcudaDenseMatrixPtr = null;
-               }
-       }
-
-       /**
-        * Convenience method to directly set the dense matrix pointer on GPU
-        *
-        * @param densePtr dense pointer
-        */
-       public void setDenseMatrixCudaPointer(Pointer densePtr) {
-               if (this.jcudaDenseMatrixPtr != null) {
-                       throw new DMLRuntimeException("jcudaDenseMatrixPtr was 
already allocated for " + this + ", this will cause a memory leak on the GPU");
-               }
-               this.jcudaDenseMatrixPtr = densePtr;
-               this.isSparse = false;
-               if(LOG.isDebugEnabled()) {
-                       LOG.debug("Setting dense pointer of size " + 
getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
-               }
-               if (getJcudaSparseMatrixPtr() != null) {
-                       getJcudaSparseMatrixPtr().deallocate();
-                       jcudaSparseMatrixPtr = null;
+               if (!isDensePointerNull()) {
+                       cudaFreeHelper(getDensePointer());
+                       clearDensePointer();
                }
        }
-
+       
        /**
         * Converts this GPUObject from dense to sparse format.
         */
@@ -293,12 +321,12 @@ public class GPUObject {
                int rows = toIntExact(mat.getNumRows());
                int cols = toIntExact(mat.getNumColumns());
 
-               if (getJcudaDenseMatrixPtr() == null || !isAllocated())
+               if (isDensePointerNull() || !isAllocated())
                        throw new DMLRuntimeException("Expected allocated dense 
matrix before denseToSparse() call");
 
                denseRowMajorToColumnMajor();
                setSparseMatrixCudaPointer(
-                               
columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, 
getJcudaDenseMatrixPtr(), rows,
+                               
columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, 
getDensePointer(), rows,
                                                cols));
                // TODO: What if mat.getNnz() is -1 ?
                if (DMLScript.STATISTICS)
@@ -322,10 +350,10 @@ public class GPUObject {
                        throw new DMLRuntimeException("Error in converting row 
major to column major : data is not allocated");
                }
 
-               Pointer tmp = transpose(getGPUContext(), 
getJcudaDenseMatrixPtr(), m, n, lda, ldc);
-               cudaFreeHelper(getJcudaDenseMatrixPtr());
-               jcudaDenseMatrixPtr = null;
-               setDenseMatrixCudaPointer(tmp);
+               Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, 
n, lda, ldc);
+               cudaFreeHelper(getDensePointer());
+               clearDensePointer();
+               setDensePointer(tmp);
        }
 
        /**
@@ -344,10 +372,10 @@ public class GPUObject {
                        throw new DMLRuntimeException("Error in converting 
column major to row major : data is not allocated");
                }
 
-               Pointer tmp = transpose(getGPUContext(), 
getJcudaDenseMatrixPtr(), m, n, lda, ldc);
-               cudaFreeHelper(getJcudaDenseMatrixPtr());
-               jcudaDenseMatrixPtr = null;
-               setDenseMatrixCudaPointer(tmp);
+               Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, 
n, lda, ldc);
+               cudaFreeHelper(getDensePointer());
+               clearDensePointer();
+               setDensePointer(tmp);
        }
 
        /**
@@ -400,7 +428,7 @@ public class GPUObject {
                        throw new DMLRuntimeException("Expected cusparse to be 
initialized");
                int rows = toIntExact(mat.getNumRows());
                int cols = toIntExact(mat.getNumColumns());
-               
setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle,
 null, rows, cols, null));
+               
setDensePointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle,
 null, rows, cols, null));
        }
 
        /**
@@ -426,7 +454,7 @@ public class GPUObject {
        }
 
        public boolean isAllocated() {
-               boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || 
getJcudaSparseMatrixPtr() != null);
+               boolean eitherAllocated = (!isDensePointerNull() || 
getJcudaSparseMatrixPtr() != null);
                return eitherAllocated;
        }
 
@@ -455,13 +483,13 @@ public class GPUObject {
                long cols = mat.getNumColumns();
                int numElems = toIntExact(rows * cols);
                long size = getDatatypeSizeOf(numElems);
-               setDenseMatrixCudaPointer(allocate(size));
+               setDensePointer(allocate(size));
                // The "fill" kernel is called which treats the matrix 
"jcudaDensePtr" like a vector and fills it with value "v"
                // If the fill value is 0, no need to call the special kernel, 
the allocate memsets the allocated region to 0
                if (v != 0)
                        getGPUContext().getKernels()
                        .launchKernel("fill", 
ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
-                                       getJcudaDenseMatrixPtr(), v, numElems);
+                                       getDensePointer(), v, numElems);
        }
 
        /**
@@ -475,6 +503,7 @@ public class GPUObject {
                boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && 
getJcudaSparseMatrixPtr().nnz == 0;
                return isEmptyAndSparseAndAllocated;
        }
+       
                
        /**
         * Being allocated is a prerequisite for computing nnz.
@@ -505,7 +534,7 @@ public class GPUObject {
                                Pointer nnzTotalDevHostPtr = null;
                                nnzPerRowPtr = 
gCtx.allocate(getIntSizeOf(rows));
                                nnzTotalDevHostPtr = 
gCtx.allocate(getIntSizeOf(1));
-                               
LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, 
cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, 
getJcudaDenseMatrixPtr(), rows,
+                               
LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, 
cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, 
getDensePointer(), rows,
                                                nnzPerRowPtr, 
nnzTotalDevHostPtr);
                                int[] nnzC = { -1 };
                                cudaMemcpy(Pointer.to(nnzC), 
nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
@@ -513,8 +542,8 @@ public class GPUObject {
                                        throw new DMLRuntimeException(
                                                        "cusparseDnnz did not 
calculate the correct number of nnz on the GPU");
                                }
-                               gCtx.cudaFreeHelper(nnzPerRowPtr);
-                               gCtx.cudaFreeHelper(nnzTotalDevHostPtr);
+                               gCtx.cudaFreeHelper(instName, nnzPerRowPtr, 
DMLScript.EAGER_CUDA_FREE);
+                               gCtx.cudaFreeHelper(instName, 
nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE);
                                if(DMLScript.FINEGRAINED_STATISTICS) {
                                        
GPUStatistics.maintainCPMiscTimes(instName, 
CPInstruction.MISC_TIMER_RECOMPUTE_NNZ, System.nanoTime()-t1);
                        }
@@ -600,8 +629,18 @@ public class GPUObject {
                                LOG.trace("GPU : data is dirty on device, 
copying to host, on " + this + ", GPUContext="
                                        + getGPUContext());
                        }
-                       copyFromDeviceToHost(instName, false);
-                       copied = true;
+
+                       if (isAllocated() && dirty) {
+                               if(LOG.isTraceEnabled()) {
+                                       LOG.trace("GPU : data is dirty on 
device, copying to host, on " + this + ", GPUContext="
+                                               + getGPUContext());
+                               }
+                               // TODO: Future optimization:
+                               // For now, we are deleting the device data 
when copied from device to host. 
+                               // This can be optimized later by treating 
acquiredModify+release as a new state
+                               copyFromDeviceToHost(instName, false, true); 
+                               copied = true;
+                       }
                }
                return copied;
        }
@@ -657,6 +696,7 @@ public class GPUObject {
                                timestamp.addAndGet(1);
                                break;
                        case MIN_EVICT: /* Do Nothing */
+                       case ALIGN_MEMORY:
                                break;
                        case MRU:
                                timestamp.set(-System.nanoTime());
@@ -704,7 +744,7 @@ public class GPUObject {
                        throw new DMLRuntimeException("Internal error - invalid 
number of columns when allocating dense matrix;");
                long size = getDatatypeSizeOf(rows * cols);
                Pointer tmp = allocate(size);
-               setDenseMatrixCudaPointer(tmp);
+               setDensePointer(tmp);
        }
 
        void allocateSparseMatrixOnDevice() {
@@ -723,21 +763,6 @@ public class GPUObject {
                setSparseMatrixCudaPointer(tmp);
        }
 
-       void deallocateMemoryOnDevice(boolean eager) {
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : deallocateMemoryOnDevice, on " + this 
+ ", GPUContext=" + getGPUContext());
-               }
-               if (getJcudaDenseMatrixPtr() != null) {
-                       cudaFreeHelper(null, getJcudaDenseMatrixPtr(), eager);
-               }
-               if (getJcudaSparseMatrixPtr() != null) {
-                       getJcudaSparseMatrixPtr().deallocate(eager);
-               }
-               jcudaDenseMatrixPtr = null;
-               jcudaSparseMatrixPtr = null;
-               resetReadWriteLock();
-       }
-
        protected long getSizeOnDevice() {
                long GPUSize = 0;
                long rlen = mat.getNumRows();
@@ -850,14 +875,14 @@ public class GPUObject {
 				// Minor optimization: No need to allocate an empty array for CPU 
 				// data = new double[tmp.getNumRows() * tmp.getNumColumns()];
                                long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0;
-                               cudaMemset(getJcudaDenseMatrixPtr(), 0, 
getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
+                               cudaMemset(getDensePointer(), 0, 
getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
                                if(DMLScript.FINEGRAINED_STATISTICS) 
                                        
GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t1);
                        }
                        else {
                                // Copy dense block
                                // H2D now only measures the time taken to do 
-                               
LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, 
getJcudaDenseMatrixPtr(), opcode);
+                               
LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, 
getDensePointer(), opcode);
                        }
                }
 
@@ -875,103 +900,102 @@ public class GPUObject {
                }
                return (int) l;
        }
+       
 
-       protected void copyFromDeviceToHost(String instName, boolean 
isEviction) {
+	/**
+	 * Copies the data from device to host.
+	 * Currently, eagerDelete and isEviction are provided separately for finer control in different scenarios.
+	 * In the future, eagerDelete could be forced to true whenever isEviction is true, and to false otherwise.
+	 * 
+	 * @param instName opcode of the instruction for fine-grained statistics
+	 * @param isEviction whether this copy is performed as part of eviction
+	 * @param eagerDelete whether to eagerly delete the device data after the copy
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	protected void copyFromDeviceToHost(String instName, boolean isEviction, boolean eagerDelete) throws DMLRuntimeException {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", 
GPUContext=" + getGPUContext());
                }
-               if (getJcudaDenseMatrixPtr() != null && 
getJcudaSparseMatrixPtr() != null) {
+               if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) {
+                       throw new DMLRuntimeException(
+                                       "Cannot copy from device to host as 
JCuda dense/sparse pointer is not allocated");
+               }
+               else if (!isDensePointerNull() && getJcudaSparseMatrixPtr() != 
null) {
                        throw new DMLRuntimeException("Invalid state : JCuda 
dense/sparse pointer are both allocated");
                }
-
-               if (getJcudaDenseMatrixPtr() != null) {
-                       long start = 0;
-                       if (DMLScript.STATISTICS)
-                               start = System.nanoTime();
-                       MatrixBlock tmp = new 
MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), 
false);
+               else if(getJcudaSparseMatrixPtr() != null && 
!LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
+                       throw new DMLRuntimeException(
+                                       "Block not in sparse format on host yet 
the device sparse matrix pointer is not null");
+               }
+               else if(getJcudaSparseMatrixPtr() != null && 
isSparseAndEmpty()) {
+                       mat.acquireModify(new 
MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), 0l)); // empty 
block
+                       mat.release();
+                       return;
+               }
+               
+               MatrixBlock tmp = null;
+               long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
+               if (!isDensePointerNull()) {
+                       tmp = new MatrixBlock(toIntExact(mat.getNumRows()), 
toIntExact(mat.getNumColumns()), false);
                        tmp.allocateDenseBlock();
                        
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
-                                               getJcudaDenseMatrixPtr(), 
tmp.getDenseBlockValues(), instName, isEviction);
-                       
-                       tmp.recomputeNonZeros();
-                       mat.acquireModify(tmp);
-                       mat.release();
-
-                       if (DMLScript.STATISTICS)
-                               
GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
-                       if (DMLScript.STATISTICS)
-                               GPUStatistics.cudaFromDevCount.add(1);
-               } else if (getJcudaSparseMatrixPtr() != null) {
-                       if (!LibMatrixCUDA.isInSparseFormat(getGPUContext(), 
mat))
-                               throw new DMLRuntimeException(
-                                               "Block not in sparse format on 
host yet the device sparse matrix pointer is not null");
-
-                       if (this.isSparseAndEmpty()) {
-                               MatrixBlock tmp = new 
MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), 0l);    // Empty 
Block
-                               mat.acquireModify(tmp);
-                               mat.release();
-                       } else {
-                               long start = 0;
-                               if (DMLScript.STATISTICS)
-                                       start = System.nanoTime();
-
-                               int rows = toIntExact(mat.getNumRows());
-                               int cols = toIntExact(mat.getNumColumns());
-                               int nnz = 
toIntExact(getJcudaSparseMatrixPtr().nnz);
-                               double[] values = new double[nnz];
-                               
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), 
getJcudaSparseMatrixPtr().val, values, instName, isEviction);
-                               int[] rowPtr = new int[rows + 1];
-                               int[] colInd = new int[nnz];
-                               long t0 = DMLScript.STATISTICS ? 
System.nanoTime() : 0;
-                               
CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd);
-                               if (DMLScript.STATISTICS)
-                                       
GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t0);
-                               if (DMLScript.STATISTICS)
-                                       GPUStatistics.cudaFromDevCount.add(3);
-
-                               SparseBlockCSR sparseBlock = new 
SparseBlockCSR(rowPtr, colInd, values, nnz);
-                               MatrixBlock tmp = new MatrixBlock(rows, cols, 
nnz, sparseBlock);
-                               mat.acquireModify(tmp);
-                               mat.release();
-                               if (DMLScript.STATISTICS)
-                                       
GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
-                               if (DMLScript.STATISTICS)
-                                       GPUStatistics.cudaFromDevCount.add(1);
-                       }
+                                               getDensePointer(), 
tmp.getDenseBlockValues(), instName, isEviction); 
+                       // int nnz = LibMatrixCUDA.computeNNZ(getGPUContext(), 
getJcudaDenseMatrixPtr(), toIntExact(mat.getNumRows()*mat.getNumColumns()));
+                       // tmp.setNonZeros(nnz);
+                       if(eagerDelete)
+                               clearData(instName, true);
+                       // tmp.recomputeNonZeros();
+                       tmp.setNonZeros(-1);
                } else {
-                       throw new DMLRuntimeException(
-                                       "Cannot copy from device to host as 
JCuda dense/sparse pointer is not allocated");
+                       int rows = toIntExact(mat.getNumRows());
+                       int cols = toIntExact(mat.getNumColumns());
+                       int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
+                       double[] values = new double[nnz];
+                       
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), 
getJcudaSparseMatrixPtr().val, values, instName, isEviction);
+                       int[] rowPtr = new int[rows + 1];
+                       int[] colInd = new int[nnz];
+                       CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), 
rows, nnz, rowPtr, colInd);
+                       if(eagerDelete)
+                               clearData(instName, true);
+                       SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, 
colInd, values, nnz);
+                       tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
+               }
+               mat.acquireModify(tmp);
+               mat.release();
+               if (DMLScript.STATISTICS)
+                       GPUStatistics.cudaFromDevTime.add(System.nanoTime() - 
start);
+               if (DMLScript.STATISTICS) {
+                       int count = !isDensePointerNull() ? 1 : 3;
+                       GPUStatistics.cudaFromDevCount.add(count);
                }
                dirty = false;
        }
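
The eagerDelete flag above means the device data is released immediately after the copy, which is what the eviction path in GPUMemoryManager relies on to actually reclaim memory. A toy sketch of that contract; DeviceState and its fields are hypothetical stand-ins, not SystemML types:

public class EagerDeleteSketch {
	static final class DeviceState {
		double[] deviceData = {1.0, 2.0, 3.0};   // pretend device buffer
		boolean dirty = true;
		void clearData() { deviceData = null; }  // analogous to clearData(opcode, true)
	}

	static double[] copyFromDeviceToHost(DeviceState dev, boolean eagerDelete) {
		double[] host = dev.deviceData.clone();  // analogous to deviceToHost(...)
		if (eagerDelete)
			dev.clearData();                     // free device memory right away (eviction path)
		dev.dirty = false;                       // host copy is now the source of truth
		return host;
	}

	public static void main(String[] args) {
		DeviceState dev = new DeviceState();
		double[] host = copyFromDeviceToHost(dev, true);
		System.out.println("host copy length=" + host.length + ", device freed=" + (dev.deviceData == null));
	}
}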
 
-       /**
-        * lazily clears the data associated with this {@link GPUObject} 
instance
-        */
-       public void clearData() {
-               clearData(DMLScript.EAGER_CUDA_FREE);
-       }
 
        /**
         * Clears the data associated with this {@link GPUObject} instance
         *
+        * @param opcode opcode of the instruction
         * @param eager whether to be done synchronously or asynchronously
+        * @throws DMLRuntimeException if error occurs
         */
-       public void clearData(boolean eager) {
-               deallocateMemoryOnDevice(eager);
+	public void clearData(String opcode, boolean eager) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext());
+		}
+		if (!isDensePointerNull()) {
+			getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
+               }
+               if (getJcudaSparseMatrixPtr() != null) {
+                       getJcudaSparseMatrixPtr().deallocate(eager);
+               }
+               clearDensePointer();
+               jcudaSparseMatrixPtr = null;
+               resetReadWriteLock();
                getGPUContext().getMemoryManager().removeGPUObject(this);
        }
 
        /**
-        * Pointer to dense matrix
-        *
-        * @return ?
-        */
-       public Pointer getJcudaDenseMatrixPtr() {
-               return jcudaDenseMatrixPtr;
-       }
-
-       /**
         * Pointer to sparse matrix
         *
         * @return ?
@@ -997,12 +1021,12 @@ public class GPUObject {
                sb.append(", writeLock=").append(writeLock);
                sb.append(", sparse? ").append(isSparse);
                sb.append(", 
dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
-               if(jcudaDenseMatrixPtr != null)
-                       sb.append(", densePtr=").append(jcudaDenseMatrixPtr);
+               if(!isDensePointerNull())
+                       sb.append(", densePtr=").append(getDensePointer());
                if(jcudaSparseMatrixPtr != null)
                        sb.append(", sparsePtr=").append(jcudaSparseMatrixPtr);
                sb.append('}');
                return sb.toString();
        }
 
-}
+}
\ No newline at end of file
