Repository: incubator-systemml
Updated Branches:
  refs/heads/master 464600e3e -> be2632127


[SYSTEMML-445][SYSTEMML-540] Adding NNZ to Convolution CP instruction
and also refactoring GPUContext and GPUObject.



Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/be263212
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/be263212
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/be263212

Branch: refs/heads/master
Commit: be2632127269bb6a705923e0996fa27a9978b03e
Parents: 464600e
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Fri Jun 24 10:30:14 2016 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Fri Jun 24 10:31:21 2016 -0700

----------------------------------------------------------------------
 .../java/org/apache/sysml/api/DMLScript.java    |   2 +-
 .../controlprogram/caching/CacheableData.java   |   2 +-
 .../controlprogram/caching/MatrixObject.java    |   2 +-
 .../context/ExecutionContext.java               |   1 +
 .../controlprogram/context/GPUContext.java      |  72 -----
 .../controlprogram/context/GPUObject.java       | 168 ------------
 .../controlprogram/context/JCudaContext.java    | 148 -----------
 .../controlprogram/context/JCudaObject.java     | 263 -------------------
 .../cp/ConvolutionCPInstruction.java            |  10 +-
 .../instructions/gpu/GPUInstruction.java        |   2 +-
 .../instructions/gpu/context/GPUContext.java    |  71 +++++
 .../instructions/gpu/context/GPUObject.java     | 167 ++++++++++++
 .../instructions/gpu/context/JCudaContext.java  | 148 +++++++++++
 .../instructions/gpu/context/JCudaObject.java   | 263 +++++++++++++++++++
 .../runtime/matrix/data/LibMatrixCUDA.java      |   2 +-
 .../sysml/runtime/matrix/data/LibMatrixDNN.java |  16 +-
 16 files changed, 677 insertions(+), 660 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java 
b/src/main/java/org/apache/sysml/api/DMLScript.java
index a7c2db6..814bcb8 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -69,7 +69,7 @@ import 
org.apache.sysml.runtime.controlprogram.caching.CacheStatistics;
 import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory;
-import org.apache.sysml.runtime.controlprogram.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
 import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter;
 import 
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 4bab6b8..d60c607 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -33,7 +33,7 @@ import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy;
-import org.apache.sysml.runtime.controlprogram.context.GPUObject;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
 import org.apache.sysml.runtime.instructions.cp.Data;
 import org.apache.sysml.runtime.instructions.spark.data.BroadcastObject;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
index 2144ef8..5d5f41f 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
@@ -32,8 +32,8 @@ import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import 
org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
-import org.apache.sysml.runtime.controlprogram.context.GPUContext;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 051347b..7e089c8 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -45,6 +45,7 @@ import 
org.apache.sysml.runtime.instructions.cp.FunctionCallCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.IntObject;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
 import org.apache.sysml.runtime.instructions.cp.StringObject;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData;
 import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java
deleted file mode 100644
index 91a236b..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.controlprogram.context;
-
-import java.util.ArrayList;
-
-import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.hops.OptimizerUtils;
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
-
-//FIXME merge JCudaContext into GPUContext as this context is anyway CUDA 
specific
-//FIXME move to gpu instruction package
-public abstract class GPUContext {
-
-       public static ArrayList<GPUObject> allocatedPointers = new 
ArrayList<GPUObject>(); 
-       protected static GPUContext currContext;
-       protected GPUContext() { }
-       
-       public static volatile Boolean isGPUContextCreated = false;
-       
-       public abstract long getAvailableMemory();
-       
-       // Creation / Destruction of GPUContext and related handles
-       public static GPUContext createGPUContext() {
-               if(currContext == null && DMLScript.USE_ACCELERATOR) {
-                       // TODO: Handle this thread and resolve concurrency 
related bugs if any
-                       new Thread(new Runnable() {
-                               @Override
-                               public void run() {
-                                       // Lazy GPU context creation
-                                       synchronized(isGPUContextCreated) {
-                                               currContext = new 
JCudaContext();
-                                               
OptimizerUtils.GPU_MEMORY_BUDGET = 
((JCudaContext)currContext).getAvailableMemory();
-                                               isGPUContextCreated = true;
-                                       }
-                               }
-                       }).start();
-               }
-               return currContext;
-       }
-       public static GPUObject createGPUObject(MatrixObject mo) {
-               if(DMLScript.USE_ACCELERATOR) {
-                       synchronized(isGPUContextCreated) {
-                               if(currContext == null)
-                                       throw new RuntimeException("GPUContext 
is not created");
-                               if(currContext instanceof JCudaContext)
-                                       return new JCudaObject(mo);
-                       }
-               }
-               throw new RuntimeException("Cannot create createGPUObject when 
USE_ACCELERATOR is off");
-       }
-       public abstract void destroy() throws DMLRuntimeException;
-       
-       
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java
deleted file mode 100644
index 8037b8a..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.controlprogram.context;
-
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.controlprogram.caching.CacheException;
-import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
-import org.apache.sysml.utils.Statistics;
-
-//FIXME merge JCudaObject into GPUObject to avoid unnecessary complexity
-//FIXME move to gpu instruction package
-public abstract class GPUObject 
-{
-       protected boolean isDeviceCopyModified = false;
-       protected AtomicInteger numLocks = new AtomicInteger(0);
-       protected boolean isInSparseFormat = false;
-       protected boolean isAllocated = false;
-       protected MatrixObject mat = null;
-       
-       protected GPUObject(MatrixObject mat2)  {
-               this.mat = mat2;
-       }
-       
-       public boolean isInSparseFormat() {
-               return isInSparseFormat;
-       }
-       
-       public boolean isAllocated() {
-               return isAllocated;
-       }
-       
-       public abstract void acquireDeviceRead() throws DMLRuntimeException;
-       public abstract void acquireDenseDeviceModify(int numElemsToAllocate) 
throws DMLRuntimeException;
-       public abstract void acquireHostRead() throws CacheException;
-       public abstract void acquireHostModify() throws CacheException;
-       public abstract void release(boolean isGPUCopyModified) throws 
CacheException;
-       
-       
-       // package-level visibility as these methods are guarded by underlying 
GPUContext
-       abstract void allocateMemoryOnDevice(int numElemToAllocate) throws 
DMLRuntimeException;
-       abstract void deallocateMemoryOnDevice() throws DMLRuntimeException;
-       abstract long getSizeOnDevice() throws DMLRuntimeException;
-       abstract void copyFromHostToDevice() throws DMLRuntimeException;
-       abstract void copyFromDeviceToHost() throws DMLRuntimeException; // 
Called by export()
-       
-       
-       /**
-        * It finds matrix toBeRemoved such that toBeRemoved.GPUSize >= size
-        * // TODO: it is the smallest matrix size that satisfy the above 
condition. For now just evicting the largest pointer.
-        * Then returns toBeRemoved. 
-        * 
-        */
-       protected void evict(long GPUSize) throws DMLRuntimeException {
-               if(GPUContext.allocatedPointers.size() == 0) {
-                       throw new DMLRuntimeException("There is not enough 
memory on device for this matrix!");
-               }
-               
-               Statistics.cudaEvictionCount.addAndGet(1);
-               
-               synchronized(evictionLock) {
-                       Collections.sort(GPUContext.allocatedPointers, new 
Comparator<GPUObject>() {
-       
-                               @Override
-                               public int compare(GPUObject p1, GPUObject p2) {
-                                       int p1Val = p1.numLocks.get();
-                                       int p2Val = p2.numLocks.get();
-                                       
-                                       if(p1Val < 0 || p2Val < 0) {
-                                               throw new 
RuntimeException("Number of locks cannot be negative");
-                                       }
-                                       else if(p1Val == 0 && p2Val == 0) {
-                                               // Both p1 and p2 are unlocked, 
return largest object
-                                               // TODO: Modify this !!
-                                               long p1Size = 0; long p2Size = 
0;
-                                               try {
-                                                       p1Size = 
p1.getSizeOnDevice();
-                                                       p2Size = 
p2.getSizeOnDevice();
-                                               } catch (DMLRuntimeException e) 
{
-                                                       throw new 
RuntimeException(e);
-                                               }
-                                               if(p1Size == p2Size) {
-                                                       return 0;
-                                               }
-                                               else if(p1Size < p2Size) {
-                                                       return 1;
-                                               }
-                                               else {
-                                                       return -1;
-                                               }
-                                       }
-                                       else if(p1Val > p2Val) {
-                                               // There are more locks on p1
-                                               return 1;
-                                       }
-                                       else {
-                                               // There are more locks on p2
-                                               return -1;
-                                       }
-                               }
-                       });
-                       
-                       
-                       while(GPUSize > getAvailableMemory() && 
GPUContext.allocatedPointers.size() > 0) {
-                               GPUObject toBeRemoved = 
GPUContext.allocatedPointers.get(GPUContext.allocatedPointers.size() - 1);
-                               if(toBeRemoved.numLocks.get() != 0) {
-                                       throw new DMLRuntimeException("There is 
not enough memory on device for this matrix!");
-                               }
-                               if(toBeRemoved.isDeviceCopyModified) {
-                                       toBeRemoved.copyFromDeviceToHost();
-                               }
-                               toBeRemoved.clearData();
-                       }
-               }
-       }
-       
-       public void clearData() throws CacheException {
-               synchronized(evictionLock) {
-                       GPUContext.allocatedPointers.remove(this);
-               }
-               try {
-                       deallocateMemoryOnDevice();
-               } catch (DMLRuntimeException e) {
-                       throw new CacheException(e);
-               }
-       }
-       
-       static Boolean evictionLock = new Boolean(true);
-       
-       protected long getAvailableMemory() {
-               return GPUContext.currContext.getAvailableMemory();
-       }
-       
-//     // Copying from device -> host occurs here
-//     // Called by MatrixObject's exportData
-//     public void exportData() throws CacheException {
-//             boolean isDeviceCopyModified = mat.getGPUObject() != null && 
mat.getGPUObject().isDeviceCopyModified;
-//             boolean isHostCopyUnavailable = mat.getMatrixBlock() == null || 
-//                             (mat.getMatrixBlock().getDenseBlock() == null 
&& mat.getMatrixBlock().getSparseBlock() == null);
-//             
-//             if(mat.getGPUObject() != null && (isDeviceCopyModified || 
isHostCopyUnavailable)) {
-//                     try {
-//                             mat.getGPUObject().copyFromDeviceToHost();
-//                     } catch (DMLRuntimeException e) {
-//                             throw new CacheException(e);
-//                     }
-//             }
-//     }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java
deleted file mode 100644
index bfb823e..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.controlprogram.context;
-
-import java.util.concurrent.atomic.AtomicLong;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
-import org.apache.sysml.utils.Statistics;
-
-import jcuda.driver.JCudaDriver;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.JCudnn;
-import jcuda.runtime.JCuda;
-import jcuda.jcudnn.cudnnHandle;
-import static jcuda.jcudnn.JCudnn.cudnnCreate;
-import static jcuda.jcublas.JCublas2.cublasCreate;
-import static jcuda.jcublas.JCublas2.cublasDestroy;
-import static jcuda.jcudnn.JCudnn.cudnnDestroy;
-import static jcuda.driver.JCudaDriver.cuInit;
-import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
-import static jcuda.runtime.JCuda.cudaMemGetInfo;
-import static jcuda.runtime.cudaError.cudaSuccess;
-
-/**
- * Setup:
- * 1. Install CUDA 7.5
- * 2. Install CuDNN v4 from 
http://developer.download.nvidia.com/compute/redist/cudnn/v4/cudnn-7.0-win-x64-v4.0-prod.zip
- * 3. Download JCuda binaries version 0.7.5b and JCudnn version 0.7.5. Copy 
the DLLs into C:\lib (or /lib) directory. Link: 
http://www.jcuda.org/downloads/downloads.html
- *
- */
-public class JCudaContext extends GPUContext {
-       
-       private static final Log LOG = 
LogFactory.getLog(JCudaContext.class.getName());
-       
-       public static boolean DEBUG = false;
-       
-       public static long totalNumBytes = 0;
-       public static AtomicLong availableNumBytesWithoutUtilFactor = new 
AtomicLong(0);
-       // Fraction of available memory to use. The available memory is 
computer when the JCudaContext is created
-       // to handle the tradeoff on calling cudaMemGetInfo too often. 
-       public static double GPU_MEMORY_UTILIZATION_FACTOR = 0.9; 
-       public static boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = true;
-       
-       static {
-               long start = System.nanoTime();
-               JCuda.setExceptionsEnabled(true);
-               JCudnn.setExceptionsEnabled(true);
-               JCublas2.setExceptionsEnabled(true);
-               JCudaDriver.setExceptionsEnabled(true);
-               cuInit(0); // Initialize the driver
-               // Obtain the number of devices
-        int deviceCountArray[] = { 0 };
-        cuDeviceGetCount(deviceCountArray);
-        int deviceCount = deviceCountArray[0];
-        LOG.info("Total number of GPUs on the machine: " + deviceCount);
-        Statistics.cudaInitTime = System.nanoTime() - start;
-       }
-       
-       public long getAvailableMemory() {
-               if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
-                       long free [] = { 0 };
-               long total [] = { 0 };
-               if(cudaMemGetInfo(free, total) == cudaSuccess) {
-                       totalNumBytes = total[0];
-                       availableNumBytesWithoutUtilFactor.set(free[0]);
-               }
-               else {
-                       throw new RuntimeException("ERROR: Unable to get memory 
information of the GPU.");
-               }
-               }
-               return (long) 
(availableNumBytesWithoutUtilFactor.get()*GPU_MEMORY_UTILIZATION_FACTOR);
-       }
-       
-       
-       public JCudaContext() {
-               if(isGPUContextCreated) {
-                       // Wait until it is deleted. This case happens during 
multi-threaded testing.
-                       // This also allows for multi-threaded execute calls
-                       long startTime = System.currentTimeMillis();
-                       do {
-                               try {
-                                       Thread.sleep(100);
-                               } catch (InterruptedException e) {}
-                       } while(isGPUContextCreated && 
(System.currentTimeMillis() - startTime) < 60000);
-                       synchronized(isGPUContextCreated) {
-                               if(GPUContext.currContext != null) {
-                                       throw new RuntimeException("Cannot 
create multiple JCudaContext. Waited for 10 min to close previous GPUContext");
-                               }
-                       }
-               }
-               GPUContext.currContext = this;
-               
-               long start = System.nanoTime();
-               LibMatrixCUDA.cudnnHandle = new cudnnHandle();
-               cudnnCreate(LibMatrixCUDA.cudnnHandle);
-               LibMatrixCUDA.cublasHandle = new cublasHandle();
-               cublasCreate(LibMatrixCUDA.cublasHandle);
-               Statistics.cudaLibrariesInitTime = System.nanoTime() - start;
-               
-               long free [] = { 0 };
-        long total [] = { 0 };
-        if(cudaMemGetInfo(free, total) == cudaSuccess) {
-               totalNumBytes = total[0];
-               availableNumBytesWithoutUtilFactor.set(free[0]);
-        }
-        else {
-               throw new RuntimeException("ERROR: Unable to get memory 
information of the GPU.");
-        }
-        LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
-        LOG.info("Available GPU memory: " + 
(availableNumBytesWithoutUtilFactor.get()*(1e-6)) + " MB");
-       }
-
-       @Override
-       public void destroy() throws DMLRuntimeException {
-               if(currContext != null) {
-                       synchronized(isGPUContextCreated) {
-                               cudnnDestroy(LibMatrixCUDA.cudnnHandle);
-                               cublasDestroy(LibMatrixCUDA.cublasHandle);
-                               currContext = null;
-                               isGPUContextCreated = false;
-                       }
-               }
-               else if(LibMatrixCUDA.cudnnHandle != null || 
LibMatrixCUDA.cublasHandle != null) {
-                       throw new DMLRuntimeException("Error while destroying 
the GPUContext");
-               }
-       }
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java
deleted file mode 100644
index 5d37909..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.controlprogram.context;
-
-import static jcuda.runtime.JCuda.cudaFree;
-import static jcuda.runtime.JCuda.cudaMalloc;
-import static jcuda.runtime.JCuda.cudaMemcpy;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import jcuda.Pointer;
-import jcuda.Sizeof;
-
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.controlprogram.caching.CacheException;
-import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
-import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
-import org.apache.sysml.runtime.matrix.data.MatrixBlock;
-import org.apache.sysml.utils.Statistics;
-
-public class JCudaObject extends GPUObject {
-       
-       public Pointer jcudaPointer = null;
-       public long numBytes;
-
-       JCudaObject(MatrixObject mat2) {
-               super(mat2);
-       }
-       
-       private void prepare(boolean isInput, int numElemsToAllocate) throws 
DMLRuntimeException {
-               if(jcudaPointer != null) {
-                       // Already allocated on GPU and expected to be in sync
-               }
-               else {
-                       if(isInput) {
-                               if(numElemsToAllocate != -1)
-                                       throw new DMLRuntimeException("Expected 
numElemsToAllocate to be -1 as it is inferred from the input");
-                               // Copy performs allocation
-                               copyFromHostToDevice();
-                       }
-                       else {
-                               // Don't copy just allocate
-                               
ensureFreeSpaceForDenseBlock(numElemsToAllocate);
-                               allocateMemoryOnDevice(numElemsToAllocate);
-                               synchronized(evictionLock) {
-                                       GPUContext.allocatedPointers.add(this);
-                               }
-                       }
-               }
-               numLocks.addAndGet(1);
-       }
-       
-       @Override
-       public void acquireDeviceRead() throws DMLRuntimeException {
-               prepare(true, -1);
-               if(!isAllocated) 
-                       throw new DMLRuntimeException("Expected device data to 
be allocated");
-       }
-       
-       @Override
-       public void acquireDenseDeviceModify(int numElemsToAllocate) throws 
DMLRuntimeException {
-               prepare(false, numElemsToAllocate); 
-               isDeviceCopyModified = true;
-               if(!isAllocated) 
-                       throw new DMLRuntimeException("Expected device data to 
be allocated");
-       }
-       
-       @Override
-       public void acquireHostRead() throws CacheException {
-               if(isAllocated) {
-                       try {
-                               if(isDeviceCopyModified) {
-                                       copyFromDeviceToHost();
-                               }
-                       } catch (DMLRuntimeException e) {
-                               throw new CacheException(e);
-                       }
-               }
-       }
-       
-       @Override
-       public void acquireHostModify() throws CacheException {
-               if(isAllocated) {
-                       try {
-                               if(isDeviceCopyModified) {
-                                       throw new 
DMLRuntimeException("Potential overwrite of GPU data");
-                                       // copyFromDeviceToHost();
-                               }
-                               clearData();
-                       } catch (DMLRuntimeException e) {
-                               throw new CacheException(e);
-                       }
-               }
-       }
-       
-       public void release(boolean isGPUCopyModified) throws CacheException {
-               if(numLocks.addAndGet(-1) < 0) {
-                       throw new CacheException("Redundant release of GPU 
object");
-               }
-               isDeviceCopyModified = isGPUCopyModified;
-       }
-
-       @Override
-       void allocateMemoryOnDevice(int numElemToAllocate) throws 
DMLRuntimeException {
-               if(jcudaPointer == null) {
-                       long start = System.nanoTime();
-                       jcudaPointer = new Pointer();
-                       if(numElemToAllocate == -1 && 
LibMatrixCUDA.isInSparseFormat(mat))
-                               throw new DMLRuntimeException("Sparse format 
not implemented");
-                       else if(numElemToAllocate == -1) {
-                               // Called for dense input
-                               numBytes = 
mat.getNumRows()*mat.getNumColumns()*Sizeof.DOUBLE;
-                               cudaMalloc(jcudaPointer, numBytes);
-                               
JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
-                       }
-                       else {
-                               // Called for dense output
-                               numBytes = numElemToAllocate*Sizeof.DOUBLE;
-                               cudaMalloc(jcudaPointer,  numBytes);
-                               
JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
-                       }
-                       
-                       
Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
-                       Statistics.cudaAllocCount.addAndGet(1);
-
-               }
-               isAllocated = true;
-       }
-       
-       @Override
-       void deallocateMemoryOnDevice() {
-               if(jcudaPointer != null) {
-                       long start = System.nanoTime();
-                       cudaFree(jcudaPointer);
-                       
JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes);
-                       
Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
-                       Statistics.cudaDeAllocCount.addAndGet(1);
-                       
-               }
-               jcudaPointer = null;
-               isAllocated = false;
-               numLocks.set(0);
-       }
-       
-       void ensureFreeSpaceForDenseBlock(int numElem) throws 
DMLRuntimeException {
-               long GPUSize = (Sizeof.DOUBLE) * numElem;
-               if(GPUSize >= getAvailableMemory()) {
-                       evict(GPUSize);
-               }
-       }
-       
-       @Override
-       void copyFromHostToDevice() 
-               throws DMLRuntimeException 
-       {
-               printCaller();
-               long start = System.nanoTime();
-               
-               MatrixBlock tmp = mat.acquireRead();
-               if(tmp.isInSparseFormat()) {
-                       throw new DMLRuntimeException("Sparse matrix is not 
implemented");
-                       // tmp.sparseToDense();
-               }
-               else {
-                       double[] data = tmp.getDenseBlock();
-                       
-                       if( data == null && tmp.getSparseBlock() != null )
-                               throw new DMLRuntimeException("Incorrect 
sparsity calculation");
-                       else if( data==null && tmp.getNonZeros() != 0 )
-                               throw new DMLRuntimeException("MatrixBlock is 
not allocated");
-                       else if( tmp.getNonZeros() == 0 )
-                               data = new 
double[tmp.getNumRows()*tmp.getNumColumns()];
-                       
-                       // Copy dense block
-                       ensureFreeSpaceForDenseBlock(data.length);
-                       allocateMemoryOnDevice(data.length);
-                       synchronized(evictionLock) {
-                               GPUContext.allocatedPointers.add(this);
-                       }
-                       cudaMemcpy(jcudaPointer, Pointer.to(data), 
mat.getNumRows()*mat.getNumColumns() * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
-               }
-               
-               mat.release();
-               
-               Statistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
-               Statistics.cudaToDevCount.addAndGet(1);
-       }
-
-       @Override
-       protected void copyFromDeviceToHost() throws DMLRuntimeException {
-               if(jcudaPointer != null) {
-                       printCaller();
-                       if(LibMatrixCUDA.isInSparseFormat(mat))
-                               throw new DMLRuntimeException("Sparse format 
not implemented");
-                       else {
-                               long start = System.nanoTime();
-                               MatrixBlock tmp = new 
MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), false);
-                               tmp.allocateDenseBlock();
-                               double [] data = tmp.getDenseBlock();
-                               
-                               cudaMemcpy(Pointer.to(data), jcudaPointer, 
data.length * Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
-                               
-                               tmp.recomputeNonZeros();
-                               mat.acquireModify(tmp);
-                               mat.release();
-                               
-                               
Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
-                               Statistics.cudaFromDevCount.addAndGet(1);
-                       }
-               }
-               else {
-                       throw new DMLRuntimeException("Cannot copy from device 
to host as JCuda pointer is not allocated");
-               }
-               isDeviceCopyModified = false;
-       }
-
-       @Override
-       protected long getSizeOnDevice() throws DMLRuntimeException {
-               long GPUSize = 0;
-               int rlen = (int) mat.getNumRows();
-               int clen = (int) mat.getNumColumns();
-
-               if(LibMatrixCUDA.isInSparseFormat(mat)) {
-                       throw new DMLRuntimeException("Sparse format not 
implemented");
-               }
-               else {
-                       GPUSize = (Sizeof.DOUBLE) * (long) (rlen * clen);
-               }
-               return GPUSize;
-       }
-       
-       private String getClassAndMethod(StackTraceElement st) {
-               String [] str = st.getClassName().split("\\.");
-               return str[str.length - 1] + "." + st.getMethodName();
-       }
-       
-       private void printCaller() {
-               if(JCudaContext.DEBUG) {
-                       StackTraceElement[] st = 
Thread.currentThread().getStackTrace();
-                       String ret = getClassAndMethod(st[1]);
-                       for(int i = 2; i < st.length && i < 7; i++) {
-                               ret += "->" + getClassAndMethod(st[i]);
-                       }
-                       System.out.println("CALL_STACK:" + ret);
-               }
-                       
-       }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 4148a7d..1030071 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -191,6 +191,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        outputBlock = getDenseOutputBlock(ec, C * R * S, N * P 
* Q, true);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
                        LibMatrixDNN.im2col(matBlock, outputBlock, params);
+                       outputBlock.setNonZeros(params.outputNNZ.get());
                }
                else if (instOpcode.equalsIgnoreCase("reshape_col")) {
                        checkHeightWidth(ec, params);
@@ -199,6 +200,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        outputBlock = getDenseOutputBlock(ec, N, K * P * Q, 
true);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
                        LibMatrixDNN.reshape_col(matBlock, outputBlock, params);
+                       outputBlock.setNonZeros(matBlock.getNonZeros()); // As 
number of non-zeros doesnot change for reshape_col
                }
                else if (instOpcode.equalsIgnoreCase("rotate180")) {
                        checkHeightWidth(ec, params);
@@ -206,6 +208,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        outputBlock = getDenseOutputBlock(ec, N * P * Q, K, 
true);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
                        LibMatrixDNN.rotate180(matBlock, outputBlock, params);
+                       outputBlock.setNonZeros(matBlock.getNonZeros()); // As 
number of non-zeros doesnot change for rotate180
                }
                else if (instOpcode.equalsIgnoreCase("col2im")) {
                        checkHeightWidth(ec, params);
@@ -213,7 +216,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        // needs to be zeroed-out
                        outputBlock = getDenseOutputBlock(ec, N, C * H * W, 
false);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
-                       LibMatrixDNN.col2im(matBlock, outputBlock, params);
+                       LibMatrixDNN.col2im(matBlock, outputBlock, params); // 
No efficient nnz computation, so setting it to -1
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling")) {
                        // Is eligible for REUSE_NONZEROED_OUTPUT but cannot 
guarantee that previous output has been rmvar-ed
@@ -221,6 +224,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        outputBlock = getDenseOutputBlock(ec, N, C*P*Q, true);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
                        LibMatrixDNN.maxpooling(matBlock, outputBlock, params);
+                       outputBlock.setNonZeros(params.outputNNZ.get());
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
                        MatrixBlock dout = ec.getMatrixInput(_in2.getName());
@@ -228,7 +232,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        // without somewhat expensive HashMap checks
                        outputBlock = getDenseOutputBlock(ec, N, C*H*W, false);
                        params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
-                       LibMatrixDNN.maxpooling_backward(matBlock, dout, 
outputBlock, params);
+                       LibMatrixDNN.maxpooling_backward(matBlock, dout, 
outputBlock, params); // No efficient nnz computation, so setting it to -1
                        ec.releaseMatrixInput(_in2.getName());
                }
                else {
@@ -246,7 +250,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                if(DMLScript.STATISTICS)
                        start = System.nanoTime();
                
-               MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, 
numRows * numCols);
+               MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, 
false, numRows * numCols);
                _reuseNonZeroedOutput = false;
                if(reuseNonZeroedOutput1 && DMLScript.REUSE_NONZEROED_OUTPUT) {
                        _reuseNonZeroedOutput = true;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 618f3b8..ce9646b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -36,7 +36,7 @@ public abstract class GPUInstruction extends Instruction
        protected boolean _requiresLabelUpdate = false;
        
        public GPUInstruction(String opcode, String istr) {
-               type = INSTRUCTION_TYPE.CONTROL_PROGRAM;
+               type = INSTRUCTION_TYPE.GPU;
                instString = istr;
                
                //prepare opcode and update requirement for repeated usage

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
new file mode 100644
index 0000000..0127154
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import java.util.ArrayList;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+
+//FIXME merge JCudaContext into GPUContext as this context is anyway CUDA 
specific
+public abstract class GPUContext {
+
+       public static ArrayList<GPUObject> allocatedPointers = new 
ArrayList<GPUObject>(); 
+       protected static GPUContext currContext;
+       protected GPUContext() { }
+       
+       public static volatile Boolean isGPUContextCreated = false;
+       
+       public abstract long getAvailableMemory();
+       
+       // Creation / Destruction of GPUContext and related handles
+       public static GPUContext createGPUContext() {
+               if(currContext == null && DMLScript.USE_ACCELERATOR) {
+                       // TODO: Handle this thread and resolve concurrency 
related bugs if any
+                       new Thread(new Runnable() {
+                               @Override
+                               public void run() {
+                                       // Lazy GPU context creation
+                                       synchronized(isGPUContextCreated) {
+                                               currContext = new 
JCudaContext();
+                                               
OptimizerUtils.GPU_MEMORY_BUDGET = 
((JCudaContext)currContext).getAvailableMemory();
+                                               isGPUContextCreated = true;
+                                       }
+                               }
+                       }).start();
+               }
+               return currContext;
+       }
+       public static GPUObject createGPUObject(MatrixObject mo) {
+               if(DMLScript.USE_ACCELERATOR) {
+                       synchronized(isGPUContextCreated) {
+                               if(currContext == null)
+                                       throw new RuntimeException("GPUContext 
is not created");
+                               if(currContext instanceof JCudaContext)
+                                       return new JCudaObject(mo);
+                       }
+               }
+               throw new RuntimeException("Cannot create createGPUObject when 
USE_ACCELERATOR is off");
+       }
+       public abstract void destroy() throws DMLRuntimeException;
+       
+       
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
new file mode 100644
index 0000000..45b8c5b
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.CacheException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.utils.Statistics;
+
+//FIXME merge JCudaObject into GPUObject to avoid unnecessary complexity
+public abstract class GPUObject 
+{
+       protected boolean isDeviceCopyModified = false;
+       protected AtomicInteger numLocks = new AtomicInteger(0);
+       protected boolean isInSparseFormat = false;
+       public boolean isAllocated = false;
+       protected MatrixObject mat = null;
+       
+       protected GPUObject(MatrixObject mat2)  {
+               this.mat = mat2;
+       }
+       
+       public boolean isInSparseFormat() {
+               return isInSparseFormat;
+       }
+       
+       public boolean isAllocated() {
+               return isAllocated;
+       }
+       
+       public abstract void acquireDeviceRead() throws DMLRuntimeException;
+       public abstract void acquireDenseDeviceModify(int numElemsToAllocate) 
throws DMLRuntimeException;
+       public abstract void acquireHostRead() throws CacheException;
+       public abstract void acquireHostModify() throws CacheException;
+       public abstract void release(boolean isGPUCopyModified) throws 
CacheException;
+       
+       
+       // package-level visibility as these methods are guarded by underlying 
GPUContext
+       abstract void allocateMemoryOnDevice(int numElemToAllocate) throws 
DMLRuntimeException;
+       abstract void deallocateMemoryOnDevice() throws DMLRuntimeException;
+       abstract long getSizeOnDevice() throws DMLRuntimeException;
+       abstract void copyFromHostToDevice() throws DMLRuntimeException;
+       abstract void copyFromDeviceToHost() throws DMLRuntimeException; // 
Called by export()
+       
+       
+       /**
+        * It finds matrix toBeRemoved such that toBeRemoved.GPUSize >= size
+        * // TODO: it is the smallest matrix size that satisfy the above 
condition. For now just evicting the largest pointer.
+        * Then returns toBeRemoved. 
+        * 
+        */
+       protected void evict(long GPUSize) throws DMLRuntimeException {
+               if(GPUContext.allocatedPointers.size() == 0) {
+                       throw new DMLRuntimeException("There is not enough 
memory on device for this matrix!");
+               }
+               
+               Statistics.cudaEvictionCount.addAndGet(1);
+               
+               synchronized(evictionLock) {
+                       Collections.sort(GPUContext.allocatedPointers, new 
Comparator<GPUObject>() {
+       
+                               @Override
+                               public int compare(GPUObject p1, GPUObject p2) {
+                                       int p1Val = p1.numLocks.get();
+                                       int p2Val = p2.numLocks.get();
+                                       
+                                       if(p1Val < 0 || p2Val < 0) {
+                                               throw new 
RuntimeException("Number of locks cannot be negative");
+                                       }
+                                       else if(p1Val == 0 && p2Val == 0) {
+                                               // Both p1 and p2 are unlocked, 
return largest object
+                                               // TODO: Modify this !!
+                                               long p1Size = 0; long p2Size = 
0;
+                                               try {
+                                                       p1Size = 
p1.getSizeOnDevice();
+                                                       p2Size = 
p2.getSizeOnDevice();
+                                               } catch (DMLRuntimeException e) 
{
+                                                       throw new 
RuntimeException(e);
+                                               }
+                                               if(p1Size == p2Size) {
+                                                       return 0;
+                                               }
+                                               else if(p1Size < p2Size) {
+                                                       return 1;
+                                               }
+                                               else {
+                                                       return -1;
+                                               }
+                                       }
+                                       else if(p1Val > p2Val) {
+                                               // There are more locks on p1
+                                               return 1;
+                                       }
+                                       else {
+                                               // There are more locks on p2
+                                               return -1;
+                                       }
+                               }
+                       });
+                       
+                       
+                       while(GPUSize > getAvailableMemory() && 
GPUContext.allocatedPointers.size() > 0) {
+                               GPUObject toBeRemoved = 
GPUContext.allocatedPointers.get(GPUContext.allocatedPointers.size() - 1);
+                               if(toBeRemoved.numLocks.get() != 0) {
+                                       throw new DMLRuntimeException("There is 
not enough memory on device for this matrix!");
+                               }
+                               if(toBeRemoved.isDeviceCopyModified) {
+                                       toBeRemoved.copyFromDeviceToHost();
+                               }
+                               toBeRemoved.clearData();
+                       }
+               }
+       }
+       
+       public void clearData() throws CacheException {
+               synchronized(evictionLock) {
+                       GPUContext.allocatedPointers.remove(this);
+               }
+               try {
+                       deallocateMemoryOnDevice();
+               } catch (DMLRuntimeException e) {
+                       throw new CacheException(e);
+               }
+       }
+       
+       static Boolean evictionLock = new Boolean(true);
+       
+       protected long getAvailableMemory() {
+               return GPUContext.currContext.getAvailableMemory();
+       }
+       
+//     // Copying from device -> host occurs here
+//     // Called by MatrixObject's exportData
+//     public void exportData() throws CacheException {
+//             boolean isDeviceCopyModified = mat.getGPUObject() != null && 
mat.getGPUObject().isDeviceCopyModified;
+//             boolean isHostCopyUnavailable = mat.getMatrixBlock() == null || 
+//                             (mat.getMatrixBlock().getDenseBlock() == null 
&& mat.getMatrixBlock().getSparseBlock() == null);
+//             
+//             if(mat.getGPUObject() != null && (isDeviceCopyModified || 
isHostCopyUnavailable)) {
+//                     try {
+//                             mat.getGPUObject().copyFromDeviceToHost();
+//                     } catch (DMLRuntimeException e) {
+//                             throw new CacheException(e);
+//                     }
+//             }
+//     }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
new file mode 100644
index 0000000..708badc
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.utils.Statistics;
+
+import jcuda.driver.JCudaDriver;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.JCudnn;
+import jcuda.runtime.JCuda;
+import jcuda.jcudnn.cudnnHandle;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.driver.JCudaDriver.cuInit;
+import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.cudaError.cudaSuccess;
+
+/**
+ * Setup:
+ * 1. Install CUDA 7.5
+ * 2. Install CuDNN v4 from 
http://developer.download.nvidia.com/compute/redist/cudnn/v4/cudnn-7.0-win-x64-v4.0-prod.zip
+ * 3. Download JCuda binaries version 0.7.5b and JCudnn version 0.7.5. Copy 
the DLLs into C:\lib (or /lib) directory. Link: 
http://www.jcuda.org/downloads/downloads.html
+ *
+ */
+public class JCudaContext extends GPUContext {
+       
+       private static final Log LOG = 
LogFactory.getLog(JCudaContext.class.getName());
+       
+       public static boolean DEBUG = false;
+       
+       public static long totalNumBytes = 0;
+       public static AtomicLong availableNumBytesWithoutUtilFactor = new 
AtomicLong(0);
+       // Fraction of available memory to use. The available memory is 
computer when the JCudaContext is created
+       // to handle the tradeoff on calling cudaMemGetInfo too often. 
+       public static double GPU_MEMORY_UTILIZATION_FACTOR = 0.9; 
+       public static boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = true;
+       
+       static {
+               long start = System.nanoTime();
+               JCuda.setExceptionsEnabled(true);
+               JCudnn.setExceptionsEnabled(true);
+               JCublas2.setExceptionsEnabled(true);
+               JCudaDriver.setExceptionsEnabled(true);
+               cuInit(0); // Initialize the driver
+               // Obtain the number of devices
+        int deviceCountArray[] = { 0 };
+        cuDeviceGetCount(deviceCountArray);
+        int deviceCount = deviceCountArray[0];
+        LOG.info("Total number of GPUs on the machine: " + deviceCount);
+        Statistics.cudaInitTime = System.nanoTime() - start;
+       }
+       
+       public long getAvailableMemory() {
+               if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
+                       long free [] = { 0 };
+               long total [] = { 0 };
+               if(cudaMemGetInfo(free, total) == cudaSuccess) {
+                       totalNumBytes = total[0];
+                       availableNumBytesWithoutUtilFactor.set(free[0]);
+               }
+               else {
+                       throw new RuntimeException("ERROR: Unable to get memory 
information of the GPU.");
+               }
+               }
+               return (long) 
(availableNumBytesWithoutUtilFactor.get()*GPU_MEMORY_UTILIZATION_FACTOR);
+       }
+       
+       
+       public JCudaContext() {
+               if(isGPUContextCreated) {
+                       // Wait until it is deleted. This case happens during 
multi-threaded testing.
+                       // This also allows for multi-threaded execute calls
+                       long startTime = System.currentTimeMillis();
+                       do {
+                               try {
+                                       Thread.sleep(100);
+                               } catch (InterruptedException e) {}
+                       } while(isGPUContextCreated && 
(System.currentTimeMillis() - startTime) < 60000);
+                       synchronized(isGPUContextCreated) {
+                               if(GPUContext.currContext != null) {
+                                       throw new RuntimeException("Cannot 
create multiple JCudaContext. Waited for 10 min to close previous GPUContext");
+                               }
+                       }
+               }
+               GPUContext.currContext = this;
+               
+               long start = System.nanoTime();
+               LibMatrixCUDA.cudnnHandle = new cudnnHandle();
+               cudnnCreate(LibMatrixCUDA.cudnnHandle);
+               LibMatrixCUDA.cublasHandle = new cublasHandle();
+               cublasCreate(LibMatrixCUDA.cublasHandle);
+               Statistics.cudaLibrariesInitTime = System.nanoTime() - start;
+               
+               long free [] = { 0 };
+        long total [] = { 0 };
+        if(cudaMemGetInfo(free, total) == cudaSuccess) {
+               totalNumBytes = total[0];
+               availableNumBytesWithoutUtilFactor.set(free[0]);
+        }
+        else {
+               throw new RuntimeException("ERROR: Unable to get memory 
information of the GPU.");
+        }
+        LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
+        LOG.info("Available GPU memory: " + 
(availableNumBytesWithoutUtilFactor.get()*(1e-6)) + " MB");
+       }
+
+       @Override
+       public void destroy() throws DMLRuntimeException {
+               if(currContext != null) {
+                       synchronized(isGPUContextCreated) {
+                               cudnnDestroy(LibMatrixCUDA.cudnnHandle);
+                               cublasDestroy(LibMatrixCUDA.cublasHandle);
+                               currContext = null;
+                               isGPUContextCreated = false;
+                       }
+               }
+               else if(LibMatrixCUDA.cudnnHandle != null || 
LibMatrixCUDA.cublasHandle != null) {
+                       throw new DMLRuntimeException("Error while destroying 
the GPUContext");
+               }
+       }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
new file mode 100644
index 0000000..811f2dd
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.CacheException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.utils.Statistics;
+
+public class JCudaObject extends GPUObject {
+       
+       public Pointer jcudaPointer = null;
+       public long numBytes;
+
+       JCudaObject(MatrixObject mat2) {
+               super(mat2);
+       }
+       
+       private void prepare(boolean isInput, int numElemsToAllocate) throws 
DMLRuntimeException {
+               if(jcudaPointer != null) {
+                       // Already allocated on GPU and expected to be in sync
+               }
+               else {
+                       if(isInput) {
+                               if(numElemsToAllocate != -1)
+                                       throw new DMLRuntimeException("Expected 
numElemsToAllocate to be -1 as it is inferred from the input");
+                               // Copy performs allocation
+                               copyFromHostToDevice();
+                       }
+                       else {
+                               // Don't copy just allocate
+                               
ensureFreeSpaceForDenseBlock(numElemsToAllocate);
+                               allocateMemoryOnDevice(numElemsToAllocate);
+                               synchronized(evictionLock) {
+                                       GPUContext.allocatedPointers.add(this);
+                               }
+                       }
+               }
+               numLocks.addAndGet(1);
+       }
+       
+       @Override
+       public void acquireDeviceRead() throws DMLRuntimeException {
+               prepare(true, -1);
+               if(!isAllocated) 
+                       throw new DMLRuntimeException("Expected device data to 
be allocated");
+       }
+       
+       @Override
+       public void acquireDenseDeviceModify(int numElemsToAllocate) throws 
DMLRuntimeException {
+               prepare(false, numElemsToAllocate); 
+               isDeviceCopyModified = true;
+               if(!isAllocated) 
+                       throw new DMLRuntimeException("Expected device data to 
be allocated");
+       }
+       
+       @Override
+       public void acquireHostRead() throws CacheException {
+               if(isAllocated) {
+                       try {
+                               if(isDeviceCopyModified) {
+                                       copyFromDeviceToHost();
+                               }
+                       } catch (DMLRuntimeException e) {
+                               throw new CacheException(e);
+                       }
+               }
+       }
+       
+       @Override
+       public void acquireHostModify() throws CacheException {
+               if(isAllocated) {
+                       try {
+                               if(isDeviceCopyModified) {
+                                       throw new 
DMLRuntimeException("Potential overwrite of GPU data");
+                                       // copyFromDeviceToHost();
+                               }
+                               clearData();
+                       } catch (DMLRuntimeException e) {
+                               throw new CacheException(e);
+                       }
+               }
+       }
+       
+       public void release(boolean isGPUCopyModified) throws CacheException {
+               if(numLocks.addAndGet(-1) < 0) {
+                       throw new CacheException("Redundant release of GPU 
object");
+               }
+               isDeviceCopyModified = isGPUCopyModified;
+       }
+
+       @Override
+       void allocateMemoryOnDevice(int numElemToAllocate) throws 
DMLRuntimeException {
+               if(jcudaPointer == null) {
+                       long start = System.nanoTime();
+                       jcudaPointer = new Pointer();
+                       if(numElemToAllocate == -1 && 
LibMatrixCUDA.isInSparseFormat(mat))
+                               throw new DMLRuntimeException("Sparse format 
not implemented");
+                       else if(numElemToAllocate == -1) {
+                               // Called for dense input
+                               numBytes = 
mat.getNumRows()*mat.getNumColumns()*Sizeof.DOUBLE;
+                               cudaMalloc(jcudaPointer, numBytes);
+                               
JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+                       }
+                       else {
+                               // Called for dense output
+                               numBytes = numElemToAllocate*Sizeof.DOUBLE;
+                               cudaMalloc(jcudaPointer,  numBytes);
+                               
JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+                       }
+                       
+                       
Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
+                       Statistics.cudaAllocCount.addAndGet(1);
+
+               }
+               isAllocated = true;
+       }
+       
+       @Override
+       void deallocateMemoryOnDevice() {
+               if(jcudaPointer != null) {
+                       long start = System.nanoTime();
+                       cudaFree(jcudaPointer);
+                       
JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes);
+                       
Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
+                       Statistics.cudaDeAllocCount.addAndGet(1);
+                       
+               }
+               jcudaPointer = null;
+               isAllocated = false;
+               numLocks.set(0);
+       }
+       
+       void ensureFreeSpaceForDenseBlock(int numElem) throws 
DMLRuntimeException {
+               long GPUSize = (Sizeof.DOUBLE) * numElem;
+               if(GPUSize >= getAvailableMemory()) {
+                       evict(GPUSize);
+               }
+       }
+       
+       @Override
+       void copyFromHostToDevice() 
+               throws DMLRuntimeException 
+       {
+               printCaller();
+               long start = System.nanoTime();
+               
+               MatrixBlock tmp = mat.acquireRead();
+               if(tmp.isInSparseFormat()) {
+                       throw new DMLRuntimeException("Sparse matrix is not 
implemented");
+                       // tmp.sparseToDense();
+               }
+               else {
+                       double[] data = tmp.getDenseBlock();
+                       
+                       if( data == null && tmp.getSparseBlock() != null )
+                               throw new DMLRuntimeException("Incorrect 
sparsity calculation");
+                       else if( data==null && tmp.getNonZeros() != 0 )
+                               throw new DMLRuntimeException("MatrixBlock is 
not allocated");
+                       else if( tmp.getNonZeros() == 0 )
+                               data = new 
double[tmp.getNumRows()*tmp.getNumColumns()];
+                       
+                       // Copy dense block
+                       ensureFreeSpaceForDenseBlock(data.length);
+                       allocateMemoryOnDevice(data.length);
+                       synchronized(evictionLock) {
+                               GPUContext.allocatedPointers.add(this);
+                       }
+                       cudaMemcpy(jcudaPointer, Pointer.to(data), 
mat.getNumRows()*mat.getNumColumns() * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+               }
+               
+               mat.release();
+               
+               Statistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
+               Statistics.cudaToDevCount.addAndGet(1);
+       }
+
+       @Override
+       protected void copyFromDeviceToHost() throws DMLRuntimeException {
+               if(jcudaPointer != null) {
+                       printCaller();
+                       if(LibMatrixCUDA.isInSparseFormat(mat))
+                               throw new DMLRuntimeException("Sparse format 
not implemented");
+                       else {
+                               long start = System.nanoTime();
+                               MatrixBlock tmp = new 
MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), false);
+                               tmp.allocateDenseBlock();
+                               double [] data = tmp.getDenseBlock();
+                               
+                               cudaMemcpy(Pointer.to(data), jcudaPointer, 
data.length * Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
+                               
+                               tmp.recomputeNonZeros();
+                               mat.acquireModify(tmp);
+                               mat.release();
+                               
+                               
Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
+                               Statistics.cudaFromDevCount.addAndGet(1);
+                       }
+               }
+               else {
+                       throw new DMLRuntimeException("Cannot copy from device 
to host as JCuda pointer is not allocated");
+               }
+               isDeviceCopyModified = false;
+       }
+
+       @Override
+       protected long getSizeOnDevice() throws DMLRuntimeException {
+               long GPUSize = 0;
+               int rlen = (int) mat.getNumRows();
+               int clen = (int) mat.getNumColumns();
+
+               if(LibMatrixCUDA.isInSparseFormat(mat)) {
+                       throw new DMLRuntimeException("Sparse format not 
implemented");
+               }
+               else {
+                       GPUSize = (Sizeof.DOUBLE) * (long) (rlen * clen);
+               }
+               return GPUSize;
+       }
+       
+       private String getClassAndMethod(StackTraceElement st) {
+               String [] str = st.getClassName().split("\\.");
+               return str[str.length - 1] + "." + st.getMethodName();
+       }
+       
+       private void printCaller() {
+               if(JCudaContext.DEBUG) {
+                       StackTraceElement[] st = 
Thread.currentThread().getStackTrace();
+                       String ret = getClassAndMethod(st[1]);
+                       for(int i = 2; i < st.length && i < 7; i++) {
+                               ret += "->" + getClassAndMethod(st[i]);
+                       }
+                       System.out.println("CALL_STACK:" + ret);
+               }
+                       
+       }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 45f68dd..fc1f657 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -49,7 +49,7 @@ import jcuda.jcudnn.cudnnTensorDescriptor;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
-import org.apache.sysml.runtime.controlprogram.context.JCudaObject;
+import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;
 
 //FIXME move could to respective instructions, this is not a block library
 public class LibMatrixCUDA {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index b68a51c..0565136 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -24,6 +24,7 @@ import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
@@ -31,7 +32,7 @@ import org.apache.sysml.runtime.util.ConvolutionUtils;
 
 public class LibMatrixDNN {
 
-       public static boolean ALLOW_MULTI_THREADED_OPS = true;
+       public static final boolean ALLOW_MULTI_THREADED_OPS = true;
        // Using hashmap to avoid any performance impacts of multimap
        private static final ConcurrentHashMap<Integer, 
SoftReference<double[]>> non_zeroed_double_arr = new ConcurrentHashMap<Integer, 
SoftReference<double[]>>();
        private static final int NON_ZEROED_DOUBLE_ARR_THRESHOLD = 100;
@@ -64,6 +65,8 @@ public class LibMatrixDNN {
                public int K; public int R; public int S; public int stride_h; 
public int stride_w; public int pad_h; public int pad_w;
                public int P; public int Q; public int numThreads;
                
+               public AtomicLong outputNNZ = new AtomicLong(-1);
+               
                MatrixBlock input1; MatrixBlock input2; MatrixBlock output;
                boolean reuseNonZeroedOutput = false;
                
@@ -219,6 +222,8 @@ public class LibMatrixDNN {
                        throw new DMLRuntimeException("Incorrect input 
dimensions in maxpooling:" + input.getNumRows() + " " + input.getNumColumns() + 
" " + params.N + " " + params.K*params.P*params.Q);
                }
                
+               params.outputNNZ.set(0);
+               
                int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
                if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
                        for (int n = 0; n < params.N; n++) {
@@ -240,6 +245,7 @@ public class LibMatrixDNN {
                if (!params.output.isInSparseFormat())
                        outputArray = params.output.getDenseBlock();
                
+               long tmpNNZ = 0;
                for (int p = 0; p < params.P; p++) {
                        for (int q = 0; q < params.Q; q++) {
                                int start_index_h = p * params.stride_h - 
params.pad_h;
@@ -258,10 +264,13 @@ public class LibMatrixDNN {
                                                else
                                                        inVal = 
params.input1.quickGetValue(n, c*params.H*params.W +  h*params.W + w);
                                                outputArray[out_index] = 
Math.max(outputArray[out_index], inVal);
+                                               if(outputArray[out_index] != 0)
+                                                       tmpNNZ++;
                                        }
                                }
                        }
                }
+               params.outputNNZ.addAndGet(tmpNNZ);
        }
                
        // Reshape a 4D tensor of dimension (N, K, P, Q) to matrix of dimension 
(K, NPQ)
@@ -436,6 +445,7 @@ public class LibMatrixDNN {
                params.input1 = input;
                params.output = outputBlock;
                
+               params.outputNNZ.set(0);
                int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
                if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
                        for (int n = 0; n < params.N; n++) { // Do following 
for all images
@@ -525,6 +535,7 @@ public class LibMatrixDNN {
                final int inputOffset = n*params.C*params.H*params.W + 
c*params.H*params.W;
                final int outputOffset = (c*params.R*params.S*params.N + 
n)*params.P*params.Q;
                
+               long tmpNNZ = 0;
                for (int r = 0; r < params.R; r++) { // Get an input patch of 
size R X S
                        for (int s = 0; s < params.S; s++) {
                                int localIndex = outputOffset + 
((r*params.S*params.N + s*params.N)*params.P*params.Q);
@@ -541,6 +552,8 @@ public class LibMatrixDNN {
                                                                        
outputArray[localIndex] = inputArray[inputOffset + input_row*params.W + 
input_col];
                                                                else
                                                                        
outputArray[localIndex] = params.input1.quickGetValue(n, c*params.H*params.W + 
input_row*params.W + input_col);
+                                                               
if(outputArray[localIndex] != 0)
+                                                                       
tmpNNZ++;
                                                        }
                                                        else 
if(params.reuseNonZeroedOutput) {
                                                                
outputArray[localIndex] = 0;
@@ -560,5 +573,6 @@ public class LibMatrixDNN {
                        }
                }
                
+               params.outputNNZ.addAndGet(tmpNNZ);
        }
 }

Reply via email to