[FIX] Fixed nested parfor for GPUs

Additionally:
- Fixed the IntelliJ code style accordingly
- Fixed formatting of some GPU-related source files

Closes #532
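
Note: the reworked lifecycle reserves every available device up front and
later hands one context per parfor worker. A minimal sketch of the driver
flow, assuming the API introduced in this patch (reserveAllGPUContexts,
freeAllGPUContexts, setGPUContexts, getGPUContext(i)):

    // Reserve all devices once, bind device 0 to the main thread.
    List<GPUContext> gCtxs = GPUContextPool.reserveAllGPUContexts();
    ec.setGPUContexts(gCtxs);
    ec.getGPUContext(0).initializeThread();
    try {
        rtprog.execute(ec); // a nested parfor assigns context i to worker i
    } finally {
        // Clear per-context scratch space, then release every reserved device.
        ec.getGPUContexts().forEach(GPUContext::clearTemporaryMemory);
        GPUContextPool.freeAllGPUContexts();
    }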


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f5871756
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f5871756
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f5871756

Branch: refs/heads/master
Commit: f587175647a84a3825b174b4d29c0398be17331f
Parents: 0bcae49
Author: Nakul Jindal <[email protected]>
Authored: Sat Jun 10 12:06:47 2017 -0700
Committer: Nakul Jindal <[email protected]>
Committed: Sat Jun 10 12:06:47 2017 -0700

----------------------------------------------------------------------
 dev/code-style/systemml-style-intellij.xml      |   37 +-
 .../apache/sysml/api/ScriptExecutorUtils.java   |   17 +-
 .../controlprogram/ParForProgramBlock.java      |   19 +-
 .../context/ExecutionContext.java               |   64 +-
 .../controlprogram/parfor/LocalParWorker.java   |    2 +-
 .../cp/FunctionCallCPInstruction.java           |   12 +-
 .../gpu/AggregateBinaryGPUInstruction.java      |    4 +-
 .../gpu/AggregateUnaryGPUInstruction.java       |    2 +-
 .../gpu/ConvolutionGPUInstruction.java          |   18 +-
 .../instructions/gpu/MMTSJGPUInstruction.java   |    2 +-
 .../gpu/MatrixBuiltinGPUInstruction.java        |   30 +-
 .../MatrixMatrixArithmeticGPUInstruction.java   |    2 +-
 .../gpu/MatrixMatrixAxpyGPUInstruction.java     |    2 +-
 .../gpu/MatrixMatrixBuiltinGPUInstruction.java  |    2 +-
 .../instructions/gpu/ReorgGPUInstruction.java   |    2 +-
 .../ScalarMatrixArithmeticGPUInstruction.java   |    2 +-
 .../instructions/gpu/context/CSRPointer.java    |  922 ++++++-------
 .../gpu/context/ExecutionConfig.java            |   85 +-
 .../instructions/gpu/context/GPUContext.java    | 1257 +++++++++---------
 .../gpu/context/GPUContextPool.java             |  266 ++--
 .../instructions/gpu/context/GPUObject.java     |  454 ++++---
 .../instructions/gpu/context/JCudaKernels.java  |  141 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |   42 +-
 .../runtime/matrix/data/LibMatrixDNNHelper.java |    1 +
 .../org/apache/sysml/test/gpu/GPUTests.java     |   47 +-
 .../sysml/test/gpu/NeuralNetworkOpTests.java    |  106 +-
 26 files changed, 1917 insertions(+), 1621 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/dev/code-style/systemml-style-intellij.xml
----------------------------------------------------------------------
diff --git a/dev/code-style/systemml-style-intellij.xml b/dev/code-style/systemml-style-intellij.xml
index 1ad3209..b4a53b4 100644
--- a/dev/code-style/systemml-style-intellij.xml
+++ b/dev/code-style/systemml-style-intellij.xml
@@ -1,28 +1,27 @@
 <!--
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
 -->
-
 <code_scheme name="SystemML Format">
   <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
   <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
   <option name="IMPORT_LAYOUT_TABLE">
     <value>
-      <package name="" withSubpackages="true" static="false" />
+      <package name="" withSubpackages="true" static="true" />
       <emptyLine />
       <package name="java" withSubpackages="true" static="false" />
       <emptyLine />
@@ -32,7 +31,7 @@
       <emptyLine />
       <package name="com" withSubpackages="true" static="false" />
       <emptyLine />
-      <package name="" withSubpackages="true" static="true" />
+      <package name="" withSubpackages="true" static="false" />
     </value>
   </option>
   <codeStyleSettings language="JAVA">

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index 674a011..2895aa4 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysml.api;
 
+import java.util.List;
+
 import org.apache.sysml.api.mlcontext.ScriptExecutor;
 import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.codegen.SpoofCompiler;
@@ -79,23 +81,22 @@ public class ScriptExecutorUtils {
                // GPUs
                GPUContextPool.PER_PROCESS_MAX_GPUS = dmlconf.getIntValue(DMLConfig.MAX_GPUS_PER_PROCESS);
                Statistics.startRunTimer();
-               GPUContext gCtx = null;
                try {
                        // run execute (w/ exception handling to ensure proper shutdown)
                        if (DMLScript.USE_ACCELERATOR && ec != null) {
-                               gCtx = GPUContextPool.getFromPool();
-                               if (gCtx == null) {
+                               List<GPUContext> gCtxs = GPUContextPool.reserveAllGPUContexts();
+                               if (gCtxs == null) {
                                        throw new DMLRuntimeException(
                                                        "GPU : Could not create GPUContext, either no GPU or all GPUs currently in use");
                                }
-                               gCtx.initializeThread();
-                               ec.setGPUContext(gCtx);
+                               gCtxs.get(0).initializeThread();
+                               ec.setGPUContexts(gCtxs);
                        }
                        rtprog.execute(ec);
                } finally { // ensure cleanup/shutdown
-                       if (DMLScript.USE_ACCELERATOR && ec.getGPUContext() != null) {
-                               ec.getGPUContext().clearTemporaryMemory();
-                               GPUContextPool.returnToPool(ec.getGPUContext());
+                       if (DMLScript.USE_ACCELERATOR && !ec.getGPUContexts().isEmpty()) {
+                               ec.getGPUContexts().forEach(gCtx -> gCtx.clearTemporaryMemory());
+                               GPUContextPool.freeAllGPUContexts();
                        }
                        if (dmlconf.getBooleanValue(DMLConfig.CODEGEN))
                                SpoofCompiler.cleanupCodeGenerator();

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
index c9dcc22..95e28e7 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
@@ -632,9 +632,7 @@ public class ParForProgramBlock extends ForProgramBlock
                        {
                                case LOCAL: //create parworkers as local threads
                                        if (DMLScript.USE_ACCELERATOR) {
-                                               GPUContextPool.returnToPool(ec.getGPUContext());
-                                               ec.setGPUContext(null);
-                                               setDegreeOfParallelism(GPUContextPool.getDeviceCount());
+                                               setDegreeOfParallelism(ec.getNumGPUContexts());
                                        }
                                        executeLocalParFor(ec, iterVar, from, to, incr);
                                        break;
@@ -757,7 +755,7 @@ public class ParForProgramBlock extends ForProgramBlock
                        {
                                //create parallel workers as (lazy) deep copies
                                //including preparation of update-in-place variables
-                               workers[i] = createParallelWorker( _pwIDs[i], queue, ec );
+                               workers[i] = createParallelWorker( _pwIDs[i], queue, ec, i);
                                threads[i] = new Thread( workers[i] );
                                threads[i].setPriority(Thread.MAX_PRIORITY); 
                        }
@@ -833,11 +831,9 @@ public class ParForProgramBlock extends ForProgramBlock
                        // the main thread to use the GPUContext
                        if (DMLScript.USE_ACCELERATOR) {
                                for (int i = 0; i < _numThreads; i++) {
-                                       GPUContext gCtx = workers[i].getExecutionContext().getGPUContext();
-                                       GPUContextPool.returnToPool(gCtx);
+                                       workers[i].getExecutionContext().setGPUContexts(null);
                                }
-                               ec.setGPUContext(GPUContextPool.getFromPool());
-                               ec.getGPUContext().initializeThread();
+                               ec.getGPUContext(0).initializeThread();
                        }
                }
                finally 
@@ -1386,10 +1382,11 @@ public class ParForProgramBlock extends ForProgramBlock
         * @param pwID parworker id
         * @param queue task queue
         * @param ec execution context
+        * @param index the index of the worker
         * @return local parworker
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec)
+       private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec, int index)
                throws DMLRuntimeException
        {
                LocalParWorker pw = null; 
@@ -1420,9 +1417,9 @@ public class ParForProgramBlock extends ForProgramBlock
                        ExecutionContext cpEc = ProgramConverter.createDeepCopyExecutionContext(ec);
 
                        // If GPU mode is enabled, gets a GPUContext from the pool of GPUContexts
-                       // and sets it in the ExecutionContext
+                       // and sets it in the ExecutionContext of the parfor
                        if (DMLScript.USE_ACCELERATOR){
-                               cpEc.setGPUContext(GPUContextPool.getFromPool());
+                               cpEc.setGPUContexts(Arrays.asList(ec.getGPUContext(index)));
                        }
                        
                        //prepare basic update-in-place variables (vars dropped on result merge)
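
Note on the parfor wiring above: the parent ExecutionContext now keeps the
full context list and worker i simply borrows entry i; nothing is returned
to the pool mid-run. Condensed sketch (the deep copy actually happens inside
createParallelWorker):

    setDegreeOfParallelism(ec.getNumGPUContexts());
    for (int i = 0; i < _numThreads; i++) {
        // inside createParallelWorker: deep-copy ec, then hand the copy
        // exactly one context via Arrays.asList(ec.getGPUContext(i))
        workers[i] = createParallelWorker(_pwIDs[i], queue, ec, i);
    }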

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 735f394..fb179f5 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -20,7 +20,9 @@
 package org.apache.sysml.runtime.controlprogram.context;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashMap;
+import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -66,7 +68,10 @@ public class ExecutionContext {
        //debugging (optional)
        protected DebugState _dbState = null;
 
-    protected GPUContext _gpuContext = null;
+       /**
+        * List of {@link GPUContext}s owned by this {@link ExecutionContext}
+        */
+    protected List<GPUContext> _gpuContexts = new ArrayList<>();
 
        protected ExecutionContext()
        {
@@ -99,13 +104,42 @@ public class ExecutionContext {
                _variables = vars;
        }
 
-    public GPUContext getGPUContext() {
-        return _gpuContext;
+       /**
+        * Get the i-th GPUContext
+        * @param index index of the GPUContext
+        * @return a valid GPUContext or null if the indexed GPUContext does not exist.
+        */
+    public GPUContext getGPUContext(int index) {
+       try {
+                       return _gpuContexts.get(index);
+               } catch (IndexOutOfBoundsException e){
+               return null;
+               }
     }
 
-    public void setGPUContext(GPUContext _gpuContext) {
-        this._gpuContext = _gpuContext;
-    }
+       /**
+        * Sets the list of GPUContexts
+        * @param gpuContexts a collection of GPUContexts
+        */
+       public void setGPUContexts(List<GPUContext> gpuContexts){
+               _gpuContexts = gpuContexts;
+       }
+
+       /**
+        * Gets the list of GPUContexts
+        * @return a list of GPUContexts
+        */
+       public List<GPUContext> getGPUContexts() {
+               return _gpuContexts;
+       }
+
+       /**
+        * Gets the number of GPUContexts
+        * @return number of GPUContexts
+        */
+       public int getNumGPUContexts() {
+       return _gpuContexts.size();
+       }
 
        /* -------------------------------------------------------
         * Methods to handle variables and associated data objects
@@ -238,7 +272,7 @@ public class ExecutionContext {
                throws DMLRuntimeException 
        {       
                MatrixObject mo = allocateGPUMatrixObject(varName);
-               boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifyDense();
+               boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense();
                mo.getMatrixCharacteristics().setNonZeros(-1);
                return new Pair<MatrixObject, Boolean>(mo, allocated);
        }
@@ -257,7 +291,7 @@ public class ExecutionContext {
     {
         MatrixObject mo = allocateGPUMatrixObject(varName);
         mo.getMatrixCharacteristics().setNonZeros(nnz);
-                               boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifySparse();
+                               boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse();
         return new Pair<MatrixObject, Boolean>(mo, allocated);
     } 
 
@@ -269,12 +303,12 @@ public class ExecutionContext {
         */
        public MatrixObject allocateGPUMatrixObject(String varName) throws DMLRuntimeException {
                MatrixObject mo = getMatrixObject(varName);
-               if( mo.getGPUObject(getGPUContext()) == null ) {
-                       GPUObject newGObj = getGPUContext().createGPUObject(mo);
+               if( mo.getGPUObject(getGPUContext(0)) == null ) {
+                       GPUObject newGObj = getGPUContext(0).createGPUObject(mo);
                        // The lock is added here for an output block
                        // so that any block currently in use is not deallocated by eviction on the GPU
                        newGObj.addLock();
-                       mo.setGPUObject(getGPUContext(), newGObj);
+                       mo.setGPUObject(getGPUContext(0), newGObj);
                }
                return mo;
        }
@@ -282,7 +316,7 @@ public class ExecutionContext {
        public Pair<MatrixObject, Boolean> getMatrixInputForGPUInstruction(String varName)
                        throws DMLRuntimeException 
        {
-               GPUContext gCtx = getGPUContext();
+               GPUContext gCtx = getGPUContext(0);
                boolean copied = false;
                MatrixObject mo = getMatrixObject(varName);
                if(mo == null) {
@@ -322,7 +356,7 @@ public class ExecutionContext {
                throws DMLRuntimeException 
        {
                MatrixObject mo = getMatrixObject(varName);
-               mo.getGPUObject(getGPUContext()).releaseInput();
+               mo.getGPUObject(getGPUContext(0)).releaseInput();
        }
        
        /**
@@ -374,10 +408,10 @@ public class ExecutionContext {
        
        public void releaseMatrixOutputForGPUInstruction(String varName) throws DMLRuntimeException {
                MatrixObject mo = getMatrixObject(varName);
-               if(mo.getGPUObject(getGPUContext()) == null || !mo.getGPUObject(getGPUContext()).isAllocated()) {
+               if(mo.getGPUObject(getGPUContext(0)) == null || !mo.getGPUObject(getGPUContext(0)).isAllocated()) {
                        throw new DMLRuntimeException("No output is allocated on GPU");
                }
-               mo.getGPUObject(getGPUContext()).releaseOutput();
+               mo.getGPUObject(getGPUContext(0)).releaseOutput();
        }
 
        public void setMatrixOutput(String varName, MatrixBlock outputData) 
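
Note the contract of the new indexed getter above: an out-of-range index
yields null instead of propagating IndexOutOfBoundsException. An equivalent
bounds-checked formulation (hypothetical alternative, not part of the patch):

    public GPUContext getGPUContext(int index) {
        // explicit bounds check instead of catching IndexOutOfBoundsException
        return (_gpuContexts != null && index >= 0 && index < _gpuContexts.size())
                ? _gpuContexts.get(index) : null;
    }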

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
index c4684ec..636b1f8 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
@@ -83,7 +83,7 @@ public class LocalParWorker extends ParWorker implements Runnable
 
                // Initialize this GPUContext to this thread
                if (DMLScript.USE_ACCELERATOR)
-                       _ec.getGPUContext().initializeThread();
+                       _ec.getGPUContext(0).initializeThread();
                
                //setup compiler config for worker thread
                ConfigurationManager.setLocalConfig(_cconf);

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
index 9cc6bb2..3cd2633 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
@@ -168,9 +168,9 @@ public class FunctionCallCPInstruction extends CPInstruction
                // and copy the function arguments into the created table. 
                ExecutionContext fn_ec = ExecutionContextFactory.createContext(false, ec.getProgram());
                if (DMLScript.USE_ACCELERATOR) {
-                       fn_ec.setGPUContext(ec.getGPUContext());
-                       ec.setGPUContext(null);
-                       fn_ec.getGPUContext().initializeThread();
+                       fn_ec.setGPUContexts(ec.getGPUContexts());
+                       ec.setGPUContexts(null);
+                       fn_ec.getGPUContext(0).initializeThread();
                }
                fn_ec.setVariables(functionVariables);
                // execute the function block
@@ -206,9 +206,9 @@ public class FunctionCallCPInstruction extends CPInstruction
                ec.unpinVariables(_boundInputParamNames, pinStatus);
 
                if (DMLScript.USE_ACCELERATOR) {
-                       ec.setGPUContext(fn_ec.getGPUContext());
-                       fn_ec.setGPUContext(null);
-                       ec.getGPUContext().initializeThread();
+                       ec.setGPUContexts(fn_ec.getGPUContexts());
+                       fn_ec.setGPUContexts(null);
+                       ec.getGPUContext(0).initializeThread();
                }
                
                // add the updated binding for each return variable to the variables in original symbol table
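
The two hunks above form a symmetric hand-off: the caller's GPU contexts move
to the function's ExecutionContext before the body runs and move back after,
so exactly one ExecutionContext owns the contexts at any time. Condensed as a
hypothetical helper (illustration only, not part of the patch):

    static void handOffGPUContexts(ExecutionContext from, ExecutionContext to) {
        to.setGPUContexts(from.getGPUContexts()); // transfer ownership of the list
        from.setGPUContexts(null);                // source relinquishes all contexts
        to.getGPUContext(0).initializeThread();   // re-bind device to current thread
    }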

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index 2531c17..0c0a4b2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -96,7 +96,7 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction
                int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns());
 
                ec.setMetaData(_output.getName(), rlen, clen);
-               LibMatrixCUDA.matmult(ec, ec.getGPUContext(), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
+               LibMatrixCUDA.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
         
                //release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
@@ -113,6 +113,6 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction
        @SuppressWarnings("unused")
        private boolean isSparse(ExecutionContext ec, String var) throws DMLRuntimeException {
                MatrixObject mo = ec.getMatrixObject(var);
-               return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(), mo);
+               return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(0), mo);
        }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
index 8bdd397..5d01820 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
@@ -93,7 +93,7 @@ public class AggregateUnaryGPUInstruction extends GPUInstruction {
       ec.setMetaData(_output.getName(), rlen, 1);
     }
 
-    LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr);
+    LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr);
 
     //release inputs/outputs
     ec.releaseMatrixInputForGPUInstruction(_input1.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 9d4cd1f..e5ea097 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -186,9 +186,9 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
                MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
                if(instOpcode.equalsIgnoreCase("bias_add"))
-                       LibMatrixCUDA.biasAdd(ec.getGPUContext(), getExtendedOpcode(), input, bias, out);
+                       LibMatrixCUDA.biasAdd(ec.getGPUContext(0), getExtendedOpcode(), input, bias, out);
                else if(instOpcode.equalsIgnoreCase("bias_multiply"))
-                       LibMatrixCUDA.biasMultiply(ec.getGPUContext(), getExtendedOpcode(), input, bias, out);
+                       LibMatrixCUDA.biasMultiply(ec.getGPUContext(0), getExtendedOpcode(), input, bias, out);
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -202,7 +202,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                
                MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
                ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
-               LibMatrixCUDA.reluBackward(ec.getGPUContext(), getExtendedOpcode(), input, dout, out);
+               LibMatrixCUDA.reluBackward(ec.getGPUContext(0), getExtendedOpcode(), input, dout, out);
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -253,7 +253,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        
                        ec.setMetaData(_output.getName(), N, K * P * Q);
                        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.conv2d(ec.getGPUContext(), getExtendedOpcode(), image, filter, out, N, C, H, W,
+                       LibMatrixCUDA.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
@@ -268,7 +268,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        
                        ec.setMetaData(_output.getName(), N, K * P * Q);
                        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
+                       LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
                                                K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
@@ -283,7 +283,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        
                        ec.setMetaData(_output.getName(), K, C * R * S);
                        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W,
+                       LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                        // TODO: For now always copy the device data to host
                        // ec.gpuCtx.copyDeviceToHost(outputBlock);
@@ -300,7 +300,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        
                        ec.setMetaData(_output.getName(), N, C * H * W);
                        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W,
+                       LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling")) {
@@ -313,7 +313,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        ec.setMetaData(_output.getName(), N, C * P * Q);
                        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
                        if(instOpcode.equalsIgnoreCase("maxpooling"))
-                               LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
+                               LibMatrixCUDA.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
@@ -328,7 +328,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        
                        ec.setMetaData(_output.getName(), N, C * H * W);
                        MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W,
+                       LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else {

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
index c147a6f..55656f0 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
@@ -107,7 +107,7 @@ public class MMTSJGPUInstruction extends GPUInstruction
 
                 //execute operations 
                 ec.setMetaData(_output.getName(), rlen, clen);
-                LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
+                LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
                 
                 ec.releaseMatrixInputForGPUInstruction(_input.getName());
                 ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index 7b50285..beeacee 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -44,35 +44,35 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
 
                switch(opcode) {
                        case "sel+":
-                               LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "exp":
-                               LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.exp(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "sqrt":
-                               LibMatrixCUDA.sqrt(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.sqrt(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "log":
-                               LibMatrixCUDA.log(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.log(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "round":
-                               LibMatrixCUDA.round(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.round(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "floor":
-                               LibMatrixCUDA.floor(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.floor(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "ceil":
-                               LibMatrixCUDA.ceil(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.ceil(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "abs":
-                               LibMatrixCUDA.abs(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.abs(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "sin":
-                               LibMatrixCUDA.sin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.sin(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "cos":
-                               LibMatrixCUDA.cos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.cos(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "tan":
-                               LibMatrixCUDA.tan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.tan(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "asin":
-                               LibMatrixCUDA.asin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.asin(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "acos":
-                               LibMatrixCUDA.acos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.acos(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "atan":
-                               LibMatrixCUDA.atan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.atan(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        case "sign":
-                               LibMatrixCUDA.sign(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCUDA.sign(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
                        default:
                                throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
index 9573a1a..a03f9b1 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
@@ -71,7 +71,7 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
                ec.setMetaData(_output.getName(), (int)rlen, (int)clen);
                
                BinaryOperator bop = (BinaryOperator) _optr;
-               LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
+               LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
                
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
index 58905d6..e430e29 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
@@ -103,7 +103,7 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi
                                        " and input2:" + rlen2 + " X " + clen2);
                }
                
-               LibMatrixCUDA.axpy(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
+               LibMatrixCUDA.axpy(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
                
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
index 8936735..e60a3d7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
@@ -46,7 +46,7 @@ public class MatrixMatrixBuiltinGPUInstruction extends BuiltinBinaryGPUInstructi
 
     if(opcode.equals("solve")) {
       ec.setMetaData(output.getName(), mat1.getNumColumns(), 1);
-      LibMatrixCUDA.solve(ec, ec.getGPUContext(), getExtendedOpcode(), mat1, mat2, output.getName());
+      LibMatrixCUDA.solve(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName());
 
     } else {
       throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
index 53d56a3..bc63d12 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
@@ -81,7 +81,7 @@ public class ReorgGPUInstruction extends GPUInstruction
                
                //execute operation
                ec.setMetaData(_output.getName(), rlen, clen);
-               LibMatrixCUDA.transpose(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
+               LibMatrixCUDA.transpose(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName());
                
                //release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
index 64cb6c4..ea4665a 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
@@ -60,7 +60,7 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
                ScalarOperator sc_op = (ScalarOperator) _optr;
                sc_op.setConstant(constant.getDoubleValue());
                
-               LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
+               LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
                
                ec.releaseMatrixInputForGPUInstruction(mat.getName());
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 0ff9d14..b15dd69 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -52,453 +52,477 @@ import jcuda.jcusparse.cusparsePointerMode;
  */
 public class CSRPointer {
 
-  private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName());
-
-  private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
-
-  /** {@link GPUContext} instance to track the GPU to do work on */
-  private final GPUContext gpuContext;
-
-  public static cusparseMatDescr matrixDescriptor;
-
-  /** Number of non zeroes */
-  public long nnz;
-
-  /** double array of non zero values */
-  public Pointer val;
-
-  /** integer array of start of all rows and end of last row + 1 */
-  public Pointer rowPtr;
-
-  /** integer array of nnz values' column indices */
-  public Pointer colInd;
-
-  /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported */
-  public cusparseMatDescr descr;
-
-
-  public CSRPointer clone(int rows) throws DMLRuntimeException {
-    CSRPointer me = this;
-    CSRPointer that = new CSRPointer(me.getGPUContext());
-
-    that.allocateMatDescrPointer();
-    long totalSize = estimateSize(me.nnz, rows);
-    that.gpuContext.ensureFreeSpace(totalSize);
-
-    that.nnz = me.nnz;
-    that.val = allocate(that.nnz * Sizeof.DOUBLE);
-    that.rowPtr = allocate(rows * Sizeof.DOUBLE);
-    that.colInd = allocate(that.nnz * Sizeof.DOUBLE);
-
-    cudaMemcpy(that.val, me.val, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-    cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-    cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-
-    return that;
-  }
-
-  /**
-   * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)}
-   * @param gCtx   a valid {@link GPUContext}
-   */
-  private CSRPointer(GPUContext gCtx) {
-    gpuContext = gCtx;
-    val = new Pointer();
-    rowPtr = new Pointer();
-    colInd = new Pointer();
-    allocateMatDescrPointer();
-  }
-
-//  private Pointer allocate(String instName, long size) throws DMLRuntimeException {
-//    return getGPUContext().allocate(instName, size);
-//  }
-
-  private Pointer allocate(long size) throws DMLRuntimeException {
-    return getGPUContext().allocate(size);
-  }
-
-//  private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
-//    getGPUContext().cudaFreeHelper(toFree);
-//  }
-
-  private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
-    getGPUContext().cudaFreeHelper(toFree, eager);
-  }
-
-//  private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
-//    getGPUContext().cudaFreeHelper(instName, toFree, eager);
-//  }
-
-  private static long getDoubleSizeOf(long numElems) {
-    return numElems * ((long)jcuda.Sizeof.DOUBLE);
-  }
-
-  private static long getIntSizeOf(long numElems) {
-    return numElems * ((long)jcuda.Sizeof.INT);
-  }
-
-  private GPUContext getGPUContext() {
-    return gpuContext;
-  }
-
-  public static int toIntExact(long l) throws DMLRuntimeException {
-    if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
-      throw new DMLRuntimeException("Cannot be cast to int:" + l);
-    }
-    return (int) l;
-  }
-
-  /**
-   * @return Singleton default matrix descriptor object
-   * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
-   */
-  public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
-    if (matrixDescriptor == null) {
-      // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
-      matrixDescriptor = new cusparseMatDescr();
-      cusparseCreateMatDescr(matrixDescriptor);
-      cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
-      cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
-    }
-    return matrixDescriptor;
-  }
-
-  /**
-   * Estimate the size of a CSR matrix in GPU memory
-   * Size of pointers is not needed and is not added in
-   *
-   * @param nnz2 number of non zeroes
-   * @param rows number of rows
-   * @return size estimate
-   */
-  public static long estimateSize(long nnz2, long rows) {
-    long sizeofValArray = getDoubleSizeOf(nnz2);
-    long sizeofRowPtrArray = getIntSizeOf(rows + 1);
-    long sizeofColIndArray = getIntSizeOf(nnz2);
-    long sizeofDescr = getIntSizeOf(4);
-    // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as:
-    // typedef struct {
-    //         cusparseMatrixType_t MatrixType;
-    // cusparseFillMode_t FillMode;
-    // cusparseDiagType_t DiagType;
-    //         cusparseIndexBase_t IndexBase;
-    // } cusparseMatDescr_t;
-    long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
-    return tot;
-  }
-
-  /**
-   * Static method to copy a CSR sparse matrix from Host to Device
-   *
-   * @param dest   [input] destination location (on GPU)
-   * @param rows   number of rows
-   * @param nnz    number of non-zeroes
-   * @param rowPtr integer array of row pointers
-   * @param colInd integer array of column indices
-   * @param values double array of non zero values
-   */
-  public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
-    CSRPointer r = dest;
-    long t0 = 0;
-    if (DMLScript.STATISTICS) t0 = System.nanoTime();
-    r.nnz = nnz;
-    cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
-    cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
-    cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(3);
-  }
-
-  /**
-   * Static method to copy a CSR sparse matrix from Device to host
-   *
-   * @param src    [input] source location (on GPU)
-   * @param rows   [input] number of rows
-   * @param nnz    [input] number of non-zeroes
-   * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
-   * @param colInd [output] pre-allocated integer array of column indices of size nnz
-   * @param values [output] pre-allocated double array of values of size nnz
-   */
-  public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
-    CSRPointer r = src;
-    long t0 = 0;
-    if (DMLScript.STATISTICS) t0 = System.nanoTime();
-    cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
-    cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
-    cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(3);
-  }
-
-  /**
-   * Estimates the number of non zero elements from the results of a sparse cusparseDgeam operation
-   * C = a op(A) + b op(B)
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param A      Sparse Matrix A on GPU
-   * @param B      Sparse Matrix B on GPU
-   * @param m      Rows in A
-   * @param n      Columns in Bs
-   * @return CSR (compressed sparse row) pointer
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n)
-          throws DMLRuntimeException {
-    if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
-      throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
-    }
-    CSRPointer C = new CSRPointer(gCtx);
-    step1AllocateRowPointers(gCtx, handle, C, m);
-    step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
-    step3AllocateValNInd(gCtx, handle, C);
-    return C;
-  }
-
-  /**
-   * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
-   * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
-   *
-   * @param gCtx ?
-   * @param handle a valid {@link cusparseHandle}
-   * @param A      Sparse Matrix A on GPU
-   * @param transA 'T' if A is to be transposed, 'N' otherwise
-   * @param B      Sparse Matrix B on GPU
-   * @param transB 'T' if B is to be transposed, 'N' otherwise
-   * @param m      Rows in A
-   * @param n      Columns in B
-   * @param k      Columns in A / Rows in B
-   * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k)
-          throws DMLRuntimeException {
-    // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
-    // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
-    CSRPointer C = new CSRPointer(gCtx);
-    step1AllocateRowPointers(gCtx, handle, C, m);
-    step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
-    step3AllocateValNInd(gCtx, handle, C);
-    return C;
-  }
-
-  /**
-   * Check for ultra sparsity
-   *
-   * @param rows number of rows
-   * @param cols number of columns
-   * @return true if ultra sparse
-   */
-  public boolean isUltraSparse(int rows, int cols) {
-    double sp = ((double) nnz / rows / cols);
-    return sp < ULTRA_SPARSITY_TURN_POINT;
-  }
-
-// ==============================================================================================
-
-// The following methods estimate the memory needed for sparse matrices that are
-// results of operations on other sparse matrices using the cuSparse Library.
-// The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs
-// op = whether to transpose or not
-// binaryOperation = For cuSparse, +, - are *(matmul) are supported
-
-// From CuSparse Manual,
-// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
-// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
-// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
-// and the total number of nonzero elements. In the second step, the user gathers nnzC
-//(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
-// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
-// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
-// to complete matrix C.
-
-  /**
-   * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
-   * the default that works for DGEMM.
-   */
-  private void allocateMatDescrPointer() {
-    this.descr = getDefaultCuSparseMatrixDescriptor();
-  }
-
-  /**
-   * Factory method to allocate an empty CSR Sparse matrix on the GPU
-   *
-   * @param gCtx ?
-   * @param nnz2 number of non-zeroes
-   * @param rows number of rows
-   * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException {
-    LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
-    assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
-    CSRPointer r = new CSRPointer(gCtx);
-    r.nnz = nnz2;
-    if (nnz2 == 0) {
-      // The convention for an empty sparse matrix is to just have an instance 
of the CSRPointer object
-      // with no memory allocated on the GPU.
-      return r;
-    }
-    gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + 
getIntSizeOf(nnz2));
-    // increment the cudaCount by 1 for the allocation of all 3 arrays
-    r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
-    r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
-    r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
-    return r;
-  }
-
-  /**
-   * Allocate row pointers of m+1 elements
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param C      Output matrix
-   * @param rowsC  number of rows in C
-   * @throws DMLRuntimeException ?
-   */
-  private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle 
handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
-    LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
-    cusparseSetPointerMode(handle, 
cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
-    //cudaDeviceSynchronize;
-    // Do not increment the cudaCount of allocations on GPU
-    C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1));
-  }
-
-  /**
-   * Determine the total number of nonzero elements for the cusparseDgeam operation.
-   * This is done from either (nnzC=*nnzTotalDevHostPtr) or 
(nnzC=csrRowPtrC(m)-csrRowPtrC(0))
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param A      Sparse Matrix A on GPU
-   * @param B      Sparse Matrix B on GPU
-   * @param C      Output Sparse Matrix C on GPU
-   * @param m      Rows in C
-   * @param n      Columns in C
-   * @throws DMLRuntimeException ?
-   */
-  private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle 
handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws 
DMLRuntimeException {
-    LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
-    int[] CnnzArray = {-1};
-    cusparseXcsrgeamNnz(handle, m, n,
-            A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
-            B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
-            C.descr, C.rowPtr, Pointer.to(CnnzArray));
-    //cudaDeviceSynchronize;
-    if (CnnzArray[0] != -1) {
-      C.nnz = CnnzArray[0];
-    } else {
-      int baseArray[] = {0};
-      cudaMemcpy(Pointer.to(CnnzArray), 
C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), 
cudaMemcpyDeviceToHost);
-      cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), 
cudaMemcpyDeviceToHost);
-      C.nnz = CnnzArray[0] - baseArray[0];
-    }
-  }
-
-// 
==============================================================================================
-
-  /**
-   * Determine the total number of nonzero elements for the cusparseDgemm operation.
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param A      Sparse Matrix A on GPU
-   * @param transA op - whether A is transposed
-   * @param B      Sparse Matrix B on GPU
-   * @param transB op - whether B is transposed
-   * @param C      Output Sparse Matrix C on GPU
-   * @param m      Number of rows of sparse matrix op ( A ) and C
-   * @param n      Number of columns of sparse matrix op ( B ) and C
-   * @param k      Number of columns/rows of sparse matrix op ( A ) / op ( B )
-   * @throws DMLRuntimeException ?
-   */
-  private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle 
handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int 
m, int n, int k) throws DMLRuntimeException {
-    LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
-    int[] CnnzArray = {-1};
-    if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
-      throw new DMLRuntimeException("Number of non zeroes is larger than 
supported by cuSparse");
-    }
-    cusparseXcsrgemmNnz(handle, transA, transB, m, n, k,
-            A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
-            B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
-            C.descr, C.rowPtr, Pointer.to(CnnzArray));
-    //cudaDeviceSynchronize;
-    if (CnnzArray[0] != -1) {
-      C.nnz = CnnzArray[0];
-    } else {
-      int baseArray[] = {0};
-      cudaMemcpy(Pointer.to(CnnzArray), 
C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), 
cudaMemcpyDeviceToHost);
-      cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), 
cudaMemcpyDeviceToHost);
-      C.nnz = CnnzArray[0] - baseArray[0];
-    }
-  }
-
-  /**
-   * Allocate val and index pointers.
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param C      Output sparse matrix on GPU
-   * @throws DMLRuntimeException ?
-   */
-  private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle 
handle, CSRPointer C) throws DMLRuntimeException {
-    LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
-    // Increment cudaCount by one when all three arrays of CSR sparse array 
are allocated
-    C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
-    C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
-  }
-
-  /**
-   * Copies this CSR matrix on the GPU to a dense column-major matrix
-   * on the GPU. This is a temporary matrix for operations such as
-   * cusparseDcsrmv.
-   * Since the allocated matrix is temporary, bookkeeping is not updated.
-   * The caller is responsible for calling "free" on the returned Pointer 
object
-   *
-   * @param cusparseHandle a valid {@link cusparseHandle}
-   * @param cublasHandle   a valid {@link cublasHandle}
-   * @param rows           number of rows in this CSR matrix
-   * @param cols           number of columns in this CSR matrix
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   * @return A {@link Pointer} to the allocated dense matrix (in column-major 
format)
-   */
-  public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, 
cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException {
-    LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + 
this + ", GPUContext=" + getGPUContext());
-    long size = ((long) rows) * getDoubleSizeOf((long) cols);
-    Pointer A = allocate(size);
-    // If this sparse block is empty, the allocated dense matrix, initialized 
to zeroes, will be returned.
-    if (val != null && rowPtr != null && colInd != null && nnz > 0) {
-      // Note: cusparseDcsr2dense method cannot handle empty blocks
-      cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, 
colInd, A, rows);
-      //cudaDeviceSynchronize;
-    } else {
-      LOG.debug("in CSRPointer, the values array, row pointers array or column 
indices array was null");
-    }
-    return A;
-  }
-
-  /**
-   * Calls cudaFree lazily on the allocated {@link Pointer} instances
-   * 
-   * @throws DMLRuntimeException ?
-   */
-  public void deallocate() throws DMLRuntimeException {
-    deallocate(false);
-  }
-
-  /**
-   * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} 
instances
-   *
-   * @param eager whether to do eager or lazy cudaFrees
-   * @throws DMLRuntimeException ?
-   */
-  public void deallocate(boolean eager) throws DMLRuntimeException {
-    if (nnz > 0) {
-      cudaFreeHelper(val, eager);
-      cudaFreeHelper(rowPtr, eager);
-      cudaFreeHelper(colInd, eager);
-    }
-  }
-
-  @Override
-  public String toString() {
-    return "CSRPointer{" +
-            "nnz=" + nnz +
-            '}';
-  }
+       private static final Log LOG = 
LogFactory.getLog(CSRPointer.class.getName());
+
+       private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
+       public static cusparseMatDescr matrixDescriptor;
+       /**
+        * {@link GPUContext} instance that tracks the GPU on which to do work
+        */
+       private final GPUContext gpuContext;
+       /**
+        * Number of non zeroes
+        */
+       public long nnz;
+
+       /**
+        * double array of non zero values
+        */
+       public Pointer val;
+
+       /**
+        * integer array of start of all rows and end of last row + 1
+        */
+       public Pointer rowPtr;
+
+       /**
+        * integer array of nnz values' column indices
+        */
+       public Pointer colInd;
+
+       /**
+        * descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported
+        */
+       public cusparseMatDescr descr;
+
+       /**
+        * Default constructor to help with Factory method {@link 
#allocateEmpty(GPUContext, long, long)}
+        *
+        * @param gCtx a valid {@link GPUContext}
+        */
+       private CSRPointer(GPUContext gCtx) {
+               gpuContext = gCtx;
+               val = new Pointer();
+               rowPtr = new Pointer();
+               colInd = new Pointer();
+               allocateMatDescrPointer();
+       }
+
+       private static long getDoubleSizeOf(long numElems) {
+               return numElems * ((long) jcuda.Sizeof.DOUBLE);
+       }
+
+       //  private Pointer allocate(String instName, long size) throws 
DMLRuntimeException {
+       //    return getGPUContext().allocate(instName, size);
+       //  }
+
+       private static long getIntSizeOf(long numElems) {
+               return numElems * ((long) jcuda.Sizeof.INT);
+       }
+
+       //  private void cudaFreeHelper(Pointer toFree) throws 
DMLRuntimeException {
+       //    getGPUContext().cudaFreeHelper(toFree);
+       //  }
+
+       public static int toIntExact(long l) throws DMLRuntimeException {
+               if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
+                       throw new DMLRuntimeException("Cannot be cast to int: " + l);
+               }
+               return (int) l;
+       }
+
+       //  private void cudaFreeHelper(String instName, Pointer toFree, 
boolean eager) throws DMLRuntimeException {
+       //    getGPUContext().cudaFreeHelper(instName, toFree, eager);
+       //  }
+
+       /**
+        * @return Singleton default matrix descriptor object
+        * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
+        */
+       public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
+               if (matrixDescriptor == null) {
+                       // Code from JCuda Samples - 
http://www.jcuda.org/samples/JCusparseSample.java
+                       matrixDescriptor = new cusparseMatDescr();
+                       cusparseCreateMatDescr(matrixDescriptor);
+                       cusparseSetMatType(matrixDescriptor, 
CUSPARSE_MATRIX_TYPE_GENERAL);
+                       cusparseSetMatIndexBase(matrixDescriptor, 
CUSPARSE_INDEX_BASE_ZERO);
+               }
+               return matrixDescriptor;
+       }
+
+       /**
+        * Estimates the size of a CSR matrix in GPU memory.
+        * The size of the {@link Pointer} objects themselves is not included.
+        *
+        * @param nnz2 number of non zeroes
+        * @param rows number of rows
+        * @return size estimate
+        */
+       public static long estimateSize(long nnz2, long rows) {
+               long sizeofValArray = getDoubleSizeOf(nnz2);
+               long sizeofRowPtrArray = getIntSizeOf(rows + 1);
+               long sizeofColIndArray = getIntSizeOf(nnz2);
+               long sizeofDescr = getIntSizeOf(4);
+               // From the CUSPARSE documentation, the cusparseMatDescr in 
native code is represented as:
+               // typedef struct {
+               //      cusparseMatrixType_t MatrixType;
+               //      cusparseFillMode_t FillMode;
+               //      cusparseDiagType_t DiagType;
+               //      cusparseIndexBase_t IndexBase;
+               // } cusparseMatDescr_t;
+               long tot = sizeofValArray + sizeofRowPtrArray + 
sizeofColIndArray + sizeofDescr;
+               return tot;
+       }
+
+       /**
+        * Static method to copy a CSR sparse matrix from Host to Device
+        *
+        * @param dest   [input] destination location (on GPU)
+        * @param rows   number of rows
+        * @param nnz    number of non-zeroes
+        * @param rowPtr integer array of row pointers
+        * @param colInd integer array of column indices
+        * @param values double array of non zero values
+        */
+       public static void copyToDevice(CSRPointer dest, int rows, long nnz, 
int[] rowPtr, int[] colInd, double[] values) {
+               CSRPointer r = dest;
+               long t0 = 0;
+               if (DMLScript.STATISTICS)
+                       t0 = System.nanoTime();
+               r.nnz = nnz;
+               cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 
1), cudaMemcpyHostToDevice);
+               cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), 
cudaMemcpyHostToDevice);
+               cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), 
cudaMemcpyHostToDevice);
+               if (DMLScript.STATISTICS)
+                       GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() 
- t0);
+               if (DMLScript.STATISTICS)
+                       GPUStatistics.cudaToDevCount.addAndGet(3);
+       }
+
+       /**
+        * Static method to copy a CSR sparse matrix from Device to host
+        *
+        * @param src    [input] source location (on GPU)
+        * @param rows   [input] number of rows
+        * @param nnz    [input] number of non-zeroes
+        * @param rowPtr [output] pre-allocated integer array of row pointers 
of size (rows+1)
+        * @param colInd [output] pre-allocated integer array of column indices 
of size nnz
+        * @param values [output] pre-allocated double array of values of size 
nnz
+        */
+       public static void copyToHost(CSRPointer src, int rows, long nnz, int[] 
rowPtr, int[] colInd, double[] values) {
+               CSRPointer r = src;
+               long t0 = 0;
+               if (DMLScript.STATISTICS)
+                       t0 = System.nanoTime();
+               cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 
1), cudaMemcpyDeviceToHost);
+               cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), 
cudaMemcpyDeviceToHost);
+               cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), 
cudaMemcpyDeviceToHost);
+               if (DMLScript.STATISTICS)
+                       
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
+               if (DMLScript.STATISTICS)
+                       GPUStatistics.cudaFromDevCount.addAndGet(3);
+       }
+
+       /**
+        * Estimates the number of non-zero elements from the result of a sparse cusparseDgeam operation
+        * C = a op(A) + b op(B)
+        *
+        * @param gCtx   a valid {@link GPUContext}
+        * @param handle a valid {@link cusparseHandle}
+        * @param A      Sparse Matrix A on GPU
+        * @param B      Sparse Matrix B on GPU
+        * @param m      Rows in A
+        * @param n      Columns in B
+        * @return CSR (compressed sparse row) pointer
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public static CSRPointer allocateForDgeam(GPUContext gCtx, 
cusparseHandle handle, CSRPointer A, CSRPointer B, int m,
+                       int n) throws DMLRuntimeException {
+               if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+                       throw new DMLRuntimeException("Number of non zeroes is 
larger than supported by cuSparse");
+               }
+               CSRPointer C = new CSRPointer(gCtx);
+               step1AllocateRowPointers(gCtx, handle, C, m);
+               step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
+               step3AllocateValNInd(gCtx, handle, C);
+               return C;
+       }
+
+       /**
+        * Estimates the number of non-zero elements from the result of a 
sparse matrix multiplication C = A * B
+        * and returns the {@link CSRPointer} to C with the appropriate GPU 
memory.
+        *
+        * @param gCtx   a valid {@link GPUContext}
+        * @param handle a valid {@link cusparseHandle}
+        * @param A      Sparse Matrix A on GPU
+        * @param transA 'T' if A is to be transposed, 'N' otherwise
+        * @param B      Sparse Matrix B on GPU
+        * @param transB 'T' if B is to be transposed, 'N' otherwise
+        * @param m      Rows in A
+        * @param n      Columns in B
+        * @param k      Columns in A / Rows in B
+        * @return a {@link CSRPointer} instance that encapsulates the CSR 
matrix on GPU
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, 
cusparseHandle handle, CSRPointer A, int transA,
+                       CSRPointer B, int transB, int m, int n, int k) throws 
DMLRuntimeException {
+               // Following the code example at 
http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
+               // 
https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
+               CSRPointer C = new CSRPointer(gCtx);
+               step1AllocateRowPointers(gCtx, handle, C, m);
+               step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, 
k);
+               step3AllocateValNInd(gCtx, handle, C);
+               return C;
+       }
+
+       /**
+        * Factory method to allocate an empty CSR Sparse matrix on the GPU
+        *
+        * @param gCtx a valid {@link GPUContext}
+        * @param nnz2 number of non-zeroes
+        * @param rows number of rows
+        * @return a {@link CSRPointer} instance that encapsulates the CSR 
matrix on GPU
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long 
rows) throws DMLRuntimeException {
+               LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
+               assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
+               CSRPointer r = new CSRPointer(gCtx);
+               r.nnz = nnz2;
+               if (nnz2 == 0) {
+                       // The convention for an empty sparse matrix is to just 
have an instance of the CSRPointer object
+                       // with no memory allocated on the GPU.
+                       return r;
+               }
+               gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows 
+ 1) + getIntSizeOf(nnz2));
+               // increment the cudaCount by 1 for the allocation of all 3 
arrays
+               r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
+               r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
+               r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+               return r;
+       }
+
+       /**
+        * Allocate row pointers of m+1 elements
+        *
+        * @param gCtx   a valid {@link GPUContext}
+        * @param handle a valid {@link cusparseHandle}
+        * @param C      Output matrix
+        * @param rowsC  number of rows in C
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       private static void step1AllocateRowPointers(GPUContext gCtx, 
cusparseHandle handle, CSRPointer C, int rowsC)
+                       throws DMLRuntimeException {
+               LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + 
gCtx);
+               cusparseSetPointerMode(handle, 
cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
+               //cudaDeviceSynchronize;
+               // Do not increment the cudaCount of allocations on GPU
+               C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1));
+       }
+
+       /**
+        * Determine the total number of nonzero elements for the cusparseDgeam operation.
+        * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0))
+        *
+        * @param gCtx   a valid {@link GPUContext}
+        * @param handle a valid {@link cusparseHandle}
+        * @param A      Sparse Matrix A on GPU
+        * @param B      Sparse Matrix B on GPU
+        * @param C      Output Sparse Matrix C on GPU
+        * @param m      Rows in C
+        * @param n      Columns in C
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle 
handle, CSRPointer A, CSRPointer B,
+                       CSRPointer C, int m, int n) throws DMLRuntimeException {
+               LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", 
GPUContext=" + gCtx);
+               int[] CnnzArray = { -1 };
+               cusparseXcsrgeamNnz(handle, m, n, A.descr, toIntExact(A.nnz), 
A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz),
+                               B.rowPtr, B.colInd, C.descr, C.rowPtr, 
Pointer.to(CnnzArray));
+               //cudaDeviceSynchronize;
+               if (CnnzArray[0] != -1) {
+                       C.nnz = CnnzArray[0];
+               } else {
+                       int baseArray[] = { 0 };
+                       cudaMemcpy(Pointer.to(CnnzArray), 
C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
+                                       cudaMemcpyDeviceToHost);
+                       cudaMemcpy(Pointer.to(baseArray), C.rowPtr, 
getIntSizeOf(1), cudaMemcpyDeviceToHost);
+                       C.nnz = CnnzArray[0] - baseArray[0];
+               }
+       }
+
+       /**
+        * Determine the total number of nonzero elements for the cusparseDgemm operation.
+        *
+        * @param gCtx   a valid {@link GPUContext}
+        * @param handle a valid {@link cusparseHandle}
+        * @param A      Sparse Matrix A on GPU
+        * @param transA op - whether A is transposed
+        * @param B      Sparse Matrix B on GPU
+        * @param transB op - whether B is transposed
+        * @param C      Output Sparse Matrix C on GPU
+        * @param m      Number of rows of sparse matrix op ( A ) and C
+        * @param n      Number of columns of sparse matrix op ( B ) and C
+        * @param k      Number of columns/rows of sparse matrix op ( A ) / op 
( B )
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle 
handle, CSRPointer A, int transA,
+                       CSRPointer B, int transB, CSRPointer C, int m, int n, 
int k) throws DMLRuntimeException {
+               LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", 
GPUContext=" + gCtx);
+               int[] CnnzArray = { -1 };
+               if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+                       throw new DMLRuntimeException("Number of non zeroes is 
larger than supported by cuSparse");
+               }
+               cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, 
toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr,
+                               toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, 
C.rowPtr, Pointer.to(CnnzArray));
+               //cudaDeviceSynchronize;
+               if (CnnzArray[0] != -1) {
+                       C.nnz = CnnzArray[0];
+               } else {
+                       int baseArray[] = { 0 };
+                       cudaMemcpy(Pointer.to(CnnzArray), 
C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
+                                       cudaMemcpyDeviceToHost);
+                       cudaMemcpy(Pointer.to(baseArray), C.rowPtr, 
getIntSizeOf(1), cudaMemcpyDeviceToHost);
+                       C.nnz = CnnzArray[0] - baseArray[0];
+               }
+       }
+
+       /**
+        * Allocate val and index pointers.
+        *
+        * @param gCtx   a valid {@link GPUContext}
+        * @param handle a valid {@link cusparseHandle}
+        * @param C      Output sparse matrix on GPU
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       private static void step3AllocateValNInd(GPUContext gCtx, 
cusparseHandle handle, CSRPointer C)
+                       throws DMLRuntimeException {
+               LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + 
gCtx);
+               // Increment cudaCount by one when all three arrays of CSR 
sparse array are allocated
+               C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
+               C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
+       }
+
+       // ==============================================================================================
+
+       // The following methods estimate the memory needed for sparse matrices that are
+       // results of operations on other sparse matrices using the cuSparse Library.
+       // The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs
+       // op = whether to transpose or not
+       // binaryOperation = For cuSparse, +, -, and * (matmul) are supported
+
+       // From the cuSPARSE manual:
+       // Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
+       // to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
+       // elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
+       // and the total number of nonzero elements. In the second step, the user gathers nnzC
+       // (number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
+       // or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
+       // nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
+       // to complete matrix C.
+
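+       /**
+        * Deep-copies this CSR matrix on the GPU: allocates fresh val, rowPtr and
+        * colInd arrays on the same {@link GPUContext} and fills them with
+        * device-to-device copies of this matrix's data.
+        *
+        * @param rows number of rows in this CSR matrix
+        * @return a new {@link CSRPointer} holding a copy of this one's data
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */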
+       public CSRPointer clone(int rows) throws DMLRuntimeException {
+               CSRPointer me = this;
+               CSRPointer that = new CSRPointer(me.getGPUContext());
+
+               that.allocateMatDescrPointer();
+               long totalSize = estimateSize(me.nnz, rows);
+               that.gpuContext.ensureFreeSpace(totalSize);
+
+               that.nnz = me.nnz;
+               // val holds nnz doubles, rowPtr holds rows+1 ints, colInd holds nnz ints
+               that.val = allocate(getDoubleSizeOf(that.nnz));
+               that.rowPtr = allocate(getIntSizeOf(rows + 1));
+               that.colInd = allocate(getIntSizeOf(that.nnz));
+
+               cudaMemcpy(that.val, me.val, getDoubleSizeOf(that.nnz), cudaMemcpyDeviceToDevice);
+               cudaMemcpy(that.rowPtr, me.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToDevice);
+               cudaMemcpy(that.colInd, me.colInd, getIntSizeOf(that.nnz), cudaMemcpyDeviceToDevice);
+
+               return that;
+       }
+
+       private Pointer allocate(long size) throws DMLRuntimeException {
+               return getGPUContext().allocate(size);
+       }
+
+       private void cudaFreeHelper(Pointer toFree, boolean eager) throws 
DMLRuntimeException {
+               getGPUContext().cudaFreeHelper(toFree, eager);
+       }
+
+       private GPUContext getGPUContext() {
+               return gpuContext;
+       }
+
+       // ==============================================================================================
+
+       /**
+        * Check for ultra sparsity
+        *
+        * @param rows number of rows
+        * @param cols number of columns
+        * @return true if ultra sparse
+        */
+       public boolean isUltraSparse(int rows, int cols) {
+               double sp = ((double) nnz / rows / cols);
+               return sp < ULTRA_SPARSITY_TURN_POINT;
+       }
+
+       /**
+        * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
+        * the default that works for DGEMM.
+        */
+       private void allocateMatDescrPointer() {
+               this.descr = getDefaultCuSparseMatrixDescriptor();
+       }
+
+       /**
+        * Copies this CSR matrix on the GPU to a dense column-major matrix
+        * on the GPU. This is a temporary matrix for operations such as
+        * cusparseDcsrmv.
+        * Since the allocated matrix is temporary, bookkeeping is not updated.
+        * The caller is responsible for calling "free" on the returned Pointer 
object
+        *
+        * @param cusparseHandle a valid {@link cusparseHandle}
+        * @param cublasHandle   a valid {@link cublasHandle}
+        * @param rows           number of rows in this CSR matrix
+        * @param cols           number of columns in this CSR matrix
+        * @return A {@link Pointer} to the allocated dense matrix (in 
column-major format)
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, 
cublasHandle cublasHandle, int rows,
+                       int cols) throws DMLRuntimeException {
+               LOG.trace("GPU : sparse -> column major dense (inside 
CSRPointer) on " + this + ", GPUContext="
+                               + getGPUContext());
+               long size = ((long) rows) * getDoubleSizeOf((long) cols);
+               Pointer A = allocate(size);
+               // If this sparse block is empty, the allocated dense matrix, 
initialized to zeroes, will be returned.
+               if (val != null && rowPtr != null && colInd != null && nnz > 0) 
{
+                       // Note: cusparseDcsr2dense method cannot handle empty 
blocks
+                       cusparseDcsr2dense(cusparseHandle, rows, cols, descr, 
val, rowPtr, colInd, A, rows);
+                       //cudaDeviceSynchronize;
+               } else {
+                       LOG.debug("in CSRPointer, the values array, row 
pointers array or column indices array was null");
+               }
+               return A;
+       }
+
+       /**
+        * Calls cudaFree lazily on the allocated {@link Pointer} instances
+        *
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public void deallocate() throws DMLRuntimeException {
+               deallocate(false);
+       }
+
+       /**
+        * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} 
instances
+        *
+        * @param eager whether to do eager or lazy cudaFrees
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public void deallocate(boolean eager) throws DMLRuntimeException {
+               if (nnz > 0) {
+                       cudaFreeHelper(val, eager);
+                       cudaFreeHelper(rowPtr, eager);
+                       cudaFreeHelper(colInd, eager);
+               }
+       }
+
+       @Override
+       public String toString() {
+               return "CSRPointer{" + "nnz=" + nnz + '}';
+       }
 }
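
To make the val, rowPtr and colInd fields of CSRPointer concrete, here is a
small worked example of the CSR encoding as a self-contained sketch; the 3x4
matrix and the class name are illustrative and not part of this commit:

    /** Hypothetical sketch, not from this commit: CSR encoding of a 3x4 matrix. */
    public class CsrLayoutSketch {
        public static void main(String[] args) {
            // Dense:             CSR (zero-based, matching CUSPARSE_INDEX_BASE_ZERO):
            // [ 10  0  0 20 ]    val    = {10, 20, 30, 40, 50}
            // [  0 30  0  0 ]    colInd = { 0,  3,  1,  0,  2}
            // [ 40  0 50  0 ]    rowPtr = { 0,  2,  3,  5}  // rows+1 entries
            double[] val = { 10, 20, 30, 40, 50 };
            int[] colInd = { 0, 3, 1, 0, 2 };
            int[] rowPtr = { 0, 2, 3, 5 };
            // Row i owns entries rowPtr[i] .. rowPtr[i+1]-1; e.g. row 2 has 5 - 3 = 2 entries.
            for (int i = 0; i < rowPtr.length - 1; i++)
                for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++)
                    System.out.println("A[" + i + "][" + colInd[j] + "] = " + val[j]);
        }
    }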

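The step1AllocateRowPointers/step2GatherNNZGeam/step3AllocateValNInd helpers
implement the two-step csrgeam pattern quoted from the cuSPARSE manual above.
The following standalone sketch shows the same pattern against raw JCuda for
C = A + B; the class name, the method shape and the reuse of one matrix
descriptor for A, B and C are assumptions made for illustration (CSRPointer
itself routes allocations through GPUContext and also handles the device-side
nnz fallback):

    import static jcuda.jcusparse.JCusparse.*;
    import static jcuda.jcusparse.cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST;
    import static jcuda.runtime.JCuda.cudaMalloc;

    import jcuda.Pointer;
    import jcuda.Sizeof;
    import jcuda.jcusparse.cusparseHandle;
    import jcuda.jcusparse.cusparseMatDescr;

    /** Hypothetical sketch, not from this commit. */
    public class CsrGeamSketch {
        /**
         * Computes C = A + B for two m x n CSR matrices already on the GPU and
         * returns {valC, rowPtrC, colIndC}; the caller must cudaFree all three.
         */
        static Pointer[] addCsr(cusparseHandle handle, cusparseMatDescr descr, int m, int n,
                int nnzA, Pointer valA, Pointer rowPtrA, Pointer colIndA,
                int nnzB, Pointer valB, Pointer rowPtrB, Pointer colIndB) {
            // Step 1: allocate the m+1 row pointers of the output matrix C
            Pointer rowPtrC = new Pointer();
            cudaMalloc(rowPtrC, (long) (m + 1) * Sizeof.INT);

            // Step 2: gather nnz(C) on the host (the class above also has a
            // fallback that reads csrRowPtrC(m) - csrRowPtrC(0) from the device)
            cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST);
            int[] nnzC = { -1 };
            cusparseXcsrgeamNnz(handle, m, n,
                    descr, nnzA, rowPtrA, colIndA,
                    descr, nnzB, rowPtrB, colIndB,
                    descr, rowPtrC, Pointer.to(nnzC));

            // Step 3: allocate val and colInd of C, then run the actual addition
            Pointer valC = new Pointer(), colIndC = new Pointer();
            cudaMalloc(valC, (long) nnzC[0] * Sizeof.DOUBLE);
            cudaMalloc(colIndC, (long) nnzC[0] * Sizeof.INT);
            Pointer one = Pointer.to(new double[] { 1.0 });
            cusparseDcsrgeam(handle, m, n,
                    one, descr, nnzA, valA, rowPtrA, colIndA,
                    one, descr, nnzB, valB, rowPtrB, colIndB,
                    descr, valC, rowPtrC, colIndC);
            return new Pointer[] { valC, rowPtrC, colIndC };
        }
    }

In CSRPointer itself, steps 1-3 are exactly what allocateForDgeam performs; the
cusparseDcsrgeam call proper is issued later by the caller with the allocated C.
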
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index ce5c5ff..ef000c2 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -31,25 +31,24 @@ import jcuda.driver.CUstream;
  * Java Wrapper to specify CUDA execution configuration for launching custom 
kernels
  */
 public class ExecutionConfig {
-       public int gridDimX; 
+       public int gridDimX;
        public int gridDimY = 1;
        public int gridDimZ = 1;
-       public int blockDimX; 
+       public int blockDimX;
        public int blockDimY = 1;
        public int blockDimZ = 1;
        public int sharedMemBytes = 0;
        public CUstream stream = null;
 
-
-       
        private static HashMap<Integer, Integer> maxBlockDimForDevice = new 
HashMap<Integer, Integer>();
 
        /**
         * Convenience constructor for setting the number of blocks, number of 
threads and the
         * shared memory size
-        * @param gridDimX                                      Number of 
blocks (for CUDA Kernel)
-        * @param blockDimX                                     Number of 
threads per block (for CUDA Kernel)
-        * @param sharedMemBytes                Amount of Shared memory (for 
CUDA Kernel)
+        *
+        * @param gridDimX       Number of blocks (for CUDA Kernel)
+        * @param blockDimX      Number of threads per block (for CUDA Kernel)
+        * @param sharedMemBytes Amount of Shared memory (for CUDA Kernel)
         */
        public ExecutionConfig(int gridDimX, int blockDimX, int sharedMemBytes) 
{
                this.gridDimX = gridDimX;
@@ -58,13 +57,13 @@ public class ExecutionConfig {
        }
 
        /**
-        * Use this for simple vector operations and use following in the 
kernel 
-        * <code> 
-        * int index = blockIdx.x * blockDim.x + threadIdx.x 
+        * Use this for simple vector operations and use the following in the kernel:
+        * <code>
+        * int index = blockIdx.x * blockDim.x + threadIdx.x
         * </code>
-        * 
+        * <p>
         * This tries to schedule as few grid blocks as possible.
-        * 
+        *
         * @param numCells number of cells
         * @return execution configuration
         * @throws DMLRuntimeException if DMLRuntimeException occurs
@@ -72,19 +71,19 @@ public class ExecutionConfig {
        public static ExecutionConfig getConfigForSimpleVectorOperations(int 
numCells) throws DMLRuntimeException {
                int deviceNumber = 0;
                int blockDimX = getMaxBlockDim(deviceNumber);
-               int gridDimX = (int)Math.ceil((double)numCells / blockDimX);
+               int gridDimX = (int) Math.ceil((double) numCells / blockDimX);
                return new ExecutionConfig(gridDimX, blockDimX);
        }
-       
+
        /**
-        * Use this for simple matrix operations and use following in the 
kernel 
-        * <code> 
+        * Use this for simple matrix operations and use the following in the kernel:
+        * <code>
         * int ix = blockIdx.x * blockDim.x + threadIdx.x;
         * int iy = blockIdx.y * blockDim.y + threadIdx.y;
         * </code>
-        * 
+        * <p>
         * This tries to schedule as few grid blocks as possible.
-        * 
+        *
         * @param rlen number of rows
         * @param clen number of columns
         * @return execution configuration
@@ -94,45 +93,45 @@ public class ExecutionConfig {
                int deviceNumber = 0;
                int maxBlockDim = getMaxBlockDim(deviceNumber);
                int blockDimX = (int) Math.min(maxBlockDim, rlen);
-               int gridDimX = (int)Math.ceil((double)rlen / blockDimX);
-               int blockDimY = 
(int)Math.min(Math.floor(((double)maxBlockDim)/blockDimX), clen);
-               int gridDimY = (int)Math.ceil((double)clen / blockDimY);
+               int gridDimX = (int) Math.ceil((double) rlen / blockDimX);
+               int blockDimY = (int) Math.min(Math.floor(((double) 
maxBlockDim) / blockDimX), clen);
+               int gridDimY = (int) Math.ceil((double) clen / blockDimY);
                return new ExecutionConfig(gridDimX, gridDimY, blockDimX, 
blockDimY);
        }
-       
+
        public ExecutionConfig(int gridDimX, int blockDimX) {
                this.gridDimX = gridDimX;
                this.blockDimX = blockDimX;
        }
-       
+
        public ExecutionConfig(int gridDimX, int gridDimY, int blockDimX, int 
blockDimY) {
                this.gridDimX = gridDimX;
                this.gridDimY = gridDimY;
                this.blockDimX = blockDimX;
                this.blockDimY = blockDimY;
        }
-       
-       
+
        /**
-     * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X of the given device
-        * 
+        * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X of the given device
+        *
         * @param deviceNumber device number of the given device
         * @return The maximum block dimension, in x-direction
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-    private static int getMaxBlockDim(int deviceNumber) throws 
DMLRuntimeException {
-//     return 32;
-       // TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to chose the 
block size that maximizes occupancy
-       Integer ret = maxBlockDimForDevice.get(deviceNumber);
-       if(ret == null) {
-               CUdevice device = new CUdevice();
-            
JCudaKernels.checkResult(jcuda.driver.JCudaDriver.cuDeviceGet(device, 
deviceNumber));
-            int maxBlockDimX[] =  {0};
-            jcuda.driver.JCudaDriver.cuDeviceGetAttribute(maxBlockDimX, 
CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
-            maxBlockDimForDevice.put(deviceNumber, maxBlockDimX[0]);
-            return maxBlockDimX[0];
-       }
-        return ret;
-    }
-    
-    }
+       private static int getMaxBlockDim(int deviceNumber) throws 
DMLRuntimeException {
+               //      return 32;
+               // TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to choose the block size that maximizes occupancy
+               Integer ret = maxBlockDimForDevice.get(deviceNumber);
+               if (ret == null) {
+                       CUdevice device = new CUdevice();
+                       
JCudaKernels.checkResult(jcuda.driver.JCudaDriver.cuDeviceGet(device, 
deviceNumber));
+                       int maxBlockDimX[] = { 0 };
+                       jcuda.driver.JCudaDriver
+                                       .cuDeviceGetAttribute(maxBlockDimX, 
CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
+                       maxBlockDimForDevice.put(deviceNumber, maxBlockDimX[0]);
+                       return maxBlockDimX[0];
+               }
+               return ret;
+       }
+
+}

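To illustrate the grid-sizing arithmetic in getConfigForSimpleVectorOperations,
here is a small self-contained sketch; the block-dimension limit of 1024 is an
assumed device value, whereas the real code queries and caches
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X per device:

    /** Hypothetical sketch, not from this commit. */
    public class GridSizingSketch {
        public static void main(String[] args) {
            int maxBlockDim = 1024;   // assumed CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
            int numCells = 1_000_000; // cells a simple vector kernel must cover
            int blockDimX = maxBlockDim;
            int gridDimX = (int) Math.ceil((double) numCells / blockDimX);
            // ceil(1000000 / 1024) = 977 blocks; 977 * 1024 = 1000448 threads,
            // so the kernel must guard with: if (index < numCells) { ... }
            System.out.println("gridDimX=" + gridDimX + ", blockDimX=" + blockDimX);
        }
    }

getConfigForSimpleMatrixOperations applies the same idea in two dimensions,
splitting the per-block thread budget between blockDimX and blockDimY.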