[SYSTEMML-445] [WIP] Initial version of GPU backend

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/f306b0b1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/f306b0b1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/f306b0b1

Branch: refs/heads/master
Commit: f306b0b1ecd97fdb5755989f77728d1ca81358c5
Parents: 8c37e2e
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Wed Jun 15 10:02:44 2016 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Wed Jun 15 10:03:48 2016 -0700

----------------------------------------------------------------------
 docs/devdocs/gpu-backend.md                     |  70 ++++
 pom.xml                                         |  63 ++++
 .../java/org/apache/sysml/api/DMLScript.java    |  27 ++
 .../java/org/apache/sysml/hops/AggBinaryOp.java |  44 ++-
 .../org/apache/sysml/hops/ConvolutionOp.java    |  39 +-
 .../org/apache/sysml/hops/OptimizerUtils.java   |   2 +
 .../java/org/apache/sysml/hops/ReorgOp.java     |   6 +
 src/main/java/org/apache/sysml/lops/Binary.java |  19 +-
 src/main/java/org/apache/sysml/lops/Lop.java    |   2 +-
 .../org/apache/sysml/lops/LopProperties.java    |   2 +-
 .../runtime/controlprogram/ProgramBlock.java    |   7 +-
 .../controlprogram/caching/CacheableData.java   |  30 ++
 .../controlprogram/caching/MatrixObject.java    |  12 +
 .../context/ExecutionContext.java               |  91 ++++-
 .../controlprogram/context/GPUContext.java      |  70 ++++
 .../controlprogram/context/GPUObject.java       | 156 ++++++++
 .../controlprogram/context/JCudaContext.java    | 146 ++++++++
 .../controlprogram/context/JCudaObject.java     | 276 ++++++++++++++
 .../instructions/GPUInstructionParser.java      |  87 +++++
 .../sysml/runtime/instructions/Instruction.java |   3 +-
 .../runtime/instructions/InstructionParser.java |   5 +
 .../runtime/instructions/InstructionUtils.java  |  13 +
 .../runtime/instructions/cp/CPInstruction.java  |   5 +-
 .../gpu/AggregateBinaryGPUInstruction.java      | 152 ++++++++
 .../gpu/ConvolutionGPUInstruction.java          | 207 +++++++++++
 .../runtime/matrix/data/LibMatrixCUDA.java      | 359 +++++++++++++++++++
 .../sysml/runtime/util/ConvolutionUtils.java    | 127 +++++++
 .../java/org/apache/sysml/utils/Statistics.java |  55 +++
 .../test/integration/AutomatedTestBase.java     |  17 +-
 .../binary/matrix/MatrixMultiplicationTest.java |  27 ++
 30 files changed, 2091 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/docs/devdocs/gpu-backend.md
----------------------------------------------------------------------
diff --git a/docs/devdocs/gpu-backend.md b/docs/devdocs/gpu-backend.md
new file mode 100644
index 0000000..c6f66d6
--- /dev/null
+++ b/docs/devdocs/gpu-backend.md
@@ -0,0 +1,70 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# Initial prototype for GPU backend
+
+A GPU backend implements two important abstract classes:
+1. `org.apache.sysml.runtime.controlprogram.context.GPUContext`
+2. `org.apache.sysml.runtime.controlprogram.context.GPUObject`
+
+The GPUContext is responsible for GPU memory management and for the initialization/destruction of CUDA handles.
+
+A GPUObject (like RDDObject and BroadcastObject) is stored in the CacheableData object. It receives callbacks from SystemML's buffer pool through the following methods:
+1. void acquireDeviceRead()
+2. void acquireDenseDeviceModify(int numElemsToAllocate)
+3. void acquireHostRead()
+4. void acquireHostModify()
+5. void release(boolean isGPUCopyModified)
+
+## JCudaContext:
+The current prototype supports Nvidia's CUDA libraries through the JCuda wrapper. The implementations of the above classes can be found in:
+1. `org.apache.sysml.runtime.controlprogram.context.JCudaContext`
+2. `org.apache.sysml.runtime.controlprogram.context.JCudaObject`
+
+### Setup instructions for JCudaContext:
+
+1. Follow the instructions from `https://developer.nvidia.com/cuda-downloads` and install CUDA 7.5.
+2. Follow the instructions from `https://developer.nvidia.com/cudnn` and install CuDNN v4.
+3. Download and install the JCuda binaries version 0.7.5b and the JCudnn binaries version 0.7.5. The easiest option is to use mavenized JCuda:
+```bash
+git clone https://github.com/MysterionRise/mavenized-jcuda.git
+mvn -Djcuda.version=0.7.5b -Djcudnn.version=0.7.5 clean package
+CURR_DIR=`pwd`
+JCUDA_PATH=$CURR_DIR"/target/lib/"
+JAR_PATH="."
+for j in `ls $JCUDA_PATH/*.jar`
+do
+        JAR_PATH=$JAR_PATH":"$j
+done
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$JCUDA_PATH
+```
+
+Note for Windows users:
+* CuDNN v4 is available for download at: `http://developer.download.nvidia.com/compute/redist/cudnn/v4/cudnn-7.0-win-x64-v4.0-prod.zip`
+* If the above steps don't work for JCuda, copy the DLLs into the C:\lib (or /lib) directory.
+
+To use SystemML's GPU backend:
+1. Add JCuda's jar into the classpath.
+2. Include the CUDA, CuDNN and JCuda libraries in LD_LIBRARY_PATH (or pass them via -Djava.library.path).
+3. Use the `-gpu` flag.
+
+For example, to use the GPU backend in standalone mode:
+```bash
+java -classpath $JAR_PATH:systemml-0.10.0-incubating-SNAPSHOT-standalone.jar org.apache.sysml.api.DMLScript -f MyDML.dml -gpu -exec singlenode ... 
+```
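
To make the callback protocol from gpu-backend.md concrete, here is a minimal, self-contained sketch (not SystemML code; `DummyGPUObject` and all names are hypothetical stand-ins for the real JCudaContext/JCudaObject implementations further down in this commit):

```java
// Minimal sketch of the five buffer-pool callbacks listed in gpu-backend.md.
// DummyGPUObject is hypothetical; the real implementation is JCudaObject below.
public class GpuLifecycleSketch {

    interface GPUObjectCallbacks {
        void acquireDeviceRead();                    // copy host->device if needed, lock++
        void acquireDenseDeviceModify(int numElems); // allocate dense output on device, lock++
        void acquireHostRead();                      // copy device->host if device copy modified
        void acquireHostModify();                    // host will change: device copy becomes stale
        void release(boolean isGPUCopyModified);     // lock--, optionally mark device copy dirty
    }

    static class DummyGPUObject implements GPUObjectCallbacks {
        private int numLocks = 0;
        private boolean isDeviceCopyModified = false;
        public void acquireDeviceRead()                    { numLocks++; }
        public void acquireDenseDeviceModify(int numElems) { numLocks++; }
        public void acquireHostRead()                      { /* sync back iff isDeviceCopyModified */ }
        public void acquireHostModify()                    { isDeviceCopyModified = false; }
        public void release(boolean isGPUCopyModified) {
            numLocks--;
            isDeviceCopyModified |= isGPUCopyModified;
        }
    }

    public static void main(String[] args) {
        DummyGPUObject in = new DummyGPUObject(), out = new DummyGPUObject();
        in.acquireDeviceRead();          // pin input on device
        out.acquireDenseDeviceModify(4); // allocate a 2x2 dense output on device
        // ... a GPU instruction would launch its kernel here ...
        in.release(false);               // input unchanged on device
        out.release(true);               // output now lives (modified) on device
        out.acquireHostRead();           // later: CP side reads -> sync device->host
    }
}
```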

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 9982d41..1564c99 100644
--- a/pom.xml
+++ b/pom.xml
@@ -73,6 +73,24 @@
                <!-- OS-specific JVM arguments for running integration tests -->
                <integrationTestExtraJVMArgs />
        </properties>
+       
+       <repositories>
+        <repository>
+          <id>central</id>
+          <url>https://repo1.maven.org/maven2</url>
+          <releases>
+            <enabled>true</enabled>
+          </releases>
+        </repository>
+        <repository>
+            <id>mavenized-jcuda-mvn-repo</id>
+            
<url>https://raw.github.com/niketanpansare/mavenized-jcuda/mvn-repo/</url>
+            <snapshots>
+                <enabled>true</enabled>
+                <updatePolicy>always</updatePolicy>
+            </snapshots>
+        </repository>
+    </repositories>
 
        <build>
 
@@ -784,6 +802,51 @@
 
 
        <dependencies>
+       
+               <!-- For GPU backend:
+                       use org.mystic:mavenized-jcuda until Alan publishes the org.jcuda:* artifacts
+                -->
+               <dependency>
+                       <groupId>org.mystic</groupId>
+            <artifactId>mavenized-jcuda</artifactId>
+            <version>0.7.5b</version>
+            <type>jar</type>
+            <scope>provided</scope>
+            <exclusions>
+                       <exclusion>
+                           <groupId>*</groupId>
+                           <artifactId>*</artifactId>
+                       </exclusion>
+                   </exclusions>
+               </dependency>
+               <!-- Since there is no mvn repo for jcuda
+               <dependency>
+                       <groupId>org.jcuda</groupId>
+                       <artifactId>jcuda</artifactId>
+                       <version>0.7.5b</version>
+                       <scope>provided</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.jcuda</groupId>
+                       <artifactId>jcublas</artifactId>
+                       <version>0.7.5b</version>
+                       <scope>provided</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.jcuda</groupId>
+                       <artifactId>jcusparse</artifactId>
+                       <version>0.7.5b</version>
+                       <scope>provided</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.jcuda</groupId>
+                       <artifactId>jcudnn</artifactId>
+                       <version>0.7.5</version>
+                       <scope>provided</scope>
+               </dependency>
+                -->
+               <!-- ************************* -->
+       
                <dependency>
                        <groupId>org.apache.spark</groupId>
                        <artifactId>spark-core_2.10</artifactId>

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java 
b/src/main/java/org/apache/sysml/api/DMLScript.java
index ec71af3..04b4548 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -69,6 +69,7 @@ import 
org.apache.sysml.runtime.controlprogram.caching.CacheStatistics;
 import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory;
+import org.apache.sysml.runtime.controlprogram.context.GPUContext;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
 import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter;
 import 
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
@@ -103,6 +104,9 @@ public class DMLScript
        public static String DML_FILE_PATH_ANTLR_PARSER = null;
        public static ExplainType EXPLAIN = ExplainType.NONE; //default explain
        
+       public static boolean USE_ACCELERATOR = false;
+       public static boolean FORCE_ACCELERATOR = false;
+       
        // flag that indicates whether or not to suppress any prints to stdout
        public static boolean _suppressPrint2Stdout = false;
        
@@ -121,6 +125,8 @@ public class DMLScript
                        //+ "   -s: <filename> will be interpreted as a DML 
script string \n"
                        + "   -python: (optional) parses Python-like DML\n"
                        + "   -debug: (optional) run in debug mode\n"
+                       + "   -accelerator: <flags> (optional) use acceleration 
whenever possible. Current version only supports CUDA.\n"
+                       + "                     Optional <flags> that is 
supported for this mode is force=(true|false)\n"
                        // Later add optional flags to indicate optimizations 
turned on or off. Currently they are turned off.
                        //+ "   -debug: <flags> (optional) run in debug mode\n"
                        //+ "                   Optional <flags> that is 
supported for this mode is optimize=(on|off)\n"
@@ -302,6 +308,24 @@ public class DMLScript
                                else if( args[i].equalsIgnoreCase("-debug") ) { 
                                
                                        ENABLE_DEBUG_MODE = true;
                                }
+                               else if( args[i].equalsIgnoreCase("-gpu") ) {   
+                                       USE_ACCELERATOR = true;
+                                       if( args.length > (i+1) && 
!args[i+1].startsWith("-") ) {
+                                               String flag = args[++i];
+                                               if(flag.startsWith("force=")) {
+                                                       String [] flagOptions = 
flag.split("=");
+                                                       if(flagOptions.length 
== 2)
+                                                               
FORCE_ACCELERATOR = Boolean.parseBoolean(flagOptions[1]);
+                                                       else
+                                                               throw new 
DMLRuntimeException("Unsupported \"force\" option for -gpu:" + flag);
+                                               }
+                                               else {
+                                                       throw new 
DMLRuntimeException("Unsupported flag for -gpu:" + flag);
+                                               }
+                                       }
+                                               
+                                       GPUContext.createGPUContext(); // Set 
GPU memory budget
+                               }
                                else if( args[i].equalsIgnoreCase("-python") ) {
                                        parsePyDML = true;
                                }
@@ -673,6 +697,9 @@ public class DMLScript
                }
                finally //ensure cleanup/shutdown
                {       
+                       if(DMLScript.USE_ACCELERATOR && ec != null) {
+                               ec.destroyGPUContext();
+                       }
                        if(ec != null && ec instanceof SparkExecutionContext) {
                                ((SparkExecutionContext) ec).close();
                        }
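
A quick standalone illustration of the `-gpu [force=(true|false)]` parsing added above; the splitting mirrors the diff, while the class name and exception type are hypothetical stand-ins:

```java
// Standalone sketch of the -gpu [force=(true|false)] argument parsing.
public class GpuFlagParseSketch {
    public static void main(String[] args) {
        boolean useAccelerator = false, forceAccelerator = false;
        String[] cli = { "-f", "MyDML.dml", "-gpu", "force=true", "-exec", "singlenode" };
        for (int i = 0; i < cli.length; i++) {
            if (cli[i].equalsIgnoreCase("-gpu")) {
                useAccelerator = true;
                // an optional flag follows only if the next token is not another option
                if (cli.length > i + 1 && !cli[i + 1].startsWith("-")) {
                    String flag = cli[++i];
                    if (flag.startsWith("force=")) {
                        String[] flagOptions = flag.split("=");
                        if (flagOptions.length == 2)
                            forceAccelerator = Boolean.parseBoolean(flagOptions[1]);
                        else
                            throw new IllegalArgumentException("Unsupported \"force\" option for -gpu:" + flag);
                    } else {
                        throw new IllegalArgumentException("Unsupported flag for -gpu:" + flag);
                    }
                }
            }
        }
        System.out.println(useAccelerator + " " + forceAccelerator); // true true
    }
}
```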

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java 
b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
index 8a8cffc..0532d01 100644
--- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
@@ -640,16 +640,46 @@ public class AggBinaryOp extends Hop implements 
MultiThreadedHop
                throws HopsException, LopsException
        {       
                Lop matmultCP = null;
-               if( isLeftTransposeRewriteApplicable(true, false) ) {
-                       matmultCP = constructCPLopsMMWithLeftTransposeRewrite();
+               
+               if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || 
getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
+                       Hop h1 = getInput().get(0);
+                       Hop h2 = getInput().get(1);
+                       Lop left; Lop right;
+                       boolean isLeftTransposed; boolean isRightTransposed;
+                       if( h1 instanceof ReorgOp && 
((ReorgOp)h1).getOp()==ReOrgOp.TRANSPOSE ) {
+                               isLeftTransposed = true;
+                               left = h1.getInput().get(0).constructLops();
+                       }
+                       else {
+                               isLeftTransposed = false;
+                               left = h1.constructLops();
+                       }
+                       if( h2 instanceof ReorgOp && 
((ReorgOp)h2).getOp()==ReOrgOp.TRANSPOSE ) {
+                               isRightTransposed = true;
+                               right = h2.getInput().get(0).constructLops();
+                       }
+                       else {
+                               isRightTransposed = false;
+                               right = h2.constructLops();
+                       }
+                       
+                       matmultCP = new Binary(left, right, 
+                                                                        
Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.GPU, 
isLeftTransposed, isRightTransposed);
+                       setOutputDimensions(matmultCP);
+                       setNnz(-1);
                }
-               else { 
-                       int k = 
OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-                       matmultCP = new 
Binary(getInput().get(0).constructLops(),getInput().get(1).constructLops(), 
-                                                                        
Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.CP, k);
+               else {
+                       if( isLeftTransposeRewriteApplicable(true, false) ) {
+                               matmultCP = 
constructCPLopsMMWithLeftTransposeRewrite();
+                       }
+                       else { 
+                               int k = 
OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
+                               matmultCP = new 
Binary(getInput().get(0).constructLops(),getInput().get(1).constructLops(), 
+                                                                               
 Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.CP, k);
+                       }
+                       setOutputDimensions(matmultCP);
                }
                
-               setOutputDimensions(matmultCP);
                setLineNumbers( matmultCP );
                setLops(matmultCP);
        }
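
The point of the new GPU branch is that expressions like t(X) %*% Y never materialize the transpose: the ReorgOp input is peeled off and carried as a boolean into the Binary lop. On the JCuda side such flags map directly onto cuBLAS transpose arguments. A hedged sketch, assuming the JCublas2 API and column-major layout (illustrative only, not this commit's LibMatrixCUDA code):

```java
import jcuda.*;
import jcuda.jcublas.*;
import static jcuda.jcublas.cublasOperation.*;
import static jcuda.runtime.JCuda.*;

// Sketch: the isLeftTransposed/isRightTransposed flags of the GPU Binary lop
// become cuBLAS transpose arguments; a 2x2 double GEMM, column-major.
public class CublasTransposeSketch {
    public static void main(String[] args) {
        JCublas2.setExceptionsEnabled(true);
        cublasHandle handle = new cublasHandle();
        JCublas2.cublasCreate(handle);

        int n = 2;
        double[] A = {1, 2, 3, 4};       // column-major 2x2
        double[] B = {5, 6, 7, 8};
        double[] C = new double[n * n];

        Pointer dA = new Pointer(), dB = new Pointer(), dC = new Pointer();
        cudaMalloc(dA, n * n * Sizeof.DOUBLE);
        cudaMalloc(dB, n * n * Sizeof.DOUBLE);
        cudaMalloc(dC, n * n * Sizeof.DOUBLE);
        JCublas2.cublasSetVector(n * n, Sizeof.DOUBLE, Pointer.to(A), 1, dA, 1);
        JCublas2.cublasSetVector(n * n, Sizeof.DOUBLE, Pointer.to(B), 1, dB, 1);

        boolean isLeftTransposed = true, isRightTransposed = false;
        int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
        int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;

        Pointer one = Pointer.to(new double[]{1.0}), zero = Pointer.to(new double[]{0.0});
        // C = op(A) * op(B): the transpose is a flag, never a materialized matrix.
        JCublas2.cublasDgemm(handle, transa, transb, n, n, n, one, dA, n, dB, n, zero, dC, n);

        JCublas2.cublasGetVector(n * n, Sizeof.DOUBLE, dC, 1, Pointer.to(C), 1);
        cudaFree(dA); cudaFree(dB); cudaFree(dC);
        JCublas2.cublasDestroy(handle);
    }
}
```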

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java 
b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index e9023d4..7f53e2e 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -33,6 +33,7 @@ import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN.ConvolutionParameters;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
 
 public class ConvolutionOp extends Hop  implements MultiThreadedHop
 {      
@@ -90,6 +91,16 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                        return getLops();
                
                ExecType et = optFindExecType();
+               
+               Lop ret = ConvolutionUtils.constructConvolutionLops(this, et);
+               if(ret != null) {
+                       return ret;
+               }
+               ret = 
ConvolutionUtils.constructConvolutionBackwardDataLops(this, et);
+               if(ret != null) {
+                       return ret;
+               }
+               
                ArrayList<Hop> inputs = getInput();
                switch( op )
                {
@@ -99,11 +110,26 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                        case COL2IM:
                        case MAX_POOLING:
                        case MAX_POOLING_BACKWARD:
+                       {       
+                               et = ExecType.CP; // TODO: max_backwards and the other convolution ops are only implemented for CP so far
+                               
+                               if( et == ExecType.CP  )
+                               {
+                                       setLops(constructConvolutionLops(et, 
inputs));
+                                       break;
+                               }
+                               else {
+                                       // TODO: Add support for SPARK/MR 
backends once we are happy with the performance of
+                                       // single node Lenet script. 
+                                       throw new HopsException("Unimplemented 
ConvolutionOp for execution type: " + et.name());
+                               }
+                               // break;
+                       }       
                        case DIRECT_CONV2D:
                        case DIRECT_CONV2D_BACKWARD_DATA:
                        case DIRECT_CONV2D_BACKWARD_FILTER:
                        {       
-                               if( et == ExecType.CP )
+                               if( et == ExecType.GPU )
                                {
                                        setLops(constructConvolutionLops(et, 
inputs));
                                        break;
@@ -148,15 +174,17 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                                HopsConv2Lops.get(op), getDataType(), 
getValueType(), et, k);
                setOutputDimensions(transform1);
                setLineNumbers(transform1);
+               in.addOutput(transform1);
                
                // stride1, stride2, padding1, padding2  
                // input_shape1, input_shape2, input_shape3, input_shape4, 
                // filter_shape1, filter_shape2, filter_shape3, filter_shape4
-               for( int i=1; i <= (expectedNumInputs-1); i++ )
+               for( int i=1; i < inputs.size(); i++ )
                {
                        Lop ltmp = inputs.get(i).constructLops();
                        transform1.addInput(ltmp);
-                       ltmp.addOutput(transform1);
+                       //if(i == 1 && expectedNumInputs == 14)
+                               ltmp.addOutput(transform1);
                }
                transform1.setLevel(); //force order of added lops
                return transform1;
@@ -265,11 +293,6 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                        if ( OptimizerUtils.isMemoryBasedOptLevel() ) {
                                _etype = findExecTypeByMemEstimate();
                        }
-                       // Choose CP, if the input dimensions are below 
threshold or if the input is a vector
-                       else if ( getInput().get(0).areDimsBelowThreshold() || 
getInput().get(0).isVector() )
-                       {
-                               _etype = ExecType.CP;
-                       }
                        else 
                        {
                                _etype = REMOTE;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java 
b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
index aa9f7d0..e6c8b88 100644
--- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
+++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
@@ -204,6 +204,8 @@ public class OptimizerUtils
        public static boolean ALLOW_FRAME_CSV_REBLOCK = false;
        
        
+       public static long GPU_MEMORY_BUDGET = -1;
+       
        //////////////////////
        // Optimizer levels //
        //////////////////////

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/hops/ReorgOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ReorgOp.java 
b/src/main/java/org/apache/sysml/hops/ReorgOp.java
index d81e777..55c0f3f 100644
--- a/src/main/java/org/apache/sysml/hops/ReorgOp.java
+++ b/src/main/java/org/apache/sysml/hops/ReorgOp.java
@@ -34,6 +34,7 @@ import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
 
 /**
  *  Reorg (cell) operation: aij
@@ -118,6 +119,11 @@ public class ReorgOp extends Hop implements 
MultiThreadedHop
                if( getLops() != null )
                        return getLops();
 
+               Lop ret = 
ConvolutionUtils.constructConvolutionBackwardFilterLops(this);
+               if(ret != null) {
+                       return ret;
+               }
+               
                ExecType et = optFindExecType();
                
                switch( op )

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/lops/Binary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/Binary.java 
b/src/main/java/org/apache/sysml/lops/Binary.java
index d3389bc..bae7dee 100644
--- a/src/main/java/org/apache/sysml/lops/Binary.java
+++ b/src/main/java/org/apache/sysml/lops/Binary.java
@@ -42,6 +42,7 @@ public class Binary extends Lop
 
        private OperationTypes operation;
        private int numThreads = -1;
+       boolean isLeftTransposed; boolean isRightTransposed; // Used for GPU 
matmult operation
        
        /**
         * Constructor to perform a binary operation.
@@ -58,6 +59,14 @@ public class Binary extends Lop
                numThreads = k;
        }
        
+       public Binary(Lop input1, Lop input2, OperationTypes op, DataType dt, 
ValueType vt, ExecType et, 
+                       boolean isLeftTransposed, boolean isRightTransposed) {
+               super(Lop.Type.Binary, dt, vt);
+               init(input1, input2, op, dt, vt, et);
+               this.isLeftTransposed = isLeftTransposed;
+               this.isRightTransposed = isRightTransposed;
+       }
+       
        private void init(Lop input1, Lop input2, OperationTypes op, DataType 
dt, ValueType vt, ExecType et) 
        {
                operation = op;
@@ -76,7 +85,7 @@ public class Binary extends Lop
                        lps.addCompatibility(JobType.REBLOCK);
                        this.lps.setProperties( inputs, et, 
ExecLocation.Reduce, breaksAlignment, aligner, definesMRJob );
                }
-               else if ( et == ExecType.CP || et == ExecType.SPARK ){
+               else if ( et == ExecType.CP || et == ExecType.SPARK || et == 
ExecType.GPU ){
                        lps.addCompatibility(JobType.INVALID);
                        this.lps.setProperties( inputs, et, 
ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob );
                }
@@ -183,7 +192,13 @@ public class Binary extends Lop
                if( operation == OperationTypes.MATMULT && 
getExecType()==ExecType.CP ) {
                        sb.append( OPERAND_DELIMITOR );
                        sb.append( numThreads );
-               }       
+               }
+               else if( operation == OperationTypes.MATMULT && 
getExecType()==ExecType.GPU ) {
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( isLeftTransposed );
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( isRightTransposed );
+               }
                
                return sb.toString();
        }
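
For reference, a small illustration of the instruction-string tail the two branches above emit; the opcode and operand encoding here are hypothetical, and '°' stands in for Lop.OPERAND_DELIMITOR:

```java
// Illustrative only: the tail of the generated MATMULT instruction string.
public class MatmultInstStringSketch {
    public static void main(String[] args) {
        String base = "GPU°ba+*°X°Y°Z"; // hypothetical opcode/operands
        boolean isLeftTransposed = true, isRightTransposed = false;
        StringBuilder sb = new StringBuilder(base);
        sb.append('°').append(isLeftTransposed);   // GPU appends the two transpose flags...
        sb.append('°').append(isRightTransposed);  // ...where the CP branch appends numThreads.
        System.out.println(sb);                    // GPU°ba+*°X°Y°Z°true°false
    }
}
```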

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/lops/Lop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/Lop.java 
b/src/main/java/org/apache/sysml/lops/Lop.java
index 8424e28..1ffdd10 100644
--- a/src/main/java/org/apache/sysml/lops/Lop.java
+++ b/src/main/java/org/apache/sysml/lops/Lop.java
@@ -668,7 +668,7 @@ public abstract class Lop
                boolean isLiteral = (isData && ((Data)this).isLiteral());
                
                StringBuilder sb = new StringBuilder("");
-               if ( et == ExecType.CP || et == ExecType.SPARK || (isData && 
isLiteral)) {
+               if ( et == ExecType.CP || et == ExecType.SPARK || et == 
ExecType.GPU || (isData && isLiteral)) {
                        sb.append(label);
                }
                else {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/lops/LopProperties.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/LopProperties.java 
b/src/main/java/org/apache/sysml/lops/LopProperties.java
index 8d9e916..48e8f57 100644
--- a/src/main/java/org/apache/sysml/lops/LopProperties.java
+++ b/src/main/java/org/apache/sysml/lops/LopProperties.java
@@ -27,7 +27,7 @@ import 
org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
 public class LopProperties 
 {
        
-       public enum ExecType { CP, CP_FILE, MR, SPARK, INVALID };
+       public enum ExecType { CP, CP_FILE, MR, SPARK, GPU, INVALID };
        public enum ExecLocation {INVALID, RecordReader, Map, MapOrReduce, 
MapAndReduce, Reduce, Data, ControlProgram };
 
        // static variable to assign an unique ID to every lop that is created

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java 
b/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java
index 40be5b1..7d3103c 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java
@@ -37,6 +37,7 @@ import 
org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject.UpdateType;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.Instruction;
+import org.apache.sysml.runtime.instructions.Instruction.INSTRUCTION_TYPE;
 import org.apache.sysml.runtime.instructions.cp.BooleanObject;
 import org.apache.sysml.runtime.instructions.cp.ComputationCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.Data;
@@ -308,7 +309,11 @@ public class ProgramBlock
                        
                        // maintain aggregate statistics
                        if( DMLScript.STATISTICS) {
-                               Statistics.maintainCPHeavyHitters(
+                               if(tmp.getType() == INSTRUCTION_TYPE.GPU)
+                                       Statistics.maintainCPHeavyHitters(
+                                                       
"gpu_"+tmp.getExtendedOpcode(), System.nanoTime()-t0);
+                               else
+                                       Statistics.maintainCPHeavyHitters(
                                                tmp.getExtendedOpcode(), 
System.nanoTime()-t0);
                        }
                                

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 51c4de5..ce412e1 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -33,6 +33,7 @@ import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy;
+import org.apache.sysml.runtime.controlprogram.context.GPUObject;
 import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
 import org.apache.sysml.runtime.instructions.cp.Data;
 import org.apache.sysml.runtime.instructions.spark.data.BroadcastObject;
@@ -43,6 +44,7 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
 import org.apache.sysml.runtime.matrix.MetaData;
 import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
 import org.apache.sysml.runtime.matrix.data.InputInfo;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.OutputInfo;
 import org.apache.sysml.runtime.util.LocalFileUtils;
 import org.apache.sysml.runtime.util.MapReduceTool;
@@ -178,6 +180,19 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
        //for lazily evaluated RDDs, and (2) as abstraction for environments 
that do not necessarily have spark libraries available
        private RDDObject _rddHandle = null; //RDD handle
        private BroadcastObject _bcHandle = null; //Broadcast handle
+       public GPUObject _gpuHandle = null;
+       
+       public GPUObject getGPUObject() {
+               return _gpuHandle;
+       }
+       public MatrixBlock getMatrixBlock() {
+               if(_data == null)
+                       getCache();
+               if(_data != null && _data instanceof MatrixBlock)
+                       return (MatrixBlock) _data;
+               else
+                       return null;
+       }
        
        /**
         * Basic constructor for any cacheable data.
@@ -203,6 +218,7 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                _hdfsFileName = that._hdfsFileName;
                _hdfsFileExists = that._hdfsFileExists; 
                _varName = that._varName;
+               _gpuHandle = that._gpuHandle;
        }
 
        
@@ -412,6 +428,9 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                if( _data == null )
                        getCache();
                
+               if( _gpuHandle != null )
+                       _gpuHandle.acquireHostRead();
+               
                //read data from HDFS/RDD if required
                //(probe data for cache_nowrite / jvm_reuse)  
                if( isEmpty(true) && _data==null ) 
@@ -497,6 +516,10 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                if( _data == null )
                        getCache();
                
+//             // No need to sync GPU data here, as it can cause a redundant copy when a GPU instruction does the release
+//             if( _gpuHandle != null )
+//                     _gpuHandle.acquireHostModify();
+               
                //read data from HDFS if required
                if( isEmpty(true) && _data == null )
                {
@@ -675,6 +698,9 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                clearReusableData();
                _data = null;   
                clearCache();
+               if(_gpuHandle != null) {
+                       _gpuHandle.clearData();
+               }
                
                // clear rdd/broadcast back refs
                if( _rddHandle != null )
@@ -735,6 +761,8 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                exportData(fName, outputFormat, -1, formatProperties);
        }
        
+       protected void exportGPUData()  throws CacheException { }
+       
        /**
         * Synchronized because there might be parallel threads (parfor local) 
that
         * access the same object (in case it was created before the loop).
@@ -762,6 +790,8 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                        throw new CacheException ("MatrixObject not available 
to read.");
 
                LOG.trace("Exporting " + this.getDebugName() + " to " + fName + 
" in format " + outputFormat);
+               
+               exportGPUData();
                                
                boolean pWrite = false; // !fName.equals(_hdfsFileName); 
//persistent write flag
                if ( fName.equals(_hdfsFileName) ) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
index 90d1f3f..d781730 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
@@ -32,6 +32,7 @@ import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import 
org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
+import org.apache.sysml.runtime.controlprogram.context.GPUContext;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
 import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
@@ -87,6 +88,8 @@ public class MatrixObject extends CacheableData<MatrixBlock>
         */
        public MatrixObject (ValueType vt, String file) {
                this (vt, file, null); //HDFS file path
+               if(DMLScript.USE_ACCELERATOR)
+                       _gpuHandle = GPUContext.createGPUObject(this);
        }
        
        /**
@@ -98,6 +101,8 @@ public class MatrixObject extends CacheableData<MatrixBlock>
                _hdfsFileName = file;           
                _cache = null;
                _data = null;
+               if(DMLScript.USE_ACCELERATOR)
+                       _gpuHandle = GPUContext.createGPUObject(this);
        }
        
        /**
@@ -230,6 +235,13 @@ public class MatrixObject extends 
CacheableData<MatrixBlock>
                }
        }
        
+       @Override
+       protected void exportGPUData() throws CacheException {
+               if(DMLScript.USE_ACCELERATOR && getGPUObject() != null) {
+                       getGPUObject().acquireHostRead();
+               }
+       }
+       
        public String toString()
        { 
                StringBuilder str = new StringBuilder();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 6356a76..e446fed 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -47,9 +47,12 @@ import org.apache.sysml.runtime.instructions.cp.ScalarObject;
 import org.apache.sysml.runtime.instructions.cp.StringObject;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData;
+import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
 import org.apache.sysml.runtime.matrix.MetaData;
 import org.apache.sysml.runtime.matrix.data.FrameBlock;
+import org.apache.sysml.runtime.matrix.data.InputInfo;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.OutputInfo;
 import org.apache.sysml.runtime.util.MapReduceTool;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
@@ -66,6 +69,8 @@ public class ExecutionContext
        //debugging (optional)
        protected DebugState _dbState = null;
        
+       protected GPUContext _gpuCtx = null;
+       
        protected ExecutionContext()
        {
                //protected constructor to force use of ExecutionContextFactory
@@ -76,7 +81,6 @@ public class ExecutionContext
        {
                //protected constructor to force use of ExecutionContextFactory
                this( true, prog );
-               
        }
        
        protected ExecutionContext(LocalVariableMap vars)
@@ -111,6 +115,11 @@ public class ExecutionContext
                _variables = vars;
        }
        
+       public void destroyGPUContext() throws DMLRuntimeException {
+               if(_gpuCtx != null)
+                       _gpuCtx.destroy();
+       }
+       
        
        /* -------------------------------------------------------
         * Methods to handle variables and associated data objects
@@ -229,7 +238,59 @@ public class ExecutionContext
                throws DMLRuntimeException 
        {       
                MatrixObject mo = getMatrixObject(varName);
-               return mo.acquireRead();
+               MatrixBlock mb = mo.acquireRead();
+               return mb;
+       }
+       
+       public void setMetaData(String varName, long nrows, long ncols) throws 
DMLRuntimeException  {
+               MatrixObject mo = getMatrixObject(varName);
+               if(mo.getNumRows() != nrows || mo.getNumColumns() != ncols) {
+                       MatrixCharacteristics mc = new 
MatrixCharacteristics((long)nrows, (long)ncols, 
+                                       (int) mo.getNumRowsPerBlock(), 
(int)mo.getNumColumnsPerBlock());
+                       OutputInfo oiOld = null;
+                       InputInfo iiOld = null;
+                       MetaData oldMetaData = mo.getMetaData();
+                       if(oldMetaData != null && oldMetaData instanceof 
MatrixFormatMetaData) {
+                               oiOld = 
((MatrixFormatMetaData)oldMetaData).getOutputInfo();
+                               iiOld = 
((MatrixFormatMetaData)oldMetaData).getInputInfo();
+                       }
+                       else {
+                               throw new DMLRuntimeException("Metadata not 
available");
+                       }
+                       mo.setMetaData(new MatrixFormatMetaData(mc, oiOld, 
iiOld));
+               }
+       }
+       
+       public MatrixObject getMatrixOutputForGPUInstruction(String varName, 
boolean isSparse) 
+                       throws DMLRuntimeException {    
+               if(isSparse) {
+                       throw new DMLRuntimeException("Sparse matrix block is 
not supported for GPU instruction");
+               }
+               MatrixObject mo = getMatrixObject(varName);
+               if(mo.getMatrixBlock() == null) {
+                       MatrixBlock mb = new MatrixBlock((int)mo.getNumRows(), 
(int)mo.getNumColumns(), false);
+                       mo.acquireModify(mb);
+                       mo.release();
+               }
+               
mo.getGPUObject().acquireDenseDeviceModify((int)(mo.getNumRows()*mo.getNumColumns()));
+               mo.getMatrixCharacteristics().setNonZeros(-1);
+               mo.getMatrixBlock().setNonZeros(-1);
+               return mo;
+       }
+       
+       public MatrixObject getMatrixInputForGPUInstruction(String varName) 
+                       throws DMLRuntimeException 
+       {       
+               MatrixObject mo = getMatrixObject(varName);
+               if(mo == null) {
+                       throw new DMLRuntimeException("No matrix object 
available for variable:" + varName);
+               }
+               if(mo.getGPUObject() == null || !mo.getGPUObject().isAllocated) 
{
+                       mo.acquireRead();
+                       mo.release();
+               }
+               mo.getGPUObject().acquireDeviceRead();
+               return mo;
        }
        
        /**
@@ -245,6 +306,13 @@ public class ExecutionContext
                mo.release();
        }
        
+       public void releaseMatrixInputForGPUInstruction(String varName) 
+                       throws DMLRuntimeException 
+               {
+                       MatrixObject mo = getMatrixObject(varName);
+                       mo.getGPUObject().release(false);
+               }
+       
        /**
         * Pins a frame variable into memory and returns the internal frame 
block.
         * 
@@ -311,6 +379,16 @@ public class ExecutionContext
                setVariable(varName, so);
        }
        
+       public void releaseMatrixOutputForGPUInstruction(String varName) throws 
DMLRuntimeException {
+               MatrixObject mo = getMatrixObject(varName);
+               if(mo.getGPUObject() == null || !mo.getGPUObject().isAllocated) 
{
+                       throw new DMLRuntimeException("No output is allocated 
on GPU");
+               }
+               mo.getGPUObject().release(true);
+//             mo.acquireModify();
+//             mo.release();
+       }
+       
        /**
         * 
         * @param varName
@@ -318,12 +396,17 @@ public class ExecutionContext
         * @throws DMLRuntimeException
         */
        public void setMatrixOutput(String varName, MatrixBlock outputData) 
-               throws DMLRuntimeException 
+                       throws DMLRuntimeException 
        {
                MatrixObject mo = getMatrixObject(varName);
+               if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated) {
+                       throw new DMLRuntimeException("GPU instructions should 
not set matrix output. "
+                                       + "Instead should use 
releaseMatrixOutput. If called by non-GPU instruction, "
+                                       + "then inconsistent bufferpool logic. 
Possible skipped deleting GPU object when acquire modify.");
+               }
+               
                mo.acquireModify(outputData);
            mo.release();
-               
            setVariable(varName, mo);
        }
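
Taken together, these helpers give a GPU instruction a pin/compute/release pattern. A hedged sketch of a caller (hypothetical names and operands; the real callers are the GPU instructions added later in this commit):

```java
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;

// Sketch of the acquire/release protocol a GPU instruction follows with the
// ExecutionContext helpers above. Variable names are hypothetical.
class GpuInstructionCallerSketch {
    void processMatMult(ExecutionContext ec) throws DMLRuntimeException {
        MatrixObject in1 = ec.getMatrixInputForGPUInstruction("X"); // pins X on device
        MatrixObject in2 = ec.getMatrixInputForGPUInstruction("Y"); // pins Y on device
        ec.setMetaData("Z", in1.getNumRows(), in2.getNumColumns()); // set output dims first
        ec.getMatrixOutputForGPUInstruction("Z", false);            // dense device allocation

        // ... launch the kernel against the GPUObjects of the three operands ...

        ec.releaseMatrixInputForGPUInstruction("X");  // unpin, device copy unchanged
        ec.releaseMatrixInputForGPUInstruction("Y");
        ec.releaseMatrixOutputForGPUInstruction("Z"); // unpin, device copy marked modified
    }
}
```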
        

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java
new file mode 100644
index 0000000..9c513aa
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.controlprogram.context;
+
+import java.util.ArrayList;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+
+public abstract class GPUContext {
+
+       public static ArrayList<GPUObject> allocatedPointers = new 
ArrayList<GPUObject>(); 
+       protected static GPUContext currContext;
+       protected GPUContext() { }
+       
+       public static volatile Boolean isGPUContextCreated = false;
+       
+       public abstract long getAvailableMemory();
+       
+       // Creation / Destruction of GPUContext and related handles
+       public static GPUContext createGPUContext() {
+               if(currContext == null && DMLScript.USE_ACCELERATOR) {
+                       // TODO: Handle this thread and resolve concurrency 
related bugs if any
+                       new Thread(new Runnable() {
+                               @Override
+                               public void run() {
+                                       // Lazy GPU context creation
+                                       synchronized(isGPUContextCreated) {
+                                               currContext = new 
JCudaContext();
+                                               
OptimizerUtils.GPU_MEMORY_BUDGET = 
((JCudaContext)currContext).getAvailableMemory();
+                                               isGPUContextCreated = true;
+                                       }
+                               }
+                       }).start();
+               }
+               return currContext;
+       }
+       public static GPUObject createGPUObject(MatrixObject mo) {
+               if(DMLScript.USE_ACCELERATOR) {
+                       synchronized(isGPUContextCreated) {
+                               if(currContext == null)
+                                       throw new RuntimeException("GPUContext 
is not created");
+                               if(currContext instanceof JCudaContext)
+                                       return new JCudaObject(mo);
+                       }
+               }
+               throw new RuntimeException("Cannot create createGPUObject when 
USE_ACCELERATOR is off");
+       }
+       public abstract void destroy() throws DMLRuntimeException;
+       
+       
+}
\ No newline at end of file
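
One caveat in the class above: createGPUContext and createGPUObject synchronize on the Boolean field isGPUContextCreated, which is reassigned inside the critical section, so callers may end up locking different monitors. A minimal sketch of the conventional alternative, assuming a dedicated final lock object (names hypothetical; not this commit's code):

```java
// Sketch: lazy one-time initialization with a dedicated lock object, instead of
// synchronizing on a reassigned Boolean field. Names are hypothetical.
public class LazyContextSketch {
    private static final Object LOCK = new Object();
    private static volatile Object currContext = null; // stands in for GPUContext

    public static Object getOrCreate() {
        Object ctx = currContext;
        if (ctx == null) {
            synchronized (LOCK) {               // every caller locks the same monitor
                if (currContext == null)        // re-check under the lock
                    currContext = new Object(); // stands in for `new JCudaContext()`
                ctx = currContext;
            }
        }
        return ctx;
    }
}
```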

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java
new file mode 100644
index 0000000..4f0b77f
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.controlprogram.context;
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.CacheException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+
+public abstract class GPUObject {
+
+       public boolean isDeviceCopyModified = false;
+       AtomicInteger numLocks = new AtomicInteger(0);
+       
+       public boolean isInSparseFormat = false;
+       public boolean isAllocated = false;
+       
+       MatrixObject mat = null;
+       protected GPUObject(MatrixObject mat2)  {
+               this.mat = mat2;
+       }
+       
+       public abstract void acquireDeviceRead() throws DMLRuntimeException;
+       public abstract void acquireDenseDeviceModify(int numElemsToAllocate) 
throws DMLRuntimeException;
+       public abstract void acquireHostRead() throws CacheException;
+       public abstract void acquireHostModify() throws CacheException;
+       public abstract void release(boolean isGPUCopyModified) throws 
CacheException;
+       
+       
+       // package-level visibility as these methods are guarded by underlying 
GPUContext
+       abstract void allocateMemoryOnDevice(int numElemToAllocate) throws 
DMLRuntimeException;
+       abstract void deallocateMemoryOnDevice() throws DMLRuntimeException;
+       abstract long getSizeOnDevice() throws DMLRuntimeException;
+       abstract void copyFromHostToDevice() throws DMLRuntimeException;
+       abstract void copyFromDeviceToHost() throws DMLRuntimeException; // 
Called by export()
+       
+       
+       /**
+        * Evicts GPUObjects until enough device memory is available for an allocation of the given size.
+        * // TODO: evict the smallest matrix that satisfies the condition; for now, just evict the largest unlocked pointer.
+        * 
+        */
+	protected void evict(long GPUSize) throws DMLRuntimeException {
+		if(GPUContext.allocatedPointers.size() == 0) {
+			throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+		}
+		
+		synchronized(evictionLock) {
+			// Sort so that the best eviction candidate (unlocked, largest)
+			// ends up at the tail of the list, where the loop below evicts from.
+			Collections.sort(GPUContext.allocatedPointers, new Comparator<GPUObject>() {
+				@Override
+				public int compare(GPUObject p1, GPUObject p2) {
+					int p1Val = p1.numLocks.get();
+					int p2Val = p2.numLocks.get();
+					
+					if(p1Val < 0 || p2Val < 0) {
+						throw new RuntimeException("Number of locks cannot be negative");
+					}
+					else if(p1Val == 0 && p2Val == 0) {
+						// Both p1 and p2 are unlocked: the larger object
+						// sorts later so that it is evicted first.
+						// TODO: Modify this !!
+						long p1Size = 0; long p2Size = 0;
+						try {
+							p1Size = p1.getSizeOnDevice();
+							p2Size = p2.getSizeOnDevice();
+						} catch (DMLRuntimeException e) {
+							throw new RuntimeException(e);
+						}
+						return Long.compare(p1Size, p2Size);
+					}
+					else {
+						// The object with more locks sorts earlier,
+						// i.e. further away from the eviction point.
+						return Integer.compare(p2Val, p1Val);
+					}
+				}
+			});
+			
+			while(GPUSize > getAvailableMemory() && GPUContext.allocatedPointers.size() > 0) {
+				GPUObject toBeRemoved = GPUContext.allocatedPointers.get(GPUContext.allocatedPointers.size() - 1);
+				if(toBeRemoved.numLocks.get() != 0) {
+					throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+				}
+				if(toBeRemoved.isDeviceCopyModified) {
+					toBeRemoved.copyFromDeviceToHost();
+				}
+				toBeRemoved.clearData();
+			}
+		}
+	}
+       
+       public void clearData() throws CacheException {
+               synchronized(evictionLock) {
+                       GPUContext.allocatedPointers.remove(this);
+               }
+               try {
+                       deallocateMemoryOnDevice();
+               } catch (DMLRuntimeException e) {
+                       throw new CacheException(e);
+               }
+       }
+       
+	// Dedicated monitor guarding GPUContext.allocatedPointers;
+	// a plain Object avoids synchronizing on a (possibly interned) Boolean.
+	static final Object evictionLock = new Object();
+       
+       protected long getAvailableMemory() {
+               return GPUContext.currContext.getAvailableMemory();
+       }
+       
+//	// Copying from device -> host occurs here
+//	// Called by MatrixObject's exportData
+//	public void exportData() throws CacheException {
+//		boolean isDeviceCopyModified = mat.getGPUObject() != null && mat.getGPUObject().isDeviceCopyModified;
+//		boolean isHostCopyUnavailable = mat.getMatrixBlock() == null || 
+//				(mat.getMatrixBlock().getDenseBlock() == null && mat.getMatrixBlock().getSparseBlock() == null);
+//		
+//		if(mat.getGPUObject() != null && (isDeviceCopyModified || isHostCopyUnavailable)) {
+//			try {
+//				mat.getGPUObject().copyFromDeviceToHost();
+//			} catch (DMLRuntimeException e) {
+//				throw new CacheException(e);
+//			}
+//		}
+//	}
+}
\ No newline at end of file
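
For illustration, a minimal self-contained sketch of the eviction ordering above, with
simplified stand-in types rather than the SystemML classes: locked objects sort away
from the tail, and among unlocked objects the largest lands at the tail, which is the
position evict() removes from.

    import java.util.*;

    class EvictionOrderDemo {                // hypothetical stand-in for GPUObject bookkeeping
        static class Candidate {
            final String name; final int locks; final long size;
            Candidate(String n, int l, long s) { name = n; locks = l; size = s; }
        }
        public static void main(String[] args) {
            List<Candidate> pool = new ArrayList<Candidate>(Arrays.asList(
                    new Candidate("A", 2, 100), new Candidate("B", 0, 500), new Candidate("C", 0, 50)));
            Collections.sort(pool, new Comparator<Candidate>() {
                public int compare(Candidate p1, Candidate p2) {
                    if (p1.locks == 0 && p2.locks == 0)
                        return Long.compare(p1.size, p2.size);   // larger unlocked object sorts later
                    return Integer.compare(p2.locks, p1.locks);  // more locks -> earlier, away from eviction
                }
            });
            // Tail is the eviction victim: B (unlocked and largest)
            System.out.println("evict first: " + pool.get(pool.size() - 1).name);
        }
    }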

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java
new file mode 100644
index 0000000..6c8f244
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.controlprogram.context;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.utils.Statistics;
+
+import jcuda.driver.JCudaDriver;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.JCudnn;
+import jcuda.runtime.JCuda;
+import jcuda.jcudnn.cudnnHandle;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.driver.JCudaDriver.cuInit;
+import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.cudaError.cudaSuccess;
+
+/**
+ * Setup:
+ * 1. Install CUDA 7.5
+ * 2. Install CuDNN v4 from http://developer.download.nvidia.com/compute/redist/cudnn/v4/cudnn-7.0-win-x64-v4.0-prod.zip
+ * 3. Download JCuda binaries version 0.7.5b and JCudnn version 0.7.5, and copy the DLLs into the C:\lib (or /lib) directory. Link: http://www.jcuda.org/downloads/downloads.html
+ *
+ */
+public class JCudaContext extends GPUContext {
+       
+	private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName());
+	
+	public static boolean DEBUG = false;
+	
+	public static long totalNumBytes = 0;
+	public static long availableNumBytesWithoutUtilFactor = 0;
+	// Fraction of available memory to use. The available memory is computed when the JCudaContext is created
+	// to handle the tradeoff of calling cudaMemGetInfo too often. 
+	public static double GPU_MEMORY_UTILIZATION_FACTOR = 0.9; 
+	public static boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = true;
+       
+	static {
+		long start = System.nanoTime();
+		JCuda.setExceptionsEnabled(true);
+		JCudnn.setExceptionsEnabled(true);
+		JCublas2.setExceptionsEnabled(true);
+		JCudaDriver.setExceptionsEnabled(true);
+		cuInit(0); // Initialize the driver
+		// Obtain the number of devices
+		int deviceCountArray[] = { 0 };
+		cuDeviceGetCount(deviceCountArray);
+		int deviceCount = deviceCountArray[0];
+		LOG.info("Total number of GPUs on the machine: " + deviceCount);
+		Statistics.cudaInitTime = System.nanoTime() - start;
+	}
+       
+	public long getAvailableMemory() {
+		if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
+			long free[] = { 0 };
+			long total[] = { 0 };
+			if(cudaMemGetInfo(free, total) == cudaSuccess) {
+				totalNumBytes = total[0];
+				availableNumBytesWithoutUtilFactor = free[0];
+			}
+			else {
+				throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
+			}
+		}
+		return (long) (availableNumBytesWithoutUtilFactor*GPU_MEMORY_UTILIZATION_FACTOR);
+	}
+       
+       
+	public JCudaContext() {
+		if(isGPUContextCreated) {
+			// Wait until it is deleted. This case happens during multi-threaded testing.
+			// This also allows for multi-threaded execute calls
+			long startTime = System.currentTimeMillis();
+			do {
+				try {
+					Thread.sleep(100);
+				} catch (InterruptedException e) {}
+			} while(isGPUContextCreated && (System.currentTimeMillis() - startTime) < 60000);
+			synchronized(isGPUContextCreated) {
+				if(GPUContext.currContext != null) {
+					throw new RuntimeException("Cannot create multiple JCudaContexts. Waited for 1 minute for the previous GPUContext to close");
+				}
+			}
+		}
+		GPUContext.currContext = this;
+		
+		long start = System.nanoTime();
+		LibMatrixCUDA.cudnnHandle = new cudnnHandle();
+		cudnnCreate(LibMatrixCUDA.cudnnHandle);
+		LibMatrixCUDA.cublasHandle = new cublasHandle();
+		cublasCreate(LibMatrixCUDA.cublasHandle);
+		Statistics.cudaLibrariesInitTime = System.nanoTime() - start;
+		
+		long free[] = { 0 };
+		long total[] = { 0 };
+		if(cudaMemGetInfo(free, total) == cudaSuccess) {
+			totalNumBytes = total[0];
+			availableNumBytesWithoutUtilFactor = free[0];
+		}
+		else {
+			throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
+		}
+		LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
+		LOG.info("Available GPU memory: " + (availableNumBytesWithoutUtilFactor*(1e-6)) + " MB");
+	}
+
+	@Override
+	public void destroy() throws DMLRuntimeException {
+		if(currContext != null) {
+			synchronized(isGPUContextCreated) {
+				cudnnDestroy(LibMatrixCUDA.cudnnHandle);
+				cublasDestroy(LibMatrixCUDA.cublasHandle);
+				currContext = null;
+				isGPUContextCreated = false;
+			}
+		}
+		else if(LibMatrixCUDA.cudnnHandle != null || LibMatrixCUDA.cublasHandle != null) {
+			throw new DMLRuntimeException("Error while destroying the GPUContext");
+		}
+	}
+
+}
\ No newline at end of file
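
As a usage sketch, a standalone program that derives the same usable-memory budget as
getAvailableMemory() above. It assumes the JCuda 0.7.5 jars on the classpath and a
CUDA-capable device; the class name is illustrative.

    import static jcuda.runtime.JCuda.cudaMemGetInfo;
    import static jcuda.runtime.cudaError.cudaSuccess;

    public class GpuMemBudget {
        public static void main(String[] args) {
            jcuda.runtime.JCuda.setExceptionsEnabled(true);
            long[] free = { 0 };
            long[] total = { 0 };
            if (cudaMemGetInfo(free, total) == cudaSuccess) {
                // Same 0.9 utilization factor as GPU_MEMORY_UTILIZATION_FACTOR above
                long usable = (long) (free[0] * 0.9);
                System.out.println("total=" + total[0] + " B, free=" + free[0]
                        + " B, usable budget=" + usable + " B");
            }
        }
    }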

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java
new file mode 100644
index 0000000..10d6408
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java
@@ -0,0 +1,276 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.controlprogram.context;
+
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.CacheException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.utils.Statistics;
+
+public class JCudaObject extends GPUObject {
+       
+       public Pointer jcudaPointer = null;
+//     public long numElems = -1;
+
+       JCudaObject(MatrixObject mat2) {
+               super(mat2);
+       }
+       
+	private void prepare(boolean isInput, int numElemsToAllocate) throws DMLRuntimeException {
+		if(jcudaPointer != null) {
+			// Already allocated on GPU and expected to be in sync
+			// checkDimensions();
+		}
+		else {
+			long GPUSize;
+			if(numElemsToAllocate != -1)
+				GPUSize = (Sizeof.DOUBLE) * (long) (numElemsToAllocate);
+			else if(isInput && mat != null && mat.getMatrixBlock() != null && mat.getMatrixBlock().getDenseBlock() != null) {
+				GPUSize = (Sizeof.DOUBLE) * (long) mat.getMatrixBlock().getDenseBlock().length;
+				numElemsToAllocate = mat.getMatrixBlock().getDenseBlock().length;
+			}
+			else
+				GPUSize = getSizeOnDevice();
+			// Ensure enough memory while allocating the matrix
+			if(GPUSize > getAvailableMemory()) {
+				evict(GPUSize);
+			}
+			allocateMemoryOnDevice(numElemsToAllocate);
+			synchronized(evictionLock) {
+				GPUContext.allocatedPointers.add(this);
+			}
+			if(isInput)
+				copyFromHostToDevice();
+		}
+		numLocks.addAndGet(1);
+	}
+       
+	@Override
+	public void acquireDeviceRead() throws DMLRuntimeException {
+		prepare(true, -1);
+		if(!isAllocated) 
+			throw new DMLRuntimeException("Expected device data to be allocated");
+	}
+	
+//	private void checkDimensions() throws DMLRuntimeException {
+//		if(LibMatrixCUDA.isInSparseFormat(mat))
+//			throw new DMLRuntimeException("Sparse format not implemented");
+//		else {
+//			if(mat.getNumRows()*mat.getNumColumns() != numElems) {
+//				throw new DMLRuntimeException("The jcudaPointer and MatrixBlock are not in sync");
+//			}
+//		}
+//	}
+	
+	@Override
+	public void acquireDenseDeviceModify(int numElemsToAllocate) throws DMLRuntimeException {
+		prepare(false, numElemsToAllocate); 
+		isDeviceCopyModified = true;
+		if(!isAllocated) 
+			throw new DMLRuntimeException("Expected device data to be allocated");
+	}
+       
+       @Override
+       public void acquireHostRead() throws CacheException {
+               if(isAllocated) {
+                       try {
+                               if(isDeviceCopyModified) {
+                                       copyFromDeviceToHost();
+                               }
+                       } catch (DMLRuntimeException e) {
+                               throw new CacheException(e);
+                       }
+               }
+       }
+       
+	@Override
+	public void acquireHostModify() throws CacheException {
+		if(isAllocated) {
+			try {
+				if(isDeviceCopyModified) {
+					throw new DMLRuntimeException("Potential overwrite of GPU data");
+					// copyFromDeviceToHost();
+				}
+				clearData();
+			} catch (DMLRuntimeException e) {
+				throw new CacheException(e);
+			}
+		}
+	}
+	
+	public void release(boolean isGPUCopyModified) throws CacheException {
+		if(numLocks.addAndGet(-1) < 0) {
+			throw new CacheException("Redundant release of GPU object");
+		}
+		isDeviceCopyModified = isGPUCopyModified;
+	}
+
+	@Override
+	void allocateMemoryOnDevice(int numElemToAllocate) throws DMLRuntimeException {
+		if(jcudaPointer == null) {
+			long start = System.nanoTime();
+			jcudaPointer = new Pointer();
+			if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat))
+				throw new DMLRuntimeException("Sparse format not implemented");
+			else if(numElemToAllocate == -1) {
+				// Called for dense input
+				cudaMalloc(jcudaPointer, mat.getNumRows()*mat.getNumColumns()*Sizeof.DOUBLE);
+			}
+			else {
+				// Called for dense output
+				cudaMalloc(jcudaPointer, numElemToAllocate*Sizeof.DOUBLE);
+			}
+			
+			Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
+			Statistics.cudaAllocCount.addAndGet(1);
+		}
+		isAllocated = true;
+	}
+       
+	@Override
+	void deallocateMemoryOnDevice() {
+		if(jcudaPointer != null) {
+			long start = System.nanoTime();
+			cudaFree(jcudaPointer);
+			Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
+			Statistics.cudaDeAllocCount.addAndGet(1);
+		}
+		jcudaPointer = null;
+		isAllocated = false;
+		numLocks.set(0);
+	}
+       
+	@Override
+	void copyFromHostToDevice() throws DMLRuntimeException {
+		if(jcudaPointer != null) {
+			printCaller();
+			long start = System.nanoTime();
+			if(LibMatrixCUDA.isInSparseFormat(mat))
+				throw new DMLRuntimeException("Sparse format not implemented");
+			else {
+				double [] data = mat.getMatrixBlock().getDenseBlock();
+				if(data == null && mat.getMatrixBlock().getSparseBlock() != null) {
+					throw new DMLRuntimeException("Incorrect sparsity calculation");
+				}
+				else if(data == null) {
+					if(mat.getMatrixBlock().getNonZeros() == 0) {
+						data = new double[mat.getMatrixBlock().getNumRows()*mat.getMatrixBlock().getNumColumns()];
+					}
+					else
+						throw new DMLRuntimeException("MatrixBlock is not allocated");
+				}
+				cudaMemcpy(jcudaPointer, Pointer.to(data), mat.getNumRows()*mat.getNumColumns() * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+			}
+			Statistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
+			Statistics.cudaToDevCount.addAndGet(1);
+		}
+		else {
+			throw new DMLRuntimeException("Cannot copy from host to device without allocating");
+		}
+	}
+
+	@Override
+	protected void copyFromDeviceToHost() throws DMLRuntimeException {
+		if(jcudaPointer != null) {
+			printCaller();
+			if(LibMatrixCUDA.isInSparseFormat(mat))
+				throw new DMLRuntimeException("Sparse format not implemented");
+			else {
+				long start = System.nanoTime();
+				MatrixBlock mb = mat.getMatrixBlock();
+				if(mb == null) {
+					throw new DMLRuntimeException("CP data is not allocated");
+				}
+				if(mb.getDenseBlock() == null) {
+					mb.allocateDenseBlock();
+				}
+				double [] data = mb.getDenseBlock();
+				
+				cudaMemcpy(Pointer.to(data), jcudaPointer, data.length * Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
+				mat.getMatrixBlock().recomputeNonZeros();
+				Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
+				Statistics.cudaFromDevCount.addAndGet(1);
+			}
+		}
+		else {
+			throw new DMLRuntimeException("Cannot copy from device to host as JCuda pointer is not allocated");
+		}
+		isDeviceCopyModified = false;
+	}
+
+	@Override
+	protected long getSizeOnDevice() throws DMLRuntimeException {
+		long GPUSize = 0;
+//		boolean emptyBlock = (mat.getDenseBlock() == null && mat.getSparseBlock() == null);
+		int rlen = (int) mat.getNumRows();
+		int clen = (int) mat.getNumColumns();
+//		long nonZeros = mat.getNonZeros();
+		if(LibMatrixCUDA.isInSparseFormat(mat)) {
+			// CSR estimate, once sparse is supported:
+//			GPUSize = (rlen + 1) * (long) (Integer.SIZE / Byte.SIZE) +
+//					nonZeros * (long) (Integer.SIZE / Byte.SIZE) +
+//					nonZeros * (long) (Double.SIZE / Byte.SIZE);
+			throw new DMLRuntimeException("Sparse format not implemented");
+		}
+		else {
+			int align = 0;
+//			if (clen > 5120)
+//				align = (clen % 256 == 0) ? 0 : 256; // in the dense case we use this for alignment
+//			else
+//				align = (clen % 128 == 0) ? 0 : 128; // in the dense case we use this for alignment
+			GPUSize = (Sizeof.DOUBLE) * (long) (rlen * clen + align);
+		}
+		return GPUSize;
+	}
+       
+	private String getClassAndMethod(StackTraceElement st) {
+		String [] str = st.getClassName().split("\\.");
+		return str[str.length - 1] + "." + st.getMethodName();
+	}
+	
+	private void printCaller() {
+		if(JCudaContext.DEBUG) {
+			StackTraceElement[] st = Thread.currentThread().getStackTrace();
+			String ret = getClassAndMethod(st[1]);
+			for(int i = 2; i < st.length && i < 7; i++) {
+				ret += "->" + getClassAndMethod(st[i]);
+			}
+			System.out.println("CALL_STACK:" + ret);
+		}
+	}
+}
\ No newline at end of file
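
The dense transfer path above reduces to a cudaMalloc/cudaMemcpy round trip. A minimal
sketch in plain JCuda (no SystemML types; assumes the same 0.7.5 binaries), mirroring
allocateMemoryOnDevice, copyFromHostToDevice, copyFromDeviceToHost and
deallocateMemoryOnDevice:

    import static jcuda.runtime.JCuda.*;
    import static jcuda.runtime.cudaMemcpyKind.*;
    import jcuda.Pointer;
    import jcuda.Sizeof;

    public class DenseRoundTrip {                    // illustrative name
        public static void main(String[] args) {
            setExceptionsEnabled(true);
            double[] host = { 1, 2, 3, 4, 5, 6 };    // a 2x3 dense block, row-major
            Pointer dev = new Pointer();
            cudaMalloc(dev, host.length * Sizeof.DOUBLE);              // allocate on device
            cudaMemcpy(dev, Pointer.to(host), host.length * Sizeof.DOUBLE,
                    cudaMemcpyHostToDevice);                           // host -> device
            double[] back = new double[host.length];
            cudaMemcpy(Pointer.to(back), dev, back.length * Sizeof.DOUBLE,
                    cudaMemcpyDeviceToHost);                           // device -> host
            cudaFree(dev);                                             // release device memory
            System.out.println(java.util.Arrays.toString(back));
        }
    }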

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
new file mode 100644
index 0000000..a29baf4
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions;
+
+import java.util.HashMap;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.Instruction.INSTRUCTION_TYPE;
+import org.apache.sysml.runtime.instructions.cp.CPInstruction;
+import org.apache.sysml.runtime.instructions.cp.CPInstruction.CPINSTRUCTION_TYPE;
+import org.apache.sysml.runtime.instructions.gpu.AggregateBinaryGPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.ConvolutionGPUInstruction;
+
+public class GPUInstructionParser extends InstructionParser 
+{
+	public static final HashMap<String, CPINSTRUCTION_TYPE> String2GPUInstructionType;
+	static {
+		String2GPUInstructionType = new HashMap<String, CPINSTRUCTION_TYPE>();
+		String2GPUInstructionType.put( "conv2d"                 , CPINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "conv2d_backward_filter" , CPINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "conv2d_backward_data"   , CPINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "ba+*"                   , CPINSTRUCTION_TYPE.AggregateBinary);
+	}
+       
+	public static CPInstruction parseSingleInstruction (String str) 
+			throws DMLRuntimeException 
+	{
+		if ( str == null || str.isEmpty() )
+			return null;
+
+		CPINSTRUCTION_TYPE cptype = InstructionUtils.getGPUType(str); 
+		if ( cptype == null ) 
+			throw new DMLRuntimeException("Unable to derive cptype for instruction: " + str);
+		CPInstruction cpinst = parseSingleInstruction(cptype, str);
+		if ( cpinst == null )
+			throw new DMLRuntimeException("Unable to parse instruction: " + str);
+		return cpinst;
+	}
+       
+	public static CPInstruction parseSingleInstruction ( CPINSTRUCTION_TYPE cptype, String str ) 
+			throws DMLRuntimeException 
+	{
+		if ( str == null || str.isEmpty() ) 
+			return null;
+		
+		if(cptype == null) {
+			throw new DMLRuntimeException("The instruction is not GPU-enabled: " + str);
+		}
+		
+		CPInstruction ret;
+		switch(cptype) 
+		{
+			case AggregateBinary:
+				ret = AggregateBinaryGPUInstruction.parseInstruction(str);
+				break;
+				
+			case Convolution:
+				ret = ConvolutionGPUInstruction.parseInstruction(str);
+				break;
+				
+			default: 
+				throw new DMLRuntimeException("Invalid GPU Instruction Type: " + cptype );
+		}
+		
+		ret.setType(INSTRUCTION_TYPE.GPU);
+		return ret;
+	}
+               
+}
\ No newline at end of file
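
The parser is a two-step table lookup: opcode string to instruction type (a null entry
means "not GPU-enabled"), then instruction type to the concrete parser. A toy sketch of
the same pattern, with stand-in types rather than the SystemML classes:

    import java.util.HashMap;

    class OpcodeDispatchDemo {                // hypothetical stand-in for the parser tables
        enum GpuType { AggregateBinary, Convolution }
        static final HashMap<String, GpuType> TABLE = new HashMap<String, GpuType>();
        static {
            TABLE.put("ba+*", GpuType.AggregateBinary);
            TABLE.put("conv2d", GpuType.Convolution);
            TABLE.put("conv2d_backward_filter", GpuType.Convolution);
            TABLE.put("conv2d_backward_data", GpuType.Convolution);
        }
        public static void main(String[] args) {
            String opcode = "conv2d";
            GpuType t = TABLE.get(opcode);    // null would signal "not GPU-enabled"
            System.out.println(opcode + " -> " + t);
        }
    }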

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/Instruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/Instruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/Instruction.java
index 97f792d..60ca3b7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/Instruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/Instruction.java
@@ -37,7 +37,8 @@ public abstract class Instruction
                EXTERNAL_LIBRARY, 
                MAPREDUCE_JOB, 
                BREAKPOINT, 
-               SPARK 
+               SPARK,
+               GPU
        };
        
 	protected static final Log LOG = LogFactory.getLog(Instruction.class.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/InstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/InstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/InstructionParser.java
index 5ae3124..b468eaa 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/InstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/InstructionParser.java
@@ -46,6 +46,11 @@ public class InstructionParser
 			SPINSTRUCTION_TYPE sptype = InstructionUtils.getSPType(str); 
 			return SPInstructionParser.parseSingleInstruction (sptype, str);
 		}
+		else if ( execType.equalsIgnoreCase(ExecType.GPU.toString()) ) 
+		{
+			CPINSTRUCTION_TYPE cptype = InstructionUtils.getGPUType(str); 
+			return GPUInstructionParser.parseSingleInstruction (cptype, str);
+		}
 		else if ( execType.equalsIgnoreCase("MR") ) {
 			MRINSTRUCTION_TYPE mrtype = InstructionUtils.getMRType(str); 
 			if ( mrtype == null )

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java 
b/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java
index c8d406e..b3493c2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java
@@ -259,6 +259,19 @@ public class InstructionUtils
        
        /**
         * 
+	 * @param str instruction string
+	 * @return GPU instruction type for the opcode, or null if the instruction is not GPU-enabled
+	 */
+	public static CPINSTRUCTION_TYPE getGPUType( String str ) 
+	{
+		String opcode = getOpCode(str);
+		CPINSTRUCTION_TYPE cptype = GPUInstructionParser.String2GPUInstructionType.get( opcode ); 
+		return cptype;
+	}
+       
+       /**
+        * 
         * @param opcode
         * @return
         */

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
index 38e92bb..6aa6205 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
@@ -23,8 +23,8 @@ import org.apache.sysml.api.MLContextProxy;
 import org.apache.sysml.lops.runtime.RunMRJobs;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
-import org.apache.sysml.runtime.instructions.CPInstructionParser;
 import org.apache.sysml.runtime.instructions.Instruction;
+import org.apache.sysml.runtime.instructions.InstructionParser;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 
 
@@ -78,7 +78,8 @@ public abstract class CPInstruction extends Instruction
 		{
 			//note: no exchange of updated instruction as labels might change in the general case
 			String updInst = RunMRJobs.updateLabels(tmp.toString(), ec.getVariables());
-			tmp = CPInstructionParser.parseSingleInstruction(updInst);
+			// Route through the generic InstructionParser so GPU instructions are re-parsed correctly
+			tmp = InstructionParser.parseSingleInstruction(updInst);
 			if(MLContextProxy.isActive()) {
 				MLContextProxy.setInstructionForMonitoring(tmp);
 			}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
new file mode 100644
index 0000000..4a09e28
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.parser.Expression.ValueType;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.functionobjects.Multiply;
+import org.apache.sysml.runtime.functionobjects.Plus;
+import org.apache.sysml.runtime.functionobjects.SwapIndex;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.BinaryCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator;
+import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
+import org.apache.sysml.utils.Statistics;
+
+public class AggregateBinaryGPUInstruction extends BinaryCPInstruction
+{
+	boolean isLeftTransposed;
+	boolean isRightTransposed;
+	
+	public AggregateBinaryGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, 
+			String opcode, String istr, boolean isLeftTransposed, boolean isRightTransposed){
+		super(op, in1, in2, out, opcode, istr);
+		_cptype = CPINSTRUCTION_TYPE.AggregateBinary;
+		this.isLeftTransposed = isLeftTransposed;
+		this.isRightTransposed = isRightTransposed;
+	}
+       
+	/**
+	 * Parses a "ba+*" GPU instruction string.
+	 * 
+	 * @param str instruction string
+	 * @return the parsed instruction
+	 * @throws DMLRuntimeException
+	 */
+	public static AggregateBinaryGPUInstruction parseInstruction( String str ) 
+		throws DMLRuntimeException 
+	{
+		CPOperand in1 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
+		CPOperand in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
+		CPOperand out = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
+
+		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+		String opcode = parts[0];
+
+		if ( !opcode.equalsIgnoreCase("ba+*")) {
+			throw new DMLRuntimeException("AggregateBinaryGPUInstruction.parseInstruction():: Unknown opcode " + opcode);
+		}
+		
+		InstructionUtils.checkNumFields( parts, 5 );
+		in1.split(parts[1]);
+		in2.split(parts[2]);
+		out.split(parts[3]);
+		
+		boolean isLeftTransposed = Boolean.parseBoolean(parts[4]);
+		boolean isRightTransposed = Boolean.parseBoolean(parts[5]);
+		
+		AggregateOperator agg = new AggregateOperator(0, Plus.getPlusFnObject());
+		AggregateBinaryOperator aggbin = new AggregateBinaryOperator(Multiply.getMultiplyFnObject(), agg, 1);
+		return new AggregateBinaryGPUInstruction(aggbin, in1, in2, out, opcode, str, isLeftTransposed, isRightTransposed);
+	}
+       
+	private MatrixBlock transpose(MatrixBlock m1) throws DMLRuntimeException {
+		ReorgOperator r_op = new ReorgOperator(SwapIndex.getSwapIndexFnObject(), 1);
+		return (MatrixBlock) (m1.reorgOperations(r_op, new MatrixBlock(), 0, 0, 0));
+	}
+	
+	private boolean isSparse(ExecutionContext ec, String var) throws DMLRuntimeException {
+		MatrixObject mo = (MatrixObject) ec.getVariable(var);
+		return LibMatrixCUDA.isInSparseFormat(mo);
+	}
+       
+	@Override
+	public void processInstruction(ExecutionContext ec) 
+		throws DMLRuntimeException
+	{	
+		// --------------------------------------
+		// Sparse CP fallback; this code will be removed when JIRA SYSTEMML-702 is complete
+		if( isSparse(ec, input1.getName()) || isSparse(ec, input2.getName()) ) {
+			Statistics.gpuSparseMultCount.addAndGet(1);
+			
+			//get inputs
+			MatrixBlock matBlock1 = ec.getMatrixInput(input1.getName());
+			MatrixBlock matBlock2 = ec.getMatrixInput(input2.getName());
+			
+			if(isLeftTransposed) 
+				matBlock1 = transpose(matBlock1);
+			if(isRightTransposed) 
+				matBlock2 = transpose(matBlock2);
+			
+			//compute matrix multiplication
+			AggregateBinaryOperator ab_op = (AggregateBinaryOperator) _optr;
+			MatrixBlock soresBlock = (MatrixBlock) (matBlock1.aggregateBinaryOperations(matBlock1, matBlock2, new MatrixBlock(), ab_op));
+			
+			//release inputs/outputs
+			ec.releaseMatrixInput(input1.getName());
+			ec.releaseMatrixInput(input2.getName());
+			ec.setMatrixOutput(output.getName(), soresBlock);
+			return;
+		}
+		// --------------------------------------
+		
+		Statistics.incrementNoOfExecutedGPUInst();
+		
+		AggregateBinaryOperator op = (AggregateBinaryOperator) _optr;
+		if( !(op.binaryFn instanceof Multiply && op.aggOp.increOp.fn instanceof Plus) ) {
+			throw new DMLRuntimeException("Unsupported binary aggregate operation: ("+op.binaryFn+", "+op.aggOp+").");
+		}
+		
+		//get inputs
+		MatrixObject m1 = ec.getMatrixInputForGPUInstruction(input1.getName());
+		MatrixObject m2 = ec.getMatrixInputForGPUInstruction(input2.getName());
+		
+		//compute matrix multiplication
+		int rlen = (int) (isLeftTransposed ? m1.getNumColumns() : m1.getNumRows());
+		int clen = (int) (isRightTransposed ? m2.getNumRows() : m2.getNumColumns());
+		
+		ec.setMetaData(output.getName(), rlen, clen);
+		MatrixObject out = ec.getMatrixOutputForGPUInstruction(output.getName(), false);
+		LibMatrixCUDA.matmult(m1, m2, out, isLeftTransposed, isRightTransposed);
+		
+		//release inputs/outputs
+		ec.releaseMatrixInputForGPUInstruction(input1.getName());
+		ec.releaseMatrixInputForGPUInstruction(input2.getName());
+		ec.releaseMatrixOutputForGPUInstruction(output.getName());
+	}
+}
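
The output-shape logic above is worth spelling out: for C = op(A) %*% op(B), transposition
decides which dimension each input contributes. A tiny standalone sketch (illustrative
names, not SystemML API):

    class MatMultDims {
        static long[] outputDims(long aRows, long aCols, long bRows, long bCols,
                boolean leftTransposed, boolean rightTransposed) {
            long rlen = leftTransposed  ? aCols : aRows;   // rows of op(A)
            long clen = rightTransposed ? bRows : bCols;   // cols of op(B)
            return new long[] { rlen, clen };
        }
        public static void main(String[] args) {
            long[] d = outputDims(3, 5, 3, 4, true, false);  // t(A[3x5]) %*% B[3x4]
            System.out.println(d[0] + " x " + d[1]);         // prints: 5 x 4
        }
    }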

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
new file mode 100644
index 0000000..8907c4d
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu;
+
+import java.util.ArrayList;
+
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.parser.Expression.ValueType;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.functionobjects.SwapIndex;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.cp.UnaryCPInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.Statistics;
+
+public class ConvolutionGPUInstruction extends UnaryCPInstruction {
+	
+	private CPOperand _in2; 
+	private ArrayList<CPOperand> _input_shape;
+	private ArrayList<CPOperand> _filter_shape;
+	private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
+	private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
+	
+	int N; int C; int H; int W;
+	int K; int R; int S; int stride_h; int stride_w; int pad_h; int pad_w;
+	int P; int Q;
+	
+	public ConvolutionGPUInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode,
+			String istr, ArrayList<CPOperand> stride,
+			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+			ArrayList<CPOperand> filter_shape) {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
+		_in2 = in2;
+		_cptype = CPINSTRUCTION_TYPE.Convolution;
+		_stride = stride;
+		_padding = padding;
+		_input_shape = input_shape;
+		_filter_shape = filter_shape;
+	}
+       
+	public static ConvolutionGPUInstruction parseInstruction(String str)
+			throws DMLRuntimeException {
+		CPOperand in = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
+		CPOperand out = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
+
+		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+		String opcode = parts[0];
+		if (opcode.equalsIgnoreCase("conv2d") ||
+				opcode.equalsIgnoreCase("conv2d_backward_filter") ||
+				opcode.equalsIgnoreCase("conv2d_backward_data")) {
+			InstructionUtils.checkNumFields(parts, 15);
+			// Operand layout: input, dout, stride1, stride2, padding1, padding2,
+			// input_shape1..input_shape4, filter_shape1..filter_shape4, output
+			in.split(parts[1]);
+			CPOperand in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
+			in2.split(parts[2]);
+			out.split(parts[15]);
+	
+			ArrayList<CPOperand> stride = new ArrayList<CPOperand>();
+			ArrayList<CPOperand> padding = new ArrayList<CPOperand>();
+			ArrayList<CPOperand> input_shape = new ArrayList<CPOperand>();
+			ArrayList<CPOperand> filter_shape = new ArrayList<CPOperand>();
+			stride.add(new CPOperand(parts[3]));
+			stride.add(new CPOperand(parts[4]));
+			padding.add(new CPOperand(parts[5]));
+			padding.add(new CPOperand(parts[6]));
+			input_shape.add(new CPOperand(parts[7]));
+			input_shape.add(new CPOperand(parts[8]));
+			input_shape.add(new CPOperand(parts[9]));
+			input_shape.add(new CPOperand(parts[10]));
+			filter_shape.add(new CPOperand(parts[11]));
+			filter_shape.add(new CPOperand(parts[12]));
+			filter_shape.add(new CPOperand(parts[13]));
+			filter_shape.add(new CPOperand(parts[14]));
+	
+			return new ConvolutionGPUInstruction(in, in2, out, opcode, str, stride,
+					padding, input_shape, filter_shape);
+		} 
+		else {
+			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str);
+		}
+	}
+       
+	private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL, int index) 
+			throws DMLRuntimeException {
+		return (int) ec.getScalarInput(aL.get(index).getName(),
+				aL.get(index).getValueType(), aL.get(index).isLiteral()).getLongValue();
+	}
+       
+	@Override
+	public void processInstruction(ExecutionContext ec) 
+			throws DMLRuntimeException {
+		
+		Statistics.incrementNoOfExecutedGPUInst();
+		
+		MatrixObject out = null;
+		if (instOpcode.equalsIgnoreCase("conv2d") || 
+				instOpcode.equalsIgnoreCase("conv2d_backward_filter") ||
+				instOpcode.equalsIgnoreCase("conv2d_backward_data")) {
+			
+			pad_h = getScalarInput(ec, _padding, 0);
+			pad_w = getScalarInput(ec, _padding, 1);
+			stride_h = getScalarInput(ec, _stride, 0);
+			stride_w = getScalarInput(ec, _stride, 1);
+
+			N = getScalarInput(ec, _input_shape, 0);
+			C = getScalarInput(ec, _input_shape, 1);
+			H = getScalarInput(ec, _input_shape, 2);
+			W = getScalarInput(ec, _input_shape, 3);
+
+			K = getScalarInput(ec, _filter_shape, 0);
+			
+			R = getScalarInput(ec, _filter_shape, 2);
+			S = getScalarInput(ec, _filter_shape, 3);
+			
+			P = (int) ConvolutionUtils.getP(H, R, stride_h, pad_h);
+			Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
+			
+			if (instOpcode.equalsIgnoreCase("conv2d")) {
+				MatrixObject image = ec.getMatrixInputForGPUInstruction(input1.getName());
+				MatrixObject filter = ec.getMatrixInputForGPUInstruction(_in2.getName());
+				if(image.getMatrixBlock().isInSparseFormat() || filter.getMatrixBlock().isInSparseFormat()) {
+					throw new DMLRuntimeException("Sparse convolution not implemented");
+				}
+				if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
+					throw new DMLRuntimeException("Incorrect dimensions for image in conv2d");
+				if(filter.getNumRows() != K || filter.getNumColumns() != C*R*S) 
+					throw new DMLRuntimeException("Incorrect dimensions for filter in conv2d");
+				
+				ec.setMetaData(output.getName(), N, K * P * Q);
+				out = ec.getMatrixOutputForGPUInstruction(output.getName(), false);
+				LibMatrixCUDA.conv2d(image, filter, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			}
+			else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
+				MatrixObject image = ec.getMatrixInputForGPUInstruction(input1.getName());
+				MatrixObject dout = ec.getMatrixInputForGPUInstruction(_in2.getName());
+				if(image.getMatrixBlock().isInSparseFormat() || dout.getMatrixBlock().isInSparseFormat())
+					throw new DMLRuntimeException("Sparse conv2d_backward_filter not implemented");
+				if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
+					throw new DMLRuntimeException("Incorrect dimensions for image in conv2d_backward_filter");
+				if(dout.getNumRows() != N || dout.getNumColumns() != K*P*Q) 
+					throw new DMLRuntimeException("Incorrect dimensions for dout in conv2d_backward_filter: " + 
+							dout.getNumRows() + " != " + N + " || " + dout.getNumColumns() + " != " + K*P*Q);
+				
+				ec.setMetaData(output.getName(), K, C * R * S);
+				out = ec.getMatrixOutputForGPUInstruction(output.getName(), false);
+				LibMatrixCUDA.conv2d_backward_filter(image, dout, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+				// TODO: For now always copy the device data to host
+				// ec.gpuCtx.copyDeviceToHost(outputBlock);
+			}
+			else if (instOpcode.equalsIgnoreCase("conv2d_backward_data")) {
+				MatrixObject filter = ec.getMatrixInputForGPUInstruction(input1.getName());
+				MatrixObject dout = ec.getMatrixInputForGPUInstruction(_in2.getName());
+				if(filter.getMatrixBlock().isInSparseFormat() || dout.getMatrixBlock().isInSparseFormat())
+					throw new DMLRuntimeException("Sparse conv2d_backward_data not implemented");
+				if(filter.getNumRows() != K || filter.getNumColumns() != C*R*S) 
+					throw new DMLRuntimeException("Incorrect dimensions for filter in conv2d_backward_data");
+				if(dout.getNumRows() != N || dout.getNumColumns() != K*P*Q) 
+					throw new DMLRuntimeException("Incorrect dimensions for dout in conv2d_backward_data: " + 
+							dout.getNumRows() + " != " + N + " || " + dout.getNumColumns() + " != " + K*P*Q);
+				
+				ec.setMetaData(output.getName(), N, C * H * W);
+				out = ec.getMatrixOutputForGPUInstruction(output.getName(), false);
+				LibMatrixCUDA.conv2d_backward_data(filter, dout, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+			}
+			else {
+				throw new DMLRuntimeException("Unsupported GPU context for " + instOpcode);
+			}
+		}
+		else {
+			throw new DMLRuntimeException("Unsupported op code " + instOpcode);
+		}
+		// release inputs/outputs
+		ec.releaseMatrixInputForGPUInstruction(input1.getName());
+		ec.releaseMatrixInputForGPUInstruction(_in2.getName());
+		ec.releaseMatrixOutputForGPUInstruction(output.getName());
+	}
+       
+}
\ No newline at end of file
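
ConvolutionUtils.getP/getQ are not part of this hunk; under the standard 2D convolution
output-shape rule they would presumably compute P and Q as below (a sketch under that
assumption, with illustrative names):

    class ConvOutputShape {
        static long getP(long H, long R, long strideH, long padH) {
            return (H + 2 * padH - R) / strideH + 1;   // output height
        }
        static long getQ(long W, long S, long strideW, long padW) {
            return (W + 2 * padW - S) / strideW + 1;   // output width
        }
        public static void main(String[] args) {
            // 28x28 image, 5x5 filter, stride 1, pad 0 -> 24x24 output
            System.out.println(getP(28, 5, 1, 0) + " x " + getQ(28, 5, 1, 0));
        }
    }

With P and Q in hand, the output metadata set above follows directly: conv2d produces
(N, K*P*Q), conv2d_backward_filter (K, C*R*S), and conv2d_backward_data (N, C*H*W).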

