[systemds] branch master updated: [SYSTEMDS-2913] Refactor GPUInstruction to support reuse better

arnabp20 Fri, 02 Apr 2021 04:38:45 -0700

This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/master by this push:
     new b1bdeae  [SYSTEMDS-2913] Refactor GPUInstruction to support reuse better
b1bdeae is described below

commit b1bdeaed3e7c4124e15e297e4cfd37176c0071f9
Author: arnabp <[email protected]>
AuthorDate: Fri Apr 2 13:38:10 2021 +0200

    [SYSTEMDS-2913] Refactor GPUInstruction to support reuse better
    
    This patch refactors the subclasses of GPUInstruction by moving
    their input and output operands into the parent class.
    It also extends the reuse infrastructure for GPU intermediates
    so that it supports all GPU instructions.
---
 .../gpu/AggregateBinaryGPUInstruction.java         | 19 +----
 .../gpu/AggregateUnaryGPUInstruction.java          | 99 +++++++++++-----------
 .../gpu/ArithmeticBinaryGPUInstruction.java        | 21 +----
 .../gpu/BuiltinBinaryGPUInstruction.java           | 21 +----
 .../gpu/BuiltinUnaryGPUInstruction.java            |  6 +-
 .../runtime/instructions/gpu/GPUInstruction.java   | 30 ++++++-
 .../instructions/gpu/MMTSJGPUInstruction.java      | 10 +--
 .../gpu/MatrixAppendGPUInstruction.java            | 21 ++---
 .../gpu/MatrixBuiltinGPUInstruction.java           |  4 +-
 .../gpu/MatrixIndexingGPUInstruction.java          | 29 ++++---
 .../gpu/MatrixMatrixBuiltinGPUInstruction.java     | 14 +--
 .../gpu/MatrixReshapeGPUInstruction.java           | 19 +++--
 .../gpu/RelationalBinaryGPUInstruction.java        |  9 +-
 .../instructions/gpu/ReorgGPUInstruction.java      | 22 +----
 .../gpu/ScalarMatrixBuiltinGPUInstruction.java     | 58 +++++++------
 .../instructions/gpu/context/GPUContext.java       |  9 ++
 .../instructions/gpu/context/GPUObject.java        |  8 +-
 .../apache/sysds/runtime/lineage/LineageCache.java | 49 +++++++----
 .../sysds/runtime/lineage/LineageCacheConfig.java  | 10 ++-
 .../test/functions/lineage/GPUFullReuseTest.java   | 11 ++-
 .../scripts/functions/lineage/FullReuseGPU1.dml    |  4 +-
 21 files changed, 234 insertions(+), 239 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index 6445135..e737d52 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -18,7 +18,6 @@
  */
 package org.apache.sysds.runtime.instructions.gpu;
 
-import org.apache.commons.lang3.tuple.Pair;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
@@ -27,9 +26,6 @@ import org.apache.sysds.runtime.functionobjects.Plus;
 import org.apache.sysds.runtime.functionobjects.SwapIndex;
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
-import org.apache.sysds.runtime.lineage.LineageItem;
-import org.apache.sysds.runtime.lineage.LineageItemUtils;
-import org.apache.sysds.runtime.lineage.LineageTraceable;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCuMatMult;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -38,20 +34,14 @@ import org.apache.sysds.runtime.matrix.operators.Operator;
 import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysds.utils.GPUStatistics;
 
-public class AggregateBinaryGPUInstruction extends GPUInstruction implements 
LineageTraceable {
-       private CPOperand _input1 = null;
-       private CPOperand _input2 = null;
-       public CPOperand _output = null;
+public class AggregateBinaryGPUInstruction extends GPUInstruction {
        private boolean _isLeftTransposed;
        private boolean _isRightTransposed;
 
        private AggregateBinaryGPUInstruction(Operator op, CPOperand in1, 
CPOperand in2, CPOperand out, String opcode,
                        String istr, boolean leftTranspose, boolean 
rightTranspose) {
-               super(op, opcode, istr);
+               super(op, in1, in2, out, opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.AggregateBinary;
-               _input1 = in1;
-               _input2 = in2;
-               _output = out;
                _isLeftTransposed = leftTranspose;
                _isRightTransposed = rightTranspose;
        }
@@ -102,9 +92,4 @@ public class AggregateBinaryGPUInstruction extends 
GPUInstruction implements Lin
                return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(0), mo);
        }
 
-       @Override
-       public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
-               return Pair.of(_output.getName(), new LineageItem(getOpcode(),
-                       LineageItemUtils.getLineage(ec, _input1, _input2)));
-       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
index 905a94a..12f76b0 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
@@ -36,67 +36,68 @@ import org.apache.sysds.utils.GPUStatistics;
  * Implements aggregate unary instructions for CUDA
  */
 public class AggregateUnaryGPUInstruction extends GPUInstruction {
-       private CPOperand _input1 = null;
-       private CPOperand _output = null;
 
        private AggregateUnaryGPUInstruction(Operator op, CPOperand in1, 
CPOperand out, String opcode, String istr) {
-               super(op, opcode, istr);
+               super(op, in1, null, out, opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.AggregateUnary;
-               _input1 = in1;
-               _output = out;
        }
 
-  public static AggregateUnaryGPUInstruction parseInstruction(String str ) {
-    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
-    String opcode = parts[0];
-    CPOperand in1 = new CPOperand(parts[1]);
-    CPOperand out = new CPOperand(parts[2]);
+       public static AggregateUnaryGPUInstruction parseInstruction(String str) 
{
+               String[] parts = 
InstructionUtils.getInstructionPartsWithValueType(str);
+               String opcode = parts[0];
+               CPOperand in1 = new CPOperand(parts[1]);
+               CPOperand out = new CPOperand(parts[2]);
 
-    // This follows logic similar to AggregateUnaryCPInstruction.
-    // nrow, ncol & length should either read or refresh metadata
-    Operator aggop = null;
-    if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || 
opcode.equalsIgnoreCase("length")) {
-      throw new DMLRuntimeException("nrow, ncol & length should not be 
compiled as GPU instructions!");
-    } else {
-      aggop = InstructionUtils.parseBasicAggregateUnaryOperator(opcode);
-    }
-    return new AggregateUnaryGPUInstruction(aggop, in1, out, opcode, str);
-  }
+               // This follows logic similar to AggregateUnaryCPInstruction.
+               // nrow, ncol & length should either read or refresh metadata
+               Operator aggop = null;
+               if (opcode.equalsIgnoreCase("nrow") || 
opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) {
+                       throw new DMLRuntimeException("nrow, ncol & length 
should not be compiled as GPU instructions!");
+               }
+               else {
+                       aggop = 
InstructionUtils.parseBasicAggregateUnaryOperator(opcode);
+               }
+               return new AggregateUnaryGPUInstruction(aggop, in1, out, 
opcode, str);
+       }
 
-  @Override
-  public void processInstruction(ExecutionContext ec) {
-    GPUStatistics.incrementNoOfExecutedGPUInst();
+       @Override
+       public void processInstruction(ExecutionContext ec) {
+               GPUStatistics.incrementNoOfExecutedGPUInst();
 
-    String opcode = getOpcode();
+               String opcode = getOpcode();
 
-    // nrow, ncol & length should either read or refresh metadata
-    if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || 
opcode.equalsIgnoreCase("length")) {
-      throw new DMLRuntimeException("nrow, ncol & length should not be 
compiled as GPU instructions!");
-    }
+               // nrow, ncol & length should either read or refresh metadata
+               if (opcode.equalsIgnoreCase("nrow") || 
opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) {
+                       throw new DMLRuntimeException("nrow, ncol & length 
should not be compiled as GPU instructions!");
+               }
 
-    //get inputs
-    MatrixObject in1 = getMatrixInputForGPUInstruction(ec, _input1.getName());
+               // get inputs
+               MatrixObject in1 = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
 
-    int rlen = (int)in1.getNumRows();
-    int clen = (int)in1.getNumColumns();
+               int rlen = (int) in1.getNumRows();
+               int clen = (int) in1.getNumColumns();
 
-    IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn;
-    if (indexFunction instanceof ReduceRow){  // COL{SUM, MAX...}
-      ec.setMetaData(_output.getName(), 1, clen);
-    } else if (indexFunction instanceof ReduceCol) { // ROW{SUM, MAX,...}
-      ec.setMetaData(_output.getName(), rlen, 1);
-    }
+               IndexFunction indexFunction = ((AggregateUnaryOperator) 
_optr).indexFn;
+               if (indexFunction instanceof ReduceRow) { // COL{SUM, MAX...}
+                       ec.setMetaData(_output.getName(), 1, clen);
+               }
+               else if (indexFunction instanceof ReduceCol) { // ROW{SUM, 
MAX,...}
+                       ec.setMetaData(_output.getName(), rlen, 1);
+               }
 
-    LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(0), getExtendedOpcode(), 
in1, _output.getName(), (AggregateUnaryOperator)_optr);
+               LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(0), 
getExtendedOpcode(),
+                               in1, _output.getName(), 
(AggregateUnaryOperator) _optr);
 
-    //release inputs/outputs
-    ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+               // release inputs/outputs
+               ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 
-    // If the unary aggregate is a row reduction or a column reduction, it 
results in a vector
-    // which needs to be released. Otherwise a scala is produced and it is 
copied back to the host
-    // and set in the execution context by invoking the setScalarOutput
-    if (indexFunction instanceof ReduceRow || indexFunction instanceof 
ReduceCol) {
-      ec.releaseMatrixOutputForGPUInstruction(_output.getName());
-    }
-  }
-}
+               // If the unary aggregate is a row reduction or a column 
reduction, it results
+               // in a vector
+               // which needs to be released. Otherwise a scala is produced 
and it is copied
+               // back to the host
+               // and set in the execution context by invoking the 
setScalarOutput
+               if (indexFunction instanceof ReduceRow || indexFunction 
instanceof ReduceCol) {
+                       
ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+               }
+       }
+}
\ No newline at end of file
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/ArithmeticBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/ArithmeticBinaryGPUInstruction.java
index f451910..5c7f6b9 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/ArithmeticBinaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/ArithmeticBinaryGPUInstruction.java
@@ -19,29 +19,18 @@
 
 package org.apache.sysds.runtime.instructions.gpu;
 
-import org.apache.commons.lang3.tuple.Pair;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.DMLRuntimeException;
-import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
-import org.apache.sysds.runtime.lineage.LineageItem;
-import org.apache.sysds.runtime.lineage.LineageItemUtils;
-import org.apache.sysds.runtime.lineage.LineageTraceable;
 import org.apache.sysds.runtime.matrix.operators.Operator;
 
-public abstract class ArithmeticBinaryGPUInstruction extends GPUInstruction 
implements LineageTraceable {
-       protected CPOperand _input1;
-       protected CPOperand _input2;
-       protected CPOperand _output;
+public abstract class ArithmeticBinaryGPUInstruction extends GPUInstruction {
 
        protected ArithmeticBinaryGPUInstruction(Operator op, CPOperand in1, 
CPOperand in2, CPOperand out, String opcode,
                        String istr) {
-               super(op, opcode, istr);
+               super(op, in1, in2, out, opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.ArithmeticBinary;
-               _input1 = in1;
-               _input2 = in2;
-               _output = out;
        }
 
        public static ArithmeticBinaryGPUInstruction parseInstruction ( String 
str ) {
@@ -70,10 +59,4 @@ public abstract class ArithmeticBinaryGPUInstruction extends 
GPUInstruction impl
                else
                        throw new DMLRuntimeException("Unsupported GPU 
ArithmeticInstruction.");
        }
-
-       @Override
-       public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
-               return Pair.of(_output.getName(), new LineageItem(getOpcode(),
-                       LineageItemUtils.getLineage(ec, _input1, _input2)));
-       }
 }
\ No newline at end of file
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java
index 82d3222..de604ba 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java
@@ -19,35 +19,24 @@
 
 package org.apache.sysds.runtime.instructions.gpu;
 
-import org.apache.commons.lang3.tuple.Pair;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.common.Types.ValueType;
 import org.apache.sysds.runtime.DMLRuntimeException;
-import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.ValueFunction;
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
-import org.apache.sysds.runtime.lineage.LineageItem;
-import org.apache.sysds.runtime.lineage.LineageItemUtils;
-import org.apache.sysds.runtime.lineage.LineageTraceable;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.Operator;
 
-public abstract class BuiltinBinaryGPUInstruction extends GPUInstruction 
implements LineageTraceable {
+public abstract class BuiltinBinaryGPUInstruction extends GPUInstruction {
        @SuppressWarnings("unused")
        private int _arity;
 
-       CPOperand output;
-       CPOperand input1, input2;
-
        protected BuiltinBinaryGPUInstruction(Operator op, CPOperand input1, 
CPOperand input2, CPOperand output,
                        String opcode, String istr, int _arity) {
-               super(op, opcode, istr);
+               super(op, input1, input2, output, opcode, istr);
                this._arity = _arity;
-               this.output = output;
-               this.input1 = input1;
-               this.input2 = input2;
        }
 
        public static BuiltinBinaryGPUInstruction parseInstruction(String str) {
@@ -88,10 +77,4 @@ public abstract class BuiltinBinaryGPUInstruction extends 
GPUInstruction impleme
                                "GPU : Unsupported GPU builtin operations on a 
matrix and a scalar:" + opcode);
        }
 
-       @Override
-       public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
-               return Pair.of(output.getName(), new LineageItem(getOpcode(),
-                       LineageItemUtils.getLineage(ec, input1, input2)));
-       }
-
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
index 8ad6d98..0812c81 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
@@ -29,16 +29,12 @@ import org.apache.sysds.runtime.matrix.operators.Operator;
 
 public abstract class BuiltinUnaryGPUInstruction extends GPUInstruction {
        int _arity;
-       CPOperand _input;
-       CPOperand _output;
 
        protected BuiltinUnaryGPUInstruction(Operator op, CPOperand in, 
CPOperand out, int _arity, String opcode,
                        String istr) {
-               super(op, opcode, istr);
+               super(op, in, null, out, opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.BuiltinUnary;
                this._arity = _arity;
-               _input = in;
-               _output = out;
        }
 
        public int getArity() {
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
index 615e194..4c51c70 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysds.runtime.instructions.gpu;
 
+import org.apache.commons.lang3.tuple.Pair;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
@@ -27,12 +28,18 @@ import 
org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.GPUInstructionParser;
 import org.apache.sysds.runtime.instructions.Instruction;
 import org.apache.sysds.runtime.instructions.cp.CPInstruction;
+import org.apache.sysds.runtime.instructions.cp.CPOperand;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysds.runtime.lineage.LineageItem;
+import org.apache.sysds.runtime.lineage.LineageItemUtils;
+import org.apache.sysds.runtime.lineage.LineageTraceable;
 import org.apache.sysds.runtime.matrix.operators.Operator;
 import org.apache.sysds.utils.Statistics;
 
-public abstract class GPUInstruction extends Instruction {
+public abstract class GPUInstruction extends Instruction implements 
LineageTraceable {
        private static final Log LOG = 
LogFactory.getLog(GPUInstruction.class.getName());
+       public final CPOperand _output;
+       public final CPOperand _input1, _input2;
 
        public enum GPUINSTRUCTION_TYPE {
                AggregateUnary,
@@ -152,14 +159,27 @@ public abstract class GPUInstruction extends Instruction {
 
        protected boolean _requiresLabelUpdate = false;
 
-       protected GPUInstruction(Operator op, String opcode, String istr) {
+       protected GPUInstruction(Operator op, CPOperand in1, CPOperand in2, 
CPOperand out, String opcode, String istr) {
                super(op);
+               _input1 = in1;
+               _input2 = in2;
+               _output = out;
                instString = istr;
 
                // prepare opcode and update requirement for repeated usage
                instOpcode = opcode;
                _requiresLabelUpdate = super.requiresLabelUpdate();
        }
+       
+       protected GPUInstruction(Operator op, String opcode, String istr) {
+               super(op);
+               _input1 = null;
+               _input2 = null;
+               _output = null;
+               instString = istr;
+               instOpcode = opcode;
+               _requiresLabelUpdate = super.requiresLabelUpdate();
+       }
 
        @Override
        public IType getType() {
@@ -231,4 +251,10 @@ public abstract class GPUInstruction extends Instruction {
        protected MatrixObject 
getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long 
numRows, long numCols) {
                return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, 
numCols).getKey();
        }
+
+       @Override
+       public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
+               return Pair.of(_output.getName(), new LineageItem(getOpcode(),
+                       LineageItemUtils.getLineage(ec, _input1, _input2)));
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MMTSJGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MMTSJGPUInstruction.java
index 0a435d7..29e50e0 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MMTSJGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MMTSJGPUInstruction.java
@@ -31,8 +31,6 @@ import org.apache.sysds.utils.GPUStatistics;
 
 public class MMTSJGPUInstruction extends GPUInstruction {
        private MMTSJType _type = null;
-       CPOperand _input;
-       CPOperand _output;
 
        /**
         * MMTSJGPUInstruction constructor.
@@ -51,11 +49,9 @@ public class MMTSJGPUInstruction extends GPUInstruction {
         *                      ?
         */
        private MMTSJGPUInstruction(Operator op, CPOperand in1, MMTSJType type, 
CPOperand out, String opcode, String istr) {
-               super(op, opcode, istr);
+               super(op, in1, null, out, opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.MMTSJ;
                _type = type;
-               _input = in1;
-               _output = out;
        }
 
        public static MMTSJGPUInstruction parseInstruction ( String str )
@@ -77,14 +73,14 @@ public class MMTSJGPUInstruction extends GPUInstruction {
        @Override
        public void processInstruction(ExecutionContext ec) {
                GPUStatistics.incrementNoOfExecutedGPUInst();
-               MatrixObject mat = getMatrixInputForGPUInstruction(ec, 
_input.getName());
+               MatrixObject mat = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
                boolean isLeftTransposed = ( _type == MMTSJType.LEFT);
                int rlen = (int) (isLeftTransposed? mat.getNumColumns() : 
mat.getNumRows());
                int clen = rlen;
                //execute operations 
                ec.setMetaData(_output.getName(), rlen, clen);
                LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
-               ec.releaseMatrixInputForGPUInstruction(_input.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
index a4084a4..e539e69 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
@@ -37,16 +37,11 @@ import org.apache.sysds.utils.GPUStatistics;
  */
 public class MatrixAppendGPUInstruction extends GPUInstruction {
 
-       CPOperand output;
-       CPOperand input1, input2;
        AppendCPInstruction.AppendType atype;
 
        private MatrixAppendGPUInstruction(Operator op, CPOperand in1, 
CPOperand in2, CPOperand out,
                        AppendCPInstruction.AppendType type, String opcode, 
String istr) {
-               super(op, opcode, istr);
-               this.output = out;
-               this.input1 = in1;
-               this.input2 = in2;
+               super(op, in1, in2, out, opcode, istr);
                this.atype = type;
        }
 
@@ -75,16 +70,16 @@ public class MatrixAppendGPUInstruction extends 
GPUInstruction {
        public void processInstruction(ExecutionContext ec) {
                GPUStatistics.incrementNoOfExecutedGPUInst();
                String opcode = getOpcode();
-               MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, 
input1.getName());
-               MatrixObject mat2 = getMatrixInputForGPUInstruction(ec, 
input2.getName());
+               MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
+               MatrixObject mat2 = getMatrixInputForGPUInstruction(ec, 
_input2.getName());
                if(atype == AppendCPInstruction.AppendType.CBIND)
-                       LibMatrixCUDA.cbind(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat1, mat2, output.getName());
+                       LibMatrixCUDA.cbind(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat1, mat2, _output.getName());
                else if (atype == AppendCPInstruction.AppendType.RBIND )
-                       LibMatrixCUDA.rbind(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat1, mat2, output.getName());
+                       LibMatrixCUDA.rbind(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat1, mat2, _output.getName());
                else
                        throw new DMLRuntimeException("Unsupported GPU 
operator:" + opcode);
-               ec.releaseMatrixInputForGPUInstruction(input1.getName());
-               ec.releaseMatrixInputForGPUInstruction(input2.getName());
-               ec.releaseMatrixOutputForGPUInstruction(output.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index 9a3ae12..c6ce962 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -44,7 +44,7 @@ public class MatrixBuiltinGPUInstruction extends 
BuiltinUnaryGPUInstruction {
                GPUStatistics.incrementNoOfExecutedGPUInst();
 
                String opcode = getOpcode();
-               MatrixObject mat = getMatrixInputForGPUInstruction(ec, 
_input.getName());
+               MatrixObject mat = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
                if(opcode != "ucumk+*")
                        ec.setMetaData(_output.getName(), mat.getNumRows(), 
mat.getNumColumns());
 
@@ -119,7 +119,7 @@ public class MatrixBuiltinGPUInstruction extends 
BuiltinUnaryGPUInstruction {
                        LOG.trace("processInstruction() " + getExtendedOpcode() 
+ " executed in " + duration + "ms.");
                }
 
-               ec.releaseMatrixInputForGPUInstruction(_input.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 }
\ No newline at end of file
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
index 8eb4567..eab4afa 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
@@ -20,12 +20,15 @@ package org.apache.sysds.runtime.instructions.gpu;
 
 import org.apache.sysds.lops.LeftIndex;
 import org.apache.sysds.lops.RightIndex;
+import org.apache.commons.lang3.tuple.Pair;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
+import org.apache.sysds.runtime.lineage.LineageItem;
+import org.apache.sysds.runtime.lineage.LineageItemUtils;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysds.runtime.matrix.operators.Operator;
 import org.apache.sysds.runtime.matrix.operators.SimpleOperator;
@@ -34,33 +37,25 @@ import org.apache.sysds.utils.GPUStatistics;
 
 public class MatrixIndexingGPUInstruction extends GPUInstruction {
        CPOperand rowLower, rowUpper, colLower, colUpper;
-       CPOperand input1;
-       CPOperand input2;
-       CPOperand output;
 
        private MatrixIndexingGPUInstruction(CPOperand in, CPOperand rl, 
CPOperand ru, CPOperand cl,
                        CPOperand cu, CPOperand out, String opcode, String 
istr) {
-               super(null, opcode, istr);
+               super(null, in, null, out, opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.MatrixIndexing;
                rowLower = rl;
                rowUpper = ru;
                colLower = cl;
                colUpper = cu;
-               input1 = in;
-               output = out;
        }
 
        private MatrixIndexingGPUInstruction(Operator op, CPOperand lhsInput, 
CPOperand rhsInput, CPOperand rl,
                        CPOperand ru, CPOperand cl, CPOperand cu, CPOperand 
out, String opcode, String istr) {
-               super(op, opcode, istr);
+               super(op, lhsInput, rhsInput, out, opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.MatrixIndexing;
                rowLower = rl;
                rowUpper = ru;
                colLower = cl;
                colUpper = cu;
-               input1 = lhsInput;
-               input2 = rhsInput;
-               output = out;
        }
 
        public static MatrixIndexingGPUInstruction parseInstruction ( String 
str ) {
@@ -123,10 +118,10 @@ public class MatrixIndexingGPUInstruction extends 
GPUInstruction {
                
                IndexRange ixrange = getIndexRange(ec);
                if ( opcode.equalsIgnoreCase(RightIndex.OPCODE) ) {
-                       MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, 
input1.getName());
-                       LibMatrixCUDA.sliceOperations(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat1, ixrange, output.getName());
-                       
ec.releaseMatrixInputForGPUInstruction(input1.getName());
-                       
ec.releaseMatrixOutputForGPUInstruction(output.getName());
+                       MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
+                       LibMatrixCUDA.sliceOperations(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat1, ixrange, _output.getName());
+                       
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+                       
ec.releaseMatrixOutputForGPUInstruction(_output.getName());
                }
                else {
                        throw new DMLRuntimeException("Unsupported GPU 
operator:" + opcode);
@@ -140,4 +135,10 @@ public class MatrixIndexingGPUInstruction extends 
GPUInstruction {
                        (int)(ec.getScalarInput(colLower).getLongValue()-1),
                        (int)(ec.getScalarInput(colUpper).getLongValue()-1));
        }
+
+       @Override
+       public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
+               return Pair.of(_output.getName(), new LineageItem(getOpcode(),
+                       LineageItemUtils.getLineage(ec, 
_input1,rowLower,rowUpper,colLower,colUpper)));
+       }
 }
\ No newline at end of file
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
index d1c6a9b..ff49b0d 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
@@ -40,20 +40,20 @@ public class MatrixMatrixBuiltinGPUInstruction extends 
BuiltinBinaryGPUInstructi
                GPUStatistics.incrementNoOfExecutedGPUInst();
 
                String opcode = getOpcode();
-               MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, 
input1.getName());
-               MatrixObject mat2 = getMatrixInputForGPUInstruction(ec, 
input2.getName());
+               MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
+               MatrixObject mat2 = getMatrixInputForGPUInstruction(ec, 
_input2.getName());
 
                if (opcode.equals("solve")) {
-                       ec.setMetaData(output.getName(), mat1.getNumColumns(), 
1);
-                       LibMatrixCUDA.solve(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat1, mat2, output.getName());
+                       ec.setMetaData(_output.getName(), mat1.getNumColumns(), 
1);
+                       LibMatrixCUDA.solve(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat1, mat2, _output.getName());
 
                }
                else {
                        throw new DMLRuntimeException("Unsupported GPU 
operator:" + opcode);
                }
-               ec.releaseMatrixInputForGPUInstruction(input1.getName());
-               ec.releaseMatrixInputForGPUInstruction(input2.getName());
-               ec.releaseMatrixOutputForGPUInstruction(output.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java
index aa5ee9e..97d39cc 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java
@@ -18,6 +18,7 @@
  */
 package org.apache.sysds.runtime.instructions.gpu;
 
+import org.apache.commons.lang3.tuple.Pair;
 import org.apache.sysds.common.Types.ValueType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
@@ -28,6 +29,8 @@ import org.apache.sysds.runtime.instructions.cp.BooleanObject;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
 import org.apache.sysds.runtime.instructions.gpu.context.ExecutionConfig;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysds.runtime.lineage.LineageItem;
+import org.apache.sysds.runtime.lineage.LineageItemUtils;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysds.runtime.matrix.operators.Operator;
 import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
@@ -37,20 +40,16 @@ import jcuda.Pointer;
 
 public class MatrixReshapeGPUInstruction extends GPUInstruction {
        
-       private final CPOperand _input;
-       private final CPOperand _output;
        private final CPOperand _opRows;
        private final CPOperand _opCols;
        private final CPOperand _opByRow;
        
        protected MatrixReshapeGPUInstruction(Operator op, String opcode, 
String istr, 
                        CPOperand in1, CPOperand in2, CPOperand in3, CPOperand 
in4, CPOperand out) {
-               super(op, opcode, istr);
-               _input = in1;
+               super(op, in1, null, out, opcode, istr);
                _opRows = in2;
                _opCols = in3;
                _opByRow = in4;
-               _output = out;
        }
        
        public static MatrixReshapeGPUInstruction parseInstruction ( String str 
) {
@@ -80,7 +79,7 @@ public class MatrixReshapeGPUInstruction extends 
GPUInstruction {
                GPUStatistics.incrementNoOfExecutedGPUInst();
                String instName = getExtendedOpcode();
                GPUContext gCtx = ec.getGPUContext(0); 
-               MatrixObject mat = getMatrixInputForGPUInstruction(ec, 
_input.getName());
+               MatrixObject mat = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
                if(rows*cols != mat.getNumRows()*mat.getNumColumns()) {
                        throw new DMLRuntimeException("Incorrect number of rows 
and cols in rshape instruction");
                }
@@ -100,8 +99,14 @@ public class MatrixReshapeGPUInstruction extends 
GPUInstruction {
                                LibMatrixCUDA.toInt(mat.getNumRows()), 
LibMatrixCUDA.toInt(mat.getNumColumns()),
                                rows, cols);
                }
-               ec.releaseMatrixInputForGPUInstruction(_input.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 
+       @Override
+       public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
+               return Pair.of(_output.getName(), new LineageItem(getOpcode(),
+                       LineageItemUtils.getLineage(ec, _input1, _opRows, 
_opCols, _opByRow)));
+       }
+
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/RelationalBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/RelationalBinaryGPUInstruction.java
index adeb13b..ae97fe9 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/RelationalBinaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/RelationalBinaryGPUInstruction.java
@@ -27,17 +27,10 @@ import org.apache.sysds.runtime.matrix.operators.Operator;
 
 public abstract class RelationalBinaryGPUInstruction extends GPUInstruction {
 
-       protected CPOperand _input1;
-       protected CPOperand _input2;
-       protected CPOperand _output;
-
        protected RelationalBinaryGPUInstruction(Operator op, CPOperand in1, 
CPOperand in2, CPOperand out, String opcode,
                        String istr) {
-               super(op, opcode, istr);
+               super(op, in1, in2, out, opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.RelationalBinary;
-               _input1 = in1;
-               _input2 = in2;
-               _output = out;
        }
 
        public static RelationalBinaryGPUInstruction parseInstruction ( String 
str ) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/ReorgGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/ReorgGPUInstruction.java
index a3e36d0..f55e16c 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/ReorgGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/ReorgGPUInstruction.java
@@ -19,24 +19,18 @@
 
 package org.apache.sysds.runtime.instructions.gpu;
 
-import org.apache.commons.lang3.tuple.Pair;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.functionobjects.SwapIndex;
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
-import org.apache.sysds.runtime.lineage.LineageItem;
-import org.apache.sysds.runtime.lineage.LineageItemUtils;
-import org.apache.sysds.runtime.lineage.LineageTraceable;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysds.runtime.matrix.operators.Operator;
 import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysds.utils.GPUStatistics;
 
-public class ReorgGPUInstruction extends GPUInstruction implements 
LineageTraceable {
-       private CPOperand _input;
-       private CPOperand _output;
+public class ReorgGPUInstruction extends GPUInstruction {
 
        /**
         * for opcodes r'
@@ -53,10 +47,8 @@ public class ReorgGPUInstruction extends GPUInstruction 
implements LineageTracea
         *            instruction string
         */
        private ReorgGPUInstruction(Operator op, CPOperand in, CPOperand out, 
String opcode, String istr) {
-               super(op, opcode, istr);
+               super(op, in, null, out, opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.Reorg;
-               _input = in;
-               _output = out;
        }
 
        public static ReorgGPUInstruction parseInstruction ( String str ) {
@@ -74,20 +66,14 @@ public class ReorgGPUInstruction extends GPUInstruction 
implements LineageTracea
        @Override
        public void processInstruction(ExecutionContext ec) {
                GPUStatistics.incrementNoOfExecutedGPUInst();
-               MatrixObject mat = getMatrixInputForGPUInstruction(ec, 
_input.getName());
+               MatrixObject mat = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
                int rlen = (int) mat.getNumColumns();
                int clen = (int) mat.getNumRows();
                //execute operation
                ec.setMetaData(_output.getName(), rlen, clen);
                LibMatrixCUDA.transpose(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat, _output.getName());
                //release inputs/outputs
-               ec.releaseMatrixInputForGPUInstruction(_input.getName());
+               ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
-
-       @Override
-       public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
-               return Pair.of(_output.getName(), new LineageItem(getOpcode(),
-                       LineageItemUtils.getLineage(ec, _input)));
-       }
 }
\ No newline at end of file
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/ScalarMatrixBuiltinGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/ScalarMatrixBuiltinGPUInstruction.java
index 4329189..233408e 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/ScalarMatrixBuiltinGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/ScalarMatrixBuiltinGPUInstruction.java
@@ -39,34 +39,36 @@ public class ScalarMatrixBuiltinGPUInstruction extends 
BuiltinBinaryGPUInstructi
                _gputype = GPUINSTRUCTION_TYPE.BuiltinUnary;
        }
 
-  @Override
-  public void processInstruction(ExecutionContext ec) {
-    GPUStatistics.incrementNoOfExecutedGPUInst();
+       @Override
+       public void processInstruction(ExecutionContext ec) {
+               GPUStatistics.incrementNoOfExecutedGPUInst();
 
-    String opcode = getOpcode();
-    CPOperand mat = ( input1.getDataType() == DataType.MATRIX ) ? input1 : 
input2;
-       CPOperand scalar = ( input1.getDataType() == DataType.MATRIX ) ? input2 
: input1;
-       MatrixObject in1 = getMatrixInputForGPUInstruction(ec, mat.getName());
-       ScalarObject constant = ec.getScalarInput(scalar);
-    
-    if(opcode.equals("max")) {
-       ec.setMetaData(output.getName(), in1.getNumRows(), in1.getNumColumns());
-       double constVal = constant.getDoubleValue();
-       if(constVal == 0)
-               LibMatrixCuDNN.relu(ec, ec.getGPUContext(0), 
getExtendedOpcode(), in1, output.getName());
-       else
-               LibMatrixCUDA.matrixScalarOp(ec, ec.getGPUContext(0), 
getExtendedOpcode(), in1, output.getName(), false, 
-                               
InstructionUtils.parseScalarBinaryOperator(opcode, false, constVal));
-    } else if(opcode.equals("min")) {
-       ec.setMetaData(output.getName(), in1.getNumRows(), in1.getNumColumns());
-       double constVal = constant.getDoubleValue();
-       LibMatrixCUDA.matrixScalarOp(ec, ec.getGPUContext(0), 
getExtendedOpcode(), in1, output.getName(), false, 
-                               
InstructionUtils.parseScalarBinaryOperator(opcode, false, constVal));
-    } else {
-      throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
-    }
-    ec.releaseMatrixInputForGPUInstruction(mat.getName());
-    ec.releaseMatrixOutputForGPUInstruction(output.getName());
-  }
+               String opcode = getOpcode();
+               CPOperand mat = (_input1.getDataType() == DataType.MATRIX) ? 
_input1 : _input2;
+               CPOperand scalar = (_input1.getDataType() == DataType.MATRIX) ? 
_input2 : _input1;
+               MatrixObject in1 = getMatrixInputForGPUInstruction(ec, 
mat.getName());
+               ScalarObject constant = ec.getScalarInput(scalar);
+
+               if (opcode.equals("max")) {
+                       ec.setMetaData(_output.getName(), in1.getNumRows(), 
in1.getNumColumns());
+                       double constVal = constant.getDoubleValue();
+                       if (constVal == 0)
+                               LibMatrixCuDNN.relu(ec, ec.getGPUContext(0), 
getExtendedOpcode(), in1, _output.getName());
+                       else
+                               LibMatrixCUDA.matrixScalarOp(ec, 
ec.getGPUContext(0), getExtendedOpcode(), in1, 
+                                       _output.getName(), false, 
InstructionUtils.parseScalarBinaryOperator(opcode, false, constVal));
+               }
+               else if (opcode.equals("min")) {
+                       ec.setMetaData(_output.getName(), in1.getNumRows(), 
in1.getNumColumns());
+                       double constVal = constant.getDoubleValue();
+                       LibMatrixCUDA.matrixScalarOp(ec, ec.getGPUContext(0), 
getExtendedOpcode(), in1,
+                               _output.getName(), false, 
InstructionUtils.parseScalarBinaryOperator(opcode, false, constVal));
+               }
+               else {
+                       throw new DMLRuntimeException("Unsupported GPU 
operator:" + opcode);
+               }
+               ec.releaseMatrixInputForGPUInstruction(mat.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+       }
 
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
index b00b12c..16e3c7e 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
@@ -254,6 +254,15 @@ public class GPUContext {
                return ret;
        }
        
+       /**
+        * Shallow copy the given source {@link GPUObject} to a new {@link 
GPUObject} and 
+        * assign that to the given {@link MatrixObject}.
+        * This copy doesn't memcopy the device memory.
+        * 
+        * @param source a {@link GPUObject} which is the source of the copy
+        * @param mo a {@link MatrixObject} to associate with the new {@link 
GPUObject}
+        * @return a new {@link GPUObject} instance
+        */
        public GPUObject shallowCopyGPUObject(GPUObject source, MatrixObject 
mo) {
                GPUObject ret = new GPUObject(this, source, mo);
                
getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(ret);
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index d55f934..df8e2c0 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -159,6 +159,10 @@ public class GPUObject {
                        jcudaSparseMatrixPtr = null;
                }
        }
+       
+       public void setDirty(boolean flag) {
+               dirty = flag;
+       }
        // 
----------------------------------------------------------------------
 
 
@@ -452,9 +456,9 @@ public class GPUObject {
                timestamp = new AtomicLong(that.timestamp.get());
                isSparse = that.isSparse;
                isLineageCached = that.isLineageCached;
-               if (isDensePointerNull())
+               if (!that.isDensePointerNull())
                        setDensePointer(that.getDensePointer());
-               if (getJcudaSparseMatrixPtr() != null)
+               if (that.getJcudaSparseMatrixPtr() != null)
                        
setSparseMatrixCudaPointer(that.getSparseMatrixCudaPointer());
                gpuContext = gCtx;
                this.mat = mat;
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index 20fcdc2..6d5254a 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -46,7 +46,6 @@ import 
org.apache.sysds.runtime.instructions.cp.MultiReturnBuiltinCPInstruction;
 import 
org.apache.sysds.runtime.instructions.cp.ParameterizedBuiltinCPInstruction;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
 import org.apache.sysds.runtime.instructions.fed.ComputationFEDInstruction;
-import org.apache.sysds.runtime.instructions.gpu.AggregateBinaryGPUInstruction;
 import org.apache.sysds.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.LineageCacheConfig.LineageCacheStatus;
@@ -93,10 +92,11 @@ public class LineageCache
                if (LineageCacheConfig.isReusable(inst, ec)) {
                        ComputationCPInstruction cinst = inst instanceof 
ComputationCPInstruction ? (ComputationCPInstruction)inst : null;
                        ComputationFEDInstruction cfinst = inst instanceof 
ComputationFEDInstruction ? (ComputationFEDInstruction)inst : null; 
+                       GPUInstruction gpuinst = inst instanceof GPUInstruction 
? (GPUInstruction)inst : null;
                                
                        LineageItem instLI = (cinst != null) ? 
cinst.getLineageItem(ec).getValue()
                                        : (cfinst != null) ? 
cfinst.getLineageItem(ec).getValue() 
-                                       : 
((LineageTraceable)inst).getLineageItem(ec).getValue();  //GPU instruction
+                                       : gpuinst.getLineageItem(ec).getValue();
                        List<MutablePair<LineageItem, LineageCacheEntry>> 
liList = null;
                        if (inst instanceof MultiReturnBuiltinCPInstruction) {
                                liList = new ArrayList<>();
@@ -134,8 +134,8 @@ public class LineageCache
                                                        
putIntern(item.getKey(), cinst.output.getDataType(), null, null,  0);
                                                else if (cfinst != null)
                                                        
putIntern(item.getKey(), cfinst.output.getDataType(), null, null,  0);
-                                               else if (inst instanceof 
AggregateBinaryGPUInstruction)
-                                                       
putIntern(item.getKey(), 
((AggregateBinaryGPUInstruction)inst)._output.getDataType(), null, null,  0);
+                                               else if (gpuinst != null)
+                                                       
putIntern(item.getKey(), gpuinst._output.getDataType(), null, null,  0);
                                                //FIXME: different o/p 
datatypes for MultiReturnBuiltins.
                                        }
                                }
@@ -154,16 +154,20 @@ public class LineageCache
                                                outName = 
cinst.output.getName();
                                        else if (inst instanceof 
ComputationFEDInstruction)
                                                outName = 
cfinst.output.getName();
-                                       else if (inst instanceof 
AggregateBinaryGPUInstruction)
-                                               outName = 
((AggregateBinaryGPUInstruction) inst)._output.getName();
+                                       else if (inst instanceof GPUInstruction)
+                                               outName = 
gpuinst._output.getName();
                                        
                                        if (e.isMatrixValue() && e._gpuPointer 
== null)
                                                ec.setMatrixOutput(outName, 
e.getMBValue());
                                        else if (e.isScalarValue())
                                                ec.setScalarOutput(outName, 
e.getSOValue());
-                                       else //TODO handle locks on gpu objects
+                                       else { //TODO handle locks on gpu 
objects
+                                               //shallow copy the cached 
GPUObj to the output MatrixObject
                                                
ec.getMatrixObject(outName).setGPUObject(ec.getGPUContext(0), 
                                                                
ec.getGPUContext(0).shallowCopyGPUObject(e._gpuPointer, 
ec.getMatrixObject(outName)));
+                                               //Set dirty to true, so that it 
is later copied to the host
+                                               
ec.getMatrixObject(outName).getGPUObject(ec.getGPUContext(0)).setDirty(true);
+                                       }
 
                                        reuse = true;
 
@@ -418,15 +422,27 @@ public class LineageCache
                                        liData.add(Pair.of(li, value));
                                }
                        }
-                       else if (inst instanceof AggregateBinaryGPUInstruction)
-                               liGpuObj = 
ec.getMatrixObject(((AggregateBinaryGPUInstruction) 
inst)._output).getGPUObject(ec.getGPUContext(0));
+                       else if (inst instanceof GPUInstruction) {
+                               // TODO: gpu multi-return instructions
+                               Data gpudata = ec.getVariable(((GPUInstruction) 
inst)._output);
+                               liGpuObj = gpudata instanceof MatrixObject ? 
+                                               
ec.getMatrixObject(((GPUInstruction)inst)._output).getGPUObject(ec.getGPUContext(0))
 : null;
+
+                               // Scalar gpu intermediates are always copied 
back to host. 
+                               // No need to cache the GPUobj for scalar 
intermediates.
+                               if (liGpuObj == null)
+                                       liData = Arrays.asList(Pair.of(instLI, 
ec.getVariable(((GPUInstruction)inst)._output)));
+                       }
                        else
                                liData = inst instanceof 
ComputationCPInstruction ? 
                                                Arrays.asList(Pair.of(instLI, 
ec.getVariable(((ComputationCPInstruction) inst).output))) :
                                                Arrays.asList(Pair.of(instLI, 
ec.getVariable(((ComputationFEDInstruction) inst).output)));
                        synchronized( _cache ) {
                                if (liGpuObj != null) {
+                                       // No need to make space as the entry 
is in gpu
+                                       // TODO: account gpu memory. Eviction
                                        LineageCacheEntry centry = 
_cache.get(instLI);
+                                       // Cache the GPUObj for future reuse
                                        liGpuObj.setIsLinCached(true);
                                        centry._gpuPointer = liGpuObj;
                                        centry._computeTime = computetime;
@@ -664,16 +680,13 @@ public class LineageCache
                if (!LineageCacheConfig.getCompAssRW())
                        return true;
                
-               if (inst instanceof GPUInstruction)
-                       return true;
-
-               CPOperand output = inst instanceof ComputationCPInstruction ? 
-                               ((ComputationCPInstruction)inst).output :
-                               ((ComputationFEDInstruction)inst).output;
+               CPOperand output = inst instanceof ComputationCPInstruction ? 
((ComputationCPInstruction)inst).output 
+                               : inst instanceof ComputationFEDInstruction ? 
((ComputationFEDInstruction)inst).output
+                               : ((GPUInstruction)inst)._output;
                if (output.isMatrix()) {
-                       MatrixObject mo = inst instanceof 
ComputationCPInstruction ? 
-                                       
ec.getMatrixObject(((ComputationCPInstruction)inst).output) :
-                                       
ec.getMatrixObject(((ComputationFEDInstruction)inst).output);
+                       MatrixObject mo = inst instanceof 
ComputationCPInstruction ? 
ec.getMatrixObject(((ComputationCPInstruction)inst).output) 
+                               : inst instanceof ComputationFEDInstruction ? 
ec.getMatrixObject(((ComputationFEDInstruction)inst).output)
+                               : 
ec.getMatrixObject(((GPUInstruction)inst)._output);
                        //limit this to full reuse as partial reuse is 
applicable even for loop dependent operation
                        return !(LineageCacheConfig.getCacheType() == 
ReuseCacheType.REUSE_FULL  
                                && !mo.isMarked());
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 680a283..5ceb647 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -210,7 +210,7 @@ public class LineageCacheConfig
                        long c2 = 
ec.getMatrixObject(fedinst.input2).getNumColumns();
                        return(c1 == 1 || c2 == 1);
                }
-               else { //CPInstruction
+               else if (inst instanceof ComputationCPInstruction) { 
//CPInstruction
                        ComputationCPInstruction cpinst = 
(ComputationCPInstruction) inst;
                        if( !cpinst.input1.isMatrix() || 
!cpinst.input2.isMatrix() )
                                return false;
@@ -218,6 +218,14 @@ public class LineageCacheConfig
                        long c2 = 
ec.getMatrixObject(cpinst.input2).getNumColumns();
                        return(c1 == 1 || c2 == 1);
                }
+               else { //GPUInstruction
+                       GPUInstruction gpuinst = (GPUInstruction)inst;
+                       if( !gpuinst._input1.isMatrix() || 
!gpuinst._input2.isMatrix() )
+                               return false;
+                       long c1 = 
ec.getMatrixObject(gpuinst._input1).getNumColumns();
+                       long c2 = 
ec.getMatrixObject(gpuinst._input2).getNumColumns();
+                       return(c1 == 1 || c2 == 1);
+               }
        }
        
        public static boolean isOutputFederated(Instruction inst, Data data) {
diff --git 
a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java 
b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
index a5d1cd2..6451487 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
@@ -38,6 +38,7 @@ public class GPUFullReuseTest extends AutomatedTestBase{
        
        protected static final String TEST_DIR = "functions/lineage/";
        protected static final String TEST_NAME1 = "FullReuseGPU1"; 
+       protected static final String TEST_NAME2 = "LineageTraceGPU1"; 
        protected String TEST_CLASS_DIR = TEST_DIR + 
GPUFullReuseTest.class.getSimpleName() + "/";
        
        @BeforeClass
@@ -50,17 +51,24 @@ public class GPUFullReuseTest extends AutomatedTestBase{
        public void setUp() {
                TestUtils.clearAssertionInformation();
                addTestConfiguration( TEST_NAME1, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] {"R"}) );
+               addTestConfiguration( TEST_NAME2, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] {"R"}) );
        }
        
        @Test
-       public void ReuseSingleInst() {           //reuse ba+*
+       public void ReuseAggBin() {           //reuse AggregateBinary and sum
                testLineageTraceExec(TEST_NAME1);
        }
+
+       @Test
+       public void ReuseSimpleHLM() {        //hyper-parameter tuning over LM 
(simple)
+               testLineageTraceExec(TEST_NAME2);
+       }
        
        private void testLineageTraceExec(String testname) {
                System.out.println("------------ BEGIN " + testname + 
"------------");
                getAndLoadTestConfiguration(testname);
 
+               AutomatedTestBase.TEST_GPU = true;  //adds '-gpu'
                List<String> proArgs = new ArrayList<>();
                proArgs.add("-stats");
                proArgs.add("-args");
@@ -72,7 +80,6 @@ public class GPUFullReuseTest extends AutomatedTestBase{
                runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
                HashMap<MatrixValue.CellIndex, Double> R_orig = 
readDMLMatrixFromOutputDir("R");
                
-               AutomatedTestBase.TEST_GPU = true;  //adds '-gpu'
                proArgs.add("-stats");
                proArgs.add("-lineage");
                proArgs.add("reuse_full");
diff --git a/src/test/scripts/functions/lineage/FullReuseGPU1.dml 
b/src/test/scripts/functions/lineage/FullReuseGPU1.dml
index 58307dc..9e37a4f 100644
--- a/src/test/scripts/functions/lineage/FullReuseGPU1.dml
+++ b/src/test/scripts/functions/lineage/FullReuseGPU1.dml
@@ -20,10 +20,12 @@
 #-------------------------------------------------------------
 X = rand(rows=1000, cols=100, sparsity=1, seed=42);
 y = rand(rows=100, cols=100, sparsity=1, seed=42);
+R = matrix(0, rows=1, cols=100);
+
 for (i in 1:10) {
   tmp = X %*% y;
+  R[1,i] = sum(tmp);
 }
-R = tmp;
 
 write(R, $1, format="text");

[systemds] branch master updated: [SYSTEMDS-2913] Refactor GPUInstruction to support reuse better

Reply via email to