This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new b8bdfa8  [SYSTEMDS-2913] Lineage-based reuse of GPU intermediates
b8bdfa8 is described below

commit b8bdfa8cef3563bd216615c4355a8c5ca097eeb9
Author: arnabp <[email protected]>
AuthorDate: Thu Mar 25 15:43:28 2021 +0100

    [SYSTEMDS-2913] Lineage-based reuse of GPU intermediates
    
    This patch adds an initial implementation of lineage-based reuse
    of GPU memory pointers. Reuse is currently enabled only for
    AggregateBinaryGPU instructions; minor code refactoring is
    needed to support reuse for all GPU operations in general.
---
 .../gpu/AggregateBinaryGPUInstruction.java         |  2 +-
 .../instructions/gpu/context/GPUContext.java       |  6 +++
 .../instructions/gpu/context/GPUObject.java        | 29 ++++++++++-
 .../apache/sysds/runtime/lineage/LineageCache.java | 37 ++++++++++++--
 .../sysds/runtime/lineage/LineageCacheConfig.java  |  2 +
 .../sysds/runtime/lineage/LineageCacheEntry.java   |  8 +++
 ...eageTraceGPUTest.java => GPUFullReuseTest.java} | 57 ++++++++++++----------
 .../functions/lineage/LineageTraceGPUTest.java     |  5 +-
 .../scripts/functions/lineage/FullReuseGPU1.dml    | 29 +++++++++++
 9 files changed, 139 insertions(+), 36 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index 4d8f70c..6445135 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -41,7 +41,7 @@ import org.apache.sysds.utils.GPUStatistics;
 public class AggregateBinaryGPUInstruction extends GPUInstruction implements 
LineageTraceable {
        private CPOperand _input1 = null;
        private CPOperand _input2 = null;
-       private CPOperand _output = null;
+       public CPOperand _output = null;
        private boolean _isLeftTransposed;
        private boolean _isRightTransposed;
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
index 6873391..b00b12c 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
@@ -253,6 +253,12 @@ public class GPUContext {
                
getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(ret);
                return ret;
        }
+       
+       public GPUObject shallowCopyGPUObject(GPUObject source, MatrixObject 
mo) {
+               GPUObject ret = new GPUObject(this, source, mo);
+               
getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(ret);
+               return ret;
+       }
 
        /**
         * Gets the device properties for the active GPU (set with 
cudaSetDevice()).
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index 1245502..d55f934 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -102,6 +102,11 @@ public class GPUObject {
         */
        final ShadowBuffer shadowBuffer;
        
+       /**
+        * whether cached in lineage cache
+        */
+       private boolean isLineageCached = false;
+       
        // 
----------------------------------------------------------------------
        // Methods used to access, set and check jcudaDenseMatrixPtr
        
@@ -432,7 +437,7 @@ public class GPUObject {
        /**
         * Initializes this GPUObject with a {@link MatrixObject} instance 
which will contain metadata about the enclosing matrix block
         *
-        * @param mat2 the matrix block that owns this {@link GPUObject}
+        * @param mat2 the matrix object that owns this {@link GPUObject}
         */
        GPUObject(GPUContext gCtx, MatrixObject mat2) {
                gpuContext = gCtx;
@@ -440,6 +445,22 @@ public class GPUObject {
                this.shadowBuffer = new ShadowBuffer(this);
        }
 
+       public GPUObject(GPUContext gCtx, GPUObject that, MatrixObject mat) {
+               dirty = that.dirty;
+               readLocks.reset();
+               writeLock = false;
+               timestamp = new AtomicLong(that.timestamp.get());
+               isSparse = that.isSparse;
+               isLineageCached = that.isLineageCached;
+               if (isDensePointerNull())
+                       setDensePointer(that.getDensePointer());
+               if (getJcudaSparseMatrixPtr() != null)
+                       
setSparseMatrixCudaPointer(that.getSparseMatrixCudaPointer());
+               gpuContext = gCtx;
+               this.mat = mat;
+               shadowBuffer = new ShadowBuffer(this);
+       }
+
        public boolean isSparse() {
                return isSparse;
        }
@@ -959,6 +980,8 @@ public class GPUObject {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : clearData on " + this + ", 
GPUContext=" + getGPUContext());
                }
+               if (isLineageCached)
+                       return;
                if (!isDensePointerNull()) {
                        getGPUContext().cudaFreeHelper(opcode, 
getDensePointer(), eager);
                }
@@ -989,6 +1012,10 @@ public class GPUObject {
        public boolean isDirty() {
                return dirty;
        }
+       
+       public void setIsLinCached(boolean val) {
+               isLineageCached = val;
+       }
 
        @Override
        public String toString() {
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index 5010c1c..20fcdc2 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -46,6 +46,9 @@ import 
org.apache.sysds.runtime.instructions.cp.MultiReturnBuiltinCPInstruction;
 import 
org.apache.sysds.runtime.instructions.cp.ParameterizedBuiltinCPInstruction;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
 import org.apache.sysds.runtime.instructions.fed.ComputationFEDInstruction;
+import org.apache.sysds.runtime.instructions.gpu.AggregateBinaryGPUInstruction;
+import org.apache.sysds.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.LineageCacheConfig.LineageCacheStatus;
 import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -91,7 +94,9 @@ public class LineageCache
                        ComputationCPInstruction cinst = inst instanceof 
ComputationCPInstruction ? (ComputationCPInstruction)inst : null;
                        ComputationFEDInstruction cfinst = inst instanceof 
ComputationFEDInstruction ? (ComputationFEDInstruction)inst : null; 
                                
-                       LineageItem instLI = (cinst != null) ? 
cinst.getLineageItem(ec).getValue():cfinst.getLineageItem(ec).getValue();
+                       LineageItem instLI = (cinst != null) ? 
cinst.getLineageItem(ec).getValue()
+                                       : (cfinst != null) ? 
cfinst.getLineageItem(ec).getValue() 
+                                       : 
((LineageTraceable)inst).getLineageItem(ec).getValue();  //GPU instruction
                        List<MutablePair<LineageItem, LineageCacheEntry>> 
liList = null;
                        if (inst instanceof MultiReturnBuiltinCPInstruction) {
                                liList = new ArrayList<>();
@@ -127,8 +132,10 @@ public class LineageCache
                                        if(e == null && 
isMarkedForCaching(inst, ec)) {
                                                if (cinst != null)
                                                        
putIntern(item.getKey(), cinst.output.getDataType(), null, null,  0);
-                                               else
+                                               else if (cfinst != null)
                                                        
putIntern(item.getKey(), cfinst.output.getDataType(), null, null,  0);
+                                               else if (inst instanceof 
AggregateBinaryGPUInstruction)
+                                                       
putIntern(item.getKey(), 
((AggregateBinaryGPUInstruction)inst)._output.getDataType(), null, null,  0);
                                                //FIXME: different o/p 
datatypes for MultiReturnBuiltins.
                                        }
                                }
@@ -145,13 +152,19 @@ public class LineageCache
                                                        
getOutput(entry.getKey().getOpcode().charAt(entry.getKey().getOpcode().length()-1)-'0').getName();
 
                                        else if (inst instanceof 
ComputationCPInstruction)
                                                outName = 
cinst.output.getName();
-                                       else
+                                       else if (inst instanceof 
ComputationFEDInstruction)
                                                outName = 
cfinst.output.getName();
+                                       else if (inst instanceof 
AggregateBinaryGPUInstruction)
+                                               outName = 
((AggregateBinaryGPUInstruction) inst)._output.getName();
                                        
-                                       if (e.isMatrixValue())
+                                       if (e.isMatrixValue() && e._gpuPointer 
== null)
                                                ec.setMatrixOutput(outName, 
e.getMBValue());
-                                       else
+                                       else if (e.isScalarValue())
                                                ec.setScalarOutput(outName, 
e.getSOValue());
+                                       else //TODO handle locks on gpu objects
+                                               
ec.getMatrixObject(outName).setGPUObject(ec.getGPUContext(0), 
+                                                               
ec.getGPUContext(0).shallowCopyGPUObject(e._gpuPointer, 
ec.getMatrixObject(outName)));
+
                                        reuse = true;
 
                                        if (DMLScript.STATISTICS) //increment 
saved time
@@ -393,6 +406,7 @@ public class LineageCache
                if (LineageCacheConfig.isReusable(inst, ec) ) {
                        //if (!isMarkedForCaching(inst, ec)) return;
                        List<Pair<LineageItem, Data>> liData = null;
+                       GPUObject liGpuObj = null;
                        LineageItem instLI = ((LineageTraceable) 
inst).getLineageItem(ec).getValue();
                        if (inst instanceof MultiReturnBuiltinCPInstruction) {
                                liData = new ArrayList<>();
@@ -404,11 +418,21 @@ public class LineageCache
                                        liData.add(Pair.of(li, value));
                                }
                        }
+                       else if (inst instanceof AggregateBinaryGPUInstruction)
+                               liGpuObj = 
ec.getMatrixObject(((AggregateBinaryGPUInstruction) 
inst)._output).getGPUObject(ec.getGPUContext(0));
                        else
                                liData = inst instanceof 
ComputationCPInstruction ? 
                                                Arrays.asList(Pair.of(instLI, 
ec.getVariable(((ComputationCPInstruction) inst).output))) :
                                                Arrays.asList(Pair.of(instLI, 
ec.getVariable(((ComputationFEDInstruction) inst).output)));
                        synchronized( _cache ) {
+                               if (liGpuObj != null) {
+                                       LineageCacheEntry centry = 
_cache.get(instLI);
+                                       liGpuObj.setIsLinCached(true);
+                                       centry._gpuPointer = liGpuObj;
+                                       centry._computeTime = computetime;
+                                       centry._status = 
LineageCacheStatus.CACHED;
+                                       return;
+                               }
                                for (Pair<LineageItem, Data> entry : liData) {
                                        LineageItem item = entry.getKey();
                                        Data data = entry.getValue();
@@ -639,6 +663,9 @@ public class LineageCache
        private static boolean isMarkedForCaching (Instruction inst, 
ExecutionContext ec) {
                if (!LineageCacheConfig.getCompAssRW())
                        return true;
+               
+               if (inst instanceof GPUInstruction)
+                       return true;
 
                CPOperand output = inst instanceof ComputationCPInstruction ? 
                                ((ComputationCPInstruction)inst).output :
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 9ffc7ef..680a283 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -32,6 +32,7 @@ import 
org.apache.sysds.runtime.instructions.cp.DataGenCPInstruction;
 import org.apache.sysds.runtime.instructions.cp.ListIndexingCPInstruction;
 import org.apache.sysds.runtime.instructions.cp.MatrixIndexingCPInstruction;
 import org.apache.sysds.runtime.instructions.fed.ComputationFEDInstruction;
+import org.apache.sysds.runtime.instructions.gpu.GPUInstruction;
 
 import java.util.Comparator;
 
@@ -188,6 +189,7 @@ public class LineageCacheConfig
        public static boolean isReusable (Instruction inst, ExecutionContext 
ec) {
                boolean insttype = inst instanceof ComputationCPInstruction 
                        || inst instanceof ComputationFEDInstruction
+                       || inst instanceof GPUInstruction
                        && !(inst instanceof ListIndexingCPInstruction);
                boolean rightop = (ArrayUtils.contains(REUSE_OPCODES, 
inst.getOpcode())
                        || (inst.getOpcode().equals("append") && 
isVectorAppend(inst, ec))
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
index 1562d60..78b5aad 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
@@ -24,6 +24,7 @@ import java.util.Map;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.LineageCacheConfig.LineageCacheStatus;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
@@ -39,6 +40,7 @@ public class LineageCacheEntry {
        protected LineageItem _origItem;
        private String _outfile = null;
        protected double score;
+       protected GPUObject _gpuPointer;
        
        public LineageCacheEntry(LineageItem key, DataType dt, MatrixBlock 
Mval, ScalarObject Sval, long computetime) {
                _key = key;
@@ -49,6 +51,8 @@ public class LineageCacheEntry {
                _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.CACHED;
                _nextEntry = null;
                _origItem = null;
+               _outfile = null;
+               _gpuPointer = null;
        }
        
        protected synchronized void setCacheStatus(LineageCacheStatus st) {
@@ -98,6 +102,10 @@ public class LineageCacheEntry {
        public boolean isMatrixValue() {
                return _dt.isMatrix();
        }
+
+       public boolean isScalarValue() {
+               return _dt.isScalar();
+       }
        
        public synchronized void setValue(MatrixBlock val, long computetime) {
                _MBval = val;
diff --git 
a/src/test/java/org/apache/sysds/test/functions/lineage/LineageTraceGPUTest.java
 b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
similarity index 60%
copy from 
src/test/java/org/apache/sysds/test/functions/lineage/LineageTraceGPUTest.java
copy to 
src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
index 405ead7..a5d1cd2 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/lineage/LineageTraceGPUTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
@@ -23,28 +23,28 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 
-import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
-import org.apache.sysds.runtime.instructions.cp.Data;
 import org.apache.sysds.runtime.lineage.Lineage;
-import org.apache.sysds.runtime.lineage.LineageRecomputeUtils;
-import org.apache.sysds.runtime.matrix.data.MatrixBlock;
-import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
 import org.apache.sysds.test.AutomatedTestBase;
 import org.apache.sysds.test.TestConfiguration;
 import org.apache.sysds.test.TestUtils;
+import org.junit.Assume;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import jcuda.runtime.cudaError;
 
-public class LineageTraceGPUTest extends AutomatedTestBase{
+public class GPUFullReuseTest extends AutomatedTestBase{
        
        protected static final String TEST_DIR = "functions/lineage/";
-       protected static final String TEST_NAME1 = "LineageTraceGPU1"; 
-       protected String TEST_CLASS_DIR = TEST_DIR + 
LineageTraceGPUTest.class.getSimpleName() + "/";
-       
-       protected static final int numRecords = 10;
-       protected static final int numFeatures = 5;
+       protected static final String TEST_NAME1 = "FullReuseGPU1"; 
+       protected String TEST_CLASS_DIR = TEST_DIR + 
GPUFullReuseTest.class.getSimpleName() + "/";
        
+       @BeforeClass
+       public static void checkGPU() {
+               // Skip all the tests if no GPU is available
+               Assume.assumeTrue(TestUtils.isGPUAvailable() == 
cudaError.cudaSuccess);
+       }
        
        @Override
        public void setUp() {
@@ -53,39 +53,42 @@ public class LineageTraceGPUTest extends AutomatedTestBase{
        }
        
        @Test
-       public void simpleHLM_gpu() {              //hyper-parameter tuning 
over LM (simple)
+       public void ReuseSingleInst() {           //reuse ba+*
                testLineageTraceExec(TEST_NAME1);
        }
        
        private void testLineageTraceExec(String testname) {
                System.out.println("------------ BEGIN " + testname + 
"------------");
-               
-               int gpuStatus = TestUtils.isGPUAvailable(); 
                getAndLoadTestConfiguration(testname);
+
                List<String> proArgs = new ArrayList<>();
+               proArgs.add("-stats");
+               proArgs.add("-args");
+               proArgs.add(output("R"));
+               programArgs = proArgs.toArray(new String[proArgs.size()]);
+               fullDMLScriptName = getScript();
+               
+               //run the test
+               runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+               HashMap<MatrixValue.CellIndex, Double> R_orig = 
readDMLMatrixFromOutputDir("R");
                
+               AutomatedTestBase.TEST_GPU = true;  //adds '-gpu'
                proArgs.add("-stats");
-               if (gpuStatus == cudaError.cudaSuccess)
-                       proArgs.add("-gpu");
                proArgs.add("-lineage");
+               proArgs.add("reuse_full");
                proArgs.add("-args");
                proArgs.add(output("R"));
-               proArgs.add(String.valueOf(numRecords));
-               proArgs.add(String.valueOf(numFeatures));
                programArgs = proArgs.toArray(new String[proArgs.size()]);
                fullDMLScriptName = getScript();
                
                Lineage.resetInternalState();
                //run the test
                runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
-               
-               //get lineage and generate program
-               String Rtrace = readDMLLineageFromHDFS("R");
-               //NOTE: the generated program is CP-only.
-               Data ret = 
LineageRecomputeUtils.parseNComputeLineageTrace(Rtrace, null);
-               
-               HashMap<CellIndex, Double> dmlfile = 
readDMLMatrixFromOutputDir("R");
-               MatrixBlock tmp = ((MatrixObject)ret).acquireReadAndRelease();
-               TestUtils.compareMatrices(dmlfile, tmp, 1e-6);
+               AutomatedTestBase.TEST_GPU = false;
+               HashMap<MatrixValue.CellIndex, Double> R_reused = 
readDMLMatrixFromOutputDir("R");
+
+               //compare results 
+               TestUtils.compareMatrices(R_orig, R_reused, 1e-6, "Origin", 
"Reused");
        }
 }
+
diff --git 
a/src/test/java/org/apache/sysds/test/functions/lineage/LineageTraceGPUTest.java
 
b/src/test/java/org/apache/sysds/test/functions/lineage/LineageTraceGPUTest.java
index 405ead7..031ed65 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/lineage/LineageTraceGPUTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/lineage/LineageTraceGPUTest.java
@@ -61,12 +61,12 @@ public class LineageTraceGPUTest extends AutomatedTestBase{
                System.out.println("------------ BEGIN " + testname + 
"------------");
                
                int gpuStatus = TestUtils.isGPUAvailable(); 
+               if (gpuStatus == cudaError.cudaSuccess)
+                       AutomatedTestBase.TEST_GPU = true;  //adds '-gpu'
                getAndLoadTestConfiguration(testname);
                List<String> proArgs = new ArrayList<>();
                
                proArgs.add("-stats");
-               if (gpuStatus == cudaError.cudaSuccess)
-                       proArgs.add("-gpu");
                proArgs.add("-lineage");
                proArgs.add("-args");
                proArgs.add(output("R"));
@@ -81,6 +81,7 @@ public class LineageTraceGPUTest extends AutomatedTestBase{
                
                //get lineage and generate program
                String Rtrace = readDMLLineageFromHDFS("R");
+               AutomatedTestBase.TEST_GPU = false;
                //NOTE: the generated program is CP-only.
                Data ret = 
LineageRecomputeUtils.parseNComputeLineageTrace(Rtrace, null);
                
diff --git a/src/test/scripts/functions/lineage/FullReuseGPU1.dml 
b/src/test/scripts/functions/lineage/FullReuseGPU1.dml
new file mode 100644
index 0000000..58307dc
--- /dev/null
+++ b/src/test/scripts/functions/lineage/FullReuseGPU1.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+X = rand(rows=1000, cols=100, sparsity=1, seed=42);
+y = rand(rows=100, cols=100, sparsity=1, seed=42);
+for (i in 1:10) {
+  tmp = X %*% y;
+}
+R = tmp;
+
+write(R, $1, format="text");
+

Reply via email to