[SYSTEMML-445] [WIP] Initial version of GPU backend
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/f306b0b1 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/f306b0b1 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/f306b0b1 Branch: refs/heads/master Commit: f306b0b1ecd97fdb5755989f77728d1ca81358c5 Parents: 8c37e2e Author: Niketan Pansare <npan...@us.ibm.com> Authored: Wed Jun 15 10:02:44 2016 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Wed Jun 15 10:03:48 2016 -0700 ---------------------------------------------------------------------- docs/devdocs/gpu-backend.md | 70 ++++ pom.xml | 63 ++++ .../java/org/apache/sysml/api/DMLScript.java | 27 ++ .../java/org/apache/sysml/hops/AggBinaryOp.java | 44 ++- .../org/apache/sysml/hops/ConvolutionOp.java | 39 +- .../org/apache/sysml/hops/OptimizerUtils.java | 2 + .../java/org/apache/sysml/hops/ReorgOp.java | 6 + src/main/java/org/apache/sysml/lops/Binary.java | 19 +- src/main/java/org/apache/sysml/lops/Lop.java | 2 +- .../org/apache/sysml/lops/LopProperties.java | 2 +- .../runtime/controlprogram/ProgramBlock.java | 7 +- .../controlprogram/caching/CacheableData.java | 30 ++ .../controlprogram/caching/MatrixObject.java | 12 + .../context/ExecutionContext.java | 91 ++++- .../controlprogram/context/GPUContext.java | 70 ++++ .../controlprogram/context/GPUObject.java | 156 ++++++++ .../controlprogram/context/JCudaContext.java | 146 ++++++++ .../controlprogram/context/JCudaObject.java | 276 ++++++++++++++ .../instructions/GPUInstructionParser.java | 87 +++++ .../sysml/runtime/instructions/Instruction.java | 3 +- .../runtime/instructions/InstructionParser.java | 5 + .../runtime/instructions/InstructionUtils.java | 13 + .../runtime/instructions/cp/CPInstruction.java | 5 +- .../gpu/AggregateBinaryGPUInstruction.java | 152 ++++++++ .../gpu/ConvolutionGPUInstruction.java | 207 +++++++++++ .../runtime/matrix/data/LibMatrixCUDA.java | 
359 +++++++++++++++++++ .../sysml/runtime/util/ConvolutionUtils.java | 127 +++++++ .../java/org/apache/sysml/utils/Statistics.java | 55 +++ .../test/integration/AutomatedTestBase.java | 17 +- .../binary/matrix/MatrixMultiplicationTest.java | 27 ++ 30 files changed, 2091 insertions(+), 28 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/docs/devdocs/gpu-backend.md ---------------------------------------------------------------------- diff --git a/docs/devdocs/gpu-backend.md b/docs/devdocs/gpu-backend.md new file mode 100644 index 0000000..c6f66d6 --- /dev/null +++ b/docs/devdocs/gpu-backend.md @@ -0,0 +1,70 @@ +<!-- +{% comment %} +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to you under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +{% endcomment %} +--> + +# Initial prototype for GPU backend + +A GPU backend implements two important abstract classes: +1. `org.apache.sysml.runtime.controlprogram.context.GPUContext` +2. `org.apache.sysml.runtime.controlprogram.context.GPUObject` + +The GPUContext is responsible for GPU memory management and initialization/destruction of Cuda handles. + +A GPUObject (like RDDObject and BroadcastObject) is stored in CacheableData object. 
It gets call-backs from SystemML's bufferpool on the following methods +1. void acquireDeviceRead() +2. void acquireDenseDeviceModify(int numElemsToAllocate) +3. void acquireHostRead() +4. void acquireHostModify() +5. void release(boolean isGPUCopyModified) + +## JCudaContext: +The current prototype supports Nvidia's CUDA libraries using the JCuda wrapper. The implementation for the above classes can be found in: +1. `org.apache.sysml.runtime.controlprogram.context.JCudaContext` +2. `org.apache.sysml.runtime.controlprogram.context.JCudaObject` + +### Setup instructions for JCudaContext: + +1. Follow the instructions from `https://developer.nvidia.com/cuda-downloads` and install CUDA 7.5. +2. Follow the instructions from `https://developer.nvidia.com/cudnn` and install CuDNN v4. +3. Download and install JCuda binaries version 0.7.5b and JCudnn version 0.7.5. The easiest option would be to use mavenized jcuda: +```bash +git clone https://github.com/MysterionRise/mavenized-jcuda.git +mvn -Djcuda.version=0.7.5b -Djcudnn.version=0.7.5 clean package +CURR_DIR=`pwd` +JCUDA_PATH=$CURR_DIR"/target/lib/" +JAR_PATH="." +for j in `ls $JCUDA_PATH/*.jar` +do + JAR_PATH=$JAR_PATH":"$j +done +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$JCUDA_PATH +``` + +Note for Windows users: +* CuDNN v4 is available to download: `http://developer.download.nvidia.com/compute/redist/cudnn/v4/cudnn-7.0-win-x64-v4.0-prod.zip` +* If the above steps don't work for JCuda, copy the DLLs into the C:\lib (or /lib) directory. + +To use SystemML's GPU backend, +1. Add JCuda's jar into the classpath. +2. Include CUDA, CuDNN and JCuda's libraries in LD_LIBRARY_PATH (or using -Djava.library.path). +3. Use the `-gpu` flag. + +For example: to use the GPU backend in standalone mode: +```bash +java -classpath $JAR_PATH:systemml-0.10.0-incubating-SNAPSHOT-standalone.jar org.apache.sysml.api.DMLScript -f MyDML.dml -gpu -exec singlenode ... 
+``` http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 9982d41..1564c99 100644 --- a/pom.xml +++ b/pom.xml @@ -73,6 +73,24 @@ <!-- OS-specific JVM arguments for running integration tests --> <integrationTestExtraJVMArgs /> </properties> + + <repositories> + <repository> + <id>central</id> + <url>https://repo1.maven.org/maven2</url> + <releases> + <enabled>true</enabled> + </releases> + </repository> + <repository> + <id>mavenized-jcuda-mvn-repo</id> + <url>https://raw.github.com/niketanpansare/mavenized-jcuda/mvn-repo/</url> + <snapshots> + <enabled>true</enabled> + <updatePolicy>always</updatePolicy> + </snapshots> + </repository> + </repositories> <build> @@ -784,6 +802,51 @@ <dependencies> + + <!-- For GPU backend + Use org.mystic:mavenized-jcuda until Alan puts org.jcuda:* + --> + <dependency> + <groupId>org.mystic</groupId> + <artifactId>mavenized-jcuda</artifactId> + <version>0.7.5b</version> + <type>jar</type> + <scope>provided</scope> + <exclusions> + <exclusion> + <groupId>*</groupId> + <artifactId>*</artifactId> + </exclusion> + </exclusions> + </dependency> + <!-- Since there is no mvn repo for jcuda + <dependency> + <groupId>org.jcuda</groupId> + <artifactId>jcuda</artifactId> + <version>0.7.5b</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.jcuda</groupId> + <artifactId>jcublas</artifactId> + <version>0.7.5b</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.jcuda</groupId> + <artifactId>jcusparse</artifactId> + <version>0.7.5b</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.jcuda</groupId> + <artifactId>jcudnn</artifactId> + <version>0.7.5</version> + <scope>provided</scope> + </dependency> + --> + <!-- ************************* --> + <dependency> <groupId>org.apache.spark</groupId> 
<artifactId>spark-core_2.10</artifactId> http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/api/DMLScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java index ec71af3..04b4548 100644 --- a/src/main/java/org/apache/sysml/api/DMLScript.java +++ b/src/main/java/org/apache/sysml/api/DMLScript.java @@ -69,6 +69,7 @@ import org.apache.sysml.runtime.controlprogram.caching.CacheStatistics; import org.apache.sysml.runtime.controlprogram.caching.CacheableData; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory; +import org.apache.sysml.runtime.controlprogram.context.GPUContext; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter; import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; @@ -103,6 +104,9 @@ public class DMLScript public static String DML_FILE_PATH_ANTLR_PARSER = null; public static ExplainType EXPLAIN = ExplainType.NONE; //default explain + public static boolean USE_ACCELERATOR = false; + public static boolean FORCE_ACCELERATOR = false; + // flag that indicates whether or not to suppress any prints to stdout public static boolean _suppressPrint2Stdout = false; @@ -121,6 +125,8 @@ public class DMLScript //+ " -s: <filename> will be interpreted as a DML script string \n" + " -python: (optional) parses Python-like DML\n" + " -debug: (optional) run in debug mode\n" + + " -accelerator: <flags> (optional) use acceleration whenever possible. Current version only supports CUDA.\n" + + " Optional <flags> that is supported for this mode is force=(true|false)\n" // Later add optional flags to indicate optimizations turned on or off. 
Currently they are turned off. //+ " -debug: <flags> (optional) run in debug mode\n" //+ " Optional <flags> that is supported for this mode is optimize=(on|off)\n" @@ -302,6 +308,24 @@ public class DMLScript else if( args[i].equalsIgnoreCase("-debug") ) { ENABLE_DEBUG_MODE = true; } + else if( args[i].equalsIgnoreCase("-gpu") ) { + USE_ACCELERATOR = true; + if( args.length > (i+1) && !args[i+1].startsWith("-") ) { + String flag = args[++i]; + if(flag.startsWith("force=")) { + String [] flagOptions = flag.split("="); + if(flagOptions.length == 2) + FORCE_ACCELERATOR = Boolean.parseBoolean(flagOptions[1]); + else + throw new DMLRuntimeException("Unsupported \"force\" option for -gpu:" + flag); + } + else { + throw new DMLRuntimeException("Unsupported flag for -gpu:" + flag); + } + } + + GPUContext.createGPUContext(); // Set GPU memory budget + } else if( args[i].equalsIgnoreCase("-python") ) { parsePyDML = true; } @@ -673,6 +697,9 @@ public class DMLScript } finally //ensure cleanup/shutdown { + if(DMLScript.USE_ACCELERATOR && ec != null) { + ec.destroyGPUContext(); + } if(ec != null && ec instanceof SparkExecutionContext) { ((SparkExecutionContext) ec).close(); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/hops/AggBinaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java index 8a8cffc..0532d01 100644 --- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java +++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java @@ -640,16 +640,46 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop throws HopsException, LopsException { Lop matmultCP = null; - if( isLeftTransposeRewriteApplicable(true, false) ) { - matmultCP = constructCPLopsMMWithLeftTransposeRewrite(); + + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < 
OptimizerUtils.GPU_MEMORY_BUDGET)) { + Hop h1 = getInput().get(0); + Hop h2 = getInput().get(1); + Lop left; Lop right; + boolean isLeftTransposed; boolean isRightTransposed; + if( h1 instanceof ReorgOp && ((ReorgOp)h1).getOp()==ReOrgOp.TRANSPOSE ) { + isLeftTransposed = true; + left = h1.getInput().get(0).constructLops(); + } + else { + isLeftTransposed = false; + left = h1.constructLops(); + } + if( h2 instanceof ReorgOp && ((ReorgOp)h2).getOp()==ReOrgOp.TRANSPOSE ) { + isRightTransposed = true; + right = h2.getInput().get(0).constructLops(); + } + else { + isRightTransposed = false; + right = h2.constructLops(); + } + + matmultCP = new Binary(left, right, + Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.GPU, isLeftTransposed, isRightTransposed); + setOutputDimensions(matmultCP); + setNnz(-1); } - else { - int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); - matmultCP = new Binary(getInput().get(0).constructLops(),getInput().get(1).constructLops(), - Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.CP, k); + else { + if( isLeftTransposeRewriteApplicable(true, false) ) { + matmultCP = constructCPLopsMMWithLeftTransposeRewrite(); + } + else { + int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); + matmultCP = new Binary(getInput().get(0).constructLops(),getInput().get(1).constructLops(), + Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.CP, k); + } + setOutputDimensions(matmultCP); } - setOutputDimensions(matmultCP); setLineNumbers( matmultCP ); setLops(matmultCP); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/hops/ConvolutionOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java index e9023d4..7f53e2e 100644 --- 
a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java +++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java @@ -33,6 +33,7 @@ import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.LibMatrixDNN.ConvolutionParameters; +import org.apache.sysml.runtime.util.ConvolutionUtils; public class ConvolutionOp extends Hop implements MultiThreadedHop { @@ -90,6 +91,16 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop return getLops(); ExecType et = optFindExecType(); + + Lop ret = ConvolutionUtils.constructConvolutionLops(this, et); + if(ret != null) { + return ret; + } + ret = ConvolutionUtils.constructConvolutionBackwardDataLops(this, et); + if(ret != null) { + return ret; + } + ArrayList<Hop> inputs = getInput(); switch( op ) { @@ -99,11 +110,26 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop case COL2IM: case MAX_POOLING: case MAX_POOLING_BACKWARD: + { + et = ExecType.CP; // TODO: Since max_backwards and other Convolution Ops only implemented for CP + + if( et == ExecType.CP ) + { + setLops(constructConvolutionLops(et, inputs)); + break; + } + else { + // TODO: Add support for SPARK/MR backends once we are happy with the performance of + // single node Lenet script. 
+ throw new HopsException("Unimplemented ConvolutionOp for execution type: " + et.name()); + } + // break; + } case DIRECT_CONV2D: case DIRECT_CONV2D_BACKWARD_DATA: case DIRECT_CONV2D_BACKWARD_FILTER: { - if( et == ExecType.CP ) + if( et == ExecType.GPU ) { setLops(constructConvolutionLops(et, inputs)); break; @@ -148,15 +174,17 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop HopsConv2Lops.get(op), getDataType(), getValueType(), et, k); setOutputDimensions(transform1); setLineNumbers(transform1); + in.addOutput(transform1); // stride1, stride2, padding1, padding2 // input_shape1, input_shape2, input_shape3, input_shape4, // filter_shape1, filter_shape2, filter_shape3, filter_shape4 - for( int i=1; i <= (expectedNumInputs-1); i++ ) + for( int i=1; i < inputs.size(); i++ ) { Lop ltmp = inputs.get(i).constructLops(); transform1.addInput(ltmp); - ltmp.addOutput(transform1); + //if(i == 1 && expectedNumInputs == 14) + ltmp.addOutput(transform1); } transform1.setLevel(); //force order of added lops return transform1; @@ -265,11 +293,6 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop if ( OptimizerUtils.isMemoryBasedOptLevel() ) { _etype = findExecTypeByMemEstimate(); } - // Choose CP, if the input dimensions are below threshold or if the input is a vector - else if ( getInput().get(0).areDimsBelowThreshold() || getInput().get(0).isVector() ) - { - _etype = ExecType.CP; - } else { _etype = REMOTE; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/hops/OptimizerUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java index aa9f7d0..e6c8b88 100644 --- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java +++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java @@ -204,6 +204,8 @@ public class OptimizerUtils public static 
boolean ALLOW_FRAME_CSV_REBLOCK = false; + public static long GPU_MEMORY_BUDGET = -1; + ////////////////////// // Optimizer levels // ////////////////////// http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/hops/ReorgOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ReorgOp.java b/src/main/java/org/apache/sysml/hops/ReorgOp.java index d81e777..55c0f3f 100644 --- a/src/main/java/org/apache/sysml/hops/ReorgOp.java +++ b/src/main/java/org/apache/sysml/hops/ReorgOp.java @@ -34,6 +34,7 @@ import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; +import org.apache.sysml.runtime.util.ConvolutionUtils; /** * Reorg (cell) operation: aij @@ -118,6 +119,11 @@ public class ReorgOp extends Hop implements MultiThreadedHop if( getLops() != null ) return getLops(); + Lop ret = ConvolutionUtils.constructConvolutionBackwardFilterLops(this); + if(ret != null) { + return ret; + } + ExecType et = optFindExecType(); switch( op ) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/lops/Binary.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/lops/Binary.java b/src/main/java/org/apache/sysml/lops/Binary.java index d3389bc..bae7dee 100644 --- a/src/main/java/org/apache/sysml/lops/Binary.java +++ b/src/main/java/org/apache/sysml/lops/Binary.java @@ -42,6 +42,7 @@ public class Binary extends Lop private OperationTypes operation; private int numThreads = -1; + boolean isLeftTransposed; boolean isRightTransposed; // Used for GPU matmult operation /** * Constructor to perform a binary operation. 
@@ -58,6 +59,14 @@ public class Binary extends Lop numThreads = k; } + public Binary(Lop input1, Lop input2, OperationTypes op, DataType dt, ValueType vt, ExecType et, + boolean isLeftTransposed, boolean isRightTransposed) { + super(Lop.Type.Binary, dt, vt); + init(input1, input2, op, dt, vt, et); + this.isLeftTransposed = isLeftTransposed; + this.isRightTransposed = isRightTransposed; + } + private void init(Lop input1, Lop input2, OperationTypes op, DataType dt, ValueType vt, ExecType et) { operation = op; @@ -76,7 +85,7 @@ public class Binary extends Lop lps.addCompatibility(JobType.REBLOCK); this.lps.setProperties( inputs, et, ExecLocation.Reduce, breaksAlignment, aligner, definesMRJob ); } - else if ( et == ExecType.CP || et == ExecType.SPARK ){ + else if ( et == ExecType.CP || et == ExecType.SPARK || et == ExecType.GPU ){ lps.addCompatibility(JobType.INVALID); this.lps.setProperties( inputs, et, ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob ); } @@ -183,7 +192,13 @@ public class Binary extends Lop if( operation == OperationTypes.MATMULT && getExecType()==ExecType.CP ) { sb.append( OPERAND_DELIMITOR ); sb.append( numThreads ); - } + } + else if( operation == OperationTypes.MATMULT && getExecType()==ExecType.GPU ) { + sb.append( OPERAND_DELIMITOR ); + sb.append( isLeftTransposed ); + sb.append( OPERAND_DELIMITOR ); + sb.append( isRightTransposed ); + } return sb.toString(); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/lops/Lop.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/lops/Lop.java b/src/main/java/org/apache/sysml/lops/Lop.java index 8424e28..1ffdd10 100644 --- a/src/main/java/org/apache/sysml/lops/Lop.java +++ b/src/main/java/org/apache/sysml/lops/Lop.java @@ -668,7 +668,7 @@ public abstract class Lop boolean isLiteral = (isData && ((Data)this).isLiteral()); StringBuilder sb = new StringBuilder(""); - 
if ( et == ExecType.CP || et == ExecType.SPARK || (isData && isLiteral)) { + if ( et == ExecType.CP || et == ExecType.SPARK || et == ExecType.GPU || (isData && isLiteral)) { sb.append(label); } else { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/lops/LopProperties.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/lops/LopProperties.java b/src/main/java/org/apache/sysml/lops/LopProperties.java index 8d9e916..48e8f57 100644 --- a/src/main/java/org/apache/sysml/lops/LopProperties.java +++ b/src/main/java/org/apache/sysml/lops/LopProperties.java @@ -27,7 +27,7 @@ import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence; public class LopProperties { - public enum ExecType { CP, CP_FILE, MR, SPARK, INVALID }; + public enum ExecType { CP, CP_FILE, MR, SPARK, GPU, INVALID }; public enum ExecLocation {INVALID, RecordReader, Map, MapOrReduce, MapAndReduce, Reduce, Data, ControlProgram }; // static variable to assign an unique ID to every lop that is created http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java index 40be5b1..7d3103c 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java @@ -37,6 +37,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject.UpdateType; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.Instruction; +import 
org.apache.sysml.runtime.instructions.Instruction.INSTRUCTION_TYPE; import org.apache.sysml.runtime.instructions.cp.BooleanObject; import org.apache.sysml.runtime.instructions.cp.ComputationCPInstruction; import org.apache.sysml.runtime.instructions.cp.Data; @@ -308,7 +309,11 @@ public class ProgramBlock // maintain aggregate statistics if( DMLScript.STATISTICS) { - Statistics.maintainCPHeavyHitters( + if(tmp.getType() == INSTRUCTION_TYPE.GPU) + Statistics.maintainCPHeavyHitters( + "gpu_"+tmp.getExtendedOpcode(), System.nanoTime()-t0); + else + Statistics.maintainCPHeavyHitters( tmp.getExtendedOpcode(), System.nanoTime()-t0); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java index 51c4de5..ce412e1 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java @@ -33,6 +33,7 @@ import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy; +import org.apache.sysml.runtime.controlprogram.context.GPUObject; import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence; import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.spark.data.BroadcastObject; @@ -43,6 +44,7 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData; import org.apache.sysml.runtime.matrix.MetaData; import org.apache.sysml.runtime.matrix.data.FileFormatProperties; import 
org.apache.sysml.runtime.matrix.data.InputInfo; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.OutputInfo; import org.apache.sysml.runtime.util.LocalFileUtils; import org.apache.sysml.runtime.util.MapReduceTool; @@ -178,6 +180,19 @@ public abstract class CacheableData<T extends CacheBlock> extends Data //for lazily evaluated RDDs, and (2) as abstraction for environments that do not necessarily have spark libraries available private RDDObject _rddHandle = null; //RDD handle private BroadcastObject _bcHandle = null; //Broadcast handle + public GPUObject _gpuHandle = null; + + public GPUObject getGPUObject() { + return _gpuHandle; + } + public MatrixBlock getMatrixBlock() { + if(_data == null) + getCache(); + if(_data != null && _data instanceof MatrixBlock) + return (MatrixBlock) _data; + else + return null; + } /** * Basic constructor for any cacheable data. @@ -203,6 +218,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data _hdfsFileName = that._hdfsFileName; _hdfsFileExists = that._hdfsFileExists; _varName = that._varName; + _gpuHandle = that._gpuHandle; } @@ -412,6 +428,9 @@ public abstract class CacheableData<T extends CacheBlock> extends Data if( _data == null ) getCache(); + if( _gpuHandle != null ) + _gpuHandle.acquireHostRead(); + //read data from HDFS/RDD if required //(probe data for cache_nowrite / jvm_reuse) if( isEmpty(true) && _data==null ) @@ -497,6 +516,10 @@ public abstract class CacheableData<T extends CacheBlock> extends Data if( _data == null ) getCache(); +// // Donot need to sync GPU data as it can cause redundant copy when GPU instruction does release +// if( _gpuHandle != null ) +// _gpuHandle.acquireHostModify(); + //read data from HDFS if required if( isEmpty(true) && _data == null ) { @@ -675,6 +698,9 @@ public abstract class CacheableData<T extends CacheBlock> extends Data clearReusableData(); _data = null; clearCache(); + if(_gpuHandle != null) { + 
_gpuHandle.clearData(); + } // clear rdd/broadcast back refs if( _rddHandle != null ) @@ -735,6 +761,8 @@ public abstract class CacheableData<T extends CacheBlock> extends Data exportData(fName, outputFormat, -1, formatProperties); } + protected void exportGPUData() throws CacheException { } + /** * Synchronized because there might be parallel threads (parfor local) that * access the same object (in case it was created before the loop). @@ -762,6 +790,8 @@ public abstract class CacheableData<T extends CacheBlock> extends Data throw new CacheException ("MatrixObject not available to read."); LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat); + + exportGPUData(); boolean pWrite = false; // !fName.equals(_hdfsFileName); //persistent write flag if ( fName.equals(_hdfsFileName) ) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java index 90d1f3f..d781730 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java @@ -32,6 +32,7 @@ import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat; +import org.apache.sysml.runtime.controlprogram.context.GPUContext; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.spark.data.RDDObject; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; @@ -87,6 +88,8 @@ public 
class MatrixObject extends CacheableData<MatrixBlock> */ public MatrixObject (ValueType vt, String file) { this (vt, file, null); //HDFS file path + if(DMLScript.USE_ACCELERATOR) + _gpuHandle = GPUContext.createGPUObject(this); } /** @@ -98,6 +101,8 @@ public class MatrixObject extends CacheableData<MatrixBlock> _hdfsFileName = file; _cache = null; _data = null; + if(DMLScript.USE_ACCELERATOR) + _gpuHandle = GPUContext.createGPUObject(this); } /** @@ -230,6 +235,13 @@ public class MatrixObject extends CacheableData<MatrixBlock> } } + @Override + protected void exportGPUData() throws CacheException { + if(DMLScript.USE_ACCELERATOR && getGPUObject() != null) { + getGPUObject().acquireHostRead(); + } + } + public String toString() { StringBuilder str = new StringBuilder(); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java index 6356a76..e446fed 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java @@ -47,9 +47,12 @@ import org.apache.sysml.runtime.instructions.cp.ScalarObject; import org.apache.sysml.runtime.instructions.cp.StringObject; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData; +import org.apache.sysml.runtime.matrix.MatrixFormatMetaData; import org.apache.sysml.runtime.matrix.MetaData; import org.apache.sysml.runtime.matrix.data.FrameBlock; +import org.apache.sysml.runtime.matrix.data.InputInfo; import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import 
org.apache.sysml.runtime.matrix.data.OutputInfo; import org.apache.sysml.runtime.util.MapReduceTool; import org.apache.sysml.runtime.util.UtilFunctions; @@ -66,6 +69,8 @@ public class ExecutionContext //debugging (optional) protected DebugState _dbState = null; + protected GPUContext _gpuCtx = null; + protected ExecutionContext() { //protected constructor to force use of ExecutionContextFactory @@ -76,7 +81,6 @@ public class ExecutionContext { //protected constructor to force use of ExecutionContextFactory this( true, prog ); - } protected ExecutionContext(LocalVariableMap vars) @@ -111,6 +115,11 @@ public class ExecutionContext _variables = vars; } + public void destroyGPUContext() throws DMLRuntimeException { + if(_gpuCtx != null) + _gpuCtx.destroy(); + } + /* ------------------------------------------------------- * Methods to handle variables and associated data objects @@ -229,7 +238,59 @@ public class ExecutionContext throws DMLRuntimeException { MatrixObject mo = getMatrixObject(varName); - return mo.acquireRead(); + MatrixBlock mb = mo.acquireRead(); + return mb; + } + + public void setMetaData(String varName, long nrows, long ncols) throws DMLRuntimeException { + MatrixObject mo = getMatrixObject(varName); + if(mo.getNumRows() != nrows || mo.getNumColumns() != ncols) { + MatrixCharacteristics mc = new MatrixCharacteristics((long)nrows, (long)ncols, + (int) mo.getNumRowsPerBlock(), (int)mo.getNumColumnsPerBlock()); + OutputInfo oiOld = null; + InputInfo iiOld = null; + MetaData oldMetaData = mo.getMetaData(); + if(oldMetaData != null && oldMetaData instanceof MatrixFormatMetaData) { + oiOld = ((MatrixFormatMetaData)oldMetaData).getOutputInfo(); + iiOld = ((MatrixFormatMetaData)oldMetaData).getInputInfo(); + } + else { + throw new DMLRuntimeException("Metadata not available"); + } + mo.setMetaData(new MatrixFormatMetaData(mc, oiOld, iiOld)); + } + } + + public MatrixObject getMatrixOutputForGPUInstruction(String varName, boolean isSparse) + throws 
DMLRuntimeException { + if(isSparse) { + throw new DMLRuntimeException("Sparse matrix block is not supported for GPU instruction"); + } + MatrixObject mo = getMatrixObject(varName); + if(mo.getMatrixBlock() == null) { + MatrixBlock mb = new MatrixBlock((int)mo.getNumRows(), (int)mo.getNumColumns(), false); + mo.acquireModify(mb); + mo.release(); + } + mo.getGPUObject().acquireDenseDeviceModify((int)(mo.getNumRows()*mo.getNumColumns())); + mo.getMatrixCharacteristics().setNonZeros(-1); + mo.getMatrixBlock().setNonZeros(-1); + return mo; + } + + public MatrixObject getMatrixInputForGPUInstruction(String varName) + throws DMLRuntimeException + { + MatrixObject mo = getMatrixObject(varName); + if(mo == null) { + throw new DMLRuntimeException("No matrix object available for variable:" + varName); + } + if(mo.getGPUObject() == null || !mo.getGPUObject().isAllocated) { + mo.acquireRead(); + mo.release(); + } + mo.getGPUObject().acquireDeviceRead(); + return mo; } /** @@ -245,6 +306,13 @@ public class ExecutionContext mo.release(); } + public void releaseMatrixInputForGPUInstruction(String varName) + throws DMLRuntimeException + { + MatrixObject mo = getMatrixObject(varName); + mo.getGPUObject().release(false); + } + /** * Pins a frame variable into memory and returns the internal frame block. 
* @@ -311,6 +379,16 @@ public class ExecutionContext setVariable(varName, so); } + public void releaseMatrixOutputForGPUInstruction(String varName) throws DMLRuntimeException { + MatrixObject mo = getMatrixObject(varName); + if(mo.getGPUObject() == null || !mo.getGPUObject().isAllocated) { + throw new DMLRuntimeException("No output is allocated on GPU"); + } + mo.getGPUObject().release(true); +// mo.acquireModify(); +// mo.release(); + } + /** * * @param varName @@ -318,12 +396,17 @@ public class ExecutionContext * @throws DMLRuntimeException */ public void setMatrixOutput(String varName, MatrixBlock outputData) - throws DMLRuntimeException + throws DMLRuntimeException { MatrixObject mo = getMatrixObject(varName); + if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated) { + throw new DMLRuntimeException("GPU instructions should not set matrix output. " + + "Instead should use releaseMatrixOutput. If called by non-GPU instruction, " + + "then inconsistent bufferpool logic. Possible skipped deleting GPU object when acquire modify."); + } + mo.acquireModify(outputData); mo.release(); - setVariable(varName, mo); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java new file mode 100644 index 0000000..9c513aa --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
package org.apache.sysml.runtime.controlprogram.context;

import java.util.ArrayList;

import org.apache.sysml.api.DMLScript;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;

/**
 * Abstract handle for the (singleton) device context. Owns the global list of
 * device-resident {@link GPUObject}s and the lazy creation / destruction of the
 * backend context.
 */
public abstract class GPUContext {

	/** All objects currently allocated on the device; guarded by GPUObject.evictionLock. */
	public static ArrayList<GPUObject> allocatedPointers = new ArrayList<GPUObject>();
	/** The singleton context; null until background creation completes. */
	protected static GPUContext currContext;
	protected GPUContext() { }

	// NOTE(review): this field doubles as a monitor object but is reassigned
	// (isGPUContextCreated = true) while other threads may still synchronize on
	// the previous boxed Boolean, so the mutual exclusion it provides is
	// unreliable. TODO: replace with a dedicated 'private static final Object'
	// lock plus a plain volatile boolean flag.
	public static volatile Boolean isGPUContextCreated = false;

	/**
	 * @return usable device memory in bytes (after the utilization factor is applied)
	 */
	public abstract long getAvailableMemory();

	/**
	 * Lazily creates the singleton context on a background thread. The context
	 * may still be under construction when this method returns, i.e. the return
	 * value can be null even when the accelerator is enabled; callers must be
	 * prepared for that.
	 *
	 * @return the current context, possibly null while creation is in flight
	 */
	public static GPUContext createGPUContext() {
		if(currContext == null && DMLScript.USE_ACCELERATOR) {
			// TODO: Handle this thread and resolve concurrency related bugs if any
			new Thread(new Runnable() {
				@Override
				public void run() {
					// Lazy GPU context creation
					synchronized(isGPUContextCreated) {
						currContext = new JCudaContext();
						OptimizerUtils.GPU_MEMORY_BUDGET = ((JCudaContext)currContext).getAvailableMemory();
						isGPUContextCreated = true;
					}
				}
			}).start();
		}
		return currContext;
	}

	/**
	 * Creates the backend-specific device handle for the given matrix.
	 *
	 * @param mo host-side matrix the handle shadows
	 * @return a new device handle
	 * @throws RuntimeException if the accelerator is disabled, the context has
	 *         not been created yet, or the context type is unsupported
	 */
	public static GPUObject createGPUObject(MatrixObject mo) {
		// BUGFIX: the original fell through to a misleading
		// "Cannot create createGPUObject when USE_ACCELERATOR is off" message
		// even when the accelerator was on but the context type was unsupported.
		if(!DMLScript.USE_ACCELERATOR)
			throw new RuntimeException("Cannot create GPUObject when USE_ACCELERATOR is off");
		synchronized(isGPUContextCreated) {
			if(currContext == null)
				throw new RuntimeException("GPUContext is not created");
			if(currContext instanceof JCudaContext)
				return new JCudaObject(mo);
		}
		throw new RuntimeException("Unsupported GPUContext type: " + currContext.getClass().getName());
	}

	public abstract void destroy() throws DMLRuntimeException;
}
package org.apache.sysml.runtime.controlprogram.context;

import java.util.Collections;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.CacheException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;

/**
 * Backend-agnostic handle for the device-side copy of a matrix. Tracks lock
 * counts, the dense/sparse format flag and the host/device dirty state;
 * concrete subclasses (e.g. JCudaObject) implement the allocation and
 * transfer primitives.
 */
public abstract class GPUObject {

	/** true iff the device copy has updates not yet reflected on the host. */
	public boolean isDeviceCopyModified = false;
	/** Outstanding device locks; an object with locks > 0 must never be evicted. */
	AtomicInteger numLocks = new AtomicInteger(0);

	public boolean isInSparseFormat = false;
	public boolean isAllocated = false;

	/** Host-side matrix this handle shadows. */
	MatrixObject mat = null;

	// BUGFIX: was 'static Boolean evictionLock = new Boolean(true)' — a
	// non-final lock built with the deprecated boxed-Boolean constructor.
	// A dedicated final Object is the safe monitor for guarding
	// GPUContext.allocatedPointers across all GPUObjects.
	static final Object evictionLock = new Object();

	protected GPUObject(MatrixObject mat2) {
		this.mat = mat2;
	}

	public abstract void acquireDeviceRead() throws DMLRuntimeException;
	public abstract void acquireDenseDeviceModify(int numElemsToAllocate) throws DMLRuntimeException;
	public abstract void acquireHostRead() throws CacheException;
	public abstract void acquireHostModify() throws CacheException;
	public abstract void release(boolean isGPUCopyModified) throws CacheException;

	// package-level visibility as these methods are guarded by underlying GPUContext
	abstract void allocateMemoryOnDevice(int numElemToAllocate) throws DMLRuntimeException;
	abstract void deallocateMemoryOnDevice() throws DMLRuntimeException;
	abstract long getSizeOnDevice() throws DMLRuntimeException;
	abstract void copyFromHostToDevice() throws DMLRuntimeException;
	abstract void copyFromDeviceToHost() throws DMLRuntimeException; // Called by export()

	/**
	 * Frees device memory until at least GPUSize bytes are available, evicting
	 * unlocked objects largest-first (TODO: pick the smallest sufficient object
	 * instead). Victims with a modified device copy are written back to the
	 * host before eviction.
	 *
	 * @param GPUSize number of bytes required
	 * @throws DMLRuntimeException if no evictable object can satisfy the request
	 */
	protected void evict(long GPUSize) throws DMLRuntimeException {
		if(GPUContext.allocatedPointers.size() == 0) {
			throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
		}

		synchronized(evictionLock) {
			// Sort so that unlocked objects come last, with the largest unlocked
			// object at the very end — that is the next eviction candidate.
			Collections.sort(GPUContext.allocatedPointers, new Comparator<GPUObject>() {
				@Override
				public int compare(GPUObject p1, GPUObject p2) {
					int p1Val = p1.numLocks.get();
					int p2Val = p2.numLocks.get();

					if(p1Val < 0 || p2Val < 0) {
						throw new RuntimeException("Number of locks cannot be negative");
					}
					else if(p1Val == 0 && p2Val == 0) {
						// Both p1 and p2 are unlocked: order by size, largest last
						// TODO: Modify this !!
						long p1Size = 0; long p2Size = 0;
						try {
							p1Size = p1.getSizeOnDevice();
							p2Size = p2.getSizeOnDevice();
						} catch (DMLRuntimeException e) {
							// Comparator cannot throw a checked exception; wrap it.
							throw new RuntimeException(e);
						}
						if(p1Size == p2Size) {
							return 0;
						}
						else if(p1Size < p2Size) {
							return 1;
						}
						else {
							return -1;
						}
					}
					else if(p1Val > p2Val) {
						// There are more locks on p1
						return 1;
					}
					else {
						// There are more locks on p2
						return -1;
					}
				}
			});

			while(GPUSize > getAvailableMemory() && GPUContext.allocatedPointers.size() > 0) {
				GPUObject toBeRemoved = GPUContext.allocatedPointers.get(GPUContext.allocatedPointers.size() - 1);
				if(toBeRemoved.numLocks.get() != 0) {
					// Largest remaining candidate is locked — nothing further is evictable.
					throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
				}
				if(toBeRemoved.isDeviceCopyModified) {
					// Preserve pending device-side updates before freeing.
					toBeRemoved.copyFromDeviceToHost();
				}
				toBeRemoved.clearData();
			}
		}
	}

	/**
	 * Removes this object from the global allocation list and frees its device
	 * memory.
	 */
	public void clearData() throws CacheException {
		synchronized(evictionLock) {
			GPUContext.allocatedPointers.remove(this);
		}
		try {
			deallocateMemoryOnDevice();
		} catch (DMLRuntimeException e) {
			throw new CacheException(e);
		}
	}

	protected long getAvailableMemory() {
		return GPUContext.currContext.getAvailableMemory();
	}
}
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.controlprogram.context; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.utils.Statistics; + +import jcuda.driver.JCudaDriver; +import jcuda.jcublas.JCublas2; +import jcuda.jcublas.cublasHandle; +import jcuda.jcudnn.JCudnn; +import jcuda.runtime.JCuda; +import jcuda.jcudnn.cudnnHandle; +import static jcuda.jcudnn.JCudnn.cudnnCreate; +import static jcuda.jcublas.JCublas2.cublasCreate; +import static jcuda.jcublas.JCublas2.cublasDestroy; +import static jcuda.jcudnn.JCudnn.cudnnDestroy; +import static jcuda.driver.JCudaDriver.cuInit; +import static jcuda.driver.JCudaDriver.cuDeviceGetCount; +import static jcuda.runtime.JCuda.cudaMemGetInfo; +import static jcuda.runtime.cudaError.cudaSuccess; + +/** + * Setup: + * 1. Install CUDA 7.5 + * 2. Install CuDNN v4 from http://developer.download.nvidia.com/compute/redist/cudnn/v4/cudnn-7.0-win-x64-v4.0-prod.zip + * 3. Download JCuda binaries version 0.7.5b and JCudnn version 0.7.5. Copy the DLLs into C:\lib (or /lib) directory. Link: http://www.jcuda.org/downloads/downloads.html + * + */ +public class JCudaContext extends GPUContext { + + private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName()); + + public static boolean DEBUG = false; + + public static long totalNumBytes = 0; + public static long availableNumBytesWithoutUtilFactor = 0; + // Fraction of available memory to use. The available memory is computer when the JCudaContext is created + // to handle the tradeoff on calling cudaMemGetInfo too often. 
+ public static double GPU_MEMORY_UTILIZATION_FACTOR = 0.9; + public static boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = true; + + static { + long start = System.nanoTime(); + JCuda.setExceptionsEnabled(true); + JCudnn.setExceptionsEnabled(true); + JCublas2.setExceptionsEnabled(true); + JCudaDriver.setExceptionsEnabled(true); + cuInit(0); // Initialize the driver + // Obtain the number of devices + int deviceCountArray[] = { 0 }; + cuDeviceGetCount(deviceCountArray); + int deviceCount = deviceCountArray[0]; + LOG.info("Total number of GPUs on the machine: " + deviceCount); + Statistics.cudaInitTime = System.nanoTime() - start; + } + + public long getAvailableMemory() { + if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) { + long free [] = { 0 }; + long total [] = { 0 }; + if(cudaMemGetInfo(free, total) == cudaSuccess) { + totalNumBytes = total[0]; + availableNumBytesWithoutUtilFactor = free[0]; + } + else { + throw new RuntimeException("ERROR: Unable to get memory information of the GPU."); + } + } + return (long) (availableNumBytesWithoutUtilFactor*GPU_MEMORY_UTILIZATION_FACTOR); + } + + + public JCudaContext() { + if(isGPUContextCreated) { + // Wait until it is deleted. This case happens during multi-threaded testing. + // This also allows for multi-threaded execute calls + long startTime = System.currentTimeMillis(); + do { + try { + Thread.sleep(100); + } catch (InterruptedException e) {} + } while(isGPUContextCreated && (System.currentTimeMillis() - startTime) < 60000); + synchronized(isGPUContextCreated) { + if(GPUContext.currContext != null) { + throw new RuntimeException("Cannot create multiple JCudaContext. 
Waited for 10 min to close previous GPUContext"); + } + } + } + GPUContext.currContext = this; + + long start = System.nanoTime(); + LibMatrixCUDA.cudnnHandle = new cudnnHandle(); + cudnnCreate(LibMatrixCUDA.cudnnHandle); + LibMatrixCUDA.cublasHandle = new cublasHandle(); + cublasCreate(LibMatrixCUDA.cublasHandle); + Statistics.cudaLibrariesInitTime = System.nanoTime() - start; + + long free [] = { 0 }; + long total [] = { 0 }; + if(cudaMemGetInfo(free, total) == cudaSuccess) { + totalNumBytes = total[0]; + availableNumBytesWithoutUtilFactor = free[0]; + } + else { + throw new RuntimeException("ERROR: Unable to get memory information of the GPU."); + } + LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB"); + LOG.info("Available GPU memory: " + (availableNumBytesWithoutUtilFactor*(1e-6)) + " MB"); + } + + @Override + public void destroy() throws DMLRuntimeException { + if(currContext != null) { + synchronized(isGPUContextCreated) { + cudnnDestroy(LibMatrixCUDA.cudnnHandle); + cublasDestroy(LibMatrixCUDA.cublasHandle); + currContext = null; + isGPUContextCreated = false; + } + } + else if(LibMatrixCUDA.cudnnHandle != null || LibMatrixCUDA.cublasHandle != null) { + throw new DMLRuntimeException("Error while destroying the GPUContext"); + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java new file mode 100644 index 0000000..10d6408 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.controlprogram.context; + +import static jcuda.runtime.JCuda.cudaFree; +import static jcuda.runtime.JCuda.cudaMalloc; +import static jcuda.runtime.JCuda.cudaMemcpy; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; +import jcuda.Pointer; +import jcuda.Sizeof; + +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.CacheException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.utils.Statistics; + +public class JCudaObject extends GPUObject { + + public Pointer jcudaPointer = null; +// public long numElems = -1; + + JCudaObject(MatrixObject mat2) { + super(mat2); + } + + private void prepare(boolean isInput, int numElemsToAllocate) throws DMLRuntimeException { + if(jcudaPointer != null) { + // Already allocated on GPU and expected to be in sync + // checkDimensions(); + } + else { + long GPUSize; + if(numElemsToAllocate != -1) + GPUSize = (Sizeof.DOUBLE) * (long) (numElemsToAllocate); + else if(isInput && mat != null && 
mat.getMatrixBlock() != null && mat.getMatrixBlock().getDenseBlock() != null) { + GPUSize = (Sizeof.DOUBLE) * (long) mat.getMatrixBlock().getDenseBlock().length; + numElemsToAllocate = mat.getMatrixBlock().getDenseBlock().length; + } + else + GPUSize = getSizeOnDevice(); + // Ensure enough memory while allocating the matrix + if(GPUSize > getAvailableMemory()) { + evict(GPUSize); + } + allocateMemoryOnDevice(numElemsToAllocate); + synchronized(evictionLock) { + GPUContext.allocatedPointers.add(this); + } + if(isInput) + copyFromHostToDevice(); + } + numLocks.addAndGet(1); + } + + @Override + public void acquireDeviceRead() throws DMLRuntimeException { + prepare(true, -1); + if(!isAllocated) + throw new DMLRuntimeException("Expected device data to be allocated"); + } + +// private void checkDimensions() throws DMLRuntimeException { +// if(LibMatrixCUDA.isInSparseFormat(mat)) +// throw new DMLRuntimeException("Sparse format not implemented"); +// else { +// if(mat.getNumRows()*mat.getNumColumns() != numElems) { +// throw new DMLRuntimeException("The jcudaPointer and MatrixBlock is not in synched"); +// } +// } +// } + + @Override + public void acquireDenseDeviceModify(int numElemsToAllocate) throws DMLRuntimeException { + prepare(false, numElemsToAllocate); + isDeviceCopyModified = true; + if(!isAllocated) + throw new DMLRuntimeException("Expected device data to be allocated"); + } + + @Override + public void acquireHostRead() throws CacheException { + if(isAllocated) { + try { + if(isDeviceCopyModified) { + copyFromDeviceToHost(); + } + } catch (DMLRuntimeException e) { + throw new CacheException(e); + } + } + } + + @Override + public void acquireHostModify() throws CacheException { + if(isAllocated) { + try { + if(isDeviceCopyModified) { + throw new DMLRuntimeException("Potential overwrite of GPU data"); + // copyFromDeviceToHost(); + } + clearData(); + } catch (DMLRuntimeException e) { + throw new CacheException(e); + } + } + } + + public void release(boolean 
isGPUCopyModified) throws CacheException { + if(numLocks.addAndGet(-1) < 0) { + throw new CacheException("Redundant release of GPU object"); + } + isDeviceCopyModified = isGPUCopyModified; + } + + @Override + void allocateMemoryOnDevice(int numElemToAllocate) throws DMLRuntimeException { + if(jcudaPointer == null) { + long start = System.nanoTime(); + jcudaPointer = new Pointer(); + if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) + throw new DMLRuntimeException("Sparse format not implemented"); + else if(numElemToAllocate == -1) { + // Called for dense input + cudaMalloc(jcudaPointer, mat.getNumRows()*mat.getNumColumns()*Sizeof.DOUBLE); + } + else { + // Called for dense output + cudaMalloc(jcudaPointer, numElemToAllocate*Sizeof.DOUBLE); + } + + Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start); + Statistics.cudaAllocCount.addAndGet(1); + } + isAllocated = true; + } + + @Override + void deallocateMemoryOnDevice() { + if(jcudaPointer != null) { + long start = System.nanoTime(); + cudaFree(jcudaPointer); + Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start); + Statistics.cudaDeAllocCount.addAndGet(1); + } + jcudaPointer = null; + isAllocated = false; + numLocks.set(0); + } + + @Override + void copyFromHostToDevice() throws DMLRuntimeException { + if(jcudaPointer != null) { + printCaller(); + long start = System.nanoTime(); + if(LibMatrixCUDA.isInSparseFormat(mat)) + throw new DMLRuntimeException("Sparse format not implemented"); + else { + double [] data = mat.getMatrixBlock().getDenseBlock(); + if(data == null && mat.getMatrixBlock().getSparseBlock() != null) { + throw new DMLRuntimeException("Incorrect sparsity calculation"); + } + else if(data == null) { + if(mat.getMatrixBlock().getNonZeros() == 0) { + data = new double[mat.getMatrixBlock().getNumRows()*mat.getMatrixBlock().getNumColumns()]; + } + else + throw new DMLRuntimeException("MatrixBlock is not allocated"); + } + cudaMemcpy(jcudaPointer, Pointer.to(data), 
mat.getNumRows()*mat.getNumColumns() * Sizeof.DOUBLE, cudaMemcpyHostToDevice); + } + Statistics.cudaToDevTime.addAndGet(System.nanoTime()-start); + Statistics.cudaToDevCount.addAndGet(1); + } + else { + throw new DMLRuntimeException("Cannot copy from host to device without allocating"); + } + } + + @Override + protected void copyFromDeviceToHost() throws DMLRuntimeException { + if(jcudaPointer != null) { + printCaller(); + if(LibMatrixCUDA.isInSparseFormat(mat)) + throw new DMLRuntimeException("Sparse format not implemented"); + else { + long start = System.nanoTime(); + MatrixBlock mb = mat.getMatrixBlock(); + if(mb == null) { + throw new DMLRuntimeException("CP Data is not allocated"); + } + if(mb.getDenseBlock() == null) { + mb.allocateDenseBlock(); + } + double [] data = mb.getDenseBlock(); + + cudaMemcpy(Pointer.to(data), jcudaPointer, data.length * Sizeof.DOUBLE, cudaMemcpyDeviceToHost); + mat.getMatrixBlock().recomputeNonZeros(); + Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-start); + Statistics.cudaFromDevCount.addAndGet(1); + } + } + else { + throw new DMLRuntimeException("Cannot copy from device to host as JCuda pointer is not allocated"); + } + isDeviceCopyModified = false; + } + + @Override + protected long getSizeOnDevice() throws DMLRuntimeException { + long GPUSize = 0; +// boolean emptyBlock = (mat.getDenseBlock() == null && mat.getSparseBlock() == null); + int rlen = (int) mat.getNumRows(); + int clen = (int) mat.getNumColumns(); +// long nonZeros = mat.getNonZeros(); + if(LibMatrixCUDA.isInSparseFormat(mat)) { + // if(LibMatrixCUDA.isInSparseFormat(mat)) { +// +// if (mat.getMatrixBlock().isInSparseFormat() ) { // && !emptyBlock) { +// GPUSize = (rlen + 1) * (long) (Integer.SIZE / Byte.SIZE) + +// nonZeros * (long) (Integer.SIZE / Byte.SIZE) + +// nonZeros * (long) (Double.SIZE / Byte.SIZE); + throw new DMLRuntimeException("Sparse format not implemented"); + } + else { + int align = 0; +// if (clen > 5120) +// if (clen % 256 == 0) +// 
align = 0; +// else +// align = 256; // in the dense case we use this for alignment +// else +// if (clen % 128 == 0) +// align = 0; +// else +// align = 128; // in the dense case we use this for alignment + GPUSize = (Sizeof.DOUBLE) * (long) (rlen * clen + align); + + } + return GPUSize; + } + + private String getClassAndMethod(StackTraceElement st) { + String [] str = st.getClassName().split("\\."); + return str[str.length - 1] + "." + st.getMethodName(); + } + private void printCaller() { + if(JCudaContext.DEBUG) { + StackTraceElement[] st = Thread.currentThread().getStackTrace(); + String ret = getClassAndMethod(st[1]); + for(int i = 2; i < st.length && i < 7; i++) { + ret += "->" + getClassAndMethod(st[i]); + } + System.out.println("CALL_STACK:" + ret); + } + + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java new file mode 100644 index 0000000..a29baf4 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
package org.apache.sysml.runtime.instructions;

import java.util.HashMap;

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.instructions.Instruction.INSTRUCTION_TYPE;
import org.apache.sysml.runtime.instructions.cp.CPInstruction;
import org.apache.sysml.runtime.instructions.cp.CPInstruction.CPINSTRUCTION_TYPE;
import org.apache.sysml.runtime.instructions.gpu.AggregateBinaryGPUInstruction;
import org.apache.sysml.runtime.instructions.gpu.ConvolutionGPUInstruction;

/**
 * Parses GPU instruction strings into their runtime instruction objects.
 * Only opcodes registered in {@link #String2GPUInstructionType} are GPU-enabled.
 */
public class GPUInstructionParser extends InstructionParser
{
	/** Maps GPU-enabled opcodes to their instruction type. */
	public static final HashMap<String, CPINSTRUCTION_TYPE> String2GPUInstructionType;
	static {
		String2GPUInstructionType = new HashMap<String, CPINSTRUCTION_TYPE>();
		String2GPUInstructionType.put( "conv2d"                , CPINSTRUCTION_TYPE.Convolution);
		String2GPUInstructionType.put( "conv2d_backward_filter", CPINSTRUCTION_TYPE.Convolution);
		String2GPUInstructionType.put( "conv2d_backward_data"  , CPINSTRUCTION_TYPE.Convolution);
		String2GPUInstructionType.put( "ba+*"                  , CPINSTRUCTION_TYPE.AggregateBinary);
	}

	/**
	 * Parses a GPU instruction string, deriving its type from the GPU opcode table.
	 *
	 * @param str instruction string, may be null/empty
	 * @return the parsed instruction, or null for a null/empty input
	 * @throws DMLRuntimeException if the type cannot be derived or parsing fails
	 */
	public static CPInstruction parseSingleInstruction (String str )
		throws DMLRuntimeException
	{
		if ( str == null || str.isEmpty() )
			return null;

		// BUGFIX: consult the GPU opcode table — the original called
		// InstructionUtils.getCPType(str) and hence resolved against the CP table.
		CPINSTRUCTION_TYPE cptype = InstructionUtils.getGPUType(str);
		if ( cptype == null )
			throw new DMLRuntimeException("Unable to derive GPU instruction type for instruction: " + str);
		CPInstruction cpinst = parseSingleInstruction(cptype, str);
		if ( cpinst == null )
			throw new DMLRuntimeException("Unable to parse instruction: " + str);
		return cpinst;
	}

	/**
	 * Parses a GPU instruction string of a known type.
	 *
	 * @param cptype resolved instruction type (must be GPU-enabled)
	 * @param str instruction string, may be null/empty
	 * @return the parsed instruction tagged as INSTRUCTION_TYPE.GPU, or null for null/empty input
	 * @throws DMLRuntimeException if the type is null or unsupported
	 */
	public static CPInstruction parseSingleInstruction ( CPINSTRUCTION_TYPE cptype, String str )
		throws DMLRuntimeException
	{
		if ( str == null || str.isEmpty() )
			return null;

		if(cptype == null) {
			throw new DMLRuntimeException("The instruction is not GPU-enabled:" + str);
		}

		CPInstruction ret;
		switch(cptype)
		{
			case AggregateBinary:
				ret = AggregateBinaryGPUInstruction.parseInstruction(str);
				break;

			case Convolution:
				ret = ConvolutionGPUInstruction.parseInstruction(str);
				break;

			default:
				throw new DMLRuntimeException("Invalid GPU Instruction Type: " + cptype );
		}

		ret.setType(INSTRUCTION_TYPE.GPU);
		return ret;
	}
}
b/src/main/java/org/apache/sysml/runtime/instructions/InstructionParser.java @@ -46,6 +46,11 @@ public class InstructionParser SPINSTRUCTION_TYPE sptype = InstructionUtils.getSPType(str); return SPInstructionParser.parseSingleInstruction (sptype, str); } + else if ( execType.equalsIgnoreCase(ExecType.GPU.toString()) ) + { + CPINSTRUCTION_TYPE cptype = InstructionUtils.getGPUType(str); + return GPUInstructionParser.parseSingleInstruction (cptype, str); + } else if ( execType.equalsIgnoreCase("MR") ) { MRINSTRUCTION_TYPE mrtype = InstructionUtils.getMRType(str); if ( mrtype == null ) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java index c8d406e..b3493c2 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/InstructionUtils.java @@ -259,6 +259,19 @@ public class InstructionUtils /** * + * @param str + * @return + * @throws DMLUnsupportedOperationException + */ + public static CPINSTRUCTION_TYPE getGPUType( String str ) + { + String opcode = getOpCode(str); + CPINSTRUCTION_TYPE cptype = GPUInstructionParser.String2GPUInstructionType.get( opcode ); + return cptype; + } + + /** + * * @param opcode * @return */ http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java index 38e92bb..6aa6205 100644 --- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java @@ -23,8 +23,8 @@ import org.apache.sysml.api.MLContextProxy; import org.apache.sysml.lops.runtime.RunMRJobs; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; -import org.apache.sysml.runtime.instructions.CPInstructionParser; import org.apache.sysml.runtime.instructions.Instruction; +import org.apache.sysml.runtime.instructions.InstructionParser; import org.apache.sysml.runtime.matrix.operators.Operator; @@ -78,7 +78,8 @@ public abstract class CPInstruction extends Instruction { //note: no exchange of updated instruction as labels might change in the general case String updInst = RunMRJobs.updateLabels(tmp.toString(), ec.getVariables()); - tmp = CPInstructionParser.parseSingleInstruction(updInst); + // tmp = CPInstructionParser.parseSingleInstruction(updInst); + tmp = InstructionParser.parseSingleInstruction(updInst); if(MLContextProxy.isActive()) { MLContextProxy.setInstructionForMonitoring(tmp); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java new file mode 100644 index 0000000..4a09e28 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.instructions.gpu; + +import org.apache.sysml.parser.Expression.DataType; +import org.apache.sysml.parser.Expression.ValueType; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.functionobjects.Multiply; +import org.apache.sysml.runtime.functionobjects.Plus; +import org.apache.sysml.runtime.functionobjects.SwapIndex; +import org.apache.sysml.runtime.instructions.InstructionUtils; +import org.apache.sysml.runtime.instructions.cp.BinaryCPInstruction; +import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator; +import org.apache.sysml.runtime.matrix.operators.AggregateOperator; +import org.apache.sysml.runtime.matrix.operators.Operator; +import org.apache.sysml.runtime.matrix.operators.ReorgOperator; +import org.apache.sysml.utils.Statistics; + +public class AggregateBinaryGPUInstruction extends BinaryCPInstruction +{ + + public AggregateBinaryGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, + String opcode, String istr, boolean isLeftTransposed, boolean 
isRightTransposed){ + super(op, in1, in2, out, opcode, istr); + _cptype = CPINSTRUCTION_TYPE.AggregateBinary; + this.isLeftTransposed = isLeftTransposed; + this.isRightTransposed = isRightTransposed; + } + + boolean isLeftTransposed; + boolean isRightTransposed; + + /** + * + * @param str + * @return + * @throws DMLRuntimeException + */ + public static AggregateBinaryGPUInstruction parseInstruction( String str ) + throws DMLRuntimeException + { + CPOperand in1 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN); + CPOperand in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN); + CPOperand out = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN); + + String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); + String opcode = parts[0]; + + if ( !opcode.equalsIgnoreCase("ba+*")) { + throw new DMLRuntimeException("AggregateBinaryInstruction.parseInstruction():: Unknown opcode " + opcode); + } + + InstructionUtils.checkNumFields( parts, 5 ); + in1.split(parts[1]); + in2.split(parts[2]); + out.split(parts[3]); + + boolean isLeftTransposed = Boolean.parseBoolean(parts[4]); + boolean isRightTransposed = Boolean.parseBoolean(parts[5]); + + AggregateOperator agg = new AggregateOperator(0, Plus.getPlusFnObject()); + AggregateBinaryOperator aggbin = new AggregateBinaryOperator(Multiply.getMultiplyFnObject(), agg, 1); + return new AggregateBinaryGPUInstruction(aggbin, in1, in2, out, opcode, str, isLeftTransposed, isRightTransposed); + } + + private MatrixBlock transpose(MatrixBlock m1) throws DMLRuntimeException { + ReorgOperator r_op = new ReorgOperator(SwapIndex.getSwapIndexFnObject(), 1); + return (MatrixBlock) (m1.reorgOperations(r_op, new MatrixBlock(), 0, 0, 0)); + } + + private boolean isSparse(ExecutionContext ec, String var) throws DMLRuntimeException { + MatrixObject mo = (MatrixObject) ec.getVariable(var); + return LibMatrixCUDA.isInSparseFormat(mo); + } + + @Override + public void processInstruction(ExecutionContext ec) + throws 
DMLRuntimeException + { + // -------------------------------------- + // This code will be removed when the JIRA SYSTEMML-702 is complete + if( isSparse(ec, input1.getName()) || isSparse(ec, input2.getName())) { + + Statistics.gpuSparseMultCount.addAndGet(1); + + //get inputs + MatrixBlock matBlock1 = ec.getMatrixInput(input1.getName()); + MatrixBlock matBlock2 = ec.getMatrixInput(input2.getName()); + + if(isLeftTransposed) + matBlock1 = transpose(matBlock1); + if(isRightTransposed) + matBlock2 = transpose(matBlock2); + + //compute matrix multiplication + AggregateBinaryOperator ab_op = (AggregateBinaryOperator) _optr; + MatrixBlock soresBlock = (MatrixBlock) (matBlock1.aggregateBinaryOperations(matBlock1, matBlock2, new MatrixBlock(), ab_op)); + + //release inputs/outputs + ec.releaseMatrixInput(input1.getName()); + ec.releaseMatrixInput(input2.getName()); + ec.setMatrixOutput(output.getName(), soresBlock); + return; + } + // -------------------------------------- + + Statistics.incrementNoOfExecutedGPUInst(); + + AggregateBinaryOperator op = (AggregateBinaryOperator) _optr; + if( !(op.binaryFn instanceof Multiply && op.aggOp.increOp.fn instanceof Plus) ) { + throw new DMLRuntimeException("Unsupported binary aggregate operation: ("+op.binaryFn+", "+op.aggOp+")."); + } + + //get inputs + MatrixObject m1 = ec.getMatrixInputForGPUInstruction(input1.getName()); + MatrixObject m2 = ec.getMatrixInputForGPUInstruction(input2.getName()); + + //compute matrix multiplication + int rlen = (int) (isLeftTransposed ? m1.getNumColumns() : m1.getNumRows()); + int clen = (int) (isRightTransposed ? 
m2.getNumRows() : m2.getNumColumns()); + + ec.setMetaData(output.getName(), rlen, clen); + MatrixObject out = ec.getMatrixOutputForGPUInstruction(output.getName(), false); + LibMatrixCUDA.matmult(m1, m2, out, isLeftTransposed, isRightTransposed); + + //release inputs/outputs + ec.releaseMatrixInputForGPUInstruction(input1.getName()); + ec.releaseMatrixInputForGPUInstruction(input2.getName()); + ec.releaseMatrixOutputForGPUInstruction(output.getName()); + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java new file mode 100644 index 0000000..8907c4d --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
 */
package org.apache.sysml.runtime.instructions.gpu;

import java.util.ArrayList;

import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
import org.apache.sysml.runtime.instructions.cp.UnaryCPInstruction;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.util.ConvolutionUtils;
import org.apache.sysml.utils.Statistics;

/**
 * GPU instruction for 2D convolution and its backward passes (opcodes
 * "conv2d", "conv2d_backward_filter", "conv2d_backward_data"), delegating
 * the computation to {@link LibMatrixCUDA}.
 *
 * Matrix layouts (enforced by the dimension checks in processInstruction):
 * image is N x C*H*W, filter is K x C*R*S, dout is N x K*P*Q.
 */
public class ConvolutionGPUInstruction extends UnaryCPInstruction {

	// second input operand (filter or dout, depending on opcode)
	private CPOperand _in2;
	private ArrayList<CPOperand> _input_shape;
	private ArrayList<CPOperand> _filter_shape;
	// NOTE(review): these two initializers are dead — the constructor always
	// overwrites _stride and _padding with its arguments
	private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
	private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();

	// Per-invocation shape state, populated at the start of processInstruction:
	// N=batch, C=channels, H/W=image height/width, K=num filters, R/S=filter
	// height/width, P/Q=output height/width (derived via ConvolutionUtils).
	// NOTE(review): mutable instance state — confirm each instruction instance
	// is executed single-threaded.
	int N; int C; int H; int W;
	int K; int R; int S; int stride_h; int stride_w; int pad_h; int pad_w;
	int P; int Q;

	public ConvolutionGPUInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode,
			String istr, ArrayList<CPOperand> stride,
			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
			ArrayList<CPOperand> filter_shape) {
		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
				opcode, istr);
		_in2 = in2;
		_cptype = CPINSTRUCTION_TYPE.Convolution;
		_stride = stride;
		_padding = padding;
		_input_shape = input_shape;
		_filter_shape = filter_shape;
	}

	/**
	 * Parses a convolution GPU instruction of the form:
	 * {@code <opcode> <in> <in2> <stride1> <stride2> <pad1> <pad2>
	 * <input_shape1..4> <filter_shape1..4> <out>} (15 fields after the opcode).
	 *
	 * @param str instruction string
	 * @return the parsed instruction
	 * @throws DMLRuntimeException if the opcode is not a convolution opcode
	 */
	public static ConvolutionGPUInstruction parseInstruction(String str)
			throws DMLRuntimeException {
		CPOperand in = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
		CPOperand out = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);

		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
		String opcode = parts[0];
		if (opcode.equalsIgnoreCase("conv2d") ||
			opcode.equalsIgnoreCase("conv2d_backward_filter") ||
			opcode.equalsIgnoreCase("conv2d_backward_data")) {
			InstructionUtils.checkNumFields(parts, 15);
			// field layout: in, in2 (dout/filter), stride1, stride2, padding1, padding2,
			// input_shape1..input_shape4, filter_shape1..filter_shape4, out
			in.split(parts[1]);
			CPOperand in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
			in2.split(parts[2]);
			out.split(parts[15]);

			ArrayList<CPOperand> stride = new ArrayList<CPOperand>();
			ArrayList<CPOperand> padding = new ArrayList<CPOperand>();
			ArrayList<CPOperand> input_shape = new ArrayList<CPOperand>();
			ArrayList<CPOperand> filter_shape = new ArrayList<CPOperand>();
			stride.add(new CPOperand(parts[3]));
			stride.add(new CPOperand(parts[4]));
			padding.add(new CPOperand(parts[5]));
			padding.add(new CPOperand(parts[6]));
			input_shape.add(new CPOperand(parts[7]));
			input_shape.add(new CPOperand(parts[8]));
			input_shape.add(new CPOperand(parts[9]));
			input_shape.add(new CPOperand(parts[10]));
			filter_shape.add(new CPOperand(parts[11]));
			filter_shape.add(new CPOperand(parts[12]));
			filter_shape.add(new CPOperand(parts[13]));
			filter_shape.add(new CPOperand(parts[14]));

			return new ConvolutionGPUInstruction(in, in2, out, opcode, str, stride,
					padding, input_shape, filter_shape);
		}
		else {
			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str);
		}
	}

	/** Resolves the index-th operand of the given list to an int scalar value. */
	private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL,
			int index) throws DMLRuntimeException {
		return (int) ec.getScalarInput(aL.get(index).getName(),
				aL.get(index).getValueType(), aL.get(index).isLiteral())
				.getLongValue();
	}

	@Override
	public void processInstruction(ExecutionContext ec)
			throws DMLRuntimeException {

		Statistics.incrementNoOfExecutedGPUInst();

		MatrixObject out = null;
		if (instOpcode.equalsIgnoreCase("conv2d") ||
			instOpcode.equalsIgnoreCase("conv2d_backward_filter") ||
			instOpcode.equalsIgnoreCase("conv2d_backward_data")) {

			// resolve all shape/stride/padding scalars into instance fields
			pad_h = getScalarInput(ec, _padding, 0);
			pad_w = getScalarInput(ec, _padding, 1);
			stride_h = getScalarInput(ec, _stride, 0);
			stride_w = getScalarInput(ec, _stride, 1);

			N = getScalarInput(ec, _input_shape, 0);
			C = getScalarInput(ec, _input_shape, 1);
			H = getScalarInput(ec, _input_shape, 2);
			W = getScalarInput(ec, _input_shape, 3);

			K = getScalarInput(ec, _filter_shape, 0);
			// filter_shape[1] (channels) is intentionally unused here; C comes
			// from input_shape
			R = getScalarInput(ec, _filter_shape, 2);
			S = getScalarInput(ec, _filter_shape, 3);

			// output spatial dims from image/filter dims, stride and padding
			P = (int) ConvolutionUtils.getP(H, R, stride_h, pad_h);
			Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);

			// NOTE(review): all throws below occur after GPU inputs are acquired
			// but before the releases at the end — confirm whether acquired
			// inputs leak on the error paths (consider try/finally).
			if (instOpcode.equalsIgnoreCase("conv2d")) {
				MatrixObject image = ec.getMatrixInputForGPUInstruction(input1.getName());
				MatrixObject filter = ec.getMatrixInputForGPUInstruction(_in2.getName());
				if(image.getMatrixBlock().isInSparseFormat() || filter.getMatrixBlock().isInSparseFormat()) {
					throw new DMLRuntimeException("Sparse convolution not implemented");
				}
				if(image.getNumRows() != N || image.getNumColumns() != C*H*W)
					throw new DMLRuntimeException("Incorrect dimensions for image in conv2d");
				if(filter.getNumRows() != K || filter.getNumColumns() != C*R*S)
					throw new DMLRuntimeException("Incorrect dimensions for filter in conv2d");

				// metadata must be set before fetching the GPU output object
				ec.setMetaData(output.getName(), N, K * P * Q);
				out = ec.getMatrixOutputForGPUInstruction(output.getName(), false);
				LibMatrixCUDA.conv2d(image, filter, out, N, C, H, W,
						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);

			}
			else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
				// gradient w.r.t. the filter: inputs are image and dout
				MatrixObject image = ec.getMatrixInputForGPUInstruction(input1.getName());
				MatrixObject dout = ec.getMatrixInputForGPUInstruction(_in2.getName());
				if(image.getMatrixBlock().isInSparseFormat() || dout.getMatrixBlock().isInSparseFormat())
					throw new DMLRuntimeException("Sparse convolution_backward_filter not implemented");
				if(image.getNumRows() != N || image.getNumColumns() != C*H*W)
					throw new DMLRuntimeException("Incorrect dimensions for image in conv2d_backward_filter");
				if(dout.getNumRows() != N || dout.getNumColumns() != K*P*Q)
					throw new DMLRuntimeException("Incorrect dimensions for dout in conv2d_backward_filter: " +
							dout.getNumRows() + " != " + N + " || " + dout.getNumColumns() + " != " + K*P*Q);

				ec.setMetaData(output.getName(), K, C * R * S);
				out = ec.getMatrixOutputForGPUInstruction(output.getName(), false);
				LibMatrixCUDA.conv2d_backward_filter(image, dout, out, N, C, H, W,
						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
				// TODO: For now always copy the device data to host
				// ec.gpuCtx.copyDeviceToHost(outputBlock);
			}
			else if (instOpcode.equalsIgnoreCase("conv2d_backward_data")) {
				// gradient w.r.t. the input image: inputs are filter and dout
				MatrixObject filter = ec.getMatrixInputForGPUInstruction(input1.getName());
				MatrixObject dout = ec.getMatrixInputForGPUInstruction(_in2.getName());
				if(filter.getMatrixBlock().isInSparseFormat() || dout.getMatrixBlock().isInSparseFormat())
					throw new DMLRuntimeException("Sparse convolution_backward_data not implemented");
				if(filter.getNumRows() != K || filter.getNumColumns() != C*R*S)
					throw new DMLRuntimeException("Incorrect dimensions for filter in convolution_backward_data");
				if(dout.getNumRows() != N || dout.getNumColumns() != K*P*Q)
					throw new DMLRuntimeException("Incorrect dimensions for dout in conv2d_backward_data: " +
							dout.getNumRows() + " != " + N + " || " + dout.getNumColumns() + " != " + K*P*Q);

				ec.setMetaData(output.getName(), N, C * H * W);
				out = ec.getMatrixOutputForGPUInstruction(output.getName(), false);
				LibMatrixCUDA.conv2d_backward_data(filter, dout, out, N, C, H, W,
						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
			}
			else {
				throw new DMLRuntimeException("Unsupported GPU context for " + instOpcode);
			}
		}
		else {
			throw new DMLRuntimeException("Unsupported op code " + instOpcode);
		}
		// release inputs/outputs
		ec.releaseMatrixInputForGPUInstruction(input1.getName());
		ec.releaseMatrixInputForGPUInstruction(_in2.getName());
		ec.releaseMatrixOutputForGPUInstruction(output.getName());
	}

}