Refactored GPU{Context,Object} to make it friendlier for parfor - Folded JCuda{Context,Object} to GPU{Context,Object} - Removed "deviceMemBytes", it was redundant - Removed all synchronized in GPU{Object,Context} - print GPUContext from everywhere in log.trace - LibMatrixCUDA functions expect a GPUContext instead of getting it statically - Restructured GPUContext to use a pool of already initialized GPUContexts - Call cudaSetDevice when on different thread - TODO FIXME Disabled cublasDgeam for scalarMatrixArithmetic - TODO FIXME revisit the need to always force gpu to be used, mem est broken - Ability to restrict parfor from picking up all GPUs on the machine, from a system property
Closes #462 Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/129f0f6b Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/129f0f6b Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/129f0f6b Branch: refs/heads/master Commit: 129f0f6b0e0f6167e4137c6d47374ab96501b888 Parents: 9ed27ad Author: Nakul Jindal <naku...@gmail.com> Authored: Fri Apr 21 16:22:19 2017 -0700 Committer: Nakul Jindal <naku...@gmail.com> Committed: Fri Apr 21 16:22:19 2017 -0700 ---------------------------------------------------------------------- conf/SystemML-config.xml.template | 3 + .../java/org/apache/sysml/api/DMLScript.java | 27 +- .../api/mlcontext/MLContextConversionUtil.java | 4 +- .../sysml/api/mlcontext/ScriptExecutor.java | 25 +- .../java/org/apache/sysml/conf/DMLConfig.java | 17 +- .../controlprogram/ParForProgramBlock.java | 25 + .../controlprogram/caching/CacheableData.java | 80 +- .../context/ExecutionContext.java | 56 +- .../controlprogram/parfor/LocalParWorker.java | 5 + .../controlprogram/parfor/ParWorker.java | 5 + .../parfor/opt/OptTreeConverter.java | 2 +- .../cp/FunctionCallCPInstruction.java | 13 +- .../gpu/AggregateBinaryGPUInstruction.java | 4 +- .../gpu/AggregateUnaryGPUInstruction.java | 3 +- .../gpu/ConvolutionGPUInstruction.java | 25 +- .../instructions/gpu/GPUInstruction.java | 3 +- .../instructions/gpu/MMTSJGPUInstruction.java | 3 +- .../gpu/MatrixBuiltinGPUInstruction.java | 5 +- .../MatrixMatrixArithmeticGPUInstruction.java | 5 +- .../gpu/MatrixMatrixAxpyGPUInstruction.java | 5 +- .../instructions/gpu/ReorgGPUInstruction.java | 5 +- .../ScalarMatrixArithmeticGPUInstruction.java | 4 +- .../instructions/gpu/context/CSRPointer.java | 457 ++++++ .../instructions/gpu/context/GPUContext.java | 619 ++++++- .../gpu/context/GPUContextPool.java | 158 ++ .../instructions/gpu/context/GPUObject.java | 957 ++++++++--- 
.../instructions/gpu/context/JCudaContext.java | 286 ---- .../instructions/gpu/context/JCudaKernels.java | 70 +- .../instructions/gpu/context/JCudaObject.java | 1330 --------------- .../runtime/matrix/data/LibMatrixCUDA.java | 1509 ++++++++++-------- .../test/integration/AutomatedTestBase.java | 2 +- 31 files changed, 3035 insertions(+), 2677 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/conf/SystemML-config.xml.template ---------------------------------------------------------------------- diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template index a4c7b2f..fe4437f 100644 --- a/conf/SystemML-config.xml.template +++ b/conf/SystemML-config.xml.template @@ -71,4 +71,7 @@ <!-- prints extra statistics information for Deep Neural Networks done in CP mode --> <systemml.stats.extraDNN>false</systemml.stats.extraDNN> + + <!-- sets the maximum number of GPUs per process, -1 for all GPUs --> + <systemml.gpu.perProcessMax>-1</systemml.gpu.perProcessMax> </root> http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/api/DMLScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java index ce60d55..febbf13 100644 --- a/src/main/java/org/apache/sysml/api/DMLScript.java +++ b/src/main/java/org/apache/sysml/api/DMLScript.java @@ -89,6 +89,7 @@ import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter; import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysml.runtime.controlprogram.parfor.util.IDHandler; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.io.IOUtilFunctions; import 
org.apache.sysml.runtime.matrix.CleanupMR; import org.apache.sysml.runtime.matrix.data.LibMatrixDNN; @@ -111,7 +112,7 @@ public class DMLScript HADOOP, // execute all matrix operations in MR SINGLE_NODE, // execute all matrix operations in CP HYBRID, // execute matrix operations in CP or MR - HYBRID_SPARK, // execute matrix operations in CP or Spark + HYBRID_SPARK, // execute matrix operations in CP or Spark SPARK // execute matrix operations in Spark } @@ -476,10 +477,6 @@ public class DMLScript return true; } - if (USE_ACCELERATOR){ - GPUContext.getGPUContext(); - } - if (dmlOptions.clean) { cleanSystemMLWorkspace(); return true; @@ -834,21 +831,35 @@ public class DMLScript // into performance problems are recorded and displayed GPUStatistics.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS); LibMatrixDNN.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_DNN_STATS); + + // Sets the maximum number of GPUs per process, -1 for all available GPUs + GPUContextPool.PER_PROCESS_MAX_GPUS = dmlconf.getIntValue(DMLConfig.MAX_GPUS_PER_PROCESS); //Step 10: execute runtime program Statistics.startRunTimer(); ExecutionContext ec = null; + GPUContext gCtx = null; try { //run execute (w/ exception handling to ensure proper shutdown) ec = ExecutionContextFactory.createContext(rtprog); + if (DMLScript.USE_ACCELERATOR && ec != null){ + gCtx = GPUContextPool.getFromPool(); + gCtx.initializeThread(); + ec.setGPUContext(gCtx); + if (gCtx == null) { + throw new DMLRuntimeException("GPU : Could not create GPUContext, either no GPU or all GPUs currently in use"); + } + } rtprog.execute( ec ); } finally //ensure cleanup/shutdown - { - if(DMLScript.USE_ACCELERATOR && ec != null) - ec.destroyGPUContext(); + { + if(DMLScript.USE_ACCELERATOR && ec.getGPUContext() != null) { + GPUContextPool.returnToPool(ec.getGPUContext()); + } + if( dmlconf.getBooleanValue(DMLConfig.CODEGEN) ) SpoofCompiler.cleanupCodeGenerator(); if(ec != null && ec instanceof 
SparkExecutionContext) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java index dc20108..b37f037 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java @@ -169,7 +169,7 @@ public class MLContextConversionUtil { matrixObject.acquireModify(matrixBlock); matrixObject.release(); return matrixObject; - } catch (CacheException e) { + } catch (DMLRuntimeException e) { throw new MLContextException("Exception converting MatrixBlock to MatrixObject", e); } } @@ -197,7 +197,7 @@ public class MLContextConversionUtil { frameObject.acquireModify(frameBlock); frameObject.release(); return frameObject; - } catch (CacheException e) { + } catch (DMLRuntimeException e) { throw new MLContextException("Exception converting FrameBlock to FrameObject", e); } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java index ac2b92c..2368c90 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java @@ -48,6 +48,7 @@ import org.apache.sysml.runtime.controlprogram.Program; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory; import 
org.apache.sysml.runtime.instructions.gpu.context.GPUContext; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.utils.Explain; import org.apache.sysml.utils.Explain.ExplainCounts; import org.apache.sysml.utils.Explain.ExplainType; @@ -244,6 +245,18 @@ public class ScriptExecutor { if (symbolTable != null) { executionContext.setVariables(symbolTable); } + try { + if (gpu) { + GPUContext gCtx = GPUContextPool.getFromPool(); + if (gCtx == null) + throw new MLContextException("GPU : no GPUs or no more free GPUs available"); + executionContext.setGPUContext(gCtx); + gCtx.initializeThread(); + } + } catch (DMLRuntimeException e) { + throw new MLContextException("GPU : Exception occurred during initialization"); + } + } /** @@ -324,12 +337,6 @@ public class ScriptExecutor { script.setScriptExecutor(this); // Set global variable indicating the script type DMLScript.SCRIPT_TYPE = script.getScriptType(); - try { - if (gpu) - GPUContext.getGPUContext(); - } catch (DMLRuntimeException e) { - throw new MLContextException("Exception occurred during initialization of GPU", e); - } } /** @@ -338,8 +345,10 @@ public class ScriptExecutor { protected void cleanupAfterExecution() { restoreInputsInSymbolTable(); try { - if (gpu) - executionContext.destroyGPUContext(); + if (gpu) { + GPUContext gCtx = executionContext.getGPUContext(); + GPUContextPool.returnToPool(gCtx); + } } catch (DMLRuntimeException e) { throw new MLContextException("Exception occurred during cleanup of GPU related resources", e); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/conf/DMLConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index a42b1ca..e974a71 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ 
b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -39,17 +39,17 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.parser.ParseException; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.util.LocalFileUtils; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; -public class DMLConfig +public class DMLConfig { public static final String DEFAULT_SYSTEMML_CONFIG_FILEPATH = "./SystemML-config.xml"; @@ -76,12 +76,11 @@ public class DMLConfig public static final String CODEGEN_LITERALS = "codegen.literals"; //1..heuristic, 2..always public static final String EXTRA_GPU_STATS = "systemml.stats.extraGPU"; //boolean public static final String EXTRA_DNN_STATS = "systemml.stats.extraDNN"; //boolean + public static final String MAX_GPUS_PER_PROCESS = "systemml.gpu.perProcessMax"; // int, maximum number of gpus to use, -1 for all - // Fraction of available memory to use. The available memory is computer when the JCudaContext is created + // Fraction of available memory to use. The available memory is computed when the GPUContext is created // to handle the tradeoff on calling cudaMemGetInfo too often. public static final String GPU_MEMORY_UTILIZATION_FACTOR = "gpu.memory.util.factor"; - // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application. 
- public static final String REFRESH_AVAILABLE_MEMORY_EVERY_TIME = "gpu.memory.refresh"; // supported prefixes for custom map/reduce configurations public static final String PREFIX_MAPRED = "mapred"; @@ -121,7 +120,7 @@ public class DMLConfig _defaultVals.put(EXTRA_DNN_STATS, "false" ); _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); - _defaultVals.put(REFRESH_AVAILABLE_MEMORY_EVERY_TIME, "true" ); + _defaultVals.put(MAX_GPUS_PER_PROCESS, "-1"); } public DMLConfig() http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java index d6186f3..f3de422 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java @@ -102,6 +102,8 @@ import org.apache.sysml.runtime.instructions.cp.DoubleObject; import org.apache.sysml.runtime.instructions.cp.IntObject; import org.apache.sysml.runtime.instructions.cp.StringObject; import org.apache.sysml.runtime.instructions.cp.VariableCPInstruction; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.io.IOUtilFunctions; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.OutputInfo; @@ -629,6 +631,11 @@ public class ParForProgramBlock extends ForProgramBlock switch( _execMode ) { case LOCAL: //create parworkers as local threads + if (DMLScript.USE_ACCELERATOR) { + GPUContextPool.returnToPool(ec.getGPUContext()); + ec.setGPUContext(null); + setDegreeOfParallelism(GPUContextPool.getDeviceCount()); + } 
executeLocalParFor(ec, iterVar, from, to, incr); break; @@ -718,6 +725,7 @@ public class ParForProgramBlock extends ForProgramBlock private void executeLocalParFor( ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr ) throws DMLRuntimeException, InterruptedException { + LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads); /* Step 1) init parallel workers, task queue and threads * start threads (from now on waiting for tasks) * Step 2) create tasks @@ -820,6 +828,17 @@ public class ParForProgramBlock extends ForProgramBlock _prog.removeFunctionProgramBlock(parts[0], parts[1]); } } + + // Frees up the GPUContexts used in the threaded Parfor and sets + // the main thread to use the GPUContext + if (DMLScript.USE_ACCELERATOR) { + for (int i = 0; i < _numThreads; i++) { + GPUContext gCtx = workers[i].getExecutionContext().getGPUContext(); + GPUContextPool.returnToPool(gCtx); + } + ec.setGPUContext(GPUContextPool.getFromPool()); + ec.getGPUContext().initializeThread(); + } } finally { @@ -1399,6 +1418,12 @@ public class ParForProgramBlock extends ForProgramBlock //deep copy execution context (including prepare parfor update-in-place) ExecutionContext cpEc = ProgramConverter.createDeepCopyExecutionContext(ec); + + // If GPU mode is enabled, gets a GPUContext from the pool of GPUContexts + // and sets it in the ExecutionContext + if (DMLScript.USE_ACCELERATOR){ + cpEc.setGPUContext(GPUContextPool.getFromPool()); + } //prepare basic update-in-place variables (vars dropped on result merge) prepareUpdateInPlaceVariables(cpEc, pwID); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java index 054b333..904eb87 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java @@ -22,6 +22,8 @@ package org.apache.sysml.runtime.controlprogram.caching; import java.io.File; import java.io.IOException; import java.lang.ref.SoftReference; +import java.util.HashMap; +import java.util.Map; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.lang.mutable.MutableBoolean; @@ -34,9 +36,10 @@ import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy; -import org.apache.sysml.runtime.instructions.gpu.context.GPUObject; import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence; import org.apache.sysml.runtime.instructions.cp.Data; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; +import org.apache.sysml.runtime.instructions.gpu.context.GPUObject; import org.apache.sysml.runtime.instructions.spark.data.BroadcastObject; import org.apache.sysml.runtime.instructions.spark.data.RDDObject; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; @@ -50,6 +53,7 @@ import org.apache.sysml.runtime.matrix.data.OutputInfo; import org.apache.sysml.runtime.util.LocalFileUtils; import org.apache.sysml.runtime.util.MapReduceTool; + /** * Each object of this class is a cache envelope for some large piece of data * called "cache block". For example, the body of a matrix can be the cache block. 
@@ -187,7 +191,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data //for lazily evaluated RDDs, and (2) as abstraction for environments that do not necessarily have spark libraries available private RDDObject _rddHandle = null; //RDD handle private BroadcastObject<T> _bcHandle = null; //Broadcast handle - protected GPUObject _gpuHandle = null; + protected HashMap<GPUContext, GPUObject> _gpuObjects = null; //Per GPUContext object allocated on GPU /** * Basic constructor for any cacheable data. @@ -200,6 +204,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data _uniqueID = (int)_seq.getNextID(); _cacheStatus = CacheStatus.EMPTY; _numReadThreads = 0; + _gpuObjects = new HashMap<>(); } /** @@ -213,7 +218,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data _hdfsFileName = that._hdfsFileName; _hdfsFileExists = that._hdfsFileExists; _varName = that._varName; - _gpuHandle = that._gpuHandle; + _gpuObjects = that._gpuObjects; } @@ -341,15 +346,16 @@ public abstract class CacheableData<T extends CacheBlock> extends Data bc.setBackReference(this); } - public GPUObject getGPUObject() { - return _gpuHandle; + public synchronized GPUObject getGPUObject(GPUContext gCtx) { + return _gpuObjects.get(gCtx); } - - public void setGPUObject(GPUObject handle) { - _gpuHandle = handle; + + public synchronized void setGPUObject(GPUContext gCtx, GPUObject gObj) throws DMLRuntimeException { + GPUObject old = _gpuObjects.put(gCtx, gObj); + if (old != null) + throw new DMLRuntimeException("GPU : Inconsistent internal state - this CacheableData already has a GPUObject assigned to the current GPUContext (" + gCtx + ")"); } - // ********************************************* // *** *** // *** HIGH-LEVEL METHODS THAT SPECIFY *** @@ -384,12 +390,20 @@ public abstract class CacheableData<T extends CacheBlock> extends Data if( _data == null ) getCache(); - //call acquireHostRead if gpuHandle is set as well as is allocated - if( 
_gpuHandle != null && _gpuHandle.isAllocated()) { - _gpuHandle.acquireHostRead(); - if( _data == null ) - getCache(); - } + //call acquireHostRead if gpuHandle is set as well as is allocated + boolean copiedFromGPU = false; + for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) { + GPUObject gObj = kv.getValue(); + if (gObj != null && copiedFromGPU && gObj.isDirty()) { + LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU"); + throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU"); + } else if (gObj != null){ + copiedFromGPU = gObj.acquireHostRead(); + if( _data == null ) + getCache(); + } + } + //read data from HDFS/RDD if required //(probe data for cache_nowrite / jvm_reuse) if( isEmpty(true) && _data==null ) @@ -517,10 +531,10 @@ public abstract class CacheableData<T extends CacheBlock> extends Data * * @param newData new data * @return cacheable data - * @throws CacheException if CacheException occurs + * @throws DMLRuntimeException if error occurs */ public synchronized T acquireModify(T newData) - throws CacheException + throws DMLRuntimeException { if( LOG.isTraceEnabled() ) LOG.trace("Acquire modify newdata "+getVarName()); @@ -631,10 +645,10 @@ public abstract class CacheableData<T extends CacheBlock> extends Data * In-Status: EMPTY, EVICTABLE, EVICTED; * Out-Status: EMPTY. 
* - * @throws CacheException if CacheException occurs + * @throws DMLRuntimeException if error occurs */ public synchronized void clearData() - throws CacheException + throws DMLRuntimeException { if( LOG.isTraceEnabled() ) LOG.trace("Clear data "+getVarName()); @@ -661,9 +675,14 @@ public abstract class CacheableData<T extends CacheBlock> extends Data _rddHandle.setBackReference(null); if( _bcHandle != null ) _bcHandle.setBackReference(null); - if( _gpuHandle != null ) - _gpuHandle.clearData(); - + if( _gpuObjects != null ) { + for (GPUObject gObj : _gpuObjects.values()){ + if (gObj != null) { + gObj.clearData(); + } + } + } + // change object state EMPTY setDirty(false); setEmpty(); @@ -731,10 +750,19 @@ public abstract class CacheableData<T extends CacheBlock> extends Data LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat); - //TODO remove - if( getGPUObject() != null && getGPUObject().isAllocated() ) { - getGPUObject().acquireHostRead(); - } + //TODO remove + boolean copiedFromGPU = false; + for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) { + GPUObject gObj = kv.getValue(); + if (gObj != null && copiedFromGPU && gObj.isDirty()) { + LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU"); + throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU"); + } else if (gObj != null){ + copiedFromGPU = gObj.acquireHostRead(); + if( _data == null ) + getCache(); + } + } boolean pWrite = false; // !fName.equals(_hdfsFileName); //persistent write flag if ( fName.equals(_hdfsFileName) ) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java index b6e1830..35b4cd1 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java @@ -22,6 +22,8 @@ package org.apache.sysml.runtime.controlprogram.context; import java.util.ArrayList; import java.util.HashMap; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; import org.apache.sysml.debug.DMLFrame; import org.apache.sysml.debug.DMLProgramCounter; @@ -52,9 +54,9 @@ import org.apache.sysml.runtime.matrix.data.Pair; import org.apache.sysml.runtime.util.MapReduceTool; -public class ExecutionContext -{ - +public class ExecutionContext { + protected static final Log LOG = LogFactory.getLog(ExecutionContext.class.getName()); + //program reference (e.g., function repository) protected Program _prog = null; @@ -64,6 +66,8 @@ public class ExecutionContext //debugging (optional) protected DebugState _dbState = null; + protected GPUContext _gpuContext = null; + protected ExecutionContext() { //protected constructor to force use of ExecutionContextFactory @@ -94,13 +98,15 @@ public class ExecutionContext public void setVariables(LocalVariableMap vars) { _variables = vars; } - - public void destroyGPUContext() throws DMLRuntimeException { - if(GPUContext.isGPUContextCreated) - GPUContext.getGPUContext().destroy(); - } - - + + public GPUContext getGPUContext() { + return _gpuContext; + } + + public void setGPUContext(GPUContext _gpuContext) { + this._gpuContext = _gpuContext; + } + /* ------------------------------------------------------- * Methods to handle variables and associated data objects * ------------------------------------------------------- @@ -232,7 +238,7 @@ public class ExecutionContext throws 
DMLRuntimeException { MatrixObject mo = allocateGPUMatrixObject(varName); - boolean allocated = mo.getGPUObject().acquireDeviceModifyDense(); + boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifyDense(); mo.getMatrixCharacteristics().setNonZeros(-1); return new Pair<MatrixObject, Boolean>(mo, allocated); } @@ -251,7 +257,7 @@ public class ExecutionContext { MatrixObject mo = allocateGPUMatrixObject(varName); mo.getMatrixCharacteristics().setNonZeros(nnz); - boolean allocated = mo.getGPUObject().acquireDeviceModifySparse(); + boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifySparse(); return new Pair<MatrixObject, Boolean>(mo, allocated); } @@ -263,8 +269,9 @@ public class ExecutionContext */ public MatrixObject allocateGPUMatrixObject(String varName) throws DMLRuntimeException { MatrixObject mo = getMatrixObject(varName); - if( mo.getGPUObject() == null ) { - mo.setGPUObject(GPUContext.createGPUObject(mo)); + if( mo.getGPUObject(getGPUContext()) == null ) { + GPUObject newGObj = getGPUContext().createGPUObject(mo); + mo.setGPUObject(getGPUContext(), newGObj); } return mo; } @@ -272,20 +279,23 @@ public class ExecutionContext public Pair<MatrixObject, Boolean> getMatrixInputForGPUInstruction(String varName) throws DMLRuntimeException { + GPUContext gCtx = getGPUContext(); boolean copied = false; MatrixObject mo = getMatrixObject(varName); if(mo == null) { throw new DMLRuntimeException("No matrix object available for variable:" + varName); } - if( mo.getGPUObject() == null ) { - mo.setGPUObject(GPUContext.createGPUObject(mo)); - } + boolean acquired = false; - if( !mo.getGPUObject().isAllocated() ) { + if( mo.getGPUObject(gCtx) == null ) { + GPUObject newGObj = gCtx.createGPUObject(mo); + mo.setGPUObject(gCtx, newGObj); + } else if( !mo.getGPUObject(gCtx).isInputAllocated() ) { mo.acquireRead(); acquired = true; } - copied = mo.getGPUObject().acquireDeviceRead(); + + copied = mo.getGPUObject(gCtx).acquireDeviceRead(); 
if(acquired) { mo.release(); } @@ -309,7 +319,7 @@ public class ExecutionContext throws DMLRuntimeException { MatrixObject mo = getMatrixObject(varName); - mo.getGPUObject().releaseInput(); + mo.getGPUObject(getGPUContext()).releaseInput(); } /** @@ -361,10 +371,10 @@ public class ExecutionContext public void releaseMatrixOutputForGPUInstruction(String varName) throws DMLRuntimeException { MatrixObject mo = getMatrixObject(varName); - if(mo.getGPUObject() == null || !mo.getGPUObject().isAllocated()) { + if(mo.getGPUObject(getGPUContext()) == null || !mo.getGPUObject(getGPUContext()).isAllocated()) { throw new DMLRuntimeException("No output is allocated on GPU"); } - mo.getGPUObject().releaseOutput(); + mo.getGPUObject(getGPUContext()).releaseOutput(); } public void setMatrixOutput(String varName, MatrixBlock outputData) @@ -486,7 +496,7 @@ public class ExecutionContext return varlist; } - public void cleanupMatrixObject(MatrixObject mo) + public void cleanupMatrixObject(MatrixObject mo) throws DMLRuntimeException { try http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java index e146821..c4684ec 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java @@ -21,6 +21,7 @@ package org.apache.sysml.runtime.controlprogram.parfor; import java.util.Collection; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.CompilerConfig; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.OptimizerUtils; @@ -79,6 +80,10 @@ public class LocalParWorker extends ParWorker implements 
Runnable SparkExecutionContext sec = (SparkExecutionContext)_ec; sec.setThreadLocalSchedulerPool("parforPool"+_workerID); } + + // Initialize this GPUContext to this thread + if (DMLScript.USE_ACCELERATOR) + _ec.getGPUContext().initializeThread(); //setup compiler config for worker thread ConfigurationManager.setLocalConfig(_cconf); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java index e0b30f4..05872b1 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java @@ -49,6 +49,11 @@ public abstract class ParWorker protected long _workerID = -1; protected ArrayList<ProgramBlock> _childBlocks = null; + + public ExecutionContext getExecutionContext() { + return _ec; + } + protected ExecutionContext _ec = null; protected ArrayList<String> _resultVars = null; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java index 5c693d5..1d5a195 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java @@ -516,7 +516,7 @@ public class OptTreeConverter LopProperties.ExecType et = (hop.getExecType()!=null) ? 
hop.getExecType() : LopProperties.ExecType.CP; switch( et ) { - case CP: + case CP:case GPU: node.setExecType(ExecType.CP); break; case SPARK: node.setExecType(ExecType.SPARK); break; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java index c65553b..0958aeb 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java @@ -172,8 +172,12 @@ public class FunctionCallCPInstruction extends CPInstruction // Create a symbol table under a new execution context for the function invocation, // and copy the function arguments into the created table. 
ExecutionContext fn_ec = ExecutionContextFactory.createContext(false, ec.getProgram()); + if (DMLScript.USE_ACCELERATOR) { + fn_ec.setGPUContext(ec.getGPUContext()); + ec.setGPUContext(null); + fn_ec.getGPUContext().initializeThread(); + } fn_ec.setVariables(functionVariables); - // execute the function block try { fpb._functionName = this._functionName; @@ -187,7 +191,6 @@ public class FunctionCallCPInstruction extends CPInstruction String fname = DMLProgram.constructFunctionKey(_namespace, _functionName); throw new DMLRuntimeException("error executing function " + fname, e); } - LocalVariableMap retVars = fn_ec.getVariables(); // cleanup all returned variables w/o binding @@ -206,6 +209,12 @@ public class FunctionCallCPInstruction extends CPInstruction // Unpin the pinned variables ec.unpinVariables(_boundInputParamNames, pinStatus); + + if (DMLScript.USE_ACCELERATOR) { + ec.setGPUContext(fn_ec.getGPUContext()); + fn_ec.setGPUContext(null); + ec.getGPUContext().initializeThread(); + } // add the updated binding for each return variable to the variables in original symbol table for (int i=0; i< fpb.getOutputParams().size(); i++){ http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java index 55c8f8d..2531c17 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java @@ -96,7 +96,7 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction int clen = (int) (_isRightTransposed ? 
m2.getNumRows() : m2.getNumColumns()); ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.matmult(ec, getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); + LibMatrixCUDA.matmult(ec, ec.getGPUContext(), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); //release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); @@ -113,6 +113,6 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction @SuppressWarnings("unused") private boolean isSparse(ExecutionContext ec, String var) throws DMLRuntimeException { MatrixObject mo = ec.getMatrixObject(var); - return LibMatrixCUDA.isInSparseFormat(mo); + return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(), mo); } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java index 45db44c..bfe30f9 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java @@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.ReduceCol; import org.apache.sysml.runtime.functionobjects.ReduceRow; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator; import org.apache.sysml.runtime.matrix.operators.Operator; @@ -93,7 +94,7 @@ public class 
AggregateUnaryGPUInstruction extends GPUInstruction { ec.setMetaData(_output.getName(), rlen, 1); } - LibMatrixCUDA.unaryAggregate(ec, getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr); + LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr); //release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java index a02115d..e806f3b 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -18,19 +18,20 @@ */ package org.apache.sysml.runtime.instructions.gpu; +import java.util.ArrayList; + import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.functionobjects.SwapIndex; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.ReorgOperator; import org.apache.sysml.runtime.util.ConvolutionUtils; import org.apache.sysml.utils.GPUStatistics; -import java.util.ArrayList; - public class ConvolutionGPUInstruction extends GPUInstruction { private CPOperand _input1; @@ -186,9 +187,9 @@ public class 
ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns()); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); if(instOpcode.equalsIgnoreCase("bias_add")) - LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias, out); + LibMatrixCUDA.biasAdd(ec.getGPUContext(), getExtendedOpcode(), input, bias, out); else if(instOpcode.equalsIgnoreCase("bias_multiply")) - LibMatrixCUDA.biasMultiply(getExtendedOpcode(), input, bias, out); + LibMatrixCUDA.biasMultiply(ec.getGPUContext(), getExtendedOpcode(), input, bias, out); // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); @@ -202,7 +203,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns()); - LibMatrixCUDA.reluBackward(getExtendedOpcode(), input, dout, out); + LibMatrixCUDA.reluBackward(ec.getGPUContext(), getExtendedOpcode(), input, dout, out); // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); @@ -253,7 +254,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, K * P * Q); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.conv2d(getExtendedOpcode(), image, filter, out, N, C, H, W, + LibMatrixCUDA.conv2d(ec.getGPUContext(), getExtendedOpcode(), image, filter, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) { @@ -268,7 +269,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, K * P * Q); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
_output.getName()); - LibMatrixCUDA.conv2dBiasAdd(getExtendedOpcode(), image, bias, filter, out, N, C, H, W, + LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(), getExtendedOpcode(), image, bias, filter, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) { @@ -283,7 +284,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), K, C * R * S); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.conv2dBackwardFilter(getExtendedOpcode(), image, dout, out, N, C, H, W, + LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); // TODO: For now always copy the device data to host // ec.gpuCtx.copyDeviceToHost(outputBlock); @@ -300,7 +301,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, C * H * W); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.conv2dBackwardData(getExtendedOpcode(), filter, dout, out, N, C, H, W, + LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) { @@ -313,10 +314,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, C * P * Q); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); if(instOpcode.equalsIgnoreCase("maxpooling")) - LibMatrixCUDA.maxpooling(getExtendedOpcode(), image, out, N, C, H, W, + LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); else - LibMatrixCUDA.reluMaxpooling(getExtendedOpcode(), image, out, N, 
C, H, W, + LibMatrixCUDA.reluMaxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) { @@ -331,7 +332,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, C * H * W); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.maxpoolingBackward(getExtendedOpcode(), image, dout, out, N, C, H, W, + LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index ab275e7..0b69b5e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -19,7 +19,6 @@ package org.apache.sysml.runtime.instructions.gpu; -import jcuda.runtime.JCuda; import org.apache.sysml.lops.runtime.RunMRJobs; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; @@ -149,7 +148,7 @@ public abstract class GPUInstruction extends Instruction public void postprocessInstruction(ExecutionContext ec) throws DMLRuntimeException { - JCuda.cudaDeviceSynchronize(); + //JCuda.cudaDeviceSynchronize(); } /** http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java ---------------------------------------------------------------------- diff 
--git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java index 0f2542a..af97141 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java @@ -32,6 +32,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.utils.GPUStatistics; @@ -107,7 +108,7 @@ public class MMTSJGPUInstruction extends GPUInstruction //execute operations ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.matmultTSMM(ec, getExtendedOpcode(), mat, _output.getName(), isLeftTransposed); + LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName(), isLeftTransposed); ec.releaseMatrixInputForGPUInstruction(_input.getName()); ec.releaseMatrixOutputForGPUInstruction(_output.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java index 2766909..3fc8a98 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java +++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java @@ -23,6 +23,7 @@ import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.utils.GPUStatistics; @@ -43,10 +44,10 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction { ec.setMetaData(_output.getName(), mat.getNumRows(), mat.getNumColumns()); if(opcode.equals("sel+")) { - LibMatrixCUDA.relu(ec, getExtendedOpcode(), mat, _output.getName()); + LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); } else if (opcode.equals("exp")) { - LibMatrixCUDA.exp(ec, getExtendedOpcode(), mat, _output.getName()); + LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); } else { throw new DMLRuntimeException("Unsupported GPU operator:" + opcode); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java index 2da1aa6..a1520a9 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java @@ -23,6 +23,7 @@ import org.apache.sysml.runtime.DMLRuntimeException; import 
org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.BinaryOperator; import org.apache.sysml.runtime.matrix.operators.Operator; @@ -71,10 +72,10 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns ec.setMetaData(_output.getName(), (int)rlen, (int)clen); BinaryOperator bop = (BinaryOperator) _optr; - LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop); + LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop); ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); - ec.releaseMatrixOutputForGPUInstruction(_output.getName()); + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java index 2c833e4..28254c6 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java @@ -26,6 +26,7 @@ import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import 
org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.instructions.cp.ScalarObject; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.utils.GPUStatistics; @@ -103,11 +104,11 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi " and input2:" + rlen2 + " X " + clen2); } - LibMatrixCUDA.axpy(ec, getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue()); + LibMatrixCUDA.axpy(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue()); ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); - ec.releaseMatrixOutputForGPUInstruction(_output.getName()); + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); } private boolean isValidMMOperation(long rlen1, long rlen2, long clen1, long clen2) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java index 54ba32e..424a2c5 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java @@ -25,6 +25,7 @@ import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.functionobjects.SwapIndex; import org.apache.sysml.runtime.instructions.InstructionUtils; import 
org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.runtime.matrix.operators.ReorgOperator; @@ -81,10 +82,10 @@ public class ReorgGPUInstruction extends GPUInstruction //execute operation ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.transpose(ec, getExtendedOpcode(), mat, _output.getName()); + LibMatrixCUDA.transpose(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); //release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input.getName()); - ec.releaseMatrixOutputForGPUInstruction(_output.getName()); + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java index 44cc6e2..64cb6c4 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java @@ -60,9 +60,9 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns ScalarOperator sc_op = (ScalarOperator) _optr; sc_op.setConstant(constant.getDoubleValue()); - LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op); + LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op); 
ec.releaseMatrixInputForGPUInstruction(mat.getName()); - ec.releaseMatrixOutputForGPUInstruction(_output.getName()); + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java new file mode 100644 index 0000000..5e202a9 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -0,0 +1,457 @@ +package org.apache.sysml.runtime.instructions.gpu.context; + +import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr; +import static jcuda.jcusparse.JCusparse.cusparseDcsr2dense; +import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase; +import static jcuda.jcusparse.JCusparse.cusparseSetMatType; +import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode; +import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz; +import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz; +import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO; +import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL; +import static jcuda.runtime.JCuda.cudaMemcpy; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.utils.GPUStatistics; + +import jcuda.Pointer; +import jcuda.jcublas.cublasHandle; +import jcuda.jcusparse.cusparseHandle; +import jcuda.jcusparse.cusparseMatDescr; +import 
jcuda.jcusparse.cusparsePointerMode; + +/** + * Compressed Sparse Row (CSR) format for CUDA + * Generalized matrix multiply is implemented for CSR format in the cuSparse library among other operations + */ +public class CSRPointer { + + private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName()); + + /** {@link GPUContext} instance to track the GPU to do work on */ + private final GPUContext gpuContext; + + private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004; + + public static cusparseMatDescr matrixDescriptor; + + /** Number of non zeroes */ + public long nnz; + + /** double array of non zero values */ + public Pointer val; + + /** integer array of start of all rows and end of last row + 1 */ + public Pointer rowPtr; + + /** integer array of nnz values' column indices */ + public Pointer colInd; + + /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported */ + public cusparseMatDescr descr; + + /** + * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)} + * @param gCtx a valid {@link GPUContext} + */ + private CSRPointer(GPUContext gCtx) { + gpuContext = gCtx; + val = new Pointer(); + rowPtr = new Pointer(); + colInd = new Pointer(); + allocateMatDescrPointer(); + } + + private Pointer allocate(String instName, long size) throws DMLRuntimeException { + return getGPUContext().allocate(instName, size); + } + + private Pointer allocate(long size) throws DMLRuntimeException { + return getGPUContext().allocate(size); + } + + private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException { + getGPUContext().cudaFreeHelper(toFree); + } + + private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException { + getGPUContext().cudaFreeHelper(toFree, eager); + } + + private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException { + getGPUContext().cudaFreeHelper(instName, toFree, eager); + } + + private static long 
getDoubleSizeOf(long numElems) { + return numElems * ((long)jcuda.Sizeof.DOUBLE); + } + + private static long getIntSizeOf(long numElems) { + return numElems * ((long)jcuda.Sizeof.INT); + } + + private GPUContext getGPUContext() throws DMLRuntimeException { + return gpuContext; + } + + public static int toIntExact(long l) throws DMLRuntimeException { + if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) { + throw new DMLRuntimeException("Cannot be cast to int:" + l); + } + return (int) l; + } + + /** + * @return Singleton default matrix descriptor object + * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO) + */ + public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() { + if (matrixDescriptor == null) { + // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java + matrixDescriptor = new cusparseMatDescr(); + cusparseCreateMatDescr(matrixDescriptor); + cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO); + } + return matrixDescriptor; + } + + /** + * Estimate the size of a CSR matrix in GPU memory + * Size of pointers is not needed and is not added in + * + * @param nnz2 number of non zeroes + * @param rows number of rows + * @return size estimate + */ + public static long estimateSize(long nnz2, long rows) { + long sizeofValArray = getDoubleSizeOf(nnz2); + long sizeofRowPtrArray = getIntSizeOf(rows + 1); + long sizeofColIndArray = getIntSizeOf(nnz2); + long sizeofDescr = getIntSizeOf(4); + // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as: + // typedef struct { + // cusparseMatrixType_t MatrixType; + // cusparseFillMode_t FillMode; + // cusparseDiagType_t DiagType; + // cusparseIndexBase_t IndexBase; + // } cusparseMatDescr_t; + long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr; + return tot; + } + + /** + * Static method to copy a CSR sparse matrix from Host to 
Device + * + * @param dest [input] destination location (on GPU) + * @param rows number of rows + * @param nnz number of non-zeroes + * @param rowPtr integer array of row pointers + * @param colInd integer array of column indices + * @param values double array of non zero values + */ + public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) { + CSRPointer r = dest; + long t0 = 0; + if (DMLScript.STATISTICS) t0 = System.nanoTime(); + r.nnz = nnz; + cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice); + cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice); + cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice); + if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(3); + } + + /** + * Static method to copy a CSR sparse matrix from Device to host + * + * @param src [input] source location (on GPU) + * @param rows [input] number of rows + * @param nnz [input] number of non-zeroes + * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1) + * @param colInd [output] pre-allocated integer array of column indices of size nnz + * @param values [output] pre-allocated double array of values of size nnz + */ + public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) { + CSRPointer r = src; + long t0 = 0; + if (DMLScript.STATISTICS) t0 = System.nanoTime(); + cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost); + cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost); + cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost); + if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0); + if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevCount.addAndGet(3); + } + + /** + * Estimates the number of non zero elements from the results of a sparse cusparseDgeam operation + * C = a op(A) + b op(B) + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param A Sparse Matrix A on GPU + * @param B Sparse Matrix B on GPU + * @param m Rows in A + * @param n Columns in B + * @return CSR (compressed sparse row) pointer + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n) + throws DMLRuntimeException { + if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { + throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); + } + CSRPointer C = new CSRPointer(gCtx); + step1AllocateRowPointers(gCtx, handle, C, m); + step2GatherNNZGeam(gCtx, handle, A, B, C, m, n); + step3AllocateValNInd(gCtx, handle, C); + return C; + } + + /** + * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B + * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
+ * + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param A Sparse Matrix A on GPU + * @param transA 'T' if A is to be transposed, 'N' otherwise + * @param B Sparse Matrix B on GPU + * @param transB 'T' if B is to be transposed, 'N' otherwise + * @param m Rows in A + * @param n Columns in B + * @param k Columns in A / Rows in B + * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k) + throws DMLRuntimeException { + // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at + // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java + CSRPointer C = new CSRPointer(gCtx); + step1AllocateRowPointers(gCtx, handle, C, m); + step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k); + step3AllocateValNInd(gCtx, handle, C); + return C; + } + + /** + * Check for ultra sparsity + * + * @param rows number of rows + * @param cols number of columns + * @return true if ultra sparse + */ + public boolean isUltraSparse(int rows, int cols) { + double sp = ((double) nnz / rows / cols); + return sp < ULTRA_SPARSITY_TURN_POINT; + } + +// ============================================================================================== + +// The following methods estimate the memory needed for sparse matrices that are +// results of operations on other sparse matrices using the cuSparse Library.
+// The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs +// op = whether to transpose or not +// binaryOperation = For cuSparse, +, - are *(matmul) are supported + +// From CuSparse Manual, +// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach +// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1 +// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC +// and the total number of nonzero elements. In the second step, the user gathers nnzC +//(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr) +// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of +// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam() +// to complete matrix C. + + /** + * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL, + * the default that works for DGEMM. + */ + private void allocateMatDescrPointer() { + this.descr = getDefaultCuSparseMatrixDescriptor(); + } + + /** + * Factory method to allocate an empty CSR Sparse matrix on the GPU + * + * @param nnz2 number of non-zeroes + * @param rows number of rows + * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException { + LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx); + assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU"; + CSRPointer r = new CSRPointer(gCtx); + r.nnz = nnz2; + if (nnz2 == 0) { + // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object + // with no memory allocated on the GPU. 
+ return r; + } + gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2)); + // increment the cudaCount by 1 for the allocation of all 3 arrays + r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2)); + r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1)); + r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2)); + return r; + } + + /** + * Allocate row pointers of m+1 elements + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param C Output matrix + * @param rowsC number of rows in C + * @throws DMLRuntimeException ? + */ + private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException { + LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx); + cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST); + //cudaDeviceSynchronize; + // Do not increment the cudaCount of allocations on GPU + C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1)); + } + + /** + * Determine total number of nonzero element for the cusparseDgeam operation. + * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param A Sparse Matrix A on GPU + * @param B Sparse Matrix B on GPU + * @param C Output Sparse Matrix C on GPU + * @param m Rows in C + * @param n Columns in C + * @throws DMLRuntimeException ? 
+ */ + private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws DMLRuntimeException { + LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx); + int[] CnnzArray = {-1}; + cusparseXcsrgeamNnz(handle, m, n, + A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, + B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd, + C.descr, C.rowPtr, Pointer.to(CnnzArray)); + //cudaDeviceSynchronize; + if (CnnzArray[0] != -1) { + C.nnz = CnnzArray[0]; + } else { + int baseArray[] = {0}; + cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost); + cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); + C.nnz = CnnzArray[0] - baseArray[0]; + } + } + +// ============================================================================================== + + /** + * Determine total number of nonzero element for the cusparseDgemm operation. + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param A Sparse Matrix A on GPU + * @param transA op - whether A is transposed + * @param B Sparse Matrix B on GPU + * @param transB op - whether B is transposed + * @param C Output Sparse Matrix C on GPU + * @param m Number of rows of sparse matrix op ( A ) and C + * @param n Number of columns of sparse matrix op ( B ) and C + * @param k Number of columns/rows of sparse matrix op ( A ) / op ( B ) + * @throws DMLRuntimeException ? 
+ */ + private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException { + LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx); + int[] CnnzArray = {-1}; + if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { + throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); + } + cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, + A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, + B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd, + C.descr, C.rowPtr, Pointer.to(CnnzArray)); + //cudaDeviceSynchronize; + if (CnnzArray[0] != -1) { + C.nnz = CnnzArray[0]; + } else { + int baseArray[] = {0}; + cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost); + cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); + C.nnz = CnnzArray[0] - baseArray[0]; + } + } + + /** + * Allocate val and index pointers. + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param C Output sparse matrix on GPU + * @throws DMLRuntimeException ? + */ + private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) throws DMLRuntimeException { + LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx); + // Increment cudaCount by one when all three arrays of CSR sparse array are allocated + C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz)); + C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz)); + } + + /** + * Copies this CSR matrix on the GPU to a dense column-major matrix + * on the GPU. This is a temporary matrix for operations such as + * cusparseDcsrmv. + * Since the allocated matrix is temporary, bookkeeping is not updated. 
+ * The caller is responsible for calling "free" on the returned Pointer object + * + * @param cusparseHandle a valid {@link cusparseHandle} + * @param cublasHandle a valid {@link cublasHandle} + * @param rows number of rows in this CSR matrix + * @param cols number of columns in this CSR matrix + * @throws DMLRuntimeException if DMLRuntimeException occurs + * @return A {@link Pointer} to the allocated dense matrix (in column-major format) + */ + public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException { + LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext()); + long size = ((long) rows) * getDoubleSizeOf((long) cols); + Pointer A = allocate(size); + // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned. + if (val != null && rowPtr != null && colInd != null && nnz > 0) { + // Note: cusparseDcsr2dense method cannot handle empty blocks + cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows); + //cudaDeviceSynchronize; + } else { + LOG.warn("in CSRPointer, the values array, row pointers array or column indices array was null"); + } + return A; + } + + /** + * Calls cudaFree lazily on the allocated {@link Pointer} instances + */ + public void deallocate() throws DMLRuntimeException { + deallocate(false); + } + + /** + * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances + * + * @param eager whether to do eager or lazy cudaFrees + */ + public void deallocate(boolean eager) throws DMLRuntimeException { + if (nnz > 0) { + cudaFreeHelper(val, eager); + cudaFreeHelper(rowPtr, eager); + cudaFreeHelper(colInd, eager); + } + } + + @Override + public String toString() { + return "CSRPointer{" + + "nnz=" + nnz + + '}'; + } +}