Refactored GPU{Context,Object} to make it friendlier for parfor
- Folded JCuda{Context,Object} into GPU{Context,Object}
- Removed "deviceMemBytes"; it was redundant
- Removed all synchronized blocks in GPU{Object,Context}
- Log the GPUContext in log.trace messages everywhere
- LibMatrixCUDA functions expect a GPUContext instead of getting it statically
- Restructured GPUContext to use a pool of already initialized GPUContexts
- Call cudaSetDevice when running on a different thread
- TODO FIXME Disabled cublasDgeam for scalarMatrixArithmetic
- TODO FIXME revisit the need to always force the GPU to be used; memory estimation is broken
- Ability to restrict parfor from picking up all GPUs on the machine via a configuration property
Closes #462
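
At a glance, the new life cycle is: check a pre-initialized GPUContext out of the pool, bind it to the executing thread, and return it when done. A minimal sketch, assuming only the pool/context methods that appear in the diff below (getFromPool, returnToPool, initializeThread); the surrounding driver code and message text are illustrative:

    GPUContext gCtx = GPUContextPool.getFromPool();   // null when all GPUs are checked out
    if (gCtx == null)
        throw new DMLRuntimeException("GPU : no free GPUContext available"); // illustrative message
    gCtx.initializeThread();                          // cudaSetDevice for the current thread
    try {
        // ... run GPU instructions through an ExecutionContext bound to gCtx ...
    } finally {
        GPUContextPool.returnToPool(gCtx);            // make the device available to other workers
    }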
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit:
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/129f0f6b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/129f0f6b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/129f0f6b
Branch: refs/heads/master
Commit: 129f0f6b0e0f6167e4137c6d47374ab96501b888
Parents: 9ed27ad
Author: Nakul Jindal <[email protected]>
Authored: Fri Apr 21 16:22:19 2017 -0700
Committer: Nakul Jindal <[email protected]>
Committed: Fri Apr 21 16:22:19 2017 -0700
----------------------------------------------------------------------
conf/SystemML-config.xml.template | 3 +
.../java/org/apache/sysml/api/DMLScript.java | 27 +-
.../api/mlcontext/MLContextConversionUtil.java | 4 +-
.../sysml/api/mlcontext/ScriptExecutor.java | 25 +-
.../java/org/apache/sysml/conf/DMLConfig.java | 17 +-
.../controlprogram/ParForProgramBlock.java | 25 +
.../controlprogram/caching/CacheableData.java | 80 +-
.../context/ExecutionContext.java | 56 +-
.../controlprogram/parfor/LocalParWorker.java | 5 +
.../controlprogram/parfor/ParWorker.java | 5 +
.../parfor/opt/OptTreeConverter.java | 2 +-
.../cp/FunctionCallCPInstruction.java | 13 +-
.../gpu/AggregateBinaryGPUInstruction.java | 4 +-
.../gpu/AggregateUnaryGPUInstruction.java | 3 +-
.../gpu/ConvolutionGPUInstruction.java | 25 +-
.../instructions/gpu/GPUInstruction.java | 3 +-
.../instructions/gpu/MMTSJGPUInstruction.java | 3 +-
.../gpu/MatrixBuiltinGPUInstruction.java | 5 +-
.../MatrixMatrixArithmeticGPUInstruction.java | 5 +-
.../gpu/MatrixMatrixAxpyGPUInstruction.java | 5 +-
.../instructions/gpu/ReorgGPUInstruction.java | 5 +-
.../ScalarMatrixArithmeticGPUInstruction.java | 4 +-
.../instructions/gpu/context/CSRPointer.java | 457 ++++++
.../instructions/gpu/context/GPUContext.java | 619 ++++++-
.../gpu/context/GPUContextPool.java | 158 ++
.../instructions/gpu/context/GPUObject.java | 957 ++++++++---
.../instructions/gpu/context/JCudaContext.java | 286 ----
.../instructions/gpu/context/JCudaKernels.java | 70 +-
.../instructions/gpu/context/JCudaObject.java | 1330 ---------------
.../runtime/matrix/data/LibMatrixCUDA.java | 1509 ++++++++++--------
.../test/integration/AutomatedTestBase.java | 2 +-
31 files changed, 3035 insertions(+), 2677 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template
b/conf/SystemML-config.xml.template
index a4c7b2f..fe4437f 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -71,4 +71,7 @@
<!-- prints extra statistics information for Deep Neural Networks done in
CP mode -->
<systemml.stats.extraDNN>false</systemml.stats.extraDNN>
+
+ <!-- sets the maximum number of GPUs per process, -1 for all GPUs -->
+ <systemml.gpu.perProcessMax>-1</systemml.gpu.perProcessMax>
</root>
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java
b/src/main/java/org/apache/sysml/api/DMLScript.java
index ce60d55..febbf13 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -89,6 +89,7 @@ import
org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter;
import
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.controlprogram.parfor.util.IDHandler;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.CleanupMR;
import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
@@ -111,7 +112,7 @@ public class DMLScript
HADOOP, // execute all matrix operations in MR
SINGLE_NODE, // execute all matrix operations in CP
HYBRID, // execute matrix operations in CP or MR
- HYBRID_SPARK, // execute matrix operations in CP or Spark
+ HYBRID_SPARK, // execute matrix operations in CP or Spark
SPARK // execute matrix operations in Spark
}
@@ -476,10 +477,6 @@ public class DMLScript
return true;
}
- if (USE_ACCELERATOR){
- GPUContext.getGPUContext();
- }
-
if (dmlOptions.clean) {
cleanSystemMLWorkspace();
return true;
@@ -834,21 +831,35 @@ public class DMLScript
// into performance problems are recorded and displayed
GPUStatistics.DISPLAY_STATISTICS =
dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS);
LibMatrixDNN.DISPLAY_STATISTICS =
dmlconf.getBooleanValue(DMLConfig.EXTRA_DNN_STATS);
+
+ // Sets the maximum number of GPUs per process, -1 for all
available GPUs
+ GPUContextPool.PER_PROCESS_MAX_GPUS =
dmlconf.getIntValue(DMLConfig.MAX_GPUS_PER_PROCESS);
//Step 10: execute runtime program
Statistics.startRunTimer();
ExecutionContext ec = null;
+ GPUContext gCtx = null;
try
{
//run execute (w/ exception handling to ensure proper
shutdown)
ec = ExecutionContextFactory.createContext(rtprog);
+ if (DMLScript.USE_ACCELERATOR && ec != null){
+ gCtx = GPUContextPool.getFromPool();
+ if (gCtx == null) {
+ throw new DMLRuntimeException("GPU : Could not create GPUContext, either no GPU or all GPUs currently in use");
+ }
+ gCtx.initializeThread();
+ ec.setGPUContext(gCtx);
+ }
rtprog.execute( ec );
}
finally //ensure cleanup/shutdown
- {
- if(DMLScript.USE_ACCELERATOR && ec != null)
- ec.destroyGPUContext();
+ {
+ if(DMLScript.USE_ACCELERATOR && ec != null && ec.getGPUContext() != null) {
+ GPUContextPool.returnToPool(ec.getGPUContext());
+ }
+
if( dmlconf.getBooleanValue(DMLConfig.CODEGEN) )
SpoofCompiler.cleanupCodeGenerator();
if(ec != null && ec instanceof SparkExecutionContext)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
index dc20108..b37f037 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
@@ -169,7 +169,7 @@ public class MLContextConversionUtil {
matrixObject.acquireModify(matrixBlock);
matrixObject.release();
return matrixObject;
- } catch (CacheException e) {
+ } catch (DMLRuntimeException e) {
throw new MLContextException("Exception converting
MatrixBlock to MatrixObject", e);
}
}
@@ -197,7 +197,7 @@ public class MLContextConversionUtil {
frameObject.acquireModify(frameBlock);
frameObject.release();
return frameObject;
- } catch (CacheException e) {
+ } catch (DMLRuntimeException e) {
throw new MLContextException("Exception converting
FrameBlock to FrameObject", e);
}
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
index ac2b92c..2368c90 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
@@ -48,6 +48,7 @@ import org.apache.sysml.runtime.controlprogram.Program;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
import org.apache.sysml.utils.Explain;
import org.apache.sysml.utils.Explain.ExplainCounts;
import org.apache.sysml.utils.Explain.ExplainType;
@@ -244,6 +245,18 @@ public class ScriptExecutor {
if (symbolTable != null) {
executionContext.setVariables(symbolTable);
}
+ try {
+ if (gpu) {
+ GPUContext gCtx = GPUContextPool.getFromPool();
+ if (gCtx == null)
+ throw new MLContextException("GPU : no
GPUs or no more free GPUs available");
+ executionContext.setGPUContext(gCtx);
+ gCtx.initializeThread();
+ }
+ } catch (DMLRuntimeException e) {
+ throw new MLContextException("GPU : Exception occurred during initialization", e);
+ }
+
}
/**
@@ -324,12 +337,6 @@ public class ScriptExecutor {
script.setScriptExecutor(this);
// Set global variable indicating the script type
DMLScript.SCRIPT_TYPE = script.getScriptType();
- try {
- if (gpu)
- GPUContext.getGPUContext();
- } catch (DMLRuntimeException e) {
- throw new MLContextException("Exception occurred during
initialization of GPU", e);
- }
}
/**
@@ -338,8 +345,10 @@ public class ScriptExecutor {
protected void cleanupAfterExecution() {
restoreInputsInSymbolTable();
try {
- if (gpu)
- executionContext.destroyGPUContext();
+ if (gpu) {
+ GPUContext gCtx =
executionContext.getGPUContext();
+ GPUContextPool.returnToPool(gCtx);
+ }
} catch (DMLRuntimeException e) {
throw new MLContextException("Exception occurred during
cleanup of GPU related resources", e);
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java
b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index a42b1ca..e974a71 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -39,17 +39,17 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.parser.ParseException;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.util.LocalFileUtils;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
-public class DMLConfig
+public class DMLConfig
{
public static final String DEFAULT_SYSTEMML_CONFIG_FILEPATH =
"./SystemML-config.xml";
@@ -76,12 +76,11 @@ public class DMLConfig
public static final String CODEGEN_LITERALS = "codegen.literals";
//1..heuristic, 2..always
public static final String EXTRA_GPU_STATS =
"systemml.stats.extraGPU"; //boolean
public static final String EXTRA_DNN_STATS =
"systemml.stats.extraDNN"; //boolean
+ public static final String MAX_GPUS_PER_PROCESS = "systemml.gpu.perProcessMax"; // int, maximum number of GPUs to use, -1 for all
- // Fraction of available memory to use. The available memory is
computer when the JCudaContext is created
+ // Fraction of available memory to use. The available memory is computed when the GPUContext is created
// to handle the tradeoff on calling cudaMemGetInfo too often.
public static final String GPU_MEMORY_UTILIZATION_FACTOR =
"gpu.memory.util.factor";
- // Invoke cudaMemGetInfo to get available memory information. Useful if
GPU is shared among multiple application.
- public static final String REFRESH_AVAILABLE_MEMORY_EVERY_TIME =
"gpu.memory.refresh";
// supported prefixes for custom map/reduce configurations
public static final String PREFIX_MAPRED = "mapred";
@@ -121,7 +120,7 @@ public class DMLConfig
_defaultVals.put(EXTRA_DNN_STATS, "false" );
_defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" );
- _defaultVals.put(REFRESH_AVAILABLE_MEMORY_EVERY_TIME,
"true" );
+ _defaultVals.put(MAX_GPUS_PER_PROCESS, "-1");
}
public DMLConfig()
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
index d6186f3..f3de422 100644
---
a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
+++
b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
@@ -102,6 +102,8 @@ import
org.apache.sysml.runtime.instructions.cp.DoubleObject;
import org.apache.sysml.runtime.instructions.cp.IntObject;
import org.apache.sysml.runtime.instructions.cp.StringObject;
import org.apache.sysml.runtime.instructions.cp.VariableCPInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
@@ -629,6 +631,11 @@ public class ParForProgramBlock extends ForProgramBlock
switch( _execMode )
{
case LOCAL: //create parworkers as local threads
+ if (DMLScript.USE_ACCELERATOR) {
+ GPUContextPool.returnToPool(ec.getGPUContext());
+ ec.setGPUContext(null);
+ setDegreeOfParallelism(GPUContextPool.getDeviceCount());
+ }
executeLocalParFor(ec, iterVar, from,
to, incr);
break;
@@ -718,6 +725,7 @@ public class ParForProgramBlock extends ForProgramBlock
private void executeLocalParFor( ExecutionContext ec, IntObject
itervar, IntObject from, IntObject to, IntObject incr )
throws DMLRuntimeException, InterruptedException
{
+ LOG.trace("Local Par For (multi-threaded) with degree of
parallelism : " + _numThreads);
/* Step 1) init parallel workers, task queue and threads
* start threads (from now on waiting for tasks)
* Step 2) create tasks
@@ -820,6 +828,17 @@ public class ParForProgramBlock extends ForProgramBlock
_prog.removeFunctionProgramBlock(parts[0], parts[1]);
}
}
+
+ // Free up the GPUContexts used by the threaded parfor workers and
+ // re-acquire a GPUContext for the main thread
+ if (DMLScript.USE_ACCELERATOR) {
+ for (int i = 0; i < _numThreads; i++) {
+ GPUContext gCtx =
workers[i].getExecutionContext().getGPUContext();
+ GPUContextPool.returnToPool(gCtx);
+ }
+ ec.setGPUContext(GPUContextPool.getFromPool());
+ ec.getGPUContext().initializeThread();
+ }
}
finally
{
@@ -1399,6 +1418,12 @@ public class ParForProgramBlock extends ForProgramBlock
//deep copy execution context (including prepare parfor
update-in-place)
ExecutionContext cpEc =
ProgramConverter.createDeepCopyExecutionContext(ec);
+
+ // If GPU mode is enabled, gets a GPUContext from the
pool of GPUContexts
+ // and sets it in the ExecutionContext
+ if (DMLScript.USE_ACCELERATOR){
+ cpEc.setGPUContext(GPUContextPool.getFromPool());
+ }
//prepare basic update-in-place variables (vars dropped
on result merge)
prepareUpdateInPlaceVariables(cpEc, pwID);
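
Taken together, the three hunks above implement the local-parfor GPU hand-off; a condensed sketch (not verbatim code; all names as in the hunks):

    // main thread: give up its GPUContext and size the worker pool by device count
    GPUContextPool.returnToPool(ec.getGPUContext());
    ec.setGPUContext(null);
    setDegreeOfParallelism(GPUContextPool.getDeviceCount());

    // per worker: the deep-copied ExecutionContext receives its own context,
    // which LocalParWorker binds to its thread before running tasks
    cpEc.setGPUContext(GPUContextPool.getFromPool());

    // teardown: return the workers' contexts and re-acquire one for the main thread
    for (int i = 0; i < _numThreads; i++)
        GPUContextPool.returnToPool(workers[i].getExecutionContext().getGPUContext());
    ec.setGPUContext(GPUContextPool.getFromPool());
    ec.getGPUContext().initializeThread();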
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 054b333..904eb87 100644
---
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -22,6 +22,8 @@ package org.apache.sysml.runtime.controlprogram.caching;
import java.io.File;
import java.io.IOException;
import java.lang.ref.SoftReference;
+import java.util.HashMap;
+import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.lang.mutable.MutableBoolean;
@@ -34,9 +36,10 @@ import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
import org.apache.sysml.runtime.instructions.cp.Data;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysml.runtime.instructions.spark.data.BroadcastObject;
import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
@@ -50,6 +53,7 @@ import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.util.LocalFileUtils;
import org.apache.sysml.runtime.util.MapReduceTool;
+
/**
* Each object of this class is a cache envelope for some large piece of data
* called "cache block". For example, the body of a matrix can be the cache
block.
@@ -187,7 +191,7 @@ public abstract class CacheableData<T extends CacheBlock>
extends Data
//for lazily evaluated RDDs, and (2) as abstraction for environments
that do not necessarily have spark libraries available
private RDDObject _rddHandle = null; //RDD handle
private BroadcastObject<T> _bcHandle = null; //Broadcast handle
- protected GPUObject _gpuHandle = null;
+ protected HashMap<GPUContext, GPUObject> _gpuObjects = null; //Per
GPUContext object allocated on GPU
/**
* Basic constructor for any cacheable data.
@@ -200,6 +204,7 @@ public abstract class CacheableData<T extends CacheBlock>
extends Data
_uniqueID = (int)_seq.getNextID();
_cacheStatus = CacheStatus.EMPTY;
_numReadThreads = 0;
+ _gpuObjects = new HashMap<>();
}
/**
@@ -213,7 +218,7 @@ public abstract class CacheableData<T extends CacheBlock>
extends Data
_hdfsFileName = that._hdfsFileName;
_hdfsFileExists = that._hdfsFileExists;
_varName = that._varName;
- _gpuHandle = that._gpuHandle;
+ _gpuObjects = that._gpuObjects;
}
@@ -341,15 +346,16 @@ public abstract class CacheableData<T extends CacheBlock>
extends Data
bc.setBackReference(this);
}
- public GPUObject getGPUObject() {
- return _gpuHandle;
+ public synchronized GPUObject getGPUObject(GPUContext gCtx) {
+ return _gpuObjects.get(gCtx);
}
-
- public void setGPUObject(GPUObject handle) {
- _gpuHandle = handle;
+
+ public synchronized void setGPUObject(GPUContext gCtx, GPUObject gObj)
throws DMLRuntimeException {
+ GPUObject old = _gpuObjects.put(gCtx, gObj);
+ if (old != null)
+ throw new DMLRuntimeException("GPU :
Inconsistent internal state - this CacheableData already has a GPUObject
assigned to the current GPUContext (" + gCtx + ")");
}
-
// *********************************************
// *** ***
// *** HIGH-LEVEL METHODS THAT SPECIFY ***
@@ -384,12 +390,20 @@ public abstract class CacheableData<T extends CacheBlock>
extends Data
if( _data == null )
getCache();
- //call acquireHostRead if gpuHandle is set as well as is
allocated
- if( _gpuHandle != null && _gpuHandle.isAllocated()) {
- _gpuHandle.acquireHostRead();
- if( _data == null )
- getCache();
- }
+ //copy data back from a GPU if any GPUObject holds a dirty copy
+ boolean copiedFromGPU = false;
+ for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) {
+ GPUObject gObj = kv.getValue();
+ if (gObj != null && copiedFromGPU && gObj.isDirty()) {
+ LOG.error("Inconsistent internal state - a copy of this CacheableData was dirty on more than 1 GPU");
+ throw new CacheException("Internal error : inconsistent internal state, a copy of this CacheableData was dirty on more than 1 GPU");
+ } else if (gObj != null){
+ copiedFromGPU = gObj.acquireHostRead();
+ if( _data == null )
+ getCache();
+ }
+ }
+
//read data from HDFS/RDD if required
//(probe data for cache_nowrite / jvm_reuse)
if( isEmpty(true) && _data==null )
@@ -517,10 +531,10 @@ public abstract class CacheableData<T extends CacheBlock>
extends Data
*
* @param newData new data
* @return cacheable data
- * @throws CacheException if CacheException occurs
+ * @throws DMLRuntimeException if error occurs
*/
public synchronized T acquireModify(T newData)
- throws CacheException
+ throws DMLRuntimeException
{
if( LOG.isTraceEnabled() )
LOG.trace("Acquire modify newdata "+getVarName());
@@ -631,10 +645,10 @@ public abstract class CacheableData<T extends CacheBlock>
extends Data
* In-Status: EMPTY, EVICTABLE, EVICTED;
* Out-Status: EMPTY.
*
- * @throws CacheException if CacheException occurs
+ * @throws DMLRuntimeException if error occurs
*/
public synchronized void clearData()
- throws CacheException
+ throws DMLRuntimeException
{
if( LOG.isTraceEnabled() )
LOG.trace("Clear data "+getVarName());
@@ -661,9 +675,14 @@ public abstract class CacheableData<T extends CacheBlock>
extends Data
_rddHandle.setBackReference(null);
if( _bcHandle != null )
_bcHandle.setBackReference(null);
- if( _gpuHandle != null )
- _gpuHandle.clearData();
-
+ if( _gpuObjects != null ) {
+ for (GPUObject gObj : _gpuObjects.values()){
+ if (gObj != null) {
+ gObj.clearData();
+ }
+ }
+ }
+
// change object state EMPTY
setDirty(false);
setEmpty();
@@ -731,10 +750,19 @@ public abstract class CacheableData<T extends CacheBlock>
extends Data
LOG.trace("Exporting " + this.getDebugName() + " to " + fName +
" in format " + outputFormat);
- //TODO remove
- if( getGPUObject() != null && getGPUObject().isAllocated() ) {
- getGPUObject().acquireHostRead();
- }
+ //TODO remove
+ boolean copiedFromGPU = false;
+ for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) {
+ GPUObject gObj = kv.getValue();
+ if (gObj != null && copiedFromGPU && gObj.isDirty()) {
+ LOG.error("Inconsistent internal state - a copy of this CacheableData was dirty on more than 1 GPU");
+ throw new CacheException("Internal error : inconsistent internal state, a copy of this CacheableData was dirty on more than 1 GPU");
+ } else if (gObj != null){
+ copiedFromGPU = gObj.acquireHostRead();
+ if( _data == null )
+ getCache();
+ }
+ }
boolean pWrite = false; // !fName.equals(_hdfsFileName);
//persistent write flag
if ( fName.equals(_hdfsFileName) ) {
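
Since a matrix may now be resident on several devices at once, CacheableData keys its device handles by GPUContext. A sketch of the accessor usage, mirroring the ExecutionContext hunk further below (mo is a MatrixObject, gCtx a GPUContext):

    GPUObject gObj = mo.getGPUObject(gCtx);               // null if this context holds no copy yet
    if (gObj == null)
        mo.setGPUObject(gCtx, gCtx.createGPUObject(mo));  // throws if a handle already exists for gCtx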
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index b6e1830..35b4cd1 100644
---
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -22,6 +22,8 @@ package org.apache.sysml.runtime.controlprogram.context;
import java.util.ArrayList;
import java.util.HashMap;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.debug.DMLFrame;
import org.apache.sysml.debug.DMLProgramCounter;
@@ -52,9 +54,9 @@ import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.util.MapReduceTool;
-public class ExecutionContext
-{
-
+public class ExecutionContext {
+ protected static final Log LOG =
LogFactory.getLog(ExecutionContext.class.getName());
+
//program reference (e.g., function repository)
protected Program _prog = null;
@@ -64,6 +66,8 @@ public class ExecutionContext
//debugging (optional)
protected DebugState _dbState = null;
+ protected GPUContext _gpuContext = null;
+
protected ExecutionContext()
{
//protected constructor to force use of ExecutionContextFactory
@@ -94,13 +98,15 @@ public class ExecutionContext
public void setVariables(LocalVariableMap vars) {
_variables = vars;
}
-
- public void destroyGPUContext() throws DMLRuntimeException {
- if(GPUContext.isGPUContextCreated)
- GPUContext.getGPUContext().destroy();
- }
-
-
+
+ public GPUContext getGPUContext() {
+ return _gpuContext;
+ }
+
+ public void setGPUContext(GPUContext _gpuContext) {
+ this._gpuContext = _gpuContext;
+ }
+
/* -------------------------------------------------------
* Methods to handle variables and associated data objects
* -------------------------------------------------------
@@ -232,7 +238,7 @@ public class ExecutionContext
throws DMLRuntimeException
{
MatrixObject mo = allocateGPUMatrixObject(varName);
- boolean allocated =
mo.getGPUObject().acquireDeviceModifyDense();
+ boolean allocated =
mo.getGPUObject(getGPUContext()).acquireDeviceModifyDense();
mo.getMatrixCharacteristics().setNonZeros(-1);
return new Pair<MatrixObject, Boolean>(mo, allocated);
}
@@ -251,7 +257,7 @@ public class ExecutionContext
{
MatrixObject mo = allocateGPUMatrixObject(varName);
mo.getMatrixCharacteristics().setNonZeros(nnz);
- boolean allocated =
mo.getGPUObject().acquireDeviceModifySparse();
+ boolean allocated =
mo.getGPUObject(getGPUContext()).acquireDeviceModifySparse();
return new Pair<MatrixObject, Boolean>(mo, allocated);
}
@@ -263,8 +269,9 @@ public class ExecutionContext
*/
public MatrixObject allocateGPUMatrixObject(String varName) throws
DMLRuntimeException {
MatrixObject mo = getMatrixObject(varName);
- if( mo.getGPUObject() == null ) {
- mo.setGPUObject(GPUContext.createGPUObject(mo));
+ if( mo.getGPUObject(getGPUContext()) == null ) {
+ GPUObject newGObj = getGPUContext().createGPUObject(mo);
+ mo.setGPUObject(getGPUContext(), newGObj);
}
return mo;
}
@@ -272,20 +279,23 @@ public class ExecutionContext
public Pair<MatrixObject, Boolean>
getMatrixInputForGPUInstruction(String varName)
throws DMLRuntimeException
{
+ GPUContext gCtx = getGPUContext();
boolean copied = false;
MatrixObject mo = getMatrixObject(varName);
if(mo == null) {
throw new DMLRuntimeException("No matrix object
available for variable:" + varName);
}
- if( mo.getGPUObject() == null ) {
- mo.setGPUObject(GPUContext.createGPUObject(mo));
- }
+
boolean acquired = false;
- if( !mo.getGPUObject().isAllocated() ) {
+ if( mo.getGPUObject(gCtx) == null ) {
+ GPUObject newGObj = gCtx.createGPUObject(mo);
+ mo.setGPUObject(gCtx, newGObj);
+ } else if( !mo.getGPUObject(gCtx).isInputAllocated() ) {
mo.acquireRead();
acquired = true;
}
- copied = mo.getGPUObject().acquireDeviceRead();
+
+ copied = mo.getGPUObject(gCtx).acquireDeviceRead();
if(acquired) {
mo.release();
}
@@ -309,7 +319,7 @@ public class ExecutionContext
throws DMLRuntimeException
{
MatrixObject mo = getMatrixObject(varName);
- mo.getGPUObject().releaseInput();
+ mo.getGPUObject(getGPUContext()).releaseInput();
}
/**
@@ -361,10 +371,10 @@ public class ExecutionContext
public void releaseMatrixOutputForGPUInstruction(String varName) throws
DMLRuntimeException {
MatrixObject mo = getMatrixObject(varName);
- if(mo.getGPUObject() == null ||
!mo.getGPUObject().isAllocated()) {
+ if(mo.getGPUObject(getGPUContext()) == null ||
!mo.getGPUObject(getGPUContext()).isAllocated()) {
throw new DMLRuntimeException("No output is allocated
on GPU");
}
- mo.getGPUObject().releaseOutput();
+ mo.getGPUObject(getGPUContext()).releaseOutput();
}
public void setMatrixOutput(String varName, MatrixBlock outputData)
@@ -486,7 +496,7 @@ public class ExecutionContext
return varlist;
}
- public void cleanupMatrixObject(MatrixObject mo)
+ public void cleanupMatrixObject(MatrixObject mo)
throws DMLRuntimeException
{
try
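
The ExecutionContext now routes all GPU buffer management through its own GPUContext. Condensed from the getMatrixInputForGPUInstruction hunk above, the lazy host-to-device path for an input matrix is (sketch; release of the read lock follows if it was acquired):

    GPUContext gCtx = getGPUContext();
    MatrixObject mo = getMatrixObject(varName);
    if( mo.getGPUObject(gCtx) == null ) {
        mo.setGPUObject(gCtx, gCtx.createGPUObject(mo));        // first touch on this device
    } else if( !mo.getGPUObject(gCtx).isInputAllocated() ) {
        mo.acquireRead();                                       // pin host data before the copy
    }
    boolean copied = mo.getGPUObject(gCtx).acquireDeviceRead(); // reports whether a host-to-device copy happened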
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
index e146821..c4684ec 100644
---
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
+++
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.controlprogram.parfor;
import java.util.Collection;
+import org.apache.sysml.api.DMLScript;
import org.apache.sysml.conf.CompilerConfig;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.hops.OptimizerUtils;
@@ -79,6 +80,10 @@ public class LocalParWorker extends ParWorker implements
Runnable
SparkExecutionContext sec = (SparkExecutionContext)_ec;
sec.setThreadLocalSchedulerPool("parforPool"+_workerID);
}
+
+ // Bind this worker's GPUContext to the worker's thread
+ if (DMLScript.USE_ACCELERATOR)
+ _ec.getGPUContext().initializeThread();
//setup compiler config for worker thread
ConfigurationManager.setLocalConfig(_cconf);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java
index e0b30f4..05872b1 100644
---
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java
+++
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java
@@ -49,6 +49,11 @@ public abstract class ParWorker
protected long _workerID = -1;
protected ArrayList<ProgramBlock> _childBlocks = null;
+
+ public ExecutionContext getExecutionContext() {
+ return _ec;
+ }
+
protected ExecutionContext _ec = null;
protected ArrayList<String> _resultVars = null;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
index 5c693d5..1d5a195 100644
---
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
+++
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
@@ -516,7 +516,7 @@ public class OptTreeConverter
LopProperties.ExecType et = (hop.getExecType()!=null) ?
hop.getExecType() :
LopProperties.ExecType.CP;
switch( et ) {
- case CP:
+ case CP:case GPU:
node.setExecType(ExecType.CP); break;
case SPARK:
node.setExecType(ExecType.SPARK); break;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
index c65553b..0958aeb 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
@@ -172,8 +172,12 @@ public class FunctionCallCPInstruction extends
CPInstruction
// Create a symbol table under a new execution context for the
function invocation,
// and copy the function arguments into the created table.
ExecutionContext fn_ec =
ExecutionContextFactory.createContext(false, ec.getProgram());
+ if (DMLScript.USE_ACCELERATOR) {
+ fn_ec.setGPUContext(ec.getGPUContext());
+ ec.setGPUContext(null);
+ fn_ec.getGPUContext().initializeThread();
+ }
fn_ec.setVariables(functionVariables);
-
// execute the function block
try {
fpb._functionName = this._functionName;
@@ -187,7 +191,6 @@ public class FunctionCallCPInstruction extends CPInstruction
String fname =
DMLProgram.constructFunctionKey(_namespace, _functionName);
throw new DMLRuntimeException("error executing function
" + fname, e);
}
-
LocalVariableMap retVars = fn_ec.getVariables();
// cleanup all returned variables w/o binding
@@ -206,6 +209,12 @@ public class FunctionCallCPInstruction extends
CPInstruction
// Unpin the pinned variables
ec.unpinVariables(_boundInputParamNames, pinStatus);
+
+ if (DMLScript.USE_ACCELERATOR) {
+ ec.setGPUContext(fn_ec.getGPUContext());
+ fn_ec.setGPUContext(null);
+ ec.getGPUContext().initializeThread();
+ }
// add the updated binding for each return variable to the
variables in original symbol table
for (int i=0; i< fpb.getOutputParams().size(); i++){
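
Because the CUDA device binding is per-thread and, as the two hunks above suggest, a GPUContext is held by one ExecutionContext at a time, the function call moves the context to the callee and back, re-running initializeThread at each hand-off; condensed:

    fn_ec.setGPUContext(ec.getGPUContext());   // callee takes ownership
    ec.setGPUContext(null);
    fn_ec.getGPUContext().initializeThread();
    // ... function body executes ...
    ec.setGPUContext(fn_ec.getGPUContext());   // caller takes it back
    fn_ec.setGPUContext(null);
    ec.getGPUContext().initializeThread();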
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index 55c8f8d..2531c17 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -96,7 +96,7 @@ public class AggregateBinaryGPUInstruction extends
GPUInstruction
int clen = (int) (_isRightTransposed ? m2.getNumRows() :
m2.getNumColumns());
ec.setMetaData(_output.getName(), rlen, clen);
- LibMatrixCUDA.matmult(ec, getExtendedOpcode(), m1, m2,
_output.getName(), _isLeftTransposed, _isRightTransposed);
+ LibMatrixCUDA.matmult(ec, ec.getGPUContext(),
getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed,
_isRightTransposed);
//release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
@@ -113,6 +113,6 @@ public class AggregateBinaryGPUInstruction extends
GPUInstruction
@SuppressWarnings("unused")
private boolean isSparse(ExecutionContext ec, String var) throws
DMLRuntimeException {
MatrixObject mo = ec.getMatrixObject(var);
- return LibMatrixCUDA.isInSparseFormat(mo);
+ return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(), mo);
}
}
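
This and the following instruction hunks all make the same change: per the commit notes, LibMatrixCUDA functions now take the GPUContext as an explicit argument instead of resolving it statically. The recurring before/after shape:

    // before: context resolved statically inside LibMatrixCUDA
    LibMatrixCUDA.matmult(ec, getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
    // after: the caller passes its ExecutionContext's GPUContext explicitly
    LibMatrixCUDA.matmult(ec, ec.getGPUContext(), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);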
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
index 45db44c..bfe30f9 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.ReduceCol;
import org.apache.sysml.runtime.functionobjects.ReduceRow;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysml.runtime.matrix.operators.Operator;
@@ -93,7 +94,7 @@ public class AggregateUnaryGPUInstruction extends
GPUInstruction {
ec.setMetaData(_output.getName(), rlen, 1);
}
- LibMatrixCUDA.unaryAggregate(ec, getExtendedOpcode(), in1,
_output.getName(), (AggregateUnaryOperator)_optr);
+ LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(), getExtendedOpcode(),
in1, _output.getName(), (AggregateUnaryOperator)_optr);
//release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index a02115d..e806f3b 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -18,19 +18,20 @@
*/
package org.apache.sysml.runtime.instructions.gpu;
+import java.util.ArrayList;
+
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.util.ConvolutionUtils;
import org.apache.sysml.utils.GPUStatistics;
-import java.util.ArrayList;
-
public class ConvolutionGPUInstruction extends GPUInstruction
{
private CPOperand _input1;
@@ -186,9 +187,9 @@ public class ConvolutionGPUInstruction extends
GPUInstruction
ec.setMetaData(_output.getName(), input.getNumRows(),
input.getNumColumns());
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec,
_output.getName());
if(instOpcode.equalsIgnoreCase("bias_add"))
- LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias,
out);
+ LibMatrixCUDA.biasAdd(ec.getGPUContext(),
getExtendedOpcode(), input, bias, out);
else if(instOpcode.equalsIgnoreCase("bias_multiply"))
- LibMatrixCUDA.biasMultiply(getExtendedOpcode(), input,
bias, out);
+ LibMatrixCUDA.biasMultiply(ec.getGPUContext(),
getExtendedOpcode(), input, bias, out);
// release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -202,7 +203,7 @@ public class ConvolutionGPUInstruction extends
GPUInstruction
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec,
_output.getName());
ec.setMetaData(_output.getName(), input.getNumRows(),
input.getNumColumns());
- LibMatrixCUDA.reluBackward(getExtendedOpcode(), input, dout,
out);
+ LibMatrixCUDA.reluBackward(ec.getGPUContext(),
getExtendedOpcode(), input, dout, out);
// release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -253,7 +254,7 @@ public class ConvolutionGPUInstruction extends
GPUInstruction
ec.setMetaData(_output.getName(), N, K * P * Q);
MatrixObject out =
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.conv2d(getExtendedOpcode(), image,
filter, out, N, C, H, W,
+ LibMatrixCUDA.conv2d(ec.getGPUContext(),
getExtendedOpcode(), image, filter, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h,
stride_w, P, Q);
}
else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
@@ -268,7 +269,7 @@ public class ConvolutionGPUInstruction extends
GPUInstruction
ec.setMetaData(_output.getName(), N, K * P * Q);
MatrixObject out =
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.conv2dBiasAdd(getExtendedOpcode(), image,
bias, filter, out, N, C, H, W,
+ LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(),
getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
K, R, S, pad_h, pad_w,
stride_h, stride_w, P, Q);
}
else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter"))
{
@@ -283,7 +284,7 @@ public class ConvolutionGPUInstruction extends
GPUInstruction
ec.setMetaData(_output.getName(), K, C * R * S);
MatrixObject out =
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.conv2dBackwardFilter(getExtendedOpcode(),
image, dout, out, N, C, H, W,
+ LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(),
getExtendedOpcode(), image, dout, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h,
stride_w, P, Q);
// TODO: For now always copy the device data to host
// ec.gpuCtx.copyDeviceToHost(outputBlock);
@@ -300,7 +301,7 @@ public class ConvolutionGPUInstruction extends
GPUInstruction
ec.setMetaData(_output.getName(), N, C * H * W);
MatrixObject out =
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.conv2dBackwardData(getExtendedOpcode(),
filter, dout, out, N, C, H, W,
+ LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(),
getExtendedOpcode(), filter, dout, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h,
stride_w, P, Q);
}
else if (instOpcode.equalsIgnoreCase("maxpooling") ||
instOpcode.equalsIgnoreCase("relu_maxpooling")) {
@@ -313,10 +314,10 @@ public class ConvolutionGPUInstruction extends
GPUInstruction
ec.setMetaData(_output.getName(), N, C * P * Q);
MatrixObject out =
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
if(instOpcode.equalsIgnoreCase("maxpooling"))
- LibMatrixCUDA.maxpooling(getExtendedOpcode(),
image, out, N, C, H, W,
+ LibMatrixCUDA.maxpooling(ec.getGPUContext(),
getExtendedOpcode(), image, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h,
stride_w, P, Q);
else
- LibMatrixCUDA.reluMaxpooling(getExtendedOpcode(), image, out, N, C, H, W,
+ LibMatrixCUDA.reluMaxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
@@ -331,7 +332,7 @@ public class ConvolutionGPUInstruction extends
GPUInstruction
ec.setMetaData(_output.getName(), N, C * H * W);
MatrixObject out =
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.maxpoolingBackward(getExtendedOpcode(),
image, dout, out, N, C, H, W,
+ LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(),
getExtendedOpcode(), image, dout, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h,
stride_w, P, Q);
}
else {
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index ab275e7..0b69b5e 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -19,7 +19,6 @@
package org.apache.sysml.runtime.instructions.gpu;
-import jcuda.runtime.JCuda;
import org.apache.sysml.lops.runtime.RunMRJobs;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
@@ -149,7 +148,7 @@ public abstract class GPUInstruction extends Instruction
public void postprocessInstruction(ExecutionContext ec)
throws DMLRuntimeException
{
- JCuda.cudaDeviceSynchronize();
+ //JCuda.cudaDeviceSynchronize();
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
index 0f2542a..af97141 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
@@ -32,6 +32,7 @@ import
org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.operators.Operator;
import org.apache.sysml.utils.GPUStatistics;
@@ -107,7 +108,7 @@ public class MMTSJGPUInstruction extends GPUInstruction
//execute operations
ec.setMetaData(_output.getName(), rlen, clen);
- LibMatrixCUDA.matmultTSMM(ec, getExtendedOpcode(), mat,
_output.getName(), isLeftTransposed);
+ LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(),
getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
ec.releaseMatrixInputForGPUInstruction(_input.getName());
ec.releaseMatrixOutputForGPUInstruction(_output.getName());
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index 2766909..3fc8a98 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -23,6 +23,7 @@ import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.operators.Operator;
import org.apache.sysml.utils.GPUStatistics;
@@ -43,10 +44,10 @@ public class MatrixBuiltinGPUInstruction extends
BuiltinUnaryGPUInstruction {
ec.setMetaData(_output.getName(), mat.getNumRows(),
mat.getNumColumns());
if(opcode.equals("sel+")) {
- LibMatrixCUDA.relu(ec, getExtendedOpcode(), mat,
_output.getName());
+ LibMatrixCUDA.relu(ec, ec.getGPUContext(),
getExtendedOpcode(), mat, _output.getName());
} else if (opcode.equals("exp")) {
- LibMatrixCUDA.exp(ec, getExtendedOpcode(), mat,
_output.getName());
+ LibMatrixCUDA.exp(ec, ec.getGPUContext(),
getExtendedOpcode(), mat, _output.getName());
}
else {
throw new DMLRuntimeException("Unsupported GPU
operator:" + opcode);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
index 2da1aa6..a1520a9 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
@@ -23,6 +23,7 @@ import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
import org.apache.sysml.runtime.matrix.operators.Operator;
@@ -71,10 +72,10 @@ public class MatrixMatrixArithmeticGPUInstruction extends
ArithmeticBinaryGPUIns
ec.setMetaData(_output.getName(), (int)rlen, (int)clen);
BinaryOperator bop = (BinaryOperator) _optr;
- LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(),
in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
+ LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(),
getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed,
isRightTransposed, bop);
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
- ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+ ec.releaseMatrixOutputForGPUInstruction(_output.getName());
}
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
index 2c833e4..28254c6 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
@@ -26,6 +26,7 @@ import
org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.operators.Operator;
import org.apache.sysml.utils.GPUStatistics;
@@ -103,11 +104,11 @@ public class MatrixMatrixAxpyGPUInstruction extends
ArithmeticBinaryGPUInstructi
" and input2:" + rlen2 + " X " + clen2);
}
- LibMatrixCUDA.axpy(ec, getExtendedOpcode(), in1, in2,
_output.getName(), multiplier*scalar.getDoubleValue());
+ LibMatrixCUDA.axpy(ec, ec.getGPUContext(), getExtendedOpcode(),
in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
- ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+ ec.releaseMatrixOutputForGPUInstruction(_output.getName());
}
private boolean isValidMMOperation(long rlen1, long rlen2, long clen1,
long clen2) {
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
index 54ba32e..424a2c5 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
@@ -25,6 +25,7 @@ import
org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.operators.Operator;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
@@ -81,10 +82,10 @@ public class ReorgGPUInstruction extends GPUInstruction
//execute operation
ec.setMetaData(_output.getName(), rlen, clen);
- LibMatrixCUDA.transpose(ec, getExtendedOpcode(), mat,
_output.getName());
+ LibMatrixCUDA.transpose(ec, ec.getGPUContext(),
getExtendedOpcode(), mat, _output.getName());
//release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input.getName());
- ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+ ec.releaseMatrixOutputForGPUInstruction(_output.getName());
}
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
index 44cc6e2..64cb6c4 100644
---
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
@@ -60,9 +60,9 @@ public class ScalarMatrixArithmeticGPUInstruction extends
ArithmeticBinaryGPUIns
ScalarOperator sc_op = (ScalarOperator) _optr;
sc_op.setConstant(constant.getDoubleValue());
- LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(),
in1, _output.getName(), isTransposed, sc_op);
+ LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(),
getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
ec.releaseMatrixInputForGPUInstruction(mat.getName());
- ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+ ec.releaseMatrixOutputForGPUInstruction(_output.getName());
}
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
new file mode 100644
index 0000000..5e202a9
--- /dev/null
+++
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -0,0 +1,457 @@
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
+import static jcuda.jcusparse.JCusparse.cusparseDcsr2dense;
+import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
+import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
+import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
+import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz;
+import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
+import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+import jcuda.jcusparse.cusparsePointerMode;
+
+/**
+ * Compressed Sparse Row (CSR) format for CUDA.
+ * Generalized matrix multiply, among other operations, is implemented for the CSR format in the cuSparse library.
+ */
+public class CSRPointer {
+
+ private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName());
+
+ /** {@link GPUContext} instance to track the GPU to do work on */
+ private final GPUContext gpuContext;
+
+ private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
+
+ public static cusparseMatDescr matrixDescriptor;
+
+ /** Number of non zeroes */
+ public long nnz;
+
+ /** double array of non zero values */
+ public Pointer val;
+
+ /** integer array of row pointers: start offset of each row, plus one trailing entry for the end of the last row (length = rows + 1) */
+ public Pointer rowPtr;
+
+ /** integer array of nnz values' column indices */
+ public Pointer colInd;
+
+ /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported */
+ public cusparseMatDescr descr;
+
+ /**
+ * Default constructor to help with the factory method {@link #allocateEmpty(GPUContext, long, long)}
+ * @param gCtx a valid {@link GPUContext}
+ */
+ private CSRPointer(GPUContext gCtx) {
+ gpuContext = gCtx;
+ val = new Pointer();
+ rowPtr = new Pointer();
+ colInd = new Pointer();
+ allocateMatDescrPointer();
+ }
+
+ private Pointer allocate(String instName, long size) throws DMLRuntimeException {
+ return getGPUContext().allocate(instName, size);
+ }
+
+ private Pointer allocate(long size) throws DMLRuntimeException {
+ return getGPUContext().allocate(size);
+ }
+
+ private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
+ getGPUContext().cudaFreeHelper(toFree);
+ }
+
+ private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
+ getGPUContext().cudaFreeHelper(toFree, eager);
+ }
+
+ private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
+ getGPUContext().cudaFreeHelper(instName, toFree, eager);
+ }
+
+ private static long getDoubleSizeOf(long numElems) {
+ return numElems * ((long)jcuda.Sizeof.DOUBLE);
+ }
+
+ private static long getIntSizeOf(long numElems) {
+ return numElems * ((long)jcuda.Sizeof.INT);
+ }
+
+ private GPUContext getGPUContext() throws DMLRuntimeException {
+ return gpuContext;
+ }
+
+ public static int toIntExact(long l) throws DMLRuntimeException {
+ if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
+ throw new DMLRuntimeException("Cannot be cast to int:" + l);
+ }
+ return (int) l;
+ }
+
+ /**
+ * @return Singleton default matrix descriptor object
+ * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
+ */
+ public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
+ if (matrixDescriptor == null) {
+ // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
+ matrixDescriptor = new cusparseMatDescr();
+ cusparseCreateMatDescr(matrixDescriptor);
+ cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
+ cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
+ }
+ return matrixDescriptor;
+ }
+
+ /**
+ * Estimate the size of a CSR matrix in GPU memory.
+ * The size of the {@link Pointer} objects themselves is not included in the estimate.
+ *
+ * @param nnz2 number of non zeroes
+ * @param rows number of rows
+ * @return size estimate
+ */
+ public static long estimateSize(long nnz2, long rows) {
+ long sizeofValArray = getDoubleSizeOf(nnz2);
+ long sizeofRowPtrArray = getIntSizeOf(rows + 1);
+ long sizeofColIndArray = getIntSizeOf(nnz2);
+ long sizeofDescr = getIntSizeOf(4);
+ // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as:
+ // typedef struct {
+ // cusparseMatrixType_t MatrixType;
+ // cusparseFillMode_t FillMode;
+ // cusparseDiagType_t DiagType;
+ // cusparseIndexBase_t IndexBase;
+ // } cusparseMatDescr_t;
+ long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
+ return tot;
+ }
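+
+ // Worked example (illustrative, not part of the original source): for nnz2 = 1000
+ // and rows = 100, the estimate is 8*1000 (val) + 4*101 (rowPtr) + 4*1000 (colInd)
+ // + 4*4 (descr) = 8000 + 404 + 4000 + 16 = 12420 bytes.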
+
+ /**
+ * Static method to copy a CSR sparse matrix from Host to Device
+ *
+ * @param dest [input] destination location (on GPU)
+ * @param rows number of rows
+ * @param nnz number of non-zeroes
+ * @param rowPtr integer array of row pointers
+ * @param colInd integer array of column indices
+ * @param values double array of non zero values
+ */
+ public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+ CSRPointer r = dest;
+ long t0 = 0;
+ if (DMLScript.STATISTICS) t0 = System.nanoTime();
+ r.nnz = nnz;
+ cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
+ cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
+ cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(3);
+ }
+
+ /**
+ * Static method to copy a CSR sparse matrix from Device to host
+ *
+ * @param src [input] source location (on GPU)
+ * @param rows [input] number of rows
+ * @param nnz [input] number of non-zeroes
+ * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
+ * @param colInd [output] pre-allocated integer array of column indices of size nnz
+ * @param values [output] pre-allocated double array of values of size nnz
+ */
+ public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+ CSRPointer r = src;
+ long t0 = 0;
+ if (DMLScript.STATISTICS) t0 = System.nanoTime();
+ cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
+ cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
+ cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(3);
+ }
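+
+ // Usage sketch (illustrative; assumes a valid GPUContext gCtx and hypothetical
+ // host-side CSR arrays hostRowPtr/hostColInd/hostVal for a matrix with the given
+ // rows and nnz):
+ // CSRPointer dev = CSRPointer.allocateEmpty(gCtx, nnz, rows);
+ // CSRPointer.copyToDevice(dev, rows, nnz, hostRowPtr, hostColInd, hostVal);
+ // ... run cuSparse operations against dev.val / dev.rowPtr / dev.colInd ...
+ // CSRPointer.copyToHost(dev, rows, nnz, hostRowPtr, hostColInd, hostVal);
+ // dev.deallocate();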
+
+ /**
+ * Estimates the number of non-zero elements resulting from a sparse cusparseDgeam operation
+ * C = a op(A) + b op(B)
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param A Sparse Matrix A on GPU
+ * @param B Sparse Matrix B on GPU
+ * @param m Rows in A
+ * @param n Columns in B
+ * @return CSR (compressed sparse row) pointer
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n)
+ throws DMLRuntimeException {
+ if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+ throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
+ }
+ CSRPointer C = new CSRPointer(gCtx);
+ step1AllocateRowPointers(gCtx, handle, C, m);
+ step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
+ step3AllocateValNInd(gCtx, handle, C);
+ return C;
+ }
+
+ /**
+ * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
+ * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
+ *
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param A Sparse Matrix A on GPU
+ * @param transA 'T' if A is to be transposed, 'N' otherwise
+ * @param B Sparse Matrix B on GPU
+ * @param transB 'T' if B is to be transposed, 'N' otherwise
+ * @param m Rows in A
+ * @param n Columns in B
+ * @param k Columns in A / Rows in B
+ * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k)
+ throws DMLRuntimeException {
+ // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
+ // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
+ CSRPointer C = new CSRPointer(gCtx);
+ step1AllocateRowPointers(gCtx, handle, C, m);
+ step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
+ step3AllocateValNInd(gCtx, handle, C);
+ return C;
+ }
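+
+ // Usage sketch (illustrative): after this allocation the caller completes the
+ // multiply with the matching cuSparse call, e.g.:
+ // CSRPointer C = allocateForMatrixMultiply(gCtx, handle, A, transA, B, transB, m, n, k);
+ // cusparseDcsrgemm(handle, transA, transB, m, n, k,
+ // A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd,
+ // B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd,
+ // C.descr, C.val, C.rowPtr, C.colInd);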
+
+ /**
+ * Check for ultra sparsity
+ *
+ * @param rows number of rows
+ * @param cols number of columns
+ * @return true if ultra sparse
+ */
+ public boolean isUltraSparse(int rows, int cols) {
+ double sp = ((double) nnz / rows / cols);
+ return sp < ULTRA_SPARSITY_TURN_POINT;
+ }
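+
+ // Example (illustrative): a 10000 x 10000 matrix with nnz = 20000 has
+ // sp = 20000 / 1e8 = 0.0002 < 0.0004, so it is ultra-sparse; with
+ // nnz = 200000, sp = 0.002 and it is not.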
+
+// ==============================================================================================
+
+// The following methods estimate the memory needed for sparse matrices that are
+// results of operations on other sparse matrices using the cuSparse Library.
+// The operation is C = op(A) binaryOperation op(B), where C is the output and A & B are the inputs,
+// op = whether to transpose or not,
+// binaryOperation = for cuSparse, +, - and * (matmul) are supported.
+
+// From the cuSPARSE Manual:
+// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
+// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
+// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
+// and the total number of nonzero elements. In the second step, the user gathers nnzC
+// (number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
+// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
+// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
+// to complete matrix C.
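+
+// A sketch of the two-step pattern described above (illustrative; alpha and beta
+// are host Pointers, since the pointer mode is set to CUSPARSE_POINTER_MODE_HOST):
+// int[] nnzC = {-1};
+// cusparseXcsrgeamNnz(handle, m, n, A.descr, nnzA, A.rowPtr, A.colInd,
+// B.descr, nnzB, B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(nnzC));
+// // ... allocate C.val (nnzC[0] doubles) and C.colInd (nnzC[0] ints) ...
+// cusparseDcsrgeam(handle, m, n, alpha, A.descr, nnzA, A.val, A.rowPtr, A.colInd,
+// beta, B.descr, nnzB, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);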
+
+ /**
+ * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
+ * the default that works for DGEMM.
+ */
+ private void allocateMatDescrPointer() {
+ this.descr = getDefaultCuSparseMatrixDescriptor();
+ }
+
+ /**
+ * Factory method to allocate an empty CSR Sparse matrix on the GPU
+ *
+ * @param gCtx a valid {@link GPUContext}
+ * @param nnz2 number of non-zeroes
+ * @param rows number of rows
+ * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on
GPU
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException {
+ LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and
rows=" + rows + ", GPUContext=" + gCtx);
+ assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes
is less than 0 when trying to allocate sparse data on GPU";
+ CSRPointer r = new CSRPointer(gCtx);
+ r.nnz = nnz2;
+ if (nnz2 == 0) {
+ // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object with no memory allocated on the GPU.
+ // with no memory allocated on the GPU.
+ return r;
+ }
+ gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
+ // increment the cudaCount by 1 for the allocation of all 3 arrays
+ r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
+ r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
+ r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+ return r;
+ }
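+
+ // Example (illustrative): an all-zero 100 x 100 matrix is represented as
+ // CSRPointer empty = CSRPointer.allocateEmpty(gCtx, 0, 100);
+ // i.e. nnz == 0 and no device memory allocated, so callers should check
+ // nnz before dereferencing val/rowPtr/colInd.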
+
+ /**
+ * Allocate row pointers of m+1 elements
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param C Output matrix
+ * @param rowsC number of rows in C
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
+ LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
+ cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
+ //cudaDeviceSynchronize;
+ // Do not increment the cudaCount of allocations on GPU
+ C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1));
+ }
+
+ /**
+ * Determine the total number of nonzero elements for the cusparseDgeam operation.
+ * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0))
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param A Sparse Matrix A on GPU
+ * @param B Sparse Matrix B on GPU
+ * @param C Output Sparse Matrix C on GPU
+ * @param m Rows in C
+ * @param n Columns in C
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws DMLRuntimeException {
+ LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
+ int[] CnnzArray = {-1};
+ cusparseXcsrgeamNnz(handle, m, n,
+ A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
+ B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
+ C.descr, C.rowPtr, Pointer.to(CnnzArray));
+ //cudaDeviceSynchronize;
+ if (CnnzArray[0] != -1) {
+ C.nnz = CnnzArray[0];
+ } else {
+ int baseArray[] = {0};
+ cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
+ cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+ C.nnz = CnnzArray[0] - baseArray[0];
+ }
+ }
+
+// ==============================================================================================
+
+ /**
+ * Determine the total number of nonzero elements for the cusparseDgemm operation.
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param A Sparse Matrix A on GPU
+ * @param transA op - whether A is transposed
+ * @param B Sparse Matrix B on GPU
+ * @param transB op - whether B is transposed
+ * @param C Output Sparse Matrix C on GPU
+ * @param m Number of rows of sparse matrix op ( A ) and C
+ * @param n Number of columns of sparse matrix op ( B ) and C
+ * @param k Number of columns/rows of sparse matrix op ( A ) / op ( B )
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException {
+ LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
+ int[] CnnzArray = {-1};
+ if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+ throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
+ }
+ cusparseXcsrgemmNnz(handle, transA, transB, m, n, k,
+ A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
+ B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
+ C.descr, C.rowPtr, Pointer.to(CnnzArray));
+ //cudaDeviceSynchronize;
+ if (CnnzArray[0] != -1) {
+ C.nnz = CnnzArray[0];
+ } else {
+ int baseArray[] = {0};
+ cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
+ cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+ C.nnz = CnnzArray[0] - baseArray[0];
+ }
+ }
+
+ /**
+ * Allocate val and index pointers.
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param C Output sparse matrix on GPU
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
+ LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
+ // Increment cudaCount by one when all three arrays of the CSR sparse matrix are allocated
+ C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
+ C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
+ }
+
+ /**
+ * Copies this CSR matrix on the GPU to a dense column-major matrix
+ * on the GPU. This is a temporary matrix for operations such as
+ * cusparseDcsrmv.
+ * Since the allocated matrix is temporary, bookkeeping is not updated.
+ * The caller is responsible for calling "free" on the returned Pointer object
+ *
+ * @param cusparseHandle a valid {@link cusparseHandle}
+ * @param cublasHandle a valid {@link cublasHandle}
+ * @param rows number of rows in this CSR matrix
+ * @param cols number of columns in this CSR matrix
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ * @return A {@link Pointer} to the allocated dense matrix (in column-major format)
+ */
+ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException {
+ LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext());
+ long size = ((long) rows) * getDoubleSizeOf((long) cols);
+ Pointer A = allocate(size);
+ // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
+ if (val != null && rowPtr != null && colInd != null && nnz > 0) {
+ // Note: cusparseDcsr2dense method cannot handle empty blocks
+ cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
+ //cudaDeviceSynchronize;
+ } else {
+ LOG.warn("in CSRPointer, the values array, row pointers array or column
indices array was null");
+ }
+ return A;
+ }
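+
+ // Usage sketch (illustrative): materialize a temporary dense copy for a kernel
+ // that needs dense input; the temporary is untracked, so free it explicitly:
+ // Pointer dense = csr.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, rows, cols);
+ // ... use dense (rows x cols doubles, column-major) ...
+ // gCtx.cudaFreeHelper(dense);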
+
+ /**
+ * Calls cudaFree lazily on the allocated {@link Pointer} instances
+ */
+ public void deallocate() throws DMLRuntimeException {
+ deallocate(false);
+ }
+
+ /**
+ * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
+ *
+ * @param eager whether to do eager or lazy cudaFrees
+ */
+ public void deallocate(boolean eager) throws DMLRuntimeException {
+ if (nnz > 0) {
+ cudaFreeHelper(val, eager);
+ cudaFreeHelper(rowPtr, eager);
+ cudaFreeHelper(colInd, eager);
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "CSRPointer{" +
+ "nnz=" + nnz +
+ '}';
+ }
+}