[FIX] Fixed nested parfor for GPUs

Additionally:
- Fixed IntelliJ code style accordingly
- Fixed formatting of some GPU-related source files
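For context, the core of this change is moving ExecutionContext from holding a single GPUContext to owning a List<GPUContext>: the main thread reserves all available contexts up front, each parfor worker borrows exactly one by index, and everything is freed at shutdown. The following is a condensed sketch of the new lifecycle, pieced together from the hunks in this diff (ScriptExecutorUtils and ParForProgramBlock); error handling and the surrounding driver code are elided, and the variables ec, cpEc, i, and rtprog come from that surrounding code, so this is illustrative rather than a compilable excerpt:

    import java.util.Arrays;
    import java.util.List;

    // main thread: reserve every available GPU context for this process
    List<GPUContext> gCtxs = GPUContextPool.reserveAllGPUContexts();
    if (gCtxs == null)
        throw new DMLRuntimeException(
            "GPU : Could not create GPUContext, either no GPU or all GPUs currently in use");
    gCtxs.get(0).initializeThread();   // the main thread works on context 0
    ec.setGPUContexts(gCtxs);
    rtprog.execute(ec);

    // parfor: the degree of parallelism follows the number of reserved
    // contexts, and worker i's deep-copied ExecutionContext gets one context
    setDegreeOfParallelism(ec.getNumGPUContexts());
    cpEc.setGPUContexts(Arrays.asList(ec.getGPUContext(i)));

    // shutdown: clear temporary memory on each context, then free them all
    ec.getGPUContexts().forEach(gCtx -> gCtx.clearTemporaryMemory());
    GPUContextPool.freeAllGPUContexts();

This replaces the previous pattern of per-context GPUContextPool.getFromPool()/returnToPool(...) calls scattered through ParForProgramBlock, which is what made nested parfor on GPUs fragile.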
Closes #532 Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f5871756 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f5871756 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f5871756 Branch: refs/heads/master Commit: f587175647a84a3825b174b4d29c0398be17331f Parents: 0bcae49 Author: Nakul Jindal <[email protected]> Authored: Sat Jun 10 12:06:47 2017 -0700 Committer: Nakul Jindal <[email protected]> Committed: Sat Jun 10 12:06:47 2017 -0700 ---------------------------------------------------------------------- dev/code-style/systemml-style-intellij.xml | 37 +- .../apache/sysml/api/ScriptExecutorUtils.java | 17 +- .../controlprogram/ParForProgramBlock.java | 19 +- .../context/ExecutionContext.java | 64 +- .../controlprogram/parfor/LocalParWorker.java | 2 +- .../cp/FunctionCallCPInstruction.java | 12 +- .../gpu/AggregateBinaryGPUInstruction.java | 4 +- .../gpu/AggregateUnaryGPUInstruction.java | 2 +- .../gpu/ConvolutionGPUInstruction.java | 18 +- .../instructions/gpu/MMTSJGPUInstruction.java | 2 +- .../gpu/MatrixBuiltinGPUInstruction.java | 30 +- .../MatrixMatrixArithmeticGPUInstruction.java | 2 +- .../gpu/MatrixMatrixAxpyGPUInstruction.java | 2 +- .../gpu/MatrixMatrixBuiltinGPUInstruction.java | 2 +- .../instructions/gpu/ReorgGPUInstruction.java | 2 +- .../ScalarMatrixArithmeticGPUInstruction.java | 2 +- .../instructions/gpu/context/CSRPointer.java | 922 ++++++------- .../gpu/context/ExecutionConfig.java | 85 +- .../instructions/gpu/context/GPUContext.java | 1257 +++++++++--------- .../gpu/context/GPUContextPool.java | 266 ++-- .../instructions/gpu/context/GPUObject.java | 454 ++++--- .../instructions/gpu/context/JCudaKernels.java | 141 +- .../runtime/matrix/data/LibMatrixCUDA.java | 42 +- .../runtime/matrix/data/LibMatrixDNNHelper.java | 1 + .../org/apache/sysml/test/gpu/GPUTests.java | 47 +- .../sysml/test/gpu/NeuralNetworkOpTests.java | 106 +- 26 files changed, 1917 insertions(+), 1621 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/dev/code-style/systemml-style-intellij.xml ---------------------------------------------------------------------- diff --git a/dev/code-style/systemml-style-intellij.xml b/dev/code-style/systemml-style-intellij.xml index 1ad3209..b4a53b4 100644 --- a/dev/code-style/systemml-style-intellij.xml +++ b/dev/code-style/systemml-style-intellij.xml @@ -1,28 +1,27 @@ <!-- - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. 
See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. --> - <code_scheme name="SystemML Format"> <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" /> <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" /> <option name="IMPORT_LAYOUT_TABLE"> <value> - <package name="" withSubpackages="true" static="false" /> + <package name="" withSubpackages="true" static="true" /> <emptyLine /> <package name="java" withSubpackages="true" static="false" /> <emptyLine /> @@ -32,7 +31,7 @@ <emptyLine /> <package name="com" withSubpackages="true" static="false" /> <emptyLine /> - <package name="" withSubpackages="true" static="true" /> + <package name="" withSubpackages="true" static="false" /> </value> </option> <codeStyleSettings language="JAVA"> http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java index 674a011..2895aa4 100644 --- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java +++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java @@ -19,6 +19,8 @@ package org.apache.sysml.api; +import java.util.List; + import org.apache.sysml.api.mlcontext.ScriptExecutor; import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.hops.codegen.SpoofCompiler; @@ -79,23 +81,22 @@ public class ScriptExecutorUtils { // GPUs GPUContextPool.PER_PROCESS_MAX_GPUS = dmlconf.getIntValue(DMLConfig.MAX_GPUS_PER_PROCESS); Statistics.startRunTimer(); - GPUContext gCtx = null; try { // run execute (w/ exception handling to ensure proper shutdown) if (DMLScript.USE_ACCELERATOR && ec != null) { - gCtx = GPUContextPool.getFromPool(); - if (gCtx == null) { + List<GPUContext> gCtxs = GPUContextPool.reserveAllGPUContexts(); + if (gCtxs == null) { throw new DMLRuntimeException( "GPU : Could not create GPUContext, either no GPU or all GPUs currently in use"); } - gCtx.initializeThread(); - ec.setGPUContext(gCtx); + gCtxs.get(0).initializeThread(); + ec.setGPUContexts(gCtxs); } rtprog.execute(ec); } finally { // ensure cleanup/shutdown - if (DMLScript.USE_ACCELERATOR && ec.getGPUContext() != null) { - ec.getGPUContext().clearTemporaryMemory(); - GPUContextPool.returnToPool(ec.getGPUContext()); + if (DMLScript.USE_ACCELERATOR && !ec.getGPUContexts().isEmpty()) { + ec.getGPUContexts().forEach(gCtx -> gCtx.clearTemporaryMemory()); + GPUContextPool.freeAllGPUContexts(); } if (dmlconf.getBooleanValue(DMLConfig.CODEGEN)) SpoofCompiler.cleanupCodeGenerator(); http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java index c9dcc22..95e28e7 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java @@ -632,9 +632,7 @@ public class ParForProgramBlock extends ForProgramBlock { case LOCAL: //create parworkers as local threads if (DMLScript.USE_ACCELERATOR) { - GPUContextPool.returnToPool(ec.getGPUContext()); - ec.setGPUContext(null); - setDegreeOfParallelism(GPUContextPool.getDeviceCount()); + setDegreeOfParallelism(ec.getNumGPUContexts()); } executeLocalParFor(ec, iterVar, from, to, incr); break; @@ -757,7 +755,7 @@ public class ParForProgramBlock extends ForProgramBlock { //create parallel workers as (lazy) deep copies //including preparation of update-in-place variables - workers[i] = createParallelWorker( _pwIDs[i], queue, ec ); + workers[i] = createParallelWorker( _pwIDs[i], queue, ec, i); threads[i] = new Thread( workers[i] ); threads[i].setPriority(Thread.MAX_PRIORITY); } @@ -833,11 +831,9 @@ public class ParForProgramBlock extends ForProgramBlock // the main thread to use the GPUContext if (DMLScript.USE_ACCELERATOR) { for (int i = 0; i < _numThreads; i++) { - GPUContext gCtx = workers[i].getExecutionContext().getGPUContext(); - GPUContextPool.returnToPool(gCtx); + workers[i].getExecutionContext().setGPUContexts(null); } - ec.setGPUContext(GPUContextPool.getFromPool()); - ec.getGPUContext().initializeThread(); + ec.getGPUContext(0).initializeThread(); } } finally @@ -1386,10 +1382,11 @@ public class ParForProgramBlock extends ForProgramBlock * @param pwID parworker id * @param queue task queue * @param ec execution context + * @param index the index of the worker * @return local parworker * @throws DMLRuntimeException if DMLRuntimeException occurs */ - private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec) + private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec, int index) throws DMLRuntimeException { LocalParWorker pw = null; @@ -1420,9 +1417,9 @@ public class ParForProgramBlock extends ForProgramBlock ExecutionContext cpEc = ProgramConverter.createDeepCopyExecutionContext(ec); // If GPU mode is enabled, gets a GPUContext from the pool of GPUContexts - // and sets it in the ExecutionContext + // and sets it in the ExecutionContext of the parfor if (DMLScript.USE_ACCELERATOR){ - cpEc.setGPUContext(GPUContextPool.getFromPool()); + cpEc.setGPUContexts(Arrays.asList(ec.getGPUContext(index))); } //prepare basic update-in-place variables (vars dropped on result merge) http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java index 735f394..fb179f5 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java @@ -20,7 +20,9 @@ package org.apache.sysml.runtime.controlprogram.context; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; +import 
java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -66,7 +68,10 @@ public class ExecutionContext { //debugging (optional) protected DebugState _dbState = null; - protected GPUContext _gpuContext = null; + /** + * List of {@link GPUContext}s owned by this {@link ExecutionContext} + */ + protected List<GPUContext> _gpuContexts = new ArrayList<>(); protected ExecutionContext() { @@ -99,13 +104,42 @@ public class ExecutionContext { _variables = vars; } - public GPUContext getGPUContext() { - return _gpuContext; + /** + * Get the i-th GPUContext + * @param index index of the GPUContext + * @return a valid GPUContext or null if the indexed GPUContext does not exist. + */ + public GPUContext getGPUContext(int index) { + try { + return _gpuContexts.get(index); + } catch (IndexOutOfBoundsException e){ + return null; + } } - public void setGPUContext(GPUContext _gpuContext) { - this._gpuContext = _gpuContext; - } + /** + * Sets the list of GPUContexts + * @param gpuContexts a collection of GPUContexts + */ + public void setGPUContexts(List<GPUContext> gpuContexts){ + _gpuContexts = gpuContexts; + } + + /** + * Gets the list of GPUContexts + * @return a list of GPUContexts + */ + public List<GPUContext> getGPUContexts() { + return _gpuContexts; + } + + /** + * Gets the number of GPUContexts + * @return number of GPUContexts + */ + public int getNumGPUContexts() { + return _gpuContexts.size(); + } /* ------------------------------------------------------- * Methods to handle variables and associated data objects @@ -238,7 +272,7 @@ public class ExecutionContext { throws DMLRuntimeException { MatrixObject mo = allocateGPUMatrixObject(varName); - boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifyDense(); + boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense(); mo.getMatrixCharacteristics().setNonZeros(-1); return new Pair<MatrixObject, Boolean>(mo, allocated); } @@ -257,7 +291,7 @@ public class ExecutionContext { { MatrixObject mo = allocateGPUMatrixObject(varName); mo.getMatrixCharacteristics().setNonZeros(nnz); - boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifySparse(); + boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse(); return new Pair<MatrixObject, Boolean>(mo, allocated); } @@ -269,12 +303,12 @@ public class ExecutionContext { */ public MatrixObject allocateGPUMatrixObject(String varName) throws DMLRuntimeException { MatrixObject mo = getMatrixObject(varName); - if( mo.getGPUObject(getGPUContext()) == null ) { - GPUObject newGObj = getGPUContext().createGPUObject(mo); + if( mo.getGPUObject(getGPUContext(0)) == null ) { + GPUObject newGObj = getGPUContext(0).createGPUObject(mo); // The lock is added here for an output block // so that any block currently in use is not deallocated by eviction on the GPU newGObj.addLock(); - mo.setGPUObject(getGPUContext(), newGObj); + mo.setGPUObject(getGPUContext(0), newGObj); } return mo; } @@ -282,7 +316,7 @@ public class ExecutionContext { public Pair<MatrixObject, Boolean> getMatrixInputForGPUInstruction(String varName) throws DMLRuntimeException { - GPUContext gCtx = getGPUContext(); + GPUContext gCtx = getGPUContext(0); boolean copied = false; MatrixObject mo = getMatrixObject(varName); if(mo == null) { @@ -322,7 +356,7 @@ public class ExecutionContext { throws DMLRuntimeException { MatrixObject mo = getMatrixObject(varName); - mo.getGPUObject(getGPUContext()).releaseInput(); + 
mo.getGPUObject(getGPUContext(0)).releaseInput(); } /** @@ -374,10 +408,10 @@ public class ExecutionContext { public void releaseMatrixOutputForGPUInstruction(String varName) throws DMLRuntimeException { MatrixObject mo = getMatrixObject(varName); - if(mo.getGPUObject(getGPUContext()) == null || !mo.getGPUObject(getGPUContext()).isAllocated()) { + if(mo.getGPUObject(getGPUContext(0)) == null || !mo.getGPUObject(getGPUContext(0)).isAllocated()) { throw new DMLRuntimeException("No output is allocated on GPU"); } - mo.getGPUObject(getGPUContext()).releaseOutput(); + mo.getGPUObject(getGPUContext(0)).releaseOutput(); } public void setMatrixOutput(String varName, MatrixBlock outputData) http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java index c4684ec..636b1f8 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java @@ -83,7 +83,7 @@ public class LocalParWorker extends ParWorker implements Runnable // Initialize this GPUContext to this thread if (DMLScript.USE_ACCELERATOR) - _ec.getGPUContext().initializeThread(); + _ec.getGPUContext(0).initializeThread(); //setup compiler config for worker thread ConfigurationManager.setLocalConfig(_cconf); http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java index 9cc6bb2..3cd2633 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java @@ -168,9 +168,9 @@ public class FunctionCallCPInstruction extends CPInstruction // and copy the function arguments into the created table. 
ExecutionContext fn_ec = ExecutionContextFactory.createContext(false, ec.getProgram()); if (DMLScript.USE_ACCELERATOR) { - fn_ec.setGPUContext(ec.getGPUContext()); - ec.setGPUContext(null); - fn_ec.getGPUContext().initializeThread(); + fn_ec.setGPUContexts(ec.getGPUContexts()); + ec.setGPUContexts(null); + fn_ec.getGPUContext(0).initializeThread(); } fn_ec.setVariables(functionVariables); // execute the function block @@ -206,9 +206,9 @@ public class FunctionCallCPInstruction extends CPInstruction ec.unpinVariables(_boundInputParamNames, pinStatus); if (DMLScript.USE_ACCELERATOR) { - ec.setGPUContext(fn_ec.getGPUContext()); - fn_ec.setGPUContext(null); - ec.getGPUContext().initializeThread(); + ec.setGPUContexts(fn_ec.getGPUContexts()); + fn_ec.setGPUContexts(null); + ec.getGPUContext(0).initializeThread(); } // add the updated binding for each return variable to the variables in original symbol table http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java index 2531c17..0c0a4b2 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java @@ -96,7 +96,7 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns()); ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.matmult(ec, ec.getGPUContext(), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); + LibMatrixCUDA.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); //release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); @@ -113,6 +113,6 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction @SuppressWarnings("unused") private boolean isSparse(ExecutionContext ec, String var) throws DMLRuntimeException { MatrixObject mo = ec.getMatrixObject(var); - return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(), mo); + return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(0), mo); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java index 8bdd397..5d01820 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java @@ -93,7 +93,7 @@ public class AggregateUnaryGPUInstruction extends GPUInstruction { ec.setMetaData(_output.getName(), rlen, 1); } - LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr); + LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr); //release 
inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java index 9d4cd1f..e5ea097 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -186,9 +186,9 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns()); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); if(instOpcode.equalsIgnoreCase("bias_add")) - LibMatrixCUDA.biasAdd(ec.getGPUContext(), getExtendedOpcode(), input, bias, out); + LibMatrixCUDA.biasAdd(ec.getGPUContext(0), getExtendedOpcode(), input, bias, out); else if(instOpcode.equalsIgnoreCase("bias_multiply")) - LibMatrixCUDA.biasMultiply(ec.getGPUContext(), getExtendedOpcode(), input, bias, out); + LibMatrixCUDA.biasMultiply(ec.getGPUContext(0), getExtendedOpcode(), input, bias, out); // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); @@ -202,7 +202,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns()); - LibMatrixCUDA.reluBackward(ec.getGPUContext(), getExtendedOpcode(), input, dout, out); + LibMatrixCUDA.reluBackward(ec.getGPUContext(0), getExtendedOpcode(), input, dout, out); // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); @@ -253,7 +253,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, K * P * Q); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.conv2d(ec.getGPUContext(), getExtendedOpcode(), image, filter, out, N, C, H, W, + LibMatrixCUDA.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) { @@ -268,7 +268,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, K * P * Q); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(), getExtendedOpcode(), image, bias, filter, out, N, C, H, W, + LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) { @@ -283,7 +283,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), K, C * R * S); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W, + LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), 
getExtendedOpcode(), image, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); // TODO: For now always copy the device data to host // ec.gpuCtx.copyDeviceToHost(outputBlock); @@ -300,7 +300,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, C * H * W); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W, + LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("maxpooling")) { @@ -313,7 +313,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, C * P * Q); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); if(instOpcode.equalsIgnoreCase("maxpooling")) - LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W, + LibMatrixCUDA.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) { @@ -328,7 +328,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, C * H * W); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W, + LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else { http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java index c147a6f..55656f0 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java @@ -107,7 +107,7 @@ public class MMTSJGPUInstruction extends GPUInstruction //execute operations ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName(), isLeftTransposed); + LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName(), isLeftTransposed); ec.releaseMatrixInputForGPUInstruction(_input.getName()); ec.releaseMatrixOutputForGPUInstruction(_output.getName()); http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java index 7b50285..beeacee 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java @@ -44,35 +44,35 @@ public class 
MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction { switch(opcode) { case "sel+": - LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "exp": - LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.exp(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "sqrt": - LibMatrixCUDA.sqrt(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.sqrt(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "log": - LibMatrixCUDA.log(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.log(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "round": - LibMatrixCUDA.round(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.round(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "floor": - LibMatrixCUDA.floor(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.floor(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "ceil": - LibMatrixCUDA.ceil(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.ceil(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "abs": - LibMatrixCUDA.abs(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.abs(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "sin": - LibMatrixCUDA.sin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.sin(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "cos": - LibMatrixCUDA.cos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.cos(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "tan": - LibMatrixCUDA.tan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.tan(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "asin": - LibMatrixCUDA.asin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.asin(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "acos": - LibMatrixCUDA.acos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.acos(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "atan": - LibMatrixCUDA.atan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.atan(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "sign": - LibMatrixCUDA.sign(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCUDA.sign(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; default: throw new DMLRuntimeException("Unsupported GPU operator:" + opcode); } http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java index 9573a1a..a03f9b1 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java @@ -71,7 +71,7 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns ec.setMetaData(_output.getName(), (int)rlen, (int)clen); BinaryOperator bop = (BinaryOperator) _optr; - LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop); + LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop); ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java index 58905d6..e430e29 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java @@ -103,7 +103,7 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi " and input2:" + rlen2 + " X " + clen2); } - LibMatrixCUDA.axpy(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue()); + LibMatrixCUDA.axpy(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue()); ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java index 8936735..e60a3d7 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java @@ -46,7 +46,7 @@ public class MatrixMatrixBuiltinGPUInstruction extends BuiltinBinaryGPUInstructi if(opcode.equals("solve")) { ec.setMetaData(output.getName(), mat1.getNumColumns(), 1); - LibMatrixCUDA.solve(ec, ec.getGPUContext(), getExtendedOpcode(), mat1, mat2, output.getName()); + LibMatrixCUDA.solve(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName()); } else { throw new DMLRuntimeException("Unsupported GPU operator:" + opcode); http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java 
---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java index 53d56a3..bc63d12 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java @@ -81,7 +81,7 @@ public class ReorgGPUInstruction extends GPUInstruction //execute operation ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.transpose(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); + LibMatrixCUDA.transpose(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); //release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input.getName()); http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java index 64cb6c4..ea4665a 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java @@ -60,7 +60,7 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns ScalarOperator sc_op = (ScalarOperator) _optr; sc_op.setConstant(constant.getDoubleValue()); - LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op); + LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op); ec.releaseMatrixInputForGPUInstruction(mat.getName()); ec.releaseMatrixOutputForGPUInstruction(_output.getName()); http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index 0ff9d14..b15dd69 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -52,453 +52,477 @@ import jcuda.jcusparse.cusparsePointerMode; */ public class CSRPointer { - private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName()); - - private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004; - - /** {@link GPUContext} instance to track the GPU to do work on */ - private final GPUContext gpuContext; - - public static cusparseMatDescr matrixDescriptor; - - /** Number of non zeroes */ - public long nnz; - - /** double array of non zero values */ - public Pointer val; - - /** integer array of start of all rows and end of last row + 1 */ - public Pointer rowPtr; - - /** integer array of nnz values' column indices */ - public Pointer colInd; - - /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported */ - public cusparseMatDescr descr; - - - public CSRPointer clone(int rows) 
throws DMLRuntimeException { - CSRPointer me = this; - CSRPointer that = new CSRPointer(me.getGPUContext()); - - that.allocateMatDescrPointer(); - long totalSize = estimateSize(me.nnz, rows); - that.gpuContext.ensureFreeSpace(totalSize); - - that.nnz = me.nnz; - that.val = allocate(that.nnz * Sizeof.DOUBLE); - that.rowPtr = allocate(rows * Sizeof.DOUBLE); - that.colInd = allocate(that.nnz * Sizeof.DOUBLE); - - cudaMemcpy(that.val, me.val, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); - cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); - cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); - - return that; - } - - /** - * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)} - * @param gCtx a valid {@link GPUContext} - */ - private CSRPointer(GPUContext gCtx) { - gpuContext = gCtx; - val = new Pointer(); - rowPtr = new Pointer(); - colInd = new Pointer(); - allocateMatDescrPointer(); - } - -// private Pointer allocate(String instName, long size) throws DMLRuntimeException { -// return getGPUContext().allocate(instName, size); -// } - - private Pointer allocate(long size) throws DMLRuntimeException { - return getGPUContext().allocate(size); - } - -// private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException { -// getGPUContext().cudaFreeHelper(toFree); -// } - - private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException { - getGPUContext().cudaFreeHelper(toFree, eager); - } - -// private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException { -// getGPUContext().cudaFreeHelper(instName, toFree, eager); -// } - - private static long getDoubleSizeOf(long numElems) { - return numElems * ((long)jcuda.Sizeof.DOUBLE); - } - - private static long getIntSizeOf(long numElems) { - return numElems * ((long)jcuda.Sizeof.INT); - } - - private GPUContext getGPUContext() { - return gpuContext; - } - - public static int toIntExact(long l) throws DMLRuntimeException { - if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) { - throw new DMLRuntimeException("Cannot be cast to int:" + l); - } - return (int) l; - } - - /** - * @return Singleton default matrix descriptor object - * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO) - */ - public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() { - if (matrixDescriptor == null) { - // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java - matrixDescriptor = new cusparseMatDescr(); - cusparseCreateMatDescr(matrixDescriptor); - cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO); - } - return matrixDescriptor; - } - - /** - * Estimate the size of a CSR matrix in GPU memory - * Size of pointers is not needed and is not added in - * - * @param nnz2 number of non zeroes - * @param rows number of rows - * @return size estimate - */ - public static long estimateSize(long nnz2, long rows) { - long sizeofValArray = getDoubleSizeOf(nnz2); - long sizeofRowPtrArray = getIntSizeOf(rows + 1); - long sizeofColIndArray = getIntSizeOf(nnz2); - long sizeofDescr = getIntSizeOf(4); - // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as: - // typedef struct { - // cusparseMatrixType_t MatrixType; - // cusparseFillMode_t FillMode; - // cusparseDiagType_t DiagType; - // cusparseIndexBase_t IndexBase; - // } 
cusparseMatDescr_t; - long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr; - return tot; - } - - /** - * Static method to copy a CSR sparse matrix from Host to Device - * - * @param dest [input] destination location (on GPU) - * @param rows number of rows - * @param nnz number of non-zeroes - * @param rowPtr integer array of row pointers - * @param colInd integer array of column indices - * @param values double array of non zero values - */ - public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) { - CSRPointer r = dest; - long t0 = 0; - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - r.nnz = nnz; - cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice); - cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice); - cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice); - if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(3); - } - - /** - * Static method to copy a CSR sparse matrix from Device to host - * - * @param src [input] source location (on GPU) - * @param rows [input] number of rows - * @param nnz [input] number of non-zeroes - * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1) - * @param colInd [output] pre-allocated integer array of column indices of size nnz - * @param values [output] pre-allocated double array of values of size nnz - */ - public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) { - CSRPointer r = src; - long t0 = 0; - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost); - cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost); - cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost); - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(3); - } - - /** - * Estimates the number of non zero elements from the results of a sparse cusparseDgeam operation - * C = a op(A) + b op(B) - * @param gCtx a valid {@link GPUContext} - * @param handle a valid {@link cusparseHandle} - * @param A Sparse Matrix A on GPU - * @param B Sparse Matrix B on GPU - * @param m Rows in A - * @param n Columns in Bs - * @return CSR (compressed sparse row) pointer - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n) - throws DMLRuntimeException { - if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { - throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); - } - CSRPointer C = new CSRPointer(gCtx); - step1AllocateRowPointers(gCtx, handle, C, m); - step2GatherNNZGeam(gCtx, handle, A, B, C, m, n); - step3AllocateValNInd(gCtx, handle, C); - return C; - } - - /** - * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B - * and returns the {@link CSRPointer} to C with the appropriate GPU memory. - * - * @param gCtx ? 
- * @param handle a valid {@link cusparseHandle} - * @param A Sparse Matrix A on GPU - * @param transA 'T' if A is to be transposed, 'N' otherwise - * @param B Sparse Matrix B on GPU - * @param transB 'T' if B is to be transposed, 'N' otherwise - * @param m Rows in A - * @param n Columns in B - * @param k Columns in A / Rows in B - * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k) - throws DMLRuntimeException { - // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at - // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java - CSRPointer C = new CSRPointer(gCtx); - step1AllocateRowPointers(gCtx, handle, C, m); - step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k); - step3AllocateValNInd(gCtx, handle, C); - return C; - } - - /** - * Check for ultra sparsity - * - * @param rows number of rows - * @param cols number of columns - * @return true if ultra sparse - */ - public boolean isUltraSparse(int rows, int cols) { - double sp = ((double) nnz / rows / cols); - return sp < ULTRA_SPARSITY_TURN_POINT; - } - -// ============================================================================================== - -// The following methods estimate the memory needed for sparse matrices that are -// results of operations on other sparse matrices using the cuSparse Library. -// The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs -// op = whether to transpose or not -// binaryOperation = For cuSparse, +, - are *(matmul) are supported - -// From CuSparse Manual, -// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach -// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1 -// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC -// and the total number of nonzero elements. In the second step, the user gathers nnzC -//(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr) -// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of -// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam() -// to complete matrix C. - - /** - * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL, - * the default that works for DGEMM. - */ - private void allocateMatDescrPointer() { - this.descr = getDefaultCuSparseMatrixDescriptor(); - } - - /** - * Factory method to allocate an empty CSR Sparse matrix on the GPU - * - * @param gCtx ? 
- * @param nnz2 number of non-zeroes - * @param rows number of rows - * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException { - LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx); - assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU"; - CSRPointer r = new CSRPointer(gCtx); - r.nnz = nnz2; - if (nnz2 == 0) { - // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object - // with no memory allocated on the GPU. - return r; - } - gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2)); - // increment the cudaCount by 1 for the allocation of all 3 arrays - r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2)); - r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1)); - r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2)); - return r; - } - - /** - * Allocate row pointers of m+1 elements - * @param gCtx a valid {@link GPUContext} - * @param handle a valid {@link cusparseHandle} - * @param C Output matrix - * @param rowsC number of rows in C - * @throws DMLRuntimeException ? - */ - private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException { - LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx); - cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST); - //cudaDeviceSynchronize; - // Do not increment the cudaCount of allocations on GPU - C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1)); - } - - /** - * Determine total number of nonzero element for the cusparseDgeam operation. - * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) - * @param gCtx a valid {@link GPUContext} - * @param handle a valid {@link cusparseHandle} - * @param A Sparse Matrix A on GPU - * @param B Sparse Matrix B on GPU - * @param C Output Sparse Matrix C on GPU - * @param m Rows in C - * @param n Columns in C - * @throws DMLRuntimeException ? - */ - private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws DMLRuntimeException { - LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx); - int[] CnnzArray = {-1}; - cusparseXcsrgeamNnz(handle, m, n, - A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, - B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd, - C.descr, C.rowPtr, Pointer.to(CnnzArray)); - //cudaDeviceSynchronize; - if (CnnzArray[0] != -1) { - C.nnz = CnnzArray[0]; - } else { - int baseArray[] = {0}; - cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost); - cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); - C.nnz = CnnzArray[0] - baseArray[0]; - } - } - -// ============================================================================================== - - /** - * Determine total number of nonzero element for the cusparseDgemm operation. 
- * @param gCtx a valid {@link GPUContext} - * @param handle a valid {@link cusparseHandle} - * @param A Sparse Matrix A on GPU - * @param transA op - whether A is transposed - * @param B Sparse Matrix B on GPU - * @param transB op - whether B is transposed - * @param C Output Sparse Matrix C on GPU - * @param m Number of rows of sparse matrix op ( A ) and C - * @param n Number of columns of sparse matrix op ( B ) and C - * @param k Number of columns/rows of sparse matrix op ( A ) / op ( B ) - * @throws DMLRuntimeException ? - */ - private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException { - LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx); - int[] CnnzArray = {-1}; - if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { - throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); - } - cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, - A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, - B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd, - C.descr, C.rowPtr, Pointer.to(CnnzArray)); - //cudaDeviceSynchronize; - if (CnnzArray[0] != -1) { - C.nnz = CnnzArray[0]; - } else { - int baseArray[] = {0}; - cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost); - cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); - C.nnz = CnnzArray[0] - baseArray[0]; - } - } - - /** - * Allocate val and index pointers. - * @param gCtx a valid {@link GPUContext} - * @param handle a valid {@link cusparseHandle} - * @param C Output sparse matrix on GPU - * @throws DMLRuntimeException ? - */ - private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) throws DMLRuntimeException { - LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx); - // Increment cudaCount by one when all three arrays of CSR sparse array are allocated - C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz)); - C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz)); - } - - /** - * Copies this CSR matrix on the GPU to a dense column-major matrix - * on the GPU. This is a temporary matrix for operations such as - * cusparseDcsrmv. - * Since the allocated matrix is temporary, bookkeeping is not updated. - * The caller is responsible for calling "free" on the returned Pointer object - * - * @param cusparseHandle a valid {@link cusparseHandle} - * @param cublasHandle a valid {@link cublasHandle} - * @param rows number of rows in this CSR matrix - * @param cols number of columns in this CSR matrix - * @throws DMLRuntimeException if DMLRuntimeException occurs - * @return A {@link Pointer} to the allocated dense matrix (in column-major format) - */ - public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException { - LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext()); - long size = ((long) rows) * getDoubleSizeOf((long) cols); - Pointer A = allocate(size); - // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned. 
- if (val != null && rowPtr != null && colInd != null && nnz > 0) { - // Note: cusparseDcsr2dense method cannot handle empty blocks - cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows); - //cudaDeviceSynchronize; - } else { - LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null"); - } - return A; - } - - /** - * Calls cudaFree lazily on the allocated {@link Pointer} instances - * - * @throws DMLRuntimeException ? - */ - public void deallocate() throws DMLRuntimeException { - deallocate(false); - } - - /** - * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances - * - * @param eager whether to do eager or lazy cudaFrees - * @throws DMLRuntimeException ? - */ - public void deallocate(boolean eager) throws DMLRuntimeException { - if (nnz > 0) { - cudaFreeHelper(val, eager); - cudaFreeHelper(rowPtr, eager); - cudaFreeHelper(colInd, eager); - } - } - - @Override - public String toString() { - return "CSRPointer{" + - "nnz=" + nnz + - '}'; - } + private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName()); + + private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004; + public static cusparseMatDescr matrixDescriptor; + /** + * {@link GPUContext} instance to track the GPU to do work on + */ + private final GPUContext gpuContext; + /** + * Number of non zeroes + */ + public long nnz; + + /** + * double array of non zero values + */ + public Pointer val; + + /** + * integer array of start of all rows and end of last row + 1 + */ + public Pointer rowPtr; + + /** + * integer array of nnz values' column indices + */ + public Pointer colInd; + + /** + * descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported + */ + public cusparseMatDescr descr; + + /** + * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)} + * + * @param gCtx a valid {@link GPUContext} + */ + private CSRPointer(GPUContext gCtx) { + gpuContext = gCtx; + val = new Pointer(); + rowPtr = new Pointer(); + colInd = new Pointer(); + allocateMatDescrPointer(); + } + + private static long getDoubleSizeOf(long numElems) { + return numElems * ((long) jcuda.Sizeof.DOUBLE); + } + + // private Pointer allocate(String instName, long size) throws DMLRuntimeException { + // return getGPUContext().allocate(instName, size); + // } + + private static long getIntSizeOf(long numElems) { + return numElems * ((long) jcuda.Sizeof.INT); + } + + // private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException { + // getGPUContext().cudaFreeHelper(toFree); + // } + + public static int toIntExact(long l) throws DMLRuntimeException { + if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) { + throw new DMLRuntimeException("Cannot be cast to int:" + l); + } + return (int) l; + } + + // private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException { + // getGPUContext().cudaFreeHelper(instName, toFree, eager); + // } + + /** + * @return Singleton default matrix descriptor object + * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO) + */ + public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() { + if (matrixDescriptor == null) { + // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java + matrixDescriptor = new cusparseMatDescr(); + cusparseCreateMatDescr(matrixDescriptor); + cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(matrixDescriptor, 
CUSPARSE_INDEX_BASE_ZERO); + } + return matrixDescriptor; + } + + /** + * Estimate the size of a CSR matrix in GPU memory. + * The size of the Pointer objects themselves is not included. + * + * @param nnz2 number of non-zeroes + * @param rows number of rows + * @return size estimate + */ + public static long estimateSize(long nnz2, long rows) { + long sizeofValArray = getDoubleSizeOf(nnz2); + long sizeofRowPtrArray = getIntSizeOf(rows + 1); + long sizeofColIndArray = getIntSizeOf(nnz2); + long sizeofDescr = getIntSizeOf(4); + // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as: + // typedef struct { + // cusparseMatrixType_t MatrixType; + // cusparseFillMode_t FillMode; + // cusparseDiagType_t DiagType; + // cusparseIndexBase_t IndexBase; + // } cusparseMatDescr_t; + long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr; + return tot; + } + + /** + * Static method to copy a CSR sparse matrix from Host to Device + * + * @param dest [input] destination location (on GPU) + * @param rows number of rows + * @param nnz number of non-zeroes + * @param rowPtr integer array of row pointers + * @param colInd integer array of column indices + * @param values double array of non-zero values + */ + public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) { + CSRPointer r = dest; + long t0 = 0; + if (DMLScript.STATISTICS) + t0 = System.nanoTime(); + r.nnz = nnz; + cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice); + cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice); + cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice); + if (DMLScript.STATISTICS) + GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0); + if (DMLScript.STATISTICS) + GPUStatistics.cudaToDevCount.addAndGet(3); + } + + /** + * Static method to copy a CSR sparse matrix from Device to Host + * + * @param src [input] source location (on GPU) + * @param rows [input] number of rows + * @param nnz [input] number of non-zeroes + * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1) + * @param colInd [output] pre-allocated integer array of column indices of size nnz + * @param values [output] pre-allocated double array of values of size nnz + */ + public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) { + CSRPointer r = src; + long t0 = 0; + if (DMLScript.STATISTICS) + t0 = System.nanoTime(); + cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost); + cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost); + cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost); + if (DMLScript.STATISTICS) + GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0); + if (DMLScript.STATISTICS) + GPUStatistics.cudaFromDevCount.addAndGet(3); + } + + /** + * Estimates the number of non-zero elements that result from the sparse cusparseDgeam operation + * C = a*A + b*B and allocates C accordingly. + * + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param A Sparse Matrix A on GPU + * @param B Sparse Matrix B on GPU + * @param m Rows in A + * @param n Columns in B + * @return CSR (compressed sparse row) pointer + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static CSRPointer allocateForDgeam(GPUContext gCtx,
cusparseHandle handle, CSRPointer A, CSRPointer B, int m, + int n) throws DMLRuntimeException { + if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { + throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); + } + CSRPointer C = new CSRPointer(gCtx); + step1AllocateRowPointers(gCtx, handle, C, m); + step2GatherNNZGeam(gCtx, handle, A, B, C, m, n); + step3AllocateValNInd(gCtx, handle, C); + return C; + } + + /** + * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B + * and returns the {@link CSRPointer} to C with the appropriate GPU memory. + * + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param A Sparse Matrix A on GPU + * @param transA cusparseOperation constant indicating whether A is to be transposed + * @param B Sparse Matrix B on GPU + * @param transB cusparseOperation constant indicating whether B is to be transposed + * @param m Rows in A + * @param n Columns in B + * @param k Columns in A / Rows in B + * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, + CSRPointer B, int transB, int m, int n, int k) throws DMLRuntimeException { + // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at + // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java + CSRPointer C = new CSRPointer(gCtx); + step1AllocateRowPointers(gCtx, handle, C, m); + step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k); + step3AllocateValNInd(gCtx, handle, C); + return C; + } + + /** + * Factory method to allocate an empty CSR Sparse matrix on the GPU + * + * @param gCtx a valid {@link GPUContext} + * @param nnz2 number of non-zeroes + * @param rows number of rows + * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException { + LOG.trace( + "GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx); + assert nnz2 + > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU"; + CSRPointer r = new CSRPointer(gCtx); + r.nnz = nnz2; + if (nnz2 == 0) { + // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object + // with no memory allocated on the GPU. + return r; + } + gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2)); + // increment the cudaCount by 1 for the allocation of all 3 arrays + r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2)); + r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1)); + r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2)); + return r; + } + + /** + * Allocate row pointers of m+1 elements + * + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param C Output matrix + * @param rowsC number of rows in C + * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */ + private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) + throws DMLRuntimeException { + LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx); + cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST); + //cudaDeviceSynchronize; + // Do not increment the cudaCount of allocations on GPU + C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1)); + } + + /** + * Determine the total number of nonzero elements for the cusparseDgeam operation. + * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) + * + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param A Sparse Matrix A on GPU + * @param B Sparse Matrix B on GPU + * @param C Output Sparse Matrix C on GPU + * @param m Rows in C + * @param n Columns in C + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, + CSRPointer C, int m, int n) throws DMLRuntimeException { + LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx); + int[] CnnzArray = { -1 }; + cusparseXcsrgeamNnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz), + B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray)); + //cudaDeviceSynchronize; + if (CnnzArray[0] != -1) { + C.nnz = CnnzArray[0]; + } else { + int baseArray[] = { 0 }; + cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), + cudaMemcpyDeviceToHost); + cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); + C.nnz = CnnzArray[0] - baseArray[0]; + } + } + + /** + * Determine the total number of nonzero elements for the cusparseDgemm operation. + * + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param A Sparse Matrix A on GPU + * @param transA op - whether A is transposed + * @param B Sparse Matrix B on GPU + * @param transB op - whether B is transposed + * @param C Output Sparse Matrix C on GPU + * @param m Number of rows of sparse matrix op ( A ) and C + * @param n Number of columns of sparse matrix op ( B ) and C + * @param k Number of columns/rows of sparse matrix op ( A ) / op ( B ) + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, + CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException { + LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx); + int[] CnnzArray = { -1 }; + if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { + throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); + } + cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, + toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray)); + //cudaDeviceSynchronize; + if (CnnzArray[0] != -1) { + C.nnz = CnnzArray[0]; + } else { + int baseArray[] = { 0 }; + cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), + cudaMemcpyDeviceToHost); + cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); + C.nnz = CnnzArray[0] - baseArray[0]; + } + } + + /** + * Allocate val and index pointers.
+ * + * @param gCtx a valid {@link GPUContext} + * @param handle a valid {@link cusparseHandle} + * @param C Output sparse matrix on GPU + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) + throws DMLRuntimeException { + LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx); + // Increment cudaCount by one once all three arrays of the CSR sparse matrix are allocated + C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz)); + C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz)); + } + + // ============================================================================================== + + // The following methods estimate the memory needed for sparse matrices that are + // results of operations on other sparse matrices using the cuSparse Library. + // The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs + // op = whether to transpose or not + // binaryOperation = for cuSparse, +, - and * (matmul) are supported + + // From the cuSPARSE Manual: + // Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach + // to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1 + // elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC + // and the total number of nonzero elements. In the second step, the user gathers nnzC + // (number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr) + // or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of + // nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam() + // to complete matrix C. + + public CSRPointer clone(int rows) throws DMLRuntimeException { + CSRPointer me = this; + CSRPointer that = new CSRPointer(me.getGPUContext()); + + that.allocateMatDescrPointer(); + long totalSize = estimateSize(me.nnz, rows); + that.gpuContext.ensureFreeSpace(totalSize); + + that.nnz = me.nnz; + // val holds nnz doubles; rowPtr holds rows+1 ints; colInd holds nnz ints + that.val = allocate(that.nnz * Sizeof.DOUBLE); + that.rowPtr = allocate((rows + 1L) * Sizeof.INT); + that.colInd = allocate(that.nnz * Sizeof.INT); + + cudaMemcpy(that.val, me.val, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); + cudaMemcpy(that.rowPtr, me.rowPtr, (rows + 1L) * Sizeof.INT, cudaMemcpyDeviceToDevice); + cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.INT, cudaMemcpyDeviceToDevice); + + return that; + } + + private Pointer allocate(long size) throws DMLRuntimeException { + return getGPUContext().allocate(size); + } + + private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException { + getGPUContext().cudaFreeHelper(toFree, eager); + } + + private GPUContext getGPUContext() { + return gpuContext; + } + + // ============================================================================================== + + /** + * Check for ultra sparsity + * + * @param rows number of rows + * @param cols number of columns + * @return true if ultra sparse + */ + public boolean isUltraSparse(int rows, int cols) { + double sp = ((double) nnz / rows / cols); + return sp < ULTRA_SPARSITY_TURN_POINT; + } + + /** + * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL, + * the default that works for DGEMM. + */ + private void allocateMatDescrPointer() { + this.descr = getDefaultCuSparseMatrixDescriptor(); + } + + /** + * Copies this CSR matrix on the GPU to a dense column-major matrix + * on the GPU. This is a temporary matrix for operations such as + * cusparseDcsrmv.
+ * Since the allocated matrix is temporary, bookkeeping is not updated. + * The caller is responsible for calling "free" on the returned Pointer object + * + * @param cusparseHandle a valid {@link cusparseHandle} + * @param cublasHandle a valid {@link cublasHandle} + * @param rows number of rows in this CSR matrix + * @param cols number of columns in this CSR matrix + * @return A {@link Pointer} to the allocated dense matrix (in column-major format) + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, + int cols) throws DMLRuntimeException { + LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + + getGPUContext()); + long size = ((long) rows) * getDoubleSizeOf((long) cols); + Pointer A = allocate(size); + // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned. + if (val != null && rowPtr != null && colInd != null && nnz > 0) { + // Note: cusparseDcsr2dense method cannot handle empty blocks + cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows); + //cudaDeviceSynchronize; + } else { + LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null"); + } + return A; + } + + /** + * Calls cudaFree lazily on the allocated {@link Pointer} instances + * + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public void deallocate() throws DMLRuntimeException { + deallocate(false); + } + + /** + * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances + * + * @param eager whether to do eager or lazy cudaFrees + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public void deallocate(boolean eager) throws DMLRuntimeException { + if (nnz > 0) { + cudaFreeHelper(val, eager); + cudaFreeHelper(rowPtr, eager); + cudaFreeHelper(colInd, eager); + } + } + + @Override + public String toString() { + return "CSRPointer{" + "nnz=" + nnz + '}'; + } } http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java index ce5c5ff..ef000c2 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java @@ -31,25 +31,24 @@ import jcuda.driver.CUstream; * Java Wrapper to specify CUDA execution configuration for launching custom kernels */ public class ExecutionConfig { - public int gridDimX; + public int gridDimX; public int gridDimY = 1; public int gridDimZ = 1; - public int blockDimX; + public int blockDimX; public int blockDimY = 1; public int blockDimZ = 1; public int sharedMemBytes = 0; public CUstream stream = null; - - private static HashMap<Integer, Integer> maxBlockDimForDevice = new HashMap<Integer, Integer>(); /** * Convenience constructor for setting the number of blocks, number of threads and the * shared memory size - * @param gridDimX Number of blocks (for CUDA Kernel) - * @param blockDimX Number of threads per block (for CUDA Kernel) - * @param sharedMemBytes Amount of Shared memory (for CUDA Kernel) + * + * @param gridDimX Number of blocks (for CUDA Kernel) + * @param 
blockDimX Number of threads per block (for CUDA Kernel) + * @param sharedMemBytes Amount of Shared memory (for CUDA Kernel) */ public ExecutionConfig(int gridDimX, int blockDimX, int sharedMemBytes) { this.gridDimX = gridDimX; @@ -58,13 +57,13 @@ public class ExecutionConfig { } /** - * Use this for simple vector operations and use following in the kernel - * <code> - * int index = blockIdx.x * blockDim.x + threadIdx.x + * Use this for simple vector operations and use the following in the kernel + * <code> + * int index = blockIdx.x * blockDim.x + threadIdx.x * </code> - * + * <p> * This tries to schedule as minimum grids as possible. - * + * @param numCells number of cells * @return execution configuration * @throws DMLRuntimeException if DMLRuntimeException occurs @@ -72,19 +71,19 @@ public static ExecutionConfig getConfigForSimpleVectorOperations(int numCells) throws DMLRuntimeException { int deviceNumber = 0; int blockDimX = getMaxBlockDim(deviceNumber); - int gridDimX = (int)Math.ceil((double)numCells / blockDimX); + int gridDimX = (int) Math.ceil((double) numCells / blockDimX); return new ExecutionConfig(gridDimX, blockDimX); } - + /** - * Use this for simple matrix operations and use following in the kernel - * <code> + * Use this for simple matrix operations and use the following in the kernel + * <code> * int ix = blockIdx.x * blockDim.x + threadIdx.x; * int iy = blockIdx.y * blockDim.y + threadIdx.y; * </code> - * + * <p> * This tries to schedule as minimum grids as possible. - * + * @param rlen number of rows * @param clen number of columns * @return execution configuration @@ -94,45 +93,45 @@ int deviceNumber = 0; int maxBlockDim = getMaxBlockDim(deviceNumber); int blockDimX = (int) Math.min(maxBlockDim, rlen); - int gridDimX = (int)Math.ceil((double)rlen / blockDimX); - int blockDimY = (int)Math.min(Math.floor(((double)maxBlockDim)/blockDimX), clen); - int gridDimY = (int)Math.ceil((double)clen / blockDimY); + int gridDimX = (int) Math.ceil((double) rlen / blockDimX); + int blockDimY = (int) Math.min(Math.floor(((double) maxBlockDim) / blockDimX), clen); + int gridDimY = (int) Math.ceil((double) clen / blockDimY); return new ExecutionConfig(gridDimX, gridDimY, blockDimX, blockDimY); } - + public ExecutionConfig(int gridDimX, int blockDimX) { this.gridDimX = gridDimX; this.blockDimX = blockDimX; } - + public ExecutionConfig(int gridDimX, int gridDimY, int blockDimX, int blockDimY) { this.gridDimX = gridDimX; this.gridDimY = gridDimY; this.blockDimX = blockDimX; this.blockDimY = blockDimY; } - - + /** - * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X of the given device - * + * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X of the given device + * * @param deviceNumber device number of the given device * @return The maximum block dimension, in x-direction * @throws DMLRuntimeException if DMLRuntimeException occurs */ - private static int getMaxBlockDim(int deviceNumber) throws DMLRuntimeException { -// return 32; - // TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to chose the block size that maximizes occupancy - Integer ret = maxBlockDimForDevice.get(deviceNumber); - if(ret == null) { - CUdevice device = new CUdevice(); - JCudaKernels.checkResult(jcuda.driver.JCudaDriver.cuDeviceGet(device, deviceNumber)); - int maxBlockDimX[] = {0}; - jcuda.driver.JCudaDriver.cuDeviceGetAttribute(maxBlockDimX, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device); - maxBlockDimForDevice.put(deviceNumber, maxBlockDimX[0]);
- return maxBlockDimX[0]; - } - return ret; - } - - } + private static int getMaxBlockDim(int deviceNumber) throws DMLRuntimeException { + // return 32; + // TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to choose the block size that maximizes occupancy + Integer ret = maxBlockDimForDevice.get(deviceNumber); + if (ret == null) { + CUdevice device = new CUdevice(); + JCudaKernels.checkResult(jcuda.driver.JCudaDriver.cuDeviceGet(device, deviceNumber)); + int maxBlockDimX[] = { 0 }; + jcuda.driver.JCudaDriver + .cuDeviceGetAttribute(maxBlockDimX, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device); + maxBlockDimForDevice.put(deviceNumber, maxBlockDimX[0]); + return maxBlockDimX[0]; + } + return ret; + } + +}
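For reference, the launch-configuration arithmetic in getConfigForSimpleVectorOperations and getConfigForSimpleMatrixOperations reduces to the standalone sketch below. The hard-coded limit of 1024 threads per block and the example shapes are assumptions for illustration only; the real code queries CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X per device and caches the result.

public class LaunchConfigSketch {
	public static void main(String[] args) {
		int maxBlockDim = 1024; // assumed device limit; ExecutionConfig queries the CUDA driver instead

		// 1D case (getConfigForSimpleVectorOperations): one thread per cell, as few blocks as possible.
		int numCells = 1000000;
		int blockDimX = maxBlockDim;
		int gridDimX = (int) Math.ceil((double) numCells / blockDimX);
		System.out.println("1D: gridDimX=" + gridDimX + ", blockDimX=" + blockDimX); // 977 blocks of 1024 threads

		// 2D case (getConfigForSimpleMatrixOperations): rows map to x, columns to y,
		// and blockDimY is chosen so that blockDimX * blockDimY stays within the device limit.
		int rlen = 500, clen = 300;
		int bx = Math.min(maxBlockDim, rlen);                                   // 500
		int gx = (int) Math.ceil((double) rlen / bx);                           // 1
		int by = (int) Math.min(Math.floor(((double) maxBlockDim) / bx), clen); // 2
		int gy = (int) Math.ceil((double) clen / by);                           // 150
		System.out.println("2D: grid=(" + gx + "," + gy + "), block=(" + bx + "," + by + ")");
	}
}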

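Similarly, the CSR memory estimate implemented by CSRPointer.estimateSize reduces to the arithmetic sketched below. The 8-byte double and 4-byte int sizes stand in for jcuda.Sizeof.DOUBLE and jcuda.Sizeof.INT, and the example shape and sparsity (the 0.0004 ultra-sparsity turn point) are illustrative assumptions.

public class CsrSizeSketch {
	// Mirrors CSRPointer.estimateSize: val holds one double per non-zero, rowPtr one int
	// per row plus one, colInd one int per non-zero, plus the four ints of the native
	// cusparseMatDescr struct.
	static long estimateSize(long nnz, long rows) {
		long sizeofValArray = nnz * 8L;           // getDoubleSizeOf(nnz)
		long sizeofRowPtrArray = (rows + 1) * 4L; // getIntSizeOf(rows + 1)
		long sizeofColIndArray = nnz * 4L;        // getIntSizeOf(nnz)
		long sizeofDescr = 4L * 4L;               // getIntSizeOf(4)
		return sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
	}

	public static void main(String[] args) {
		long rows = 10000, cols = 10000;
		long nnz = (long) (rows * cols * 0.0004); // 40,000 non-zeroes at the turn point
		System.out.println(estimateSize(nnz, rows) + " bytes"); // 520020 bytes, ~508 KB
	}
}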