http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/api/DMLScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java index 97597e0..798e74e 100644 --- a/src/main/java/org/apache/sysml/api/DMLScript.java +++ b/src/main/java/org/apache/sysml/api/DMLScript.java @@ -78,6 +78,8 @@ import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter; import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysml.runtime.controlprogram.parfor.util.IDHandler; import org.apache.sysml.runtime.matrix.CleanupMR; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.data.LibMatrixDNN; import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames; import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration; import org.apache.sysml.runtime.util.LocalFileUtils; @@ -85,6 +87,7 @@ import org.apache.sysml.runtime.util.MapReduceTool; import org.apache.sysml.utils.Explain; import org.apache.sysml.utils.Explain.ExplainCounts; import org.apache.sysml.utils.Explain.ExplainType; +import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.Statistics; import org.apache.sysml.yarn.DMLAppMasterUtils; import org.apache.sysml.yarn.DMLYarnClientProxy; @@ -646,7 +649,11 @@ public class DMLScript //double costs = CostEstimationWrapper.getTimeEstimate(rtprog, ExecutionContextFactory.createContext()); //System.out.println("Estimated costs: "+costs); - + + // Whether extra statistics useful for developers and others interested in digging + // into performance problems are recorded and displayed + GPUStatistics.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS); + LibMatrixDNN.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_DNN_STATS); //Step 10: execute runtime program Statistics.startRunTimer();
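Note on the two configuration properties wired up above: DMLConfig.EXTRA_GPU_STATS and DMLConfig.EXTRA_DNN_STATS both default to "false" (see the DMLConfig change below), so the fine-grained timers add no overhead unless explicitly requested. As an illustration only — the exact file layout here is an assumption, not part of this patch — enabling them in a SystemML-config.xml would look roughly like:

    <root>
       <systemml.stats.extraGPU>true</systemml.stats.extraGPU>
       <systemml.stats.extraDNN>true</systemml.stats.extraDNN>
    </root>

The per-opcode timings collected under these flags are then reported alongside the normal statistics output (e.g. when the script is run with the -stats option).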
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/conf/DMLConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index 3d0fb28..a42b1ca 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -74,6 +74,8 @@ public class DMLConfig public static final String CODEGEN = "codegen.enabled"; //boolean public static final String CODEGEN_PLANCACHE = "codegen.plancache"; //boolean public static final String CODEGEN_LITERALS = "codegen.literals"; //1..heuristic, 2..always + public static final String EXTRA_GPU_STATS = "systemml.stats.extraGPU"; //boolean + public static final String EXTRA_DNN_STATS = "systemml.stats.extraDNN"; //boolean // Fraction of available memory to use. The available memory is computer when the JCudaContext is created // to handle the tradeoff on calling cudaMemGetInfo too often. @@ -114,7 +116,10 @@ public class DMLConfig _defaultVals.put(CODEGEN, "false" ); _defaultVals.put(CODEGEN_PLANCACHE, "true" ); _defaultVals.put(CODEGEN_LITERALS, "1" ); - + + _defaultVals.put(EXTRA_GPU_STATS, "false" ); + _defaultVals.put(EXTRA_DNN_STATS, "false" ); + _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); _defaultVals.put(REFRESH_AVAILABLE_MEMORY_EVERY_TIME, "true" ); } @@ -402,6 +407,7 @@ public class DMLConfig YARN_APPMASTER, YARN_APPMASTERMEM, YARN_MAPREDUCEMEM, CP_PARALLEL_MATRIXMULT, CP_PARALLEL_TEXTIO, COMPRESSED_LINALG, CODEGEN, CODEGEN_LITERALS, CODEGEN_PLANCACHE, + EXTRA_GPU_STATS, EXTRA_DNN_STATS }; StringBuilder sb = new StringBuilder(); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java index f14123e..6455add 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java @@ -51,6 +51,7 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData; import org.apache.sysml.runtime.matrix.MetaData; import org.apache.sysml.runtime.matrix.data.FrameBlock; import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.runtime.matrix.data.Pair; import org.apache.sysml.runtime.util.MapReduceTool; import org.apache.sysml.runtime.util.UtilFunctions; @@ -224,14 +225,20 @@ public class ExecutionContext ((MatrixFormatMetaData)oldMetaData).getOutputInfo(), ((MatrixFormatMetaData)oldMetaData).getInputInfo())); } - - public MatrixObject getDenseMatrixOutputForGPUInstruction(String varName) + + /** + * Allocates a dense matrix on the GPU (for output) + * @param varName name of the output matrix (known by this {@link ExecutionContext}) + * @return a pair containing the wrapping {@link MatrixObject} and a boolean indicating whether a cuda memory allocation took place (as opposed to the space already being allocated) + * @throws DMLRuntimeException + */ + public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String varName) throws DMLRuntimeException { MatrixObject mo = allocateGPUMatrixObject(varName); - 
mo.getGPUObject().acquireDeviceModifyDense(); + boolean allocated = mo.getGPUObject().acquireDeviceModifyDense(); mo.getMatrixCharacteristics().setNonZeros(-1); - return mo; + return new Pair<MatrixObject, Boolean>(mo, allocated); } /** @@ -243,13 +250,13 @@ public class ExecutionContext * @return matrix object * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public MatrixObject getSparseMatrixOutputForGPUInstruction(String varName, long nnz) + public Pair<MatrixObject, Boolean> getSparseMatrixOutputForGPUInstruction(String varName, long nnz) throws DMLRuntimeException { MatrixObject mo = allocateGPUMatrixObject(varName); mo.getMatrixCharacteristics().setNonZeros(nnz); - mo.getGPUObject().acquireDeviceModifySparse(); - return mo; + boolean allocated = mo.getGPUObject().acquireDeviceModifySparse(); + return new Pair<MatrixObject, Boolean>(mo, allocated); } /** @@ -266,9 +273,10 @@ public class ExecutionContext return mo; } - public MatrixObject getMatrixInputForGPUInstruction(String varName) + public Pair<MatrixObject, Boolean> getMatrixInputForGPUInstruction(String varName) throws DMLRuntimeException - { + { + boolean copied = false; MatrixObject mo = getMatrixObject(varName); if(mo == null) { throw new DMLRuntimeException("No matrix object available for variable:" + varName); @@ -281,11 +289,11 @@ public class ExecutionContext mo.acquireRead(); acquired = true; } - mo.getGPUObject().acquireDeviceRead(); + copied = mo.getGPUObject().acquireDeviceRead(); if(acquired) { mo.release(); } - return mo; + return new Pair<MatrixObject, Boolean>(mo, copied); } /** @@ -301,7 +309,7 @@ public class ExecutionContext mo.release(); } - public void releaseMatrixInputForGPUInstruction(String varName) + public void releaseMatrixInputForGPUInstruction(String varName) throws DMLRuntimeException { MatrixObject mo = getMatrixObject(varName); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 0be2139..23b5328 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -30,7 +30,7 @@ import org.apache.sysml.runtime.instructions.gpu.MatrixMatrixAxpyGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE; import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.ReorgGPUInstruction; -import org.apache.sysml.runtime.instructions.gpu.context.AggregateUnaryGPUInstruction; +import org.apache.sysml.runtime.instructions.gpu.AggregateUnaryGPUInstruction; public class GPUInstructionParser extends InstructionParser { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java index 7219c6c..55c8f8d 100644 --- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java @@ -32,7 +32,7 @@ import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator; import org.apache.sysml.runtime.matrix.operators.AggregateOperator; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.runtime.matrix.operators.ReorgOperator; -import org.apache.sysml.utils.Statistics; +import org.apache.sysml.utils.GPUStatistics; public class AggregateBinaryGPUInstruction extends GPUInstruction { @@ -74,12 +74,12 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction AggregateBinaryOperator aggbin = new AggregateBinaryOperator(Multiply.getMultiplyFnObject(), agg, 1); return new AggregateBinaryGPUInstruction(aggbin, in1, in2, out, opcode, str, isLeftTransposed, isRightTransposed); } - + @Override public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { - Statistics.incrementNoOfExecutedGPUInst(); + GPUStatistics.incrementNoOfExecutedGPUInst(); AggregateBinaryOperator op = (AggregateBinaryOperator) _optr; if( !(op.binaryFn instanceof Multiply && op.aggOp.increOp.fn instanceof Plus) ) { @@ -87,15 +87,16 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction } //get inputs - MatrixObject m1 = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject m2 = ec.getMatrixInputForGPUInstruction(_input2.getName()); + MatrixObject m1 = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject m2 = getMatrixInputForGPUInstruction(ec, _input2.getName()); + //compute matrix multiplication int rlen = (int) (_isLeftTransposed ? m1.getNumColumns() : m1.getNumRows()); int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns()); ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.matmult(ec, m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); + LibMatrixCUDA.matmult(ec, getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); //release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java new file mode 100644 index 0000000..45db44c --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.runtime.instructions.gpu; + +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.functionobjects.IndexFunction; +import org.apache.sysml.runtime.functionobjects.ReduceCol; +import org.apache.sysml.runtime.functionobjects.ReduceRow; +import org.apache.sysml.runtime.instructions.InstructionUtils; +import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator; +import org.apache.sysml.runtime.matrix.operators.Operator; +import org.apache.sysml.utils.GPUStatistics; + +/** + * Implements aggregate unary instructions for CUDA + */ +public class AggregateUnaryGPUInstruction extends GPUInstruction { + private CPOperand _input1 = null; + private CPOperand _output = null; + + public AggregateUnaryGPUInstruction(Operator op, CPOperand in1, CPOperand out, + String opcode, String istr) + { + super(op, opcode, istr); + _gputype = GPUINSTRUCTION_TYPE.AggregateUnary; + _input1 = in1; + _output = out; + } + + public static AggregateUnaryGPUInstruction parseInstruction(String str ) + throws DMLRuntimeException + { + String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); + String opcode = parts[0]; + CPOperand in1 = new CPOperand(parts[1]); + CPOperand out = new CPOperand(parts[2]); + + // This follows logic similar to AggregateUnaryCPInstruction. + // nrow, ncol & length should either read or refresh metadata + Operator aggop = null; + if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) { + throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!"); + } else { + aggop = InstructionUtils.parseBasicAggregateUnaryOperator(opcode); + } + return new AggregateUnaryGPUInstruction(aggop, in1, out, opcode, str); + } + + @Override + public void processInstruction(ExecutionContext ec) + throws DMLRuntimeException + { + GPUStatistics.incrementNoOfExecutedGPUInst(); + + String opcode = getOpcode(); + + // nrow, ncol & length should either read or refresh metadata + if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) { + throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!"); + } + + //get inputs + MatrixObject in1 = getMatrixInputForGPUInstruction(ec, _input1.getName()); + + int rlen = (int)in1.getNumRows(); + int clen = (int)in1.getNumColumns(); + + IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn; + if (indexFunction instanceof ReduceRow){ // COL{SUM, MAX...} + ec.setMetaData(_output.getName(), 1, clen); + } else if (indexFunction instanceof ReduceCol) { // ROW{SUM, MAX,...} + ec.setMetaData(_output.getName(), rlen, 1); + } + + LibMatrixCUDA.unaryAggregate(ec, getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr); + + //release inputs/outputs + ec.releaseMatrixInputForGPUInstruction(_input1.getName()); + + // If the unary aggregate is a row reduction or a column reduction, it results in a vector + // which needs to be released. 
Otherwise a scalar is produced and it is copied back to the host + // and set in the execution context by invoking setScalarOutput + if (indexFunction instanceof ReduceRow || indexFunction instanceof ReduceCol) { + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); + } + } + +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java index 56e95b7..cb8c729 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -29,7 +29,7 @@ import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.ReorgOperator; import org.apache.sysml.runtime.util.ConvolutionUtils; -import org.apache.sysml.utils.Statistics; +import org.apache.sysml.utils.GPUStatistics; public class ConvolutionGPUInstruction extends GPUInstruction { @@ -140,29 +140,29 @@ public class ConvolutionGPUInstruction extends GPUInstruction throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str); } } - + public void processBiasInstruction(ExecutionContext ec) throws DMLRuntimeException { - Statistics.incrementNoOfExecutedGPUInst(); - MatrixObject input = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject bias = ec.getMatrixInputForGPUInstruction(_input2.getName()); + GPUStatistics.incrementNoOfExecutedGPUInst(); + MatrixObject input = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input2.getName()); ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns()); - MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.biasAdd(input, bias, out); + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); + LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias, out); // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); ec.releaseMatrixOutputForGPUInstruction(_output.getName()); } - + public void processReLUBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException { - Statistics.incrementNoOfExecutedGPUInst(); - MatrixObject input = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject dout = ec.getMatrixInputForGPUInstruction(_input2.getName()); + GPUStatistics.incrementNoOfExecutedGPUInst(); + MatrixObject input = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName()); - MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns()); - LibMatrixCUDA.reluBackward(input, dout, out); + LibMatrixCUDA.reluBackward(getExtendedOpcode(), input, dout, out); // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName());
ec.releaseMatrixInputForGPUInstruction(_input2.getName()); @@ -182,7 +182,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction return; } - Statistics.incrementNoOfExecutedGPUInst(); + GPUStatistics.incrementNoOfExecutedGPUInst(); int pad_h = getScalarInput(ec, _padding, 0); int pad_w = getScalarInput(ec, _padding, 1); @@ -203,8 +203,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w); if (instOpcode.equalsIgnoreCase("conv2d")) { - MatrixObject image = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject filter = ec.getMatrixInputForGPUInstruction(_input2.getName()); + MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject filter = getMatrixInputForGPUInstruction(ec, _input2.getName()); if(image.getNumRows() != N || image.getNumColumns() != C*H*W) throw new DMLRuntimeException("Incorrect dimensions for image in conv2d"); @@ -212,13 +212,13 @@ public class ConvolutionGPUInstruction extends GPUInstruction throw new DMLRuntimeException("Incorrect dimensions for filter in conv2d"); ec.setMetaData(_output.getName(), N, K * P * Q); - MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.conv2d(image, filter, out, N, C, H, W, + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); + LibMatrixCUDA.conv2d(getExtendedOpcode(), image, filter, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) { - MatrixObject image = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject dout = ec.getMatrixInputForGPUInstruction(_input2.getName()); + MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName()); if(image.getNumRows() != N || image.getNumColumns() != C*H*W) throw new DMLRuntimeException("Incorrect dimensions for image in conv2d_backward_filter"); @@ -227,15 +227,15 @@ public class ConvolutionGPUInstruction extends GPUInstruction dout.getNumRows() + " != " + N + " || " + dout.getNumColumns() + " != " + K*P*Q); ec.setMetaData(_output.getName(), K, C * R * S); - MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.conv2dBackwardFilter(image, dout, out, N, C, H, W, + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); + LibMatrixCUDA.conv2dBackwardFilter(getExtendedOpcode(), image, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); // TODO: For now always copy the device data to host // ec.gpuCtx.copyDeviceToHost(outputBlock); } else if (instOpcode.equalsIgnoreCase("conv2d_backward_data")) { - MatrixObject filter = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject dout = ec.getMatrixInputForGPUInstruction(_input2.getName()); + MatrixObject filter = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName()); if(filter.getNumRows() != K || filter.getNumColumns() != C*R*S) throw new DMLRuntimeException("Incorrect dimensions for filter in convolution_backward_data"); @@ -244,25 +244,25 @@ public class ConvolutionGPUInstruction extends GPUInstruction dout.getNumRows() + " != " + N + " || " + dout.getNumColumns() + " != " + K*P*Q); ec.setMetaData(_output.getName(), N, C * H * W); - MatrixObject out = 
ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.conv2dBackwardData(filter, dout, out, N, C, H, W, + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); + LibMatrixCUDA.conv2dBackwardData(getExtendedOpcode(), filter, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("maxpooling")) { - MatrixObject image = ec.getMatrixInputForGPUInstruction(_input1.getName()); + MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); if(image.getNumRows() != N || image.getNumColumns() != C*H*W) throw new DMLRuntimeException("Incorrect dimensions for image in maxpooling: " + image.getNumRows() + " != " + N + " || " + image.getNumColumns() + " != " + C*H*W); ec.setMetaData(_output.getName(), N, C * P * Q); - MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.maxpooling(image, out, N, C, H, W, + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); + LibMatrixCUDA.maxpooling(getExtendedOpcode(), image, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) { - MatrixObject image = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject dout = ec.getMatrixInputForGPUInstruction(_input2.getName()); + MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName()); if(dout.getNumRows() != N || dout.getNumColumns() != C*P*Q) throw new DMLRuntimeException("Incorrect dimensions for dout in maxpooling_backward"); @@ -271,8 +271,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction image.getNumRows() + " != " + N + " || " + image.getNumColumns() + " != " + K*P*Q); ec.setMetaData(_output.getName(), N, C * H * W); - MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.maxpoolingBackward(image, dout, out, N, C, H, W, + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); + LibMatrixCUDA.maxpoolingBackward(getExtendedOpcode(), image, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index 1c91a51..dcb2edc 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -22,15 +22,72 @@ package org.apache.sysml.runtime.instructions.gpu; import jcuda.runtime.JCuda; import org.apache.sysml.lops.runtime.RunMRJobs; import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.GPUInstructionParser; import org.apache.sysml.runtime.instructions.Instruction; +import org.apache.sysml.runtime.matrix.data.Pair; import org.apache.sysml.runtime.matrix.operators.Operator; +import org.apache.sysml.utils.GPUStatistics; +import org.apache.sysml.utils.Statistics; -public abstract 
class GPUInstruction extends Instruction { public enum GPUINSTRUCTION_TYPE { AggregateUnary, AggregateBinary, Convolution, MMTSJ, Reorg, ArithmeticBinary, BuiltinUnary, Builtin }; - + + // Memory/conversions + public final static String MISC_TIMER_HOST_TO_DEVICE = "H2D"; // time spent in bringing data to gpu (from host) + public final static String MISC_TIMER_DEVICE_TO_HOST = "D2H"; // time spent in bringing data from gpu (to host) + public final static String MISC_TIMER_DEVICE_TO_DEVICE = "D2D"; // time spent in copying data from one region on the device to another + public final static String MISC_TIMER_SPARSE_TO_DENSE = "s2d"; // time spent in converting data from sparse to dense + public final static String MISC_TIMER_DENSE_TO_SPARSE = "d2s"; // time spent in converting data from dense to sparse + public final static String MISC_TIMER_CUDA_FREE = "f"; // time spent in calling cudaFree + public final static String MISC_TIMER_ALLOCATE = "a"; // time spent to allocate memory on gpu + public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ao"; // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE) + + // Matmult instructions + public final static String MISC_TIMER_SPARSE_ALLOCATE_LIB = "Msao"; // time spent in allocating sparse matrix output + public final static String MISC_TIMER_DENSE_DOT_LIB = "Mddot"; // time spent in dot product of 2 dense vectors + public final static String MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB = "Mdvdm"; // time spent in matrix mult of dense vector and dense matrix + public final static String MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB = "Mdmdv"; // time spent in matrix mult of dense matrix and dense vector + public final static String MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB = "Mdmdm"; // time spent in matrix mult of dense matrices + public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB = "Msmdv"; // time spent in matrix mult of sparse matrix and dense vector + public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = "Msmsm"; // time spent in matrix mult of sparse matrices + public final static String MISC_TIMER_SYRK_LIB = "Msyrk"; // time spent in symmetric rank-k update + + // Other BLAS instructions + public final static String MISC_TIMER_DAXPY_LIB = "daxpy"; // time spent in daxpy + + // Transpose + public final static String MISC_TIMER_SPARSE_DGEAM_LIB = "sdgeaml"; // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B)) + public final static String MISC_TIMER_DENSE_DGEAM_LIB = "ddgeaml"; // time spent in dense transpose (and other ops of type a*op(A) + b*op(B)) + public final static String MISC_TIMER_TRANSPOSE_LIB = "dtl"; // time spent on dense transpose; this includes allocation of output + + // Custom kernels + public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck"; // time spent in matrix-matrix cellwise operations + public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL = "cask"; // time spent in compareAndSet kernel + public final static String MISC_TIMER_EXP_KERNEL = "expk"; // time spent in the exp kernel + public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = "u2lk"; // time spent in the copy_u2l_dense kernel + public final static String MISC_TIMER_FILL_KERNEL = "fillk"; // time spent in the "fill" kernel + public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL = "msk"; // time spent in the matrix scalar kernel + public final static String 
MISC_TIMER_REDUCE_ALL_KERNEL = "rallk"; // time spent in reduce all kernel + public final static String MISC_TIMER_REDUCE_ROW_KERNEL = "rrowk"; // time spent in reduce row kernel + public final static String MISC_TIMER_REDUCE_COL_KERNEL = "rcolk"; // time spent in reduce column kernel + + // Deep learning operators + public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB = "nnaf"; // time spent in cudnnActivationForward + public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB = "nncf"; // time spent in cudnnConvolutionForward + public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB = "nncbf"; // time spent in cudnnConvolutionBackwardFilter + public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB = "nncbd"; // time spent in cudnnConvolutionBackwardData + public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB = "nnmf"; // time spent in cudnnPoolingForward + public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB = "nnmb"; // time spent in cudnnPoolingBackward + public final static String MISC_TIMER_BIAS_ADD_LIB = "nnba"; // time spent in bias_add cuda kernel + public final static String MISC_TIMER_RELU_BACKWARD_KERNEL = "nnrbk"; // time spent in relu_backward cuda kernel + public final static String MISC_TIMER_RELU_KERNEL = "nnrk"; // time spent in the relu kernel + public final static String MISC_TIMER_CUDNN_INIT = "nni"; // time spent in initializations for cudnn call + public final static String MISC_TIMER_CUDNN_CLEANUP = "nnc"; // time spent in cleanup for cudnn call + + protected GPUINSTRUCTION_TYPE _gputype; protected Operator _optr; @@ -91,4 +148,34 @@ public abstract class GPUInstruction extends Instruction { JCuda.cudaDeviceSynchronize(); } + + /** + * Helper method to get the input block (allocated on the GPU) + * Also records performance information into {@link GPUStatistics} + * @param ec active {@link ExecutionContext} + * @param name name of input matrix (that the {@link ExecutionContext} is aware of) + * @return the matrix object + * @throws DMLRuntimeException if an error occurs + */ + protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, String name) throws DMLRuntimeException { + long t0 = System.nanoTime(); + Pair<MatrixObject, Boolean> mb = ec.getMatrixInputForGPUInstruction(name); + if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t0); + return mb.getKey(); + } + + /** + * Helper method to get the output block (allocated on the GPU) + * Also records performance information into {@link GPUStatistics} + * @param ec active {@link ExecutionContext} + * @param name name of output matrix (that the {@link ExecutionContext} is aware of) + * @return the matrix object + * @throws DMLRuntimeException if an error occurs + */ + protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name) throws DMLRuntimeException { + long t0 = System.nanoTime(); + Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name); + if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0); + return mb.getKey(); + } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java index 4c05833..0f2542a 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java @@ -34,7 +34,7 @@ import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.Operator; -import org.apache.sysml.utils.Statistics; +import org.apache.sysml.utils.GPUStatistics; public class MMTSJGPUInstruction extends GPUInstruction { @@ -95,10 +95,10 @@ public class MMTSJGPUInstruction extends GPUInstruction public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { - Statistics.incrementNoOfExecutedGPUInst(); + GPUStatistics.incrementNoOfExecutedGPUInst(); //get input - MatrixObject mat = ec.getMatrixInputForGPUInstruction(_input.getName()); + MatrixObject mat = getMatrixInputForGPUInstruction(ec, _input.getName()); boolean isLeftTransposed = ( _type == MMTSJType.LEFT); @@ -107,7 +107,7 @@ public class MMTSJGPUInstruction extends GPUInstruction //execute operations ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.matmultTSMM(ec, mat, _output.getName(), isLeftTransposed); + LibMatrixCUDA.matmultTSMM(ec, getExtendedOpcode(), mat, _output.getName(), isLeftTransposed); ec.releaseMatrixInputForGPUInstruction(_input.getName()); ec.releaseMatrixOutputForGPUInstruction(_output.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java index a423cdd..2766909 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java @@ -22,11 +22,10 @@ package org.apache.sysml.runtime.instructions.gpu; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; -import org.apache.sysml.runtime.instructions.gpu.BuiltinUnaryGPUInstruction; import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.Operator; -import org.apache.sysml.utils.Statistics; +import org.apache.sysml.utils.GPUStatistics; public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction { @@ -37,18 +36,17 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction { @Override public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { - Statistics.incrementNoOfExecutedGPUInst(); + GPUStatistics.incrementNoOfExecutedGPUInst(); String opcode = getOpcode(); - MatrixObject mat = ec.getMatrixInputForGPUInstruction(_input.getName()); - + MatrixObject mat = getMatrixInputForGPUInstruction(ec, _input.getName()); ec.setMetaData(_output.getName(), mat.getNumRows(), mat.getNumColumns()); 
if(opcode.equals("sel+")) { - LibMatrixCUDA.relu(ec, mat, _output.getName()); + LibMatrixCUDA.relu(ec, getExtendedOpcode(), mat, _output.getName()); } else if (opcode.equals("exp")) { - LibMatrixCUDA.exp(ec, mat, _output.getName()); + LibMatrixCUDA.exp(ec, getExtendedOpcode(), mat, _output.getName()); } else { throw new DMLRuntimeException("Unsupported GPU operator:" + opcode); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java index b7c0e99..2da1aa6 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java @@ -26,7 +26,7 @@ import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.BinaryOperator; import org.apache.sysml.runtime.matrix.operators.Operator; -import org.apache.sysml.utils.Statistics; +import org.apache.sysml.utils.GPUStatistics; public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUInstruction { @@ -42,10 +42,10 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns @Override public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { - Statistics.incrementNoOfExecutedGPUInst(); + GPUStatistics.incrementNoOfExecutedGPUInst(); - MatrixObject in1 = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject in2 = ec.getMatrixInputForGPUInstruction(_input2.getName()); + MatrixObject in1 = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject in2 = getMatrixInputForGPUInstruction(ec, _input2.getName()); //TODO: make hop level changes for this boolean isLeftTransposed = false; @@ -71,7 +71,7 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns ec.setMetaData(_output.getName(), (int)rlen, (int)clen); BinaryOperator bop = (BinaryOperator) _optr; - LibMatrixCUDA.matrixScalarArithmetic(ec, in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop); + LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop); ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java index cc8ff9f..e6c6b90 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java @@ -28,7 +28,7 @@ import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.instructions.cp.ScalarObject; import 
org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.Operator; -import org.apache.sysml.utils.Statistics; +import org.apache.sysml.utils.GPUStatistics; public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstruction { @@ -85,10 +85,10 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi @Override public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { - Statistics.incrementNoOfExecutedGPUInst(); + GPUStatistics.incrementNoOfExecutedGPUInst(); - MatrixObject in1 = ec.getMatrixInputForGPUInstruction(_input1.getName()); - MatrixObject in2 = ec.getMatrixInputForGPUInstruction(_input2.getName()); + MatrixObject in1 = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject in2 = getMatrixInputForGPUInstruction(ec, _input2.getName()); ScalarObject scalar = ec.getScalarInput(constant.getName(), constant.getValueType(), constant.isLiteral()); long rlen1 = in1.getNumRows(); @@ -103,7 +103,7 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi ec.setMetaData(_output.getName(), (int)rlen1, (int)clen1); - LibMatrixCUDA.axpy(ec, in1, in2, _output.getName(), multiplier*scalar.getDoubleValue()); + LibMatrixCUDA.axpy(ec, getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue()); ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java index 12deb22..54ba32e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java @@ -28,7 +28,7 @@ import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.runtime.matrix.operators.ReorgOperator; -import org.apache.sysml.utils.Statistics; +import org.apache.sysml.utils.GPUStatistics; public class ReorgGPUInstruction extends GPUInstruction @@ -72,16 +72,16 @@ public class ReorgGPUInstruction extends GPUInstruction public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { - Statistics.incrementNoOfExecutedGPUInst(); + GPUStatistics.incrementNoOfExecutedGPUInst(); //acquire input - MatrixObject mat = ec.getMatrixInputForGPUInstruction(_input.getName()); + MatrixObject mat = getMatrixInputForGPUInstruction(ec, _input.getName()); int rlen = (int) mat.getNumColumns(); int clen = (int) mat.getNumRows(); //execute operation ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.transpose(ec, mat, _output.getName()); + LibMatrixCUDA.transpose(ec, getExtendedOpcode(), mat, _output.getName()); //release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java index 78b480e..44cc6e2 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java @@ -28,7 +28,7 @@ import org.apache.sysml.runtime.instructions.cp.ScalarObject; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.runtime.matrix.operators.ScalarOperator; -import org.apache.sysml.utils.Statistics; +import org.apache.sysml.utils.GPUStatistics; public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUInstruction { public ScalarMatrixArithmeticGPUInstruction(Operator op, @@ -44,11 +44,11 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { - Statistics.incrementNoOfExecutedGPUInst(); + GPUStatistics.incrementNoOfExecutedGPUInst(); CPOperand mat = ( _input1.getDataType() == DataType.MATRIX ) ? _input1 : _input2; CPOperand scalar = ( _input1.getDataType() == DataType.MATRIX ) ? _input2 : _input1; - MatrixObject in1 = ec.getMatrixInputForGPUInstruction(mat.getName()); + MatrixObject in1 = getMatrixInputForGPUInstruction(ec, mat.getName()); ScalarObject constant = (ScalarObject) ec.getScalarInput(scalar.getName(), scalar.getValueType(), scalar.isLiteral()); boolean isTransposed = false; @@ -60,7 +60,7 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns ScalarOperator sc_op = (ScalarOperator) _optr; sc_op.setConstant(constant.getDoubleValue()); - LibMatrixCUDA.matrixScalarArithmetic(ec, in1, _output.getName(), isTransposed, sc_op); + LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op); ec.releaseMatrixInputForGPUInstruction(mat.getName()); ec.releaseMatrixOutputForGPUInstruction(_output.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java deleted file mode 100644 index 28efe64..0000000 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysml.runtime.instructions.gpu.context; - -import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; -import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; -import org.apache.sysml.runtime.functionobjects.IndexFunction; -import org.apache.sysml.runtime.functionobjects.ReduceCol; -import org.apache.sysml.runtime.functionobjects.ReduceRow; -import org.apache.sysml.runtime.instructions.InstructionUtils; -import org.apache.sysml.runtime.instructions.cp.CPOperand; -import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; -import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; -import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator; -import org.apache.sysml.runtime.matrix.operators.Operator; -import org.apache.sysml.utils.Statistics; - -/** - * Implements aggregate unary instructions for CUDA - */ -public class AggregateUnaryGPUInstruction extends GPUInstruction { - private CPOperand _input1 = null; - private CPOperand _output = null; - - public AggregateUnaryGPUInstruction(Operator op, CPOperand in1, CPOperand out, - String opcode, String istr) - { - super(op, opcode, istr); - _gputype = GPUINSTRUCTION_TYPE.AggregateUnary; - _input1 = in1; - _output = out; - } - - public static AggregateUnaryGPUInstruction parseInstruction(String str ) - throws DMLRuntimeException - { - String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); - String opcode = parts[0]; - CPOperand in1 = new CPOperand(parts[1]); - CPOperand out = new CPOperand(parts[2]); - - // This follows logic similar to AggregateUnaryCPInstruction. - // nrow, ncol & length should either read or refresh metadata - Operator aggop = null; - if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) { - throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!"); - } else { - aggop = InstructionUtils.parseBasicAggregateUnaryOperator(opcode); - } - return new AggregateUnaryGPUInstruction(aggop, in1, out, opcode, str); - } - - @Override - public void processInstruction(ExecutionContext ec) - throws DMLRuntimeException - { - Statistics.incrementNoOfExecutedGPUInst(); - - String opcode = getOpcode(); - - // nrow, ncol & length should either read or refresh metadata - if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || opcode.equalsIgnoreCase("length")) { - throw new DMLRuntimeException("nrow, ncol & length should not be compiled as GPU instructions!"); - } - - //get inputs - MatrixObject in1 = ec.getMatrixInputForGPUInstruction(_input1.getName()); - - int rlen = (int)in1.getNumRows(); - int clen = (int)in1.getNumColumns(); - - IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn; - if (indexFunction instanceof ReduceRow){ // COL{SUM, MAX...} - ec.setMetaData(_output.getName(), 1, clen); - } else if (indexFunction instanceof ReduceCol) { // ROW{SUM, MAX,...} - ec.setMetaData(_output.getName(), rlen, 1); - } - - LibMatrixCUDA.unaryAggregate(ec, in1, _output.getName(), (AggregateUnaryOperator)_optr); - - //release inputs/outputs - ec.releaseMatrixInputForGPUInstruction(_input1.getName()); - - // If the unary aggregate is a row reduction or a column reduction, it results in a vector - // which needs to be released. 
Otherwise a scala is produced and it is copied back to the host - // and set in the execution context by invoking the setScalarOutput - if (indexFunction instanceof ReduceRow || indexFunction instanceof ReduceCol) { - ec.releaseMatrixOutputForGPUInstruction(_output.getName()); - } - } - -} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 215b38c..ba605bc 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -22,6 +22,7 @@ import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.CacheException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.Statistics; import java.util.Collections; @@ -54,18 +55,25 @@ public abstract class GPUObject } public abstract boolean isAllocated(); - - public abstract void acquireDeviceRead() throws DMLRuntimeException; + + /** + * Signal intent that a matrix block will be read (as input) on the GPU + * @return true if a host memory to device memory transfer happened + * @throws DMLRuntimeException + */ + public abstract boolean acquireDeviceRead() throws DMLRuntimeException; /** * To signal intent that a matrix block will be written to on the GPU + * @return true if memory was allocated on the GPU as a result of this call * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public abstract void acquireDeviceModifyDense() throws DMLRuntimeException; + public abstract boolean acquireDeviceModifyDense() throws DMLRuntimeException; /** * To signal intent that a sparse matrix block will be written to on the GPU + * @return true if memory was allocated on the GPU as a result of this call * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public abstract void acquireDeviceModifySparse() throws DMLRuntimeException; + public abstract boolean acquireDeviceModifySparse() throws DMLRuntimeException; /** * If memory on GPU has been allocated from elsewhere, this method @@ -73,9 +81,14 @@ public abstract class GPUObject * @param numBytes number of bytes */ public abstract void setDeviceModify(long numBytes); - - public abstract void acquireHostRead() throws CacheException; - public abstract void acquireHostModify() throws CacheException; + + /** + * Signal intent that a block needs to be read on the host + * @return true if copied from device to host + * @throws CacheException + */ + public abstract boolean acquireHostRead() throws CacheException; + public abstract void releaseInput() throws CacheException; public abstract void releaseOutput() throws CacheException; @@ -134,7 +147,7 @@ public abstract class GPUObject throw new DMLRuntimeException("There is not enough memory on device for this matrix!"); } - Statistics.cudaEvictionCount.addAndGet(1); + GPUStatistics.cudaEvictionCount.addAndGet(1); synchronized (evictionLock) { Collections.sort(GPUContext.allocatedPointers, new Comparator<GPUObject>() { 
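Taken together, the GPUObject and GPUInstruction changes above form one pattern: the acquireDevice*() methods now report whether a host-to-device copy or a device allocation actually happened, and the protected GPUInstruction helpers turn that signal into per-opcode misc timers. A condensed sketch of an instruction body written against the new API — modeled on the MatrixBuiltinGPUInstruction change above; the _input/_output fields and the relu call are borrowed from that file purely for illustration:

    @Override
    public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
        GPUStatistics.incrementNoOfExecutedGPUInst();
        // Helper times ec.getMatrixInputForGPUInstruction(...) and records
        // MISC_TIMER_HOST_TO_DEVICE only if acquireDeviceRead() reports that a
        // copy to the device actually took place.
        MatrixObject in = getMatrixInputForGPUInstruction(ec, _input.getName());
        ec.setMetaData(_output.getName(), in.getNumRows(), in.getNumColumns());
        // The extended opcode is threaded into LibMatrixCUDA so the library can
        // attribute its own misc timers (kernel launches, cudnn calls, ...) to
        // this instruction.
        LibMatrixCUDA.relu(ec, getExtendedOpcode(), in, _output.getName());
        ec.releaseMatrixInputForGPUInstruction(_input.getName());
        ec.releaseMatrixOutputForGPUInstruction(_output.getName());
    }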
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java index d118429..c58bfe7 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java @@ -26,6 +26,7 @@ import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.Statistics; import jcuda.driver.JCudaDriver; @@ -104,7 +105,7 @@ public class JCudaContext extends GPUContext { LOG.info("Active CUDA device number : " + device[0]); LOG.info("Max Blocks/Threads/SharedMem : " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock); - Statistics.cudaInitTime = System.nanoTime() - start; + GPUStatistics.cudaInitTime = System.nanoTime() - start; start = System.nanoTime(); LibMatrixCUDA.cudnnHandle = new cudnnHandle(); @@ -116,7 +117,7 @@ public class JCudaContext extends GPUContext { // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE); LibMatrixCUDA.cusparseHandle = new cusparseHandle(); cusparseCreate(LibMatrixCUDA.cusparseHandle); - Statistics.cudaLibrariesInitTime = System.nanoTime() - start; + GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start; try { LibMatrixCUDA.kernels = new JCudaKernels(); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java index c5de805..f24c320 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java @@ -32,6 +32,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.HashMap; +import jcuda.runtime.JCuda; import org.apache.sysml.runtime.DMLRuntimeException; import jcuda.CudaException; @@ -165,7 +166,7 @@ public class JCudaKernels { config.gridDimX, config.gridDimY, config.gridDimZ, config.blockDimX, config.blockDimY, config.blockDimZ, config.sharedMemBytes, config.stream, Pointer.to(kernelParams), null)); - // JCuda.cudaDeviceSynchronize(); + JCuda.cudaDeviceSynchronize(); } public static void checkResult(int cuResult) throws DMLRuntimeException { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java index b9c9161..d25d387 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java +++ 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
index b9c9161..d25d387 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -39,12 +39,14 @@ import org.apache.commons.logging.LogFactory; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.CacheException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.SparseBlock; import org.apache.sysml.runtime.matrix.data.SparseBlockCOO; import org.apache.sysml.runtime.matrix.data.SparseBlockCSR; import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR; +import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.Statistics; import jcuda.Pointer;
@@ -173,9 +175,9 @@ public class JCudaObject extends GPUObject { } ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2)); // increment the cudaCount by 1 for the allocation of all 3 arrays - r.val = allocate(getDoubleSizeOf(nnz2), 0); - r.rowPtr = allocate(getIntSizeOf(rows + 1), 0); - r.colInd = allocate(getIntSizeOf(nnz2), 1); + r.val = allocate(null, getDoubleSizeOf(nnz2), 0); + r.rowPtr = allocate(null, getIntSizeOf(rows + 1), 0); + r.colInd = allocate(null, getIntSizeOf(nnz2), 1); return r; }
@@ -195,8 +197,8 @@ public class JCudaObject extends GPUObject { cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice); cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice); cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice); - Statistics.cudaToDevTime.addAndGet(System.nanoTime()-t0); - Statistics.cudaToDevCount.addAndGet(3); + GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-t0); + GPUStatistics.cudaToDevCount.addAndGet(3); } /**
@@ -214,8 +216,8 @@ public class JCudaObject extends GPUObject { cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost); cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost); cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost); - Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-t0); - Statistics.cudaFromDevCount.addAndGet(3); + GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-t0); + GPUStatistics.cudaFromDevCount.addAndGet(3); } // ==============================================================================================
@@ -248,7 +250,7 @@ public class JCudaObject extends GPUObject { cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST); cudaDeviceSynchronize(); // Do not increment the cudaCount of allocations on GPU - C.rowPtr = allocate(getIntSizeOf((long)rowsC+1), 0); + C.rowPtr = allocate(null, getIntSizeOf((long)rowsC+1), 0); } /**
@@ -325,8 +327,8 @@ public class JCudaObject extends GPUObject { */ private static void step3AllocateValNInd(cusparseHandle handle, CSRPointer C) throws DMLRuntimeException { // Increment cudaCount by one when all three arrays of the CSR sparse matrix are allocated - C.val = allocate(getDoubleSizeOf(C.nnz), 0); - C.colInd = allocate(getIntSizeOf(C.nnz), 1); + C.val = allocate(null, getDoubleSizeOf(C.nnz), 0); + C.colInd = allocate(null, getIntSizeOf(C.nnz), 1); } // ==============================================================================================
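For reference, the ensureFreeSpace/allocate arithmetic above reserves three buffers per CSR matrix: nnz doubles for values, rows+1 ints for row pointers, and nnz ints for column indices. A small sketch of that size calculation, assuming 8-byte doubles and 4-byte ints as getDoubleSizeOf/getIntSizeOf imply:

    class CsrSizeSketch {
        // Total device bytes for a CSR representation with the given
        // row count and number of non-zeros.
        static long csrBytes(long rows, long nnz) {
            long val    = 8L * nnz;        // getDoubleSizeOf(nnz)
            long rowPtr = 4L * (rows + 1); // getIntSizeOf(rows + 1)
            long colInd = 4L * nnz;        // getIntSizeOf(nnz)
            return val + rowPtr + colInd;
        }
    }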
@@ -456,12 +458,13 @@ public class JCudaObject extends GPUObject { * Allocates temporary space on the device. * Does not update bookkeeping. * The caller is responsible for freeing it after use. + * @param instructionName name of instruction for which to record per-instruction performance statistics, null if no per-instruction statistics should be recorded * @param size Size of data (in bytes) to allocate * @param statsCount amount to increment the cudaAllocCount by * @return jcuda Pointer * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static Pointer allocate(long size, int statsCount) throws DMLRuntimeException{ + public static Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException{ synchronized (GPUContext.syncObj) { Pointer A = new Pointer(); ensureFreeSpace(size);
@@ -469,23 +472,37 @@ public class JCudaObject extends GPUObject { cudaMalloc(A, size); // Set all elements to 0 since newly allocated space will contain garbage cudaMemset(A, 0, size); - Statistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0); - Statistics.cudaAllocCount.getAndAdd(statsCount); + GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0); + GPUStatistics.cudaAllocCount.getAndAdd(statsCount); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0); + return A; } } /** - * Convenience method for {@link #allocate(long, int)}, defaults statsCount to 1. + * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. * @param size size of data (in bytes) to allocate * @return jcuda pointer * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static Pointer allocate(long size) throws DMLRuntimeException { - return allocate(size, 1); + return allocate(null, size, 1); } /** + * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. + * @param instructionName name of instruction for which to record per-instruction performance statistics, null if no per-instruction statistics should be recorded + * @param size size of data (in bytes) to allocate + * @return jcuda pointer + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static Pointer allocate(String instructionName, long size) throws DMLRuntimeException { + return allocate(instructionName, size, 1); + }
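The three allocate overloads compose as follows: per-instruction timing is recorded only when an instruction name is passed and GPUStatistics.DISPLAY_STATISTICS is enabled, while statsCount controls how much cudaAllocCount is incremented. A usage sketch; "gpu_example" is a hypothetical opcode and the calls assume an initialized GPU context:

    import jcuda.Pointer;
    import org.apache.sysml.runtime.DMLRuntimeException;
    import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;

    class AllocateSketch {
        static void demo() throws DMLRuntimeException {
            Pointer a = JCudaObject.allocate(1024);                   // statsCount defaults to 1, no per-instruction timer
            Pointer b = JCudaObject.allocate("gpu_example", 1024);    // additionally records MISC_TIMER_ALLOCATE for "gpu_example"
            Pointer c = JCudaObject.allocate("gpu_example", 1024, 0); // timed, but not counted in cudaAllocCount
            JCudaObject.cudaFreeHelper(a); // asynchronous frees, as defined further below
            JCudaObject.cudaFreeHelper(b);
            JCudaObject.cudaFreeHelper(c);
        }
    }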
+ + + /** * Allocates a sparse and empty {@link JCudaObject} * This is the result of operations that are both non zero matrices. *
@@ -528,22 +545,27 @@ public class JCudaObject extends GPUObject { } @Override - public synchronized void acquireDeviceRead() throws DMLRuntimeException { + public synchronized boolean acquireDeviceRead() throws DMLRuntimeException { + boolean transferred = false; if(!isAllocated()) { copyFromHostToDevice(); + transferred = true; } else { numLocks.addAndGet(1); } if(!isAllocated()) throw new DMLRuntimeException("Expected device data to be allocated"); + return transferred; } @Override - public synchronized void acquireDeviceModifyDense() throws DMLRuntimeException { + public synchronized boolean acquireDeviceModifyDense() throws DMLRuntimeException { + boolean allocated = false; if(!isAllocated()) { mat.setDirty(true); // Dense block, size = numRows * numCols allocateDenseMatrixOnDevice(); + allocated = true; synchronized(evictionLock) { GPUContext.allocatedPointers.add(this); }
@@ -551,14 +573,17 @@ public class JCudaObject extends GPUObject { } isDeviceCopyModified = true; if(!isAllocated()) throw new DMLRuntimeException("Expected device data to be allocated"); + return allocated; } @Override - public synchronized void acquireDeviceModifySparse() throws DMLRuntimeException { + public synchronized boolean acquireDeviceModifySparse() throws DMLRuntimeException { + boolean allocated = false; isInSparseFormat = true; if(!isAllocated()) { mat.setDirty(true); allocateSparseMatrixOnDevice(); + allocated = true; synchronized(evictionLock) { GPUContext.allocatedPointers.add(this); }
@@ -566,14 +591,17 @@ public class JCudaObject extends GPUObject { } isDeviceCopyModified = true; if(!isAllocated()) throw new DMLRuntimeException("Expected device data to be allocated"); + return allocated; } @Override - public synchronized void acquireHostRead() throws CacheException { + public synchronized boolean acquireHostRead() throws CacheException { + boolean copied = false; if(isAllocated()) { try { if(isDeviceCopyModified) { copyFromDeviceToHost(); + copied = true; } } catch (DMLRuntimeException e) { throw new CacheException(e);
@@ -582,21 +610,7 @@ public class JCudaObject extends GPUObject { else { throw new CacheException("Cannot perform acquireHostRead as the GPU data is not allocated:" + mat.getVarName()); } - } - - @Override - public synchronized void acquireHostModify() throws CacheException { - if(isAllocated()) { - try { - if(isDeviceCopyModified) { - throw new DMLRuntimeException("Potential overwrite of GPU data"); - // copyFromDeviceToHost(); - } - clearData(); - } catch (DMLRuntimeException e) { - throw new CacheException(e); - } - } + return copied; } /**
@@ -659,8 +673,8 @@ public class JCudaObject extends GPUObject { JCudaContext.deviceMemBytes.addAndGet(-numBytes); } - Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start); - Statistics.cudaAllocCount.addAndGet(1); + GPUStatistics.cudaAllocTime.addAndGet(System.nanoTime()-start); + GPUStatistics.cudaAllocCount.addAndGet(1); } }
@@ -715,17 +729,17 @@ public class JCudaObject extends GPUObject { void deallocateMemoryOnDevice(boolean synchronous) { if(jcudaDenseMatrixPtr != null) { long start = System.nanoTime(); - cudaFreeHelper(jcudaDenseMatrixPtr, synchronous); + cudaFreeHelper(null, jcudaDenseMatrixPtr, synchronous); ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes); - Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start); - Statistics.cudaDeAllocCount.addAndGet(1); + GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start); + GPUStatistics.cudaDeAllocCount.addAndGet(1); }
if (jcudaSparseMatrixPtr != null) { long start = System.nanoTime(); jcudaSparseMatrixPtr.deallocate(synchronous); ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes); - Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start); - Statistics.cudaDeAllocCount.addAndGet(1); + GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start); + GPUStatistics.cudaDeAllocCount.addAndGet(1); } jcudaDenseMatrixPtr = null; jcudaSparseMatrixPtr = null;
@@ -785,14 +799,14 @@ public class JCudaObject extends GPUObject { long t0 = System.nanoTime(); SparseBlockCOO cooBlock = (SparseBlockCOO)block; csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values()); - Statistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); - Statistics.cudaSparseConversionCount.incrementAndGet(); + GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); + GPUStatistics.cudaSparseConversionCount.incrementAndGet(); } else if (block instanceof SparseBlockMCSR) { long t0 = System.nanoTime(); SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block; csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size())); - Statistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); - Statistics.cudaSparseConversionCount.incrementAndGet(); + GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); + GPUStatistics.cudaSparseConversionCount.incrementAndGet(); } else { throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations"); }
@@ -830,8 +844,8 @@ public class JCudaObject extends GPUObject { mat.release(); - Statistics.cudaToDevTime.addAndGet(System.nanoTime()-start); - Statistics.cudaToDevCount.addAndGet(1); + GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-start); + GPUStatistics.cudaToDevCount.addAndGet(1); } public static int toIntExact(long l) throws DMLRuntimeException {
@@ -860,8 +874,8 @@ public class JCudaObject extends GPUObject { mat.acquireModify(tmp); mat.release(); - Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-start); - Statistics.cudaFromDevCount.addAndGet(1); + GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-start); + GPUStatistics.cudaFromDevCount.addAndGet(1); } else if (jcudaSparseMatrixPtr != null){ printCaller();
@@ -887,8 +901,8 @@ public class JCudaObject extends GPUObject { MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock); mat.acquireModify(tmp); mat.release(); - Statistics.cudaFromDevTime.addAndGet(System.nanoTime() - start); - Statistics.cudaFromDevCount.addAndGet(1); + GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start); + GPUStatistics.cudaFromDevCount.addAndGet(1); } } else {
@@ -992,8 +1006,8 @@ public class JCudaObject extends GPUObject { setSparseMatrixCudaPointer(columnMajorDenseToRowMajorSparse(cusparseHandle, rows, cols, jcudaDenseMatrixPtr)); // TODO: What if mat.getNnz() is -1 ?
numBytes = CSRPointer.estimateSize(mat.getNnz(), rows); - Statistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0); - Statistics.cudaDenseToSparseCount.addAndGet(1); + GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0); + GPUStatistics.cudaDenseToSparseCount.addAndGet(1); } /**
@@ -1055,16 +1069,27 @@ public class JCudaObject extends GPUObject { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public void sparseToDense() throws DMLRuntimeException { - long t0 = System.nanoTime(); + sparseToDense(null); + } + + /** + * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format) + * Also records each per-instruction invocation of sparseToDense. + * @param instructionName Name of the instruction for which statistics are recorded in {@link GPUStatistics} + * @throws DMLRuntimeException + */ + public void sparseToDense(String instructionName) throws DMLRuntimeException { + long start = System.nanoTime(); if(jcudaSparseMatrixPtr == null || !isAllocated()) throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call"); sparseToColumnMajorDense(); convertDensePtrFromColMajorToRowMajor(); - Statistics.cudaSparseToDenseTime.addAndGet(System.nanoTime() - t0); - Statistics.cudaSparseToDenseCount.addAndGet(1); + long end = System.nanoTime(); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start); + GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start); + GPUStatistics.cudaSparseToDenseCount.addAndGet(1); } - /** * More efficient method to convert sparse to dense but returns dense in column major format
@@ -1107,8 +1132,8 @@ public class JCudaObject extends GPUObject { long t1 = System.nanoTime(); nnzPerRowPtr = allocate(getIntSizeOf(rows)); nnzTotalDevHostPtr = allocate(getIntSizeOf(1)); - Statistics.cudaAllocTime.addAndGet(System.nanoTime() - t1); - Statistics.cudaAllocCount.addAndGet(2); + GPUStatistics.cudaAllocTime.addAndGet(System.nanoTime() - t1); + GPUStatistics.cudaAllocCount.addAndGet(2); // Output is in dense vector format, convert it to CSR cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, nnzTotalDevHostPtr);
@@ -1117,8 +1142,8 @@ public class JCudaObject extends GPUObject { long t2 = System.nanoTime(); cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); - Statistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2); - Statistics.cudaFromDevCount.addAndGet(2); + GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2); + GPUStatistics.cudaFromDevCount.addAndGet(2); if (nnzC[0] == -1){ throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector multiply on the GPU");
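The new sparseToDense(String) overload preserves the old global aggregates (cudaSparseToDenseTime/Count) and additionally charges the conversion to a specific instruction when a name is supplied. A call-pattern sketch; the wrapper and opcode are hypothetical:

    import org.apache.sysml.runtime.DMLRuntimeException;
    import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;

    class SparseToDenseSketch {
        // Pass the instruction's opcode to get a per-instruction
        // MISC_TIMER_SPARSE_TO_DENSE entry, or null for global stats only.
        static void ensureDense(JCudaObject gpuObj, String opcode) throws DMLRuntimeException {
            gpuObj.sparseToDense(opcode);
        }
    }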
@@ -1139,16 +1164,28 @@ public class JCudaObject extends GPUObject { * @param toFree {@link Pointer} instance to be freed */ public static void cudaFreeHelper(final Pointer toFree) { - cudaFreeHelper(toFree, false); + cudaFreeHelper(null, toFree, false); + } + + /** + * Does asynchronous cudaFree calls + * @param instructionName name of the instruction for which to record per-instruction free time, null if no per-instruction timing should be recorded + * @param toFree {@link Pointer} instance to be freed + */ + public static void cudaFreeHelper(String instructionName, final Pointer toFree) { + cudaFreeHelper(instructionName, toFree, false); } /** * Does cudaFree calls, either synchronously or asynchronously + * @param instructionName name of the instruction for which to record per-instruction free time, null if no per-instruction timing should be recorded * @param toFree {@link Pointer} instance to be freed * @param synchronous true if to be done synchronously */ @SuppressWarnings("rawtypes") - public static void cudaFreeHelper(final Pointer toFree, boolean synchronous) { + public static void cudaFreeHelper(String instructionName, final Pointer toFree, boolean synchronous) { + long t0 = 0; + if (instructionName != null) t0 = System.nanoTime(); if (synchronous) { cudaFree(toFree); } else {
@@ -1160,6 +1197,7 @@ public class JCudaObject extends GPUObject { }); GPUContext.pendingDeallocates.offer(submitted); } + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0); }
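GPUStatistics.maintainCPMiscTimes itself is not shown in this diff; from its call sites it plausibly accumulates nanosecond totals keyed by an (instruction, timer) pair. A hedged sketch of that bookkeeping, assuming this behaviour rather than reproducing the actual implementation:

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.atomic.AtomicLong;

    class MiscTimesSketch {
        private static final ConcurrentHashMap<String, AtomicLong> TIMES = new ConcurrentHashMap<>();

        // Accumulate elapsed nanoseconds under a composite key so the
        // totals can later be reported per instruction and per timer.
        static void maintainCPMiscTimes(String instruction, String timer, long deltaNanos) {
            TIMES.computeIfAbsent(instruction + "::" + timer, k -> new AtomicLong())
                 .addAndGet(deltaNanos);
        }
    }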
