Repository: systemml Updated Branches: refs/heads/master 978d4de47 -> a7364746a
[SYSTEMML-1735] relational operators for GPU Closes #557 Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a7364746 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a7364746 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a7364746 Branch: refs/heads/master Commit: a7364746a462069853421d59db1093ab145253c9 Parents: 978d4de Author: Nakul Jindal <[email protected]> Authored: Wed Jul 5 11:33:41 2017 -0700 Committer: Nakul Jindal <[email protected]> Committed: Wed Jul 5 11:33:41 2017 -0700 ---------------------------------------------------------------------- relational.dml | 6 + .../java/org/apache/sysml/hops/BinaryOp.java | 8 +- .../instructions/GPUInstructionParser.java | 13 +- .../instructions/gpu/GPUInstruction.java | 34 ++-- .../MatrixMatrixArithmeticGPUInstruction.java | 2 +- ...rixMatrixRelationalBinaryGPUInstruction.java | 69 ++++++++ .../gpu/RelationalBinaryGPUInstruction.java | 68 +++++++ ...larMatrixRelationalBinaryGPUInstruction.java | 61 +++++++ .../instructions/gpu/context/CSRPointer.java | 6 +- .../instructions/gpu/context/GPUObject.java | 2 +- .../runtime/matrix/data/LibMatrixCUDA.java | 177 ++++++++++++++----- .../gpu/MatrixMatrixElementWiseOpTests.java | 32 +++- .../gpu/ScalarMatrixElementwiseOpTests.java | 64 ++++++- 13 files changed, 477 insertions(+), 65 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/relational.dml ---------------------------------------------------------------------- diff --git a/relational.dml b/relational.dml new file mode 100644 index 0000000..3f492a1 --- /dev/null +++ b/relational.dml @@ -0,0 +1,6 @@ +A = rand(rows=10, cols=10) +B = rand(rows=10, cols=10) + +C = A >= B + +print(toString(C)) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/hops/BinaryOp.java 
---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java index 83209ef..36f573c 100644 --- a/src/main/java/org/apache/sysml/hops/BinaryOp.java +++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java @@ -582,7 +582,9 @@ public class BinaryOp extends Hop if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool .initialGPUMemBudget()) && (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW - || op == OpOp2.MINUS_NZ || op == OpOp2.MINUS1_MULT || op == OpOp2.MODULUS || op == OpOp2.INTDIV) ) { + || op == OpOp2.MINUS_NZ || op == OpOp2.MINUS1_MULT || op == OpOp2.MODULUS || op == OpOp2.INTDIV + || op == OpOp2.LESS || op == OpOp2.LESSEQUAL || op == OpOp2.EQUAL || op == OpOp2.NOTEQUAL + || op == OpOp2.GREATER || op == OpOp2.GREATEREQUAL)) { et = ExecType.GPU; } Unary unary1 = new Unary(getInput().get(0).constructLops(), @@ -602,7 +604,9 @@ public class BinaryOp extends Hop if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool .initialGPUMemBudget()) && (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW - || op == OpOp2.SOLVE || op == OpOp2.MINUS1_MULT || op == OpOp2.MODULUS || op == OpOp2.INTDIV)) { + || op == OpOp2.SOLVE || op == OpOp2.MINUS1_MULT || op == OpOp2.MODULUS || op == OpOp2.INTDIV + || op == OpOp2.LESS || op == OpOp2.LESSEQUAL || op == OpOp2.EQUAL || op == OpOp2.NOTEQUAL + || op == OpOp2.GREATER || op == OpOp2.GREATEREQUAL)) { et = ExecType.GPU; } http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 5fd6fa0..17b1578 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -30,6 +30,7 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.runtime.instructions.gpu.MatrixMatrixAxpyGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE; import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction; +import org.apache.sysml.runtime.instructions.gpu.RelationalBinaryGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.ReorgGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.AggregateUnaryGPUInstruction; @@ -115,6 +116,14 @@ public class GPUInstructionParser extends InstructionParser String2GPUInstructionType.put( "uavar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Variance String2GPUInstructionType.put( "uarvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Variance String2GPUInstructionType.put( "uacvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Variance + + // Relational Binary + String2GPUInstructionType.put( "==" , GPUINSTRUCTION_TYPE.RelationalBinary); + String2GPUInstructionType.put( "!=" , GPUINSTRUCTION_TYPE.RelationalBinary); + String2GPUInstructionType.put( "<" , GPUINSTRUCTION_TYPE.RelationalBinary); + String2GPUInstructionType.put( ">" , GPUINSTRUCTION_TYPE.RelationalBinary); + String2GPUInstructionType.put( "<=" , GPUINSTRUCTION_TYPE.RelationalBinary); + String2GPUInstructionType.put( ">=" , GPUINSTRUCTION_TYPE.RelationalBinary); } public static GPUInstruction parseSingleInstruction (String str ) @@ -168,7 +177,9 @@ public class GPUInstructionParser extends InstructionParser return MatrixMatrixAxpyGPUInstruction.parseInstruction(str); else return ArithmeticBinaryGPUInstruction.parseInstruction(str); - + case RelationalBinary: + return 
RelationalBinaryGPUInstruction.parseInstruction(str); + default: throw new DMLRuntimeException("Invalid GPU Instruction Type: " + gputype ); } http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index 48b7da6..7f981eb 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -32,7 +32,18 @@ import org.apache.sysml.utils.Statistics; public abstract class GPUInstruction extends Instruction { - public enum GPUINSTRUCTION_TYPE { AggregateUnary, AggregateBinary, Convolution, MMTSJ, Reorg, ArithmeticBinary, BuiltinUnary, BuiltinBinary, Builtin }; + public enum GPUINSTRUCTION_TYPE { + AggregateUnary, + AggregateBinary, + RelationalBinary, + Convolution, + MMTSJ, + Reorg, + ArithmeticBinary, + BuiltinUnary, + BuiltinBinary, + Builtin + }; // Memory/conversions public final static String MISC_TIMER_HOST_TO_DEVICE = "H2D"; // time spent in bringing data to gpu (from host) @@ -46,7 +57,8 @@ public abstract class GPUInstruction extends Instruction public final static String MISC_TIMER_CUDA_FREE = "f"; // time spent in calling cudaFree public final static String MISC_TIMER_ALLOCATE = "a"; // time spent to allocate memory on gpu - public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ao"; // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE) + public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ad"; // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE) + public final static String MISC_TIMER_ALLOCATE_SPARSE_OUTPUT = "as"; // time spent to allocate sparse output 
(recorded differently than MISC_TIMER_ALLOCATE) public final static String MISC_TIMER_SET_ZERO = "az"; // time spent to allocate public final static String MISC_TIMER_REUSE = "r"; // time spent in reusing already allocated memory on GPU (mainly for the count) @@ -114,27 +126,27 @@ public abstract class GPUInstruction extends Instruction protected GPUINSTRUCTION_TYPE _gputype; protected Operator _optr; - + protected boolean _requiresLabelUpdate = false; - + public GPUInstruction(String opcode, String istr) { type = INSTRUCTION_TYPE.GPU; instString = istr; - + //prepare opcode and update requirement for repeated usage instOpcode = opcode; _requiresLabelUpdate = super.requiresLabelUpdate(); } - + public GPUInstruction(Operator op, String opcode, String istr) { this(opcode, istr); _optr = op; } - + public GPUINSTRUCTION_TYPE getGPUInstructionType() { return _gputype; } - + @Override public boolean requiresLabelUpdate() { return _requiresLabelUpdate; @@ -147,11 +159,11 @@ public abstract class GPUInstruction extends Instruction @Override public Instruction preprocessInstruction(ExecutionContext ec) - throws DMLRuntimeException + throws DMLRuntimeException { //default preprocess behavior (e.g., debug state) Instruction tmp = super.preprocessInstruction(ec); - + //instruction patching if( tmp.requiresLabelUpdate() ) { //update labels only if required //note: no exchange of updated instruction as labels might change in the general case @@ -162,7 +174,7 @@ public abstract class GPUInstruction extends Instruction return tmp; } - @Override + @Override public abstract void processInstruction(ExecutionContext ec) throws DMLRuntimeException; http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java index a03f9b1..ef3333d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java @@ -71,7 +71,7 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns ec.setMetaData(_output.getName(), (int)rlen, (int)clen); BinaryOperator bop = (BinaryOperator) _optr; - LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop); + LibMatrixCUDA.matrixMatrixArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop); ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixRelationalBinaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixRelationalBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixRelationalBinaryGPUInstruction.java new file mode 100644 index 0000000..a7e969f --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixRelationalBinaryGPUInstruction.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.runtime.instructions.gpu; + +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.operators.BinaryOperator; +import org.apache.sysml.runtime.matrix.operators.Operator; +import org.apache.sysml.utils.GPUStatistics; + +public class MatrixMatrixRelationalBinaryGPUInstruction extends RelationalBinaryGPUInstruction { + + public MatrixMatrixRelationalBinaryGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, + String opcode, String istr) { + super(op, in1, in2, out, opcode, istr); + } + + @Override + public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { + GPUStatistics.incrementNoOfExecutedGPUInst(); + + MatrixObject in1 = getMatrixInputForGPUInstruction(ec, _input1.getName()); + MatrixObject in2 = getMatrixInputForGPUInstruction(ec, _input2.getName()); + + long rlen1 = in1.getNumRows(); + long clen1 = in1.getNumColumns(); + long rlen2 = in2.getNumRows(); + long clen2 = in2.getNumColumns(); + + // Assume ordinary binary op + long rlen = rlen1; + long clen = clen1; + + // Outer binary op ( [100,1] + [1,100] or 
[100,100] + [100,1] + if (rlen1 != rlen2 || clen1 != clen2){ + rlen = rlen1 > rlen2 ? rlen1 : rlen2; + clen = clen1 > clen2 ? clen1 : clen2; + } + + ec.setMetaData(_output.getName(), (int)rlen, (int)clen); + + BinaryOperator bop = (BinaryOperator) _optr; + LibMatrixCUDA.matrixMatrixRelational(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), bop); + + ec.releaseMatrixInputForGPUInstruction(_input1.getName()); + ec.releaseMatrixInputForGPUInstruction(_input2.getName()); + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); + } +} http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/runtime/instructions/gpu/RelationalBinaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/RelationalBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/RelationalBinaryGPUInstruction.java new file mode 100644 index 0000000..8dedf0b --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/RelationalBinaryGPUInstruction.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.runtime.instructions.gpu; + +import org.apache.sysml.parser.Expression; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.instructions.InstructionUtils; +import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.matrix.operators.Operator; + +public abstract class RelationalBinaryGPUInstruction extends GPUInstruction { + + protected CPOperand _input1; + protected CPOperand _input2; + protected CPOperand _output; + + public RelationalBinaryGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr) { + super(op, opcode, istr); + _gputype = GPUINSTRUCTION_TYPE.RelationalBinary; + _input1 = in1; + _input2 = in2; + _output = out; + } + + public static RelationalBinaryGPUInstruction parseInstruction ( String str ) throws DMLRuntimeException { + String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); + InstructionUtils.checkNumFields ( parts, 3 ); + + String opcode = parts[0]; + CPOperand in1 = new CPOperand(parts[1]); + CPOperand in2 = new CPOperand(parts[2]); + CPOperand out = new CPOperand(parts[3]); + + Expression.DataType dt1 = in1.getDataType(); + Expression.DataType dt2 = in2.getDataType(); + Expression.DataType dt3 = out.getDataType(); + + Operator operator = (dt1 != dt2) ? 
+ InstructionUtils.parseScalarBinaryOperator(opcode, (dt1 == Expression.DataType.SCALAR)) : + InstructionUtils.parseBinaryOperator(opcode); + + if(dt1 == Expression.DataType.MATRIX && dt2 == Expression.DataType.MATRIX && dt3 == Expression.DataType.MATRIX) { + return new MatrixMatrixRelationalBinaryGPUInstruction(operator, in1, in2, out, opcode, str); + } + else if( dt3 == Expression.DataType.MATRIX && ((dt1 == Expression.DataType.SCALAR && dt2 == Expression.DataType.MATRIX) || (dt1 == Expression.DataType.MATRIX && dt2 == Expression.DataType.SCALAR)) ) { + return new ScalarMatrixRelationalBinaryGPUInstruction(operator, in1, in2, out, opcode, str); + } + else + throw new DMLRuntimeException("Unsupported GPU RelationalBinaryGPUInstruction."); + } +} http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixRelationalBinaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixRelationalBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixRelationalBinaryGPUInstruction.java new file mode 100644 index 0000000..2a084b9 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixRelationalBinaryGPUInstruction.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.runtime.instructions.gpu; + +import org.apache.sysml.parser.Expression; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.cp.ScalarObject; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.operators.Operator; +import org.apache.sysml.runtime.matrix.operators.ScalarOperator; +import org.apache.sysml.utils.GPUStatistics; + +public class ScalarMatrixRelationalBinaryGPUInstruction extends RelationalBinaryGPUInstruction { + + public ScalarMatrixRelationalBinaryGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, + String opcode, String istr) { + super(op, in1, in2, out, opcode, istr); + } + + @Override + public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { + GPUStatistics.incrementNoOfExecutedGPUInst(); + + CPOperand mat = ( _input1.getDataType() == Expression.DataType.MATRIX ) ? _input1 : _input2; + CPOperand scalar = ( _input1.getDataType() == Expression.DataType.MATRIX ) ? 
_input2 : _input1; + MatrixObject in1 = getMatrixInputForGPUInstruction(ec, mat.getName()); + ScalarObject constant = (ScalarObject) ec.getScalarInput(scalar.getName(), scalar.getValueType(), scalar.isLiteral()); + + int rlen = (int) in1.getNumRows(); + int clen = (int) in1.getNumColumns(); + ec.setMetaData(_output.getName(), rlen, clen); + + ScalarOperator sc_op = (ScalarOperator) _optr; + sc_op.setConstant(constant.getDoubleValue()); + + LibMatrixCUDA.matrixScalarRelational(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, _output.getName(), sc_op); + + ec.releaseMatrixInputForGPUInstruction(mat.getName()); + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); + } +} http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index b15dd69..a4bff9a 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -275,10 +275,8 @@ public class CSRPointer { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException { - LOG.trace( - "GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx); - assert nnz2 - > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU"; + LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx); + assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on 
GPU"; CSRPointer r = new CSRPointer(gCtx); r.nnz = nnz2; if (nnz2 == 0) { http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 366eee5..94ceb36 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -891,7 +891,7 @@ public class GPUObject { "Block not in sparse format on host yet the device sparse matrix pointer is not null"); if (this.isSparseAndEmpty()) { - MatrixBlock tmp = new MatrixBlock(); // Empty Block + MatrixBlock tmp = new MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), 0l); // Empty Block mat.acquireModify(tmp); mat.release(); } else { http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 7b6e9b7..6f28313 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -62,8 +62,6 @@ import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; -import jcuda.jcusparse.cusparseAction; -import jcuda.jcusparse.cusparseIndexBase; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; 
@@ -137,7 +135,9 @@ import jcuda.jcudnn.cudnnStatus; import jcuda.jcudnn.cudnnTensorDescriptor; import jcuda.jcusolver.JCusolverDn; import jcuda.jcusparse.JCusparse; +import jcuda.jcusparse.cusparseAction; import jcuda.jcusparse.cusparseHandle; +import jcuda.jcusparse.cusparseIndexBase; /** * All CUDA kernels and library calls are redirected through this class @@ -2317,15 +2317,48 @@ public class LibMatrixCUDA { //********************************************************************/ /** - * Entry point to perform elementwise matrix-scalar operation specified by op + * Entry point to perform elementwise matrix-scalar relational operation specified by op * - * @param ec execution context - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. - * @param in input matrix + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. 
+ * @param in input matrix * @param outputName output matrix name + * @param op scalar operator + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void matrixScalarRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, ScalarOperator op) throws DMLRuntimeException { + if (ec.getGPUContext(0) != gCtx) + throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); + double constant = op.getConstant(); + LOG.trace("GPU : matrixScalarRelational, scalar: " + constant + ", GPUContext=" + gCtx); + + Pointer A, C; + if (isSparseAndEmpty(gCtx, in)) { + setOutputToConstant(ec, gCtx, instName, op.executeScalar(0.0), outputName); + return; + } else { + A = getDensePointer(gCtx, in, instName); + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix + C = getDensePointer(gCtx, out, instName); + } + + int rlenA = (int) in.getNumRows(); + int clenA = (int) in.getNumColumns(); + + matrixScalarOp(gCtx, instName, A, constant, rlenA, clenA, C, op); + } + + /** + * Entry point to perform elementwise matrix-scalar arithmetic operation specified by op + * + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. 
+ * @param in input matrix + * @param outputName output matrix name * @param isInputTransposed true if input transposed - * @param op scalar operator + * @param op scalar operator * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void matrixScalarArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException { @@ -2342,6 +2375,7 @@ public class LibMatrixCUDA { } else if(op.fn instanceof Multiply || op.fn instanceof And) { setOutputToConstant(ec, gCtx, instName, 0.0, outputName); + } else if(op.fn instanceof Power) { setOutputToConstant(ec, gCtx, instName, 1.0, outputName); @@ -2393,8 +2427,44 @@ public class LibMatrixCUDA { //} } + + /** + * Performs elementwise operation relational specified by op of two input matrices in1 and in2 + * + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. 
+ * @param in1 input matrix 1 + * @param in2 input matrix 2 + * @param outputName output matrix name + * @param op binary operator + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void matrixMatrixRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, + String outputName, BinaryOperator op) throws DMLRuntimeException { + + if (ec.getGPUContext(0) != gCtx) + throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); + + boolean in1SparseAndEmpty = isSparseAndEmpty(gCtx, in1); + boolean in2SparseAndEmpty = isSparseAndEmpty(gCtx, in2); + if (in1SparseAndEmpty && in2SparseAndEmpty) { + if (op.fn instanceof LessThan || op.fn instanceof GreaterThan || op.fn instanceof NotEquals) { + setOutputToConstant(ec, gCtx, instName, 0.0, outputName); + } else if (op.fn instanceof LessThanEquals || op.fn instanceof GreaterThanEquals || op.fn instanceof Equals) { + setOutputToConstant(ec, gCtx, instName, 1.0, outputName); + } + } else if (in1SparseAndEmpty) { + matrixScalarRelational(ec, gCtx, instName, in2, outputName, new LeftScalarOperator(op.fn, 0.0)); + } else if (in2SparseAndEmpty) { + matrixScalarRelational(ec, gCtx, instName, in1, outputName, new RightScalarOperator(op.fn, 0.0)); + } else { + matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, false, false, op); + } + } + /** - * Performs elementwise operation specified by op of two input matrices in1 and in2 + * Performs elementwise arithmetic operation specified by op of two input matrices in1 and in2 * * @param ec execution context * @param gCtx a valid {@link GPUContext} @@ -2407,7 +2477,7 @@ public class LibMatrixCUDA { * @param op binary operator * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static void matrixScalarArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, 
MatrixObject in2, + public static void matrixMatrixArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); @@ -2456,24 +2526,25 @@ public class LibMatrixCUDA { int clenA = (int) in.getNumColumns(); Pointer A = getDensePointer(gCtx, in, instName); // TODO: FIXME: Implement sparse binCellSparseScalarOp kernel double scalar = op.getConstant(); - MatrixObject out = ec.getMatrixObject(outputName); - getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix + // MatrixObject out = ec.getMatrixObject(outputName); + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix Pointer C = getDensePointer(gCtx, out, instName); matrixScalarOp(gCtx, instName, A, scalar, rlenA, clenA, C, op); } /** - * Helper method to launch binary scalar-matrix arithmetic operations CUDA kernel. - * This method is isolated to be taken advatage of from other operations + * Helper method to launch binary scalar-matrix arithmetic/relational operations CUDA kernel. + * This method is isolated to be taken advantage of from other operations * as it accepts JCuda {@link Pointer} instances instead of {@link MatrixObject} instances. - * @param gCtx a valid {@link GPUContext} + * + * @param gCtx a valid {@link GPUContext} * @param instName the invoking instruction's name for record {@link Statistics}. 
- * @param a the dense input matrix (allocated on GPU) - * @param scalar the scalar value to do the op - * @param rlenA row length of matrix a - * @param clenA column lenght of matrix a - * @param c the dense output matrix - * @param op operation to perform + * @param a the dense input matrix (allocated on GPU) + * @param scalar the scalar value to do the op + * @param rlenA row length of matrix a + * @param clenA column lenght of matrix a + * @param c the dense output matrix + * @param op operation to perform * @throws DMLRuntimeException throws runtime exception */ private static void matrixScalarOp(GPUContext gCtx, String instName, Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) throws DMLRuntimeException { @@ -2490,15 +2561,16 @@ public class LibMatrixCUDA { /** * Utility to launch binary cellwise matrix-matrix operations CUDA kernel - * @param gCtx a valid {@link GPUContext} - * @param ec execution context - * @param instName the invoking instruction's name for record {@link Statistics}. - * @param in1 left input matrix - * @param in2 right input matrix - * @param outputName output variable name - * @param isLeftTransposed true if left matrix is transposed + * + * @param gCtx a valid {@link GPUContext} + * @param ec execution context + * @param instName the invoking instruction's name for record {@link Statistics}. 
+ * @param in1 left input matrix + * @param in2 right input matrix + * @param outputName output variable name + * @param isLeftTransposed true if left matrix is transposed * @param isRightTransposed true if right matrix is transposed - * @param op operator + * @param op operator * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, @@ -2679,19 +2751,21 @@ public class LibMatrixCUDA { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); if(constant == 0) { - // TODO: Create sparse empty block instead + MatrixObject out = getSparseMatrixOutputForGPUInstruction(ec, 0, instName, outputName); + } else { + //MatrixObject out = ec.getMatrixObject(outputName); + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix + Pointer A = getDensePointer(gCtx, out, instName); + int rlen = (int) out.getNumRows(); + int clen = (int) out.getNumColumns(); + long t0 = 0; + if (GPUStatistics.DISPLAY_STATISTICS) + t0 = System.nanoTime(); + int size = rlen * clen; + getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size); + if (GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_FILL_KERNEL, System.nanoTime() - t0); } - MatrixObject out = ec.getMatrixObject(outputName); - getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix - Pointer A = getDensePointer(gCtx, out, instName); - int rlen = (int) out.getNumRows(); - int clen = (int) out.getNumColumns(); - long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); - int size = rlen * clen; - 
getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), - A, constant, size); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_FILL_KERNEL, System.nanoTime() - t0); } /** @@ -3374,4 +3448,25 @@ public class LibMatrixCUDA { return mb.getKey(); } + /** + * Helper method to get the output block (allocated on the GPU) + * Also records performance information into {@link Statistics} + * @param ec active {@link ExecutionContext} + * @param nnz number of non zeroes in output matrix + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param name name of the output matrix (that the {@link ExecutionContext} is aware of) + * @return the matrix object + * @throws DMLRuntimeException if an error occurs + */ + private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long nnz, String instName, String name) throws DMLRuntimeException { + long t0=0; + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + Pair<MatrixObject, Boolean> mb = ec.getSparseMatrixOutputForGPUInstruction(name, nnz); + if (mb.getValue()) + if (GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_SPARSE_OUTPUT, System.nanoTime() - t0); + return mb.getKey(); + + } + + } http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java index 744b2c2..490befa 100644 --- a/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java +++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java @@ -34,8 +34,8 @@ import org.junit.Test; public 
class MatrixMatrixElementWiseOpTests extends GPUTests { private final static String TEST_NAME = "MatrixMatrixElementWiseOpTests"; - private final int[] rowSizes = new int[] { 1, 64, 130, 1024, 2049 }; - private final int[] columnSizes = new int[] { 1, 64, 130, 1024, 2049 }; + private final int[] rowSizes = new int[] { 1, 64, 1024, 2049 }; + private final int[] columnSizes = new int[] { 1, 64, 1024, 2049 }; private final double[] sparsities = new double[] { 0.0, 0.03, 0.3, 0.9 }; private final double[] scalars = new double[] { 0.0, 0.5, 2.0 }; private final int seed = 42; @@ -171,7 +171,35 @@ public class MatrixMatrixElementWiseOpTests extends GPUTests { runMatrixRowVectorTest("O = 1 - X * Y", "X", "Y", "O", "gpu_1-*"); } + @Test + public void testLessThan() { + runMatrixMatrixElementwiseTest("O = X < Y", "X", "Y", "O", "gpu_<"); + } + + @Test + public void testLessThanEqual() { + runMatrixMatrixElementwiseTest("O = X <= Y", "X", "Y", "O", "gpu_<="); + } + + @Test + public void testGreaterThan() { + runMatrixMatrixElementwiseTest("O = X > Y", "X", "Y", "O", "gpu_>"); + } + @Test + public void testGreaterThanEqual() { + runMatrixMatrixElementwiseTest("O = X >= Y", "X", "Y", "O", "gpu_>="); + } + + @Test + public void testEqual() { + runMatrixMatrixElementwiseTest("O = X == Y", "X", "Y", "O", "gpu_=="); + } + + @Test + public void NotEqual() { + runMatrixMatrixElementwiseTest("O = X != Y", "X", "Y", "O", "gpu_!="); + } /** * Runs a simple matrix-matrix elementwise op test http://git-wip-us.apache.org/repos/asf/systemml/blob/a7364746/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java b/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java index c58365a..7ceeb0f 100644 --- a/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java +++ 
b/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java @@ -35,8 +35,8 @@ public class ScalarMatrixElementwiseOpTests extends GPUTests { private final static String TEST_NAME = "ScalarMatrixElementwiseOpTests"; - private final int[] rowSizes = new int[] { 1, 64, 130, 2049 }; - private final int[] columnSizes = new int[] { 1, 64, 130, 2049 }; + private final int[] rowSizes = new int[] { 1, 64, 2049 }; + private final int[] columnSizes = new int[] { 1, 64, 2049 }; private final double[] sparsities = new double[] { 0.0, 0.03, 0.3, 0.9 }; private final int seed = 42; @@ -48,6 +48,66 @@ public class ScalarMatrixElementwiseOpTests extends GPUTests { } @Test + public void testLessThanRightScalar() { + runScalarMatrixElementWiseTests("O = X < scalar", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_<"); + } + + @Test + public void testLessThanLeftScalar() { + runScalarMatrixElementWiseTests("O = scalar < X", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_<"); + } + + @Test + public void testLessThanEqualRightScalar() { + runScalarMatrixElementWiseTests("O = X <= scalar", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_<="); + } + + @Test + public void testLessThanEqualLeftScalar() { + runScalarMatrixElementWiseTests("O = scalar <= X", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_<="); + } + + @Test + public void testGreaterThanRightScalar() { + runScalarMatrixElementWiseTests("O = X > scalar", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_>"); + } + + @Test + public void testGreaterThanLeftScalar() { + runScalarMatrixElementWiseTests("O = scalar > X", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_>"); + } + + @Test + public void testGreaterThanEqualRightScalar() { + runScalarMatrixElementWiseTests("O = X >= scalar", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_>="); + } + + @Test + public void testGreaterThanEqualLeftScalar() { + runScalarMatrixElementWiseTests("O = scalar >= X", "X", "scalar", "O", new 
double[] { 0.0, 20.0 }, "gpu_>="); + } + + @Test + public void testEqualRightScalar() { + runScalarMatrixElementWiseTests("O = X == scalar", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_=="); + } + + @Test + public void testEqualLeftScalar() { + runScalarMatrixElementWiseTests("O = scalar == X", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_=="); + } + + @Test + public void testNotEqualRightScalar() { + runScalarMatrixElementWiseTests("O = X != scalar", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_!="); + } + + @Test + public void testNotEqualEqualLeftScalar() { + runScalarMatrixElementWiseTests("O = scalar != X", "X", "scalar", "O", new double[] { 0.0, 20.0 }, "gpu_!="); + } + + @Test public void testPlusRightScalar() { runScalarMatrixElementWiseTests("O = X + scalar", "X", "scalar", "O", new double[] { 0.0, 0.5, 20.0 }, "gpu_+"); }
