Repository: incubator-systemml Updated Branches: refs/heads/master 0d553e384 -> 1fc764b9b
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index f4c523b..48b7da6 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -61,41 +61,55 @@ public abstract class GPUInstruction extends Instruction public final static String MISC_TIMER_SYRK_LIB = "Msyrk"; // time spent in symmetric rank-k update // Other BLAS instructions - public final static String MISC_TIMER_DAXPY_LIB = "daxpy"; // time spent in daxpy - public final static String MISC_TIMER_QR_BUFFER = "qr_buffer"; // time spent in calculating buffer needed to perform QR - public final static String MISC_TIMER_QR = "qr"; // time spent in doing QR - public final static String MISC_TIMER_ORMQR = "ormqr"; // time spent in ormqr - public final static String MISC_TIMER_TRSM = "trsm"; // time spent in cublas Dtrsm + public final static String MISC_TIMER_DAXPY_LIB = "daxpy"; // time spent in daxpy + public final static String MISC_TIMER_QR_BUFFER = "qr_buffer";// time spent in calculating buffer needed to perform QR + public final static String MISC_TIMER_QR = "qr"; // time spent in doing QR + public final static String MISC_TIMER_ORMQR = "ormqr"; // time spent in ormqr + public final static String MISC_TIMER_TRSM = "trsm"; // time spent in cublas Dtrsm // Transpose - public final static String MISC_TIMER_SPARSE_DGEAM_LIB = "sdgeaml"; // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B)) - public final static String MISC_TIMER_DENSE_DGEAM_LIB = "ddgeaml"; // time spent in dense transpose (and other ops of type a*op(A) + b*op(B)) - public final static String MISC_TIMER_TRANSPOSE_LIB = "dtl"; // time spent on dense transpose, this includes allocation of output + public final static String MISC_TIMER_SPARSE_DGEAM_LIB = "sdgeaml"; // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B)) + public final static String MISC_TIMER_DENSE_DGEAM_LIB = "ddgeaml"; // time spent in dense transpose (and other ops of type a*op(A) + b*op(B)) + public final static String MISC_TIMER_TRANSPOSE_LIB = "dtl"; // time spent on dense transpose, this includes allocation of output // Custom kernels - public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck"; // time spent in matrix-matrix cellwise operations - public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL = "cask"; // time spent in compareAndSet kernel - public final static String MISC_TIMER_EXP_KERNEL = "expk"; // time spent in the exp kernel - public final static String MISC_TIMER_DAXPY_MV_KERNEL = "daxpymv"; // time spent in the daxpy_matrix_vector kernel - public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = "u2lk"; // time spent in the copy_u2l_dense kernel - public final static String MISC_TIMER_FILL_KERNEL = "fillk"; // time spent in the "fill" kernel - public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL = "msk"; // time spent in the matrix scalar kernel - public final static String MISC_TIMER_REDUCE_ALL_KERNEL = "rallk"; // time spent in reduce all kernel - public final static String MISC_TIMER_REDUCE_ROW_KERNEL = "rrowk"; // time spent in reduce row kernel - public final static String MISC_TIMER_REDUCE_COL_KERNEL = "rcolk"; // time spent in reduce column kernel + public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck"; // time spent in matrix-matrix cellwise operations + public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL = "cask"; // time spent in compareAndSet kernel + public final static String MISC_TIMER_EXP_KERNEL = "expk"; // time spent in the exp kernel + public final static String MISC_TIMER_SQRT_KERNEL = "sqrtk"; // time spent in the sqrt kernel + public final static String MISC_TIMER_ROUND_KERNEL = "roundk"; // time spent in the round kernel + public final static String MISC_TIMER_ABS_KERNEL = "absk"; // time spent in the abs kernel + public final static String MISC_TIMER_LOG_KERNEL = "logk"; // time spent in the log kernel + public final static String MISC_TIMER_FLOOR_KERNEL = "floork"; // time spent in the floor kernel + public final static String MISC_TIMER_CEIL_KERNEL = "ceilk"; // time spent in the ceil kernel + public final static String MISC_TIMER_SIN_KERNEL = "sink"; // time spent in the sin kernel + public final static String MISC_TIMER_COS_KERNEL = "cosk"; // time spent in the cos kernel + public final static String MISC_TIMER_TAN_KERNEL = "tank"; // time spent in the tan kernel + public final static String MISC_TIMER_ASIN_KERNEL = "asink"; // time spent in the asin kernel + public final static String MISC_TIMER_ACOS_KERNEL = "acosk"; // time spent in the acos kernel + public final static String MISC_TIMER_ATAN_KERNEL = "atank"; // time spent in the atan kernel + public final static String MISC_TIMER_SIGN_KERNEL = "signk"; // time spent in the sign kernel + + public final static String MISC_TIMER_DAXPY_MV_KERNEL = "daxpymv";// time spent in the daxpy_matrix_vector kernel + public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = "u2lk"; // time spent in the copy_u2l_dense kernel + public final static String MISC_TIMER_FILL_KERNEL = "fillk"; // time spent in the "fill" kernel + public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL = "msk"; // time spent in the matrix scalar kernel + public final static String MISC_TIMER_REDUCE_ALL_KERNEL = "rallk"; // time spent in reduce all kernel + public final static String MISC_TIMER_REDUCE_ROW_KERNEL = "rrowk"; // time spent in reduce row kernel + public final static String MISC_TIMER_REDUCE_COL_KERNEL = "rcolk"; // time spent in reduce column kernel // Deep learning operators - public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB = "nnaf"; // time spent in cudnnActivationForward - public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB = "nncf"; // time spent in cudnnConvolutionForward - public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB = "nncbf"; // time spent in cudnnConvolutionBackwardFilter - public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB = "nncbd"; // time spent in cudnnConvolutionBackwardData - public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB = "nnmf"; // time spent in cudnnPoolingForward - public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB = "nnmb"; // time spent in cudnnPoolingBackward - public final static String MISC_TIMER_BIAS_ADD_LIB = "nnba"; // time spent in bias_add cuda kernel - public final static String MISC_TIMER_RELU_BACKWARD_KERNEL= "nnrbk"; // time spent in relu_backward cuda kernel - public final static String MISC_TIMER_RELU_KERNEL = "nnrk"; // time spent in the relu kernel - public final static String MISC_TIMER_CUDNN_INIT = "nni"; // time spent in initializations for cudnn call - public final static String MISC_TIMER_CUDNN_CLEANUP = "nnc"; // time spent in cleanup for cudnn call + public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB = "nnaf"; // time spent in cudnnActivationForward + public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB = "nncf"; // time spent in cudnnConvolutionForward + public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB ="nncbf"; // time spent in cudnnConvolutionBackwardFilter + public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB = "nncbd"; // time spent in cudnnConvolutionBackwardData + public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB = "nnmf"; // time spent in cudnnPoolingForward + public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB = "nnmb"; // time spent in cudnnPoolingBackward + public final static String MISC_TIMER_BIAS_ADD_LIB = "nnba"; // time spent in bias_add cuda kernel + public final static String MISC_TIMER_RELU_BACKWARD_KERNEL= "nnrbk"; // time spent in relu_backward cuda kernel + public final static String MISC_TIMER_RELU_KERNEL = "nnrk"; // time spent in the relu kernel + public final static String MISC_TIMER_CUDNN_INIT = "nni"; // time spent in initializations for cudnn call + public final static String MISC_TIMER_CUDNN_CLEANUP = "nnc"; // time spent in cleanup for cudnn call protected GPUINSTRUCTION_TYPE _gputype; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java index ce25dec..7b50285 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java @@ -42,14 +42,39 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction { MatrixObject mat = getMatrixInputForGPUInstruction(ec, _input.getName()); ec.setMetaData(_output.getName(), mat.getNumRows(), mat.getNumColumns()); - if(opcode.equals("sel+")) { - LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); - - } else if (opcode.equals("exp")) { - LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); - } - else { - throw new DMLRuntimeException("Unsupported GPU operator:" + opcode); + switch(opcode) { + case "sel+": + LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "exp": + LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "sqrt": + LibMatrixCUDA.sqrt(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "log": + LibMatrixCUDA.log(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "round": + LibMatrixCUDA.round(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "floor": + LibMatrixCUDA.floor(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "ceil": + LibMatrixCUDA.ceil(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "abs": + LibMatrixCUDA.abs(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "sin": + LibMatrixCUDA.sin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "cos": + LibMatrixCUDA.cos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "tan": + LibMatrixCUDA.tan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "asin": + LibMatrixCUDA.asin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "acos": + LibMatrixCUDA.acos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "atan": + LibMatrixCUDA.atan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + case "sign": + LibMatrixCUDA.sign(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break; + default: + throw new DMLRuntimeException("Unsupported GPU operator:" + opcode); } ec.releaseMatrixInputForGPUInstruction(_input.getName()); ec.releaseMatrixOutputForGPUInstruction(_output.getName()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index 05257e5..0ff9d14 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -467,7 +467,7 @@ public class CSRPointer { cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows); //cudaDeviceSynchronize; } else { - LOG.warn("in CSRPointer, the values array, row pointers array or column indices array was null"); + LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null"); } return A; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index d735e38..be3cc09 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -510,7 +510,9 @@ public class GPUObject { setDenseMatrixCudaPointer(allocate(size)); addReadLock(); // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v" - getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems); + // If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0 + if (v != 0) + getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getJcudaDenseMatrixPtr(), v, numElems); } /** http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index a99571a..074119b 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -2885,29 +2885,239 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void exp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : exp" + ", GPUContext=" + gCtx); + // e^0 = 1, create a dense block full of 1s + unaryOp(ec, gCtx, in1, "matrix_exp", 1, outputName, instName, GPUInstruction.MISC_TIMER_EXP_KERNEL); + } + + /** + * Performs an "sqrt" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx); + // sqrt(0) = 0, create a dense block full of 0s + unaryOp(ec, gCtx, in1, "matrix_sqrt", 0, outputName, instName, GPUInstruction.MISC_TIMER_SQRT_KERNEL); + } + + /** + * Performs an "round" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void round(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : round" + ", GPUContext=" + gCtx); + // round(0) = 0, create a dense block full of 0s + unaryOp(ec, gCtx, in1, "matrix_round", 0, outputName, instName, GPUInstruction.MISC_TIMER_ROUND_KERNEL); + } + + /** + * Performs an "abs" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void abs(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : abs" + ", GPUContext=" + gCtx); + // abs(0) = 0, create a dense block full of 0s + unaryOp(ec, gCtx, in1, "matrix_abs", 0, outputName, instName, GPUInstruction.MISC_TIMER_ABS_KERNEL); + } + + /** + * Performs an "log" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void log(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : log" + ", GPUContext=" + gCtx); + // log(0) = -Inf + unaryOp(ec, gCtx, in1, "matrix_log", Double.NEGATIVE_INFINITY, outputName, instName, GPUInstruction.MISC_TIMER_LOG_KERNEL); + } + + /** + * Performs an "floor" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void floor(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : floor" + ", GPUContext=" + gCtx); + // floor(0) = 0 + unaryOp(ec, gCtx, in1, "matrix_floor", 0, outputName, instName, GPUInstruction.MISC_TIMER_FLOOR_KERNEL); + } + + /** + * Performs an "ceil" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void ceil(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx); + // ceil(0) = 0 + unaryOp(ec, gCtx, in1, "matrix_ceil", 0, outputName, instName, GPUInstruction.MISC_TIMER_CEIL_KERNEL); + } + + /** + * Performs an "sin" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void sin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : sin" + ", GPUContext=" + gCtx); + // sin(0) = 0 + unaryOp(ec, gCtx, in1, "matrix_sin", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIN_KERNEL); + } + + /** + * Performs an "cos" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void cos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : cos" + ", GPUContext=" + gCtx); + // cos(0) = 1 + unaryOp(ec, gCtx, in1, "matrix_cos", 1, outputName, instName, GPUInstruction.MISC_TIMER_COS_KERNEL); + } + + /** + * Performs an "tan" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void tan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : tan" + ", GPUContext=" + gCtx); + // tan(0) = 0 + unaryOp(ec, gCtx, in1, "matrix_tan", 0, outputName, instName, GPUInstruction.MISC_TIMER_TAN_KERNEL); + } + + /** + * Performs an "asin" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void asin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : asin" + ", GPUContext=" + gCtx); + // asin(0) = 0 + unaryOp(ec, gCtx, in1, "matrix_asin", 0, outputName, instName, GPUInstruction.MISC_TIMER_ASIN_KERNEL); + } + + /** + * Performs an "acos" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void acos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : acos" + ", GPUContext=" + gCtx); + // acos(0) = PI/2 + unaryOp(ec, gCtx, in1, "matrix_acos", Math.PI/2.0, outputName, instName, GPUInstruction.MISC_TIMER_ACOS_KERNEL); + } + + /** + * Performs an "atan" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void atan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : atan" + ", GPUContext=" + gCtx); + // atan(0) = 0 + unaryOp(ec, gCtx, in1, "matrix_atan", 0, outputName, instName, GPUInstruction.MISC_TIMER_ATAN_KERNEL); + } + + /** + * Performs an "sign" operation on a matrix on the GPU + * @param ec execution context + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix + * @param outputName output matrix name + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void sign(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { + LOG.trace("GPU : sign" + ", GPUContext=" + gCtx); + // sign(0) = 0 + unaryOp(ec, gCtx, in1, "matrix_sign", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIGN_KERNEL); + } + + + /** + * A helper function for all Unary ops (sqrt, abs, sin.. etc) + * @param ec valid execution context + * @param gCtx a valid {@link GPUContext} + * @param in1 input matrix + * @param kernel name of CUDA kernel for the unary op to execute + * @param sparseAndEmptyFillValue the result of the unary op on a completely empty input matrix block + * @param outputName output matrix name + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param kernelTimer the name of the timer to measure the kernel invocation + * @throws DMLRuntimeException + */ + private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) throws DMLRuntimeException { if (ec.getGPUContext() != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); - LOG.trace("GPU : exp" + ", GPUContext=" + gCtx); GPUObject in = in1.getGPUObject(gCtx); boolean isSparseAndEmpty = in.isSparseAndEmpty(); long t1=0; if (isSparseAndEmpty) { - // e^0 = 1, create a dense block full of 1s MatrixObject out = ec.getMatrixObject(outputName); ec.allocateGPUMatrixObject(outputName); - out.getGPUObject(gCtx).allocateAndFillDense(1); + out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue); } else { // Dense MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); Pointer output = getDensePointer(gCtx, out, instName); - // If the input is in sparse format, convert it to dense. - // The output will always be dense, because for all x, exp(x) > 0 Pointer input = getDensePointer(gCtx, in1, instName); int size = (int)(in1.getNumColumns() * in1.getNumRows()); if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); - getCudaKernels(gCtx).launchKernel("matrix_exp", ExecutionConfig.getConfigForSimpleVectorOperations(size), - input, output, size); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_EXP_KERNEL, System.nanoTime() - t1); + getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), + input, output, size); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1); } }