[1/2] incubator-systemml git commit: [SYSTEMML-1344] sqrt, round, abs, log, floor, ceil, trig funcs & sign for GPU

nakul02 Wed, 17 May 2017 10:56:54 -0700

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 0d553e384 -> 1fc764b9b



http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index f4c523b..48b7da6 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -61,41 +61,55 @@ public abstract class GPUInstruction extends Instruction
        public final static String MISC_TIMER_SYRK_LIB =                        
                                                                        
"Msyrk";        // time spent in symmetric rank-k update
 
        // Other BLAS instructions
-       public final static String MISC_TIMER_DAXPY_LIB = "daxpy";      // time 
spent in daxpy
-       public final static String MISC_TIMER_QR_BUFFER = "qr_buffer";  // time 
spent in calculating buffer needed to perform QR
-       public final static String MISC_TIMER_QR = "qr";        // time spent 
in doing QR
-       public final static String MISC_TIMER_ORMQR = "ormqr"; // time spent in 
ormqr
-       public final static String MISC_TIMER_TRSM = "trsm"; // time spent in 
cublas Dtrsm
+       public final static String MISC_TIMER_DAXPY_LIB =   "daxpy";    // time 
spent in daxpy
+       public final static String MISC_TIMER_QR_BUFFER =   "qr_buffer";// time 
spent in calculating buffer needed to perform QR
+       public final static String MISC_TIMER_QR =          "qr";       // time 
spent in doing QR
+       public final static String MISC_TIMER_ORMQR =       "ormqr";    // time 
spent in ormqr
+       public final static String MISC_TIMER_TRSM =        "trsm";     // time 
spent in cublas Dtrsm
 
        // Transpose
-       public final static String MISC_TIMER_SPARSE_DGEAM_LIB =        
"sdgeaml";      // time spent in sparse transpose (and other ops of type 
a*op(A) + b*op(B))
-       public final static String MISC_TIMER_DENSE_DGEAM_LIB =         
"ddgeaml";      // time spent in dense transpose (and other ops of type a*op(A) 
+ b*op(B))
-       public final static String MISC_TIMER_TRANSPOSE_LIB =           "dtl";  
                // time spent on dense transpose, this includes allocation of 
output
+       public final static String MISC_TIMER_SPARSE_DGEAM_LIB =    "sdgeaml";  
// time spent in sparse transpose (and other ops of type a*op(A) + b*op(B))
+       public final static String MISC_TIMER_DENSE_DGEAM_LIB =     "ddgeaml";  
// time spent in dense transpose (and other ops of type a*op(A) + b*op(B))
+       public final static String MISC_TIMER_TRANSPOSE_LIB =       "dtl";      
// time spent on dense transpose, this includes allocation of output
 
        // Custom kernels
-       public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL 
=        "mmck"; // time spent in matrix-matrix cellwise operations
-       public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL =          
                                "cask"; // time spent in compareAndSet kernel
-       public final static String MISC_TIMER_EXP_KERNEL =                      
                                                                        "expk"; 
// time spent in the exp kernel
-       public final static String MISC_TIMER_DAXPY_MV_KERNEL =                 
                                                        "daxpymv";      // time 
spent in the daxpy_matrix_vector kernel
-       public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL =  
        "u2lk"; // time spent in the copy_u2l_dense kernel
-       public final static String MISC_TIMER_FILL_KERNEL       =               
                                                                                
"fillk"; // time spent in the "fill" kernel
-       public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL =         
                                "msk";  // time spent in the matrix scalar 
kernel
-       public final static String MISC_TIMER_REDUCE_ALL_KERNEL =               
                                                "rallk"; // time spent in 
reduce all kernel
-       public final static String MISC_TIMER_REDUCE_ROW_KERNEL =               
                                                "rrowk"; // time spent in 
reduce row kernel
-       public final static String MISC_TIMER_REDUCE_COL_KERNEL =               
                                                "rcolk";        // time spent 
in reduce column kernel
+       public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL 
= "mmck";   // time spent in matrix-matrix cellwise operations
+       public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL =          
 "cask";   // time spent in compareAndSet kernel
+       public final static String MISC_TIMER_EXP_KERNEL =                      
 "expk";   // time spent in the exp kernel
+       public final static String MISC_TIMER_SQRT_KERNEL =                     
 "sqrtk";   // time spent in the sqrt kernel
+       public final static String MISC_TIMER_ROUND_KERNEL =                    
 "roundk";   // time spent in the round kernel
+       public final static String MISC_TIMER_ABS_KERNEL =                      
 "absk";   // time spent in the abs kernel
+       public final static String MISC_TIMER_LOG_KERNEL =                      
 "logk";   // time spent in the log kernel
+       public final static String MISC_TIMER_FLOOR_KERNEL =                    
 "floork";  // time spent in the floor kernel
+       public final static String MISC_TIMER_CEIL_KERNEL =                     
 "ceilk";   // time spent in the ceil kernel
+       public final static String MISC_TIMER_SIN_KERNEL =                      
 "sink";   // time spent in the sin kernel
+       public final static String MISC_TIMER_COS_KERNEL =                      
 "cosk";   // time spent in the cos kernel
+       public final static String MISC_TIMER_TAN_KERNEL =                      
 "tank";   // time spent in the tan kernel
+       public final static String MISC_TIMER_ASIN_KERNEL =                     
 "asink";   // time spent in the asin kernel
+       public final static String MISC_TIMER_ACOS_KERNEL =                     
 "acosk";   // time spent in the acos kernel
+       public final static String MISC_TIMER_ATAN_KERNEL =                     
 "atank";   // time spent in the atan kernel
+       public final static String MISC_TIMER_SIGN_KERNEL =                     
 "signk";   // time spent in the sign kernel
+
+       public final static String MISC_TIMER_DAXPY_MV_KERNEL =                 
 "daxpymv";// time spent in the daxpy_matrix_vector kernel
+       public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL =  
 "u2lk";   // time spent in the copy_u2l_dense kernel
+       public final static String MISC_TIMER_FILL_KERNEL =                     
 "fillk";  // time spent in the "fill" kernel
+       public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL =         
 "msk";    // time spent in the matrix scalar kernel
+       public final static String MISC_TIMER_REDUCE_ALL_KERNEL =               
 "rallk";  // time spent in reduce all kernel
+       public final static String MISC_TIMER_REDUCE_ROW_KERNEL =               
 "rrowk";  // time spent in reduce row kernel
+       public final static String MISC_TIMER_REDUCE_COL_KERNEL =               
 "rcolk";  // time spent in reduce column kernel
 
        // Deep learning operators
-       public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB =          
                        "nnaf"; // time spent in cudnnActivationForward
-       public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB =         
                        "nncf"; // time spent in cudnnConvolutionForward
-       public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB = 
"nncbf"; // time spent in cudnnConvolutionBackwardFilter
-       public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB =   
"nncbd"; // time spent in cudnnConvolutionBackwardData
-       public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB =          
                        "nnmf"; // time spent in cudnnPoolingForward
-       public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB =         
                        "nnmb"; // time spent in cudnnPoolingBackward
-       public final static String MISC_TIMER_BIAS_ADD_LIB =                    
                                                        "nnba"; // time spent 
in bias_add cuda kernel
-       public final static String MISC_TIMER_RELU_BACKWARD_KERNEL=             
                                "nnrbk"; // time spent in relu_backward cuda 
kernel
-       public final static String MISC_TIMER_RELU_KERNEL =                     
                                                        "nnrk"; // time spent 
in the relu kernel
-       public final static String MISC_TIMER_CUDNN_INIT =                      
                                                                "nni";  // time 
spent in initializations for cudnn call
-       public final static String MISC_TIMER_CUDNN_CLEANUP =                   
                                                "nnc";  // time spent in 
cleanup for cudnn call
+       public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB =         
"nnaf";  // time spent in cudnnActivationForward
+       public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB =        
"nncf";  // time spent in cudnnConvolutionForward
+       public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB 
="nncbf"; // time spent in cudnnConvolutionBackwardFilter
+       public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB =  
"nncbd"; // time spent in cudnnConvolutionBackwardData
+       public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB =         
"nnmf";  // time spent in cudnnPoolingForward
+       public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB =        
"nnmb";  // time spent in cudnnPoolingBackward
+       public final static String MISC_TIMER_BIAS_ADD_LIB =                   
"nnba";  // time spent in bias_add cuda kernel
+       public final static String MISC_TIMER_RELU_BACKWARD_KERNEL=            
"nnrbk"; // time spent in relu_backward cuda kernel
+       public final static String MISC_TIMER_RELU_KERNEL =                    
"nnrk";  // time spent in the relu kernel
+       public final static String MISC_TIMER_CUDNN_INIT =                     
"nni";   // time spent in initializations for cudnn call
+       public final static String MISC_TIMER_CUDNN_CLEANUP =                  
"nnc";   // time spent in cleanup for cudnn call
 
 
        protected GPUINSTRUCTION_TYPE _gputype;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index ce25dec..7b50285 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -42,14 +42,39 @@ public class MatrixBuiltinGPUInstruction extends 
BuiltinUnaryGPUInstruction {
                MatrixObject mat = getMatrixInputForGPUInstruction(ec, 
_input.getName());
                ec.setMetaData(_output.getName(), mat.getNumRows(), 
mat.getNumColumns());
 
-               if(opcode.equals("sel+")) {
-                       LibMatrixCUDA.relu(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName());
-
-               } else if (opcode.equals("exp")) {
-                       LibMatrixCUDA.exp(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName());
-               }
-               else {
-                       throw new DMLRuntimeException("Unsupported GPU 
operator:" + opcode);
+               switch(opcode) {
+                       case "sel+":
+                               LibMatrixCUDA.relu(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "exp":
+                               LibMatrixCUDA.exp(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "sqrt":
+                               LibMatrixCUDA.sqrt(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "log":
+                               LibMatrixCUDA.log(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "round":
+                               LibMatrixCUDA.round(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "floor":
+                               LibMatrixCUDA.floor(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "ceil":
+                               LibMatrixCUDA.ceil(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "abs":
+                               LibMatrixCUDA.abs(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "sin":
+                               LibMatrixCUDA.sin(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "cos":
+                               LibMatrixCUDA.cos(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "tan":
+                               LibMatrixCUDA.tan(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "asin":
+                               LibMatrixCUDA.asin(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "acos":
+                               LibMatrixCUDA.acos(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "atan":
+                               LibMatrixCUDA.atan(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       case "sign":
+                               LibMatrixCUDA.sign(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName()); break;
+                       default:
+                               throw new DMLRuntimeException("Unsupported GPU 
operator:" + opcode);
                }
                ec.releaseMatrixInputForGPUInstruction(_input.getName());
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 05257e5..0ff9d14 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -467,7 +467,7 @@ public class CSRPointer {
       cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, 
colInd, A, rows);
       //cudaDeviceSynchronize;
     } else {
-      LOG.warn("in CSRPointer, the values array, row pointers array or column 
indices array was null");
+      LOG.debug("in CSRPointer, the values array, row pointers array or column 
indices array was null");
     }
     return A;
   }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index d735e38..be3cc09 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -510,7 +510,9 @@ public class GPUObject {
                setDenseMatrixCudaPointer(allocate(size));
                addReadLock();
                // The "fill" kernel is called which treats the matrix 
"jcudaDensePtr" like a vector and fills it with value "v"
-               getGPUContext().getKernels().launchKernel("fill", 
ExecutionConfig.getConfigForSimpleVectorOperations(numElems), 
getJcudaDenseMatrixPtr(), v, numElems);
+               // If the fill value is 0, there is no need to call the special kernel 
because allocate() already memsets the allocated region to 0
+               if (v != 0)
+                       getGPUContext().getKernels().launchKernel("fill", 
ExecutionConfig.getConfigForSimpleVectorOperations(numElems), 
getJcudaDenseMatrixPtr(), v, numElems);
        }
 
        /**

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index a99571a..074119b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -2885,29 +2885,239 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void exp(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : exp" + ", GPUContext=" + gCtx);
+               // e^0 = 1, create a dense block full of 1s
+               unaryOp(ec, gCtx, in1, "matrix_exp", 1, outputName, instName, 
GPUInstruction.MISC_TIMER_EXP_KERNEL);
+       }
+
+       /**
+        * Performs a "sqrt" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void sqrt(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx);
+               // sqrt(0) = 0, create a dense block full of 0s
+               unaryOp(ec, gCtx, in1, "matrix_sqrt", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_SQRT_KERNEL);
+       }
+
+       /**
+        * Performs a "round" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void round(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : round" + ", GPUContext=" + gCtx);
+               // round(0) = 0, create a dense block full of 0s
+               unaryOp(ec, gCtx, in1, "matrix_round", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_ROUND_KERNEL);
+       }
+
+       /**
+        * Performs an "abs" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void abs(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : abs" + ", GPUContext=" + gCtx);
+               // abs(0) = 0, create a dense block full of 0s
+               unaryOp(ec, gCtx, in1, "matrix_abs", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_ABS_KERNEL);
+       }
+
+       /**
+        * Performs a "log" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void log(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : log" + ", GPUContext=" + gCtx);
+               // log(0) = -Inf
+               unaryOp(ec, gCtx, in1, "matrix_log", Double.NEGATIVE_INFINITY, 
outputName, instName, GPUInstruction.MISC_TIMER_LOG_KERNEL);
+       }
+
+       /**
+        * Performs a "floor" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void floor(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : floor" + ", GPUContext=" + gCtx);
+               // floor(0) = 0
+               unaryOp(ec, gCtx, in1, "matrix_floor", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_FLOOR_KERNEL);
+       }
+
+       /**
+        * Performs a "ceil" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void ceil(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx);
+               // ceil(0) = 0
+               unaryOp(ec, gCtx, in1, "matrix_ceil", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_CEIL_KERNEL);
+       }
+
+       /**
+        * Performs a "sin" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void sin(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : sin" + ", GPUContext=" + gCtx);
+               // sin(0) = 0
+               unaryOp(ec, gCtx, in1, "matrix_sin", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_SIN_KERNEL);
+       }
+
+       /**
+        * Performs a "cos" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void cos(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : cos" + ", GPUContext=" + gCtx);
+               // cos(0) = 1
+               unaryOp(ec, gCtx, in1, "matrix_cos", 1, outputName, instName, 
GPUInstruction.MISC_TIMER_COS_KERNEL);
+       }
+
+       /**
+        * Performs a "tan" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void tan(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : tan" + ", GPUContext=" + gCtx);
+               // tan(0) = 0
+               unaryOp(ec, gCtx, in1, "matrix_tan", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_TAN_KERNEL);
+       }
+
+       /**
+        * Performs an "asin" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void asin(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : asin" + ", GPUContext=" + gCtx);
+               // asin(0) = 0
+               unaryOp(ec, gCtx, in1, "matrix_asin", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_ASIN_KERNEL);
+       }
+
+       /**
+        * Performs an "acos" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void acos(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : acos" + ", GPUContext=" + gCtx);
+               // acos(0) = PI/2
+               unaryOp(ec, gCtx, in1, "matrix_acos", Math.PI/2.0, outputName, 
instName, GPUInstruction.MISC_TIMER_ACOS_KERNEL);
+       }
+
+       /**
+        * Performs an "atan" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void atan(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : atan" + ", GPUContext=" + gCtx);
+               // atan(0) = 0
+               unaryOp(ec, gCtx, in1, "matrix_atan", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_ATAN_KERNEL);
+       }
+
+       /**
+        * Performs a "sign" operation on a matrix on the GPU
+        * @param ec    execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param in1   input matrix
+        * @param outputName    output matrix name
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       public static void sign(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
+               LOG.trace("GPU : sign" + ", GPUContext=" + gCtx);
+               // sign(0) = 0
+               unaryOp(ec, gCtx, in1, "matrix_sign", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_SIGN_KERNEL);
+       }
+
+
+       /**
+        * A helper function for all unary ops (sqrt, abs, sin, etc.)
+        * @param ec valid execution context
+        * @param gCtx a valid {@link GPUContext}
+        * @param in1 input matrix
+        * @param kernel name of CUDA kernel for the unary op to execute
+        * @param sparseAndEmptyFillValue the result of the unary op on a 
completely empty input matrix block
+        * @param outputName output matrix name
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param kernelTimer the name of the timer to measure the kernel 
invocation
+        * @throws DMLRuntimeException  if DMLRuntimeException occurs
+        */
+       private static void unaryOp(ExecutionContext ec, GPUContext gCtx, 
MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String 
outputName, String instName, String kernelTimer) throws DMLRuntimeException {
                if (ec.getGPUContext() != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
-               LOG.trace("GPU : exp" + ", GPUContext=" + gCtx);
                GPUObject in = in1.getGPUObject(gCtx);
                boolean isSparseAndEmpty = in.isSparseAndEmpty();
                long t1=0;
                if (isSparseAndEmpty) {
-                       // e^0 = 1, create a dense block full of 1s
                        MatrixObject out = ec.getMatrixObject(outputName);
                        ec.allocateGPUMatrixObject(outputName);
-                       out.getGPUObject(gCtx).allocateAndFillDense(1);
+                       
out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
                } else {
                        // Dense
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
                        Pointer output = getDensePointer(gCtx, out, instName);
-                       // If the input is in sparse format, convert it to 
dense.
-                       // The output will always be dense, because for all x, 
exp(x) > 0
                        Pointer input = getDensePointer(gCtx, in1, instName);
                        int size = (int)(in1.getNumColumns() * 
in1.getNumRows());
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       getCudaKernels(gCtx).launchKernel("matrix_exp", 
ExecutionConfig.getConfigForSimpleVectorOperations(size),
-                                                       input, output, size);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_EXP_KERNEL, System.nanoTime() - t1);
+                       getCudaKernels(gCtx).launchKernel(kernel, 
ExecutionConfig.getConfigForSimpleVectorOperations(size),
+                                       input, output, size);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - 
t1);
                }
        }

[1/2] incubator-systemml git commit: [SYSTEMML-1344] sqrt, round, abs, log, floor, ceil, trig funcs & sign for GPU

Reply via email to