[SYSTEMML-1359] Added extra instrumentation for CUDA lib calls

- Added instrumentation around input copies & output allocations
- A config property is available to enable/disable advanced stats for
  DNN and GPU
- Minor refactoring and change of SystemML.cu function names

Closes #412


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/4f9dcf9a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/4f9dcf9a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/4f9dcf9a

Branch: refs/heads/master
Commit: 4f9dcf9add6b9bdbc190d97efef9781e32772dd9
Parents: ee33ec6
Author: Nakul Jindal <[email protected]>
Authored: Tue Mar 7 10:50:47 2017 -0800
Committer: Nakul Jindal <[email protected]>
Committed: Tue Mar 7 10:50:47 2017 -0800

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |    6 +
 src/main/cpp/kernels/SystemML.cu                |   37 +-
 src/main/cpp/kernels/SystemML.ptx               | 3923 +++++-------------
 .../java/org/apache/sysml/api/DMLScript.java    |    9 +-
 .../java/org/apache/sysml/conf/DMLConfig.java   |    8 +-
 .../context/ExecutionContext.java               |   32 +-
 .../instructions/GPUInstructionParser.java      |    2 +-
 .../gpu/AggregateBinaryGPUInstruction.java      |   13 +-
 .../gpu/AggregateUnaryGPUInstruction.java       |  109 +
 .../gpu/ConvolutionGPUInstruction.java          |   66 +-
 .../instructions/gpu/GPUInstruction.java        |   91 +-
 .../instructions/gpu/MMTSJGPUInstruction.java   |    8 +-
 .../gpu/MatrixBuiltinGPUInstruction.java        |   12 +-
 .../MatrixMatrixArithmeticGPUInstruction.java   |   10 +-
 .../gpu/MatrixMatrixAxpyGPUInstruction.java     |   10 +-
 .../instructions/gpu/ReorgGPUInstruction.java   |    8 +-
 .../ScalarMatrixArithmeticGPUInstruction.java   |    8 +-
 .../context/AggregateUnaryGPUInstruction.java   |  110 -
 .../instructions/gpu/context/GPUObject.java     |   29 +-
 .../instructions/gpu/context/JCudaContext.java  |    5 +-
 .../instructions/gpu/context/JCudaKernels.java  |    3 +-
 .../instructions/gpu/context/JCudaObject.java   |  164 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 1681 +++++---
 .../sysml/runtime/matrix/data/LibMatrixDNN.java |    4 +-
 .../org/apache/sysml/utils/GPUStatistics.java   |  209 +
 .../java/org/apache/sysml/utils/Statistics.java |  116 +-
 26 files changed, 2782 insertions(+), 3891 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template 
b/conf/SystemML-config.xml.template
index da80039..a4c7b2f 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -65,4 +65,10 @@
    
    <!-- if codegen.enabled, compile literals as constants: 1..heuristic, 
2..always -->
    <codegen.literals>1</codegen.literals>
+
+   <!-- prints extra statistics information for GPU -->
+   <systemml.stats.extraGPU>false</systemml.stats.extraGPU>
+
+   <!-- prints extra statistics information for Deep Neural Networks done in 
CP mode -->
+   <systemml.stats.extraDNN>false</systemml.stats.extraDNN>
 </root>

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 40a1046..7bb2c34 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -25,10 +25,15 @@ nvcc -ptx -arch=sm_30 SystemML.cu
 
 #include <cfloat>
 
-// dim => rlen (Assumption: rlen == clen)
-// N = length of dense array
+
+/**
+ * Does a copy of upper to lower triangle of the given matrix
+ * @param ret the input and output array allocated on the GPU
+ * @param dim the number of rows of the square matrix ret
+ * @param N total number of elements of the matrix
+ */
 extern "C"
-__global__ void copyUpperToLowerTriangleDense(double* ret, int dim, int N) {
+__global__ void copy_u2l_dense(double* ret, int dim, int N) {
        int ix = blockIdx.x * blockDim.x + threadIdx.x;
        int iy = blockIdx.y * blockDim.y + threadIdx.y;
        int id_dest = iy * dim + ix;
@@ -71,26 +76,6 @@ __forceinline__ __device__ double binaryOp(double x, double 
y, int op) {
 }
 
 extern "C"
-__global__ void dense_matrix_set(double* A,  double scalar, int rlen, int 
clen) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
-       int index = ix * clen + iy;
-       if(index < rlen*clen) {
-               A[index] = scalar;
-       }
-}
-
-extern "C"
-__global__ void dense_matrix_copy(double* A,  double* ret, int rlen, int clen) 
{
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
-       int index = ix * clen + iy;
-       if(ix < rlen && iy < clen) {
-               ret[index] = A[index];
-       }
-}
-
-extern "C"
 __global__ void relu(double* A,  double* ret, int rlen, int clen) {
        int ix = blockIdx.x * blockDim.x + threadIdx.x;
        int iy = blockIdx.y * blockDim.y + threadIdx.y;
@@ -102,7 +87,7 @@ __global__ void relu(double* A,  double* ret, int rlen, int 
clen) {
 
 // This method computes the backpropagation errors for previous layer of relu 
operation
 extern "C"
-__global__ void reluBackward(double* X,  double* dout, double* ret, int rlen, 
int clen) {
+__global__ void relu_backward(double* X,  double* dout, double* ret, int rlen, 
int clen) {
        int ix = blockIdx.x * blockDim.x + threadIdx.x;
        int iy = blockIdx.y * blockDim.y + threadIdx.y;
        if(ix < rlen && iy < clen) {
@@ -116,7 +101,7 @@ __global__ void reluBackward(double* X,  double* dout, 
double* ret, int rlen, in
 // output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 // This operation is often followed by conv2d and hence we have introduced 
bias_add(input, bias) built-in function
 extern "C"
-__global__ void biasAdd(double* input,  double* bias, double* ret, int rlen, 
int clen, int PQ) {
+__global__ void bias_add(double* input,  double* bias, double* ret, int rlen, 
int clen, int PQ) {
        int ix = blockIdx.x * blockDim.x + threadIdx.x;
        int iy = blockIdx.y * blockDim.y + threadIdx.y;
        if(ix < rlen && iy < clen) {
@@ -128,7 +113,7 @@ __global__ void biasAdd(double* input,  double* bias, 
double* ret, int rlen, int
 
 // Compares the value and set
 extern "C"
-__global__ void compareAndSet(double* A,  double* ret, int rlen, int clen, 
double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double 
ifGreaterThanVal) {
+__global__ void compare_and_set(double* A,  double* ret, int rlen, int clen, 
double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double 
ifGreaterThanVal) {
        int ix = blockIdx.x * blockDim.x + threadIdx.x;
        int iy = blockIdx.y * blockDim.y + threadIdx.y;
        int index = ix * clen + iy;

Reply via email to