[SYSTEMML-1359] Added extra instrumentation for CUDA lib calls - Added instrumentation around input copies & output allocations - A config property is available to enable/disable advanced stats for DNN and GPU - Minor refactoring and change of SystemML.cu function names
Closes #412 Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/4f9dcf9a Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/4f9dcf9a Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/4f9dcf9a Branch: refs/heads/master Commit: 4f9dcf9add6b9bdbc190d97efef9781e32772dd9 Parents: ee33ec6 Author: Nakul Jindal <[email protected]> Authored: Tue Mar 7 10:50:47 2017 -0800 Committer: Nakul Jindal <[email protected]> Committed: Tue Mar 7 10:50:47 2017 -0800 ---------------------------------------------------------------------- conf/SystemML-config.xml.template | 6 + src/main/cpp/kernels/SystemML.cu | 37 +- src/main/cpp/kernels/SystemML.ptx | 3923 +++++------------- .../java/org/apache/sysml/api/DMLScript.java | 9 +- .../java/org/apache/sysml/conf/DMLConfig.java | 8 +- .../context/ExecutionContext.java | 32 +- .../instructions/GPUInstructionParser.java | 2 +- .../gpu/AggregateBinaryGPUInstruction.java | 13 +- .../gpu/AggregateUnaryGPUInstruction.java | 109 + .../gpu/ConvolutionGPUInstruction.java | 66 +- .../instructions/gpu/GPUInstruction.java | 91 +- .../instructions/gpu/MMTSJGPUInstruction.java | 8 +- .../gpu/MatrixBuiltinGPUInstruction.java | 12 +- .../MatrixMatrixArithmeticGPUInstruction.java | 10 +- .../gpu/MatrixMatrixAxpyGPUInstruction.java | 10 +- .../instructions/gpu/ReorgGPUInstruction.java | 8 +- .../ScalarMatrixArithmeticGPUInstruction.java | 8 +- .../context/AggregateUnaryGPUInstruction.java | 110 - .../instructions/gpu/context/GPUObject.java | 29 +- .../instructions/gpu/context/JCudaContext.java | 5 +- .../instructions/gpu/context/JCudaKernels.java | 3 +- .../instructions/gpu/context/JCudaObject.java | 164 +- .../runtime/matrix/data/LibMatrixCUDA.java | 1681 +++++--- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 4 +- .../org/apache/sysml/utils/GPUStatistics.java | 209 + .../java/org/apache/sysml/utils/Statistics.java | 116 +- 26 files changed, 2782 insertions(+), 3891 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/conf/SystemML-config.xml.template ---------------------------------------------------------------------- diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template index da80039..a4c7b2f 100644 --- a/conf/SystemML-config.xml.template +++ b/conf/SystemML-config.xml.template @@ -65,4 +65,10 @@ <!-- if codegen.enabled, compile literals as constants: 1..heuristic, 2..always --> <codegen.literals>1</codegen.literals> + + <!-- prints extra statistics information for GPU --> + <systemml.stats.extraGPU>false</systemml.stats.extraGPU> + + <!-- prints extra statistics information for Deep Neural Networks done in CP mode --> + <systemml.stats.extraDNN>false</systemml.stats.extraDNN> </root> http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 40a1046..7bb2c34 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -25,10 +25,15 @@ nvcc -ptx -arch=sm_30 SystemML.cu #include <cfloat> -// dim => rlen (Assumption: rlen == clen) -// N = length of dense array + +/** + * Does a copy of upper to lower triangle of the given matrix + * @param ret the input and output array allocated on the GPU + * @param dim the number of rows of the square matrix ret + * @param N total number of elements of the matrix + */ extern "C" -__global__ void copyUpperToLowerTriangleDense(double* ret, int dim, int N) { +__global__ void copy_u2l_dense(double* ret, int dim, int N) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int id_dest = iy * dim + ix; @@ -71,26 +76,6 @@ __forceinline__ __device__ double binaryOp(double x, double y, int op) { } extern "C" -__global__ void dense_matrix_set(double* A, double scalar, int rlen, int clen) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; - int index = ix * clen + iy; - if(index < rlen*clen) { - A[index] = scalar; - } -} - -extern "C" -__global__ void dense_matrix_copy(double* A, double* ret, int rlen, int clen) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; - int index = ix * clen + iy; - if(ix < rlen && iy < clen) { - ret[index] = A[index]; - } -} - -extern "C" __global__ void relu(double* A, double* ret, int rlen, int clen) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; @@ -102,7 +87,7 @@ __global__ void relu(double* A, double* ret, int rlen, int clen) { // This method computes the backpropagation errors for previous layer of relu operation extern "C" -__global__ void reluBackward(double* X, double* dout, double* ret, int rlen, int clen) { +__global__ void relu_backward(double* X, double* dout, double* ret, int rlen, int clen) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if(ix < rlen && iy < clen) { @@ -116,7 +101,7 @@ __global__ void reluBackward(double* X, double* dout, double* ret, int rlen, in // output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout) // This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function extern "C" -__global__ void biasAdd(double* input, double* bias, double* ret, int rlen, int clen, int PQ) { +__global__ void bias_add(double* input, double* bias, double* ret, int rlen, int clen, int PQ) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if(ix < rlen && iy < clen) { @@ -128,7 +113,7 @@ __global__ void biasAdd(double* input, double* bias, double* ret, int rlen, int // Compares the value and set extern "C" -__global__ void compareAndSet(double* A, double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) { +__global__ void compare_and_set(double* A, double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int index = ix * clen + iy;
