Repository: systemml Updated Branches: refs/heads/master 98a9d653d -> 815ca4f2a
[MINOR] bug fixes in the GPU backend - Each thread is assigned a cuda library handle - JCudaKernels is also made thread safe - Removed setting GPUContext to null - Bug fix in initial gpu budget estimate - Cuda Kernels use blockIdx.x and threadIdx.x only Closes #607 Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/815ca4f2 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/815ca4f2 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/815ca4f2 Branch: refs/heads/master Commit: 815ca4f2aedcbe491d10a873db99a9b5e6f29226 Parents: 98a9d65 Author: Nakul Jindal <naku...@gmail.com> Authored: Tue Aug 8 13:29:11 2017 -0700 Committer: Nakul Jindal <naku...@gmail.com> Committed: Tue Aug 8 13:29:11 2017 -0700 ---------------------------------------------------------------------- src/main/cpp/kernels/SystemML.cu | 54 +-- src/main/cpp/kernels/SystemML.ptx | 333 +++++++++---------- .../controlprogram/ParForProgramBlock.java | 3 - .../controlprogram/parfor/LocalParWorker.java | 12 +- .../cp/FunctionCallCPInstruction.java | 7 - .../gpu/context/ExecutionConfig.java | 26 +- .../instructions/gpu/context/GPUContext.java | 94 +++--- .../gpu/context/GPUContextPool.java | 2 +- .../instructions/gpu/context/JCudaKernels.java | 5 +- .../org/apache/sysml/test/gpu/GPUTests.java | 18 + .../test/gpu/MatrixMultiplicationOpTest.java | 1 + 11 files changed, 303 insertions(+), 252 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 297269f..dcd64b2 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -35,12 +35,13 @@ nvcc -ptx -arch=sm_30 SystemML.cu */ extern "C" __global__ void copy_u2l_dense(double* 
ret, int dim, int N) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / dim; + int iy = tid % dim; int id_dest = iy * dim + ix; if(iy > ix && id_dest < N) { // TODO: Potential to reduce the number of threads by half - int id_src = ix * dim + iy; + int id_src = tid; ret[id_dest] = ret[id_src]; } } @@ -104,8 +105,9 @@ __forceinline__ __device__ double binaryOp(double x, double y, int op) { extern "C" __global__ void relu(double* A, double* ret, int rlen, int clen) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / clen; + int iy = tid % clen; if(ix < rlen && iy < clen) { int index = ix * clen + iy; ret[index] = max(0.0, A[index]); @@ -115,8 +117,9 @@ __global__ void relu(double* A, double* ret, int rlen, int clen) { // This method computes the backpropagation errors for previous layer of relu operation extern "C" __global__ void relu_backward(double* X, double* dout, double* ret, int rlen, int clen) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / clen; + int iy = tid % clen; if(ix < rlen && iy < clen) { int index = ix * clen + iy; ret[index] = X[index] > 0 ? 
dout[index] : 0; @@ -129,8 +132,9 @@ __global__ void relu_backward(double* X, double* dout, double* ret, int rlen, i // This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function extern "C" __global__ void bias_add(double* input, double* bias, double* ret, int rlen, int clen, int PQ) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / clen; + int iy = tid % clen; if(ix < rlen && iy < clen) { int index = ix * clen + iy; int biasIndex = iy / PQ; @@ -141,8 +145,9 @@ __global__ void bias_add(double* input, double* bias, double* ret, int rlen, in // Performs the operation "ret <- A + alpha*B", where B is a vector extern "C" __global__ void daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / clenA; + int iy = tid % clenA; if(ix < rlenA && iy < clenA) { int index = ix * clenA + iy; if(rlenB == 1) { @@ -157,8 +162,9 @@ __global__ void daxpy_matrix_vector(double* A, double* B, double alpha, double* // Performs similar operation as bias_add except elementwise multiplication instead of add extern "C" __global__ void bias_multiply(double* input, double* bias, double* ret, int rlen, int clen, int PQ) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / clen; + int iy = tid % clen; if(ix < rlen && iy < clen) { int index = ix * clen + iy; int biasIndex = iy / PQ; @@ -169,8 +175,9 @@ __global__ void bias_multiply(double* input, double* bias, double* ret, int rle // Compares the value and set extern "C" __global__ void compare_and_set(double* A, double* ret, int rlen, int 
clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / clen; + int iy = tid % clen; int index = ix * clen + iy; if(ix < rlen && iy < clen) { if(abs(A[index]-compareVal) < tol) @@ -199,8 +206,9 @@ __global__ void compare_and_set(double* A, double* ret, int rlen, int clen, dou extern "C" __global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C, int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / maxClen; + int iy = tid % maxClen; if(ix < maxRlen && iy < maxClen) { int outIndex = ix * maxClen + iy; @@ -273,8 +281,10 @@ __global__ void fill(double* A, double scalar, int lenA) { */ extern "C" __global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int maxClen = max(colsA, colsB); + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / maxClen; + int iy = tid % maxClen; int colsC = colsA + colsB; int rowsC = rowsA; @@ -310,8 +320,10 @@ __global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int */ extern "C" __global__ void rbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) { - int ix = blockIdx.x * blockDim.x + threadIdx.x; - int iy = blockIdx.y * blockDim.y + threadIdx.y; + int maxClen = max(colsA, colsB); + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int ix = tid / maxClen; + int iy = tid % maxClen; int rowsC = rowsA + rowsB; int colsC = colsA; 
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index 6884d5b..7778317 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -34,36 +34,33 @@ ) { .reg .pred %p<4>; - .reg .b32 %r<13>; + .reg .b32 %r<10>; .reg .f64 %fd<2>; .reg .b64 %rd<7>; ld.param.u64 %rd1, [copy_u2l_dense_param_0]; - ld.param.u32 %r4, [copy_u2l_dense_param_1]; - ld.param.u32 %r5, [copy_u2l_dense_param_2]; - mov.u32 %r6, %ntid.x; - mov.u32 %r7, %ctaid.x; - mov.u32 %r8, %tid.x; - mad.lo.s32 %r1, %r6, %r7, %r8; - mov.u32 %r9, %ntid.y; - mov.u32 %r10, %ctaid.y; - mov.u32 %r11, %tid.y; - mad.lo.s32 %r2, %r9, %r10, %r11; - mad.lo.s32 %r3, %r2, %r4, %r1; - setp.gt.s32 %p1, %r2, %r1; - setp.lt.s32 %p2, %r3, %r5; + ld.param.u32 %r3, [copy_u2l_dense_param_1]; + ld.param.u32 %r4, [copy_u2l_dense_param_2]; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r5, %r6, %r7; + div.s32 %r8, %r1, %r3; + rem.s32 %r9, %r1, %r3; + mad.lo.s32 %r2, %r9, %r3, %r8; + setp.gt.s32 %p1, %r9, %r8; + setp.lt.s32 %p2, %r2, %r4; and.pred %p3, %p1, %p2; @!%p3 bra BB0_2; bra.uni BB0_1; BB0_1: cvta.to.global.u64 %rd2, %rd1; - mad.lo.s32 %r12, %r1, %r4, %r2; - mul.wide.s32 %rd3, %r12, 8; + mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; ld.global.f64 %fd1, [%rd4]; - mul.wide.s32 %rd5, %r3, 8; + mul.wide.s32 %rd5, %r2, 8; add.s64 %rd6, %rd2, %rd5; st.global.f64 [%rd6], %fd1; @@ -80,7 +77,7 @@ BB0_2: ) { .reg .pred %p<4>; - .reg .b32 %r<12>; + .reg .b32 %r<10>; .reg .f64 %fd<4>; .reg .b64 %rd<8>; @@ -93,20 +90,18 @@ BB0_2: mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r2, %r8, %r9, %r10; - setp.lt.s32 %p1, %r1, %r4; - setp.lt.s32 %p2, %r2, %r3; 
+ div.s32 %r2, %r1, %r3; + setp.lt.s32 %p1, %r2, %r4; + setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; @!%p3 bra BB1_2; bra.uni BB1_1; BB1_1: + rem.s32 %r8, %r1, %r3; cvta.to.global.u64 %rd3, %rd1; - mad.lo.s32 %r11, %r1, %r3, %r2; - mul.wide.s32 %rd4, %r11, 8; + mad.lo.s32 %r9, %r2, %r3, %r8; + mul.wide.s32 %rd4, %r9, 8; add.s64 %rd5, %rd3, %rd4; ld.global.f64 %fd1, [%rd5]; mov.f64 %fd2, 0d0000000000000000; @@ -129,7 +124,7 @@ BB1_2: ) { .reg .pred %p<5>; - .reg .b32 %r<12>; + .reg .b32 %r<10>; .reg .f64 %fd<6>; .reg .b64 %rd<14>; @@ -143,21 +138,19 @@ BB1_2: mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r2, %r8, %r9, %r10; - setp.lt.s32 %p1, %r1, %r4; - setp.lt.s32 %p2, %r2, %r3; + div.s32 %r2, %r1, %r3; + setp.lt.s32 %p1, %r2, %r4; + setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; @!%p3 bra BB2_4; bra.uni BB2_1; BB2_1: + rem.s32 %r8, %r1, %r3; cvta.to.global.u64 %rd5, %rd2; - mad.lo.s32 %r11, %r1, %r3, %r2; - cvt.s64.s32 %rd1, %r11; - mul.wide.s32 %rd6, %r11, 8; + mad.lo.s32 %r9, %r2, %r3, %r8; + cvt.s64.s32 %rd1, %r9; + mul.wide.s32 %rd6, %r9, 8; add.s64 %rd7, %rd5, %rd6; ld.global.f64 %fd4, [%rd7]; mov.f64 %fd5, 0d0000000000000000; @@ -190,7 +183,7 @@ BB2_4: ) { .reg .pred %p<4>; - .reg .b32 %r<14>; + .reg .b32 %r<12>; .reg .f64 %fd<4>; .reg .b64 %rd<12>; @@ -205,24 +198,22 @@ BB2_4: mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r7, %r6, %r8; - mov.u32 %r9, %ntid.y; - mov.u32 %r10, %ctaid.y; - mov.u32 %r11, %tid.y; - mad.lo.s32 %r2, %r9, %r10, %r11; - setp.lt.s32 %p1, %r1, %r5; - setp.lt.s32 %p2, %r2, %r3; + div.s32 %r2, %r1, %r3; + setp.lt.s32 %p1, %r2, %r5; + setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; @!%p3 bra BB3_2; bra.uni BB3_1; BB3_1: + rem.s32 %r9, %r1, %r3; cvta.to.global.u64 %rd4, %rd1; - mad.lo.s32 %r12, %r1, %r3, %r2; - mul.wide.s32 %rd5, %r12, 8; + mad.lo.s32 %r10, %r2, %r3, %r9; + mul.wide.s32 %rd5, %r10, 
8; add.s64 %rd6, %rd4, %rd5; - div.s32 %r13, %r2, %r4; + div.s32 %r11, %r9, %r4; cvta.to.global.u64 %rd7, %rd2; - mul.wide.s32 %rd8, %r13, 8; + mul.wide.s32 %rd8, %r11, 8; add.s64 %rd9, %rd7, %rd8; ld.global.f64 %fd1, [%rd9]; ld.global.f64 %fd2, [%rd6]; @@ -248,7 +239,7 @@ BB3_2: ) { .reg .pred %p<5>; - .reg .b32 %r<13>; + .reg .b32 %r<11>; .reg .f64 %fd<7>; .reg .b64 %rd<14>; @@ -264,22 +255,20 @@ BB3_2: mov.u32 %r6, %ntid.x; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %tid.x; - mad.lo.s32 %r1, %r6, %r7, %r8; - mov.u32 %r9, %ntid.y; - mov.u32 %r10, %ctaid.y; - mov.u32 %r11, %tid.y; - mad.lo.s32 %r2, %r9, %r10, %r11; + mad.lo.s32 %r9, %r6, %r7, %r8; + div.s32 %r1, %r9, %r3; + rem.s32 %r2, %r9, %r3; setp.lt.s32 %p1, %r1, %r5; - setp.lt.s32 %p2, %r2, %r3; + setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; @!%p3 bra BB4_4; bra.uni BB4_1; BB4_1: cvta.to.global.u64 %rd6, %rd4; - mad.lo.s32 %r12, %r1, %r3, %r2; + mad.lo.s32 %r10, %r1, %r3, %r2; cvta.to.global.u64 %rd7, %rd3; - mul.wide.s32 %rd8, %r12, 8; + mul.wide.s32 %rd8, %r10, 8; add.s64 %rd9, %rd7, %rd8; ld.global.f64 %fd1, [%rd9]; add.s64 %rd2, %rd6, %rd8; @@ -317,7 +306,7 @@ BB4_4: ) { .reg .pred %p<4>; - .reg .b32 %r<14>; + .reg .b32 %r<12>; .reg .f64 %fd<4>; .reg .b64 %rd<12>; @@ -332,24 +321,22 @@ BB4_4: mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r7, %r6, %r8; - mov.u32 %r9, %ntid.y; - mov.u32 %r10, %ctaid.y; - mov.u32 %r11, %tid.y; - mad.lo.s32 %r2, %r9, %r10, %r11; - setp.lt.s32 %p1, %r1, %r5; - setp.lt.s32 %p2, %r2, %r3; + div.s32 %r2, %r1, %r3; + setp.lt.s32 %p1, %r2, %r5; + setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; @!%p3 bra BB5_2; bra.uni BB5_1; BB5_1: + rem.s32 %r9, %r1, %r3; cvta.to.global.u64 %rd4, %rd1; - mad.lo.s32 %r12, %r1, %r3, %r2; - mul.wide.s32 %rd5, %r12, 8; + mad.lo.s32 %r10, %r2, %r3, %r9; + mul.wide.s32 %rd5, %r10, 8; add.s64 %rd6, %rd4, %rd5; - div.s32 %r13, %r2, %r4; + div.s32 %r11, %r9, %r4; cvta.to.global.u64 %rd7, %rd2; - mul.wide.s32 %rd8, %r13, 8; + mul.wide.s32 
%rd8, %r11, 8; add.s64 %rd9, %rd7, %rd8; ld.global.f64 %fd1, [%rd9]; ld.global.f64 %fd2, [%rd6]; @@ -376,7 +363,7 @@ BB5_2: ) { .reg .pred %p<6>; - .reg .b32 %r<12>; + .reg .b32 %r<10>; .reg .f64 %fd<9>; .reg .b64 %rd<8>; @@ -394,13 +381,11 @@ BB5_2: mov.u32 %r5, %ntid.x; mov.u32 %r6, %tid.x; mad.lo.s32 %r7, %r5, %r4, %r6; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r11, %r8, %r9, %r10; - mad.lo.s32 %r1, %r7, %r3, %r11; - setp.lt.s32 %p1, %r7, %r2; - setp.lt.s32 %p2, %r11, %r3; + div.s32 %r8, %r7, %r3; + rem.s32 %r9, %r7, %r3; + mad.lo.s32 %r1, %r8, %r3, %r9; + setp.lt.s32 %p1, %r8, %r2; + setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; @!%p3 bra BB6_6; bra.uni BB6_1; @@ -451,7 +436,7 @@ BB6_6: ) { .reg .pred %p<73>; - .reg .b32 %r<68>; + .reg .b32 %r<66>; .reg .f64 %fd<56>; .reg .b64 %rd<19>; @@ -467,13 +452,11 @@ BB6_6: mov.u32 %r15, %ntid.x; mov.u32 %r16, %ctaid.x; mov.u32 %r17, %tid.x; - mad.lo.s32 %r1, %r15, %r16, %r17; - mov.u32 %r18, %ntid.y; - mov.u32 %r19, %ctaid.y; - mov.u32 %r20, %tid.y; - mad.lo.s32 %r2, %r18, %r19, %r20; + mad.lo.s32 %r18, %r15, %r16, %r17; + div.s32 %r1, %r18, %r10; + rem.s32 %r2, %r18, %r10; setp.lt.s32 %p2, %r1, %r14; - setp.lt.s32 %p3, %r2, %r10; + setp.gt.s32 %p3, %r10, -1; and.pred %p4, %p2, %p3; @!%p4 bra BB7_77; bra.uni BB7_1; @@ -481,34 +464,34 @@ BB6_6: BB7_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; - mov.u32 %r66, %r1; + mov.u32 %r64, %r1; @%p5 bra BB7_5; setp.ne.s32 %p6, %r11, 2; - mov.u32 %r67, %r3; + mov.u32 %r65, %r3; @%p6 bra BB7_4; - mov.u32 %r67, %r2; + mov.u32 %r65, %r2; BB7_4: - mov.u32 %r61, %r67; - mov.u32 %r4, %r61; - mov.u32 %r66, %r4; + mov.u32 %r59, %r65; + mov.u32 %r4, %r59; + mov.u32 %r64, %r4; BB7_5: - mov.u32 %r5, %r66; + mov.u32 %r5, %r64; setp.eq.s32 %p7, %r12, 1; - mov.u32 %r64, %r1; + mov.u32 %r62, %r1; @%p7 bra BB7_9; setp.ne.s32 %p8, %r12, 2; - mov.u32 %r65, %r3; + mov.u32 %r63, %r3; @%p8 bra BB7_8; - mov.u32 %r65, %r2; + mov.u32 %r63, 
%r2; BB7_8: - mov.u32 %r64, %r65; + mov.u32 %r62, %r63; BB7_9: cvta.to.global.u64 %rd5, %rd3; @@ -516,7 +499,7 @@ BB7_9: mul.wide.s32 %rd7, %r5, 8; add.s64 %rd8, %rd6, %rd7; ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 %rd9, %r64, 8; + mul.wide.s32 %rd9, %r62, 8; add.s64 %rd10, %rd5, %rd9; ld.global.f64 %fd2, [%rd10]; mov.f64 %fd55, 0d7FEFFFFFFFFFFFFF; @@ -570,10 +553,10 @@ BB7_58: .reg .b32 %temp; mov.b64 {%temp, %r9}, %fd2; } - bfe.u32 %r33, %r9, 20, 11; - add.s32 %r34, %r33, -1012; + bfe.u32 %r31, %r9, 20, 11; + add.s32 %r32, %r31, -1012; mov.b64 %rd15, %fd2; - shl.b64 %rd1, %rd15, %r34; + shl.b64 %rd1, %rd15, %r32; setp.eq.s64 %p53, %rd1, -9223372036854775808; abs.f64 %fd19, %fd1; // Callseq Start 0 @@ -603,14 +586,14 @@ BB7_58: BB7_59: { .reg .b32 %temp; - mov.b64 {%temp, %r35}, %fd54; + mov.b64 {%temp, %r33}, %fd54; } - xor.b32 %r36, %r35, -2147483648; + xor.b32 %r34, %r33, -2147483648; { .reg .b32 %temp; - mov.b64 {%r37, %temp}, %fd54; + mov.b64 {%r35, %temp}, %fd54; } - mov.b64 %fd54, {%r37, %r36}; + mov.b64 %fd54, {%r35, %r34}; BB7_60: mov.f64 %fd53, %fd54; @@ -619,12 +602,12 @@ BB7_60: bra.uni BB7_61; BB7_63: - selp.b32 %r38, %r8, 0, %p53; - or.b32 %r39, %r38, 2146435072; + selp.b32 %r36, %r8, 0, %p53; + or.b32 %r37, %r36, 2146435072; setp.lt.s32 %p59, %r9, 0; - selp.b32 %r40, %r39, %r38, %p59; - mov.u32 %r41, 0; - mov.b64 %fd53, {%r41, %r40}; + selp.b32 %r38, %r37, %r36, %p59; + mov.u32 %r39, 0; + mov.b64 %fd53, {%r39, %r38}; bra.uni BB7_64; BB7_35: @@ -638,10 +621,10 @@ BB7_35: BB7_52: cvt.rni.s64.f64 %rd11, %fd1; cvt.rni.s64.f64 %rd12, %fd2; - cvt.u32.u64 %r27, %rd11; - cvt.u32.u64 %r28, %rd12; - or.b32 %r29, %r28, %r27; - setp.eq.s32 %p45, %r29, 0; + cvt.u32.u64 %r25, %rd11; + cvt.u32.u64 %r26, %rd12; + or.b32 %r27, %r26, %r25; + setp.eq.s32 %p45, %r27, 0; selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45; bra.uni BB7_76; @@ -701,17 +684,17 @@ BB7_46: { .reg .b32 %temp; - mov.b64 {%temp, %r24}, %fd55; + mov.b64 {%temp, %r22}, %fd55; } - 
and.b32 %r25, %r24, 2147483647; - setp.ne.s32 %p42, %r25, 2146435072; + and.b32 %r23, %r22, 2147483647; + setp.ne.s32 %p42, %r23, 2146435072; @%p42 bra BB7_50; { .reg .b32 %temp; - mov.b64 {%r26, %temp}, %fd55; + mov.b64 {%r24, %temp}, %fd55; } - setp.eq.s32 %p43, %r26, 0; + setp.eq.s32 %p43, %r24, 0; @%p43 bra BB7_76; BB7_50: @@ -781,10 +764,10 @@ BB7_33: BB7_34: cvt.rni.s64.f64 %rd13, %fd1; cvt.rni.s64.f64 %rd14, %fd2; - cvt.u32.u64 %r30, %rd13; - cvt.u32.u64 %r31, %rd14; - and.b32 %r32, %r31, %r30; - setp.eq.s32 %p46, %r32, 0; + cvt.u32.u64 %r28, %rd13; + cvt.u32.u64 %r29, %rd14; + and.b32 %r30, %r29, %r28; + setp.eq.s32 %p46, %r30, 0; selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46; bra.uni BB7_76; @@ -820,17 +803,17 @@ BB7_41: { .reg .b32 %temp; - mov.b64 {%temp, %r21}, %fd55; + mov.b64 {%temp, %r19}, %fd55; } - and.b32 %r22, %r21, 2147483647; - setp.ne.s32 %p36, %r22, 2146435072; + and.b32 %r20, %r19, 2147483647; + setp.ne.s32 %p36, %r20, 2146435072; @%p36 bra BB7_45; { .reg .b32 %temp; - mov.b64 {%r23, %temp}, %fd55; + mov.b64 {%r21, %temp}, %fd55; } - setp.eq.s32 %p37, %r23, 0; + setp.eq.s32 %p37, %r21, 0; @%p37 bra BB7_76; BB7_45: @@ -850,10 +833,10 @@ BB7_64: add.f64 %fd26, %fd1, %fd2; { .reg .b32 %temp; - mov.b64 {%temp, %r42}, %fd26; + mov.b64 {%temp, %r40}, %fd26; } - and.b32 %r43, %r42, 2146435072; - setp.ne.s32 %p60, %r43, 2146435072; + and.b32 %r41, %r40, 2146435072; + setp.ne.s32 %p60, %r41, 2146435072; mov.f64 %fd52, %fd25; @%p60 bra BB7_73; @@ -867,51 +850,51 @@ BB7_64: mov.f64 %fd52, %fd51; @%p62 bra BB7_73; - and.b32 %r44, %r9, 2147483647; - setp.ne.s32 %p63, %r44, 2146435072; + and.b32 %r42, %r9, 2147483647; + setp.ne.s32 %p63, %r42, 2146435072; @%p63 bra BB7_69; { .reg .b32 %temp; - mov.b64 {%r45, %temp}, %fd2; + mov.b64 {%r43, %temp}, %fd2; } - setp.eq.s32 %p64, %r45, 0; + setp.eq.s32 %p64, %r43, 0; @%p64 bra BB7_72; BB7_69: - and.b32 %r46, %r8, 2147483647; - setp.ne.s32 %p65, %r46, 2146435072; + and.b32 %r44, %r8, 2147483647; + 
setp.ne.s32 %p65, %r44, 2146435072; mov.f64 %fd49, %fd25; mov.f64 %fd52, %fd49; @%p65 bra BB7_73; { .reg .b32 %temp; - mov.b64 {%r47, %temp}, %fd1; + mov.b64 {%r45, %temp}, %fd1; } - setp.ne.s32 %p66, %r47, 0; + setp.ne.s32 %p66, %r45, 0; mov.f64 %fd52, %fd25; @%p66 bra BB7_73; - shr.s32 %r48, %r9, 31; - and.b32 %r49, %r48, -2146435072; - add.s32 %r50, %r49, 2146435072; - or.b32 %r51, %r50, -2147483648; - selp.b32 %r52, %r51, %r50, %p1; - mov.u32 %r53, 0; - mov.b64 %fd52, {%r53, %r52}; + shr.s32 %r46, %r9, 31; + and.b32 %r47, %r46, -2146435072; + add.s32 %r48, %r47, 2146435072; + or.b32 %r49, %r48, -2147483648; + selp.b32 %r50, %r49, %r48, %p1; + mov.u32 %r51, 0; + mov.b64 %fd52, {%r51, %r50}; bra.uni BB7_73; BB7_72: setp.gt.f64 %p67, %fd19, 0d3FF0000000000000; - selp.b32 %r54, 2146435072, 0, %p67; - xor.b32 %r55, %r54, 2146435072; + selp.b32 %r52, 2146435072, 0, %p67; + xor.b32 %r53, %r52, 2146435072; setp.lt.s32 %p68, %r9, 0; - selp.b32 %r56, %r55, %r54, %p68; + selp.b32 %r54, %r53, %r52, %p68; setp.eq.f64 %p69, %fd1, 0dBFF0000000000000; - selp.b32 %r57, 1072693248, %r56, %p69; - mov.u32 %r58, 0; - mov.b64 %fd52, {%r58, %r57}; + selp.b32 %r55, 1072693248, %r54, %p69; + mov.u32 %r56, 0; + mov.b64 %fd52, {%r56, %r55}; BB7_73: setp.eq.f64 %p70, %fd2, 0d0000000000000000; @@ -1825,7 +1808,7 @@ BB9_2: ) { .reg .pred %p<7>; - .reg .b32 %r<19>; + .reg .b32 %r<18>; .reg .f64 %fd<3>; .reg .b64 %rd<15>; @@ -1841,11 +1824,10 @@ BB9_2: mov.u32 %r8, %ntid.x; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %tid.x; - mad.lo.s32 %r1, %r8, %r9, %r10; - mov.u32 %r11, %ntid.y; - mov.u32 %r12, %ctaid.y; - mov.u32 %r13, %tid.y; - mad.lo.s32 %r2, %r11, %r12, %r13; + mad.lo.s32 %r11, %r8, %r9, %r10; + max.s32 %r12, %r4, %r6; + div.s32 %r1, %r11, %r12; + rem.s32 %r2, %r11, %r12; add.s32 %r3, %r6, %r4; setp.lt.s32 %p1, %r1, %r7; setp.lt.s32 %p2, %r2, %r4; @@ -1855,12 +1837,12 @@ BB9_2: BB10_1: cvta.to.global.u64 %rd5, %rd2; - mad.lo.s32 %r14, %r1, %r4, %r2; - mul.wide.s32 %rd6, %r14, 8; + mad.lo.s32 
%r13, %r1, %r4, %r2; + mul.wide.s32 %rd6, %r13, 8; add.s64 %rd7, %rd5, %rd6; ld.global.f64 %fd1, [%rd7]; - mad.lo.s32 %r15, %r1, %r3, %r2; - mul.wide.s32 %rd8, %r15, 8; + mad.lo.s32 %r14, %r1, %r3, %r2; + mul.wide.s32 %rd8, %r14, 8; add.s64 %rd9, %rd1, %rd8; st.global.f64 [%rd9], %fd1; @@ -1873,13 +1855,13 @@ BB10_2: BB10_3: cvta.to.global.u64 %rd10, %rd3; - mad.lo.s32 %r16, %r1, %r6, %r2; - mul.wide.s32 %rd11, %r16, 8; + mad.lo.s32 %r15, %r1, %r6, %r2; + mul.wide.s32 %rd11, %r15, 8; add.s64 %rd12, %rd10, %rd11; ld.global.f64 %fd2, [%rd12]; - mad.lo.s32 %r17, %r1, %r3, %r4; - add.s32 %r18, %r17, %r2; - mul.wide.s32 %rd13, %r18, 8; + add.s32 %r16, %r2, %r4; + mad.lo.s32 %r17, %r1, %r3, %r16; + mul.wide.s32 %rd13, %r17, 8; add.s64 %rd14, %rd1, %rd13; st.global.f64 [%rd14], %fd2; @@ -1899,7 +1881,7 @@ BB10_4: ) { .reg .pred %p<7>; - .reg .b32 %r<17>; + .reg .b32 %r<16>; .reg .f64 %fd<3>; .reg .b64 %rd<14>; @@ -1915,11 +1897,10 @@ BB10_4: mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; - mad.lo.s32 %r1, %r7, %r8, %r9; - mov.u32 %r10, %ntid.y; - mov.u32 %r11, %ctaid.y; - mov.u32 %r12, %tid.y; - mad.lo.s32 %r2, %r10, %r11, %r12; + mad.lo.s32 %r10, %r7, %r8, %r9; + max.s32 %r11, %r4, %r6; + div.s32 %r1, %r10, %r11; + rem.s32 %r2, %r10, %r11; setp.lt.s32 %p1, %r1, %r3; setp.lt.s32 %p2, %r2, %r4; and.pred %p3, %p1, %p2; @@ -1928,8 +1909,8 @@ BB10_4: BB11_1: cvta.to.global.u64 %rd5, %rd2; - mad.lo.s32 %r13, %r1, %r4, %r2; - mul.wide.s32 %rd6, %r13, 8; + mad.lo.s32 %r12, %r1, %r4, %r2; + mul.wide.s32 %rd6, %r12, 8; add.s64 %rd7, %rd5, %rd6; ld.global.f64 %fd1, [%rd7]; add.s64 %rd8, %rd1, %rd6; @@ -1944,13 +1925,13 @@ BB11_2: BB11_3: cvta.to.global.u64 %rd9, %rd3; - mad.lo.s32 %r14, %r1, %r6, %r2; - mul.wide.s32 %rd10, %r14, 8; + mad.lo.s32 %r13, %r1, %r6, %r2; + mul.wide.s32 %rd10, %r13, 8; add.s64 %rd11, %rd9, %rd10; ld.global.f64 %fd2, [%rd11]; - add.s32 %r15, %r1, %r3; - mad.lo.s32 %r16, %r15, %r4, %r2; - mul.wide.s32 %rd12, %r16, 8; + add.s32 %r14, %r1, 
%r3; + mad.lo.s32 %r15, %r14, %r4, %r2; + mul.wide.s32 %rd12, %r15, 8; add.s64 %rd13, %rd1, %rd12; st.global.f64 [%rd13], %fd2; http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java index a2d361c..169c3bb 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java @@ -828,9 +828,6 @@ public class ParForProgramBlock extends ForProgramBlock // Frees up the GPUContexts used in the threaded Parfor and sets // the main thread to use the GPUContext if (DMLScript.USE_ACCELERATOR) { - for (int i = 0; i < _numThreads; i++) { - workers[i].getExecutionContext().setGPUContexts(null); - } ec.getGPUContext(0).initializeThread(); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java index 636b1f8..f77c22e 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java @@ -25,6 +25,7 @@ import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.CompilerConfig; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; 
import org.apache.sysml.runtime.controlprogram.parfor.stat.Stat; import org.apache.sysml.runtime.controlprogram.parfor.stat.StatisticMonitor; @@ -82,8 +83,15 @@ public class LocalParWorker extends ParWorker implements Runnable } // Initialize this GPUContext to this thread - if (DMLScript.USE_ACCELERATOR) - _ec.getGPUContext(0).initializeThread(); + if (DMLScript.USE_ACCELERATOR) { + try { + _ec.getGPUContext(0).initializeThread(); + } catch(DMLRuntimeException e) { + LOG.error("Error executing task because of failure in GPU backend: ",e); + LOG.error("Stopping LocalParWorker."); + return; + } + } //setup compiler config for worker thread ConfigurationManager.setLocalConfig(_cconf); http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java index 3cd2633..77c48a7 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java @@ -169,7 +169,6 @@ public class FunctionCallCPInstruction extends CPInstruction ExecutionContext fn_ec = ExecutionContextFactory.createContext(false, ec.getProgram()); if (DMLScript.USE_ACCELERATOR) { fn_ec.setGPUContexts(ec.getGPUContexts()); - ec.setGPUContexts(null); fn_ec.getGPUContext(0).initializeThread(); } fn_ec.setVariables(functionVariables); @@ -205,12 +204,6 @@ public class FunctionCallCPInstruction extends CPInstruction // Unpin the pinned variables ec.unpinVariables(_boundInputParamNames, pinStatus); - if (DMLScript.USE_ACCELERATOR) { - ec.setGPUContexts(fn_ec.getGPUContexts()); - fn_ec.setGPUContexts(null); - ec.getGPUContext(0).initializeThread(); - } - // add 
the updated binding for each return variable to the variables in original symbol table for (int i=0; i< fpb.getOutputParams().size(); i++){ http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java index ef000c2..5a0a772 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java @@ -89,16 +89,34 @@ public class ExecutionConfig { * @return execution configuration * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static ExecutionConfig getConfigForSimpleMatrixOperations(int rlen, int clen) throws DMLRuntimeException { + public static ExecutionConfig getConfigForMatrixOperations(int rlen, int clen) throws DMLRuntimeException { int deviceNumber = 0; int maxBlockDim = getMaxBlockDim(deviceNumber); int blockDimX = (int) Math.min(maxBlockDim, rlen); int gridDimX = (int) Math.ceil((double) rlen / blockDimX); int blockDimY = (int) Math.min(Math.floor(((double) maxBlockDim) / blockDimX), clen); int gridDimY = (int) Math.ceil((double) clen / blockDimY); + if (gridDimY > 65535) + throw new DMLRuntimeException("Internal Error: gridDimY must be less than 65535 for all supported CUDA compute capabilites!"); return new ExecutionConfig(gridDimX, gridDimY, blockDimX, blockDimY); } + /** + * Use this for simple vector operations and use following in the kernel + * <code> + * int index = blockIdx.x * blockDim.x + threadIdx.x + * </code> + * <p> + * @param rlen number of rows + * @param clen number of columns + * @return execution configuration + * @throws DMLRuntimeException if DMLRuntimeException 
occurs + */ + public static ExecutionConfig getConfigForSimpleMatrixOperations(int rlen, int clen) throws DMLRuntimeException { + return getConfigForSimpleVectorOperations(rlen * clen); + } + + public ExecutionConfig(int gridDimX, int blockDimX) { this.gridDimX = gridDimX; this.blockDimX = blockDimX; @@ -134,4 +152,10 @@ public class ExecutionConfig { return ret; } + @Override + public String toString() { + return "ExecutionConfig{" + "gridDimX=" + gridDimX + ", gridDimY=" + gridDimY + ", gridDimZ=" + gridDimZ + + ", blockDimX=" + blockDimX + ", blockDimY=" + blockDimY + ", blockDimZ=" + blockDimZ + + ", sharedMemBytes=" + sharedMemBytes + '}'; + } } http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index b3c19ef..4c0562d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -108,27 +108,27 @@ public class GPUContext { /** * cudnnHandle for Deep Neural Network operations on the GPU */ - private cudnnHandle cudnnHandle; + private final ThreadLocal<cudnnHandle> cudnnHandle = new ThreadLocal<>(); /** * cublasHandle for BLAS operations on the GPU */ - private cublasHandle cublasHandle; + private final ThreadLocal<cublasHandle> cublasHandle = new ThreadLocal<>(); /** * cusparseHandle for certain sparse BLAS operations on the GPU */ - private cusparseHandle cusparseHandle; + private final ThreadLocal<cusparseHandle> cusparseHandle = new ThreadLocal<>(); /** * cusolverDnHandle for invoking solve() function on dense matrices on the GPU */ - private cusolverDnHandle cusolverDnHandle; + private final 
ThreadLocal<cusolverDnHandle> cusolverDnHandle = new ThreadLocal<>(); /** * cusolverSpHandle for invoking solve() function on sparse matrices on the GPU */ - private cusolverSpHandle cusolverSpHandle; + private final ThreadLocal<cusolverSpHandle> cusolverSpHandle = new ThreadLocal<>(); /** * to launch custom CUDA kernel, specific to the active GPU for this GPUContext */ - private JCudaKernels kernels; + private final ThreadLocal<JCudaKernels> kernels = new ThreadLocal<>(); protected GPUContext(int deviceNum) throws DMLRuntimeException { this.deviceNum = deviceNum; @@ -140,28 +140,51 @@ public class GPUContext { long total[] = { 0 }; cudaMemGetInfo(free, total); - long start = System.nanoTime(); - cudnnHandle = new cudnnHandle(); - cudnnCreate(cudnnHandle); - cublasHandle = new cublasHandle(); - cublasCreate(cublasHandle); + long start = -1; + if (DMLScript.STATISTICS) + start = System.nanoTime(); + initializeCudaLibraryHandles(); + + if (DMLScript.STATISTICS) + GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start; + + LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " + + this); + + } + + private void initializeCudaLibraryHandles() throws DMLRuntimeException { + if (cudnnHandle.get() == null) { + cudnnHandle.set(new cudnnHandle()); + cudnnCreate(cudnnHandle.get()); + } + + if (cublasHandle.get() == null) { + cublasHandle.set(new cublasHandle()); + cublasCreate(cublasHandle.get()); + } // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot. 
// cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE); - cusparseHandle = new cusparseHandle(); - cusparseCreate(cusparseHandle); - cusolverDnHandle = new cusolverDnHandle(); - cusolverDnCreate(cusolverDnHandle); - cusolverSpHandle = new cusolverSpHandle(); - cusolverSpCreate(cusolverSpHandle); + if (cusparseHandle.get() == null) { + cusparseHandle.set(new cusparseHandle()); + cusparseCreate(cusparseHandle.get()); + } - kernels = new JCudaKernels(deviceNum); + if (cusolverDnHandle.get() == null) { + cusolverDnHandle.set(new cusolverDnHandle()); + cusolverDnCreate(cusolverDnHandle.get()); + } - GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start; - LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " - + this); + if (cusolverSpHandle.get() == null) { + cusolverSpHandle.set(new cusolverSpHandle()); + cusolverSpCreate(cusolverSpHandle.get()); + } + if (kernels.get() == null) { + kernels.set(new JCudaKernels()); + } } public static int cudaGetDevice() { @@ -181,8 +204,9 @@ public class GPUContext { * If in a multi-threaded env like parfor, this method must be called when in the * appropriate thread */ - public void initializeThread() { + public void initializeThread() throws DMLRuntimeException { cudaSetDevice(deviceNum); + initializeCudaLibraryHandles(); } /** @@ -595,27 +619,27 @@ public class GPUContext { } public cudnnHandle getCudnnHandle() { - return cudnnHandle; + return cudnnHandle.get(); } public cublasHandle getCublasHandle() { - return cublasHandle; + return cublasHandle.get(); } public cusparseHandle getCusparseHandle() { - return cusparseHandle; + return cusparseHandle.get(); } public cusolverDnHandle getCusolverDnHandle() { - return cusolverDnHandle; + return cusolverDnHandle.get(); } public cusolverSpHandle getCusolverSpHandle() { - return cusolverSpHandle; + return cusolverSpHandle.get(); } public JCudaKernels getKernels() { - return kernels; 
+ return kernels.get(); } /** @@ -626,15 +650,11 @@ public class GPUContext { public void destroy() throws DMLRuntimeException { LOG.trace("GPU : this context was destroyed, this = " + this.toString()); clearMemory(); - cudnnDestroy(cudnnHandle); - cublasDestroy(cublasHandle); - cusparseDestroy(cusparseHandle); - cusolverDnDestroy(cusolverDnHandle); - cusolverSpDestroy(cusolverSpHandle); - cudnnHandle = null; - cublasHandle = null; - cusparseHandle = null; - + cudnnDestroy(cudnnHandle.get()); + cublasDestroy(cublasHandle.get()); + cusparseDestroy(cusparseHandle.get()); + cusolverDnDestroy(cusolverDnHandle.get()); + cusolverSpDestroy(cusolverSpHandle.get()); } /** http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java index a9b1333..e030180 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java @@ -130,7 +130,7 @@ public class GPUContextPool { // initially available memory is set to the GPU with the lowest memory // This is because at runtime, we wouldn't know which GPU a certain // operation gets scheduled on - long minAvailableMemory = Integer.MAX_VALUE; + long minAvailableMemory = Long.MAX_VALUE; for (GPUContext gCtx : pool) { gCtx.initializeThread(); minAvailableMemory = Math.min(minAvailableMemory, gCtx.getAvailableMemory()); http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java index 246aecc..9cfab2b 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java @@ -49,17 +49,14 @@ public class JCudaKernels { private final static String ptxFileName = "/kernels/SystemML.ptx"; private HashMap<String, CUfunction> kernels = new HashMap<String, CUfunction>(); private CUmodule module; - // private final int deviceNum; /** * Loads the kernels in the file ptxFileName. Though cubin files are also supported, we will stick with * ptx file as they are target-independent similar to Java's .class files. * - * @param deviceNum the device number for which to initiate the driver API * @throws DMLRuntimeException if DMLRuntimeException occurs */ - JCudaKernels(int deviceNum) throws DMLRuntimeException { - // this.deviceNum = deviceNum; + JCudaKernels() throws DMLRuntimeException { module = new CUmodule(); // Load the kernels specified in the ptxFileName file checkResult(cuModuleLoadDataEx(module, initKernels(ptxFileName), 0, new int[0], Pointer.to(new int[0]))); http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/test/java/org/apache/sysml/test/gpu/GPUTests.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java index 195968a..d40b7a1 100644 --- a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java +++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java @@ -98,6 +98,24 @@ public abstract class GPUTests extends AutomatedTestBase { } /** + * Generates an input matrix which is a sequence of integers + * @param spark valid instance of {@link SparkSession} + * @param m number of rows + * @param n number of columns + * 
@return a matrix with a sequence of integers + */ + protected Matrix generateIntegerSequenceMatrix(SparkSession spark, int m, int n) { + MLContext genMLC = new MLContext(spark); + String scriptStr; + scriptStr = "temp = seq(1, " + (m*n) + ")" + + "in1 = matrix(temp, rows=" + m + ", cols=" + n + ")"; + Script generateScript = ScriptFactory.dmlFromString(scriptStr).out("in1"); + Matrix in1 = genMLC.execute(generateScript).getMatrix("in1"); + genMLC.close(); + return in1; + } + + /** * Generates a random input matrix with a given size and sparsity * * @param spark valid instance of {@link SparkSession} http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java index f7c7851..81bc254 100644 --- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java +++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java @@ -153,6 +153,7 @@ public class MatrixMultiplicationOpTest extends GPUTests { for (int j = 0; j < sparsities.length; j++) { int side = sizes[i]; double sparsity = sparsities[j]; + System.out.println("Transpose Self matrix multiply, size = " + side + ", sparsity = " + sparsity); Matrix X = generateInputMatrix(spark, side, side, sparsity, seed); HashMap<String, Object> inputs = new HashMap<>(); inputs.put("X", X);