Repository: systemml
Updated Branches:
  refs/heads/master 98a9d653d -> 815ca4f2a


[MINOR] bug fixes in the GPU backend

- Each thread is assigned a cuda library handle
- JCudaKernels is also made thread safe
- Removed setting GPUContext to null
- Bug fix in initial gpu budget estimate
- Cuda Kernels use blockId.x and threadId.x only

Closes #607


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/815ca4f2
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/815ca4f2
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/815ca4f2

Branch: refs/heads/master
Commit: 815ca4f2aedcbe491d10a873db99a9b5e6f29226
Parents: 98a9d65
Author: Nakul Jindal <naku...@gmail.com>
Authored: Tue Aug 8 13:29:11 2017 -0700
Committer: Nakul Jindal <naku...@gmail.com>
Committed: Tue Aug 8 13:29:11 2017 -0700

----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu                |  54 +--
 src/main/cpp/kernels/SystemML.ptx               | 333 +++++++++----------
 .../controlprogram/ParForProgramBlock.java      |   3 -
 .../controlprogram/parfor/LocalParWorker.java   |  12 +-
 .../cp/FunctionCallCPInstruction.java           |   7 -
 .../gpu/context/ExecutionConfig.java            |  26 +-
 .../instructions/gpu/context/GPUContext.java    |  94 +++---
 .../gpu/context/GPUContextPool.java             |   2 +-
 .../instructions/gpu/context/JCudaKernels.java  |   5 +-
 .../org/apache/sysml/test/gpu/GPUTests.java     |  18 +
 .../test/gpu/MatrixMultiplicationOpTest.java    |   1 +
 11 files changed, 303 insertions(+), 252 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 297269f..dcd64b2 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -35,12 +35,13 @@ nvcc -ptx -arch=sm_30 SystemML.cu
  */
 extern "C"
 __global__ void copy_u2l_dense(double* ret, int dim, int N) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / dim;
+       int iy = tid % dim;
        int id_dest = iy * dim + ix;
        if(iy > ix && id_dest < N) {
                // TODO: Potential to reduce the number of threads by half
-               int id_src = ix * dim + iy;
+               int id_src = tid;
                ret[id_dest] = ret[id_src];
        }
 }
@@ -104,8 +105,9 @@ __forceinline__ __device__ double binaryOp(double x, double 
y, int op) {
 
 extern "C"
 __global__ void relu(double* A,  double* ret, int rlen, int clen) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / clen;
+       int iy = tid % clen;
        if(ix < rlen && iy < clen) {
                int index = ix * clen + iy;
                ret[index] = max(0.0, A[index]);
@@ -115,8 +117,9 @@ __global__ void relu(double* A,  double* ret, int rlen, int 
clen) {
 // This method computes the backpropagation errors for previous layer of relu 
operation
 extern "C"
 __global__ void relu_backward(double* X,  double* dout, double* ret, int rlen, 
int clen) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / clen;
+       int iy = tid % clen;
        if(ix < rlen && iy < clen) {
                int index = ix * clen + iy;
                ret[index] = X[index] > 0 ?  dout[index] : 0;
@@ -129,8 +132,9 @@ __global__ void relu_backward(double* X,  double* dout, 
double* ret, int rlen, i
 // This operation is often followed by conv2d and hence we have introduced 
bias_add(input, bias) built-in function
 extern "C"
 __global__ void bias_add(double* input,  double* bias, double* ret, int rlen, 
int clen, int PQ) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / clen;
+       int iy = tid % clen;
        if(ix < rlen && iy < clen) {
                int index = ix * clen + iy;
                int biasIndex = iy / PQ;
@@ -141,8 +145,9 @@ __global__ void bias_add(double* input,  double* bias, 
double* ret, int rlen, in
 // Performs the operation "ret <- A + alpha*B", where B is a vector
 extern "C"
 __global__ void daxpy_matrix_vector(double* A,  double* B, double alpha, 
double* ret, int rlenA, int clenA, int rlenB, int clenB) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / clenA;
+       int iy = tid % clenA;
        if(ix < rlenA && iy < clenA) {
                int index = ix * clenA + iy;
                if(rlenB == 1) {
@@ -157,8 +162,9 @@ __global__ void daxpy_matrix_vector(double* A,  double* B, 
double alpha, double*
 // Performs similar operation as bias_add except elementwise multiplication 
instead of add
 extern "C"
 __global__ void bias_multiply(double* input,  double* bias, double* ret, int 
rlen, int clen, int PQ) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / clen;
+       int iy = tid % clen;
        if(ix < rlen && iy < clen) {
                int index = ix * clen + iy;
                int biasIndex = iy / PQ;
@@ -169,8 +175,9 @@ __global__ void bias_multiply(double* input,  double* bias, 
double* ret, int rle
 // Compares the value and set
 extern "C"
 __global__ void compare_and_set(double* A,  double* ret, int rlen, int clen, 
double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double 
ifGreaterThanVal) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / clen;
+       int iy = tid % clen;
        int index = ix * clen + iy;
        if(ix < rlen && iy < clen) {
                if(abs(A[index]-compareVal) < tol)
@@ -199,8 +206,9 @@ __global__ void compare_and_set(double* A,  double* ret, 
int rlen, int clen, dou
 extern "C"
 __global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
        int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) 
{
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / maxClen;
+       int iy = tid % maxClen;
 
        if(ix < maxRlen && iy < maxClen) {
                int outIndex = ix * maxClen + iy;
@@ -273,8 +281,10 @@ __global__ void fill(double* A, double scalar, int lenA) {
  */
 extern "C"
 __global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, 
int rowsB, int colsB) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int maxClen = max(colsA, colsB);
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / maxClen;
+       int iy = tid % maxClen;
 
        int colsC = colsA + colsB;
        int rowsC = rowsA;
@@ -310,8 +320,10 @@ __global__ void cbind(double *A, double *B, double *C, int 
rowsA, int colsA, int
  */
 extern "C"
 __global__ void rbind(double *A, double *B, double *C, int rowsA, int colsA, 
int rowsB, int colsB) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       int maxClen = max(colsA, colsB);
+       int tid = blockIdx.x * blockDim.x + threadIdx.x;
+       int ix = tid / maxClen;
+       int iy = tid % maxClen;
 
        int rowsC = rowsA + rowsB;
        int colsC = colsA;

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx 
b/src/main/cpp/kernels/SystemML.ptx
index 6884d5b..7778317 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -34,36 +34,33 @@
 )
 {
        .reg .pred      %p<4>;
-       .reg .b32       %r<13>;
+       .reg .b32       %r<10>;
        .reg .f64       %fd<2>;
        .reg .b64       %rd<7>;
 
 
        ld.param.u64    %rd1, [copy_u2l_dense_param_0];
-       ld.param.u32    %r4, [copy_u2l_dense_param_1];
-       ld.param.u32    %r5, [copy_u2l_dense_param_2];
-       mov.u32         %r6, %ntid.x;
-       mov.u32         %r7, %ctaid.x;
-       mov.u32         %r8, %tid.x;
-       mad.lo.s32      %r1, %r6, %r7, %r8;
-       mov.u32         %r9, %ntid.y;
-       mov.u32         %r10, %ctaid.y;
-       mov.u32         %r11, %tid.y;
-       mad.lo.s32      %r2, %r9, %r10, %r11;
-       mad.lo.s32      %r3, %r2, %r4, %r1;
-       setp.gt.s32     %p1, %r2, %r1;
-       setp.lt.s32     %p2, %r3, %r5;
+       ld.param.u32    %r3, [copy_u2l_dense_param_1];
+       ld.param.u32    %r4, [copy_u2l_dense_param_2];
+       mov.u32         %r5, %ntid.x;
+       mov.u32         %r6, %ctaid.x;
+       mov.u32         %r7, %tid.x;
+       mad.lo.s32      %r1, %r5, %r6, %r7;
+       div.s32         %r8, %r1, %r3;
+       rem.s32         %r9, %r1, %r3;
+       mad.lo.s32      %r2, %r9, %r3, %r8;
+       setp.gt.s32     %p1, %r9, %r8;
+       setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
        @!%p3 bra       BB0_2;
        bra.uni         BB0_1;
 
 BB0_1:
        cvta.to.global.u64      %rd2, %rd1;
-       mad.lo.s32      %r12, %r1, %r4, %r2;
-       mul.wide.s32    %rd3, %r12, 8;
+       mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        ld.global.f64   %fd1, [%rd4];
-       mul.wide.s32    %rd5, %r3, 8;
+       mul.wide.s32    %rd5, %r2, 8;
        add.s64         %rd6, %rd2, %rd5;
        st.global.f64   [%rd6], %fd1;
 
@@ -80,7 +77,7 @@ BB0_2:
 )
 {
        .reg .pred      %p<4>;
-       .reg .b32       %r<12>;
+       .reg .b32       %r<10>;
        .reg .f64       %fd<4>;
        .reg .b64       %rd<8>;
 
@@ -93,20 +90,18 @@ BB0_2:
        mov.u32         %r6, %ntid.x;
        mov.u32         %r7, %tid.x;
        mad.lo.s32      %r1, %r6, %r5, %r7;
-       mov.u32         %r8, %ntid.y;
-       mov.u32         %r9, %ctaid.y;
-       mov.u32         %r10, %tid.y;
-       mad.lo.s32      %r2, %r8, %r9, %r10;
-       setp.lt.s32     %p1, %r1, %r4;
-       setp.lt.s32     %p2, %r2, %r3;
+       div.s32         %r2, %r1, %r3;
+       setp.lt.s32     %p1, %r2, %r4;
+       setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
        @!%p3 bra       BB1_2;
        bra.uni         BB1_1;
 
 BB1_1:
+       rem.s32         %r8, %r1, %r3;
        cvta.to.global.u64      %rd3, %rd1;
-       mad.lo.s32      %r11, %r1, %r3, %r2;
-       mul.wide.s32    %rd4, %r11, 8;
+       mad.lo.s32      %r9, %r2, %r3, %r8;
+       mul.wide.s32    %rd4, %r9, 8;
        add.s64         %rd5, %rd3, %rd4;
        ld.global.f64   %fd1, [%rd5];
        mov.f64         %fd2, 0d0000000000000000;
@@ -129,7 +124,7 @@ BB1_2:
 )
 {
        .reg .pred      %p<5>;
-       .reg .b32       %r<12>;
+       .reg .b32       %r<10>;
        .reg .f64       %fd<6>;
        .reg .b64       %rd<14>;
 
@@ -143,21 +138,19 @@ BB1_2:
        mov.u32         %r6, %ctaid.x;
        mov.u32         %r7, %tid.x;
        mad.lo.s32      %r1, %r5, %r6, %r7;
-       mov.u32         %r8, %ntid.y;
-       mov.u32         %r9, %ctaid.y;
-       mov.u32         %r10, %tid.y;
-       mad.lo.s32      %r2, %r8, %r9, %r10;
-       setp.lt.s32     %p1, %r1, %r4;
-       setp.lt.s32     %p2, %r2, %r3;
+       div.s32         %r2, %r1, %r3;
+       setp.lt.s32     %p1, %r2, %r4;
+       setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
        @!%p3 bra       BB2_4;
        bra.uni         BB2_1;
 
 BB2_1:
+       rem.s32         %r8, %r1, %r3;
        cvta.to.global.u64      %rd5, %rd2;
-       mad.lo.s32      %r11, %r1, %r3, %r2;
-       cvt.s64.s32     %rd1, %r11;
-       mul.wide.s32    %rd6, %r11, 8;
+       mad.lo.s32      %r9, %r2, %r3, %r8;
+       cvt.s64.s32     %rd1, %r9;
+       mul.wide.s32    %rd6, %r9, 8;
        add.s64         %rd7, %rd5, %rd6;
        ld.global.f64   %fd4, [%rd7];
        mov.f64         %fd5, 0d0000000000000000;
@@ -190,7 +183,7 @@ BB2_4:
 )
 {
        .reg .pred      %p<4>;
-       .reg .b32       %r<14>;
+       .reg .b32       %r<12>;
        .reg .f64       %fd<4>;
        .reg .b64       %rd<12>;
 
@@ -205,24 +198,22 @@ BB2_4:
        mov.u32         %r7, %ntid.x;
        mov.u32         %r8, %tid.x;
        mad.lo.s32      %r1, %r7, %r6, %r8;
-       mov.u32         %r9, %ntid.y;
-       mov.u32         %r10, %ctaid.y;
-       mov.u32         %r11, %tid.y;
-       mad.lo.s32      %r2, %r9, %r10, %r11;
-       setp.lt.s32     %p1, %r1, %r5;
-       setp.lt.s32     %p2, %r2, %r3;
+       div.s32         %r2, %r1, %r3;
+       setp.lt.s32     %p1, %r2, %r5;
+       setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
        @!%p3 bra       BB3_2;
        bra.uni         BB3_1;
 
 BB3_1:
+       rem.s32         %r9, %r1, %r3;
        cvta.to.global.u64      %rd4, %rd1;
-       mad.lo.s32      %r12, %r1, %r3, %r2;
-       mul.wide.s32    %rd5, %r12, 8;
+       mad.lo.s32      %r10, %r2, %r3, %r9;
+       mul.wide.s32    %rd5, %r10, 8;
        add.s64         %rd6, %rd4, %rd5;
-       div.s32         %r13, %r2, %r4;
+       div.s32         %r11, %r9, %r4;
        cvta.to.global.u64      %rd7, %rd2;
-       mul.wide.s32    %rd8, %r13, 8;
+       mul.wide.s32    %rd8, %r11, 8;
        add.s64         %rd9, %rd7, %rd8;
        ld.global.f64   %fd1, [%rd9];
        ld.global.f64   %fd2, [%rd6];
@@ -248,7 +239,7 @@ BB3_2:
 )
 {
        .reg .pred      %p<5>;
-       .reg .b32       %r<13>;
+       .reg .b32       %r<11>;
        .reg .f64       %fd<7>;
        .reg .b64       %rd<14>;
 
@@ -264,22 +255,20 @@ BB3_2:
        mov.u32         %r6, %ntid.x;
        mov.u32         %r7, %ctaid.x;
        mov.u32         %r8, %tid.x;
-       mad.lo.s32      %r1, %r6, %r7, %r8;
-       mov.u32         %r9, %ntid.y;
-       mov.u32         %r10, %ctaid.y;
-       mov.u32         %r11, %tid.y;
-       mad.lo.s32      %r2, %r9, %r10, %r11;
+       mad.lo.s32      %r9, %r6, %r7, %r8;
+       div.s32         %r1, %r9, %r3;
+       rem.s32         %r2, %r9, %r3;
        setp.lt.s32     %p1, %r1, %r5;
-       setp.lt.s32     %p2, %r2, %r3;
+       setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
        @!%p3 bra       BB4_4;
        bra.uni         BB4_1;
 
 BB4_1:
        cvta.to.global.u64      %rd6, %rd4;
-       mad.lo.s32      %r12, %r1, %r3, %r2;
+       mad.lo.s32      %r10, %r1, %r3, %r2;
        cvta.to.global.u64      %rd7, %rd3;
-       mul.wide.s32    %rd8, %r12, 8;
+       mul.wide.s32    %rd8, %r10, 8;
        add.s64         %rd9, %rd7, %rd8;
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd2, %rd6, %rd8;
@@ -317,7 +306,7 @@ BB4_4:
 )
 {
        .reg .pred      %p<4>;
-       .reg .b32       %r<14>;
+       .reg .b32       %r<12>;
        .reg .f64       %fd<4>;
        .reg .b64       %rd<12>;
 
@@ -332,24 +321,22 @@ BB4_4:
        mov.u32         %r7, %ntid.x;
        mov.u32         %r8, %tid.x;
        mad.lo.s32      %r1, %r7, %r6, %r8;
-       mov.u32         %r9, %ntid.y;
-       mov.u32         %r10, %ctaid.y;
-       mov.u32         %r11, %tid.y;
-       mad.lo.s32      %r2, %r9, %r10, %r11;
-       setp.lt.s32     %p1, %r1, %r5;
-       setp.lt.s32     %p2, %r2, %r3;
+       div.s32         %r2, %r1, %r3;
+       setp.lt.s32     %p1, %r2, %r5;
+       setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
        @!%p3 bra       BB5_2;
        bra.uni         BB5_1;
 
 BB5_1:
+       rem.s32         %r9, %r1, %r3;
        cvta.to.global.u64      %rd4, %rd1;
-       mad.lo.s32      %r12, %r1, %r3, %r2;
-       mul.wide.s32    %rd5, %r12, 8;
+       mad.lo.s32      %r10, %r2, %r3, %r9;
+       mul.wide.s32    %rd5, %r10, 8;
        add.s64         %rd6, %rd4, %rd5;
-       div.s32         %r13, %r2, %r4;
+       div.s32         %r11, %r9, %r4;
        cvta.to.global.u64      %rd7, %rd2;
-       mul.wide.s32    %rd8, %r13, 8;
+       mul.wide.s32    %rd8, %r11, 8;
        add.s64         %rd9, %rd7, %rd8;
        ld.global.f64   %fd1, [%rd9];
        ld.global.f64   %fd2, [%rd6];
@@ -376,7 +363,7 @@ BB5_2:
 )
 {
        .reg .pred      %p<6>;
-       .reg .b32       %r<12>;
+       .reg .b32       %r<10>;
        .reg .f64       %fd<9>;
        .reg .b64       %rd<8>;
 
@@ -394,13 +381,11 @@ BB5_2:
        mov.u32         %r5, %ntid.x;
        mov.u32         %r6, %tid.x;
        mad.lo.s32      %r7, %r5, %r4, %r6;
-       mov.u32         %r8, %ntid.y;
-       mov.u32         %r9, %ctaid.y;
-       mov.u32         %r10, %tid.y;
-       mad.lo.s32      %r11, %r8, %r9, %r10;
-       mad.lo.s32      %r1, %r7, %r3, %r11;
-       setp.lt.s32     %p1, %r7, %r2;
-       setp.lt.s32     %p2, %r11, %r3;
+       div.s32         %r8, %r7, %r3;
+       rem.s32         %r9, %r7, %r3;
+       mad.lo.s32      %r1, %r8, %r3, %r9;
+       setp.lt.s32     %p1, %r8, %r2;
+       setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
        @!%p3 bra       BB6_6;
        bra.uni         BB6_1;
@@ -451,7 +436,7 @@ BB6_6:
 )
 {
        .reg .pred      %p<73>;
-       .reg .b32       %r<68>;
+       .reg .b32       %r<66>;
        .reg .f64       %fd<56>;
        .reg .b64       %rd<19>;
 
@@ -467,13 +452,11 @@ BB6_6:
        mov.u32         %r15, %ntid.x;
        mov.u32         %r16, %ctaid.x;
        mov.u32         %r17, %tid.x;
-       mad.lo.s32      %r1, %r15, %r16, %r17;
-       mov.u32         %r18, %ntid.y;
-       mov.u32         %r19, %ctaid.y;
-       mov.u32         %r20, %tid.y;
-       mad.lo.s32      %r2, %r18, %r19, %r20;
+       mad.lo.s32      %r18, %r15, %r16, %r17;
+       div.s32         %r1, %r18, %r10;
+       rem.s32         %r2, %r18, %r10;
        setp.lt.s32     %p2, %r1, %r14;
-       setp.lt.s32     %p3, %r2, %r10;
+       setp.gt.s32     %p3, %r10, -1;
        and.pred        %p4, %p2, %p3;
        @!%p4 bra       BB7_77;
        bra.uni         BB7_1;
@@ -481,34 +464,34 @@ BB6_6:
 BB7_1:
        mad.lo.s32      %r3, %r1, %r10, %r2;
        setp.eq.s32     %p5, %r11, 1;
-       mov.u32         %r66, %r1;
+       mov.u32         %r64, %r1;
        @%p5 bra        BB7_5;
 
        setp.ne.s32     %p6, %r11, 2;
-       mov.u32         %r67, %r3;
+       mov.u32         %r65, %r3;
        @%p6 bra        BB7_4;
 
-       mov.u32         %r67, %r2;
+       mov.u32         %r65, %r2;
 
 BB7_4:
-       mov.u32         %r61, %r67;
-       mov.u32         %r4, %r61;
-       mov.u32         %r66, %r4;
+       mov.u32         %r59, %r65;
+       mov.u32         %r4, %r59;
+       mov.u32         %r64, %r4;
 
 BB7_5:
-       mov.u32         %r5, %r66;
+       mov.u32         %r5, %r64;
        setp.eq.s32     %p7, %r12, 1;
-       mov.u32         %r64, %r1;
+       mov.u32         %r62, %r1;
        @%p7 bra        BB7_9;
 
        setp.ne.s32     %p8, %r12, 2;
-       mov.u32         %r65, %r3;
+       mov.u32         %r63, %r3;
        @%p8 bra        BB7_8;
 
-       mov.u32         %r65, %r2;
+       mov.u32         %r63, %r2;
 
 BB7_8:
-       mov.u32         %r64, %r65;
+       mov.u32         %r62, %r63;
 
 BB7_9:
        cvta.to.global.u64      %rd5, %rd3;
@@ -516,7 +499,7 @@ BB7_9:
        mul.wide.s32    %rd7, %r5, 8;
        add.s64         %rd8, %rd6, %rd7;
        ld.global.f64   %fd1, [%rd8];
-       mul.wide.s32    %rd9, %r64, 8;
+       mul.wide.s32    %rd9, %r62, 8;
        add.s64         %rd10, %rd5, %rd9;
        ld.global.f64   %fd2, [%rd10];
        mov.f64         %fd55, 0d7FEFFFFFFFFFFFFF;
@@ -570,10 +553,10 @@ BB7_58:
        .reg .b32 %temp; 
        mov.b64         {%temp, %r9}, %fd2;
        }
-       bfe.u32         %r33, %r9, 20, 11;
-       add.s32         %r34, %r33, -1012;
+       bfe.u32         %r31, %r9, 20, 11;
+       add.s32         %r32, %r31, -1012;
        mov.b64          %rd15, %fd2;
-       shl.b64         %rd1, %rd15, %r34;
+       shl.b64         %rd1, %rd15, %r32;
        setp.eq.s64     %p53, %rd1, -9223372036854775808;
        abs.f64         %fd19, %fd1;
        // Callseq Start 0
@@ -603,14 +586,14 @@ BB7_58:
 BB7_59:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r35}, %fd54;
+       mov.b64         {%temp, %r33}, %fd54;
        }
-       xor.b32         %r36, %r35, -2147483648;
+       xor.b32         %r34, %r33, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r37, %temp}, %fd54;
+       mov.b64         {%r35, %temp}, %fd54;
        }
-       mov.b64         %fd54, {%r37, %r36};
+       mov.b64         %fd54, {%r35, %r34};
 
 BB7_60:
        mov.f64         %fd53, %fd54;
@@ -619,12 +602,12 @@ BB7_60:
        bra.uni         BB7_61;
 
 BB7_63:
-       selp.b32        %r38, %r8, 0, %p53;
-       or.b32          %r39, %r38, 2146435072;
+       selp.b32        %r36, %r8, 0, %p53;
+       or.b32          %r37, %r36, 2146435072;
        setp.lt.s32     %p59, %r9, 0;
-       selp.b32        %r40, %r39, %r38, %p59;
-       mov.u32         %r41, 0;
-       mov.b64         %fd53, {%r41, %r40};
+       selp.b32        %r38, %r37, %r36, %p59;
+       mov.u32         %r39, 0;
+       mov.b64         %fd53, {%r39, %r38};
        bra.uni         BB7_64;
 
 BB7_35:
@@ -638,10 +621,10 @@ BB7_35:
 BB7_52:
        cvt.rni.s64.f64 %rd11, %fd1;
        cvt.rni.s64.f64 %rd12, %fd2;
-       cvt.u32.u64     %r27, %rd11;
-       cvt.u32.u64     %r28, %rd12;
-       or.b32          %r29, %r28, %r27;
-       setp.eq.s32     %p45, %r29, 0;
+       cvt.u32.u64     %r25, %rd11;
+       cvt.u32.u64     %r26, %rd12;
+       or.b32          %r27, %r26, %r25;
+       setp.eq.s32     %p45, %r27, 0;
        selp.f64        %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
        bra.uni         BB7_76;
 
@@ -701,17 +684,17 @@ BB7_46:
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r24}, %fd55;
+       mov.b64         {%temp, %r22}, %fd55;
        }
-       and.b32         %r25, %r24, 2147483647;
-       setp.ne.s32     %p42, %r25, 2146435072;
+       and.b32         %r23, %r22, 2147483647;
+       setp.ne.s32     %p42, %r23, 2146435072;
        @%p42 bra       BB7_50;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r26, %temp}, %fd55;
+       mov.b64         {%r24, %temp}, %fd55;
        }
-       setp.eq.s32     %p43, %r26, 0;
+       setp.eq.s32     %p43, %r24, 0;
        @%p43 bra       BB7_76;
 
 BB7_50:
@@ -781,10 +764,10 @@ BB7_33:
 BB7_34:
        cvt.rni.s64.f64 %rd13, %fd1;
        cvt.rni.s64.f64 %rd14, %fd2;
-       cvt.u32.u64     %r30, %rd13;
-       cvt.u32.u64     %r31, %rd14;
-       and.b32         %r32, %r31, %r30;
-       setp.eq.s32     %p46, %r32, 0;
+       cvt.u32.u64     %r28, %rd13;
+       cvt.u32.u64     %r29, %rd14;
+       and.b32         %r30, %r29, %r28;
+       setp.eq.s32     %p46, %r30, 0;
        selp.f64        %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
        bra.uni         BB7_76;
 
@@ -820,17 +803,17 @@ BB7_41:
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r21}, %fd55;
+       mov.b64         {%temp, %r19}, %fd55;
        }
-       and.b32         %r22, %r21, 2147483647;
-       setp.ne.s32     %p36, %r22, 2146435072;
+       and.b32         %r20, %r19, 2147483647;
+       setp.ne.s32     %p36, %r20, 2146435072;
        @%p36 bra       BB7_45;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r23, %temp}, %fd55;
+       mov.b64         {%r21, %temp}, %fd55;
        }
-       setp.eq.s32     %p37, %r23, 0;
+       setp.eq.s32     %p37, %r21, 0;
        @%p37 bra       BB7_76;
 
 BB7_45:
@@ -850,10 +833,10 @@ BB7_64:
        add.f64         %fd26, %fd1, %fd2;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r42}, %fd26;
+       mov.b64         {%temp, %r40}, %fd26;
        }
-       and.b32         %r43, %r42, 2146435072;
-       setp.ne.s32     %p60, %r43, 2146435072;
+       and.b32         %r41, %r40, 2146435072;
+       setp.ne.s32     %p60, %r41, 2146435072;
        mov.f64         %fd52, %fd25;
        @%p60 bra       BB7_73;
 
@@ -867,51 +850,51 @@ BB7_64:
        mov.f64         %fd52, %fd51;
        @%p62 bra       BB7_73;
 
-       and.b32         %r44, %r9, 2147483647;
-       setp.ne.s32     %p63, %r44, 2146435072;
+       and.b32         %r42, %r9, 2147483647;
+       setp.ne.s32     %p63, %r42, 2146435072;
        @%p63 bra       BB7_69;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r45, %temp}, %fd2;
+       mov.b64         {%r43, %temp}, %fd2;
        }
-       setp.eq.s32     %p64, %r45, 0;
+       setp.eq.s32     %p64, %r43, 0;
        @%p64 bra       BB7_72;
 
 BB7_69:
-       and.b32         %r46, %r8, 2147483647;
-       setp.ne.s32     %p65, %r46, 2146435072;
+       and.b32         %r44, %r8, 2147483647;
+       setp.ne.s32     %p65, %r44, 2146435072;
        mov.f64         %fd49, %fd25;
        mov.f64         %fd52, %fd49;
        @%p65 bra       BB7_73;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r47, %temp}, %fd1;
+       mov.b64         {%r45, %temp}, %fd1;
        }
-       setp.ne.s32     %p66, %r47, 0;
+       setp.ne.s32     %p66, %r45, 0;
        mov.f64         %fd52, %fd25;
        @%p66 bra       BB7_73;
 
-       shr.s32         %r48, %r9, 31;
-       and.b32         %r49, %r48, -2146435072;
-       add.s32         %r50, %r49, 2146435072;
-       or.b32          %r51, %r50, -2147483648;
-       selp.b32        %r52, %r51, %r50, %p1;
-       mov.u32         %r53, 0;
-       mov.b64         %fd52, {%r53, %r52};
+       shr.s32         %r46, %r9, 31;
+       and.b32         %r47, %r46, -2146435072;
+       add.s32         %r48, %r47, 2146435072;
+       or.b32          %r49, %r48, -2147483648;
+       selp.b32        %r50, %r49, %r48, %p1;
+       mov.u32         %r51, 0;
+       mov.b64         %fd52, {%r51, %r50};
        bra.uni         BB7_73;
 
 BB7_72:
        setp.gt.f64     %p67, %fd19, 0d3FF0000000000000;
-       selp.b32        %r54, 2146435072, 0, %p67;
-       xor.b32         %r55, %r54, 2146435072;
+       selp.b32        %r52, 2146435072, 0, %p67;
+       xor.b32         %r53, %r52, 2146435072;
        setp.lt.s32     %p68, %r9, 0;
-       selp.b32        %r56, %r55, %r54, %p68;
+       selp.b32        %r54, %r53, %r52, %p68;
        setp.eq.f64     %p69, %fd1, 0dBFF0000000000000;
-       selp.b32        %r57, 1072693248, %r56, %p69;
-       mov.u32         %r58, 0;
-       mov.b64         %fd52, {%r58, %r57};
+       selp.b32        %r55, 1072693248, %r54, %p69;
+       mov.u32         %r56, 0;
+       mov.b64         %fd52, {%r56, %r55};
 
 BB7_73:
        setp.eq.f64     %p70, %fd2, 0d0000000000000000;
@@ -1825,7 +1808,7 @@ BB9_2:
 )
 {
        .reg .pred      %p<7>;
-       .reg .b32       %r<19>;
+       .reg .b32       %r<18>;
        .reg .f64       %fd<3>;
        .reg .b64       %rd<15>;
 
@@ -1841,11 +1824,10 @@ BB9_2:
        mov.u32         %r8, %ntid.x;
        mov.u32         %r9, %ctaid.x;
        mov.u32         %r10, %tid.x;
-       mad.lo.s32      %r1, %r8, %r9, %r10;
-       mov.u32         %r11, %ntid.y;
-       mov.u32         %r12, %ctaid.y;
-       mov.u32         %r13, %tid.y;
-       mad.lo.s32      %r2, %r11, %r12, %r13;
+       mad.lo.s32      %r11, %r8, %r9, %r10;
+       max.s32         %r12, %r4, %r6;
+       div.s32         %r1, %r11, %r12;
+       rem.s32         %r2, %r11, %r12;
        add.s32         %r3, %r6, %r4;
        setp.lt.s32     %p1, %r1, %r7;
        setp.lt.s32     %p2, %r2, %r4;
@@ -1855,12 +1837,12 @@ BB9_2:
 
 BB10_1:
        cvta.to.global.u64      %rd5, %rd2;
-       mad.lo.s32      %r14, %r1, %r4, %r2;
-       mul.wide.s32    %rd6, %r14, 8;
+       mad.lo.s32      %r13, %r1, %r4, %r2;
+       mul.wide.s32    %rd6, %r13, 8;
        add.s64         %rd7, %rd5, %rd6;
        ld.global.f64   %fd1, [%rd7];
-       mad.lo.s32      %r15, %r1, %r3, %r2;
-       mul.wide.s32    %rd8, %r15, 8;
+       mad.lo.s32      %r14, %r1, %r3, %r2;
+       mul.wide.s32    %rd8, %r14, 8;
        add.s64         %rd9, %rd1, %rd8;
        st.global.f64   [%rd9], %fd1;
 
@@ -1873,13 +1855,13 @@ BB10_2:
 
 BB10_3:
        cvta.to.global.u64      %rd10, %rd3;
-       mad.lo.s32      %r16, %r1, %r6, %r2;
-       mul.wide.s32    %rd11, %r16, 8;
+       mad.lo.s32      %r15, %r1, %r6, %r2;
+       mul.wide.s32    %rd11, %r15, 8;
        add.s64         %rd12, %rd10, %rd11;
        ld.global.f64   %fd2, [%rd12];
-       mad.lo.s32      %r17, %r1, %r3, %r4;
-       add.s32         %r18, %r17, %r2;
-       mul.wide.s32    %rd13, %r18, 8;
+       add.s32         %r16, %r2, %r4;
+       mad.lo.s32      %r17, %r1, %r3, %r16;
+       mul.wide.s32    %rd13, %r17, 8;
        add.s64         %rd14, %rd1, %rd13;
        st.global.f64   [%rd14], %fd2;
 
@@ -1899,7 +1881,7 @@ BB10_4:
 )
 {
        .reg .pred      %p<7>;
-       .reg .b32       %r<17>;
+       .reg .b32       %r<16>;
        .reg .f64       %fd<3>;
        .reg .b64       %rd<14>;
 
@@ -1915,11 +1897,10 @@ BB10_4:
        mov.u32         %r7, %ntid.x;
        mov.u32         %r8, %ctaid.x;
        mov.u32         %r9, %tid.x;
-       mad.lo.s32      %r1, %r7, %r8, %r9;
-       mov.u32         %r10, %ntid.y;
-       mov.u32         %r11, %ctaid.y;
-       mov.u32         %r12, %tid.y;
-       mad.lo.s32      %r2, %r10, %r11, %r12;
+       mad.lo.s32      %r10, %r7, %r8, %r9;
+       max.s32         %r11, %r4, %r6;
+       div.s32         %r1, %r10, %r11;
+       rem.s32         %r2, %r10, %r11;
        setp.lt.s32     %p1, %r1, %r3;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
@@ -1928,8 +1909,8 @@ BB10_4:
 
 BB11_1:
        cvta.to.global.u64      %rd5, %rd2;
-       mad.lo.s32      %r13, %r1, %r4, %r2;
-       mul.wide.s32    %rd6, %r13, 8;
+       mad.lo.s32      %r12, %r1, %r4, %r2;
+       mul.wide.s32    %rd6, %r12, 8;
        add.s64         %rd7, %rd5, %rd6;
        ld.global.f64   %fd1, [%rd7];
        add.s64         %rd8, %rd1, %rd6;
@@ -1944,13 +1925,13 @@ BB11_2:
 
 BB11_3:
        cvta.to.global.u64      %rd9, %rd3;
-       mad.lo.s32      %r14, %r1, %r6, %r2;
-       mul.wide.s32    %rd10, %r14, 8;
+       mad.lo.s32      %r13, %r1, %r6, %r2;
+       mul.wide.s32    %rd10, %r13, 8;
        add.s64         %rd11, %rd9, %rd10;
        ld.global.f64   %fd2, [%rd11];
-       add.s32         %r15, %r1, %r3;
-       mad.lo.s32      %r16, %r15, %r4, %r2;
-       mul.wide.s32    %rd12, %r16, 8;
+       add.s32         %r14, %r1, %r3;
+       mad.lo.s32      %r15, %r14, %r4, %r2;
+       mul.wide.s32    %rd12, %r15, 8;
        add.s64         %rd13, %rd1, %rd12;
        st.global.f64   [%rd13], %fd2;
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java 
b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
index a2d361c..169c3bb 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
@@ -828,9 +828,6 @@ public class ParForProgramBlock extends ForProgramBlock
                        // Frees up the GPUContexts used in the threaded Parfor 
and sets
                        // the main thread to use the GPUContext
                        if (DMLScript.USE_ACCELERATOR) {
-                               for (int i = 0; i < _numThreads; i++) {
-                                       
workers[i].getExecutionContext().setGPUContexts(null);
-                               }
                                ec.getGPUContext(0).initializeThread();
                        }
                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
index 636b1f8..f77c22e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
@@ -25,6 +25,7 @@ import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.CompilerConfig;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
 import org.apache.sysml.runtime.controlprogram.parfor.stat.Stat;
 import org.apache.sysml.runtime.controlprogram.parfor.stat.StatisticMonitor;
@@ -82,8 +83,15 @@ public class LocalParWorker extends ParWorker implements 
Runnable
                }
 
                // Initialize this GPUContext to this thread
-               if (DMLScript.USE_ACCELERATOR)
-                       _ec.getGPUContext(0).initializeThread();
+               if (DMLScript.USE_ACCELERATOR) {
+                       try {
+                               _ec.getGPUContext(0).initializeThread();
+                       } catch(DMLRuntimeException e) {
+                               LOG.error("Error executing task because of 
failure in GPU backend: ",e);
+                               LOG.error("Stopping LocalParWorker.");
+                               return;
+                       }
+               }
                
                //setup compiler config for worker thread
                ConfigurationManager.setLocalConfig(_cconf);

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
index 3cd2633..77c48a7 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
@@ -169,7 +169,6 @@ public class FunctionCallCPInstruction extends CPInstruction
                ExecutionContext fn_ec = 
ExecutionContextFactory.createContext(false, ec.getProgram());
                if (DMLScript.USE_ACCELERATOR) {
                        fn_ec.setGPUContexts(ec.getGPUContexts());
-                       ec.setGPUContexts(null);
                        fn_ec.getGPUContext(0).initializeThread();
                }
                fn_ec.setVariables(functionVariables);
@@ -205,12 +204,6 @@ public class FunctionCallCPInstruction extends 
CPInstruction
                // Unpin the pinned variables
                ec.unpinVariables(_boundInputParamNames, pinStatus);
 
-               if (DMLScript.USE_ACCELERATOR) {
-                       ec.setGPUContexts(fn_ec.getGPUContexts());
-                       fn_ec.setGPUContexts(null);
-                       ec.getGPUContext(0).initializeThread();
-               }
-               
                // add the updated binding for each return variable to the 
variables in original symbol table
                for (int i=0; i< fpb.getOutputParams().size(); i++){
                

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index ef000c2..5a0a772 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -89,16 +89,34 @@ public class ExecutionConfig {
         * @return execution configuration
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static ExecutionConfig getConfigForSimpleMatrixOperations(int 
rlen, int clen) throws DMLRuntimeException {
+       public static ExecutionConfig getConfigForMatrixOperations(int rlen, 
int clen) throws DMLRuntimeException {
                int deviceNumber = 0;
                int maxBlockDim = getMaxBlockDim(deviceNumber);
                int blockDimX = (int) Math.min(maxBlockDim, rlen);
                int gridDimX = (int) Math.ceil((double) rlen / blockDimX);
                int blockDimY = (int) Math.min(Math.floor(((double) 
maxBlockDim) / blockDimX), clen);
                int gridDimY = (int) Math.ceil((double) clen / blockDimY);
+               if (gridDimY > 65535)
+                       throw new DMLRuntimeException("Internal Error: gridDimY 
must be less than 65535 for all supported CUDA compute capabilites!");
                return new ExecutionConfig(gridDimX, gridDimY, blockDimX, 
blockDimY);
        }
 
+       /**
+        * Use this for simple vector operations and use following in the kernel
+        * <code>
+        * int index = blockIdx.x * blockDim.x + threadIdx.x
+        * </code>
+        * <p>
+        * @param rlen number of rows
+        * @param clen number of columns
+        * @return execution configuration
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public static ExecutionConfig getConfigForSimpleMatrixOperations(int 
rlen, int clen) throws DMLRuntimeException {
+               return getConfigForSimpleVectorOperations(rlen * clen);
+       }
+
+
        public ExecutionConfig(int gridDimX, int blockDimX) {
                this.gridDimX = gridDimX;
                this.blockDimX = blockDimX;
@@ -134,4 +152,10 @@ public class ExecutionConfig {
                return ret;
        }
 
+       @Override
+       public String toString() {
+               return "ExecutionConfig{" + "gridDimX=" + gridDimX + ", 
gridDimY=" + gridDimY + ", gridDimZ=" + gridDimZ
+                               + ", blockDimX=" + blockDimX + ", blockDimY=" + 
blockDimY + ", blockDimZ=" + blockDimZ
+                               + ", sharedMemBytes=" + sharedMemBytes + '}';
+       }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index b3c19ef..4c0562d 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -108,27 +108,27 @@ public class GPUContext {
        /**
         * cudnnHandle for Deep Neural Network operations on the GPU
         */
-       private cudnnHandle cudnnHandle;
+       private final ThreadLocal<cudnnHandle> cudnnHandle = new 
ThreadLocal<>();
        /**
         * cublasHandle for BLAS operations on the GPU
         */
-       private cublasHandle cublasHandle;
+       private final ThreadLocal<cublasHandle> cublasHandle = new 
ThreadLocal<>();
        /**
         * cusparseHandle for certain sparse BLAS operations on the GPU
         */
-       private cusparseHandle cusparseHandle;
+       private final ThreadLocal<cusparseHandle> cusparseHandle = new 
ThreadLocal<>();
        /**
         * cusolverDnHandle for invoking solve() function on dense matrices on 
the GPU
         */
-       private cusolverDnHandle cusolverDnHandle;
+       private final ThreadLocal<cusolverDnHandle> cusolverDnHandle = new 
ThreadLocal<>();
        /**
         * cusolverSpHandle for invoking solve() function on sparse matrices on 
the GPU
         */
-       private cusolverSpHandle cusolverSpHandle;
+       private final ThreadLocal<cusolverSpHandle> cusolverSpHandle = new 
ThreadLocal<>();
        /**
         * to launch custom CUDA kernel, specific to the active GPU for this 
GPUContext
         */
-       private JCudaKernels kernels;
+       private final ThreadLocal<JCudaKernels> kernels = new ThreadLocal<>();
 
        protected GPUContext(int deviceNum) throws DMLRuntimeException {
                this.deviceNum = deviceNum;
@@ -140,28 +140,51 @@ public class GPUContext {
                long total[] = { 0 };
                cudaMemGetInfo(free, total);
 
-               long start = System.nanoTime();
-               cudnnHandle = new cudnnHandle();
-               cudnnCreate(cudnnHandle);
-               cublasHandle = new cublasHandle();
-               cublasCreate(cublasHandle);
+               long start = -1;
+               if (DMLScript.STATISTICS)
+                       start = System.nanoTime();
+               initializeCudaLibraryHandles();
+
+               if (DMLScript.STATISTICS)
+                       GPUStatistics.cudaLibrariesInitTime = System.nanoTime() 
- start;
+               
+               LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, 
Available: " + (free[0] * (1e-6)) + " MB on "
+                               + this);
+
+       }
+
+       private void initializeCudaLibraryHandles() throws DMLRuntimeException {
+               if (cudnnHandle.get() == null) {
+                       cudnnHandle.set(new cudnnHandle());
+                       cudnnCreate(cudnnHandle.get());
+               }
+
+               if (cublasHandle.get() == null) {
+                       cublasHandle.set(new cublasHandle());
+                       cublasCreate(cublasHandle.get());
+               }
                // For cublas v2, cublasSetPointerMode tells Cublas whether to 
expect scalar arguments on device or on host
                // This applies to arguments like "alpha" in Dgemm, and "y" in 
Ddot.
                // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, 
cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
-               cusparseHandle = new cusparseHandle();
-               cusparseCreate(cusparseHandle);
 
-               cusolverDnHandle = new cusolverDnHandle();
-               cusolverDnCreate(cusolverDnHandle);
-               cusolverSpHandle = new cusolverSpHandle();
-               cusolverSpCreate(cusolverSpHandle);
+               if (cusparseHandle.get() == null) {
+                       cusparseHandle.set(new cusparseHandle());
+                       cusparseCreate(cusparseHandle.get());
+               }
 
-               kernels = new JCudaKernels(deviceNum);
+               if (cusolverDnHandle.get() == null) {
+                       cusolverDnHandle.set(new cusolverDnHandle());
+                       cusolverDnCreate(cusolverDnHandle.get());
+               }
 
-               GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
-               LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, 
Available: " + (free[0] * (1e-6)) + " MB on "
-                               + this);
+               if (cusolverSpHandle.get() == null) {
+                       cusolverSpHandle.set(new cusolverSpHandle());
+                       cusolverSpCreate(cusolverSpHandle.get());
+               }
 
+               if (kernels.get() == null) {
+                       kernels.set(new JCudaKernels());
+               }
        }
 
        public static int cudaGetDevice() {
@@ -181,8 +204,9 @@ public class GPUContext {
         * If in a multi-threaded env like parfor, this method must be called 
when in the
         * appropriate thread
         */
-       public void initializeThread() {
+       public void initializeThread() throws DMLRuntimeException {
                cudaSetDevice(deviceNum);
+               initializeCudaLibraryHandles();
        }
 
        /**
@@ -595,27 +619,27 @@ public class GPUContext {
        }
 
        public cudnnHandle getCudnnHandle() {
-               return cudnnHandle;
+               return cudnnHandle.get();
        }
 
        public cublasHandle getCublasHandle() {
-               return cublasHandle;
+               return cublasHandle.get();
        }
 
        public cusparseHandle getCusparseHandle() {
-               return cusparseHandle;
+               return cusparseHandle.get();
        }
 
        public cusolverDnHandle getCusolverDnHandle() {
-               return cusolverDnHandle;
+               return cusolverDnHandle.get();
        }
 
        public cusolverSpHandle getCusolverSpHandle() {
-               return cusolverSpHandle;
+               return cusolverSpHandle.get();
        }
 
        public JCudaKernels getKernels() {
-               return kernels;
+               return kernels.get();
        }
 
        /**
@@ -626,15 +650,11 @@ public class GPUContext {
        public void destroy() throws DMLRuntimeException {
                LOG.trace("GPU : this context was destroyed, this = " + 
this.toString());
                clearMemory();
-               cudnnDestroy(cudnnHandle);
-               cublasDestroy(cublasHandle);
-               cusparseDestroy(cusparseHandle);
-               cusolverDnDestroy(cusolverDnHandle);
-               cusolverSpDestroy(cusolverSpHandle);
-               cudnnHandle = null;
-               cublasHandle = null;
-               cusparseHandle = null;
-
+               cudnnDestroy(cudnnHandle.get());
+               cublasDestroy(cublasHandle.get());
+               cusparseDestroy(cusparseHandle.get());
+               cusolverDnDestroy(cusolverDnHandle.get());
+               cusolverSpDestroy(cusolverSpHandle.get());
        }
 
        /**

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
index a9b1333..e030180 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
@@ -130,7 +130,7 @@ public class GPUContextPool {
                // initially available memory is set to the GPU with the lowest 
memory
                // This is because at runtime, we wouldn't know which GPU a 
certain
                // operation gets scheduled on
-               long minAvailableMemory = Integer.MAX_VALUE;
+               long minAvailableMemory = Long.MAX_VALUE;
                for (GPUContext gCtx : pool) {
                        gCtx.initializeThread();
                        minAvailableMemory = Math.min(minAvailableMemory, 
gCtx.getAvailableMemory());

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index 246aecc..9cfab2b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -49,17 +49,14 @@ public class JCudaKernels {
        private final static String ptxFileName = "/kernels/SystemML.ptx";
        private HashMap<String, CUfunction> kernels = new HashMap<String, 
CUfunction>();
        private CUmodule module;
-       //      private final int deviceNum;
 
        /**
         * Loads the kernels in the file ptxFileName. Though cubin files are 
also supported, we will stick with
         * ptx file as they are target-independent similar to Java's .class 
files.
         *
-        * @param deviceNum the device number for which to initiate the driver 
API
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       JCudaKernels(int deviceNum) throws DMLRuntimeException {
-               //              this.deviceNum = deviceNum;
+       JCudaKernels() throws DMLRuntimeException {
                module = new CUmodule();
                // Load the kernels specified in the ptxFileName file
                checkResult(cuModuleLoadDataEx(module, 
initKernels(ptxFileName), 0, new int[0], Pointer.to(new int[0])));

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java 
b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
index 195968a..d40b7a1 100644
--- a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -98,6 +98,24 @@ public abstract class GPUTests extends AutomatedTestBase {
        }
 
        /**
+        * Generates an input matrix which is a sequence of integers
+        * @param spark valid instance of {@link SparkSession}
+        * @param m number of rows
+        * @param n number of columns
+        * @return a matrix with a sequence of integers
+        */
+       protected Matrix generateIntegerSequenceMatrix(SparkSession spark, int 
m, int n) {
+               MLContext genMLC = new MLContext(spark);
+               String scriptStr;
+               scriptStr = "temp = seq(1, " + (m*n) + ")" +
+                                   "in1 = matrix(temp, rows=" + m + ", cols=" 
+ n + ")";
+               Script generateScript = 
ScriptFactory.dmlFromString(scriptStr).out("in1");
+               Matrix in1 = genMLC.execute(generateScript).getMatrix("in1");
+               genMLC.close();
+               return in1;
+       }
+
+       /**
         * Generates a random input matrix with a given size and sparsity
         *
         * @param spark    valid instance of {@link SparkSession}

http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java 
b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
index f7c7851..81bc254 100644
--- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -153,6 +153,7 @@ public class MatrixMultiplicationOpTest extends GPUTests {
                        for (int j = 0; j < sparsities.length; j++) {
                                int side = sizes[i];
                                double sparsity = sparsities[j];
+                               System.out.println("Transpose Self matrix 
multiply, size = " + side + ", sparsity = " + sparsity);
                                Matrix X = generateInputMatrix(spark, side, 
side, sparsity, seed);
                                HashMap<String, Object> inputs = new 
HashMap<>();
                                inputs.put("X", X);

Reply via email to