http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index 51ddb41..8296f92 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,16 +1,16 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-21124049 -// Cuda compilation tools, release 8.0, V8.0.44 +// Compiler Build ID: CL-19856038 +// Cuda compilation tools, release 7.5, V7.5.17 // Based on LLVM 3.4svn // -.version 5.0 +.version 4.3 .target sm_30 .address_size 64 - // .globl copyUpperToLowerTriangleDense + // .globl getBoolean .func (.param .b64 func_retval0) __internal_accurate_pow ( .param .b64 __internal_accurate_pow_param_0, @@ -19,1526 +19,3729 @@ ; .extern .shared .align 8 .b8 sdata[]; -.visible .entry copyUpperToLowerTriangleDense( - .param .u64 copyUpperToLowerTriangleDense_param_0, - .param .u32 copyUpperToLowerTriangleDense_param_1, - .param .u32 copyUpperToLowerTriangleDense_param_2 +.visible .func (.param .b64 func_retval0) getBoolean( + .param .b32 getBoolean_param_0 ) { - .reg .pred %p<4>; - .reg .b32 %r<13>; + .reg .pred %p<2>; + .reg .b32 %r<2>; .reg .f64 %fd<2>; - .reg .b64 %rd<7>; - - - ld.param.u64 %rd1, [copyUpperToLowerTriangleDense_param_0]; - ld.param.u32 %r4, [copyUpperToLowerTriangleDense_param_1]; - ld.param.u32 %r5, [copyUpperToLowerTriangleDense_param_2]; - mov.u32 %r6, %ntid.x; - mov.u32 %r7, %ctaid.x; - mov.u32 %r8, %tid.x; - mad.lo.s32 %r1, %r6, %r7, %r8; - mov.u32 %r9, %ntid.y; - mov.u32 %r10, %ctaid.y; - mov.u32 %r11, %tid.y; - mad.lo.s32 %r2, %r9, %r10, %r11; - mad.lo.s32 %r3, %r2, %r4, %r1; - setp.gt.s32 %p1, %r2, %r1; - setp.lt.s32 %p2, %r3, %r5; - and.pred %p3, %p1, %p2; - @!%p3 bra BB0_2; - bra.uni BB0_1; -BB0_1: - cvta.to.global.u64 %rd2, %rd1; - mad.lo.s32 %r12, %r1, %r4, %r2; - mul.wide.s32 %rd3, %r12, 8; - add.s64 %rd4, %rd2, %rd3; - ld.global.f64 %fd1, [%rd4]; - mul.wide.s32 %rd5, %r3, 8; - add.s64 %rd6, %rd2, %rd5; - st.global.f64 [%rd6], %fd1; -BB0_2: + ld.param.u32 %r1, [getBoolean_param_0]; + setp.eq.s32 %p1, %r1, 0; + selp.f64 %fd1, 0d0000000000000000, 0d3FF0000000000000, %p1; + st.param.f64 [func_retval0+0], %fd1; ret; } - // .globl dense_matrix_set -.visible .entry dense_matrix_set( - .param .u64 dense_matrix_set_param_0, - .param .f64 dense_matrix_set_param_1, - .param .u32 dense_matrix_set_param_2, - .param .u32 dense_matrix_set_param_3 + // .globl binaryOp +.visible .func (.param .b64 func_retval0) binaryOp( + .param .b64 binaryOp_param_0, + .param .b64 binaryOp_param_1, + .param .b32 binaryOp_param_2 ) { - .reg .pred %p<2>; - .reg .b32 %r<13>; - .reg .f64 %fd<2>; - .reg .b64 %rd<5>; + .reg .pred %p<41>; + .reg .b32 %r<30>; + .reg .f64 %fd<40>; + .reg .b64 %rd<3>; - ld.param.u64 %rd1, [dense_matrix_set_param_0]; - ld.param.f64 %fd1, [dense_matrix_set_param_1]; - ld.param.u32 %r2, [dense_matrix_set_param_2]; - ld.param.u32 %r3, [dense_matrix_set_param_3]; - mov.u32 %r4, %ctaid.x; - mov.u32 %r5, %ntid.x; - mov.u32 %r6, %tid.x; - mad.lo.s32 %r7, %r5, %r4, %r6; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r11, %r7, %r3, %r10; - mad.lo.s32 %r1, %r8, %r9, %r11; - mul.lo.s32 %r12, %r3, %r2; - setp.ge.s32 %p1, %r1, %r12; - @%p1 bra BB1_2; + ld.param.f64 %fd26, [binaryOp_param_0]; + ld.param.f64 %fd27, [binaryOp_param_1]; + ld.param.u32 %r3, [binaryOp_param_2]; + setp.eq.s32 %p2, %r3, 0; + @%p2 bra BB1_40; - cvta.to.global.u64 %rd2, %rd1; - mul.wide.s32 %rd3, %r1, 8; - add.s64 %rd4, %rd2, %rd3; - st.global.f64 [%rd4], %fd1; + setp.eq.s32 %p3, %r3, 1; + @%p3 bra BB1_39; + bra.uni BB1_2; + +BB1_39: + sub.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; + +BB1_40: + add.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; BB1_2: + setp.eq.s32 %p4, %r3, 2; + @%p4 bra BB1_38; + bra.uni BB1_3; + +BB1_38: + mul.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; + +BB1_3: + setp.eq.s32 %p5, %r3, 3; + @%p5 bra BB1_37; + bra.uni BB1_4; + +BB1_37: + div.rn.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; + +BB1_4: + setp.eq.s32 %p6, %r3, 4; + @%p6 bra BB1_21; + bra.uni BB1_5; + +BB1_21: + { + .reg .b32 %temp; + mov.b64 {%temp, %r1}, %fd26; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd27; + } + bfe.u32 %r4, %r2, 20, 11; + add.s32 %r5, %r4, -1012; + mov.b64 %rd2, %fd27; + shl.b64 %rd1, %rd2, %r5; + setp.eq.s64 %p21, %rd1, -9223372036854775808; + abs.f64 %fd9, %fd26; + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd9; + .param .b64 param1; + st.param.f64 [param1+0], %fd27; + .param .b64 retval0; + call.uni (retval0), + __internal_accurate_pow, + ( + param0, + param1 + ); + ld.param.f64 %fd38, [retval0+0]; + + //{ + }// Callseq End 0 + setp.lt.s32 %p22, %r1, 0; + and.pred %p1, %p22, %p21; + @!%p1 bra BB1_23; + bra.uni BB1_22; + +BB1_22: + { + .reg .b32 %temp; + mov.b64 {%temp, %r6}, %fd38; + } + xor.b32 %r7, %r6, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%r8, %temp}, %fd38; + } + mov.b64 %fd38, {%r8, %r7}; + +BB1_23: + mov.f64 %fd37, %fd38; + setp.eq.f64 %p23, %fd26, 0d0000000000000000; + @%p23 bra BB1_26; + bra.uni BB1_24; + +BB1_26: + selp.b32 %r9, %r1, 0, %p21; + or.b32 %r10, %r9, 2146435072; + setp.lt.s32 %p27, %r2, 0; + selp.b32 %r11, %r10, %r9, %p27; + mov.u32 %r12, 0; + mov.b64 %fd37, {%r12, %r11}; + bra.uni BB1_27; + +BB1_5: + setp.eq.s32 %p7, %r3, 5; + @%p7 bra BB1_20; + bra.uni BB1_6; + +BB1_20: + setp.lt.f64 %p20, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p20; + bra.uni BB1_41; + +BB1_6: + setp.eq.s32 %p8, %r3, 6; + @%p8 bra BB1_19; + bra.uni BB1_7; + +BB1_19: + setp.le.f64 %p19, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p19; + bra.uni BB1_41; + +BB1_24: + setp.gt.s32 %p24, %r1, -1; + @%p24 bra BB1_27; + + cvt.rzi.f64.f64 %fd29, %fd27; + setp.neu.f64 %p25, %fd29, %fd27; + selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p25; + +BB1_27: + mov.f64 %fd15, %fd37; + add.f64 %fd16, %fd26, %fd27; + { + .reg .b32 %temp; + mov.b64 {%temp, %r13}, %fd16; + } + and.b32 %r14, %r13, 2146435072; + setp.ne.s32 %p28, %r14, 2146435072; + mov.f64 %fd36, %fd15; + @%p28 bra BB1_36; + + setp.gtu.f64 %p29, %fd9, 0d7FF0000000000000; + mov.f64 %fd36, %fd16; + @%p29 bra BB1_36; + + abs.f64 %fd30, %fd27; + setp.gtu.f64 %p30, %fd30, 0d7FF0000000000000; + mov.f64 %fd35, %fd16; + mov.f64 %fd36, %fd35; + @%p30 bra BB1_36; + + and.b32 %r15, %r2, 2147483647; + setp.ne.s32 %p31, %r15, 2146435072; + @%p31 bra BB1_32; + + { + .reg .b32 %temp; + mov.b64 {%r16, %temp}, %fd27; + } + setp.eq.s32 %p32, %r16, 0; + @%p32 bra BB1_35; + +BB1_32: + and.b32 %r17, %r1, 2147483647; + setp.ne.s32 %p33, %r17, 2146435072; + mov.f64 %fd33, %fd15; + mov.f64 %fd36, %fd33; + @%p33 bra BB1_36; + + { + .reg .b32 %temp; + mov.b64 {%r18, %temp}, %fd26; + } + setp.ne.s32 %p34, %r18, 0; + mov.f64 %fd36, %fd15; + @%p34 bra BB1_36; + + shr.s32 %r19, %r2, 31; + and.b32 %r20, %r19, -2146435072; + add.s32 %r21, %r20, 2146435072; + or.b32 %r22, %r21, -2147483648; + selp.b32 %r23, %r22, %r21, %p1; + mov.u32 %r24, 0; + mov.b64 %fd36, {%r24, %r23}; + bra.uni BB1_36; + +BB1_7: + setp.eq.s32 %p9, %r3, 7; + @%p9 bra BB1_18; + bra.uni BB1_8; + +BB1_18: + setp.gt.f64 %p18, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p18; + bra.uni BB1_41; + +BB1_8: + setp.eq.s32 %p10, %r3, 8; + @%p10 bra BB1_17; + bra.uni BB1_9; + +BB1_17: + setp.ge.f64 %p17, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p17; + bra.uni BB1_41; + +BB1_9: + setp.eq.s32 %p11, %r3, 9; + @%p11 bra BB1_16; + bra.uni BB1_10; + +BB1_16: + setp.eq.f64 %p16, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p16; + bra.uni BB1_41; + +BB1_10: + setp.eq.s32 %p12, %r3, 10; + @%p12 bra BB1_15; + bra.uni BB1_11; + +BB1_15: + setp.neu.f64 %p15, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p15; + bra.uni BB1_41; + +BB1_35: + setp.gt.f64 %p35, %fd9, 0d3FF0000000000000; + selp.b32 %r25, 2146435072, 0, %p35; + xor.b32 %r26, %r25, 2146435072; + setp.lt.s32 %p36, %r2, 0; + selp.b32 %r27, %r26, %r25, %p36; + setp.eq.f64 %p37, %fd26, 0dBFF0000000000000; + selp.b32 %r28, 1072693248, %r27, %p37; + mov.u32 %r29, 0; + mov.b64 %fd36, {%r29, %r28}; + +BB1_36: + setp.eq.f64 %p38, %fd27, 0d0000000000000000; + setp.eq.f64 %p39, %fd26, 0d3FF0000000000000; + or.pred %p40, %p39, %p38; + selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p40; + +BB1_41: + st.param.f64 [func_retval0+0], %fd39; ret; + +BB1_11: + setp.eq.s32 %p13, %r3, 11; + @%p13 bra BB1_14; + bra.uni BB1_12; + +BB1_14: + min.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; + +BB1_12: + mov.f64 %fd39, 0dC08F380000000000; + setp.ne.s32 %p14, %r3, 12; + @%p14 bra BB1_41; + + max.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; } - // .globl dense_matrix_copy -.visible .entry dense_matrix_copy( - .param .u64 dense_matrix_copy_param_0, - .param .u64 dense_matrix_copy_param_1, - .param .u32 dense_matrix_copy_param_2, - .param .u32 dense_matrix_copy_param_3 + // .globl _Z6reduceI5SumOpEvPdS1_jT_d +.visible .func _Z6reduceI5SumOpEvPdS1_jT_d( + .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_0, + .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_1, + .param .b32 _Z6reduceI5SumOpEvPdS1_jT_d_param_2, + .param .align 1 .b8 _Z6reduceI5SumOpEvPdS1_jT_d_param_3[1], + .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_4 ) { - .reg .pred %p<4>; - .reg .b32 %r<12>; - .reg .f64 %fd<2>; - .reg .b64 %rd<8>; + .reg .pred %p<20>; + .reg .b32 %r<33>; + .reg .f64 %fd<79>; + .reg .b64 %rd<12>; - ld.param.u64 %rd1, [dense_matrix_copy_param_0]; - ld.param.u64 %rd2, [dense_matrix_copy_param_1]; - ld.param.u32 %r2, [dense_matrix_copy_param_2]; - ld.param.u32 %r3, [dense_matrix_copy_param_3]; - mov.u32 %r4, %ctaid.x; - mov.u32 %r5, %ntid.x; + ld.param.u64 %rd2, [_Z6reduceI5SumOpEvPdS1_jT_d_param_0]; + ld.param.u64 %rd3, [_Z6reduceI5SumOpEvPdS1_jT_d_param_1]; + ld.param.u32 %r5, [_Z6reduceI5SumOpEvPdS1_jT_d_param_2]; + ld.param.f64 %fd76, [_Z6reduceI5SumOpEvPdS1_jT_d_param_4]; mov.u32 %r6, %tid.x; - mad.lo.s32 %r7, %r5, %r4, %r6; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r11, %r8, %r9, %r10; - mad.lo.s32 %r1, %r7, %r3, %r11; - setp.lt.s32 %p1, %r7, %r2; - setp.lt.s32 %p2, %r11, %r3; - and.pred %p3, %p1, %p2; - @!%p3 bra BB2_2; - bra.uni BB2_1; + mov.u32 %r7, %ctaid.x; + shl.b32 %r8, %r7, 1; + mov.u32 %r9, %ntid.x; + mad.lo.s32 %r32, %r8, %r9, %r6; + setp.ge.u32 %p1, %r32, %r5; + @%p1 bra BB2_5; -BB2_1: - cvta.to.global.u64 %rd3, %rd1; - mul.wide.s32 %rd4, %r1, 8; - add.s64 %rd5, %rd3, %rd4; - ld.global.f64 %fd1, [%rd5]; - cvta.to.global.u64 %rd6, %rd2; - add.s64 %rd7, %rd6, %rd4; - st.global.f64 [%rd7], %fd1; + mov.f64 %fd77, %fd76; BB2_2: - ret; -} + mov.f64 %fd1, %fd77; + mul.wide.u32 %rd4, %r32, 8; + add.s64 %rd5, %rd2, %rd4; + ld.f64 %fd29, [%rd5]; + add.f64 %fd78, %fd1, %fd29; + add.s32 %r3, %r32, %r9; + setp.ge.u32 %p2, %r3, %r5; + @%p2 bra BB2_4; - // .globl relu -.visible .entry relu( - .param .u64 relu_param_0, - .param .u64 relu_param_1, - .param .u32 relu_param_2, - .param .u32 relu_param_3 -) -{ - .reg .pred %p<4>; - .reg .b32 %r<12>; - .reg .f64 %fd<4>; - .reg .b64 %rd<8>; + mul.wide.u32 %rd6, %r3, 8; + add.s64 %rd7, %rd2, %rd6; + ld.f64 %fd30, [%rd7]; + add.f64 %fd78, %fd78, %fd30; +BB2_4: + mov.f64 %fd77, %fd78; + shl.b32 %r12, %r9, 1; + mov.u32 %r13, %nctaid.x; + mad.lo.s32 %r32, %r12, %r13, %r32; + setp.lt.u32 %p3, %r32, %r5; + mov.f64 %fd76, %fd77; + @%p3 bra BB2_2; + +BB2_5: + mov.f64 %fd74, %fd76; + mul.wide.u32 %rd8, %r6, 8; + mov.u64 %rd9, sdata; + add.s64 %rd1, %rd9, %rd8; + st.shared.f64 [%rd1], %fd74; + bar.sync 0; + setp.lt.u32 %p4, %r9, 1024; + @%p4 bra BB2_9; - ld.param.u64 %rd1, [relu_param_0]; - ld.param.u64 %rd2, [relu_param_1]; - ld.param.u32 %r4, [relu_param_2]; - ld.param.u32 %r3, [relu_param_3]; - mov.u32 %r5, %ctaid.x; - mov.u32 %r6, %ntid.x; - mov.u32 %r7, %tid.x; - mad.lo.s32 %r1, %r6, %r5, %r7; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r2, %r8, %r9, %r10; - setp.lt.s32 %p1, %r1, %r4; - setp.lt.s32 %p2, %r2, %r3; - and.pred %p3, %p1, %p2; - @!%p3 bra BB3_2; - bra.uni BB3_1; + setp.gt.u32 %p5, %r6, 511; + mov.f64 %fd75, %fd74; + @%p5 bra BB2_8; -BB3_1: - cvta.to.global.u64 %rd3, %rd1; - mad.lo.s32 %r11, %r1, %r3, %r2; - mul.wide.s32 %rd4, %r11, 8; - add.s64 %rd5, %rd3, %rd4; - ld.global.f64 %fd1, [%rd5]; - mov.f64 %fd2, 0d0000000000000000; - max.f64 %fd3, %fd2, %fd1; - cvta.to.global.u64 %rd6, %rd2; - add.s64 %rd7, %rd6, %rd4; - st.global.f64 [%rd7], %fd3; + ld.shared.f64 %fd31, [%rd1+4096]; + add.f64 %fd75, %fd74, %fd31; + st.shared.f64 [%rd1], %fd75; -BB3_2: - ret; -} +BB2_8: + mov.f64 %fd74, %fd75; + bar.sync 0; - // .globl reluBackward -.visible .entry reluBackward( - .param .u64 reluBackward_param_0, - .param .u64 reluBackward_param_1, - .param .u64 reluBackward_param_2, - .param .u32 reluBackward_param_3, - .param .u32 reluBackward_param_4 -) -{ - .reg .pred %p<5>; - .reg .b32 %r<12>; - .reg .f64 %fd<6>; - .reg .b64 %rd<14>; +BB2_9: + mov.f64 %fd72, %fd74; + setp.lt.u32 %p6, %r9, 512; + @%p6 bra BB2_13; + setp.gt.u32 %p7, %r6, 255; + mov.f64 %fd73, %fd72; + @%p7 bra BB2_12; - ld.param.u64 %rd2, [reluBackward_param_0]; - ld.param.u64 %rd3, [reluBackward_param_1]; - ld.param.u64 %rd4, [reluBackward_param_2]; - ld.param.u32 %r4, [reluBackward_param_3]; - ld.param.u32 %r3, [reluBackward_param_4]; - mov.u32 %r5, %ntid.x; - mov.u32 %r6, %ctaid.x; - mov.u32 %r7, %tid.x; - mad.lo.s32 %r1, %r5, %r6, %r7; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r2, %r8, %r9, %r10; - setp.lt.s32 %p1, %r1, %r4; - setp.lt.s32 %p2, %r2, %r3; - and.pred %p3, %p1, %p2; - @!%p3 bra BB4_4; - bra.uni BB4_1; + ld.shared.f64 %fd32, [%rd1+2048]; + add.f64 %fd73, %fd72, %fd32; + st.shared.f64 [%rd1], %fd73; -BB4_1: - cvta.to.global.u64 %rd5, %rd2; - mad.lo.s32 %r11, %r1, %r3, %r2; - cvt.s64.s32 %rd1, %r11; - mul.wide.s32 %rd6, %r11, 8; - add.s64 %rd7, %rd5, %rd6; - ld.global.f64 %fd4, [%rd7]; - mov.f64 %fd5, 0d0000000000000000; - setp.leu.f64 %p4, %fd4, 0d0000000000000000; - @%p4 bra BB4_3; +BB2_12: + mov.f64 %fd72, %fd73; + bar.sync 0; - cvta.to.global.u64 %rd8, %rd3; - shl.b64 %rd9, %rd1, 3; - add.s64 %rd10, %rd8, %rd9; - ld.global.f64 %fd5, [%rd10]; +BB2_13: + mov.f64 %fd70, %fd72; + setp.lt.u32 %p8, %r9, 256; + @%p8 bra BB2_17; -BB4_3: - cvta.to.global.u64 %rd11, %rd4; - shl.b64 %rd12, %rd1, 3; - add.s64 %rd13, %rd11, %rd12; - st.global.f64 [%rd13], %fd5; + setp.gt.u32 %p9, %r6, 127; + mov.f64 %fd71, %fd70; + @%p9 bra BB2_16; -BB4_4: - ret; -} + ld.shared.f64 %fd33, [%rd1+1024]; + add.f64 %fd71, %fd70, %fd33; + st.shared.f64 [%rd1], %fd71; - // .globl biasAdd -.visible .entry biasAdd( - .param .u64 biasAdd_param_0, - .param .u64 biasAdd_param_1, - .param .u64 biasAdd_param_2, - .param .u32 biasAdd_param_3, - .param .u32 biasAdd_param_4, - .param .u32 biasAdd_param_5 -) -{ - .reg .pred %p<4>; - .reg .b32 %r<14>; - .reg .f64 %fd<4>; - .reg .b64 %rd<12>; +BB2_16: + mov.f64 %fd70, %fd71; + bar.sync 0; +BB2_17: + mov.f64 %fd68, %fd70; + setp.lt.u32 %p10, %r9, 128; + @%p10 bra BB2_21; - ld.param.u64 %rd1, [biasAdd_param_0]; - ld.param.u64 %rd2, [biasAdd_param_1]; - ld.param.u64 %rd3, [biasAdd_param_2]; - ld.param.u32 %r5, [biasAdd_param_3]; - ld.param.u32 %r3, [biasAdd_param_4]; - ld.param.u32 %r4, [biasAdd_param_5]; - mov.u32 %r6, %ctaid.x; - mov.u32 %r7, %ntid.x; - mov.u32 %r8, %tid.x; - mad.lo.s32 %r1, %r7, %r6, %r8; - mov.u32 %r9, %ntid.y; - mov.u32 %r10, %ctaid.y; - mov.u32 %r11, %tid.y; - mad.lo.s32 %r2, %r9, %r10, %r11; - setp.lt.s32 %p1, %r1, %r5; - setp.lt.s32 %p2, %r2, %r3; - and.pred %p3, %p1, %p2; - @!%p3 bra BB5_2; - bra.uni BB5_1; + setp.gt.u32 %p11, %r6, 63; + mov.f64 %fd69, %fd68; + @%p11 bra BB2_20; -BB5_1: - cvta.to.global.u64 %rd4, %rd1; - mad.lo.s32 %r12, %r1, %r3, %r2; - mul.wide.s32 %rd5, %r12, 8; - add.s64 %rd6, %rd4, %rd5; - div.s32 %r13, %r2, %r4; - cvta.to.global.u64 %rd7, %rd2; - mul.wide.s32 %rd8, %r13, 8; - add.s64 %rd9, %rd7, %rd8; - ld.global.f64 %fd1, [%rd9]; - ld.global.f64 %fd2, [%rd6]; - add.f64 %fd3, %fd2, %fd1; - cvta.to.global.u64 %rd10, %rd3; - add.s64 %rd11, %rd10, %rd5; - st.global.f64 [%rd11], %fd3; + ld.shared.f64 %fd34, [%rd1+512]; + add.f64 %fd69, %fd68, %fd34; + st.shared.f64 [%rd1], %fd69; -BB5_2: +BB2_20: + mov.f64 %fd68, %fd69; + bar.sync 0; + +BB2_21: + mov.f64 %fd67, %fd68; + setp.gt.u32 %p12, %r6, 31; + @%p12 bra BB2_34; + + setp.lt.u32 %p13, %r9, 64; + @%p13 bra BB2_24; + + ld.volatile.shared.f64 %fd35, [%rd1+256]; + add.f64 %fd67, %fd67, %fd35; + st.volatile.shared.f64 [%rd1], %fd67; + +BB2_24: + mov.f64 %fd66, %fd67; + setp.lt.u32 %p14, %r9, 32; + @%p14 bra BB2_26; + + ld.volatile.shared.f64 %fd36, [%rd1+128]; + add.f64 %fd66, %fd66, %fd36; + st.volatile.shared.f64 [%rd1], %fd66; + +BB2_26: + mov.f64 %fd65, %fd66; + setp.lt.u32 %p15, %r9, 16; + @%p15 bra BB2_28; + + ld.volatile.shared.f64 %fd37, [%rd1+64]; + add.f64 %fd65, %fd65, %fd37; + st.volatile.shared.f64 [%rd1], %fd65; + +BB2_28: + mov.f64 %fd64, %fd65; + setp.lt.u32 %p16, %r9, 8; + @%p16 bra BB2_30; + + ld.volatile.shared.f64 %fd38, [%rd1+32]; + add.f64 %fd64, %fd64, %fd38; + st.volatile.shared.f64 [%rd1], %fd64; + +BB2_30: + mov.f64 %fd63, %fd64; + setp.lt.u32 %p17, %r9, 4; + @%p17 bra BB2_32; + + ld.volatile.shared.f64 %fd39, [%rd1+16]; + add.f64 %fd63, %fd63, %fd39; + st.volatile.shared.f64 [%rd1], %fd63; + +BB2_32: + setp.lt.u32 %p18, %r9, 2; + @%p18 bra BB2_34; + + ld.volatile.shared.f64 %fd40, [%rd1+8]; + add.f64 %fd41, %fd63, %fd40; + st.volatile.shared.f64 [%rd1], %fd41; + +BB2_34: + setp.ne.s32 %p19, %r6, 0; + @%p19 bra BB2_36; + + ld.shared.f64 %fd42, [sdata]; + mul.wide.u32 %rd10, %r7, 8; + add.s64 %rd11, %rd3, %rd10; + st.f64 [%rd11], %fd42; + +BB2_36: ret; } - // .globl compareAndSet -.visible .entry compareAndSet( - .param .u64 compareAndSet_param_0, - .param .u64 compareAndSet_param_1, - .param .u32 compareAndSet_param_2, - .param .u32 compareAndSet_param_3, - .param .f64 compareAndSet_param_4, - .param .f64 compareAndSet_param_5, - .param .f64 compareAndSet_param_6, - .param .f64 compareAndSet_param_7, - .param .f64 compareAndSet_param_8 + // .globl _Z10reduce_rowI5SumOpEvPdS1_jjT_d +.visible .func _Z10reduce_rowI5SumOpEvPdS1_jjT_d( + .param .b64 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_0, + .param .b64 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_1, + .param .b32 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_2, + .param .b32 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_3, + .param .align 1 .b8 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_4[1], + .param .b64 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_5 ) { - .reg .pred %p<6>; - .reg .b32 %r<12>; - .reg .f64 %fd<9>; - .reg .b64 %rd<8>; + .reg .pred %p<20>; + .reg .b32 %r<29>; + .reg .f64 %fd<41>; + .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_0]; + ld.param.u64 %rd3, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_1]; + ld.param.u32 %r7, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_2]; + ld.param.u32 %r6, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_3]; + ld.param.f64 %fd40, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_5]; + mov.u32 %r1, %ctaid.x; + setp.ge.u32 %p1, %r1, %r7; + @%p1 bra BB3_34; + + mov.u32 %r28, %tid.x; + mul.lo.s32 %r3, %r1, %r6; + setp.ge.u32 %p2, %r28, %r6; + @%p2 bra BB3_3; +BB3_2: + add.s32 %r8, %r28, %r3; + mul.wide.u32 %rd4, %r8, 8; + add.s64 %rd5, %rd2, %rd4; + ld.f64 %fd27, [%rd5]; + add.f64 %fd40, %fd40, %fd27; + mov.u32 %r9, %ntid.x; + add.s32 %r28, %r9, %r28; + setp.lt.u32 %p3, %r28, %r6; + @%p3 bra BB3_2; - ld.param.u64 %rd2, [compareAndSet_param_0]; - ld.param.u64 %rd3, [compareAndSet_param_1]; - ld.param.u32 %r2, [compareAndSet_param_2]; - ld.param.u32 %r3, [compareAndSet_param_3]; - ld.param.f64 %fd2, [compareAndSet_param_4]; - ld.param.f64 %fd3, [compareAndSet_param_5]; - ld.param.f64 %fd4, [compareAndSet_param_6]; - ld.param.f64 %fd5, [compareAndSet_param_7]; - ld.param.f64 %fd6, [compareAndSet_param_8]; - mov.u32 %r4, %ctaid.x; - mov.u32 %r5, %ntid.x; +BB3_3: + mov.u32 %r10, %tid.x; + mul.wide.u32 %rd6, %r10, 8; + mov.u64 %rd7, sdata; + add.s64 %rd1, %rd7, %rd6; + st.shared.f64 [%rd1], %fd40; + bar.sync 0; + mov.u32 %r11, %ntid.x; + setp.lt.u32 %p4, %r11, 1024; + @%p4 bra BB3_7; + + setp.gt.u32 %p5, %r10, 511; + @%p5 bra BB3_6; + + ld.shared.f64 %fd28, [%rd1+4096]; + add.f64 %fd40, %fd40, %fd28; + st.shared.f64 [%rd1], %fd40; + +BB3_6: + bar.sync 0; + +BB3_7: + setp.lt.u32 %p6, %r11, 512; + @%p6 bra BB3_11; + + setp.gt.u32 %p7, %r10, 255; + @%p7 bra BB3_10; + + ld.shared.f64 %fd29, [%rd1+2048]; + add.f64 %fd40, %fd40, %fd29; + st.shared.f64 [%rd1], %fd40; + +BB3_10: + bar.sync 0; + +BB3_11: + setp.lt.u32 %p8, %r11, 256; + @%p8 bra BB3_15; + + setp.gt.u32 %p9, %r10, 127; + @%p9 bra BB3_14; + + ld.shared.f64 %fd30, [%rd1+1024]; + add.f64 %fd40, %fd40, %fd30; + st.shared.f64 [%rd1], %fd40; + +BB3_14: + bar.sync 0; + +BB3_15: + setp.lt.u32 %p10, %r11, 128; + @%p10 bra BB3_19; + + setp.gt.u32 %p11, %r10, 63; + @%p11 bra BB3_18; + + ld.shared.f64 %fd31, [%rd1+512]; + add.f64 %fd40, %fd40, %fd31; + st.shared.f64 [%rd1], %fd40; + +BB3_18: + bar.sync 0; + +BB3_19: + setp.gt.u32 %p12, %r10, 31; + @%p12 bra BB3_32; + + setp.lt.u32 %p13, %r11, 64; + @%p13 bra BB3_22; + + ld.volatile.shared.f64 %fd32, [%rd1+256]; + add.f64 %fd40, %fd40, %fd32; + st.volatile.shared.f64 [%rd1], %fd40; + +BB3_22: + setp.lt.u32 %p14, %r11, 32; + @%p14 bra BB3_24; + + ld.volatile.shared.f64 %fd33, [%rd1+128]; + add.f64 %fd40, %fd40, %fd33; + st.volatile.shared.f64 [%rd1], %fd40; + +BB3_24: + setp.lt.u32 %p15, %r11, 16; + @%p15 bra BB3_26; + + ld.volatile.shared.f64 %fd34, [%rd1+64]; + add.f64 %fd40, %fd40, %fd34; + st.volatile.shared.f64 [%rd1], %fd40; + +BB3_26: + setp.lt.u32 %p16, %r11, 8; + @%p16 bra BB3_28; + + ld.volatile.shared.f64 %fd35, [%rd1+32]; + add.f64 %fd40, %fd40, %fd35; + st.volatile.shared.f64 [%rd1], %fd40; + +BB3_28: + setp.lt.u32 %p17, %r11, 4; + @%p17 bra BB3_30; + + ld.volatile.shared.f64 %fd36, [%rd1+16]; + add.f64 %fd40, %fd40, %fd36; + st.volatile.shared.f64 [%rd1], %fd40; + +BB3_30: + setp.lt.u32 %p18, %r11, 2; + @%p18 bra BB3_32; + + ld.volatile.shared.f64 %fd37, [%rd1+8]; + add.f64 %fd38, %fd40, %fd37; + st.volatile.shared.f64 [%rd1], %fd38; + +BB3_32: + setp.ne.s32 %p19, %r10, 0; + @%p19 bra BB3_34; + + ld.shared.f64 %fd39, [sdata]; + mul.wide.u32 %rd8, %r1, 8; + add.s64 %rd9, %rd3, %rd8; + st.f64 [%rd9], %fd39; + +BB3_34: + ret; +} + + // .globl _Z10reduce_colI5SumOpEvPdS1_jjT_d +.visible .func _Z10reduce_colI5SumOpEvPdS1_jjT_d( + .param .b64 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_0, + .param .b64 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_1, + .param .b32 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_2, + .param .b32 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_3, + .param .align 1 .b8 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_4[1], + .param .b64 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_5 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<11>; + .reg .f64 %fd<7>; + .reg .b64 %rd<7>; + + + ld.param.u64 %rd1, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_0]; + ld.param.u64 %rd2, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_1]; + ld.param.u32 %r5, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_2]; + ld.param.u32 %r6, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_3]; + ld.param.f64 %fd6, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_5]; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %ntid.x; + mov.u32 %r9, %tid.x; + mad.lo.s32 %r1, %r8, %r7, %r9; + setp.ge.u32 %p1, %r1, %r6; + @%p1 bra BB4_5; + + mul.lo.s32 %r2, %r6, %r5; + setp.ge.u32 %p2, %r1, %r2; + @%p2 bra BB4_4; + + mov.u32 %r10, %r1; + +BB4_3: + mov.u32 %r3, %r10; + mul.wide.u32 %rd3, %r3, 8; + add.s64 %rd4, %rd1, %rd3; + ld.f64 %fd5, [%rd4]; + add.f64 %fd6, %fd6, %fd5; + add.s32 %r4, %r3, %r6; + setp.lt.u32 %p3, %r4, %r2; + mov.u32 %r10, %r4; + @%p3 bra BB4_3; + +BB4_4: + mul.wide.u32 %rd5, %r1, 8; + add.s64 %rd6, %rd2, %rd5; + st.f64 [%rd6], %fd6; + +BB4_5: + ret; +} + + // .globl _Z6reduceI5MaxOpEvPdS1_jT_d +.visible .func _Z6reduceI5MaxOpEvPdS1_jT_d( + .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_0, + .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_1, + .param .b32 _Z6reduceI5MaxOpEvPdS1_jT_d_param_2, + .param .align 1 .b8 _Z6reduceI5MaxOpEvPdS1_jT_d_param_3[1], + .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_4 +) +{ + .reg .pred %p<20>; + .reg .b32 %r<33>; + .reg .f64 %fd<79>; + .reg .b64 %rd<12>; + + + ld.param.u64 %rd2, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_0]; + ld.param.u64 %rd3, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_1]; + ld.param.u32 %r5, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_2]; + ld.param.f64 %fd76, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_4]; mov.u32 %r6, %tid.x; - mad.lo.s32 %r7, %r5, %r4, %r6; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r11, %r8, %r9, %r10; - mad.lo.s32 %r1, %r7, %r3, %r11; - setp.lt.s32 %p1, %r7, %r2; - setp.lt.s32 %p2, %r11, %r3; - and.pred %p3, %p1, %p2; - @!%p3 bra BB6_6; - bra.uni BB6_1; + mov.u32 %r7, %ctaid.x; + shl.b32 %r8, %r7, 1; + mov.u32 %r9, %ntid.x; + mad.lo.s32 %r32, %r8, %r9, %r6; + setp.ge.u32 %p1, %r32, %r5; + @%p1 bra BB5_5; -BB6_1: - cvta.to.global.u64 %rd4, %rd2; - mul.wide.s32 %rd5, %r1, 8; - add.s64 %rd6, %rd4, %rd5; - ld.global.f64 %fd1, [%rd6]; - sub.f64 %fd7, %fd1, %fd2; - abs.f64 %fd8, %fd7; - setp.lt.f64 %p4, %fd8, %fd3; - cvta.to.global.u64 %rd7, %rd3; - add.s64 %rd1, %rd7, %rd5; - @%p4 bra BB6_5; - bra.uni BB6_2; + mov.f64 %fd77, %fd76; -BB6_5: - st.global.f64 [%rd1], %fd4; - bra.uni BB6_6; +BB5_2: + mov.f64 %fd1, %fd77; + mul.wide.u32 %rd4, %r32, 8; + add.s64 %rd5, %rd2, %rd4; + ld.f64 %fd29, [%rd5]; + max.f64 %fd78, %fd1, %fd29; + add.s32 %r3, %r32, %r9; + setp.ge.u32 %p2, %r3, %r5; + @%p2 bra BB5_4; -BB6_2: - setp.lt.f64 %p5, %fd1, %fd2; - @%p5 bra BB6_4; - bra.uni BB6_3; + mul.wide.u32 %rd6, %r3, 8; + add.s64 %rd7, %rd2, %rd6; + ld.f64 %fd30, [%rd7]; + max.f64 %fd78, %fd78, %fd30; -BB6_4: - st.global.f64 [%rd1], %fd5; - bra.uni BB6_6; +BB5_4: + mov.f64 %fd77, %fd78; + shl.b32 %r12, %r9, 1; + mov.u32 %r13, %nctaid.x; + mad.lo.s32 %r32, %r12, %r13, %r32; + setp.lt.u32 %p3, %r32, %r5; + mov.f64 %fd76, %fd77; + @%p3 bra BB5_2; + +BB5_5: + mov.f64 %fd74, %fd76; + mul.wide.u32 %rd8, %r6, 8; + mov.u64 %rd9, sdata; + add.s64 %rd1, %rd9, %rd8; + st.shared.f64 [%rd1], %fd74; + bar.sync 0; + setp.lt.u32 %p4, %r9, 1024; + @%p4 bra BB5_9; -BB6_3: - st.global.f64 [%rd1], %fd6; + setp.gt.u32 %p5, %r6, 511; + mov.f64 %fd75, %fd74; + @%p5 bra BB5_8; -BB6_6: + ld.shared.f64 %fd31, [%rd1+4096]; + max.f64 %fd75, %fd74, %fd31; + st.shared.f64 [%rd1], %fd75; + +BB5_8: + mov.f64 %fd74, %fd75; + bar.sync 0; + +BB5_9: + mov.f64 %fd72, %fd74; + setp.lt.u32 %p6, %r9, 512; + @%p6 bra BB5_13; + + setp.gt.u32 %p7, %r6, 255; + mov.f64 %fd73, %fd72; + @%p7 bra BB5_12; + + ld.shared.f64 %fd32, [%rd1+2048]; + max.f64 %fd73, %fd72, %fd32; + st.shared.f64 [%rd1], %fd73; + +BB5_12: + mov.f64 %fd72, %fd73; + bar.sync 0; + +BB5_13: + mov.f64 %fd70, %fd72; + setp.lt.u32 %p8, %r9, 256; + @%p8 bra BB5_17; + + setp.gt.u32 %p9, %r6, 127; + mov.f64 %fd71, %fd70; + @%p9 bra BB5_16; + + ld.shared.f64 %fd33, [%rd1+1024]; + max.f64 %fd71, %fd70, %fd33; + st.shared.f64 [%rd1], %fd71; + +BB5_16: + mov.f64 %fd70, %fd71; + bar.sync 0; + +BB5_17: + mov.f64 %fd68, %fd70; + setp.lt.u32 %p10, %r9, 128; + @%p10 bra BB5_21; + + setp.gt.u32 %p11, %r6, 63; + mov.f64 %fd69, %fd68; + @%p11 bra BB5_20; + + ld.shared.f64 %fd34, [%rd1+512]; + max.f64 %fd69, %fd68, %fd34; + st.shared.f64 [%rd1], %fd69; + +BB5_20: + mov.f64 %fd68, %fd69; + bar.sync 0; + +BB5_21: + mov.f64 %fd67, %fd68; + setp.gt.u32 %p12, %r6, 31; + @%p12 bra BB5_34; + + setp.lt.u32 %p13, %r9, 64; + @%p13 bra BB5_24; + + ld.volatile.shared.f64 %fd35, [%rd1+256]; + max.f64 %fd67, %fd67, %fd35; + st.volatile.shared.f64 [%rd1], %fd67; + +BB5_24: + mov.f64 %fd66, %fd67; + setp.lt.u32 %p14, %r9, 32; + @%p14 bra BB5_26; + + ld.volatile.shared.f64 %fd36, [%rd1+128]; + max.f64 %fd66, %fd66, %fd36; + st.volatile.shared.f64 [%rd1], %fd66; + +BB5_26: + mov.f64 %fd65, %fd66; + setp.lt.u32 %p15, %r9, 16; + @%p15 bra BB5_28; + + ld.volatile.shared.f64 %fd37, [%rd1+64]; + max.f64 %fd65, %fd65, %fd37; + st.volatile.shared.f64 [%rd1], %fd65; + +BB5_28: + mov.f64 %fd64, %fd65; + setp.lt.u32 %p16, %r9, 8; + @%p16 bra BB5_30; + + ld.volatile.shared.f64 %fd38, [%rd1+32]; + max.f64 %fd64, %fd64, %fd38; + st.volatile.shared.f64 [%rd1], %fd64; + +BB5_30: + mov.f64 %fd63, %fd64; + setp.lt.u32 %p17, %r9, 4; + @%p17 bra BB5_32; + + ld.volatile.shared.f64 %fd39, [%rd1+16]; + max.f64 %fd63, %fd63, %fd39; + st.volatile.shared.f64 [%rd1], %fd63; + +BB5_32: + setp.lt.u32 %p18, %r9, 2; + @%p18 bra BB5_34; + + ld.volatile.shared.f64 %fd40, [%rd1+8]; + max.f64 %fd41, %fd63, %fd40; + st.volatile.shared.f64 [%rd1], %fd41; + +BB5_34: + setp.ne.s32 %p19, %r6, 0; + @%p19 bra BB5_36; + + ld.shared.f64 %fd42, [sdata]; + mul.wide.u32 %rd10, %r7, 8; + add.s64 %rd11, %rd3, %rd10; + st.f64 [%rd11], %fd42; + +BB5_36: ret; } - // .globl binCellOp -.visible .entry binCellOp( - .param .u64 binCellOp_param_0, - .param .u64 binCellOp_param_1, - .param .u64 binCellOp_param_2, - .param .u32 binCellOp_param_3, - .param .u32 binCellOp_param_4, - .param .u32 binCellOp_param_5, - .param .u32 binCellOp_param_6, - .param .u32 binCellOp_param_7 + // .globl _Z10reduce_rowI5MaxOpEvPdS1_jjT_d +.visible .func _Z10reduce_rowI5MaxOpEvPdS1_jjT_d( + .param .b64 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_0, + .param .b64 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_1, + .param .b32 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_2, + .param .b32 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_3, + .param .align 1 .b8 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_4[1], + .param .b64 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_5 +) +{ + .reg .pred %p<20>; + .reg .b32 %r<29>; + .reg .f64 %fd<41>; + .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_0]; + ld.param.u64 %rd3, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_1]; + ld.param.u32 %r7, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_2]; + ld.param.u32 %r6, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_3]; + ld.param.f64 %fd40, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_5]; + mov.u32 %r1, %ctaid.x; + setp.ge.u32 %p1, %r1, %r7; + @%p1 bra BB6_34; + + mov.u32 %r28, %tid.x; + mul.lo.s32 %r3, %r1, %r6; + setp.ge.u32 %p2, %r28, %r6; + @%p2 bra BB6_3; + +BB6_2: + add.s32 %r8, %r28, %r3; + mul.wide.u32 %rd4, %r8, 8; + add.s64 %rd5, %rd2, %rd4; + ld.f64 %fd27, [%rd5]; + max.f64 %fd40, %fd40, %fd27; + mov.u32 %r9, %ntid.x; + add.s32 %r28, %r9, %r28; + setp.lt.u32 %p3, %r28, %r6; + @%p3 bra BB6_2; + +BB6_3: + mov.u32 %r10, %tid.x; + mul.wide.u32 %rd6, %r10, 8; + mov.u64 %rd7, sdata; + add.s64 %rd1, %rd7, %rd6; + st.shared.f64 [%rd1], %fd40; + bar.sync 0; + mov.u32 %r11, %ntid.x; + setp.lt.u32 %p4, %r11, 1024; + @%p4 bra BB6_7; + + setp.gt.u32 %p5, %r10, 511; + @%p5 bra BB6_6; + + ld.shared.f64 %fd28, [%rd1+4096]; + max.f64 %fd40, %fd40, %fd28; + st.shared.f64 [%rd1], %fd40; + +BB6_6: + bar.sync 0; + +BB6_7: + setp.lt.u32 %p6, %r11, 512; + @%p6 bra BB6_11; + + setp.gt.u32 %p7, %r10, 255; + @%p7 bra BB6_10; + + ld.shared.f64 %fd29, [%rd1+2048]; + max.f64 %fd40, %fd40, %fd29; + st.shared.f64 [%rd1], %fd40; + +BB6_10: + bar.sync 0; + +BB6_11: + setp.lt.u32 %p8, %r11, 256; + @%p8 bra BB6_15; + + setp.gt.u32 %p9, %r10, 127; + @%p9 bra BB6_14; + + ld.shared.f64 %fd30, [%rd1+1024]; + max.f64 %fd40, %fd40, %fd30; + st.shared.f64 [%rd1], %fd40; + +BB6_14: + bar.sync 0; + +BB6_15: + setp.lt.u32 %p10, %r11, 128; + @%p10 bra BB6_19; + + setp.gt.u32 %p11, %r10, 63; + @%p11 bra BB6_18; + + ld.shared.f64 %fd31, [%rd1+512]; + max.f64 %fd40, %fd40, %fd31; + st.shared.f64 [%rd1], %fd40; + +BB6_18: + bar.sync 0; + +BB6_19: + setp.gt.u32 %p12, %r10, 31; + @%p12 bra BB6_32; + + setp.lt.u32 %p13, %r11, 64; + @%p13 bra BB6_22; + + ld.volatile.shared.f64 %fd32, [%rd1+256]; + max.f64 %fd40, %fd40, %fd32; + st.volatile.shared.f64 [%rd1], %fd40; + +BB6_22: + setp.lt.u32 %p14, %r11, 32; + @%p14 bra BB6_24; + + ld.volatile.shared.f64 %fd33, [%rd1+128]; + max.f64 %fd40, %fd40, %fd33; + st.volatile.shared.f64 [%rd1], %fd40; + +BB6_24: + setp.lt.u32 %p15, %r11, 16; + @%p15 bra BB6_26; + + ld.volatile.shared.f64 %fd34, [%rd1+64]; + max.f64 %fd40, %fd40, %fd34; + st.volatile.shared.f64 [%rd1], %fd40; + +BB6_26: + setp.lt.u32 %p16, %r11, 8; + @%p16 bra BB6_28; + + ld.volatile.shared.f64 %fd35, [%rd1+32]; + max.f64 %fd40, %fd40, %fd35; + st.volatile.shared.f64 [%rd1], %fd40; + +BB6_28: + setp.lt.u32 %p17, %r11, 4; + @%p17 bra BB6_30; + + ld.volatile.shared.f64 %fd36, [%rd1+16]; + max.f64 %fd40, %fd40, %fd36; + st.volatile.shared.f64 [%rd1], %fd40; + +BB6_30: + setp.lt.u32 %p18, %r11, 2; + @%p18 bra BB6_32; + + ld.volatile.shared.f64 %fd37, [%rd1+8]; + max.f64 %fd38, %fd40, %fd37; + st.volatile.shared.f64 [%rd1], %fd38; + +BB6_32: + setp.ne.s32 %p19, %r10, 0; + @%p19 bra BB6_34; + + ld.shared.f64 %fd39, [sdata]; + mul.wide.u32 %rd8, %r1, 8; + add.s64 %rd9, %rd3, %rd8; + st.f64 [%rd9], %fd39; + +BB6_34: + ret; +} + + // .globl _Z10reduce_colI5MaxOpEvPdS1_jjT_d +.visible .func _Z10reduce_colI5MaxOpEvPdS1_jjT_d( + .param .b64 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_0, + .param .b64 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_1, + .param .b32 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_2, + .param .b32 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_3, + .param .align 1 .b8 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_4[1], + .param .b64 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_5 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<11>; + .reg .f64 %fd<7>; + .reg .b64 %rd<7>; + + + ld.param.u64 %rd1, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_0]; + ld.param.u64 %rd2, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_1]; + ld.param.u32 %r5, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_2]; + ld.param.u32 %r6, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_3]; + ld.param.f64 %fd6, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_5]; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %ntid.x; + mov.u32 %r9, %tid.x; + mad.lo.s32 %r1, %r8, %r7, %r9; + setp.ge.u32 %p1, %r1, %r6; + @%p1 bra BB7_5; + + mul.lo.s32 %r2, %r6, %r5; + setp.ge.u32 %p2, %r1, %r2; + @%p2 bra BB7_4; + + mov.u32 %r10, %r1; + +BB7_3: + mov.u32 %r3, %r10; + mul.wide.u32 %rd3, %r3, 8; + add.s64 %rd4, %rd1, %rd3; + ld.f64 %fd5, [%rd4]; + max.f64 %fd6, %fd6, %fd5; + add.s32 %r4, %r3, %r6; + setp.lt.u32 %p3, %r4, %r2; + mov.u32 %r10, %r4; + @%p3 bra BB7_3; + +BB7_4: + mul.wide.u32 %rd5, %r1, 8; + add.s64 %rd6, %rd2, %rd5; + st.f64 [%rd6], %fd6; + +BB7_5: + ret; +} + + // .globl _Z6reduceI5MinOpEvPdS1_jT_d +.visible .func _Z6reduceI5MinOpEvPdS1_jT_d( + .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_0, + .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_1, + .param .b32 _Z6reduceI5MinOpEvPdS1_jT_d_param_2, + .param .align 1 .b8 _Z6reduceI5MinOpEvPdS1_jT_d_param_3[1], + .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_4 +) +{ + .reg .pred %p<20>; + .reg .b32 %r<33>; + .reg .f64 %fd<79>; + .reg .b64 %rd<12>; + + + ld.param.u64 %rd2, [_Z6reduceI5MinOpEvPdS1_jT_d_param_0]; + ld.param.u64 %rd3, [_Z6reduceI5MinOpEvPdS1_jT_d_param_1]; + ld.param.u32 %r5, [_Z6reduceI5MinOpEvPdS1_jT_d_param_2]; + ld.param.f64 %fd76, [_Z6reduceI5MinOpEvPdS1_jT_d_param_4]; + mov.u32 %r6, %tid.x; + mov.u32 %r7, %ctaid.x; + shl.b32 %r8, %r7, 1; + mov.u32 %r9, %ntid.x; + mad.lo.s32 %r32, %r8, %r9, %r6; + setp.ge.u32 %p1, %r32, %r5; + @%p1 bra BB8_5; + + mov.f64 %fd77, %fd76; + +BB8_2: + mov.f64 %fd1, %fd77; + mul.wide.u32 %rd4, %r32, 8; + add.s64 %rd5, %rd2, %rd4; + ld.f64 %fd29, [%rd5]; + min.f64 %fd78, %fd1, %fd29; + add.s32 %r3, %r32, %r9; + setp.ge.u32 %p2, %r3, %r5; + @%p2 bra BB8_4; + + mul.wide.u32 %rd6, %r3, 8; + add.s64 %rd7, %rd2, %rd6; + ld.f64 %fd30, [%rd7]; + min.f64 %fd78, %fd78, %fd30; + +BB8_4: + mov.f64 %fd77, %fd78; + shl.b32 %r12, %r9, 1; + mov.u32 %r13, %nctaid.x; + mad.lo.s32 %r32, %r12, %r13, %r32; + setp.lt.u32 %p3, %r32, %r5; + mov.f64 %fd76, %fd77; + @%p3 bra BB8_2; + +BB8_5: + mov.f64 %fd74, %fd76; + mul.wide.u32 %rd8, %r6, 8; + mov.u64 %rd9, sdata; + add.s64 %rd1, %rd9, %rd8; + st.shared.f64 [%rd1], %fd74; + bar.sync 0; + setp.lt.u32 %p4, %r9, 1024; + @%p4 bra BB8_9; + + setp.gt.u32 %p5, %r6, 511; + mov.f64 %fd75, %fd74; + @%p5 bra BB8_8; + + ld.shared.f64 %fd31, [%rd1+4096]; + min.f64 %fd75, %fd74, %fd31; + st.shared.f64 [%rd1], %fd75; + +BB8_8: + mov.f64 %fd74, %fd75; + bar.sync 0; + +BB8_9: + mov.f64 %fd72, %fd74; + setp.lt.u32 %p6, %r9, 512; + @%p6 bra BB8_13; + + setp.gt.u32 %p7, %r6, 255; + mov.f64 %fd73, %fd72; + @%p7 bra BB8_12; + + ld.shared.f64 %fd32, [%rd1+2048]; + min.f64 %fd73, %fd72, %fd32; + st.shared.f64 [%rd1], %fd73; + +BB8_12: + mov.f64 %fd72, %fd73; + bar.sync 0; + +BB8_13: + mov.f64 %fd70, %fd72; + setp.lt.u32 %p8, %r9, 256; + @%p8 bra BB8_17; + + setp.gt.u32 %p9, %r6, 127; + mov.f64 %fd71, %fd70; + @%p9 bra BB8_16; + + ld.shared.f64 %fd33, [%rd1+1024]; + min.f64 %fd71, %fd70, %fd33; + st.shared.f64 [%rd1], %fd71; + +BB8_16: + mov.f64 %fd70, %fd71; + bar.sync 0; + +BB8_17: + mov.f64 %fd68, %fd70; + setp.lt.u32 %p10, %r9, 128; + @%p10 bra BB8_21; + + setp.gt.u32 %p11, %r6, 63; + mov.f64 %fd69, %fd68; + @%p11 bra BB8_20; + + ld.shared.f64 %fd34, [%rd1+512]; + min.f64 %fd69, %fd68, %fd34; + st.shared.f64 [%rd1], %fd69; + +BB8_20: + mov.f64 %fd68, %fd69; + bar.sync 0; + +BB8_21: + mov.f64 %fd67, %fd68; + setp.gt.u32 %p12, %r6, 31; + @%p12 bra BB8_34; + + setp.lt.u32 %p13, %r9, 64; + @%p13 bra BB8_24; + + ld.volatile.shared.f64 %fd35, [%rd1+256]; + min.f64 %fd67, %fd67, %fd35; + st.volatile.shared.f64 [%rd1], %fd67; + +BB8_24: + mov.f64 %fd66, %fd67; + setp.lt.u32 %p14, %r9, 32; + @%p14 bra BB8_26; + + ld.volatile.shared.f64 %fd36, [%rd1+128]; + min.f64 %fd66, %fd66, %fd36; + st.volatile.shared.f64 [%rd1], %fd66; + +BB8_26: + mov.f64 %fd65, %fd66; + setp.lt.u32 %p15, %r9, 16; + @%p15 bra BB8_28; + + ld.volatile.shared.f64 %fd37, [%rd1+64]; + min.f64 %fd65, %fd65, %fd37; + st.volatile.shared.f64 [%rd1], %fd65; + +BB8_28: + mov.f64 %fd64, %fd65; + setp.lt.u32 %p16, %r9, 8; + @%p16 bra BB8_30; + + ld.volatile.shared.f64 %fd38, [%rd1+32]; + min.f64 %fd64, %fd64, %fd38; + st.volatile.shared.f64 [%rd1], %fd64; + +BB8_30: + mov.f64 %fd63, %fd64; + setp.lt.u32 %p17, %r9, 4; + @%p17 bra BB8_32; + + ld.volatile.shared.f64 %fd39, [%rd1+16]; + min.f64 %fd63, %fd63, %fd39; + st.volatile.shared.f64 [%rd1], %fd63; + +BB8_32: + setp.lt.u32 %p18, %r9, 2; + @%p18 bra BB8_34; + + ld.volatile.shared.f64 %fd40, [%rd1+8]; + min.f64 %fd41, %fd63, %fd40; + st.volatile.shared.f64 [%rd1], %fd41; + +BB8_34: + setp.ne.s32 %p19, %r6, 0; + @%p19 bra BB8_36; + + ld.shared.f64 %fd42, [sdata]; + mul.wide.u32 %rd10, %r7, 8; + add.s64 %rd11, %rd3, %rd10; + st.f64 [%rd11], %fd42; + +BB8_36: + ret; +} + + // .globl _Z10reduce_rowI5MinOpEvPdS1_jjT_d +.visible .func _Z10reduce_rowI5MinOpEvPdS1_jjT_d( + .param .b64 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_0, + .param .b64 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_1, + .param .b32 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_2, + .param .b32 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_3, + .param .align 1 .b8 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_4[1], + .param .b64 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_5 +) +{ + .reg .pred %p<20>; + .reg .b32 %r<29>; + .reg .f64 %fd<41>; + .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_0]; + ld.param.u64 %rd3, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_1]; + ld.param.u32 %r7, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_2]; + ld.param.u32 %r6, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_3]; + ld.param.f64 %fd40, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_5]; + mov.u32 %r1, %ctaid.x; + setp.ge.u32 %p1, %r1, %r7; + @%p1 bra BB9_34; + + mov.u32 %r28, %tid.x; + mul.lo.s32 %r3, %r1, %r6; + setp.ge.u32 %p2, %r28, %r6; + @%p2 bra BB9_3; + +BB9_2: + add.s32 %r8, %r28, %r3; + mul.wide.u32 %rd4, %r8, 8; + add.s64 %rd5, %rd2, %rd4; + ld.f64 %fd27, [%rd5]; + min.f64 %fd40, %fd40, %fd27; + mov.u32 %r9, %ntid.x; + add.s32 %r28, %r9, %r28; + setp.lt.u32 %p3, %r28, %r6; + @%p3 bra BB9_2; + +BB9_3: + mov.u32 %r10, %tid.x; + mul.wide.u32 %rd6, %r10, 8; + mov.u64 %rd7, sdata; + add.s64 %rd1, %rd7, %rd6; + st.shared.f64 [%rd1], %fd40; + bar.sync 0; + mov.u32 %r11, %ntid.x; + setp.lt.u32 %p4, %r11, 1024; + @%p4 bra BB9_7; + + setp.gt.u32 %p5, %r10, 511; + @%p5 bra BB9_6; + + ld.shared.f64 %fd28, [%rd1+4096]; + min.f64 %fd40, %fd40, %fd28; + st.shared.f64 [%rd1], %fd40; + +BB9_6: + bar.sync 0; + +BB9_7: + setp.lt.u32 %p6, %r11, 512; + @%p6 bra BB9_11; + + setp.gt.u32 %p7, %r10, 255; + @%p7 bra BB9_10; + + ld.shared.f64 %fd29, [%rd1+2048]; + min.f64 %fd40, %fd40, %fd29; + st.shared.f64 [%rd1], %fd40; + +BB9_10: + bar.sync 0; + +BB9_11: + setp.lt.u32 %p8, %r11, 256; + @%p8 bra BB9_15; + + setp.gt.u32 %p9, %r10, 127; + @%p9 bra BB9_14; + + ld.shared.f64 %fd30, [%rd1+1024]; + min.f64 %fd40, %fd40, %fd30; + st.shared.f64 [%rd1], %fd40; + +BB9_14: + bar.sync 0; + +BB9_15: + setp.lt.u32 %p10, %r11, 128; + @%p10 bra BB9_19; + + setp.gt.u32 %p11, %r10, 63; + @%p11 bra BB9_18; + + ld.shared.f64 %fd31, [%rd1+512]; + min.f64 %fd40, %fd40, %fd31; + st.shared.f64 [%rd1], %fd40; + +BB9_18: + bar.sync 0; + +BB9_19: + setp.gt.u32 %p12, %r10, 31; + @%p12 bra BB9_32; + + setp.lt.u32 %p13, %r11, 64; + @%p13 bra BB9_22; + + ld.volatile.shared.f64 %fd32, [%rd1+256]; + min.f64 %fd40, %fd40, %fd32; + st.volatile.shared.f64 [%rd1], %fd40; + +BB9_22: + setp.lt.u32 %p14, %r11, 32; + @%p14 bra BB9_24; + + ld.volatile.shared.f64 %fd33, [%rd1+128]; + min.f64 %fd40, %fd40, %fd33; + st.volatile.shared.f64 [%rd1], %fd40; + +BB9_24: + setp.lt.u32 %p15, %r11, 16; + @%p15 bra BB9_26; + + ld.volatile.shared.f64 %fd34, [%rd1+64]; + min.f64 %fd40, %fd40, %fd34; + st.volatile.shared.f64 [%rd1], %fd40; + +BB9_26: + setp.lt.u32 %p16, %r11, 8; + @%p16 bra BB9_28; + + ld.volatile.shared.f64 %fd35, [%rd1+32]; + min.f64 %fd40, %fd40, %fd35; + st.volatile.shared.f64 [%rd1], %fd40; + +BB9_28: + setp.lt.u32 %p17, %r11, 4; + @%p17 bra BB9_30; + + ld.volatile.shared.f64 %fd36, [%rd1+16]; + min.f64 %fd40, %fd40, %fd36; + st.volatile.shared.f64 [%rd1], %fd40; + +BB9_30: + setp.lt.u32 %p18, %r11, 2; + @%p18 bra BB9_32; + + ld.volatile.shared.f64 %fd37, [%rd1+8]; + min.f64 %fd38, %fd40, %fd37; + st.volatile.shared.f64 [%rd1], %fd38; + +BB9_32: + setp.ne.s32 %p19, %r10, 0; + @%p19 bra BB9_34; + + ld.shared.f64 %fd39, [sdata]; + mul.wide.u32 %rd8, %r1, 8; + add.s64 %rd9, %rd3, %rd8; + st.f64 [%rd9], %fd39; + +BB9_34: + ret; +} + + // .globl _Z10reduce_colI5MinOpEvPdS1_jjT_d +.visible .func _Z10reduce_colI5MinOpEvPdS1_jjT_d( + .param .b64 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_0, + .param .b64 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_1, + .param .b32 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_2, + .param .b32 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_3, + .param .align 1 .b8 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_4[1], + .param .b64 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_5 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<11>; + .reg .f64 %fd<7>; + .reg .b64 %rd<7>; + + + ld.param.u64 %rd1, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_0]; + ld.param.u64 %rd2, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_1]; + ld.param.u32 %r5, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_2]; + ld.param.u32 %r6, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_3]; + ld.param.f64 %fd6, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_5]; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %ntid.x; + mov.u32 %r9, %tid.x; + mad.lo.s32 %r1, %r8, %r7, %r9; + setp.ge.u32 %p1, %r1, %r6; + @%p1 bra BB10_5; + + mul.lo.s32 %r2, %r6, %r5; + setp.ge.u32 %p2, %r1, %r2; + @%p2 bra BB10_4; + + mov.u32 %r10, %r1; + +BB10_3: + mov.u32 %r3, %r10; + mul.wide.u32 %rd3, %r3, 8; + add.s64 %rd4, %rd1, %rd3; + ld.f64 %fd5, [%rd4]; + min.f64 %fd6, %fd6, %fd5; + add.s32 %r4, %r3, %r6; + setp.lt.u32 %p3, %r4, %r2; + mov.u32 %r10, %r4; + @%p3 bra BB10_3; + +BB10_4: + mul.wide.u32 %rd5, %r1, 8; + add.s64 %rd6, %rd2, %rd5; + st.f64 [%rd6], %fd6; + +BB10_5: + ret; +} + + // .globl copyUpperToLowerTriangleDense +.visible .entry copyUpperToLowerTriangleDense( + .param .u64 copyUpperToLowerTriangleDense_param_0, + .param .u32 copyUpperToLowerTriangleDense_param_1, + .param .u32 copyUpperToLowerTriangleDense_param_2 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<13>; + .reg .f64 %fd<2>; + .reg .b64 %rd<7>; + + + ld.param.u64 %rd1, [copyUpperToLowerTriangleDense_param_0]; + ld.param.u32 %r4, [copyUpperToLowerTriangleDense_param_1]; + ld.param.u32 %r5, [copyUpperToLowerTriangleDense_param_2]; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r6, %r7, %r8; + mov.u32 %r9, %ntid.y; + mov.u32 %r10, %ctaid.y; + mov.u32 %r11, %tid.y; + mad.lo.s32 %r2, %r9, %r10, %r11; + mad.lo.s32 %r3, %r2, %r4, %r1; + setp.gt.s32 %p1, %r2, %r1; + setp.lt.s32 %p2, %r3, %r5; + and.pred %p3, %p1, %p2; + @!%p3 bra BB11_2; + bra.uni BB11_1; + +BB11_1: + cvta.to.global.u64 %rd2, %rd1; + mad.lo.s32 %r12, %r1, %r4, %r2; + mul.wide.s32 %rd3, %r12, 8; + add.s64 %rd4, %rd2, %rd3; + ld.global.f64 %fd1, [%rd4]; + mul.wide.s32 %rd5, %r3, 8; + add.s64 %rd6, %rd2, %rd5; + st.global.f64 [%rd6], %fd1; + +BB11_2: + ret; +} + + // .globl dense_matrix_set +.visible .entry dense_matrix_set( + .param .u64 dense_matrix_set_param_0, + .param .f64 dense_matrix_set_param_1, + .param .u32 dense_matrix_set_param_2, + .param .u32 dense_matrix_set_param_3 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<13>; + .reg .f64 %fd<2>; + .reg .b64 %rd<5>; + + + ld.param.u64 %rd1, [dense_matrix_set_param_0]; + ld.param.f64 %fd1, [dense_matrix_set_param_1]; + ld.param.u32 %r2, [dense_matrix_set_param_2]; + ld.param.u32 %r3, [dense_matrix_set_param_3]; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r7, %r5, %r4, %r6; + mov.u32 %r8, %ntid.y; + mov.u32 %r9, %ctaid.y; + mov.u32 %r10, %tid.y; + mad.lo.s32 %r11, %r7, %r3, %r10; + mad.lo.s32 %r1, %r8, %r9, %r11; + mul.lo.s32 %r12, %r3, %r2; + setp.ge.s32 %p1, %r1, %r12; + @%p1 bra BB12_2; + + cvta.to.global.u64 %rd2, %rd1; + mul.wide.s32 %rd3, %r1, 8; + add.s64 %rd4, %rd2, %rd3; + st.global.f64 [%rd4], %fd1; + +BB12_2: + ret; +} + + // .globl dense_matrix_copy +.visible .entry dense_matrix_copy( + .param .u64 dense_matrix_copy_param_0, + .param .u64 dense_matrix_copy_param_1, + .param .u32 dense_matrix_copy_param_2, + .param .u32 dense_matrix_copy_param_3 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<12>; + .reg .f64 %fd<2>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [dense_matrix_copy_param_0]; + ld.param.u64 %rd2, [dense_matrix_copy_param_1]; + ld.param.u32 %r2, [dense_matrix_copy_param_2]; + ld.param.u32 %r3, [dense_matrix_copy_param_3]; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r7, %r5, %r4, %r6; + mov.u32 %r8, %ntid.y; + mov.u32 %r9, %ctaid.y; + mov.u32 %r10, %tid.y; + mad.lo.s32 %r11, %r8, %r9, %r10; + mad.lo.s32 %r1, %r7, %r3, %r11; + setp.lt.s32 %p1, %r7, %r2; + setp.lt.s32 %p2, %r11, %r3; + and.pred %p3, %p1, %p2; + @!%p3 bra BB13_2; + bra.uni BB13_1; + +BB13_1: + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, [%rd5]; + cvta.to.global.u64 %rd6, %rd2; + add.s64 %rd7, %rd6, %rd4; + st.global.f64 [%rd7], %fd1; + +BB13_2: + ret; +} + + // .globl relu +.visible .entry relu( + .param .u64 relu_param_0, + .param .u64 relu_param_1, + .param .u32 relu_param_2, + .param .u32 relu_param_3 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<12>; + .reg .f64 %fd<4>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [relu_param_0]; + ld.param.u64 %rd2, [relu_param_1]; + ld.param.u32 %r4, [relu_param_2]; + ld.param.u32 %r3, [relu_param_3]; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r6, %r5, %r7; + mov.u32 %r8, %ntid.y; + mov.u32 %r9, %ctaid.y; + mov.u32 %r10, %tid.y; + mad.lo.s32 %r2, %r8, %r9, %r10; + setp.lt.s32 %p1, %r1, %r4; + setp.lt.s32 %p2, %r2, %r3; + and.pred %p3, %p1, %p2; + @!%p3 bra BB14_2; + bra.uni BB14_1; + +BB14_1: + cvta.to.global.u64 %rd3, %rd1; + mad.lo.s32 %r11, %r1, %r3, %r2; + mul.wide.s32 %rd4, %r11, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, [%rd5]; + mov.f64 %fd2, 0d0000000000000000; + max.f64 %fd3, %fd2, %fd1; + cvta.to.global.u64 %rd6, %rd2; + add.s64 %rd7, %rd6, %rd4; + st.global.f64 [%rd7], %fd3; + +BB14_2: + ret; +} + + // .globl reluBackward +.visible .entry reluBackward( + .param .u64 reluBackward_param_0, + .param .u64 reluBackward_param_1, + .param .u64 reluBackward_param_2, + .param .u32 reluBackward_param_3, + .param .u32 reluBackward_param_4 +) +{ + .reg .pred %p<5>; + .reg .b32 %r<12>; + .reg .f64 %fd<6>; + .reg .b64 %rd<13>; + + + ld.param.u64 %rd1, [reluBackward_param_0]; + ld.param.u64 %rd2, [reluBackward_param_1]; + ld.param.u64 %rd3, [reluBackward_param_2]; + ld.param.u32 %r5, [reluBackward_param_3]; + ld.param.u32 %r4, [reluBackward_param_4]; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r6, %r7, %r8; + mov.u32 %r9, %ntid.y; + mov.u32 %r10, %ctaid.y; + mov.u32 %r11, %tid.y; + mad.lo.s32 %r2, %r9, %r10, %r11; + setp.lt.s32 %p1, %r1, %r5; + setp.lt.s32 %p2, %r2, %r4; + and.pred %p3, %p1, %p2; + @!%p3 bra BB15_4; + bra.uni BB15_1; + +BB15_1: + cvta.to.global.u64 %rd4, %rd1; + mad.lo.s32 %r3, %r1, %r4, %r2; + mul.wide.s32 %rd5, %r3, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd4, [%rd6]; + mov.f64 %fd5, 0d0000000000000000; + setp.leu.f64 %p4, %fd4, 0d0000000000000000; + @%p4 bra BB15_3; + + cvta.to.global.u64 %rd7, %rd2; + add.s64 %rd9, %rd7, %rd5; + ld.global.f64 %fd5, [%rd9]; + +BB15_3: + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd12, %rd10, %rd5; + st.global.f64 [%rd12], %fd5; + +BB15_4: + ret; +} + + // .globl biasAdd +.visible .entry biasAdd( + .param .u64 biasAdd_param_0, + .param .u64 biasAdd_param_1, + .param .u64 biasAdd_param_2, + .param .u32 biasAdd_param_3, + .param .u32 biasAdd_param_4, + .param .u32 biasAdd_param_5 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<14>; + .reg .f64 %fd<4>; + .reg .b64 %rd<12>; + + + ld.param.u64 %rd1, [biasAdd_param_0]; + ld.param.u64 %rd2, [biasAdd_param_1]; + ld.param.u64 %rd3, [biasAdd_param_2]; + ld.param.u32 %r5, [biasAdd_param_3]; + ld.param.u32 %r3, [biasAdd_param_4]; + ld.param.u32 %r4, [biasAdd_param_5]; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r7, %r6, %r8; + mov.u32 %r9, %ntid.y; + mov.u32 %r10, %ctaid.y; + mov.u32 %r11, %tid.y; + mad.lo.s32 %r2, %r9, %r10, %r11; + setp.lt.s32 %p1, %r1, %r5; + setp.lt.s32 %p2, %r2, %r3; + and.pred %p3, %p1, %p2; + @!%p3 bra BB16_2; + bra.uni BB16_1; + +BB16_1: + cvta.to.global.u64 %rd4, %rd1; + mad.lo.s32 %r12, %r1, %r3, %r2; + mul.wide.s32 %rd5, %r12, 8; + add.s64 %rd6, %rd4, %rd5; + div.s32 %r13, %r2, %r4; + cvta.to.global.u64 %rd7, %rd2; + mul.wide.s32 %rd8, %r13, 8; + add.s64 %rd9, %rd7, %rd8; + ld.global.f64 %fd1, [%rd9]; + ld.global.f64 %fd2, [%rd6]; + add.f64 %fd3, %fd2, %fd1; + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd11, %rd10, %rd5; + st.global.f64 [%rd11], %fd3; + +BB16_2: + ret; +} + + // .globl compareAndSet +.visible .entry compareAndSet( + .param .u64 compareAndSet_param_0, + .param .u64 compareAndSet_param_1, + .param .u32 compareAndSet_param_2, + .param .u32 compareAndSet_param_3, + .param .f64 compareAndSet_param_4, + .param .f64 compareAndSet_param_5, + .param .f64 compareAndSet_param_6, + .param .f64 compareAndSet_param_7, + .param .f64 compareAndSet_param_8 +) +{ + .reg .pred %p<6>; + .reg .b32 %r<12>; + .reg .f64 %fd<9>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd2, [compareAndSet_param_0]; + ld.param.u64 %rd3, [compareAndSet_param_1]; + ld.param.u32 %r2, [compareAndSet_param_2]; + ld.param.u32 %r3, [compareAndSet_param_3]; + ld.param.f64 %fd2, [compareAndSet_param_4]; + ld.param.f64 %fd3, [compareAndSet_param_5]; + ld.param.f64 %fd4, [compareAndSet_param_6]; + ld.param.f64 %fd5, [compareAndSet_param_7]; + ld.param.f64 %fd6, [compareAndSet_param_8]; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r7, %r5, %r4, %r6; + mov.u32 %r8, %ntid.y; + mov.u32 %r9, %ctaid.y; + mov.u32 %r10, %tid.y; + mad.lo.s32 %r11, %r8, %r9, %r10; + mad.lo.s32 %r1, %r7, %r3, %r11; + setp.lt.s32 %p1, %r7, %r2; + setp.lt.s32 %p2, %r11, %r3; + and.pred %p3, %p1, %p2; + @!%p3 bra BB17_6; + bra.uni BB17_1; + +BB17_1: + cvta.to.global.u64 %rd4, %rd2; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd1, [%rd6]; + sub.f64 %fd7, %fd1, %fd2; + abs.f64 %fd8, %fd7; + setp.lt.f64 %p4, %fd8, %fd3; + cvta.to.global.u64 %rd7, %rd3; + add.s64 %rd1, %rd7, %rd5; + @%p4 bra BB17_5; + bra.uni BB17_2; + +BB17_5: + st.global.f64 [%rd1], %fd4; + bra.uni BB17_6; + +BB17_2: + setp.lt.f64 %p5, %fd1, %fd2; + @%p5 bra BB17_4; + bra.uni BB17_3; + +BB17_4: + st.global.f64 [%rd1], %fd5; + bra.uni BB17_6; + +BB17_3: + st.global.f64 [%rd1], %fd6; + +BB17_6: + ret; +} + + // .globl binCellOp +.visible .entry binCellOp( + .param .u64 binCellOp_param_0, + .param .u64 binCellOp_param_1, + .param .u64 binCellOp_param_2, + .param .u32 binCellOp_param_3, + .param .u32 binCellOp_param_4, + .param .u32 binCellOp_param_5, + .param .u32 binCellOp_param_6, + .param .u32 binCellOp_param_7 +) +{ + .reg .pred %p<52>; + .reg .b32 %r<56>; + .reg .f64 %fd<40>; + .reg .b64 %rd<15>; + + + ld.param.u64 %rd2, [binCellOp_param_0]; + ld.param.u64 %rd3, [binCellOp_param_1]; + ld.param.u64 %rd4, [binCellOp_param_2]; + ld.param.u32 %r14, [binCellOp_param_3]; + ld.param.u32 %r10, [binCellOp_param_4]; + ld.param.u32 %r11, [binCellOp_param_5]; + ld.param.u32 %r12, [binCellOp_param_6]; + ld.param.u32 %r13, [binCellOp_param_7]; + mov.u32 %r15, %ntid.x; + mov.u32 %r16, %ctaid.x; + mov.u32 %r17, %tid.x; + mad.lo.s32 %r1, %r15, %r16, %r17; + mov.u32 %r18, %ntid.y; + mov.u32 %r19, %ctaid.y; + mov.u32 %r20, %tid.y; + mad.lo.s32 %r2, %r18, %r19, %r20; + setp.lt.s32 %p2, %r1, %r14; + setp.lt.s32 %p3, %r2, %r10; + and.pred %p4, %p2, %p3; + @!%p4 bra BB18_55; + bra.uni BB18_1; + +BB18_1: + mad.lo.s32 %r3, %r1, %r10, %r2; + setp.eq.s32 %p5, %r11, 1; + mov.u32 %r54, %r1; + @%p5 bra BB18_5; + + setp.ne.s32 %p6, %r11, 2; + mov.u32 %r55, %r3; + @%p6 bra BB18_4; + + mov.u32 %r55, %r2; + +BB18_4: + mov.u32 %r49, %r55; + mov.u32 %r4, %r49; + mov.u32 %r54, %r4; + +BB18_5: + mov.u32 %r5, %r54; + setp.eq.s32 %p7, %r12, 1; + mov.u32 %r52, %r1; + @%p7 bra BB18_9; + + setp.ne.s32 %p8, %r12, 2; + mov.u32 %r53, %r3; + @%p8 bra BB18_8; + + mov.u32 %r53, %r2; + +BB18_8: + mov.u32 %r52, %r53; + +BB18_9: + cvta.to.global.u64 %rd5, %rd3; + cvta.to.global.u64 %rd6, %rd2; + mul.wide.s32 %rd7, %r5, 8; + add.s64 %rd8, %rd6, %rd7; + ld.global.f64 %fd1, [%rd8]; + mul.wide.s32 %rd9, %r52, 8; + add.s64 %rd10, %rd5, %rd9; + ld.global.f64 %fd2, [%rd10]; + mov.f64 %fd39, 0dC08F380000000000; + setp.gt.s32 %p9, %r13, 5; + @%p9 bra BB18_19; + + setp.gt.s32 %p19, %r13, 2; + @%p19 bra BB18_15; + + setp.eq.s32 %p23, %r13, 0; + @%p23 bra BB18_53; + + setp.eq.s32 %p24, %r13, 1; + @%p24 bra BB18_52; + bra.uni BB18_13; + +BB18_52: + sub.f64 %fd39, %fd1, %fd2; + bra.uni BB18_54; + +BB18_19: + setp.gt.s32 %p10, %r13, 8; + @%p10 bra BB18_24; + + setp.eq.s32 %p16, %r13, 6; + @%p16 bra BB18_34; + + setp.eq.s32 %p17, %r13, 7; + @%p17 bra BB18_33; + bra.uni BB18_22; + +BB18_33: + setp.gt.f64 %p29, %fd1, %fd2; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29; + bra.uni BB18_54; + +BB18_15: + setp.eq.s32 %p20, %r13, 3; + @%p20 bra BB18_51; + + setp.eq.s32 %p21, %r13, 4; + @%p21 bra BB18_35; + bra.uni BB18_17; + +BB18_35: + { + .reg .b32 %temp; + mov.b64 {%temp, %r8}, %fd1; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r9}, %fd2; + } + bfe.u32 %r21, %r9, 20, 11; + add.s32 %r22, %r21, -1012; + mov.b64 %rd11, %fd2; + shl.b64 %rd1, %rd11, %r22; + setp.eq.s64 %p32, %rd1, -9223372036854775808; + abs.f64 %fd11, %fd1; + // Callseq Start 1 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd11; + .param .b64 param1; + st.param.f64 [param1+0], %fd2; + .param .b64 retval0; + call.uni (retval0), + __internal_accurate_pow, + ( + param0, + param1 + ); + ld.param.f64 %fd38, [retval0+0]; + + //{ + }// Callseq End 1 + setp.lt.s32 %p33, %r8, 0; + and.pred %p1, %p33, %p32; + @!%p1 bra BB18_37; + bra.uni BB18_36; + +BB18_36: + { + .reg .b32 %temp; + mov.b64 {%temp, %r23}, %fd38; + } + xor.b32 %r24, %r23, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%r25, %temp}, %fd38; + } + mov.b64 %fd38, {%r25, %r24}; + +BB18_37: + mov.f64 %fd37, %fd38; + setp.eq.f64 %p34, %fd1, 0d0000000000000000; + @%p34 bra BB18_40; + bra.uni BB18_38; + +BB18_40: + selp.b32 %r26, %r8, 0, %p32; + or.b32 %r27, %r26, 2146435072; + setp.lt.s32 %p38, %r9, 0; + selp.b32 %r28, %r27, %r26, %p38; + mov.u32 %r29, 0; + mov.b64 %fd37, {%r29, %r28}; + bra.uni BB18_41; + +BB18_24: + setp.gt.s32 %p11, %r13, 10; + @%p11 bra BB18_28; + + setp.eq.s32 %p14, %r13, 9; + @%p14 bra BB18_32; + bra.uni BB18_26; + +BB18_32: + setp.eq.f64 %p27, %fd1, %fd2; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27; + bra.uni BB18_54; + +BB18_28: + setp.eq.s32 %p12, %r13, 11; + @%p12 bra BB18_31; + bra.uni BB18_29; + +BB18_31: + min.f64 %fd39, %fd1, %fd2; + bra.uni BB18_54; + +BB18_53: + add.f64 %fd39, %fd1, %fd2; + bra.uni BB18_54; + +BB18_13: + setp.eq.s32 %p25, %r13, 2; + @%p25 bra BB18_14; + bra.uni BB18_54; + +BB18_14: + mul.f64 %fd39, %fd1, %fd2; + bra.uni BB18_54; + +BB18_34: + setp.le.f64 %p30, %fd1, %fd2; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30; + bra.uni BB18_54; + +BB18_22: + setp.eq.s32 %p18, %r13, 8; + @%p18 bra BB18_23; + bra.uni BB18_54; + +BB18_23: + setp.ge.f64 %p28, %fd1, %fd2; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28; + bra.uni BB18_54; + +BB18_51: + div.rn.f64 %fd39, %fd1, %fd2; + bra.uni BB18_54; + +BB18_17: + setp.eq.s32 %p22, %r13, 5; + @%p22 bra BB18_18; + bra.uni BB18_54; + +BB18_18: + setp.lt.f64 %p31, %fd1, %fd2; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31; + bra.uni BB18_54; + +BB18_26: + setp.eq.s32 %p15, %r13, 10; + @%p15 bra BB18_27; + bra.uni BB18_54; + +BB18_27: + setp.neu.f64 %p26, %fd1, %fd2; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB18_54; + +BB18_29: + setp.ne.s32 %p13, %r13, 12; + @%p13 bra BB18_54; + + max.f64 %fd39, %fd1, %fd2; + bra.uni BB18_54; + +BB18_38: + setp.gt.s32 %p35, %r8, -1; + @%p35 bra BB18_41; + + cvt.rzi.f64.f64 %fd29, %fd2; + setp.neu.f64 %p36, %fd29, %fd2; + selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p36; + +BB18_41: + mov.f64 %fd17, %fd37; + add.f64 %fd18, %fd1, %fd2; + { + .reg .b32 %temp; + mov.b64 {%temp, %r30}, %fd18; + } + and.b32 %r31, %r30, 2146435072; + setp.ne.s32 %p39, %r31, 2146435072; + mov.f64 %fd36, %fd17; + @%p39 bra BB18_50; + + setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; + mov.f64 %fd36, %fd18; + @%p40 bra BB18_50; + + abs.f64 %fd30, %fd2; + setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000; + mov.f64 %fd35, %fd18; + mov.f64 %fd36, %fd35; + @%p41 bra BB18_50; + + and.b32 %r32, %r9, 2147483647; + setp.ne.s32 %p42, %r32, 2146435072; + @%p42 bra BB18_46; + + { + .reg .b32 %temp; + mov.b64 {%r33, %temp}, %fd2; + } + setp.eq.s32 %p43, %r33, 0; + @%p43 bra BB18_49; + +BB18_46: + and.b32 %r34, %r8, 2147483647; + setp.ne.s32 %p44, %r34, 2146435072; + mov.f64 %fd33, %fd17; + mov.f64 %fd36, %fd33; + @%p44 bra BB18_50; + + { + .reg .b32 %temp; + mov.b64 {%r35, %temp}, %fd1; + } + setp.ne.s32 %p45, %r35, 0; + mov.f64 %fd36, %fd17; + @%p45 bra BB18_50; + + shr.s32 %r36, %r9, 31; + and.b32 %r37, %r36, -2146435072; + add.s32 %r38, %r37, 2146435072; + or.b32 %r39, %r38, -2147483648; + selp.b32 %r40, %r39, %r38, %p1; + mov.u32 %r41, 0; + mov.b64 %fd36, {%r41, %r40}; + bra.uni BB18_50; + +BB18_49: + setp.gt.f64 %p46, %fd11, 0d3FF0000000000000; + selp.b32 %r42, 2146435072, 0, %p46; + xor.b32 %r43, %r42, 2146435072; + setp.lt.s32 %p47, %r9, 0; + selp.b32 %r44, %r43, %r42, %p47; + setp.eq.f64 %p48, %fd1, 0dBFF0000000000000; + selp.b32 %r45, 1072693248, %r44, %p48; + mov.u32 %r46, 0; + mov.b64 %fd36, {%r46, %r45}; + +BB18_50: + setp.eq.f64 %p49, %fd2, 0d0000000000000000; + setp.eq.f64 %p50, %fd1, 0d3FF0000000000000; + or.pred %p51, %p50, %p49; + selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p51; + +BB18_54: + cvta.to.global.u64 %rd12, %rd4; + mul.wide.s32 %rd13, %r3, 8; + add.s64 %rd14, %rd12, %rd13; + st.global.f64 [%rd14], %fd39; + +BB18_55: + ret; +} + + // .globl binCellScalarOp +.visible .entry binCellScalarOp( + .param .u64 binCellScalarOp_param_0, + .param .f64 binCellScalarOp_param_1, + .param .u64 binCellScalarOp_param_2, + .param .u32 binCellScalarOp_param_3, + .param .u32 binCellScalarOp_param_4, + .param .u32 binCellScalarOp_param_5, + .param .u32 binCellScalarOp_param_6 +) +{ + .reg .pred %p<89>; + .reg .b32 %r<71>; + .reg .f64 %fd<77>; + .reg .b64 %rd<12>; + + + ld.param.u64 %rd4, [binCellScalarOp_param_0]; + ld.param.f64 %fd52, [binCellScalarOp_param_1]; + ld.param.u64 %rd5, [binCellScalarOp_param_2]; + ld.param.u32 %r8, [binCellScalarOp_param_3]; + ld.param.u32 %r9, [binCellScalarOp_param_4]; + ld.param.u32 %r6, [binCellScalarOp_param_5]; + ld.param.u32 %r7, [binCellScalarOp_param_6]; + mov.u32 %r10, %ctaid.x; + mov.u32 %r11, %ntid.x; + mov.u32 %r12, %tid.x; + mad.lo.s32 %r13, %r11, %r10, %r12; + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r17, %r13, %r9, %r16; + mad.lo.s32 %r1, %r14, %r15, %r17; + mul.lo.s32 %r18, %r9, %r8; + setp.ge.s32 %p3, %r1, %r18; + @%p3 bra BB19_92; + + cvta.to.global.u64 %rd6, %rd5; + cvta.to.global.u64 %rd7, %rd4; + mul.wide.s32 %rd8, %r1, 8; + add.s64 %rd9, %rd7, %rd8; + ld.global.f64 %fd1, [%rd9]; + add.s64 %rd1, %rd6, %rd8; + setp.eq.s32 %p4, %r7, 0; + @%p4 bra BB19_47; + + setp.eq.s32 %p5, %r6, 0; + @%p5 bra BB19_45; + + mov.f64 %fd67, 0dC08F380000000000; + setp.gt.s32 %p6, %r6, 6; + @%p6 bra BB19_13; + + setp.gt.s32 %p14, %r6, 3; + @%p14 bra BB19_9; + + setp.eq.s32 %p18, %r6, 1; + @%p18 bra BB19_44; + + setp.eq.s32 %p19, %r6, 2; + @%p19 bra BB19_43; + bra.uni BB19_7; + +BB19_43: + mul.f64 %fd67, %fd1, %fd52; + bra.uni BB19_46; + +BB19_47: + setp.eq.s32 %p47, %r6, 0; + @%p47 bra BB19_90; + + mov.f64 %fd76, 0dC08F380000000000; + setp.gt.s32 %p48, %r6, 6; + @%p48 bra BB19_58; + + setp.gt.s32 %p56, %r6, 3; + @%p56 bra BB19_54; + + setp.eq.s32 %p60, %r6, 1; + @%p60 bra BB19_89; + + setp.eq.s32 %p61, %r6, 2; + @%p61 bra BB19_88; + bra.uni BB19_52; + +BB19_88: + mul.f64 %fd76, %fd1, %fd52; + bra.uni BB19_91; + +BB19_45: + add.f64 %fd67, %fd1, %fd52; + +BB19_46: + st.global.f64 [%rd1], %fd67; + bra.uni BB19_92; + +BB19_13: + setp.gt.s32 %p7, %r6, 9; + @%p7 bra BB19_18; + + setp.eq.s32 %p11, %r6, 7; + @%p11 bra BB19_25; + + setp.eq.s32 %p12, %r6, 8; + @%p12 bra BB19_24; + bra.uni BB19_16; + +BB19_24: + setp.le.f64 %p23, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23; + bra.uni BB19_46; + +BB19_90: + add.f64 %fd76, %fd1, %fd52; + +BB19_91: + st.global.f64 [%rd1], %fd76; + +BB19_92: + ret; + +BB19_58: + setp.gt.s32 %p49, %r6, 9; + @%p49 bra BB19_63; + + setp.eq.s32 %p53, %r6, 7; + @%p53 bra BB19_70; + + setp.eq.s32 %p54, %r6, 8; + @%p54 bra BB19_69; + bra.uni BB19_61; + +BB19_69: + setp.ge.f64 %p65, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65; + bra.uni BB19_91; + +BB19_9: + setp.eq.s32 %p15, %r6, 4; + @%p15 bra BB19_27; + + setp.eq.s32 %p16, %r6, 5; + @%p16 bra BB19_26; + bra.uni BB19_11; + +BB19_26: + setp.gt.f64 %p26, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB19_46; + +BB19_18: + setp.eq.s32 %p8, %r6, 10; + @%p8 bra BB19_23; + + setp.eq.s32 %p9, %r6, 11; + @%p9 bra BB19_22; + bra.uni BB19_20; + +BB19_22: + min.f64 %fd67, %fd52, %fd1; + bra.uni BB19_46; + +BB19_54: + setp.eq.s32 %p57, %r6, 4; + @%p57 bra BB19_72; + + setp.eq.s32 %p58, %r6, 5; + @%p58 bra BB19_71; + bra.uni BB19_56; + +BB19_71: + setp.lt.f64 %p68, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68; + bra.uni BB19_91; + +BB19_63: + setp.eq.s32 %p50, %r6, 10; + @%p50 bra BB19_68; + + setp.eq.s32 %p51, %r6, 11; + @%p51 bra BB19_67; + bra.uni BB19_65; + +BB19_67: + min.f64 %fd76, %fd1, %fd52; + bra.uni BB19_91; + +BB19_44: + sub.f64 %fd67, %fd52, %fd1; + bra.uni BB19_46; + +BB19_7: + setp.eq.s32 %p20, %r6, 3; + @%p20 bra BB19_8; + bra.uni BB19_46; + +BB19_8: + div.rn.f64 %fd67, %fd52, %fd1; + bra.uni BB19_46; + +BB19_25: + setp.lt.f64 %p24, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24; + bra.uni BB19_46; + +BB19_16: + setp.eq.s32 %p13, %r6, 9; + @%p13 bra BB19_17; + bra.uni BB19_46; + +BB19_17: + setp.eq.f64 %p22, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22; + bra.uni BB19_46; + +BB19_27: + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd52; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r3}, %fd1; + } + bfe.u32 %r19, %r3, 20, 11; + add.s32 %r20, %r19, -1012; + mov.b64 %rd10, %fd1; + shl.b64 %rd2, %rd10, %r20; + setp.eq.s64 %p27, %rd2, -9223372036854775808; + abs.f64 %fd10, %fd52; + // Callseq Start 2 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd10; + .param .b64 param1; + st.param.f64 [param1+0], %fd1; + .param .b64 retval0; + call.uni (retval0), + __internal_accurate_pow, + ( + param0, + param1 + ); + ld.param.f64 %fd66, [retval0+0]; + + //{ + }// Callseq End 2 + setp.lt.s32 %p28, %r2, 0; + and.pred %p1, %p28, %p27; + @!%p1 bra BB19_29; + bra.uni BB19_28; + +BB19_28: + { + .reg .b32 %temp; + mov.b64 {%temp, %r21}, %fd66; + } + xor.b32 %r22, %r21, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%r23, %temp}, %fd66; + } + mov.b64 %fd66, {%r23, %r22}; + +BB19_29: + mov.f64 %fd65, %fd66; + setp.eq.f64 %p29, %fd52, 0d0000000000000000; + @%p29 bra BB19_32; + bra.uni BB19_30; + +BB19_32: + selp.b32 %r24, %r2, 0, %p27; + or.b32 %r25, %r24, 2146435072; + setp.lt.s32 %p33, %r3, 0; + selp.b32 %r26, %r25, %r24, %p33; + mov.u32 %r27, 0; + mov.b64 %fd65, {%r27, %r26}; + bra.uni BB19_33; + +BB19_11: + setp.eq.s32 %p17, %r6, 6; + @%p17 bra BB19_12; + bra.uni BB19_46; + +BB19_12: + setp.ge.f64 %p25, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25; + bra.uni BB19_46; + +BB19_23: + setp.neu.f64 %p21, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p21; + bra.uni BB19_46; + +BB19_20: + setp.ne.s32 %p10, %r6, 12; + @%p10 bra BB19_46; + + max.f64 %fd67, %fd52, %fd1; + bra.uni BB19_46; + +BB19_89: + sub.f64 %fd76, %fd1, %fd52; + bra.uni BB19_91; + +BB19_52: + setp.eq.s32 %p62, %r6, 3; + @%p62 bra BB19_53; + bra.uni BB19_91; + +BB19_53: + div.rn.f64 %fd76, %fd1, %fd52; + bra.uni BB19_91; + +BB19_70: + setp.gt.f64 %p66, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66; + bra.uni BB19_91; + +BB19_61: + setp.eq.s32 %p55, %r6, 9; + @%p55 bra BB19_62; + bra.uni BB19_91; + +BB19_62: + setp.eq.f64 %p64, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p64; + bra.uni BB19_91; + +BB19_72: + { + .reg .b32 %temp; + mov.b64 {%temp, %r4}, %fd1; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r5}, %fd52; + } + bfe.u32 %r45, %r5, 20, 11; + add.s32 %r46, %r45, -1012; + mov.b64 %rd11, %fd52; + shl.b64 %rd3, %rd11, %r46; + setp.eq.s64 %p69, %rd3, -9223372036854775808; + abs.f64 %fd35, %fd1; + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd35; + .param .b64 param1; + st.param.f64 [param1+0], %fd52; + .param .b64 retval0; + call.uni (retval0), + __internal_accurate_pow, + ( + param0, + param1 + ); + ld.param.f64 %fd75, [retval0+0]; + + //{ + }// Callseq End 3 + setp.lt.s32 %p70, %r4, 0; + and.pred %p2, %p70, %p69; + @!%p2 bra BB19_74; + bra.uni BB19_73; + +BB19_73: + { + .reg .b32 %temp; + mov.b64 {%temp, %r47}, %fd75; + } + xor.b32 %r48, %r47, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%r49, %temp}, %fd75; + } + mov.b64 %fd75, {%r49, %r48}; + +BB19_74: + mov.f64 %fd74, %fd75; + setp.eq.f64 %p71, %fd1, 0d0000000000000000; + @%p71 bra BB19_77; + bra.uni BB19_75; + +BB19_77: + selp.b32 %r50, %r4, 0, %p69; + or.b32 %r51, %r50, 2146435072; + setp.lt.s32 %p75, %r5, 0; + selp.b32 %r52, %r51, %r50, %p75; + mov.u32 %r53, 0; + mov.b64 %fd74, {%r53, %r52}; + bra.uni BB19_78; + +BB19_56: + setp.eq.s32 %p59, %r6, 6; + @%p59 bra BB19_57; + bra.uni BB19_91; + +BB19_57: + setp.le.f64 %p67, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67; + bra.uni BB19_91; + +BB19_68: + setp.neu.f64 %p63, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p63; + bra.uni BB19_91; + +BB19_65: + setp.ne.s32 %p52, %r6, 12; + @%p52 bra BB19_91; + + max.f64 %fd76, %fd1, %fd52; + bra.uni BB19_91; + +BB19_30: + setp.gt.s32 %p30, %r2, -1; + @%p30 bra BB19_33; + + cvt.rzi.f64.f64 %fd54, %fd1; + setp.neu.f64 %p31, %fd54, %fd1; + selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p31; + +BB19_33: + mov.f64 %fd16, %fd65; + add.f64 %fd17, %fd1, %fd52; + { + .reg .b32 %temp; + mov.b64 {%temp, %r28}, %fd17; + } + and.b32 %r29, %r28, 2146435072; + setp.ne.s32 %p34, %r29, 2146435072; + mov.f64 %fd64, %fd16; + @%p34 bra BB19_42; + + setp.gtu.f64 %p35, %fd10, 0d7FF0000000000000; + mov.f64 %fd64, %fd17; + @%p35 bra BB19_42; + + abs.f64 %fd55, %fd1; + setp.gtu.f64 %p36, %fd55, 0d7FF0000000000000; + mov.f64 %fd63, %fd17; + mov.f64 %fd64, %fd63; + @%p36 bra BB19_42; + + and.b32 %r30, %r3, 2147483647; + setp.ne.s32 %p37, %r30, 2146435072; + @%p37 bra BB19_38; + + { + .reg .b32 %temp; + mov.b64 {%r31, %temp}, %fd1; + } + setp.eq.s32 %p38, %r31, 0; + @%p38 bra BB19_41; + +BB19_38: + and.b32 %r32, %r2, 2147483647; + setp.ne.s32 %p39, %r32, 2146435072; + mov.f64 %fd61, %fd16; + mov.f64 %fd64, %fd61; + @%p39 bra BB19_42; + + { + .reg .b32 %temp; + mov.b64 {%r33, %temp}, %fd52; + } + setp.ne.s32 %p40, %r33, 0; + mov.f64 %fd64, %fd16; + @%p40 bra BB19_42; + + shr.s32 %r34, %r3, 31; + and.b32 %r35, %r34, -2146435072; + add.s32 %r36, %r35, 2146435072; + or.b32 %r37, %r36, -2147483648; + selp.b32 %r38, %r37, %r36, %p1; + mov.u32 %r39, 0; + mov.b64 %fd64, {%r39, %r38}; + bra.uni BB19_42; + +BB19_75: + setp.gt.s32 %p72, %r4, -1; + @%p72 bra BB19_78; + + cvt.rzi.f64.f64 %fd57, %fd52; + setp.neu.f64 %p73, %fd57, %fd52; + selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p73; + +BB19_78: + mov.f64 %fd41, %fd74; + add.f64 %fd42, %fd1, %fd52; + { + .reg .b32 %temp; + mov.b64 {%temp, %r54}, %fd42; + } + and.b32 %r55, %r54, 2146435072; + setp.ne.s32 %p76, %r55, 2146435072; + mov.f64 %fd73, %fd41; + @%p76 bra BB19_87; + + setp.gtu.f64 %p77, %fd35, 0d7FF0000000000000; + mov.f64 %fd73, %fd42; + @%p77 bra BB19_87; + + abs.f64 %fd58, %fd52; + setp.gtu.f64 %p78, %fd58, 0d7FF0000000000000; + mov.f64 %fd72, %fd42; + mov.f64 %fd73, %fd72; + @%p78 bra BB19_87; + + and.b32 %r56, %r5, 2147483647; + setp.ne.s32 %p79, %r56, 2146435072; + @%p79 bra BB19_83; + + { + .reg .b32 %temp; + mov.b64 {%r57, %temp}, %fd52; + } + setp.eq.s32 %p80, %r57, 0; + @%p80 bra BB19_86; + +BB19_83: + and.b32 %r58, %r4, 2147483647; + setp.ne.s32 %p81, %r58, 2146435072; + mov.f64 %fd70, %fd41; + mov.f64 %fd73, %fd70; + @%p81 bra BB19_87; + + { + .reg .b32 %temp; + mov.b64 {%r59, %temp}, %fd1; + } + setp.ne.s32 %p82, %r59, 0; + mov.f64 %fd73, %fd41; + @%p82 bra BB19_87; + + shr.s32 %r60, %r5, 31; + and.b32 %r61, %r60, -2146435072; + add.s32 %r62, %r61, 2146435072; + or.b32 %r63, %r62, -2147483648; + selp.b32 %r64, %r63, %r62, %p2; + mov.u32 %r65, 0; + mov.b64 %fd73, {%r65, %r64}; + bra.uni BB19_87; + +BB19_41: + setp.gt.f64 %p41, %fd10, 0d3FF0000000000000; + selp.b32 %r40, 2146435072, 0, %p41; + xor.b32 %r41, %r40, 2146435072; + setp.lt.s32 %p42, %r3, 0; + selp.b32 %r42, %r41, %r40, %p42; + setp.eq.f64 %p43, %fd52, 0dBFF0000000000000; + selp.b32 %r43, 1072693248, %r42, %p43; + mov.u32 %r44, 0; + mov.b64 %fd64, {%r44, %r43}; + +BB19_42: + setp.eq.f64 %p44, %fd1, 0d0000000000000000; + setp.eq.f64 %p45, %fd52, 0d3FF0000000000000; + or.pred %p46, %p45, %p44; + selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p46; + bra.uni BB19_46; + +BB19_86: + setp.gt.f64 %p83, %fd35, 0d3FF0000000000000; + selp.b32 %r66, 2146435072, 0, %p83; + xor.b32 %r67, %r66, 2146435072; + setp.lt.s32 %p84, %r5, 0; + selp.b32 %r68, %r67, %r66, %p84; + setp.eq.f64 %p85, %fd1, 0dBFF0000000000000; + selp.b32 %r69, 1072693248, %r68, %p85; + mov.u32 %r70, 0; + mov.b64 %fd73, {%r70, %r69}; + +BB19_87: + setp.eq.f64 %p86, %fd52, 0d0000000000000000; + setp.eq.f64 %p87, %fd1, 0d3FF0000000000000; + or.pred %p88, %p87, %p86; + selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p88; + bra.uni BB19_91; +} + + // .globl fill +.visible .entry fill( + .param .u64 fill_param_0, + .param .f64 fill_param_1, + .param .u32 fill_param_2 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<6>; + .reg .f64 %fd<2>; + .reg .b64 %rd<5>; + + + ld.param.u64 %rd1, [fill_param_0]; + ld.param.f64 %fd1, [fill_param_1]; + ld.param.u32 %r2, [fill_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.s32 %p1, %r1, %r2; + @%p1 bra BB20_2; + + cvta.to.global.u64 %rd2, %rd1; + mul.wide.s32 %rd3, %r1, 8; + add.s64 %rd4, %rd2, %rd3; + st.global.f64 [%rd4], %fd1; + +BB20_2: + ret; +} + + // .globl reduce_sum +.visible .entry reduce_sum( + .param .u64 reduce_sum_param_0, + .param .u64 reduce_sum_param_1, + .param .u32 reduce_sum_param_2 ) { - .reg .pred %p<54>; - .reg .b32 %r<55>; - .reg .f64 %fd<39>; + .reg .pred %p<20>; + .reg .b32 %r<33>; + .reg .f64 %fd<79>; .reg .b64 %rd<15>; - ld.param.u64 %rd2, [binCellOp_param_0]; - ld.param.u64 %rd3, [binCellOp_param_1]; - ld.param.u64 %rd4, [binCellOp_param_2]; - ld.param.u32 %r14, [binCellOp_param_3]; - ld.param.u32 %r10, [binCellOp_param_4]; - ld.param.u32 %r11, [binCellOp_param_5]; - ld.param.u32 %r12, [binCellOp_param_6]; - ld.param.u32 %r13, [binCellOp_param_7]; - mov.u32 %r15, %ntid.x; - mov.u32 %r16, %ctaid.x; - mov.u32 %r17, %tid.x; - mad.lo.s32 %r1, %r15, %r16, %r17; - mov.u32 %r18, %ntid.y; - mov.u32 %r19, %ctaid.y; - mov.u32 %r20, %tid.y; - mad.lo.s32 %r2, %r18, %r19, %r20; - setp.lt.s32 %p2, %r1, %r14; - setp.lt.s32 %p3, %r2, %r10; - and.pred %p4, %p2, %p3; - @!%p4 bra BB7_53; - bra.uni BB7_1; + ld.param.u64 %rd2, [reduce_sum_param_0]; + ld.param.u64 %rd3, [reduce_sum_param_1]; + ld.param.u32 %r5, [reduce_sum_param_2]; + mov.u32 %r6, %tid.x; + mov.u32 %r7, %ctaid.x; + shl.b32 %r8, %r7, 1; + mov.u32 %r9, %ntid.x; + mad.lo.s32 %r32, %r8, %r9, %r6; + mov.f64 %fd76, 0d0000000000000000; + mov.f64 %fd77, %fd76; + setp.ge.u32 %p1, %r32, %r5; + @%p1 bra BB21_4; + +BB21_1: + mov.f64 %fd1, %fd77; + cvta.to.global.u64 %rd4, %rd2; + mul.wide.u32 %rd5, %r32, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd30, [%rd6]; + add.f64 %fd78, %fd1, %fd30; + add.s32 %r3, %r32, %r9; + setp.ge.u32 %p2, %r3, %r5; + @%p2 bra BB21_3; -BB7_1: - mad.lo.s32 %r3, %r1, %r10, %r2; - setp.eq.s32 %p5, %r11, 1; - mov.u32 %r53, %r1; - @%p5 bra BB7_5; + mul.wide.u32 %rd8, %r3, 8; + add.s64 %rd9, %rd4, %rd8; + ld.global.f64 %fd31, [%rd9]; + add.f64 %fd78, %fd78, %fd31; - setp.ne.s32 %p6, %r11, 2; - mov.u32 %r54, %r3; - @%p6 bra BB7_4; +BB21_3: + mov.f64 %fd77, %fd78; + shl.b32 %r12, %r9, 1; + mov.u32 %r13, %nctaid.x; + mad.lo.s32 %r32, %r12, %r13, %r32; + setp.lt.u32 %p3, %r32, %r5; + mov.f64 %fd76, %fd77; + @%p3 bra BB21_1; - mov.u32 %r54, %r2; +BB21_4: + mov.f64 %fd74, %fd76; + mul.wide.u32 %rd10, %r6, 8; + mov.u64 %rd11, sdata; + add.s64 %rd1, %rd11, %rd10; + st.shared.f64 [%rd1], %fd74; + bar.sync 0; + setp.lt.u32 %p4, %r9, 1024; + @%p4 bra BB21_8; -BB7_4: - mov.u32 %r48, %r54; - mov.u32 %r4, %r48; - mov.u32 %r53, %r4; + setp.gt.u32 %p5, %r6, 511; + mov.f64 %fd75, %fd74; + @%p5 bra BB21_7; -BB7_5: - mov.u32 %r5, %r53; - setp.eq.s32 %p7, %r12, 1; - mov.u32 %r51, %r1; - @%p7 bra BB7_9; + ld.shared.f64 %fd32, [%rd1+4096]; + add.f64 %fd75, %fd74, %fd32; + st.shared.f64 [%rd1], %fd75; - setp.ne.s32 %p8, %r12, 2; - mov.u32 %r52, %r3; - @%p8 bra BB7_8; +BB21_7: + mov.f64 %fd74, %fd75; + bar.sync 0; - mov.u32 %r52, %r2; +BB21_8: + mov.f64 %fd72, %fd74; + setp.lt.u32 %p6, %r9, 512; + @%p6 bra BB21_12; -BB7_8: - mov.u32 %r51, %r52; + setp.gt.u32 %p7, %r6, 255; + mov.f64 %fd73, %fd72; + @%p7 bra BB21_11; -BB7_9: - cvta.to.global.u64 %rd5, %rd3; - cvta.to.global.u64 %rd6, %rd2; - mul.wide.s32 %rd7, %r5, 8; - add.s64 %rd8, %rd6, %rd7; - ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 %rd9, %r51, 8; - add.s64 %rd10, %rd5, %rd9; - ld.global.f64 %fd2, [%rd10]; - mov.f64 %fd38, 0dC08F380000000000; - setp.gt.s32 %p9, %r13, 5; - @%p9 bra BB7_19; + ld.shared.f64 %fd33, [%rd1+2048]; + add.f64 %fd73, %fd72, %fd33; + st.shared.f64 [%rd1], %fd73; - setp.gt.s32 %p19, %r13, 2; - @%p19 bra BB7_15; +BB21_11: + mov.f64 %fd72, %fd73; + bar.sync 0; - setp.eq.s32 %p23, %r13, 0; - @%p23 bra BB7_51; +BB21_12: + mov.f64 %fd70, %fd72; + setp.lt.u32 %p8, %r9, 256; + @%p8 bra BB21_16; - setp.eq.s32 %p24, %r13, 1; - @%p24 bra BB7_50; - bra.uni BB7_13; + setp.gt.u32 %p9, %r6, 127; + mov.f64 %fd71, %fd70; + @%p9 bra BB21_15; -BB7_50: - sub.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + ld.shared.f64 %fd34, [%rd1+1024]; + add.f64 %fd71, %fd70, %fd34; + st.shared.f64 [%rd1], %fd71; -BB7_19: - setp.gt.s32 %p10, %r13, 8; - @%p10 bra BB7_24; +BB21_15: + mov.f64 %fd70, %fd71; + bar.sync 0; - setp.eq.s32 %p16, %r13, 6; - @%p16 bra BB7_34; +BB21_16: + mov.f64 %fd68, %fd70; + setp.lt.u32 %p10, %r9, 128; + @%p10 bra BB21_20; - setp.eq.s32 %p17, %r13, 7; - @%p17 bra BB7_33; - bra.uni BB7_22; + setp.gt.u32 %p11, %r6, 63; + mov.f64 %fd69, %fd68; + @%p11 bra BB21_19; -BB7_33: - setp.gt.f64 %p29, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29; - bra.uni BB7_52; + ld.shared.f64 %fd35, [%rd1+512]; + add.f64 %fd69, %fd68, %fd35; + st.shared.f64 [%rd1], %fd69; -BB7_15: - setp.eq.s32 %p20, %r13, 3; - @%p20 bra BB7_49; +BB21_19: + mov.f64 %fd68, %fd69; + bar.sync 0; - setp.eq.s32 %p21, %r13, 4; - @%p21 bra BB7_35; - bra.uni BB7_17; +BB21_20: + mov.f64 %fd67, %fd68; + setp.gt.u32 %p12, %r6, 31; + @%p12 bra BB21_33; -BB7_35: - { - .reg .b32 %temp; - mov.b64 {%temp, %r8}, %fd1; - } - { - .reg .b32 %temp; - mov.b64 {%temp, %r9}, %fd2; - } - bfe.u32 %r21, %r9, 20, 11; - add.s32 %r22, %r21, -1012; - mov.b64 %rd11, %fd2; - shl.b64 %rd1, %rd11, %r22; - setp.eq.s64 %p32, %rd1, -9223372036854775808; - abs.f64 %fd11, %fd1; - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - // <end>} - .param .b64 param0; - st.param.f64 [param0+0], %fd11; - .param .b64 param1; - st.param.f64 [param1+0], %fd2; - .param .b64 retval0; - call.uni (retval0), - __internal_accurate_pow, - ( - param0, - param1 - ); - ld.param.f64 %fd37, [retval0+0]; - - //{ - }// Callseq End 0 - setp.lt.s32 %p33, %r8, 0; - and.pred %p1, %p33, %p32; - @!%p1 bra BB7_37; - bra.uni BB7_36; + setp.lt.u32 %p13, %r9, 64; + @%p13 bra BB21_23; -BB7_36: - { - .reg .b32 %temp; - mov.b64 {%temp, %r23}, %fd37; - } - xor.b32 %r24, %r23, -2147483648; - { - .reg .b32 %temp; - mov.b64 {%r25, %temp}, %fd37; - } - mov.b64 %fd37, {%r25, %r24}; + ld.volatile.shared.f64 %fd36, [%rd1+256]; + add.f64 %fd67, %fd67, %fd36; + st.volatile.shared.f64 [%rd1], %fd67; -BB7_37: - mov.f64 %fd36, %fd37; - setp.eq.f64 %p34, %fd1, 0d0000000000000000; - @%p34 bra BB7_40; - bra.uni BB7_38; +BB21_23: + mov.f64 %fd66, %fd67; + setp.lt.u32 %p14, %r9, 32; + @%p14 bra BB21_25; -BB7_40: - selp.b32 %r26, %r8, 0, %p32; - or.b32 %r27, %r26, 2146435072; - setp.lt.s32 %p38, %r9, 0; - selp.b32 %r28, %r27, %r26, %p38; - mov.u32 %r29, 0; - mov.b64 %fd36, {%r29, %r28}; - bra.uni BB7_41; + ld.volatile.shared.f64 %fd37, [%rd1+128]; + add.f64 %fd66, %fd66, %fd37; + st.volatile.shared.f64 [%rd1], %fd66; -BB7_24: - setp.gt.s32 %p11, %r13, 10; - @%p11 bra BB7_28; +BB21_25: + mov.f64 %fd65, %fd66; + setp.lt.u32 %p15, %r9, 16; + @%p15 bra BB21_27; - setp.eq.s32 %p14, %r13, 9; - @%p14 bra BB7_32; - bra.uni BB7_26; + ld.volatile.shared.f64 %fd38, [%rd1+64]; + add.f64 %fd65, %fd65, %fd38; + st.volatile.shared.f64 [%rd1], %fd65; -BB7_32: - setp.eq.f64 %p27, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB7_52; +BB21_27: + mov.f64 %fd64, %fd65; + setp.lt.u32 %p16, %r9, 8; + @%p16 bra BB21_29; -BB7_28: - setp.eq.s32 %p12, %r13, 11; - @%p12 bra BB7_31; - bra.uni BB7_29; + ld.volatile.shared.f64 %fd39, [%rd1+32]; + add.f64 %fd64, %fd64, %fd39; + st.volatile.shared.f64 [%rd1], %fd64; -BB7_31: - min.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; +BB21_29: + mov.f64 %fd63, %fd64; + setp.lt.u32 %p17, %r9, 4; + @%p17 bra BB21_31; -BB7_51: - add.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + ld.volatile.shared.f64 %fd40, [%rd1+16]; + add.f64 %fd63, %fd63, %fd40; + st.volatile.shared.f64 [%rd1], %fd63; -BB7_13: - setp.eq.s32 %p25, %r13, 2; - @%p25 bra BB7_14; - bra.uni BB7_52; +BB21_31: + setp.lt.u32 %p18, %r9, 2; + @%p18 bra BB21_33; -BB7_14: - mul.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + ld.volatile.shared.f64 %fd41, [%rd1+8]; + add.f64 %fd42, %fd63, %fd41; + st.volatile.shared.f64 [%rd1], %fd42; -BB7_34: - setp.le.f64 %p30, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30; - bra.uni BB7_52; +BB21_33: + setp.ne.s32 %p19, %r6, 0; + @%p19 bra BB21_35; -BB7_22: - setp.eq.s32 %p18, %r13, 8; - @%p18 bra BB7_23; - bra.uni BB7_52; + ld.shared.f64 %fd43, [sdata]; + cvta.to.global.u64 %rd12, %rd3; + mul.wide.u32 %rd13, %r7, 8; + add.s64 %rd14, %rd12, %rd13; + st.global.f64 [%rd14], %fd43; -BB7_23: - setp.ge.f64 %p28, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28; - bra.uni BB7_52; +BB21_35: + ret; +} -BB7_49: - div.rn.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + // .globl reduce_row_sum +.visible .entry reduce_row_sum( + .param .u64 reduce_row_sum_param_0, + .param .u64 reduce_row_sum_param_1, + .param .u32 reduce_row_sum_param_2, + .param .u32 reduce_row_sum_param_3 +) +{ + .reg .pred %p<20>; + .reg .b32 %r<39>; + .reg .f64 %fd<74>; + .reg .b64 %rd<42>; -BB7_17: - setp.eq.s32 %p22, %r13, 5; - @%p22 bra BB7_18; - bra.uni BB7_52; -BB7_18: - setp.lt.f64 %p31, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31; - bra.uni BB7_52; + ld.param.u64 %rd1, [reduce_row_sum_param_0]; + ld.param.u64 %rd2, [reduce_row_sum_param_1]; + ld.param.u32 %r5, [reduce_row_sum_param_2]; + ld.param.u32 %r4, [reduce_row_sum_param_3]; + mov.u32 %r6, %ctaid.x; + setp.ge.u32 %p1, %r6, %r5; + @%p1 bra BB22_35; -BB7_26: - setp.eq.s32 %p15, %r13, 10; - @%p15 bra BB7_27; - bra.uni BB7_52; + mov.u32 %r38, %tid.x; + mov.f64 %fd72, 0d0000000000000000; + mov.f64 %fd73, %fd72; + setp.ge.u32 %p2, %r38, %r4; + @%p2 bra BB22_4; -BB7_27: - setp.neu.f64 %p26, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB7_52; + cvta.to.global.u64 %rd3, %rd1; -BB7_29: - setp.ne.s32 %p13, %r13, 12; - @%p13 bra BB7_52; +BB22_3: + mad.lo.s32 %r8, %r6, %r4, %r38; + mul.wide.u32 %rd4, %r8, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd28, [%rd5]; + add.f64 %fd73, %fd73, %fd28; + mov.u32 %r9, %ntid.x; + add.s32 %r38, %r9, %r38; + setp.lt.u32 %p3, %r38, %r4; + mov.f64 %fd72, %fd73; + @%p3 bra BB22_3; - max.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; +BB22_4: + mov.f64 %fd70, %fd72; + mov.u32 %r10, %tid.x; + mul.wide.u32 %rd6, %r10, 8; + mov.u64 %rd7, sdata; + add.s64 %rd8, %rd7, %rd6; + st.shared.f64 [%rd8], %fd70; + bar.sync 0; + mov.u32 %r11, %ntid.x; + setp.lt.u32 %p4, %r11, 1024; + @%p4 bra BB22_8; -BB7_38: - setp.gt.s32 %p35, %r8, -1; - @%p35 bra BB7_41; + setp.gt.u32 %p5, %r10, 511; + mov.f64 %fd71, %fd70; + @%p5 bra BB22_7; - cvt.rzi.f64.f64 %fd29, %fd2; - setp.neu.f64 %p36, %fd29, %fd2; - selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36; + ld.shared.f64 %fd29, [%rd8+4096]; + add.f64 %fd71, %fd70, %fd29; + st.shared.f64 [%rd8], %fd71; -BB7_41: - mov.f64 %fd17, %fd36; - add.f64 %fd18, %fd1, %fd2; - { - .reg .b32 %temp; - mov.b64 {%temp, %r30}, %fd18; - } - and.b32 %r31, %r30, 2146435072; - setp.ne.s32 %p39, %r31, 2146435072; - mov.f64 %fd35, %fd17; - @%p39 bra BB7_48; +BB22_7: + mov.f64 %fd70, %fd71; + bar.sync 0; - setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; - mov.f64 %fd35, %fd18; - @%p40 bra BB7_48; +BB22_8: + mov.f64 %fd68, %fd70; + setp.lt.u32 %p6, %r11, 512; + @%p6 bra BB22_12; - abs.f64 %fd30, %fd2; - setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000; - mov.f64 %fd34, %fd18; - mov.f64 %fd35, %fd34; - @%p41 bra BB7_48; + setp.gt.u32 %p7, %r10, 255; + mov.f64 %fd69, %fd68; + @%p7 bra BB22_11; - { - .reg .b32 %temp; - mov.b64 {%r32, %temp}, %fd2; - } - and.b32 %r33, %r9, 2147483647; - setp.eq.s32 %p42, %r33, 2146435072; - setp.eq.s32 %p43, %r32, 0; - and.pred %p44, %p42, %p43; - @%p44 bra BB7_47; - bra.uni BB7_45; - -BB7_47: - setp.gt.f64 %p48, %fd11, 0d3FF0000000000000; - selp.b32 %r41, 2146435072, 0, %p48; - xor.b32 %r42, %r41, 2146435072; - setp.lt.s32 %p49, %r9, 0; - selp.b32 %r43, %r42, %r41, %p49; - setp.eq.f64 %p50, %fd1, 0dBFF0000000000000; - selp.b32 %r44, 1072693248, %r43, %p50; - mov.u32 %r45, 0; - mov.b64 %fd35, {%r45, %r44}; - bra.uni BB7_48; - -BB7_45: - { - .reg .b32 %temp; - mov.b64 {%r34, %temp}, %fd1; - } - and.b32 %r35, %r8, 2147483647; - setp.eq.s32 %p45, %r35, 2146435072; - setp.eq.s32 %p46, %r34, 0; - and.pred %p47, %p45, %p46; - mov.f64 %fd35, %fd17; - @!%p47 bra BB7_48; - bra.uni BB7_46; - -BB7_46: - shr.s32 %r36, %r9, 31; - and.b32 %r37, %r36, -2146435072; - selp.b32 %r38, -1048576, 2146435072, %p1; - add.s32 %r39, %r38, %r37; - mov.u32 %r40, 0; - mov.b64 %fd35, {%r40, %r39}; - -BB7_48: - setp.eq.f64 %p51, %fd2, 0d0000000000000000; - setp.eq.f64 %p52, %fd1, 0d3FF0000000000000; - or.pred %p53, %p52, %p51; - selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p53; - -BB7_52: - cvta.to.global.u64 %rd12, %rd4; - mul.wide.s32 %rd13, %r3, 8; - add.s64 %rd14, %rd12, %rd13; - st.global.f64 [%rd14], %fd38; + ld.shared.f64 %fd30, [%rd8+2048]; + add.f64 %fd69, %fd68, %fd30; + st.shared.f64 [%rd8], %fd69; -BB7_53: - ret; -} +BB22_11: + mov.f64 %fd68, %fd69; + bar.sync 0; - // .globl binCellScalarOp -.visible .entry binCellScalarOp( - .param .u64 binCellScalarOp_param_0, - .param .f64 binCellScalarOp_param_1, - .param .u64 binCellScalarOp_param_2, - .param .u32 binCellScalarOp_param_3, - .param .u32 binCellScalarOp_param_4, - .param .u32 binCellScalarOp_param_5, - .param .u32 binCellScalarOp_param_6 -) -{ - .reg .pred %p<93>; - .reg .b32 %r<69>; - .reg .f64 %fd<75>; - .reg .b64 %rd<12>; +BB22_12: + mov.f64 %fd66, %fd68; + setp.lt.u32 %p8, %r11, 256; + @%p8 bra BB22_16; + setp.gt.u32 %p9, %r10, 127; + mov.f64 %fd67, %fd66; + @%p9 bra BB22_15; - ld.param.u64 %rd4, [binCellScalarOp_param_0]; - ld.param.f64 %fd52, [binCellScalarOp_param_1]; - ld.param.u64 %rd5, [binCellScalarOp_param_2]; - ld.param.u32 %r8, [binCellScalarOp_param_3]; - ld.param.u32 %r9, [binCellScalarOp_param_4]; - ld.param.u32 %r6, [binCellScalarOp_param_5]; - ld.param.u32 %r7, [binCellScalarOp_param_6]; - mov.u32 %r10, %ctaid.x; - mov.u32 %r11, %ntid.x; - mov.u32 %r12, %tid.x; - mad.lo.s32 %r13, %r11, %r10, %r12; - mov.u32 %r14, %ntid.y; - mov.u32 %r15, %ctaid.y; - mov.u32 %r16, %tid.y; - mad.lo.s32 %r17, %r13, %r9, %r16; - mad.lo.s32 %r1, %r14, %r15, %r17; - mul.lo.s32 %r18, %r9, %r8; - setp.ge.s32 %p3, %r1, %r18; - @%p3 bra BB8_88; + ld.shared.f64 %fd31, [%rd8+1024]; + add.f64 %fd67, %fd66, %fd31; + st.shared.f64 [%rd8], %fd67; - cvta.to.global.u64 %rd6, %rd5; - cvta.to.global.u64 %rd7, %rd4; - mul.wide.s32 %rd8, %r1, 8; - a
<TRUNCATED>