http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index 73b057e..d382fc5 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,8 +1,8 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-21124049 -// Cuda compilation tools, release 8.0, V8.0.44 +// Compiler Build ID: CL-21554848 +// Cuda compilation tools, release 8.0, V8.0.61 // Based on LLVM 3.4svn // @@ -10,7 +10,7 @@ .target sm_30 .address_size 64 - // .globl slice_sparse_dense_row + // .globl double2float_f .func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd ( .param .b64 __internal_trig_reduction_slowpathd_param_0, @@ -23,20 +23,97 @@ .param .b64 __internal_accurate_pow_param_1 ) ; -.extern .shared .align 8 .b8 sdata[]; +.extern .shared .align 1 .b8 my_sdata[]; +.const .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162}; .const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162}; .const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63}; -.visible .entry slice_sparse_dense_row( - .param .u64 slice_sparse_dense_row_param_0, - .param .u64 slice_sparse_dense_row_param_1, - .param .u64 slice_sparse_dense_row_param_2, - .param .u64 slice_sparse_dense_row_param_3, - .param .u32 slice_sparse_dense_row_param_4, - .param .u32 slice_sparse_dense_row_param_5, - .param .u32 slice_sparse_dense_row_param_6, - .param .u32 slice_sparse_dense_row_param_7, - .param .u32 slice_sparse_dense_row_param_8 +.visible .entry double2float_f( + .param .u64 double2float_f_param_0, + .param .u64 double2float_f_param_1, + .param .u32 double2float_f_param_2 +) +{ + .reg .pred %p<2>; + .reg .f32 %f<2>; + .reg .b32 %r<6>; + .reg .f64 %fd<2>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [double2float_f_param_0]; + ld.param.u64 %rd2, [double2float_f_param_1]; + ld.param.u32 %r2, [double2float_f_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.s32 %p1, %r1, %r2; + @%p1 bra BB0_2; + + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, 
[%rd5]; + cvt.rn.f32.f64 %f1, %fd1; + cvta.to.global.u64 %rd6, %rd2; + mul.wide.s32 %rd7, %r1, 4; + add.s64 %rd8, %rd6, %rd7; + st.global.f32 [%rd8], %f1; + +BB0_2: + ret; +} + + // .globl float2double_f +.visible .entry float2double_f( + .param .u64 float2double_f_param_0, + .param .u64 float2double_f_param_1, + .param .u32 float2double_f_param_2 +) +{ + .reg .pred %p<2>; + .reg .f32 %f<2>; + .reg .b32 %r<6>; + .reg .f64 %fd<2>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [float2double_f_param_0]; + ld.param.u64 %rd2, [float2double_f_param_1]; + ld.param.u32 %r2, [float2double_f_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.s32 %p1, %r1, %r2; + @%p1 bra BB1_2; + + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 4; + add.s64 %rd5, %rd3, %rd4; + ld.global.f32 %f1, [%rd5]; + cvt.f64.f32 %fd1, %f1; + cvta.to.global.u64 %rd6, %rd2; + mul.wide.s32 %rd7, %r1, 8; + add.s64 %rd8, %rd6, %rd7; + st.global.f64 [%rd8], %fd1; + +BB1_2: + ret; +} + + // .globl slice_sparse_dense_row_d +.visible .entry slice_sparse_dense_row_d( + .param .u64 slice_sparse_dense_row_d_param_0, + .param .u64 slice_sparse_dense_row_d_param_1, + .param .u64 slice_sparse_dense_row_d_param_2, + .param .u64 slice_sparse_dense_row_d_param_3, + .param .u32 slice_sparse_dense_row_d_param_4, + .param .u32 slice_sparse_dense_row_d_param_5, + .param .u32 slice_sparse_dense_row_d_param_6, + .param .u32 slice_sparse_dense_row_d_param_7, + .param .u32 slice_sparse_dense_row_d_param_8 ) { .reg .pred %p<7>; @@ -45,22 +122,22 @@ .reg .b64 %rd<23>; - ld.param.u64 %rd9, [slice_sparse_dense_row_param_0]; - ld.param.u64 %rd10, [slice_sparse_dense_row_param_1]; - ld.param.u64 %rd11, [slice_sparse_dense_row_param_2]; - ld.param.u64 %rd12, [slice_sparse_dense_row_param_3]; - ld.param.u32 %r15, [slice_sparse_dense_row_param_4]; - ld.param.u32 %r16, [slice_sparse_dense_row_param_5]; - ld.param.u32 %r12, [slice_sparse_dense_row_param_6]; - ld.param.u32 %r13, [slice_sparse_dense_row_param_7]; - ld.param.u32 %r14, [slice_sparse_dense_row_param_8]; + ld.param.u64 %rd9, [slice_sparse_dense_row_d_param_0]; + ld.param.u64 %rd10, [slice_sparse_dense_row_d_param_1]; + ld.param.u64 %rd11, [slice_sparse_dense_row_d_param_2]; + ld.param.u64 %rd12, [slice_sparse_dense_row_d_param_3]; + ld.param.u32 %r15, [slice_sparse_dense_row_d_param_4]; + ld.param.u32 %r16, [slice_sparse_dense_row_d_param_5]; + ld.param.u32 %r12, [slice_sparse_dense_row_d_param_6]; + ld.param.u32 %r13, [slice_sparse_dense_row_d_param_7]; + ld.param.u32 %r14, [slice_sparse_dense_row_d_param_8]; mov.u32 %r17, %ntid.x; mov.u32 %r18, %ctaid.x; mov.u32 %r19, %tid.x; mad.lo.s32 %r1, %r17, %r18, %r19; add.s32 %r2, %r1, %r15; setp.gt.s32 %p1, %r2, %r16; - @%p1 bra BB0_6; + @%p1 bra BB2_6; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r2, 4; @@ -68,7 +145,7 @@ ld.global.u32 %r23, [%rd1]; ld.global.u32 %r22, [%rd1+4]; setp.ge.s32 %p2, %r23, %r22; - @%p2 bra BB0_6; + @%p2 bra BB2_6; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd15, %rd9; @@ -80,12 +157,12 @@ mul.wide.s32 %rd18, %r23, 4; add.s64 %rd21, %rd16, %rd18; -BB0_3: +BB2_3: ld.global.u32 %r8, [%rd21]; setp.lt.s32 %p3, %r8, %r12; setp.gt.s32 %p4, %r8, %r13; or.pred %p5, %p3, %p4; - @%p5 bra BB0_5; + @%p5 bra BB2_5; ld.global.f64 %fd1, [%rd22]; add.s32 %r21, %r5, %r8; @@ -94,28 +171,106 @@ BB0_3: st.global.f64 [%rd20], %fd1; ld.global.u32 %r22, [%rd1+4]; -BB0_5: +BB2_5: add.s64 %rd22, %rd22, 8; add.s64 %rd21, %rd21, 4; add.s32 %r23, %r23, 1; 
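// ---------------------------------------------------------------------
// [Editor's note, not part of the generated patch] This revision adds a
// single-precision "_f" variant next to each double-precision "_d"
// kernel. The PTX is compiler output; as a hedged sketch, the CUDA
// source for the conversion kernels added above would look roughly like
// this (parameter names are assumptions, not taken from the patch):
//
// extern "C" __global__ void double2float_f(double *in, float *out, int n) {
//   int ix = blockIdx.x * blockDim.x + threadIdx.x; // mad.lo.s32 of %ntid/%ctaid/%tid
//   if (ix < n)                                     // setp.ge.s32 + branch to BB0_2
//     out[ix] = (float)in[ix];                      // cvt.rn.f32.f64 (round-to-nearest)
// }
//
// float2double_f is the mirror image, widening with cvt.f64.f32.
// ---------------------------------------------------------------------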
setp.lt.s32 %p6, %r23, %r22; - @%p6 bra BB0_3; + @%p6 bra BB2_3; + +BB2_6: + ret; +} + + // .globl slice_sparse_dense_row_f +.visible .entry slice_sparse_dense_row_f( + .param .u64 slice_sparse_dense_row_f_param_0, + .param .u64 slice_sparse_dense_row_f_param_1, + .param .u64 slice_sparse_dense_row_f_param_2, + .param .u64 slice_sparse_dense_row_f_param_3, + .param .u32 slice_sparse_dense_row_f_param_4, + .param .u32 slice_sparse_dense_row_f_param_5, + .param .u32 slice_sparse_dense_row_f_param_6, + .param .u32 slice_sparse_dense_row_f_param_7, + .param .u32 slice_sparse_dense_row_f_param_8 +) +{ + .reg .pred %p<7>; + .reg .f32 %f<2>; + .reg .b32 %r<24>; + .reg .b64 %rd<22>; + + + ld.param.u64 %rd9, [slice_sparse_dense_row_f_param_0]; + ld.param.u64 %rd10, [slice_sparse_dense_row_f_param_1]; + ld.param.u64 %rd11, [slice_sparse_dense_row_f_param_2]; + ld.param.u64 %rd12, [slice_sparse_dense_row_f_param_3]; + ld.param.u32 %r15, [slice_sparse_dense_row_f_param_4]; + ld.param.u32 %r16, [slice_sparse_dense_row_f_param_5]; + ld.param.u32 %r12, [slice_sparse_dense_row_f_param_6]; + ld.param.u32 %r13, [slice_sparse_dense_row_f_param_7]; + ld.param.u32 %r14, [slice_sparse_dense_row_f_param_8]; + mov.u32 %r17, %ntid.x; + mov.u32 %r18, %ctaid.x; + mov.u32 %r19, %tid.x; + mad.lo.s32 %r1, %r17, %r18, %r19; + add.s32 %r2, %r1, %r15; + setp.gt.s32 %p1, %r2, %r16; + @%p1 bra BB3_6; + + cvta.to.global.u64 %rd13, %rd10; + mul.wide.s32 %rd14, %r2, 4; + add.s64 %rd1, %rd13, %rd14; + ld.global.u32 %r23, [%rd1]; + ld.global.u32 %r22, [%rd1+4]; + setp.ge.s32 %p2, %r23, %r22; + @%p2 bra BB3_6; + + cvta.to.global.u64 %rd2, %rd12; + cvta.to.global.u64 %rd15, %rd9; + cvta.to.global.u64 %rd16, %rd11; + mul.lo.s32 %r20, %r1, %r14; + sub.s32 %r5, %r20, %r12; + mul.wide.s32 %rd17, %r23, 4; + add.s64 %rd21, %rd15, %rd17; + add.s64 %rd20, %rd16, %rd17; + +BB3_3: + ld.global.u32 %r8, [%rd20]; + setp.lt.s32 %p3, %r8, %r12; + setp.gt.s32 %p4, %r8, %r13; + or.pred %p5, %p3, %p4; + @%p5 bra BB3_5; + + ld.global.f32 %f1, [%rd21]; + add.s32 %r21, %r5, %r8; + mul.wide.s32 %rd18, %r21, 4; + add.s64 %rd19, %rd2, %rd18; + st.global.f32 [%rd19], %f1; + ld.global.u32 %r22, [%rd1+4]; + +BB3_5: + add.s64 %rd21, %rd21, 4; + add.s64 %rd20, %rd20, 4; + add.s32 %r23, %r23, 1; + setp.lt.s32 %p6, %r23, %r22; + @%p6 bra BB3_3; -BB0_6: +BB3_6: ret; } - // .globl slice_sparse_dense_nnz -.visible .entry slice_sparse_dense_nnz( - .param .u64 slice_sparse_dense_nnz_param_0, - .param .u64 slice_sparse_dense_nnz_param_1, - .param .u64 slice_sparse_dense_nnz_param_2, - .param .u64 slice_sparse_dense_nnz_param_3, - .param .u32 slice_sparse_dense_nnz_param_4, - .param .u32 slice_sparse_dense_nnz_param_5, - .param .u32 slice_sparse_dense_nnz_param_6, - .param .u32 slice_sparse_dense_nnz_param_7, - .param .u32 slice_sparse_dense_nnz_param_8 + // .globl slice_sparse_dense_nnz_d +.visible .entry slice_sparse_dense_nnz_d( + .param .u64 slice_sparse_dense_nnz_d_param_0, + .param .u64 slice_sparse_dense_nnz_d_param_1, + .param .u64 slice_sparse_dense_nnz_d_param_2, + .param .u64 slice_sparse_dense_nnz_d_param_3, + .param .u32 slice_sparse_dense_nnz_d_param_4, + .param .u32 slice_sparse_dense_nnz_d_param_5, + .param .u32 slice_sparse_dense_nnz_d_param_6, + .param .u32 slice_sparse_dense_nnz_d_param_7, + .param .u32 slice_sparse_dense_nnz_d_param_8 ) { .reg .pred %p<6>; @@ -124,15 +279,15 @@ BB0_6: .reg .b64 %rd<22>; - ld.param.u64 %rd5, [slice_sparse_dense_nnz_param_0]; - ld.param.u64 %rd8, [slice_sparse_dense_nnz_param_1]; - ld.param.u64 %rd6, 
[slice_sparse_dense_nnz_param_2]; - ld.param.u64 %rd7, [slice_sparse_dense_nnz_param_3]; - ld.param.u32 %r5, [slice_sparse_dense_nnz_param_4]; - ld.param.u32 %r9, [slice_sparse_dense_nnz_param_5]; - ld.param.u32 %r6, [slice_sparse_dense_nnz_param_6]; - ld.param.u32 %r7, [slice_sparse_dense_nnz_param_7]; - ld.param.u32 %r8, [slice_sparse_dense_nnz_param_8]; + ld.param.u64 %rd5, [slice_sparse_dense_nnz_d_param_0]; + ld.param.u64 %rd8, [slice_sparse_dense_nnz_d_param_1]; + ld.param.u64 %rd6, [slice_sparse_dense_nnz_d_param_2]; + ld.param.u64 %rd7, [slice_sparse_dense_nnz_d_param_3]; + ld.param.u32 %r5, [slice_sparse_dense_nnz_d_param_4]; + ld.param.u32 %r9, [slice_sparse_dense_nnz_d_param_5]; + ld.param.u32 %r6, [slice_sparse_dense_nnz_d_param_6]; + ld.param.u32 %r7, [slice_sparse_dense_nnz_d_param_7]; + ld.param.u32 %r8, [slice_sparse_dense_nnz_d_param_8]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; @@ -146,7 +301,7 @@ BB0_6: add.s64 %rd12, %rd1, %rd11; ld.global.u32 %r15, [%rd12+4]; setp.ge.s32 %p1, %r1, %r15; - @%p1 bra BB1_5; + @%p1 bra BB4_5; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd5; @@ -158,11 +313,11 @@ BB0_6: setp.lt.s32 %p2, %r2, %r6; setp.gt.s32 %p3, %r2, %r7; or.pred %p4, %p2, %p3; - @%p4 bra BB1_5; + @%p4 bra BB4_5; mov.u32 %r21, %r5; -BB1_3: +BB4_3: mov.u32 %r3, %r21; add.s32 %r4, %r3, 1; mul.wide.s32 %rd16, %r4, 4; @@ -170,7 +325,7 @@ BB1_3: ld.global.u32 %r16, [%rd17]; setp.le.s32 %p5, %r16, %r1; mov.u32 %r21, %r4; - @%p5 bra BB1_3; + @%p5 bra BB4_3; shl.b64 %rd18, %rd4, 3; add.s64 %rd19, %rd3, %rd18; @@ -183,21 +338,103 @@ BB1_3: add.s64 %rd21, %rd2, %rd20; st.global.f64 [%rd21], %fd1; -BB1_5: +BB4_5: + ret; +} + + // .globl slice_sparse_dense_nnz_f +.visible .entry slice_sparse_dense_nnz_f( + .param .u64 slice_sparse_dense_nnz_f_param_0, + .param .u64 slice_sparse_dense_nnz_f_param_1, + .param .u64 slice_sparse_dense_nnz_f_param_2, + .param .u64 slice_sparse_dense_nnz_f_param_3, + .param .u32 slice_sparse_dense_nnz_f_param_4, + .param .u32 slice_sparse_dense_nnz_f_param_5, + .param .u32 slice_sparse_dense_nnz_f_param_6, + .param .u32 slice_sparse_dense_nnz_f_param_7, + .param .u32 slice_sparse_dense_nnz_f_param_8 +) +{ + .reg .pred %p<6>; + .reg .f32 %f<2>; + .reg .b32 %r<22>; + .reg .b64 %rd<22>; + + + ld.param.u64 %rd5, [slice_sparse_dense_nnz_f_param_0]; + ld.param.u64 %rd8, [slice_sparse_dense_nnz_f_param_1]; + ld.param.u64 %rd6, [slice_sparse_dense_nnz_f_param_2]; + ld.param.u64 %rd7, [slice_sparse_dense_nnz_f_param_3]; + ld.param.u32 %r5, [slice_sparse_dense_nnz_f_param_4]; + ld.param.u32 %r9, [slice_sparse_dense_nnz_f_param_5]; + ld.param.u32 %r6, [slice_sparse_dense_nnz_f_param_6]; + ld.param.u32 %r7, [slice_sparse_dense_nnz_f_param_7]; + ld.param.u32 %r8, [slice_sparse_dense_nnz_f_param_8]; + mov.u32 %r10, %ntid.x; + mov.u32 %r11, %ctaid.x; + mov.u32 %r12, %tid.x; + mad.lo.s32 %r13, %r10, %r11, %r12; + cvta.to.global.u64 %rd1, %rd8; + mul.wide.s32 %rd9, %r5, 4; + add.s64 %rd10, %rd1, %rd9; + ld.global.u32 %r14, [%rd10]; + add.s32 %r1, %r13, %r14; + mul.wide.s32 %rd11, %r9, 4; + add.s64 %rd12, %rd1, %rd11; + ld.global.u32 %r15, [%rd12+4]; + setp.ge.s32 %p1, %r1, %r15; + @%p1 bra BB5_5; + + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd5; + cvta.to.global.u64 %rd13, %rd6; + cvt.s64.s32 %rd4, %r1; + mul.wide.s32 %rd14, %r1, 4; + add.s64 %rd15, %rd13, %rd14; + ld.global.u32 %r2, [%rd15]; + setp.lt.s32 %p2, %r2, %r6; + setp.gt.s32 %p3, %r2, %r7; + or.pred %p4, %p2, %p3; + @%p4 bra BB5_5; + + mov.u32 %r21, %r5; + 
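// ---------------------------------------------------------------------
// [Editor's note, not part of the generated patch] slice_sparse_dense_nnz
// assigns one thread per non-zero of the sliced row range of a CSR
// matrix. A hedged CUDA sketch of the float variant whose body continues
// below (identifier names are assumptions):
//
// extern "C" __global__ void slice_sparse_dense_nnz_f(
//     float *vals, int *rowPtr, int *colInd, float *out,
//     int rl, int ru, int cl, int cu, int retClen) {
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;
//   int index = tid + rowPtr[rl];            // global nnz position for this thread
//   if (index >= rowPtr[ru + 1]) return;     // beyond the last nnz of row ru
//   int col = colInd[index];
//   if (col < cl || col > cu) return;        // outside the requested column slice
//   int row = rl;                            // the loop at BB5_3 below:
//   while (rowPtr[row + 1] <= index) row++;  // find the row owning this nnz
//   out[(row - rl) * retClen + (col - cl)] = vals[index];
// }
// ---------------------------------------------------------------------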
+BB5_3: + mov.u32 %r3, %r21; + add.s32 %r4, %r3, 1; + mul.wide.s32 %rd16, %r4, 4; + add.s64 %rd17, %rd1, %rd16; + ld.global.u32 %r16, [%rd17]; + setp.le.s32 %p5, %r16, %r1; + mov.u32 %r21, %r4; + @%p5 bra BB5_3; + + shl.b64 %rd18, %rd4, 2; + add.s64 %rd19, %rd3, %rd18; + ld.global.f32 %f1, [%rd19]; + sub.s32 %r17, %r3, %r5; + mul.lo.s32 %r18, %r17, %r8; + sub.s32 %r19, %r18, %r6; + add.s32 %r20, %r19, %r2; + mul.wide.s32 %rd20, %r20, 4; + add.s64 %rd21, %rd2, %rd20; + st.global.f32 [%rd21], %f1; + +BB5_5: ret; } - // .globl slice_dense_dense -.visible .entry slice_dense_dense( - .param .u64 slice_dense_dense_param_0, - .param .u64 slice_dense_dense_param_1, - .param .u32 slice_dense_dense_param_2, - .param .u32 slice_dense_dense_param_3, - .param .u32 slice_dense_dense_param_4, - .param .u32 slice_dense_dense_param_5, - .param .u32 slice_dense_dense_param_6, - .param .u32 slice_dense_dense_param_7, - .param .u32 slice_dense_dense_param_8 + // .globl slice_dense_dense_d +.visible .entry slice_dense_dense_d( + .param .u64 slice_dense_dense_d_param_0, + .param .u64 slice_dense_dense_d_param_1, + .param .u32 slice_dense_dense_d_param_2, + .param .u32 slice_dense_dense_d_param_3, + .param .u32 slice_dense_dense_d_param_4, + .param .u32 slice_dense_dense_d_param_5, + .param .u32 slice_dense_dense_d_param_6, + .param .u32 slice_dense_dense_d_param_7, + .param .u32 slice_dense_dense_d_param_8 ) { .reg .pred %p<4>; @@ -206,13 +443,13 @@ BB1_5: .reg .b64 %rd<9>; - ld.param.u64 %rd1, [slice_dense_dense_param_0]; - ld.param.u64 %rd2, [slice_dense_dense_param_1]; - ld.param.u32 %r3, [slice_dense_dense_param_2]; - ld.param.u32 %r4, [slice_dense_dense_param_4]; - ld.param.u32 %r5, [slice_dense_dense_param_6]; - ld.param.u32 %r7, [slice_dense_dense_param_7]; - ld.param.u32 %r6, [slice_dense_dense_param_8]; + ld.param.u64 %rd1, [slice_dense_dense_d_param_0]; + ld.param.u64 %rd2, [slice_dense_dense_d_param_1]; + ld.param.u32 %r3, [slice_dense_dense_d_param_2]; + ld.param.u32 %r4, [slice_dense_dense_d_param_4]; + ld.param.u32 %r5, [slice_dense_dense_d_param_6]; + ld.param.u32 %r7, [slice_dense_dense_d_param_7]; + ld.param.u32 %r6, [slice_dense_dense_d_param_8]; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %ntid.x; mov.u32 %r10, %tid.x; @@ -221,10 +458,10 @@ BB1_5: setp.lt.s32 %p1, %r2, %r7; setp.gt.s32 %p2, %r6, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB2_2; - bra.uni BB2_1; + @!%p3 bra BB6_2; + bra.uni BB6_1; -BB2_1: +BB6_1: rem.s32 %r11, %r1, %r6; cvta.to.global.u64 %rd3, %rd1; add.s32 %r12, %r2, %r3; @@ -238,15 +475,70 @@ BB2_1: add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd1; -BB2_2: +BB6_2: + ret; +} + + // .globl slice_dense_dense_f +.visible .entry slice_dense_dense_f( + .param .u64 slice_dense_dense_f_param_0, + .param .u64 slice_dense_dense_f_param_1, + .param .u32 slice_dense_dense_f_param_2, + .param .u32 slice_dense_dense_f_param_3, + .param .u32 slice_dense_dense_f_param_4, + .param .u32 slice_dense_dense_f_param_5, + .param .u32 slice_dense_dense_f_param_6, + .param .u32 slice_dense_dense_f_param_7, + .param .u32 slice_dense_dense_f_param_8 +) +{ + .reg .pred %p<4>; + .reg .f32 %f<2>; + .reg .b32 %r<15>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [slice_dense_dense_f_param_0]; + ld.param.u64 %rd2, [slice_dense_dense_f_param_1]; + ld.param.u32 %r3, [slice_dense_dense_f_param_2]; + ld.param.u32 %r4, [slice_dense_dense_f_param_4]; + ld.param.u32 %r5, [slice_dense_dense_f_param_6]; + ld.param.u32 %r7, [slice_dense_dense_f_param_7]; + ld.param.u32 %r6, [slice_dense_dense_f_param_8]; + mov.u32 %r8, 
%ctaid.x; + mov.u32 %r9, %ntid.x; + mov.u32 %r10, %tid.x; + mad.lo.s32 %r1, %r9, %r8, %r10; + div.s32 %r2, %r1, %r6; + setp.lt.s32 %p1, %r2, %r7; + setp.gt.s32 %p2, %r6, -1; + and.pred %p3, %p1, %p2; + @!%p3 bra BB7_2; + bra.uni BB7_1; + +BB7_1: + rem.s32 %r11, %r1, %r6; + cvta.to.global.u64 %rd3, %rd1; + add.s32 %r12, %r2, %r3; + add.s32 %r13, %r11, %r4; + mad.lo.s32 %r14, %r12, %r5, %r13; + mul.wide.s32 %rd4, %r14, 4; + add.s64 %rd5, %rd3, %rd4; + ld.global.f32 %f1, [%rd5]; + cvta.to.global.u64 %rd6, %rd2; + mul.wide.s32 %rd7, %r1, 4; + add.s64 %rd8, %rd6, %rd7; + st.global.f32 [%rd8], %f1; + +BB7_2: ret; } - // .globl copy_u2l_dense -.visible .entry copy_u2l_dense( - .param .u64 copy_u2l_dense_param_0, - .param .u32 copy_u2l_dense_param_1, - .param .u32 copy_u2l_dense_param_2 + // .globl copy_u2l_dense_d +.visible .entry copy_u2l_dense_d( + .param .u64 copy_u2l_dense_d_param_0, + .param .u32 copy_u2l_dense_d_param_1, + .param .u32 copy_u2l_dense_d_param_2 ) { .reg .pred %p<4>; @@ -255,9 +547,9 @@ BB2_2: .reg .b64 %rd<7>; - ld.param.u64 %rd1, [copy_u2l_dense_param_0]; - ld.param.u32 %r3, [copy_u2l_dense_param_1]; - ld.param.u32 %r4, [copy_u2l_dense_param_2]; + ld.param.u64 %rd1, [copy_u2l_dense_d_param_0]; + ld.param.u32 %r3, [copy_u2l_dense_d_param_1]; + ld.param.u32 %r4, [copy_u2l_dense_d_param_2]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; @@ -268,10 +560,10 @@ BB2_2: setp.gt.s32 %p1, %r9, %r8; setp.lt.s32 %p2, %r2, %r4; and.pred %p3, %p1, %p2; - @!%p3 bra BB3_2; - bra.uni BB3_1; + @!%p3 bra BB8_2; + bra.uni BB8_1; -BB3_1: +BB8_1: cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; @@ -280,16 +572,58 @@ BB3_1: add.s64 %rd6, %rd2, %rd5; st.global.f64 [%rd6], %fd1; -BB3_2: +BB8_2: + ret; +} + + // .globl copy_u2l_dense_f +.visible .entry copy_u2l_dense_f( + .param .u64 copy_u2l_dense_f_param_0, + .param .u32 copy_u2l_dense_f_param_1, + .param .u32 copy_u2l_dense_f_param_2 +) +{ + .reg .pred %p<4>; + .reg .f32 %f<2>; + .reg .b32 %r<10>; + .reg .b64 %rd<7>; + + + ld.param.u64 %rd1, [copy_u2l_dense_f_param_0]; + ld.param.u32 %r3, [copy_u2l_dense_f_param_1]; + ld.param.u32 %r4, [copy_u2l_dense_f_param_2]; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r5, %r6, %r7; + div.s32 %r8, %r1, %r3; + rem.s32 %r9, %r1, %r3; + mad.lo.s32 %r2, %r9, %r3, %r8; + setp.gt.s32 %p1, %r9, %r8; + setp.lt.s32 %p2, %r2, %r4; + and.pred %p3, %p1, %p2; + @!%p3 bra BB9_2; + bra.uni BB9_1; + +BB9_1: + cvta.to.global.u64 %rd2, %rd1; + mul.wide.s32 %rd3, %r1, 4; + add.s64 %rd4, %rd2, %rd3; + ld.global.f32 %f1, [%rd4]; + mul.wide.s32 %rd5, %r2, 4; + add.s64 %rd6, %rd2, %rd5; + st.global.f32 [%rd6], %f1; + +BB9_2: ret; } - // .globl relu -.visible .entry relu( - .param .u64 relu_param_0, - .param .u64 relu_param_1, - .param .u32 relu_param_2, - .param .u32 relu_param_3 + // .globl relu_d +.visible .entry relu_d( + .param .u64 relu_d_param_0, + .param .u64 relu_d_param_1, + .param .u32 relu_d_param_2, + .param .u32 relu_d_param_3 ) { .reg .pred %p<4>; @@ -298,10 +632,10 @@ BB3_2: .reg .b64 %rd<8>; - ld.param.u64 %rd1, [relu_param_0]; - ld.param.u64 %rd2, [relu_param_1]; - ld.param.u32 %r2, [relu_param_2]; - ld.param.u32 %r3, [relu_param_3]; + ld.param.u64 %rd1, [relu_d_param_0]; + ld.param.u64 %rd2, [relu_d_param_1]; + ld.param.u32 %r2, [relu_d_param_2]; + ld.param.u32 %r3, [relu_d_param_3]; mov.u32 %r4, %ctaid.x; mov.u32 %r5, %ntid.x; mov.u32 %r6, %tid.x; @@ -310,10 +644,10 @@ BB3_2: setp.lt.s32 %p1, %r7, %r2; setp.gt.s32 
%p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB4_2; - bra.uni BB4_1; + @!%p3 bra BB10_2; + bra.uni BB10_1; -BB4_1: +BB10_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; add.s64 %rd5, %rd3, %rd4; @@ -324,17 +658,64 @@ BB4_1: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd3; -BB4_2: +BB10_2: + ret; +} + + // .globl relu_f +.visible .entry relu_f( + .param .u64 relu_f_param_0, + .param .u64 relu_f_param_1, + .param .u32 relu_f_param_2, + .param .u32 relu_f_param_3 +) +{ + .reg .pred %p<4>; + .reg .f32 %f<3>; + .reg .b32 %r<8>; + .reg .f64 %fd<4>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [relu_f_param_0]; + ld.param.u64 %rd2, [relu_f_param_1]; + ld.param.u32 %r2, [relu_f_param_2]; + ld.param.u32 %r3, [relu_f_param_3]; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r5, %r4, %r6; + div.s32 %r7, %r1, %r3; + setp.lt.s32 %p1, %r7, %r2; + setp.gt.s32 %p2, %r3, -1; + and.pred %p3, %p1, %p2; + @!%p3 bra BB11_2; + bra.uni BB11_1; + +BB11_1: + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 4; + add.s64 %rd5, %rd3, %rd4; + ld.global.f32 %f1, [%rd5]; + cvt.f64.f32 %fd1, %f1; + mov.f64 %fd2, 0d0000000000000000; + max.f64 %fd3, %fd2, %fd1; + cvt.rn.f32.f64 %f2, %fd3; + cvta.to.global.u64 %rd6, %rd2; + add.s64 %rd7, %rd6, %rd4; + st.global.f32 [%rd7], %f2; + +BB11_2: ret; } - // .globl relu_backward -.visible .entry relu_backward( - .param .u64 relu_backward_param_0, - .param .u64 relu_backward_param_1, - .param .u64 relu_backward_param_2, - .param .u32 relu_backward_param_3, - .param .u32 relu_backward_param_4 + // .globl relu_backward_d +.visible .entry relu_backward_d( + .param .u64 relu_backward_d_param_0, + .param .u64 relu_backward_d_param_1, + .param .u64 relu_backward_d_param_2, + .param .u32 relu_backward_d_param_3, + .param .u32 relu_backward_d_param_4 ) { .reg .pred %p<5>; @@ -343,11 +724,11 @@ BB4_2: .reg .b64 %rd<14>; - ld.param.u64 %rd2, [relu_backward_param_0]; - ld.param.u64 %rd3, [relu_backward_param_1]; - ld.param.u64 %rd4, [relu_backward_param_2]; - ld.param.u32 %r2, [relu_backward_param_3]; - ld.param.u32 %r3, [relu_backward_param_4]; + ld.param.u64 %rd2, [relu_backward_d_param_0]; + ld.param.u64 %rd3, [relu_backward_d_param_1]; + ld.param.u64 %rd4, [relu_backward_d_param_2]; + ld.param.u32 %r2, [relu_backward_d_param_3]; + ld.param.u32 %r3, [relu_backward_d_param_4]; mov.u32 %r4, %ntid.x; mov.u32 %r5, %ctaid.x; mov.u32 %r6, %tid.x; @@ -356,10 +737,10 @@ BB4_2: setp.lt.s32 %p1, %r7, %r2; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB5_4; - bra.uni BB5_1; + @!%p3 bra BB12_4; + bra.uni BB12_1; -BB5_1: +BB12_1: cvta.to.global.u64 %rd5, %rd2; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd6, %r1, 8; @@ -367,42 +748,98 @@ BB5_1: ld.global.f64 %fd4, [%rd7]; mov.f64 %fd5, 0d0000000000000000; setp.leu.f64 %p4, %fd4, 0d0000000000000000; - @%p4 bra BB5_3; + @%p4 bra BB12_3; cvta.to.global.u64 %rd8, %rd3; shl.b64 %rd9, %rd1, 3; add.s64 %rd10, %rd8, %rd9; ld.global.f64 %fd5, [%rd10]; -BB5_3: +BB12_3: cvta.to.global.u64 %rd11, %rd4; shl.b64 %rd12, %rd1, 3; add.s64 %rd13, %rd11, %rd12; st.global.f64 [%rd13], %fd5; -BB5_4: +BB12_4: ret; } - // .globl inplace_add -.visible .entry inplace_add( - .param .u64 inplace_add_param_0, - .param .u64 inplace_add_param_1, - .param .u32 inplace_add_param_2, - .param .u32 inplace_add_param_3 + // .globl relu_backward_f +.visible .entry relu_backward_f( + .param .u64 relu_backward_f_param_0, + .param .u64 relu_backward_f_param_1, + .param .u64 relu_backward_f_param_2, + .param 
.u32 relu_backward_f_param_3, + .param .u32 relu_backward_f_param_4 ) { - .reg .pred %p<4>; + .reg .pred %p<5>; + .reg .f32 %f<6>; .reg .b32 %r<8>; - .reg .f64 %fd<4>; - .reg .b64 %rd<8>; + .reg .b64 %rd<14>; - ld.param.u64 %rd1, [inplace_add_param_0]; - ld.param.u64 %rd2, [inplace_add_param_1]; - ld.param.u32 %r2, [inplace_add_param_2]; - ld.param.u32 %r3, [inplace_add_param_3]; - mov.u32 %r4, %ctaid.x; + ld.param.u64 %rd2, [relu_backward_f_param_0]; + ld.param.u64 %rd3, [relu_backward_f_param_1]; + ld.param.u64 %rd4, [relu_backward_f_param_2]; + ld.param.u32 %r2, [relu_backward_f_param_3]; + ld.param.u32 %r3, [relu_backward_f_param_4]; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r4, %r5, %r6; + div.s32 %r7, %r1, %r3; + setp.lt.s32 %p1, %r7, %r2; + setp.gt.s32 %p2, %r3, -1; + and.pred %p3, %p1, %p2; + @!%p3 bra BB13_4; + bra.uni BB13_1; + +BB13_1: + cvta.to.global.u64 %rd5, %rd2; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd6, %r1, 4; + add.s64 %rd7, %rd5, %rd6; + ld.global.f32 %f4, [%rd7]; + mov.f32 %f5, 0f00000000; + setp.leu.f32 %p4, %f4, 0f00000000; + @%p4 bra BB13_3; + + cvta.to.global.u64 %rd8, %rd3; + shl.b64 %rd9, %rd1, 2; + add.s64 %rd10, %rd8, %rd9; + ld.global.f32 %f5, [%rd10]; + +BB13_3: + cvta.to.global.u64 %rd11, %rd4; + shl.b64 %rd12, %rd1, 2; + add.s64 %rd13, %rd11, %rd12; + st.global.f32 [%rd13], %f5; + +BB13_4: + ret; +} + + // .globl inplace_add_d +.visible .entry inplace_add_d( + .param .u64 inplace_add_d_param_0, + .param .u64 inplace_add_d_param_1, + .param .u32 inplace_add_d_param_2, + .param .u32 inplace_add_d_param_3 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<8>; + .reg .f64 %fd<4>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [inplace_add_d_param_0]; + ld.param.u64 %rd2, [inplace_add_d_param_1]; + ld.param.u32 %r2, [inplace_add_d_param_2]; + ld.param.u32 %r3, [inplace_add_d_param_3]; + mov.u32 %r4, %ctaid.x; mov.u32 %r5, %ntid.x; mov.u32 %r6, %tid.x; mad.lo.s32 %r1, %r5, %r4, %r6; @@ -410,10 +847,10 @@ BB5_4: setp.lt.s32 %p1, %r7, %r2; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB6_2; - bra.uni BB6_1; + @!%p3 bra BB14_2; + bra.uni BB14_1; -BB6_1: +BB14_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; add.s64 %rd5, %rd3, %rd4; @@ -424,18 +861,62 @@ BB6_1: add.f64 %fd3, %fd2, %fd1; st.global.f64 [%rd7], %fd3; -BB6_2: +BB14_2: + ret; +} + + // .globl inplace_add_f +.visible .entry inplace_add_f( + .param .u64 inplace_add_f_param_0, + .param .u64 inplace_add_f_param_1, + .param .u32 inplace_add_f_param_2, + .param .u32 inplace_add_f_param_3 +) +{ + .reg .pred %p<4>; + .reg .f32 %f<4>; + .reg .b32 %r<8>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [inplace_add_f_param_0]; + ld.param.u64 %rd2, [inplace_add_f_param_1]; + ld.param.u32 %r2, [inplace_add_f_param_2]; + ld.param.u32 %r3, [inplace_add_f_param_3]; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r5, %r4, %r6; + div.s32 %r7, %r1, %r3; + setp.lt.s32 %p1, %r7, %r2; + setp.gt.s32 %p2, %r3, -1; + and.pred %p3, %p1, %p2; + @!%p3 bra BB15_2; + bra.uni BB15_1; + +BB15_1: + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 4; + add.s64 %rd5, %rd3, %rd4; + cvta.to.global.u64 %rd6, %rd2; + add.s64 %rd7, %rd6, %rd4; + ld.global.f32 %f1, [%rd7]; + ld.global.f32 %f2, [%rd5]; + add.f32 %f3, %f2, %f1; + st.global.f32 [%rd7], %f3; + +BB15_2: ret; } - // .globl bias_add -.visible .entry bias_add( - .param .u64 bias_add_param_0, - .param .u64 bias_add_param_1, - .param .u64 bias_add_param_2, - .param .u32 
bias_add_param_3, - .param .u32 bias_add_param_4, - .param .u32 bias_add_param_5 + // .globl bias_add_d +.visible .entry bias_add_d( + .param .u64 bias_add_d_param_0, + .param .u64 bias_add_d_param_1, + .param .u64 bias_add_d_param_2, + .param .u32 bias_add_d_param_3, + .param .u32 bias_add_d_param_4, + .param .u32 bias_add_d_param_5 ) { .reg .pred %p<4>; @@ -444,12 +925,12 @@ BB6_2: .reg .b64 %rd<12>; - ld.param.u64 %rd1, [bias_add_param_0]; - ld.param.u64 %rd2, [bias_add_param_1]; - ld.param.u64 %rd3, [bias_add_param_2]; - ld.param.u32 %r4, [bias_add_param_3]; - ld.param.u32 %r2, [bias_add_param_4]; - ld.param.u32 %r3, [bias_add_param_5]; + ld.param.u64 %rd1, [bias_add_d_param_0]; + ld.param.u64 %rd2, [bias_add_d_param_1]; + ld.param.u64 %rd3, [bias_add_d_param_2]; + ld.param.u32 %r4, [bias_add_d_param_3]; + ld.param.u32 %r2, [bias_add_d_param_4]; + ld.param.u32 %r3, [bias_add_d_param_5]; mov.u32 %r5, %ctaid.x; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; @@ -458,10 +939,10 @@ BB6_2: setp.lt.s32 %p1, %r8, %r4; setp.gt.s32 %p2, %r2, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB7_2; - bra.uni BB7_1; + @!%p3 bra BB16_2; + bra.uni BB16_1; -BB7_1: +BB16_1: rem.s32 %r9, %r1, %r2; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r1, 8; @@ -477,20 +958,73 @@ BB7_1: add.s64 %rd11, %rd10, %rd5; st.global.f64 [%rd11], %fd3; -BB7_2: +BB16_2: + ret; +} + + // .globl bias_add_f +.visible .entry bias_add_f( + .param .u64 bias_add_f_param_0, + .param .u64 bias_add_f_param_1, + .param .u64 bias_add_f_param_2, + .param .u32 bias_add_f_param_3, + .param .u32 bias_add_f_param_4, + .param .u32 bias_add_f_param_5 +) +{ + .reg .pred %p<4>; + .reg .f32 %f<4>; + .reg .b32 %r<11>; + .reg .b64 %rd<12>; + + + ld.param.u64 %rd1, [bias_add_f_param_0]; + ld.param.u64 %rd2, [bias_add_f_param_1]; + ld.param.u64 %rd3, [bias_add_f_param_2]; + ld.param.u32 %r4, [bias_add_f_param_3]; + ld.param.u32 %r2, [bias_add_f_param_4]; + ld.param.u32 %r3, [bias_add_f_param_5]; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r6, %r5, %r7; + div.s32 %r8, %r1, %r2; + setp.lt.s32 %p1, %r8, %r4; + setp.gt.s32 %p2, %r2, -1; + and.pred %p3, %p1, %p2; + @!%p3 bra BB17_2; + bra.uni BB17_1; + +BB17_1: + rem.s32 %r9, %r1, %r2; + cvta.to.global.u64 %rd4, %rd1; + mul.wide.s32 %rd5, %r1, 4; + add.s64 %rd6, %rd4, %rd5; + div.s32 %r10, %r9, %r3; + cvta.to.global.u64 %rd7, %rd2; + mul.wide.s32 %rd8, %r10, 4; + add.s64 %rd9, %rd7, %rd8; + ld.global.f32 %f1, [%rd9]; + ld.global.f32 %f2, [%rd6]; + add.f32 %f3, %f2, %f1; + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd11, %rd10, %rd5; + st.global.f32 [%rd11], %f3; + +BB17_2: ret; } - // .globl daxpy_matrix_vector -.visible .entry daxpy_matrix_vector( - .param .u64 daxpy_matrix_vector_param_0, - .param .u64 daxpy_matrix_vector_param_1, - .param .f64 daxpy_matrix_vector_param_2, - .param .u64 daxpy_matrix_vector_param_3, - .param .u32 daxpy_matrix_vector_param_4, - .param .u32 daxpy_matrix_vector_param_5, - .param .u32 daxpy_matrix_vector_param_6, - .param .u32 daxpy_matrix_vector_param_7 + // .globl daxpy_matrix_vector_d +.visible .entry daxpy_matrix_vector_d( + .param .u64 daxpy_matrix_vector_d_param_0, + .param .u64 daxpy_matrix_vector_d_param_1, + .param .f64 daxpy_matrix_vector_d_param_2, + .param .u64 daxpy_matrix_vector_d_param_3, + .param .u32 daxpy_matrix_vector_d_param_4, + .param .u32 daxpy_matrix_vector_d_param_5, + .param .u32 daxpy_matrix_vector_d_param_6, + .param .u32 daxpy_matrix_vector_d_param_7 ) { .reg .pred %p<5>; @@ -499,13 +1033,13 @@ BB7_2: .reg 
.b64 %rd<14>; - ld.param.u64 %rd3, [daxpy_matrix_vector_param_0]; - ld.param.u64 %rd5, [daxpy_matrix_vector_param_1]; - ld.param.f64 %fd2, [daxpy_matrix_vector_param_2]; - ld.param.u64 %rd4, [daxpy_matrix_vector_param_3]; - ld.param.u32 %r5, [daxpy_matrix_vector_param_4]; - ld.param.u32 %r3, [daxpy_matrix_vector_param_5]; - ld.param.u32 %r4, [daxpy_matrix_vector_param_6]; + ld.param.u64 %rd3, [daxpy_matrix_vector_d_param_0]; + ld.param.u64 %rd5, [daxpy_matrix_vector_d_param_1]; + ld.param.f64 %fd2, [daxpy_matrix_vector_d_param_2]; + ld.param.u64 %rd4, [daxpy_matrix_vector_d_param_3]; + ld.param.u32 %r5, [daxpy_matrix_vector_d_param_4]; + ld.param.u32 %r3, [daxpy_matrix_vector_d_param_5]; + ld.param.u32 %r4, [daxpy_matrix_vector_d_param_6]; cvta.to.global.u64 %rd1, %rd5; mov.u32 %r6, %ntid.x; mov.u32 %r7, %ctaid.x; @@ -516,10 +1050,10 @@ BB7_2: setp.lt.s32 %p1, %r1, %r5; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB8_4; - bra.uni BB8_1; + @!%p3 bra BB18_4; + bra.uni BB18_1; -BB8_1: +BB18_1: cvta.to.global.u64 %rd6, %rd4; mad.lo.s32 %r10, %r1, %r3, %r2; cvta.to.global.u64 %rd7, %rd3; @@ -528,36 +1062,111 @@ BB8_1: ld.global.f64 %fd1, [%rd9]; add.s64 %rd2, %rd6, %rd8; setp.eq.s32 %p4, %r4, 1; - @%p4 bra BB8_3; - bra.uni BB8_2; + @%p4 bra BB18_3; + bra.uni BB18_2; -BB8_3: +BB18_3: mul.wide.s32 %rd12, %r2, 8; add.s64 %rd13, %rd1, %rd12; ld.global.f64 %fd5, [%rd13]; fma.rn.f64 %fd6, %fd5, %fd2, %fd1; st.global.f64 [%rd2], %fd6; - bra.uni BB8_4; + bra.uni BB18_4; -BB8_2: +BB18_2: mul.wide.s32 %rd10, %r1, 8; add.s64 %rd11, %rd1, %rd10; ld.global.f64 %fd3, [%rd11]; fma.rn.f64 %fd4, %fd3, %fd2, %fd1; st.global.f64 [%rd2], %fd4; -BB8_4: +BB18_4: + ret; +} + + // .globl daxpy_matrix_vector_f +.visible .entry daxpy_matrix_vector_f( + .param .u64 daxpy_matrix_vector_f_param_0, + .param .u64 daxpy_matrix_vector_f_param_1, + .param .f64 daxpy_matrix_vector_f_param_2, + .param .u64 daxpy_matrix_vector_f_param_3, + .param .u32 daxpy_matrix_vector_f_param_4, + .param .u32 daxpy_matrix_vector_f_param_5, + .param .u32 daxpy_matrix_vector_f_param_6, + .param .u32 daxpy_matrix_vector_f_param_7 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<6>; + .reg .b32 %r<11>; + .reg .f64 %fd<7>; + .reg .b64 %rd<14>; + + + ld.param.u64 %rd3, [daxpy_matrix_vector_f_param_0]; + ld.param.u64 %rd5, [daxpy_matrix_vector_f_param_1]; + ld.param.f64 %fd2, [daxpy_matrix_vector_f_param_2]; + ld.param.u64 %rd4, [daxpy_matrix_vector_f_param_3]; + ld.param.u32 %r5, [daxpy_matrix_vector_f_param_4]; + ld.param.u32 %r3, [daxpy_matrix_vector_f_param_5]; + ld.param.u32 %r4, [daxpy_matrix_vector_f_param_6]; + cvta.to.global.u64 %rd1, %rd5; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r9, %r6, %r7, %r8; + div.s32 %r1, %r9, %r3; + rem.s32 %r2, %r9, %r3; + setp.lt.s32 %p1, %r1, %r5; + setp.gt.s32 %p2, %r3, -1; + and.pred %p3, %p1, %p2; + @!%p3 bra BB19_4; + bra.uni BB19_1; + +BB19_1: + cvta.to.global.u64 %rd6, %rd4; + mad.lo.s32 %r10, %r1, %r3, %r2; + cvta.to.global.u64 %rd7, %rd3; + mul.wide.s32 %rd8, %r10, 4; + add.s64 %rd9, %rd7, %rd8; + ld.global.f32 %f1, [%rd9]; + cvt.f64.f32 %fd1, %f1; + add.s64 %rd2, %rd6, %rd8; + setp.eq.s32 %p4, %r4, 1; + @%p4 bra BB19_3; + bra.uni BB19_2; + +BB19_3: + mul.wide.s32 %rd12, %r2, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.f32 %f4, [%rd13]; + cvt.f64.f32 %fd5, %f4; + fma.rn.f64 %fd6, %fd5, %fd2, %fd1; + cvt.rn.f32.f64 %f5, %fd6; + st.global.f32 [%rd2], %f5; + bra.uni BB19_4; + +BB19_2: + mul.wide.s32 %rd10, %r1, 4; + add.s64 %rd11, %rd1, %rd10; + 
ld.global.f32 %f2, [%rd11]; + cvt.f64.f32 %fd3, %f2; + fma.rn.f64 %fd4, %fd3, %fd2, %fd1; + cvt.rn.f32.f64 %f3, %fd4; + st.global.f32 [%rd2], %f3; + +BB19_4: ret; } - // .globl bias_multiply -.visible .entry bias_multiply( - .param .u64 bias_multiply_param_0, - .param .u64 bias_multiply_param_1, - .param .u64 bias_multiply_param_2, - .param .u32 bias_multiply_param_3, - .param .u32 bias_multiply_param_4, - .param .u32 bias_multiply_param_5 + // .globl bias_multiply_d +.visible .entry bias_multiply_d( + .param .u64 bias_multiply_d_param_0, + .param .u64 bias_multiply_d_param_1, + .param .u64 bias_multiply_d_param_2, + .param .u32 bias_multiply_d_param_3, + .param .u32 bias_multiply_d_param_4, + .param .u32 bias_multiply_d_param_5 ) { .reg .pred %p<4>; @@ -566,12 +1175,12 @@ BB8_4: .reg .b64 %rd<12>; - ld.param.u64 %rd1, [bias_multiply_param_0]; - ld.param.u64 %rd2, [bias_multiply_param_1]; - ld.param.u64 %rd3, [bias_multiply_param_2]; - ld.param.u32 %r4, [bias_multiply_param_3]; - ld.param.u32 %r2, [bias_multiply_param_4]; - ld.param.u32 %r3, [bias_multiply_param_5]; + ld.param.u64 %rd1, [bias_multiply_d_param_0]; + ld.param.u64 %rd2, [bias_multiply_d_param_1]; + ld.param.u64 %rd3, [bias_multiply_d_param_2]; + ld.param.u32 %r4, [bias_multiply_d_param_3]; + ld.param.u32 %r2, [bias_multiply_d_param_4]; + ld.param.u32 %r3, [bias_multiply_d_param_5]; mov.u32 %r5, %ctaid.x; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; @@ -580,10 +1189,10 @@ BB8_4: setp.lt.s32 %p1, %r8, %r4; setp.gt.s32 %p2, %r2, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB9_2; - bra.uni BB9_1; + @!%p3 bra BB20_2; + bra.uni BB20_1; -BB9_1: +BB20_1: rem.s32 %r9, %r1, %r2; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r1, 8; @@ -599,110 +1208,89 @@ BB9_1: add.s64 %rd11, %rd10, %rd5; st.global.f64 [%rd11], %fd3; -BB9_2: +BB20_2: ret; } - // .globl compare_and_set -.visible .entry compare_and_set( - .param .u64 compare_and_set_param_0, - .param .u64 compare_and_set_param_1, - .param .u32 compare_and_set_param_2, - .param .u32 compare_and_set_param_3, - .param .f64 compare_and_set_param_4, - .param .f64 compare_and_set_param_5, - .param .f64 compare_and_set_param_6, - .param .f64 compare_and_set_param_7, - .param .f64 compare_and_set_param_8 + // .globl bias_multiply_f +.visible .entry bias_multiply_f( + .param .u64 bias_multiply_f_param_0, + .param .u64 bias_multiply_f_param_1, + .param .u64 bias_multiply_f_param_2, + .param .u32 bias_multiply_f_param_3, + .param .u32 bias_multiply_f_param_4, + .param .u32 bias_multiply_f_param_5 ) { - .reg .pred %p<6>; - .reg .b32 %r<10>; - .reg .f64 %fd<9>; - .reg .b64 %rd<8>; + .reg .pred %p<4>; + .reg .f32 %f<4>; + .reg .b32 %r<11>; + .reg .b64 %rd<12>; - ld.param.u64 %rd2, [compare_and_set_param_0]; - ld.param.u64 %rd3, [compare_and_set_param_1]; - ld.param.u32 %r2, [compare_and_set_param_2]; - ld.param.u32 %r3, [compare_and_set_param_3]; - ld.param.f64 %fd2, [compare_and_set_param_4]; - ld.param.f64 %fd3, [compare_and_set_param_5]; - ld.param.f64 %fd4, [compare_and_set_param_6]; - ld.param.f64 %fd5, [compare_and_set_param_7]; - ld.param.f64 %fd6, [compare_and_set_param_8]; - mov.u32 %r4, %ctaid.x; - mov.u32 %r5, %ntid.x; - mov.u32 %r6, %tid.x; - mad.lo.s32 %r7, %r5, %r4, %r6; - div.s32 %r8, %r7, %r3; - rem.s32 %r9, %r7, %r3; - mad.lo.s32 %r1, %r8, %r3, %r9; - setp.lt.s32 %p1, %r8, %r2; - setp.gt.s32 %p2, %r3, -1; + ld.param.u64 %rd1, [bias_multiply_f_param_0]; + ld.param.u64 %rd2, [bias_multiply_f_param_1]; + ld.param.u64 %rd3, [bias_multiply_f_param_2]; + ld.param.u32 %r4, 
[bias_multiply_f_param_3]; + ld.param.u32 %r2, [bias_multiply_f_param_4]; + ld.param.u32 %r3, [bias_multiply_f_param_5]; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r6, %r5, %r7; + div.s32 %r8, %r1, %r2; + setp.lt.s32 %p1, %r8, %r4; + setp.gt.s32 %p2, %r2, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB10_6; - bra.uni BB10_1; + @!%p3 bra BB21_2; + bra.uni BB21_1; -BB10_1: - cvta.to.global.u64 %rd4, %rd2; - mul.wide.s32 %rd5, %r1, 8; +BB21_1: + rem.s32 %r9, %r1, %r2; + cvta.to.global.u64 %rd4, %rd1; + mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; - ld.global.f64 %fd1, [%rd6]; - sub.f64 %fd7, %fd1, %fd2; - abs.f64 %fd8, %fd7; - setp.lt.f64 %p4, %fd8, %fd3; - cvta.to.global.u64 %rd7, %rd3; - add.s64 %rd1, %rd7, %rd5; - @%p4 bra BB10_5; - bra.uni BB10_2; - -BB10_5: - st.global.f64 [%rd1], %fd4; - bra.uni BB10_6; - -BB10_2: - setp.lt.f64 %p5, %fd1, %fd2; - @%p5 bra BB10_4; - bra.uni BB10_3; - -BB10_4: - st.global.f64 [%rd1], %fd5; - bra.uni BB10_6; - -BB10_3: - st.global.f64 [%rd1], %fd6; + div.s32 %r10, %r9, %r3; + cvta.to.global.u64 %rd7, %rd2; + mul.wide.s32 %rd8, %r10, 4; + add.s64 %rd9, %rd7, %rd8; + ld.global.f32 %f1, [%rd9]; + ld.global.f32 %f2, [%rd6]; + mul.f32 %f3, %f2, %f1; + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd11, %rd10, %rd5; + st.global.f32 [%rd11], %f3; -BB10_6: +BB21_2: ret; } - // .globl matrix_matrix_cellwise_op -.visible .entry matrix_matrix_cellwise_op( - .param .u64 matrix_matrix_cellwise_op_param_0, - .param .u64 matrix_matrix_cellwise_op_param_1, - .param .u64 matrix_matrix_cellwise_op_param_2, - .param .u32 matrix_matrix_cellwise_op_param_3, - .param .u32 matrix_matrix_cellwise_op_param_4, - .param .u32 matrix_matrix_cellwise_op_param_5, - .param .u32 matrix_matrix_cellwise_op_param_6, - .param .u32 matrix_matrix_cellwise_op_param_7 + // .globl matrix_matrix_cellwise_op_d +.visible .entry matrix_matrix_cellwise_op_d( + .param .u64 matrix_matrix_cellwise_op_d_param_0, + .param .u64 matrix_matrix_cellwise_op_d_param_1, + .param .u64 matrix_matrix_cellwise_op_d_param_2, + .param .u32 matrix_matrix_cellwise_op_d_param_3, + .param .u32 matrix_matrix_cellwise_op_d_param_4, + .param .u32 matrix_matrix_cellwise_op_d_param_5, + .param .u32 matrix_matrix_cellwise_op_d_param_6, + .param .u32 matrix_matrix_cellwise_op_d_param_7 ) { - .reg .pred %p<77>; - .reg .b32 %r<65>; - .reg .f64 %fd<55>; + .reg .pred %p<73>; + .reg .b32 %r<66>; + .reg .f64 %fd<56>; .reg .b64 %rd<19>; - ld.param.u64 %rd2, [matrix_matrix_cellwise_op_param_0]; - ld.param.u64 %rd3, [matrix_matrix_cellwise_op_param_1]; - ld.param.u64 %rd4, [matrix_matrix_cellwise_op_param_2]; - ld.param.u32 %r14, [matrix_matrix_cellwise_op_param_3]; - ld.param.u32 %r10, [matrix_matrix_cellwise_op_param_4]; - ld.param.u32 %r11, [matrix_matrix_cellwise_op_param_5]; - ld.param.u32 %r12, [matrix_matrix_cellwise_op_param_6]; - ld.param.u32 %r13, [matrix_matrix_cellwise_op_param_7]; + ld.param.u64 %rd2, [matrix_matrix_cellwise_op_d_param_0]; + ld.param.u64 %rd3, [matrix_matrix_cellwise_op_d_param_1]; + ld.param.u64 %rd4, [matrix_matrix_cellwise_op_d_param_2]; + ld.param.u32 %r14, [matrix_matrix_cellwise_op_d_param_3]; + ld.param.u32 %r10, [matrix_matrix_cellwise_op_d_param_4]; + ld.param.u32 %r11, [matrix_matrix_cellwise_op_d_param_5]; + ld.param.u32 %r12, [matrix_matrix_cellwise_op_d_param_6]; + ld.param.u32 %r13, [matrix_matrix_cellwise_op_d_param_7]; mov.u32 %r15, %ntid.x; mov.u32 %r16, %ctaid.x; mov.u32 %r17, %tid.x; @@ -712,93 +1300,93 @@ BB10_6: setp.lt.s32 %p2, %r1, %r14; 
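// ---------------------------------------------------------------------
// [Editor's note, not part of the generated patch] The long branch tree
// that follows implements an opcode dispatch over cellwise operators
// (numbering inferred from the PTX: 0 add, 1 sub, 2 mul, 3 div, 4 pow,
// 5-10 comparisons, 11 min, 12 max, 13/14 and/or, 15 1-x*y, 16 minus-nz,
// 17 mod, 18 intdiv). A hedged CUDA sketch of the structure; idx() is a
// hypothetical broadcast helper returning rix for vecStatus 1, cix for
// vecStatus 2, and the full row-major offset otherwise:
//
// int ix  = blockIdx.x * blockDim.x + threadIdx.x;
// int rix = ix / maxCols, cix = ix % maxCols;
// if (rix < maxRows && maxCols > -1) {
//   double a = A[idx(rix, cix, vecStatusA)], b = B[idx(rix, cix, vecStatusB)];
//   double c = DBL_MAX;            // PTX seeds %fd55 with 0d7FEFFFFFFFFFFFFF
//   switch (op) {
//     case 0:  c = a + b; break;
//     case 2:  c = a * b; break;
//     case 11: c = fmin(a, b); break;
//     case 12: c = fmax(a, b); break;
//     // ... the remaining ops follow the same pattern
//   }
//   C[rix * maxCols + cix] = c;
// }
// ---------------------------------------------------------------------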
setp.gt.s32 %p3, %r10, -1; and.pred %p4, %p2, %p3; - @!%p4 bra BB11_73; - bra.uni BB11_1; + @!%p4 bra BB22_77; + bra.uni BB22_1; -BB11_1: +BB22_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; - mov.u32 %r63, %r1; - @%p5 bra BB11_5; + mov.u32 %r64, %r1; + @%p5 bra BB22_5; setp.ne.s32 %p6, %r11, 2; - mov.u32 %r64, %r3; - @%p6 bra BB11_4; + mov.u32 %r65, %r3; + @%p6 bra BB22_4; - mov.u32 %r64, %r2; + mov.u32 %r65, %r2; -BB11_4: - mov.u32 %r58, %r64; - mov.u32 %r4, %r58; - mov.u32 %r63, %r4; +BB22_4: + mov.u32 %r59, %r65; + mov.u32 %r4, %r59; + mov.u32 %r64, %r4; -BB11_5: - mov.u32 %r5, %r63; +BB22_5: + mov.u32 %r5, %r64; setp.eq.s32 %p7, %r12, 1; - mov.u32 %r61, %r1; - @%p7 bra BB11_9; + mov.u32 %r62, %r1; + @%p7 bra BB22_9; setp.ne.s32 %p8, %r12, 2; - mov.u32 %r62, %r3; - @%p8 bra BB11_8; + mov.u32 %r63, %r3; + @%p8 bra BB22_8; - mov.u32 %r62, %r2; + mov.u32 %r63, %r2; -BB11_8: - mov.u32 %r61, %r62; +BB22_8: + mov.u32 %r62, %r63; -BB11_9: +BB22_9: cvta.to.global.u64 %rd5, %rd3; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r5, 8; add.s64 %rd8, %rd6, %rd7; ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 %rd9, %r61, 8; + mul.wide.s32 %rd9, %r62, 8; add.s64 %rd10, %rd5, %rd9; ld.global.f64 %fd2, [%rd10]; - mov.f64 %fd54, 0d7FEFFFFFFFFFFFFF; + mov.f64 %fd55, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p9, %r13, 8; - @%p9 bra BB11_26; + @%p9 bra BB22_26; setp.gt.s32 %p23, %r13, 3; - @%p23 bra BB11_18; + @%p23 bra BB22_18; setp.gt.s32 %p30, %r13, 1; - @%p30 bra BB11_15; + @%p30 bra BB22_15; setp.eq.s32 %p33, %r13, 0; - @%p33 bra BB11_71; - bra.uni BB11_13; + @%p33 bra BB22_75; + bra.uni BB22_13; -BB11_71: - add.f64 %fd54, %fd1, %fd2; - bra.uni BB11_72; +BB22_75: + add.f64 %fd55, %fd1, %fd2; + bra.uni BB22_76; -BB11_26: +BB22_26: setp.gt.s32 %p10, %r13, 13; - @%p10 bra BB11_35; + @%p10 bra BB22_35; setp.gt.s32 %p17, %r13, 10; - @%p17 bra BB11_31; + @%p17 bra BB22_31; setp.eq.s32 %p21, %r13, 9; - @%p21 bra BB11_53; - bra.uni BB11_29; + @%p21 bra BB22_55; + bra.uni BB22_29; -BB11_53: - setp.eq.f64 %p50, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50; - bra.uni BB11_72; +BB22_55: + setp.eq.f64 %p48, %fd1, %fd2; + selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p48; + bra.uni BB22_76; -BB11_18: +BB22_18: setp.gt.s32 %p24, %r13, 5; - @%p24 bra BB11_22; + @%p24 bra BB22_22; setp.eq.s32 %p28, %r13, 4; - @%p28 bra BB11_56; - bra.uni BB11_20; + @%p28 bra BB22_58; + bra.uni BB22_20; -BB11_56: +BB22_58: { .reg .b32 %temp; mov.b64 {%temp, %r8}, %fd1; @@ -811,7 +1399,7 @@ BB11_56: add.s32 %r32, %r31, -1012; mov.b64 %rd15, %fd2; shl.b64 %rd1, %rd15, %r32; - setp.eq.s64 %p55, %rd1, -9223372036854775808; + setp.eq.s64 %p53, %rd1, -9223372036854775808; abs.f64 %fd19, %fd1; // Callseq Start 0 { @@ -828,472 +1416,966 @@ BB11_56: param0, param1 ); - ld.param.f64 %fd53, [retval0+0]; + ld.param.f64 %fd54, [retval0+0]; //{ }// Callseq End 0 - setp.lt.s32 %p56, %r8, 0; - and.pred %p1, %p56, %p55; - @!%p1 bra BB11_58; - bra.uni BB11_57; + setp.lt.s32 %p54, %r8, 0; + and.pred %p1, %p54, %p53; + @!%p1 bra BB22_60; + bra.uni BB22_59; -BB11_57: +BB22_59: { .reg .b32 %temp; - mov.b64 {%temp, %r33}, %fd53; + mov.b64 {%temp, %r33}, %fd54; } xor.b32 %r34, %r33, -2147483648; { .reg .b32 %temp; - mov.b64 {%r35, %temp}, %fd53; + mov.b64 {%r35, %temp}, %fd54; } - mov.b64 %fd53, {%r35, %r34}; + mov.b64 %fd54, {%r35, %r34}; -BB11_58: - mov.f64 %fd52, %fd53; - setp.eq.f64 %p57, %fd1, 0d0000000000000000; - @%p57 bra BB11_61; - bra.uni BB11_59; +BB22_60: + mov.f64 %fd53, %fd54; + setp.eq.f64 %p55, 
%fd1, 0d0000000000000000; + @%p55 bra BB22_63; + bra.uni BB22_61; -BB11_61: - selp.b32 %r36, %r8, 0, %p55; +BB22_63: + selp.b32 %r36, %r8, 0, %p53; or.b32 %r37, %r36, 2146435072; - setp.lt.s32 %p61, %r9, 0; - selp.b32 %r38, %r37, %r36, %p61; + setp.lt.s32 %p59, %r9, 0; + selp.b32 %r38, %r37, %r36, %p59; mov.u32 %r39, 0; - mov.b64 %fd52, {%r39, %r38}; - bra.uni BB11_62; + mov.b64 %fd53, {%r39, %r38}; + bra.uni BB22_64; -BB11_35: +BB22_35: setp.gt.s32 %p11, %r13, 15; - @%p11 bra BB11_39; + @%p11 bra BB22_39; setp.eq.s32 %p15, %r13, 14; - @%p15 bra BB11_50; - bra.uni BB11_37; + @%p15 bra BB22_52; + bra.uni BB22_37; -BB11_50: +BB22_52: cvt.rni.s64.f64 %rd11, %fd1; cvt.rni.s64.f64 %rd12, %fd2; cvt.u32.u64 %r25, %rd11; cvt.u32.u64 %r26, %rd12; or.b32 %r27, %r26, %r25; - setp.eq.s32 %p47, %r27, 0; - selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47; - bra.uni BB11_72; + setp.eq.s32 %p45, %r27, 0; + selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45; + bra.uni BB22_76; -BB11_15: +BB22_15: setp.eq.s32 %p31, %r13, 2; - @%p31 bra BB11_70; - bra.uni BB11_16; + @%p31 bra BB22_74; + bra.uni BB22_16; -BB11_70: - mul.f64 %fd54, %fd1, %fd2; - bra.uni BB11_72; +BB22_74: + mul.f64 %fd55, %fd1, %fd2; + bra.uni BB22_76; -BB11_31: +BB22_31: setp.eq.s32 %p18, %r13, 11; - @%p18 bra BB11_52; + @%p18 bra BB22_54; setp.eq.s32 %p19, %r13, 12; - @%p19 bra BB11_51; - bra.uni BB11_33; + @%p19 bra BB22_53; + bra.uni BB22_33; -BB11_51: - max.f64 %fd54, %fd1, %fd2; - bra.uni BB11_72; +BB22_53: + max.f64 %fd55, %fd1, %fd2; + bra.uni BB22_76; -BB11_22: +BB22_22: setp.eq.s32 %p25, %r13, 6; - @%p25 bra BB11_55; + @%p25 bra BB22_57; setp.eq.s32 %p26, %r13, 7; - @%p26 bra BB11_54; - bra.uni BB11_24; + @%p26 bra BB22_56; + bra.uni BB22_24; -BB11_54: - setp.gt.f64 %p52, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52; - bra.uni BB11_72; +BB22_56: + setp.gt.f64 %p50, %fd1, %fd2; + selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p50; + bra.uni BB22_76; -BB11_39: +BB22_39: setp.eq.s32 %p12, %r13, 16; - @%p12 bra BB11_49; + @%p12 bra BB22_51; setp.eq.s32 %p13, %r13, 17; - @%p13 bra BB11_45; - bra.uni BB11_41; + @%p13 bra BB22_46; + bra.uni BB22_41; -BB11_45: - setp.eq.f64 %p39, %fd2, 0d0000000000000000; - setp.eq.f64 %p40, %fd2, 0d8000000000000000; - or.pred %p41, %p39, %p40; - mov.f64 %fd54, 0d7FF8000000000000; - @%p41 bra BB11_72; +BB22_46: + setp.eq.f64 %p38, %fd2, 0d0000000000000000; + setp.eq.f64 %p39, %fd2, 0d8000000000000000; + or.pred %p40, %p38, %p39; + mov.f64 %fd55, 0d7FF8000000000000; + @%p40 bra BB22_76; - div.rn.f64 %fd54, %fd1, %fd2; - abs.f64 %fd39, %fd54; - setp.gtu.f64 %p42, %fd39, 0d7FF0000000000000; - @%p42 bra BB11_72; + div.rn.f64 %fd55, %fd1, %fd2; + abs.f64 %fd39, %fd55; + setp.gtu.f64 %p41, %fd39, 0d7FF0000000000000; + @%p41 bra BB22_76; { .reg .b32 %temp; - mov.b64 {%r22, %temp}, %fd54; + mov.b64 {%temp, %r22}, %fd55; } + and.b32 %r23, %r22, 2147483647; + setp.ne.s32 %p42, %r23, 2146435072; + @%p42 bra BB22_50; + { .reg .b32 %temp; - mov.b64 {%temp, %r23}, %fd54; + mov.b64 {%r24, %temp}, %fd55; } - and.b32 %r24, %r23, 2147483647; - setp.ne.s32 %p43, %r24, 2146435072; - setp.ne.s32 %p44, %r22, 0; - or.pred %p45, %p43, %p44; - @!%p45 bra BB11_72; - bra.uni BB11_48; - -BB11_48: - cvt.rmi.f64.f64 %fd40, %fd54; + setp.eq.s32 %p43, %r24, 0; + @%p43 bra BB22_76; + +BB22_50: + cvt.rmi.f64.f64 %fd40, %fd55; mul.f64 %fd41, %fd2, %fd40; - sub.f64 %fd54, %fd1, %fd41; - bra.uni BB11_72; + sub.f64 %fd55, %fd1, %fd41; + bra.uni BB22_76; -BB11_13: +BB22_13: setp.eq.s32 
%p34, %r13, 1; - @%p34 bra BB11_14; - bra.uni BB11_72; + @%p34 bra BB22_14; + bra.uni BB22_76; -BB11_14: - sub.f64 %fd54, %fd1, %fd2; - bra.uni BB11_72; +BB22_14: + sub.f64 %fd55, %fd1, %fd2; + bra.uni BB22_76; -BB11_29: +BB22_29: setp.eq.s32 %p22, %r13, 10; - @%p22 bra BB11_30; - bra.uni BB11_72; + @%p22 bra BB22_30; + bra.uni BB22_76; -BB11_30: - setp.neu.f64 %p49, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49; - bra.uni BB11_72; +BB22_30: + setp.neu.f64 %p47, %fd1, %fd2; + selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p47; + bra.uni BB22_76; -BB11_20: +BB22_20: setp.eq.s32 %p29, %r13, 5; - @%p29 bra BB11_21; - bra.uni BB11_72; + @%p29 bra BB22_21; + bra.uni BB22_76; -BB11_21: - setp.lt.f64 %p54, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54; - bra.uni BB11_72; +BB22_21: + setp.lt.f64 %p52, %fd1, %fd2; + selp.f64 %fd55, 0d3FF0000000000000, 0d0000000000000000, %p52; + bra.uni BB22_76; -BB11_37: +BB22_37: setp.eq.s32 %p16, %r13, 15; - @%p16 bra BB11_38; - bra.uni BB11_72; + @%p16 bra BB22_38; + bra.uni BB22_76; -BB11_38: +BB22_38: mul.f64 %fd43, %fd1, %fd2; mov.f64 %fd44, 0d3FF0000000000000; - sub.f64 %fd54, %fd44, %fd43; - bra.uni BB11_72; + sub.f64 %fd55, %fd44, %fd43; + bra.uni BB22_76; -BB11_16: +BB22_16: setp.eq.s32 %p32, %r13, 3; - @%p32 bra BB11_17; - bra.uni BB11_72; + @%p32 bra BB22_17; + bra.uni BB22_76; -BB11_17: - div.rn.f64 %fd54, %fd1, %fd2; - bra.uni BB11_72; +BB22_17: + div.rn.f64 %fd55, %fd1, %fd2; + bra.uni BB22_76; -BB11_52: - min.f64 %fd54, %fd1, %fd2; - bra.uni BB11_72; +BB22_54: + min.f64 %fd55, %fd1, %fd2; + bra.uni BB22_76; -BB11_33: +BB22_33: setp.eq.s32 %p20, %r13, 13; - @%p20 bra BB11_34; - bra.uni BB11_72; + @%p20 bra BB22_34; + bra.uni BB22_76; -BB11_34: +BB22_34: cvt.rni.s64.f64 %rd13, %fd1; cvt.rni.s64.f64 %rd14, %fd2; cvt.u32.u64 %r28, %rd13; cvt.u32.u64 %r29, %rd14; and.b32 %r30, %r29, %r28; - setp.eq.s32 %p48, %r30, 0; - selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48; - bra.uni BB11_72; + setp.eq.s32 %p46, %r30, 0; + selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46; + bra.uni BB22_76; -BB11_55: - setp.le.f64 %p53, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53; - bra.uni BB11_72; +BB22_57: + setp.gtu.f64 %p51, %fd1, %fd2; + selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p51; + bra.uni BB22_76; -BB11_24: +BB22_24: setp.eq.s32 %p27, %r13, 8; - @%p27 bra BB11_25; - bra.uni BB11_72; + @%p27 bra BB22_25; + bra.uni BB22_76; -BB11_25: - setp.ge.f64 %p51, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51; - bra.uni BB11_72; +BB22_25: + setp.ltu.f64 %p49, %fd1, %fd2; + selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p49; + bra.uni BB22_76; -BB11_49: - setp.neu.f64 %p46, %fd1, 0d0000000000000000; +BB22_51: + setp.neu.f64 %p44, %fd1, 0d0000000000000000; sub.f64 %fd42, %fd1, %fd2; - selp.f64 %fd54, %fd42, 0d0000000000000000, %p46; - bra.uni BB11_72; + selp.f64 %fd55, %fd42, 0d0000000000000000, %p44; + bra.uni BB22_76; -BB11_41: +BB22_41: setp.ne.s32 %p14, %r13, 18; - @%p14 bra BB11_72; + @%p14 bra BB22_76; - div.rn.f64 %fd54, %fd1, %fd2; - abs.f64 %fd37, %fd54; + div.rn.f64 %fd55, %fd1, %fd2; + abs.f64 %fd37, %fd55; setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000; - @%p35 bra BB11_72; + @%p35 bra BB22_76; { .reg .b32 %temp; - mov.b64 {%r19, %temp}, %fd54; + mov.b64 {%temp, %r19}, %fd55; } + and.b32 %r20, %r19, 2147483647; + setp.ne.s32 %p36, %r20, 2146435072; + @%p36 bra BB22_45; + { .reg .b32 %temp; - 
mov.b64 {%temp, %r20}, %fd54; + mov.b64 {%r21, %temp}, %fd55; } - and.b32 %r21, %r20, 2147483647; - setp.ne.s32 %p36, %r21, 2146435072; - setp.ne.s32 %p37, %r19, 0; - or.pred %p38, %p36, %p37; - @!%p38 bra BB11_72; - bra.uni BB11_44; + setp.eq.s32 %p37, %r21, 0; + @%p37 bra BB22_76; -BB11_44: - cvt.rmi.f64.f64 %fd54, %fd54; - bra.uni BB11_72; +BB22_45: + cvt.rmi.f64.f64 %fd55, %fd55; + bra.uni BB22_76; -BB11_59: - setp.gt.s32 %p58, %r8, -1; - @%p58 bra BB11_62; +BB22_61: + setp.gt.s32 %p56, %r8, -1; + @%p56 bra BB22_64; cvt.rzi.f64.f64 %fd45, %fd2; - setp.neu.f64 %p59, %fd45, %fd2; - selp.f64 %fd52, 0dFFF8000000000000, %fd52, %p59; + setp.neu.f64 %p57, %fd45, %fd2; + selp.f64 %fd53, 0dFFF8000000000000, %fd53, %p57; -BB11_62: - mov.f64 %fd25, %fd52; +BB22_64: + mov.f64 %fd25, %fd53; add.f64 %fd26, %fd1, %fd2; { .reg .b32 %temp; mov.b64 {%temp, %r40}, %fd26; } and.b32 %r41, %r40, 2146435072; - setp.ne.s32 %p62, %r41, 2146435072; - mov.f64 %fd51, %fd25; - @%p62 bra BB11_69; + setp.ne.s32 %p60, %r41, 2146435072; + mov.f64 %fd52, %fd25; + @%p60 bra BB22_73; - setp.gtu.f64 %p63, %fd19, 0d7FF0000000000000; - mov.f64 %fd51, %fd26; - @%p63 bra BB11_69; + setp.gtu.f64 %p61, %fd19, 0d7FF0000000000000; + mov.f64 %fd52, %fd26; + @%p61 bra BB22_73; abs.f64 %fd46, %fd2; - setp.gtu.f64 %p64, %fd46, 0d7FF0000000000000; - mov.f64 %fd50, %fd26; - mov.f64 %fd51, %fd50; - @%p64 bra BB11_69; + setp.gtu.f64 %p62, %fd46, 0d7FF0000000000000; + mov.f64 %fd51, %fd26; + mov.f64 %fd52, %fd51; + @%p62 bra BB22_73; + + and.b32 %r42, %r9, 2147483647; + setp.ne.s32 %p63, %r42, 2146435072; + @%p63 bra BB22_69; { .reg .b32 %temp; - mov.b64 {%r42, %temp}, %fd2; + mov.b64 {%r43, %temp}, %fd2; } - and.b32 %r43, %r9, 2147483647; - setp.eq.s32 %p65, %r43, 2146435072; - setp.eq.s32 %p66, %r42, 0; - and.pred %p67, %p65, %p66; - @%p67 bra BB11_68; - bra.uni BB11_66; - -BB11_68: - setp.gt.f64 %p71, %fd19, 0d3FF0000000000000; - selp.b32 %r51, 2146435072, 0, %p71; - xor.b32 %r52, %r51, 2146435072; - setp.lt.s32 %p72, %r9, 0; - selp.b32 %r53, %r52, %r51, %p72; - setp.eq.f64 %p73, %fd1, 0dBFF0000000000000; - selp.b32 %r54, 1072693248, %r53, %p73; - mov.u32 %r55, 0; - mov.b64 %fd51, {%r55, %r54}; - bra.uni BB11_69; - -BB11_66: + setp.eq.s32 %p64, %r43, 0; + @%p64 bra BB22_72; + +BB22_69: + and.b32 %r44, %r8, 2147483647; + setp.ne.s32 %p65, %r44, 2146435072; + mov.f64 %fd49, %fd25; + mov.f64 %fd52, %fd49; + @%p65 bra BB22_73; + { .reg .b32 %temp; - mov.b64 {%r44, %temp}, %fd1; + mov.b64 {%r45, %temp}, %fd1; } - and.b32 %r45, %r8, 2147483647; - setp.eq.s32 %p68, %r45, 2146435072; - setp.eq.s32 %p69, %r44, 0; - and.pred %p70, %p68, %p69; - mov.f64 %fd51, %fd25; - @!%p70 bra BB11_69; - bra.uni BB11_67; - -BB11_67: + setp.ne.s32 %p66, %r45, 0; + mov.f64 %fd52, %fd25; + @%p66 bra BB22_73; + shr.s32 %r46, %r9, 31; and.b32 %r47, %r46, -2146435072; - selp.b32 %r48, -1048576, 2146435072, %p1; - add.s32 %r49, %r48, %r47; - mov.u32 %r50, 0; - mov.b64 %fd51, {%r50, %r49}; - -BB11_69: - setp.eq.f64 %p74, %fd2, 0d0000000000000000; - setp.eq.f64 %p75, %fd1, 0d3FF0000000000000; - or.pred %p76, %p75, %p74; - selp.f64 %fd54, 0d3FF0000000000000, %fd51, %p76; - -BB11_72: + add.s32 %r48, %r47, 2146435072; + or.b32 %r49, %r48, -2147483648; + selp.b32 %r50, %r49, %r48, %p1; + mov.u32 %r51, 0; + mov.b64 %fd52, {%r51, %r50}; + bra.uni BB22_73; + +BB22_72: + setp.gt.f64 %p67, %fd19, 0d3FF0000000000000; + selp.b32 %r52, 2146435072, 0, %p67; + xor.b32 %r53, %r52, 2146435072; + setp.lt.s32 %p68, %r9, 0; + selp.b32 %r54, %r53, %r52, %p68; + setp.eq.f64 %p69, %fd1, 
0dBFF0000000000000; + selp.b32 %r55, 1072693248, %r54, %p69; + mov.u32 %r56, 0; + mov.b64 %fd52, {%r56, %r55}; + +BB22_73: + setp.eq.f64 %p70, %fd2, 0d0000000000000000; + setp.eq.f64 %p71, %fd1, 0d3FF0000000000000; + or.pred %p72, %p71, %p70; + selp.f64 %fd55, 0d3FF0000000000000, %fd52, %p72; + +BB22_76: cvta.to.global.u64 %rd16, %rd4; mul.wide.s32 %rd17, %r3, 8; add.s64 %rd18, %rd16, %rd17; - st.global.f64 [%rd18], %fd54; + st.global.f64 [%rd18], %fd55; bar.sync 0; -BB11_73: +BB22_77: ret; } - // .globl matrix_scalar_op -.visible .entry matrix_scalar_op( - .param .u64 matrix_scalar_op_param_0, - .param .f64 matrix_scalar_op_param_1, - .param .u64 matrix_scalar_op_param_2, - .param .u32 matrix_scalar_op_param_3, - .param .u32 matrix_scalar_op_param_4, - .param .u32 matrix_scalar_op_param_5 + // .globl matrix_matrix_cellwise_op_f +.visible .entry matrix_matrix_cellwise_op_f( + .param .u64 matrix_matrix_cellwise_op_f_param_0, + .param .u64 matrix_matrix_cellwise_op_f_param_1, + .param .u64 matrix_matrix_cellwise_op_f_param_2, + .param .u32 matrix_matrix_cellwise_op_f_param_3, + .param .u32 matrix_matrix_cellwise_op_f_param_4, + .param .u32 matrix_matrix_cellwise_op_f_param_5, + .param .u32 matrix_matrix_cellwise_op_f_param_6, + .param .u32 matrix_matrix_cellwise_op_f_param_7 ) { - .reg .pred %p<141>; - .reg .b32 %r<86>; - .reg .f64 %fd<107>; - .reg .b64 %rd<20>; + .reg .pred %p<76>; + .reg .f32 %f<134>; + .reg .b32 %r<51>; + .reg .b64 %rd<17>; - ld.param.u64 %rd4, [matrix_scalar_op_param_0]; - ld.param.f64 %fd68, [matrix_scalar_op_param_1]; - ld.param.u64 %rd5, [matrix_scalar_op_param_2]; - ld.param.u32 %r8, [matrix_scalar_op_param_3]; - ld.param.u32 %r6, [matrix_scalar_op_param_4]; - ld.param.u32 %r7, [matrix_scalar_op_param_5]; - mov.u32 %r9, %ntid.x; - mov.u32 %r10, %ctaid.x; - mov.u32 %r11, %tid.x; - mad.lo.s32 %r1, %r9, %r10, %r11; - setp.ge.s32 %p3, %r1, %r8; - @%p3 bra BB12_130; + ld.param.u64 %rd1, [matrix_matrix_cellwise_op_f_param_0]; + ld.param.u64 %rd2, [matrix_matrix_cellwise_op_f_param_1]; + ld.param.u64 %rd3, [matrix_matrix_cellwise_op_f_param_2]; + ld.param.u32 %r12, [matrix_matrix_cellwise_op_f_param_3]; + ld.param.u32 %r8, [matrix_matrix_cellwise_op_f_param_4]; + ld.param.u32 %r9, [matrix_matrix_cellwise_op_f_param_5]; + ld.param.u32 %r10, [matrix_matrix_cellwise_op_f_param_6]; + ld.param.u32 %r11, [matrix_matrix_cellwise_op_f_param_7]; + mov.u32 %r13, %ntid.x; + mov.u32 %r14, %ctaid.x; + mov.u32 %r15, %tid.x; + mad.lo.s32 %r16, %r13, %r14, %r15; + div.s32 %r1, %r16, %r8; + rem.s32 %r2, %r16, %r8; + setp.lt.s32 %p2, %r1, %r12; + setp.gt.s32 %p3, %r8, -1; + and.pred %p4, %p2, %p3; + @!%p4 bra BB23_71; + bra.uni BB23_1; - cvta.to.global.u64 %rd6, %rd5; - cvta.to.global.u64 %rd7, %rd4; - mul.wide.s32 %rd8, %r1, 8; - add.s64 %rd9, %rd7, %rd8; - ld.global.f64 %fd1, [%rd9]; - add.s64 %rd1, %rd6, %rd8; - setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB12_66; +BB23_1: + mad.lo.s32 %r3, %r1, %r8, %r2; + setp.eq.s32 %p5, %r9, 1; + mov.u32 %r49, %r1; + @%p5 bra BB23_5; - mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF; - setp.gt.s32 %p5, %r6, 8; - @%p5 bra BB12_19; + setp.ne.s32 %p6, %r9, 2; + mov.u32 %r50, %r3; + @%p6 bra BB23_4; - setp.gt.s32 %p19, %r6, 3; - @%p19 bra BB12_11; + mov.u32 %r50, %r2; - setp.gt.s32 %p26, %r6, 1; - @%p26 bra BB12_8; +BB23_4: + mov.u32 %r44, %r50; + mov.u32 %r4, %r44; + mov.u32 %r49, %r4; - setp.eq.s32 %p29, %r6, 0; - @%p29 bra BB12_64; - bra.uni BB12_6; +BB23_5: + mov.u32 %r5, %r49; + setp.eq.s32 %p7, %r10, 1; + mov.u32 %r47, %r1; + @%p7 bra BB23_9; -BB12_64: - add.f64 %fd98, 
%fd1, %fd68; - bra.uni BB12_65; + setp.ne.s32 %p8, %r10, 2; + mov.u32 %r48, %r3; + @%p8 bra BB23_8; -BB12_66: - mov.f64 %fd106, 0d7FEFFFFFFFFFFFFF; - setp.gt.s32 %p73, %r6, 8; - @%p73 bra BB12_83; + mov.u32 %r48, %r2; - setp.gt.s32 %p87, %r6, 3; - @%p87 bra BB12_75; +BB23_8: + mov.u32 %r47, %r48; - setp.gt.s32 %p94, %r6, 1; - @%p94 bra BB12_72; +BB23_9: + cvta.to.global.u64 %rd4, %rd2; + cvta.to.global.u64 %rd5, %rd1; + mul.wide.s32 %rd6, %r5, 4; + add.s64 %rd7, %rd5, %rd6; + ld.global.f32 %f1, [%rd7]; + mul.wide.s32 %rd8, %r47, 4; + add.s64 %rd9, %rd4, %rd8; + ld.global.f32 %f2, [%rd9]; + mov.f32 %f133, 0f7F7FFFFF; + setp.gt.s32 %p9, %r11, 8; + @%p9 bra BB23_26; - setp.eq.s32 %p97, %r6, 0; - @%p97 bra BB12_128; - bra.uni BB12_70; + setp.gt.s32 %p23, %r11, 3; + @%p23 bra BB23_18; -BB12_128: - add.f64 %fd106, %fd1, %fd68; - bra.uni BB12_129; + setp.gt.s32 %p30, %r11, 1; + @%p30 bra BB23_15; -BB12_19: - setp.gt.s32 %p6, %r6, 13; - @%p6 bra BB12_28; + setp.eq.s32 %p33, %r11, 0; + @%p33 bra BB23_69; + bra.uni BB23_13; - setp.gt.s32 %p13, %r6, 10; - @%p13 bra BB12_24; +BB23_69: + add.f32 %f133, %f1, %f2; + bra.uni BB23_70; - setp.eq.s32 %p17, %r6, 9; - @%p17 bra BB12_46; - bra.uni BB12_22; +BB23_26: + setp.gt.s32 %p10, %r11, 13; + @%p10 bra BB23_35; -BB12_46: - setp.eq.f64 %p46, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46; - bra.uni BB12_65; + setp.gt.s32 %p17, %r11, 10; + @%p17 bra BB23_31; -BB12_83: - setp.gt.s32 %p74, %r6, 13; - @%p74 bra BB12_92; + setp.eq.s32 %p21, %r11, 9; + @%p21 bra BB23_51; + bra.uni BB23_29; + +BB23_51: + setp.eq.f32 %p44, %f1, %f2; + selp.f32 %f133, 0f3F800000, 0f00000000, %p44; + bra.uni BB23_70; + +BB23_18: + setp.gt.s32 %p24, %r11, 5; + @%p24 bra BB23_22; + + setp.eq.s32 %p28, %r11, 4; + @%p28 bra BB23_54; + bra.uni BB23_20; + +BB23_54: + mul.f32 %f53, %f2, 0f3F000000; + cvt.rzi.f32.f32 %f54, %f53; + fma.rn.f32 %f55, %f54, 0fC0000000, %f2; + abs.f32 %f19, %f55; + abs.f32 %f20, %f1; + setp.lt.f32 %p49, %f20, 0f00800000; + mul.f32 %f56, %f20, 0f4B800000; + selp.f32 %f57, 0fC3170000, 0fC2FE0000, %p49; + selp.f32 %f58, %f56, %f20, %p49; + mov.b32 %r23, %f58; + and.b32 %r24, %r23, 8388607; + or.b32 %r25, %r24, 1065353216; + mov.b32 %f59, %r25; + shr.u32 %r26, %r23, 23; + cvt.rn.f32.u32 %f60, %r26; + add.f32 %f61, %f57, %f60; + setp.gt.f32 %p50, %f59, 0f3FB504F3; + mul.f32 %f62, %f59, 0f3F000000; + add.f32 %f63, %f61, 0f3F800000; + selp.f32 %f64, %f62, %f59, %p50; + selp.f32 %f65, %f63, %f61, %p50; + add.f32 %f66, %f64, 0fBF800000; + add.f32 %f50, %f64, 0f3F800000; + // inline asm + rcp.approx.ftz.f32 %f49,%f50; + // inline asm + add.f32 %f67, %f66, %f66; + mul.f32 %f68, %f49, %f67; + mul.f32 %f69, %f68, %f68; + mov.f32 %f70, 0f3C4CAF63; + mov.f32 %f71, 0f3B18F0FE; + fma.rn.f32 %f72, %f71, %f69, %f70; + mov.f32 %f73, 0f3DAAAABD; + fma.rn.f32 %f74, %f72, %f69, %f73; + mul.rn.f32 %f75, %f74, %f69; + mul.rn.f32 %f76, %f75, %f68; + sub.f32 %f77, %f66, %f68; + neg.f32 %f78, %f68; + add.f32 %f79, %f77, %f77; + fma.rn.f32 %f80, %f78, %f66, %f79; + mul.rn.f32 %f81, %f49, %f80; + add.f32 %f82, %f76, %f68; + sub.f32 %f83, %f68, %f82; + add.f32 %f84, %f76, %f83; + add.f32 %f85, %f81, %f84; + add.f32 %f86, %f82, %f85; + sub.f32 %f87, %f82, %f86; + add.f32 %f88, %f85, %f87; + mov.f32 %f89, 0f3F317200; + mul.rn.f32 %f90, %f65, %f89; + mov.f32 %f91, 0f35BFBE8E; + mul.rn.f32 %f92, %f65, %f91; + add.f32 %f93, %f90, %f86; + sub.f32 %f94, %f90, %f93; + add.f32 %f95, %f86, %f94; + add.f32 %f96, %f88, %f95; + add.f32 %f97, %f92, %f96; + add.f32 %f98, %f93, 
%f97; + sub.f32 %f99, %f93, %f98; + add.f32 %f100, %f97, %f99; + abs.f32 %f21, %f2; + setp.gt.f32 %p51, %f21, 0f77F684DF; + mul.f32 %f101, %f2, 0f39000000; + selp.f32 %f102, %f101, %f2, %p51; + mul.rn.f32 %f103, %f102, %f98; + neg.f32 %f104, %f103; + fma.rn.f32 %f105, %f102, %f98, %f104; + fma.rn.f32 %f106, %f102, %f100, %f105; + mov.f32 %f107, 0f00000000; + fma.rn.f32 %f108, %f107, %f98, %f106; + add.rn.f32 %f109, %f103, %f108; + neg.f32 %f110, %f109; + add.rn.f32 %f111, %f103, %f110; + add.rn.f32 %f112, %f111, %f108; + mov.b32 %r27, %f109; + setp.eq.s32 %p52, %r27, 1118925336; + add.s32 %r28, %r27, -1; + mov.b32 %f113, %r28; + add.f32 %f114, %f112, 0f37000000; + selp.f32 %f115, %f113, %f109, %p52; + selp.f32 %f22, %f114, %f112, %p52; + mul.f32 %f116, %f115, 0f3FB8AA3B; + cvt.rzi.f32.f32 %f117, %f116; + mov.f32 %f118, 0fBF317200; + fma.rn.f32 %f119, %f117, %f118, %f115; + mov.f32 %f120, 0fB5BFBE8E; + fma.rn.f32 %f121, %f117, %f120, %f119; + mul.f32 %f52, %f121, 0f3FB8AA3B; + // inline asm + ex2.approx.ftz.f32 %f51,%f52; + // inline asm + add.f32 %f122, %f117, 0f00000000; + ex2.approx.f32 %f123, %f122; + mul.f32 %f124, %f51, %f123; + setp.lt.f32 %p53, %f115, 0fC2D20000; + selp.f32 %f125, 0f00000000, %f124, %p53; + setp.gt.f32 %p54, %f115, 0f42D20000; + selp.f32 %f131, 0f7F800000, %f125, %p54; + setp.eq.f32 %p55, %f131, 0f7F800000; + @%p55 bra BB23_56; + + fma.rn.f32 %f131, %f131, %f22, %f131; + +BB23_56: + setp.lt.f32 %p56, %f1, 0f00000000; + setp.eq.f32 %p57, %f19, 0f3F800000; + and.pred %p1, %p56, %p57; + mov.b32 %r29, %f131; + xor.b32 %r30, %r29, -2147483648; + mov.b32 %f126, %r30; + selp.f32 %f132, %f126, %f131, %p1; + setp.eq.f32 %p58, %f1, 0f00000000; + @%p58 bra BB23_59; + bra.uni BB23_57; + +BB23_59: + add.f32 %f128, %f1, %f1; + mov.b32 %r31, %f128; + selp.b32 %r32, %r31, 0, %p57; + or.b32 %r33, %r32, 2139095040; + setp.lt.f32 %p62, %f2, 0f00000000; + selp.b32 %r34, %r33, %r32, %p62; + mov.b32 %f132, %r34; + bra.uni BB23_60; - setp.gt.s32 %p81, %r6, 10; - @%p81 bra BB12_88; +BB23_35: + setp.gt.s32 %p11, %r11, 15; + @%p11 bra BB23_39; + + setp.eq.s32 %p15, %r11, 14; + @%p15 bra BB23_48; + bra.uni BB23_37; + +BB23_48: + cvt.rni.s64.f32 %rd10, %f1; + cvt.rni.s64.f32 %rd11, %f2; + cvt.u32.u64 %r17, %rd10; + cvt.u32.u64 %r18, %rd11; + or.b32 %r19, %r18, %r17; + setp.eq.s32 %p41, %r19, 0; + selp.f32 %f133, 0f00000000, 0f3F800000, %p41; + bra.uni BB23_70; - setp.eq.s32 %p85, %r6, 9; - @%p85 bra BB12_110; - bra.uni BB12_86; +BB23_15: + setp.eq.s32 %p31, %r11, 2; + @%p31 bra BB23_68; + bra.uni BB23_16; -BB12_110: - setp.eq.f64 %p114, %fd1, %fd68; - selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114; - bra.uni BB12_129; +BB23_68: + mul.f32 %f133, %f1, %f2; + bra.uni BB23_70; -BB12_11: - setp.gt.s32 %p20, %r6, 5; - @%p20 bra BB12_15; +BB23_31: + setp.eq.s32 %p18, %r11, 11; + @%p18 bra BB23_50; - setp.eq.s32 %p24, %r6, 4; - @%p24 bra BB12_49; - bra.uni BB12_13; + setp.eq.s32 %p19, %r11, 12; + @%p19 bra BB23_49; + bra.uni BB23_33; -BB12_49: - { - .reg .b32 %temp; - mov.b64 {%temp, %r2}, %fd68; - } - { - .reg .b32 %temp; - mov.b64 {%temp, %r3}, %fd1; - } - bfe.u32 %r24, %r3, 20, 11; +BB23_49: + max.f32 %f133, %f1, %f2; + bra.uni BB23_70; + +BB23_22: + setp.eq.s32 %p25, %r11, 6; + @%p25 bra BB23_53; + + setp.eq.s32 %p26, %r11, 7; + @%p26 bra BB23_52; + bra.uni BB23_24; + +BB23_52: + setp.gt.f32 %p46, %f1, %f2; + selp.f32 %f133, 0f3F800000, 0f00000000, %p46; + bra.uni BB23_70; + +BB23_39: + setp.eq.s32 %p12, %r11, 16; + @%p12 bra BB23_47; + + setp.eq.s32 %p13, %r11, 17; + @%p13 bra 
BB23_44; + bra.uni BB23_41; + +BB23_44: + setp.eq.f32 %p36, %f2, 0f00000000; + setp.eq.f32 %p37, %f2, 0f80000000; + or.pred %p38, %p36, %p37; + mov.f32 %f133, 0f7FC00000; + @%p38 bra BB23_70; + + div.rn.f32 %f133, %f1, %f2; + abs.f32 %f43, %f133; + setp.geu.f32 %p39, %f43, 0f7F800000; + @%p39 bra BB23_70; + + cvt.rmi.f32.f32 %f44, %f133; + mul.f32 %f45, %f2, %f44; + sub.f32 %f133, %f1, %f45; + bra.uni BB23_70; + +BB23_13: + setp.eq.s32 %p34, %r11, 1; + @%p34 bra BB23_14; + bra.uni BB23_70; + +BB23_14: + sub.f32 %f133, %f1, %f2; + bra.uni BB23_70; + +BB23_29: + setp.eq.s32 %p22, %r11, 10; + @%p22 bra BB23_30; + bra.uni BB23_70; + +BB23_30: + setp.neu.f32 %p43, %f1, %f2; + selp.f32 %f133, 0f3F800000, 0f00000000, %p43; + bra.uni BB23_70; + +BB23_20: + setp.eq.s32 %p29, %r11, 5; + @%p29 bra BB23_21; + bra.uni BB23_70; + +BB23_21: + setp.lt.f32 %p48, %f1, %f2; + selp.f32 %f133, 0f3F800000, 0f00000000, %p48; + bra.uni BB23_70; + +BB23_37: + setp.eq.s32 %p16, %r11, 15; + @%p16 bra BB23_38; + bra.uni BB23_70; + +BB23_38: + mul.f32 %f47, %f1, %f2; + mov.f32 %f48, 0f3F800000; + sub.f32 %f133, %f48, %f47; + bra.uni BB23_70; + +BB23_16: + setp.eq.s32 %p32, %r11, 3; + @%p32 bra BB23_17; + bra.uni BB23_70; + +BB23_17: + div.rn.f32 %f133, %f1, %f2; + bra.uni BB23_70; + +BB23_50: + min.f32 %f133, %f1, %f2; + bra.uni BB23_70; + +BB23_33: + setp.eq.s32 %p20, %r11, 13; + @%p20 bra BB23_34; + bra.uni BB23_70; + +BB23_34: + cvt.rni.s64.f32 %rd12, %f1; + cvt.rni.s64.f32 %rd13, %f2; + cvt.u32.u64 %r20, %rd12; + cvt.u32.u64 %r21, %rd13; + and.b32 %r22, %r21, %r20; + setp.eq.s32 %p42, %r22, 0; + selp.f32 %f133, 0f00000000, 0f3F800000, %p42; + bra.uni BB23_70; + +BB23_53: + setp.gtu.f32 %p47, %f1, %f2; + selp.f32 %f133, 0f00000000, 0f3F800000, %p47; + bra.uni BB23_70; + +BB23_24: + setp.eq.s32 %p27, %r11, 8; + @%p27 bra BB23_25; + bra.uni BB23_70; + +BB23_25: + setp.ltu.f32 %p45, %f1, %f2; + selp.f32 %f133, 0f00000000, 0f3F800000, %p45; + bra.uni BB23_70; + +BB23_47: + setp.neu.f32 %p40, %f1, 0f00000000; + sub.f32 %f46, %f1, %f2; + selp.f32 %f133, %f46, 0f00000000, %p40; + bra.uni BB23_70; + +BB23_41: + setp.ne.s32 %p14, %r11, 18; + @%p14 bra BB23_70; + + div.rn.f32 %f133, %f1, %f2; + abs.f32 %f41, %f133; + setp.geu.f32 %p35, %f41, 0f7F800000; + @%p35 bra BB23_70; + + cvt.rmi.f32.f32 %f133, %f133; + bra.uni BB23_70; + +BB23_57: + setp.geu.f32 %p59, %f1, 0f00000000; + @%p59 bra BB23_60; + + cvt.rzi.f32.f32 %f127, %f2; + setp.neu.f32 %p60, %f127, %f2; + selp.f32 %f132, 0f7FFFFFFF, %f132, %p60; + +BB23_60: + add.f32 %f129, %f20, %f21; + mov.b32 %r35, %f129; + setp.lt.s32 %p63, %r35, 2139095040; + @%p63 bra BB23_67; + + setp.gtu.f32 %p64, %f20, 0f7F800000; + setp.gtu.f32 %p65, %f21, 0f7F800000; + or.pred %p66, %p64, %p65; + @%p66 bra BB23_66; + bra.uni BB23_62; + +BB23_66: + add.f32 %f132, %f1, %f2; + bra.uni BB23_67; + +BB23_62: + setp.eq.f32 %p67, %f21, 0f7F800000; + @%p67 bra BB23_65; + bra.uni BB23_63; + +BB23_65: + setp.gt.f32 %p70, %f20, 0f3F800000; + selp.b32 %r39, 2139095040, 0, %p70; + xor.b32 %r40, %r39, 2139095040; + setp.lt.f32 %p71, %f2, 0f00000000; + selp.b32 %r41, %r40, %r39, %p71; + mov.b32 %f130, %r41; + setp.eq.f32 %p72, %f1, 0fBF800000; + selp.f32 %f132, 0f3F800000, %f130, %p72; + bra.uni BB23_67; + +BB23_63: + setp.neu.f32 %p68, %f20, 0f7F800000; + @%p68 bra BB23_67; + + setp.ltu.f32 %p69, %f2, 0f00000000; + selp.b32 %r36, 0, 2139095040, %p69; + or.b32 %r37, %r36, -2147483648; + selp.b32 %r38, %r37, %r36, %p1; + mov.b32 %f132, %r38; + +BB23_67: + setp.eq.f32 %p73, %f2, 0f00000000; + setp.eq.f32 
%p74, %f1, 0f3F800000; + or.pred %p75, %p74, %p73; + selp.f32 %f133, 0f3F800000, %f132, %p75; + +BB23_70: + cvta.to.global.u64 %rd14, %rd3; + mul.wide.s32 %rd15, %r3, 4; + add.s64 %rd16, %rd14, %rd15; + st.global.f32 [%rd16], %f133; + bar.sync 0; + +BB23_71: + ret; +} + + // .globl matrix_scalar_op_d +.visible .entry matrix_scalar_op_d( + .param .u64 matrix_scalar_op_d_param_0, + .param .f64 matrix_scalar_op_d_param_1, + .param .u64 matrix_scalar_op_d_param_2, + .param .u32 matrix_scalar_op_d_param_3, + .param .u32 matrix_scalar_op_d_param_4, + .param .u32 matrix_scalar_op_d_param_5 +) +{ + .reg .pred %p<133>; + .reg .b32 %r<88>; + .reg .f64 %fd<109>; + .reg .b64 %rd<20>; + + + ld.param.u64 %rd4, [matrix_scalar_op_d_param_0]; + ld.param.f64 %fd68, [matrix_scalar_op_d_param_1]; + ld.param.u64 %rd5, [matrix_scalar_op_d_param_2]; + ld.param.u32 %r8, [matrix_scalar_op_d_param_3]; + ld.param.u32 %r6, [matrix_scalar_op_d_param_4]; + ld.param.u32 %r7, [matrix_scalar_op_d_param_5]; + mov.u32 %r9, %ntid.x; + mov.u32 %r10, %ctaid.x; + mov.u32 %r11, %tid.x; + mad.lo.s32 %r1, %r9, %r10, %r11; + setp.ge.s32 %p3, %r1, %r8; + @%p3 bra BB24_138; + + cvta.to.global.u64 %rd6, %rd5; + cvta.to.global.u64 %rd7, %rd4; + mul.wide.s32 %rd8, %r1, 8; + add.s64 %rd9, %rd7, %rd8; + ld.global.f64 %fd1, [%rd9]; + add.s64 %rd1, %rd6, %rd8; + setp.eq.s32 %p4, %r7, 0; + @%p4 bra BB24_70; + + mov.f64 %fd99, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p5, %r6, 8; + @%p5 bra BB24_19; + + setp.gt.s32 %p19, %r6, 3; + @%p19 bra BB24_11; + + setp.gt.s32 %p26, %r6, 1; + @%p26 bra BB24_8; + + setp.eq.s32 %p29, %r6, 0; + @%p29 bra BB24_68; + bra.uni BB24_6; + +BB24_68: + add.f64 %fd99, %fd1, %fd68; + bra.uni BB24_69; + +BB24_70: + mov.f64 %fd108, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p69, %r6, 8; + @%p69 bra BB24_87; + + setp.gt.s32 %p83, %r6, 3; + @%p83 bra BB24_79; + + setp.gt.s32 %p90, %r6, 1; + @%p90 bra BB24_76; + + setp.eq.s32 %p93, %r6, 0; + @%p93 bra BB24_136; + bra.uni BB24_74; + +BB24_136: + add.f64 %fd108, %fd1, %fd68; + bra.uni BB24_137; + +BB24_19: + setp.gt.s32 %p6, %r6, 13; + @%p6 bra BB24_28; + + setp.gt.s32 %p13, %r6, 10; + @%p13 bra BB24_24; + + setp.eq.s32 %p17, %r6, 9; + @%p17 bra BB24_48; + bra.uni BB24_22; + +BB24_48: + setp.eq.f64 %p44, %fd1, %fd68; + selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p44; + bra.uni BB24_69; + +BB24_87: + setp.gt.s32 %p70, %r6, 13; + @%p70 bra BB24_96; + + setp.gt.s32 %p77, %r6, 10; + @%p77 bra BB24_92; + + setp.eq.s32 %p81, %r6, 9; + @%p81 bra BB24_116; + bra.uni BB24_90; + +BB24_116: + setp.eq.f64 %p108, %fd1, %fd68; + selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p108; + bra.uni BB24_137; + +BB24_11: + setp.gt.s32 %p20, %r6, 5; + @%p20 bra BB24_15; + + setp.eq.s32 %p24, %r6, 4; + @%p24 bra BB24_51; + bra.uni BB24_13; + +BB24_51: + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd68; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r3}, %fd1; + } + bfe.u32 %r24, %r3, 20, 11; add.s32 %r25, %r24, -1012; mov.b64 %rd14, %fd1; shl.b64 %rd2, %rd14, %r25; - setp.eq.s64 %p51, %rd2, -9223372036854775808; + setp.eq.s64 %p49, %rd2, -9223372036854775808; abs.f64 %fd18, %fd68; // Callseq Start 1 { @@ -1310,69 +2392,69 @@ BB12_49: param0, param1 ); - ld.param.f64 %fd97, [retval0+0]; + ld.param.f64 %fd98, [retval0+0]; //{ }// Callseq End 1 - setp.lt.s32 %p52, %r2, 0; - and.pred %p1, %p52, %p51; - @!%p1 bra BB12_51; - bra.uni BB12_50; + setp.lt.s32 %p50, %r2, 0; + and.pred %p1, %p50, %p49; + @!%p1 bra BB24_53; + bra.uni BB24_52; -BB12_50: +BB24_52: { .reg .b32 %temp; - mov.b64 {%temp, 
%r26}, %fd97; + mov.b64 {%temp, %r26}, %fd98; } xor.b32 %r27, %r26, -2147483648; { .reg .b32 %temp; - mov.b64 {%r28, %temp}, %fd97; + mov.b64 {%r28, %temp}, %fd98; } - mov.b64 %fd97, {%r28, %r27}; + mov.b64 %fd98, {%r28, %r27}; -BB12_51: - mov.f64 %fd96, %fd97; - setp.eq.f64 %p53, %fd68, 0d0000000000000000; - @%p53 bra BB12_54; - bra.uni BB12_52; +BB24_53: + mov.f64 %fd97, %fd98; + setp.eq.f64 %p51, %fd68, 0d0000000000000000; + @%p51 bra BB24_56; + bra.uni BB24_54; -BB12_54: - selp.b32 %r29, %r2, 0, %p51; +BB24_56: + selp.b32 %r29, %r2, 0, %p49; or.b32 %r30, %r29, 2146435072; - setp.lt.s32 %p57, %r3, 0; - selp.b32 %r31, %r30, %r29, %p57; + setp.lt.s32 %p55, %r3, 0; + selp.b32 %r31, %r30, %r29, %p55; mov.u32 %r32, 0; - mov.b64 %fd96, {%r32, %r31}; - bra.uni BB12_55; + mov.b64 %fd97, {%r32, %r31}; + bra.uni BB24_57; -BB12_28: +BB24_28: setp.gt.s32 %p7, %r6, 15; - @%p7 bra BB12_32; + @%p7 bra BB24_32; setp.eq.s32 %p11, %r6, 14; - @%p11 bra BB12_43; - bra.uni BB12_30; + @%p11 bra BB24_45; + bra.uni BB24_30; -BB12_43: +BB24_45: cvt.rni.s64.f64 %rd10, %fd68; cvt.rni.s64.f64 %rd11, %fd1; cvt.u32.u64 %r18, %rd10; cvt.u32.u64 %r19, %rd11; or.b32 %r20, %r19, %r18; - setp.eq.s32 %p43, %r20, 0; - selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43; - bra.uni BB12_65; + setp.eq.s32 %p41, %r20, 0; + selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p41; + bra.uni BB24_69; -BB12_75: - setp.gt.s32 %p88, %r6, 5; - @%p88 bra BB12_79; +BB24_79: + setp.gt.s32 %p84, %r6, 5; + @%p84 bra BB24_83; - setp.eq.s32 %p92, %r6, 4; - @%p92 bra BB12_113; - bra.uni BB12_77; + setp.eq.s32 %p88, %r6, 4; + @%p88 bra BB24_119; + bra.uni BB24_81; -BB12_113: +BB24_119: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -1381,11 +2463,11 @@ BB12_113: .reg .b32 %temp; mov.b64 {%temp, %r5}, %fd68; } - bfe.u32 %r61, %r5, 20, 11; - add.s32 %r62, %r61, -1012; + bfe.u32 %r62, %r5, 20, 11; + add.s32 %r63, %r62, -1012; mov.b64 %rd19, %fd68; - shl.b64 %rd3, %rd19, %r62; - setp.eq.s64 %p119, %rd3, -9223372036854775808; + shl.b64 %rd3, %rd19, %r63; + setp.eq.s64 %p113, %rd3, -9223372036854775808; abs.f64 %fd51, %fd1; // Callseq Start 2 { @@ -1402,621 +2484,1482 @@ BB12_113: param0, param1 ); - ld.param.f64 %fd105, [retval0+0]; + ld.param.f64 %fd107, [retval0+0]; //{ }// Callseq End 2 - setp.lt.s32 %p120, %r4, 0; - and.pred %p2, %p120, %p119; - @!%p2 bra BB12_115; - bra.uni BB12_114; + setp.lt.s32 %p114, %r4, 0; + and.pred %p2, %p114, %p113; + @!%p2 bra BB24_121; + bra.uni BB24_120; -BB12_114: +BB24_120: { .reg .b32 %temp; - mov.b64 {%temp, %r63}, %fd105; + mov.b64 {%temp, %r64}, %fd107; } - xor.b32 %r64, %r63, -2147483648; + xor.b32 %r65, %r64, -2147483648; { .reg .b32 %temp; - mov.b64 {%r65, %temp}, %fd105; + mov.b64 {%r66, %temp}, %fd107; } - mov.b64 %fd105, {%r65, %r64}; - -BB12_115: - mov.f64 %fd104, %fd105; - setp.eq.f64 %p121, %fd1, 0d0000000000000000; - @%p121 bra BB12_118; - bra.uni BB12_116; - -BB12_118: - selp.b32 %r66, %r4, 0, %p119; - or.b32 %r67, %r66, 2146435072; - setp.lt.s32 %p125, %r5, 0; - selp.b32 %r68, %r67, %r66, %p125; - mov.u32 %r69, 0; - mov.b64 %fd104, {%r69, %r68}; - bra.uni BB12_119; - -BB12_92: - setp.gt.s32 %p75, %r6, 15; - @%p75 bra BB12_96; - - setp.eq.s32 %p79, %r6, 14; - @%p79 bra BB12_107; - bra.uni BB12_94; - -BB12_107: + mov.b64 %fd107, {%r66, %r65}; + +BB24_121: + mov.f64 %fd106, %fd107; + setp.eq.f64 %p115, %fd1, 0d0000000000000000; + @%p115 bra BB24_124; + bra.uni BB24_122; + +BB24_124: + selp.b32 %r67, %r4, 0, %p113; + or.b32 %r68, %r67, 2146435072; + setp.lt.s32 %p119, %r5, 0; + 
selp.b32 %r69, %r68, %r67, %p119; + mov.u32 %r70, 0; + mov.b64 %fd106, {%r70, %r69}; + bra.uni BB24_125; + +BB24_96: + setp.gt.s32 %p71, %r6, 15; + @%p71 bra BB24_100; + + setp.eq.s32 %p75, %r6, 14; + @%p75 bra BB24_113; + bra.uni BB24_98; + +BB24_113: cvt.rni.s64.f64 %rd15, %fd1; cvt.rni.s64.f64 %rd16, %fd68; - cvt.u32.u64 %r55, %rd15; - cvt.u32.u64 %r56, %rd16; - or.b32 %r57, %r56, %r55; - setp.eq.s32 %p111, %r57, 0; - selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111; - bra.uni BB12_129; - -BB12_8: + cvt.u32.u64 %r56, %rd15; + cvt.u32.u64 %r57, %rd16; + or.b32 %r58, %r57, %r56; + setp.eq.s32 %p105, %r58, 0; + selp.f64 %fd108, 0d0000000000000000, 0d3FF0000000000000, %p105; + bra.uni BB24_137; + +BB24_8: setp.eq.s32 %p27, %r6, 2; - @%p27 bra BB12_63; - bra.uni BB12_9; + @%p27 bra BB24_67; + bra.uni BB24_9; -BB12_63: - mul.f64 %fd98, %fd1, %fd68; - bra.uni BB12_65; +BB24_67: + mul.f64 %fd99, %fd1, %fd68; + bra.uni BB24_69; -BB12_24: +BB24_24: setp.eq.s32 %p14, %r6, 11; - @%p14 bra BB12_45; + @%p14 bra BB24_47; setp.eq.s32 %p15, %r6, 12; - @%p15 bra BB12_44; - bra.uni BB12_26; + @%p15 bra BB24_46; + bra.uni BB24_26; -BB12_44: - max.f64 %fd98, %fd68, %fd1; - bra.uni BB12_65; +BB24_46: + max.f64 %fd99, %fd68, %fd1; + bra.uni BB24_69; -BB12_15: +BB24_15: setp.eq.s32 %p21, %r6, 6; - @%p21 bra BB12_48; + @%p21 bra BB24_50; setp.eq.s32 %p22, %r6, 7; - @%p22 bra BB12_47; - bra.uni BB12_17; + @%p22 bra BB24_49; + bra.uni BB24_17; -BB12_47: - setp.lt.f64 %p48, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48; - bra.uni BB12_65; +BB24_49: + setp.lt.f64 %p46, %fd1, %fd68; + selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p46; + bra.uni BB24_69; -BB12_32: +BB24_32: setp.eq.s32 %p8, %r6, 16; - @%p8 bra BB12_42; + @%p8 bra BB24_44; setp.eq.s32 %p9, %r6, 17; - @%p9 bra BB12_38; - bra.uni BB12_34; + @%p9 bra BB24_39; + bra.uni BB24_34; -BB12_38: - setp.eq.f64 %p35, %fd1, 0d0000000000000000; - setp.eq.f64 %p36, %fd1, 0d8000000000000000; - or.pred %p37, %p35, %p36; - mov.f64 %fd98, 0d7FF8000000000000; - @%p37 bra BB12_65; +BB24_39: + setp.eq.f64 %p34, %fd1, 0d0000000000000000; + setp.eq.f64 %p35, %fd1, 0d8000000000000000; + or.pred %p36, %p34, %p35; + mov.f64 %fd99, 0d7FF8000000000000; + @%p36 bra BB24_69; - div.rn.f64 %fd98, %fd68, %fd1; - abs.f64 %fd72, %fd98; - setp.gtu.f64 %p38, %fd72, 0d7FF0000000000000; - @%p38 bra BB12_65; + div.rn.f64 %fd99, %fd68, %fd1; + abs.f64 %fd72, %fd99; + setp.gtu.f64 %p37, %fd72, 0d7FF0000000000000; + @%p37 bra BB24_69; { .reg .b32 %temp; - mov.b64 {%r15, %temp}, %fd98; + mov.b64 {%temp, %r15}, %fd99; } + and.b32 %r16, %r15, 2147483647; + setp.ne.s32 %p38, %r16, 2146435072; + @%p38 bra BB24_43; + { .reg .b32 %temp; - mov.b64 {%temp, %r16}, %fd98; + mov.b64 {%r17, %temp}, %fd99; } - and.b32 %r17, %r16, 2147483647; - setp.ne.s32 %p39, %r17, 2146435072; - setp.ne.s32 %p40, %r15, 0; - or.pred %p41, %p39, %p40; - @!%p41 bra BB12_65; - bra.uni BB12_41; - -BB12_41: - cvt.rmi.f64.f64 %fd73, %fd98; + setp.eq.s32 %p39, %r17, 0; + @%p39 bra BB24_69; + +BB24_43: + cvt.rmi.f64.f64 %fd73, %fd99; mul.f64 %fd74, %fd1, %fd73; - sub.f64 %fd98, %fd68, %fd74; - bra.uni BB12_65; - -BB12_72: - setp.eq.s32 %p95, %r6, 2; - @%p95 bra BB12_127; - bra.uni BB12_73; - -BB12_127: - mul.f64 %fd106, %fd1, %fd68; - bra.uni BB12_129; - -BB12_88: - setp.eq.s32 %p82, %r6, 11; - @%p82 bra BB12_109; - - setp.eq.s32 %p83, %r6, 12; - @%p83 bra BB12_108; - bra.uni BB12_90; - -BB12_108: - max.f64 %fd106, %fd1, %fd68; - bra.uni BB12_129; - -BB12_79: - setp.eq.s32 %p89, 
%r6, 6; - @%p89 bra BB12_112; - - setp.eq.s32 %p90, %r6, 7; - @%p90 bra BB12_111; - bra.uni BB12_81; - -BB12_111: - setp.gt.f64 %p116, %fd1, %fd68; - selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116; - bra.uni BB12_129; - -BB12_96: - setp.eq.s32 %p76, %r6, 16; - @%p76 bra BB12_106; - - setp.eq.s32 %p77, %r6, 17; - @%p77 bra BB12_102; - bra.uni BB12_98; - -BB12_102: - setp.eq.f64 %p103, %fd68, 0d0000000000000000; - setp.eq.f64 %p104, %fd68, 0d8000000000000000; - or.pred %p105, %p103, %p104; - mov.f64 %fd106, 0d7FF8000000000000; - @%p105 bra BB12_129; - - div.rn.f64 %fd106, %fd1, %fd68; - abs.f64 %fd83, %fd106; - setp.gtu.f64 %p106, %fd83, 0d7FF0000000000000; - @%p106 bra BB12_129; + sub.f64 %fd99, %fd68, %fd74; + bra.uni BB24_69; + +BB24_76: + setp.eq.s32 %p91, %r6, 2; + @%p91 bra BB24_135; + bra.uni BB24_77; + +BB24_135: + mul.f64 %fd108, %fd1, %fd68; + bra.uni BB24_137; + +BB24_92: + setp.eq.s32 %p78, %r6, 11; + @%p78 bra BB24_115; + + setp.eq.s32 %p79, %r6, 12; + @%p79 bra BB24_114; + bra.uni BB24_94; + +BB24_114: + max.f64 %fd108, %fd1, %fd68; + bra.uni BB24_137; + +BB24_83: + setp.eq.s32 %p85, %r6, 6; + @%p85 bra BB24_118; + + setp.eq.s32 %p86, %r6, 7; + @%p86 bra BB24_117; + bra.uni BB24_85; + +BB24_117: + setp.gt.f64 %p110, %fd1, %fd68; + selp.f64 %fd108, 0d3FF0000000000000, 0d0000000000000000, %p110; + bra.uni BB24_137; + +BB24_100: + setp.eq.s32 %p72, %r6, 16; + @%p72 bra BB24_112; + + setp.eq.s32 %p73, %r6, 17; + @%p73 bra BB24_107; + bra.uni BB24_102; + +BB24_107: + setp.eq.f64 %p98, %fd68, 0d0000000000000000; + setp.eq.f64 %p99, %fd68, 0d8000000000000000; + or.pred %p100, %p98, %p99; + mov.f64 %fd108, 0d7FF8000000000000; + @%p100 bra BB24_137; + + div.rn.f64 %fd108, %fd1, %fd68; + abs.f64 %fd83, %fd108; + setp.gtu.f64 %p101, %fd83, 0d7FF0000000000000; + @%p101 bra BB24_137; { .reg .b32 %temp; - mov.b64 {%r52, %temp}, %fd106; + mov.b64 {%temp, %r53}, %fd108; } + and.b32 %r54, %r53, 2147483647; + setp.ne.s32 %p102, %r54, 2146435072; + @%p102 bra BB24_111; + { .reg .b32 %temp; - mov.b64 {%temp, %r53}, %fd106; + mov.b64 {%r55, %temp}, %fd108; } - and.b32 %r54, %r53, 2147483647; - setp.ne.s32 %p107, %r54, 2146435072; - setp.ne.s32 %p108, %r52, 0; - or.pred %p109, %p107, %p108; - @!%p109 bra BB12_129; - bra.uni BB12_105; - -BB12_105: - cvt.rmi.f64.f64 %fd84, %fd106; + setp.eq.s32 %p103, %r55, 0; + @%p103 bra BB24_137; + +BB24_111: + cvt.rmi.f64.f64 %fd84, %fd108; mul.f64 %fd85, %fd84, %fd68; - sub.f64 %fd106, %fd1, %fd85; - bra.uni BB12_129; + sub.f64 %fd108, %fd1, %fd85; + bra.uni BB24_137; -BB12_6: +BB24_6: setp.eq.s32 %p30, %r6, 1; - @%p30 bra BB12_7; - bra.uni BB12_65; + @%p30 bra BB24_7; + bra.uni BB24_69; -BB12_7: - sub.f64 %fd98, %fd68, %fd1; - bra.uni BB12_65; +BB24_7: + sub.f64 %fd99, %fd68, %fd1; + bra.uni BB24_69; -BB12_22: +BB24_22: setp.eq.s32 %p18, %r6, 10; - @%p18 bra BB12_23; - bra.uni BB12_65; + @%p18 bra BB24_23; + bra.uni BB24_69; -BB12_23: - setp.neu.f64 %p45, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45; - bra.uni BB12_65; +BB24_23: + setp.neu.f64 %p43, %fd1, %fd68; + selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p43; + bra.uni BB24_69; -BB12_13: +BB24_13: setp.eq.s32 %p25, %r6, 5; - @%p25 bra BB12_14; - bra.uni BB12_65; + @%p25 bra BB24_14; + bra.uni BB24_69; -BB12_14: - setp.gt.f64 %p50, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50; - bra.uni BB12_65; +BB24_14: + setp.gt.f64 %p48, %fd1, %fd68; + selp.f64 %fd99, 0d3FF0000000000000, 0d0000000000000000, %p48; + bra.uni BB24_69; 
-BB12_30: +BB24_30: setp.eq.s32 %p12, %r6, 15; - @%p12 bra BB12_31; - bra.uni BB12_65; + @%p12 bra BB24_31; + bra.uni BB24_69; -BB12_31: +BB24_31: mul.f64 %fd76, %fd1, %fd68; mov.f64 %fd77, 0d3FF0000000000000; - sub.f64 %fd98, %fd77, %fd76; - bra.uni BB12_65; + sub.f64 %fd99, %fd77, %fd76; + bra.uni BB24_69; -BB12_9: +BB24_9: setp.eq.s32 %p28, %r6, 3; - @%p28 bra BB12_10; - bra.uni BB12_65; + @%p28 bra BB24_10; + bra.uni BB24_69; -BB12_10: - div.rn.f64 %fd98, %fd68, %fd1; - bra.uni BB12_65; +BB24_10: + div.rn.f64 %fd99, %fd68, %fd1; + bra.uni BB24_69; -BB12_45: - min.f64 %fd98, %fd68, %fd1; - bra.uni BB12_65; +BB24_47: + min.f64 %fd99, %fd68, %fd1; + bra.uni BB24_69; -BB12_26: +BB24_26: setp.eq.s32 %p16, %r6, 13; - @%p16 bra BB12_27; - bra.uni BB12_65; + @%p16 bra BB24_27; + bra.uni BB24_69; -BB12_27: +BB24_27: cvt.rni.s64.f64 %rd12, %fd68; cvt.rni.s64.f64 %rd13, %fd1; cvt.u32.u64 %r21, %rd12; cvt.u32.u64 %r22, %rd13; and.b32 %r23, %r22, %r21; - setp.eq.s32 %p44, %r23, 0; - selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44; - bra.uni BB12_65; + setp.eq.s32 %p42, %r23, 0; + selp.f64 %fd99, 0d0000000000000000, 0d3FF0000000000000, %p42; + bra.uni BB24_69; -BB12_48:
<TRUNCATED>
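The PTX above is nvcc output and is not meant to be read directly, but the shape of the new matrix_matrix_cellwise_op_f entry can be recovered from its parameter list and branch tree. Below is a minimal CUDA sketch consistent with that PTX, for orientation only: the parameter names, the vector-status encoding, and the op-code assignments are inferences read off the generated code (and only a few op codes are shown), not the actual SystemML .cu source.

#include <cfloat>   // FLT_MAX, mirroring the 0f7F7FFFFF default loaded into %f133

// Hypothetical reconstruction of the float cellwise kernel whose PTX
// appears above; names and op codes are inferred, not authoritative.
extern "C" __global__ void matrix_matrix_cellwise_op_f(
    const float *A, const float *B, float *C,
    int maxRows, int maxCols, int vecStatusA, int vecStatusB, int op) {
  int ix  = blockIdx.x * blockDim.x + threadIdx.x;
  int rix = ix / maxCols;              // div.s32 %r1, %r16, %r8
  int cix = ix % maxCols;              // rem.s32 %r2, %r16, %r8
  if (rix >= maxRows || maxCols < 0)   // bounds guard before BB23_1
    return;

  int outIx = rix * maxCols + cix;     // mad.lo.s32 %r3
  // Vector status 1 broadcasts by row, 2 broadcasts by column,
  // anything else indexes the operand as a full matrix.
  int aIx = (vecStatusA == 1) ? rix : (vecStatusA == 2) ? cix : outIx;
  int bIx = (vecStatusB == 1) ? rix : (vecStatusB == 2) ? cix : outIx;

  float a = A[aIx], b = B[bIx];
  float v = FLT_MAX;                   // default, as in mov.f32 %f133, 0f7F7FFFFF
  switch (op) {
    case 0:  v = a + b;                   break;  // BB23_69
    case 1:  v = a - b;                   break;  // BB23_14
    case 2:  v = a * b;                   break;  // BB23_68
    case 3:  v = a / b;                   break;  // BB23_17
    case 4:  v = powf(a, b);              break;  // the long BB23_54 expansion
    case 5:  v = (a < b) ? 1.0f : 0.0f;   break;  // BB23_21
    case 7:  v = (a > b) ? 1.0f : 0.0f;   break;  // BB23_52
    case 11: v = fminf(a, b);             break;  // BB23_50
    case 12: v = fmaxf(a, b);             break;  // BB23_49
    // ... remaining codes (==, !=, and, or, 1-a*b, modulo, intdiv) omitted
  }
  C[outIx] = v;
  __syncthreads();                     // bar.sync 0 before ret
}

The nested setp.gt.s32 comparisons against 8, 3, 13, 15, and so on in the PTX are simply nvcc lowering this kind of switch into a branch tree, which is why the op-code cases appear out of order in the generated basic blocks.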
