[SYSTEMML-1344] sqrt,round,abs,log,floor,ceil,trig funcs & sign for GPU
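All of the new kernels added to SystemML.cu in this commit follow the same element-wise pattern: each thread derives a global index from blockIdx/blockDim/threadIdx, guards against running past the end of the matrix, and applies one double-precision math function to its element. The standalone sketch below illustrates that launch pattern for matrix_sqrt using the plain CUDA runtime API. It is an illustration only: the kernel body mirrors the one in the diff, but the host-side names (N, h_in, d_in, the 256-thread block size) are invented for the example, and the actual SystemML integration instead loads the pre-built SystemML.ptx through the driver API in LibMatrixCUDA.java, which this excerpt does not show in full.

// Illustrative sketch only: kernel mirrors matrix_sqrt from SystemML.cu below;
// the host driver (N, h_in, d_in, block size) is a made-up example, not SystemML code.
#include <cstdio>
#include <cuda_runtime.h>

extern "C" __global__ void matrix_sqrt(double *A, double *C, unsigned int size) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per element
    if (index < size) {
        C[index] = sqrt(A[index]);
    }
}

int main() {
    const unsigned int N = 8;                          // hypothetical element count
    double h_in[N], h_out[N];
    for (unsigned int i = 0; i < N; ++i) h_in[i] = (double)(i * i);

    double *d_in, *d_out;
    cudaMalloc(&d_in,  N * sizeof(double));
    cudaMalloc(&d_out, N * sizeof(double));
    cudaMemcpy(d_in, h_in, N * sizeof(double), cudaMemcpyHostToDevice);

    // Standard 1-D launch: enough blocks of `threads` threads to cover all N elements.
    const unsigned int threads = 256;
    const unsigned int blocks  = (N + threads - 1) / threads;
    matrix_sqrt<<<blocks, threads>>>(d_in, d_out, N);

    cudaMemcpy(h_out, d_out, N * sizeof(double), cudaMemcpyDeviceToHost);
    for (unsigned int i = 0; i < N; ++i) printf("sqrt(%.1f) = %.1f\n", h_in[i], h_out[i]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

The same grid/block arithmetic applies unchanged to matrix_round, matrix_abs, matrix_log, the floor/ceil kernels, the trig kernels, and matrix_sign; only the per-element operation differs.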
Closes #503 Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/1fc764b9 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/1fc764b9 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/1fc764b9 Branch: refs/heads/master Commit: 1fc764b9b099271822056a82e248acdbb785dc63 Parents: 0d553e3 Author: Nakul Jindal <[email protected]> Authored: Wed May 17 10:55:51 2017 -0700 Committer: Nakul Jindal <[email protected]> Committed: Wed May 17 10:55:51 2017 -0700 ---------------------------------------------------------------------- src/main/cpp/kernels/Makefile | 28 + src/main/cpp/kernels/SystemML.cu | 187 ++ src/main/cpp/kernels/SystemML.ptx | 2506 ++++++++++++++---- .../java/org/apache/sysml/hops/UnaryOp.java | 10 +- .../instructions/GPUInstructionParser.java | 19 +- .../instructions/gpu/GPUInstruction.java | 72 +- .../gpu/MatrixBuiltinGPUInstruction.java | 41 +- .../instructions/gpu/context/CSRPointer.java | 2 +- .../instructions/gpu/context/GPUObject.java | 4 +- .../runtime/matrix/data/LibMatrixCUDA.java | 226 +- 10 files changed, 2577 insertions(+), 518 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/Makefile ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/Makefile b/src/main/cpp/kernels/Makefile new file mode 100644 index 0000000..0b003f3 --- /dev/null +++ b/src/main/cpp/kernels/Makefile @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +NVCC=nvcc +CUDAFLAGS= -ptx -c -arch=sm_30 + +SystemML.o: SystemML.cu + $(NVCC) $(CUDAFLAGS) SystemML.cu + +all: SystemML.o + ; + +clean: + rm -rf SystemML.ptx http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 2651e4a..5b4574e 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -656,3 +656,190 @@ __global__ void matrix_exp(double *A, double *C, unsigned int size) { C[index] = exp(A[index]); } } + +/** + * Do an sqrt over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_sqrt(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = sqrt(A[index]); + } +} + +/** + * Do an round over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_round(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = (double)llround(A[index]); + } +} + +/** + * Do an abs over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_abs(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = (double)fabs(A[index]); + } +} + +/** + * Do an log over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_log(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = log(A[index]); + } +} + +/** + * Do an floor over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_floor(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = floor(A[index]); + } +} + +/** + * Do an ceil over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_ceil(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = ceil(A[index]); + } +} + +/** + * Do an sin over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_sin(double *A, double *C, unsigned int size) { + int 
index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = sin(A[index]); + } +} + +/** + * Do an cos over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_cos(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = cos(A[index]); + } +} + +/** + * Do an tan over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_tan(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = tan(A[index]); + } +} + +/** + * Do an asin over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_asin(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = asin(A[index]); + } +} + +/** + * Do an acos over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_acos(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = acos(A[index]); + } +} + +/** + * Do an atan over all the elements of a matrix + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_atan(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + C[index] = atan(A[index]); + } +} + +/** + * Do an sign over all the elements of a matrix + * Assign -1, 0 or 1 depending on the element being negative, 0 or positive + * @param A the input matrix (of length = size) + * @param C the pre-allocated output matrix (of length = size) + * @param siz the length of the input and output matrices + */ +extern "C" +__global__ void matrix_sign(double *A, double *C, unsigned int size) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < size){ + if (A[index] == 0.0) { + C[index] = 0.0; + } else { + C[index] = copysign(1.0, A[index]); + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index 50002f5..3229581 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,8 +1,8 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-21112126 -// Cuda compilation tools, release 8.0, V8.0.43 +// Compiler Build ID: CL-21554848 +// Cuda compilation tools, release 8.0, V8.0.61 // Based on LLVM 3.4svn // @@ -11,6 +11,12 @@ .address_size 64 // .globl copy_u2l_dense +.func (.param .b64 func_retval0) 
__internal_trig_reduction_slowpathd +( + .param .b64 __internal_trig_reduction_slowpathd_param_0, + .param .b64 __internal_trig_reduction_slowpathd_param_1 +) +; .func (.param .b64 func_retval0) __internal_accurate_pow ( .param .b64 __internal_accurate_pow_param_0, @@ -18,6 +24,8 @@ ) ; .extern .shared .align 8 .b8 sdata[]; +.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162}; +.const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63}; .visible .entry copy_u2l_dense( .param .u64 copy_u2l_dense_param_0, @@ -442,9 +450,9 @@ BB6_6: .param .u32 matrix_matrix_cellwise_op_param_7 ) { - .reg .pred %p<54>; - .reg .b32 %r<55>; - .reg .f64 %fd<39>; + .reg .pred %p<52>; + .reg .b32 %r<56>; + .reg .f64 %fd<40>; .reg .b64 %rd<15>; @@ -467,40 +475,40 @@ BB6_6: setp.lt.s32 %p2, %r1, %r14; setp.lt.s32 %p3, %r2, %r10; and.pred %p4, %p2, %p3; - @!%p4 bra BB7_53; + @!%p4 bra BB7_55; bra.uni BB7_1; BB7_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; - mov.u32 %r53, %r1; + mov.u32 %r54, %r1; @%p5 bra BB7_5; setp.ne.s32 %p6, %r11, 2; - mov.u32 %r54, %r3; + mov.u32 %r55, %r3; @%p6 bra BB7_4; - mov.u32 %r54, %r2; + mov.u32 %r55, %r2; BB7_4: - mov.u32 %r48, %r54; - mov.u32 %r4, %r48; - mov.u32 %r53, %r4; + mov.u32 %r49, %r55; + mov.u32 %r4, %r49; + mov.u32 %r54, %r4; BB7_5: - mov.u32 %r5, %r53; + mov.u32 %r5, %r54; setp.eq.s32 %p7, %r12, 1; - mov.u32 %r51, %r1; + mov.u32 %r52, %r1; @%p7 bra BB7_9; setp.ne.s32 %p8, %r12, 2; - mov.u32 %r52, %r3; + mov.u32 %r53, %r3; @%p8 bra BB7_8; - mov.u32 %r52, %r2; + mov.u32 %r53, %r2; BB7_8: - mov.u32 %r51, %r52; + mov.u32 %r52, %r53; BB7_9: cvta.to.global.u64 %rd5, %rd3; @@ -508,10 +516,10 @@ BB7_9: mul.wide.s32 %rd7, %r5, 8; add.s64 %rd8, %rd6, %rd7; ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 %rd9, %r51, 8; + mul.wide.s32 %rd9, %r52, 8; add.s64 %rd10, %rd5, %rd9; ld.global.f64 %fd2, [%rd10]; - mov.f64 %fd38, 0d7FEFFFFFFFFFFFFF; + mov.f64 %fd39, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p9, %r13, 5; @%p9 bra BB7_19; @@ -519,15 +527,15 @@ BB7_9: @%p19 bra BB7_15; setp.eq.s32 %p23, %r13, 0; - @%p23 bra BB7_51; + @%p23 bra BB7_53; setp.eq.s32 %p24, %r13, 1; - @%p24 bra BB7_50; + @%p24 bra BB7_52; bra.uni BB7_13; -BB7_50: - sub.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; +BB7_52: + sub.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; BB7_19: setp.gt.s32 %p10, %r13, 8; @@ -542,12 +550,12 @@ BB7_19: BB7_33: setp.gt.f64 %p29, %fd1, %fd2; - 
selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29; - bra.uni BB7_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29; + bra.uni BB7_54; BB7_15: setp.eq.s32 %p20, %r13, 3; - @%p20 bra BB7_49; + @%p20 bra BB7_51; setp.eq.s32 %p21, %r13, 4; @%p21 bra BB7_35; @@ -583,7 +591,7 @@ BB7_35: param0, param1 ); - ld.param.f64 %fd37, [retval0+0]; + ld.param.f64 %fd38, [retval0+0]; //{ }// Callseq End 0 @@ -595,17 +603,17 @@ BB7_35: BB7_36: { .reg .b32 %temp; - mov.b64 {%temp, %r23}, %fd37; + mov.b64 {%temp, %r23}, %fd38; } xor.b32 %r24, %r23, -2147483648; { .reg .b32 %temp; - mov.b64 {%r25, %temp}, %fd37; + mov.b64 {%r25, %temp}, %fd38; } - mov.b64 %fd37, {%r25, %r24}; + mov.b64 %fd38, {%r25, %r24}; BB7_37: - mov.f64 %fd36, %fd37; + mov.f64 %fd37, %fd38; setp.eq.f64 %p34, %fd1, 0d0000000000000000; @%p34 bra BB7_40; bra.uni BB7_38; @@ -616,7 +624,7 @@ BB7_40: setp.lt.s32 %p38, %r9, 0; selp.b32 %r28, %r27, %r26, %p38; mov.u32 %r29, 0; - mov.b64 %fd36, {%r29, %r28}; + mov.b64 %fd37, {%r29, %r28}; bra.uni BB7_41; BB7_24: @@ -629,8 +637,8 @@ BB7_24: BB7_32: setp.eq.f64 %p27, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB7_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27; + bra.uni BB7_54; BB7_28: setp.eq.s32 %p12, %r13, 11; @@ -638,67 +646,67 @@ BB7_28: bra.uni BB7_29; BB7_31: - min.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + min.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; -BB7_51: - add.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; +BB7_53: + add.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; BB7_13: setp.eq.s32 %p25, %r13, 2; @%p25 bra BB7_14; - bra.uni BB7_52; + bra.uni BB7_54; BB7_14: - mul.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + mul.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; BB7_34: setp.le.f64 %p30, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30; - bra.uni BB7_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30; + bra.uni BB7_54; BB7_22: setp.eq.s32 %p18, %r13, 8; @%p18 bra BB7_23; - bra.uni BB7_52; + bra.uni BB7_54; BB7_23: setp.ge.f64 %p28, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28; - bra.uni BB7_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28; + bra.uni BB7_54; -BB7_49: - div.rn.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; +BB7_51: + div.rn.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; BB7_17: setp.eq.s32 %p22, %r13, 5; @%p22 bra BB7_18; - bra.uni BB7_52; + bra.uni BB7_54; BB7_18: setp.lt.f64 %p31, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31; - bra.uni BB7_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31; + bra.uni BB7_54; BB7_26: setp.eq.s32 %p15, %r13, 10; @%p15 bra BB7_27; - bra.uni BB7_52; + bra.uni BB7_54; BB7_27: setp.neu.f64 %p26, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB7_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB7_54; BB7_29: setp.ne.s32 %p13, %r13, 12; - @%p13 bra BB7_52; + @%p13 bra BB7_54; - max.f64 %fd38, %fd1, %fd2; - bra.uni BB7_52; + max.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; BB7_38: setp.gt.s32 %p35, %r8, -1; @@ -706,10 +714,10 @@ BB7_38: cvt.rzi.f64.f64 %fd29, %fd2; setp.neu.f64 %p36, %fd29, %fd2; - selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36; + selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p36; BB7_41: - mov.f64 %fd17, %fd36; + mov.f64 %fd17, %fd37; add.f64 %fd18, %fd1, %fd2; { .reg .b32 %temp; @@ -717,77 +725,79 @@ BB7_41: } and.b32 %r31, %r30, 2146435072; setp.ne.s32 %p39, %r31, 
2146435072; - mov.f64 %fd35, %fd17; - @%p39 bra BB7_48; + mov.f64 %fd36, %fd17; + @%p39 bra BB7_50; setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; - mov.f64 %fd35, %fd18; - @%p40 bra BB7_48; + mov.f64 %fd36, %fd18; + @%p40 bra BB7_50; abs.f64 %fd30, %fd2; setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000; - mov.f64 %fd34, %fd18; - mov.f64 %fd35, %fd34; - @%p41 bra BB7_48; + mov.f64 %fd35, %fd18; + mov.f64 %fd36, %fd35; + @%p41 bra BB7_50; + + and.b32 %r32, %r9, 2147483647; + setp.ne.s32 %p42, %r32, 2146435072; + @%p42 bra BB7_46; { .reg .b32 %temp; - mov.b64 {%r32, %temp}, %fd2; + mov.b64 {%r33, %temp}, %fd2; } - and.b32 %r33, %r9, 2147483647; - setp.eq.s32 %p42, %r33, 2146435072; - setp.eq.s32 %p43, %r32, 0; - and.pred %p44, %p42, %p43; - @%p44 bra BB7_47; - bra.uni BB7_45; + setp.eq.s32 %p43, %r33, 0; + @%p43 bra BB7_49; -BB7_47: - setp.gt.f64 %p48, %fd11, 0d3FF0000000000000; - selp.b32 %r41, 2146435072, 0, %p48; - xor.b32 %r42, %r41, 2146435072; - setp.lt.s32 %p49, %r9, 0; - selp.b32 %r43, %r42, %r41, %p49; - setp.eq.f64 %p50, %fd1, 0dBFF0000000000000; - selp.b32 %r44, 1072693248, %r43, %p50; - mov.u32 %r45, 0; - mov.b64 %fd35, {%r45, %r44}; - bra.uni BB7_48; +BB7_46: + and.b32 %r34, %r8, 2147483647; + setp.ne.s32 %p44, %r34, 2146435072; + mov.f64 %fd33, %fd17; + mov.f64 %fd36, %fd33; + @%p44 bra BB7_50; -BB7_45: { .reg .b32 %temp; - mov.b64 {%r34, %temp}, %fd1; + mov.b64 {%r35, %temp}, %fd1; } - and.b32 %r35, %r8, 2147483647; - setp.eq.s32 %p45, %r35, 2146435072; - setp.eq.s32 %p46, %r34, 0; - and.pred %p47, %p45, %p46; - mov.f64 %fd35, %fd17; - @!%p47 bra BB7_48; - bra.uni BB7_46; + setp.ne.s32 %p45, %r35, 0; + mov.f64 %fd36, %fd17; + @%p45 bra BB7_50; -BB7_46: shr.s32 %r36, %r9, 31; and.b32 %r37, %r36, -2146435072; - selp.b32 %r38, -1048576, 2146435072, %p1; - add.s32 %r39, %r38, %r37; - mov.u32 %r40, 0; - mov.b64 %fd35, {%r40, %r39}; + add.s32 %r38, %r37, 2146435072; + or.b32 %r39, %r38, -2147483648; + selp.b32 %r40, %r39, %r38, %p1; + mov.u32 %r41, 0; + mov.b64 %fd36, {%r41, %r40}; + bra.uni BB7_50; -BB7_48: - setp.eq.f64 %p51, %fd2, 0d0000000000000000; - setp.eq.f64 %p52, %fd1, 0d3FF0000000000000; - or.pred %p53, %p52, %p51; - selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p53; +BB7_49: + setp.gt.f64 %p46, %fd11, 0d3FF0000000000000; + selp.b32 %r42, 2146435072, 0, %p46; + xor.b32 %r43, %r42, 2146435072; + setp.lt.s32 %p47, %r9, 0; + selp.b32 %r44, %r43, %r42, %p47; + setp.eq.f64 %p48, %fd1, 0dBFF0000000000000; + selp.b32 %r45, 1072693248, %r44, %p48; + mov.u32 %r46, 0; + mov.b64 %fd36, {%r46, %r45}; -BB7_52: +BB7_50: + setp.eq.f64 %p49, %fd2, 0d0000000000000000; + setp.eq.f64 %p50, %fd1, 0d3FF0000000000000; + or.pred %p51, %p50, %p49; + selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p51; + +BB7_54: cvta.to.global.u64 %rd12, %rd4; mul.wide.s32 %rd13, %r3, 8; add.s64 %rd14, %rd12, %rd13; - st.global.f64 [%rd14], %fd38; + st.global.f64 [%rd14], %fd39; bar.sync 0; -BB7_53: +BB7_55: ret; } @@ -801,9 +811,9 @@ BB7_53: .param .u32 matrix_scalar_op_param_5 ) { - .reg .pred %p<95>; - .reg .b32 %r<62>; - .reg .f64 %fd<75>; + .reg .pred %p<91>; + .reg .b32 %r<64>; + .reg .f64 %fd<77>; .reg .b64 %rd<12>; @@ -818,7 +828,7 @@ BB7_53: mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; setp.ge.s32 %p3, %r1, %r8; - @%p3 bra BB8_90; + @%p3 bra BB8_94; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -827,9 +837,9 @@ BB7_53: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB8_46; + @%p4 bra BB8_48; - mov.f64 %fd66, 0d7FEFFFFFFFFFFFFF; + mov.f64 
%fd67, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p5, %r6, 5; @%p5 bra BB8_12; @@ -837,34 +847,34 @@ BB7_53: @%p15 bra BB8_8; setp.eq.s32 %p19, %r6, 0; - @%p19 bra BB8_44; + @%p19 bra BB8_46; setp.eq.s32 %p20, %r6, 1; - @%p20 bra BB8_43; + @%p20 bra BB8_45; bra.uni BB8_6; -BB8_43: - sub.f64 %fd66, %fd52, %fd1; - bra.uni BB8_45; +BB8_45: + sub.f64 %fd67, %fd52, %fd1; + bra.uni BB8_47; -BB8_46: - mov.f64 %fd74, 0d7FEFFFFFFFFFFFFF; - setp.gt.s32 %p50, %r6, 5; - @%p50 bra BB8_56; +BB8_48: + mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p48, %r6, 5; + @%p48 bra BB8_58; - setp.gt.s32 %p60, %r6, 2; - @%p60 bra BB8_52; + setp.gt.s32 %p58, %r6, 2; + @%p58 bra BB8_54; - setp.eq.s32 %p64, %r6, 0; - @%p64 bra BB8_88; + setp.eq.s32 %p62, %r6, 0; + @%p62 bra BB8_92; - setp.eq.s32 %p65, %r6, 1; - @%p65 bra BB8_87; - bra.uni BB8_50; + setp.eq.s32 %p63, %r6, 1; + @%p63 bra BB8_91; + bra.uni BB8_52; -BB8_87: - sub.f64 %fd74, %fd1, %fd52; - bra.uni BB8_89; +BB8_91: + sub.f64 %fd76, %fd1, %fd52; + bra.uni BB8_93; BB8_12: setp.gt.s32 %p6, %r6, 8; @@ -879,28 +889,28 @@ BB8_12: BB8_26: setp.lt.f64 %p25, %fd1, %fd52; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25; - bra.uni BB8_45; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25; + bra.uni BB8_47; -BB8_56: - setp.gt.s32 %p51, %r6, 8; - @%p51 bra BB8_61; +BB8_58: + setp.gt.s32 %p49, %r6, 8; + @%p49 bra BB8_63; - setp.eq.s32 %p57, %r6, 6; - @%p57 bra BB8_71; + setp.eq.s32 %p55, %r6, 6; + @%p55 bra BB8_73; - setp.eq.s32 %p58, %r6, 7; - @%p58 bra BB8_70; - bra.uni BB8_59; + setp.eq.s32 %p56, %r6, 7; + @%p56 bra BB8_72; + bra.uni BB8_61; -BB8_70: - setp.gt.f64 %p70, %fd1, %fd52; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70; - bra.uni BB8_89; +BB8_72: + setp.gt.f64 %p68, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68; + bra.uni BB8_93; BB8_8: setp.eq.s32 %p16, %r6, 3; - @%p16 bra BB8_42; + @%p16 bra BB8_44; setp.eq.s32 %p17, %r6, 4; @%p17 bra BB8_28; @@ -936,7 +946,7 @@ BB8_28: param0, param1 ); - ld.param.f64 %fd65, [retval0+0]; + ld.param.f64 %fd66, [retval0+0]; //{ }// Callseq End 1 @@ -948,17 +958,17 @@ BB8_28: BB8_29: { .reg .b32 %temp; - mov.b64 {%temp, %r14}, %fd65; + mov.b64 {%temp, %r14}, %fd66; } xor.b32 %r15, %r14, -2147483648; { .reg .b32 %temp; - mov.b64 {%r16, %temp}, %fd65; + mov.b64 {%r16, %temp}, %fd66; } - mov.b64 %fd65, {%r16, %r15}; + mov.b64 %fd66, {%r16, %r15}; BB8_30: - mov.f64 %fd64, %fd65; + mov.f64 %fd65, %fd66; setp.eq.f64 %p30, %fd52, 0d0000000000000000; @%p30 bra BB8_33; bra.uni BB8_31; @@ -969,7 +979,7 @@ BB8_33: setp.lt.s32 %p34, %r3, 0; selp.b32 %r19, %r18, %r17, %p34; mov.u32 %r20, 0; - mov.b64 %fd64, {%r20, %r19}; + mov.b64 %fd65, {%r20, %r19}; bra.uni BB8_34; BB8_17: @@ -982,18 +992,18 @@ BB8_17: BB8_25: setp.eq.f64 %p23, %fd1, %fd52; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23; - bra.uni BB8_45; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23; + bra.uni BB8_47; -BB8_52: - setp.eq.s32 %p61, %r6, 3; - @%p61 bra BB8_86; +BB8_54: + setp.eq.s32 %p59, %r6, 3; + @%p59 bra BB8_90; - setp.eq.s32 %p62, %r6, 4; - @%p62 bra BB8_72; - bra.uni BB8_54; + setp.eq.s32 %p60, %r6, 4; + @%p60 bra BB8_74; + bra.uni BB8_56; -BB8_72: +BB8_74: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -1002,11 +1012,11 @@ BB8_72: .reg .b32 %temp; mov.b64 {%temp, %r5}, %fd52; } - bfe.u32 %r37, %r5, 20, 11; - add.s32 %r38, %r37, -1012; + bfe.u32 %r38, %r5, 20, 11; + add.s32 %r39, %r38, -1012; mov.b64 %rd11, %fd52; - shl.b64 %rd3, %rd11, %r38; - setp.eq.s64 %p73, 
%rd3, -9223372036854775808; + shl.b64 %rd3, %rd11, %r39; + setp.eq.s64 %p71, %rd3, -9223372036854775808; abs.f64 %fd35, %fd1; // Callseq Start 2 { @@ -1023,54 +1033,54 @@ BB8_72: param0, param1 ); - ld.param.f64 %fd73, [retval0+0]; + ld.param.f64 %fd75, [retval0+0]; //{ }// Callseq End 2 - setp.lt.s32 %p74, %r4, 0; - and.pred %p2, %p74, %p73; - @!%p2 bra BB8_74; - bra.uni BB8_73; + setp.lt.s32 %p72, %r4, 0; + and.pred %p2, %p72, %p71; + @!%p2 bra BB8_76; + bra.uni BB8_75; -BB8_73: +BB8_75: { .reg .b32 %temp; - mov.b64 {%temp, %r39}, %fd73; + mov.b64 {%temp, %r40}, %fd75; } - xor.b32 %r40, %r39, -2147483648; + xor.b32 %r41, %r40, -2147483648; { .reg .b32 %temp; - mov.b64 {%r41, %temp}, %fd73; + mov.b64 {%r42, %temp}, %fd75; } - mov.b64 %fd73, {%r41, %r40}; - -BB8_74: - mov.f64 %fd72, %fd73; - setp.eq.f64 %p75, %fd1, 0d0000000000000000; - @%p75 bra BB8_77; - bra.uni BB8_75; + mov.b64 %fd75, {%r42, %r41}; -BB8_77: - selp.b32 %r42, %r4, 0, %p73; - or.b32 %r43, %r42, 2146435072; - setp.lt.s32 %p79, %r5, 0; - selp.b32 %r44, %r43, %r42, %p79; - mov.u32 %r45, 0; - mov.b64 %fd72, {%r45, %r44}; - bra.uni BB8_78; +BB8_76: + mov.f64 %fd74, %fd75; + setp.eq.f64 %p73, %fd1, 0d0000000000000000; + @%p73 bra BB8_79; + bra.uni BB8_77; + +BB8_79: + selp.b32 %r43, %r4, 0, %p71; + or.b32 %r44, %r43, 2146435072; + setp.lt.s32 %p77, %r5, 0; + selp.b32 %r45, %r44, %r43, %p77; + mov.u32 %r46, 0; + mov.b64 %fd74, {%r46, %r45}; + bra.uni BB8_80; -BB8_61: - setp.gt.s32 %p52, %r6, 10; - @%p52 bra BB8_65; +BB8_63: + setp.gt.s32 %p50, %r6, 10; + @%p50 bra BB8_67; - setp.eq.s32 %p55, %r6, 9; - @%p55 bra BB8_69; - bra.uni BB8_63; + setp.eq.s32 %p53, %r6, 9; + @%p53 bra BB8_71; + bra.uni BB8_65; -BB8_69: - setp.eq.f64 %p68, %fd1, %fd52; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68; - bra.uni BB8_89; +BB8_71: + setp.eq.f64 %p66, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66; + bra.uni BB8_93; BB8_21: setp.eq.s32 %p8, %r6, 11; @@ -1078,135 +1088,135 @@ BB8_21: bra.uni BB8_22; BB8_24: - min.f64 %fd66, %fd52, %fd1; - bra.uni BB8_45; + min.f64 %fd67, %fd52, %fd1; + bra.uni BB8_47; -BB8_44: - add.f64 %fd66, %fd1, %fd52; - bra.uni BB8_45; +BB8_46: + add.f64 %fd67, %fd1, %fd52; + bra.uni BB8_47; BB8_6: setp.eq.s32 %p21, %r6, 2; @%p21 bra BB8_7; - bra.uni BB8_45; + bra.uni BB8_47; BB8_7: - mul.f64 %fd66, %fd1, %fd52; - bra.uni BB8_45; + mul.f64 %fd67, %fd1, %fd52; + bra.uni BB8_47; BB8_27: setp.ge.f64 %p26, %fd1, %fd52; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB8_45; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB8_47; BB8_15: setp.eq.s32 %p14, %r6, 8; @%p14 bra BB8_16; - bra.uni BB8_45; + bra.uni BB8_47; BB8_16: setp.le.f64 %p24, %fd1, %fd52; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24; - bra.uni BB8_45; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24; + bra.uni BB8_47; -BB8_42: - div.rn.f64 %fd66, %fd52, %fd1; - bra.uni BB8_45; +BB8_44: + div.rn.f64 %fd67, %fd52, %fd1; + bra.uni BB8_47; BB8_10: setp.eq.s32 %p18, %r6, 5; @%p18 bra BB8_11; - bra.uni BB8_45; + bra.uni BB8_47; BB8_11: setp.gt.f64 %p27, %fd1, %fd52; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB8_45; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p27; + bra.uni BB8_47; -BB8_65: - setp.eq.s32 %p53, %r6, 11; - @%p53 bra BB8_68; - bra.uni BB8_66; +BB8_67: + setp.eq.s32 %p51, %r6, 11; + @%p51 bra BB8_70; + bra.uni BB8_68; -BB8_68: - min.f64 %fd74, %fd1, %fd52; - bra.uni BB8_89; +BB8_70: + 
min.f64 %fd76, %fd1, %fd52; + bra.uni BB8_93; BB8_19: setp.eq.s32 %p11, %r6, 10; @%p11 bra BB8_20; - bra.uni BB8_45; + bra.uni BB8_47; BB8_20: setp.neu.f64 %p22, %fd1, %fd52; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22; - bra.uni BB8_45; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22; + bra.uni BB8_47; BB8_22: setp.ne.s32 %p9, %r6, 12; - @%p9 bra BB8_45; - - max.f64 %fd66, %fd52, %fd1; - bra.uni BB8_45; + @%p9 bra BB8_47; -BB8_88: - add.f64 %fd74, %fd1, %fd52; - bra.uni BB8_89; + max.f64 %fd67, %fd52, %fd1; + bra.uni BB8_47; -BB8_50: - setp.eq.s32 %p66, %r6, 2; - @%p66 bra BB8_51; - bra.uni BB8_89; +BB8_92: + add.f64 %fd76, %fd1, %fd52; + bra.uni BB8_93; -BB8_51: - mul.f64 %fd74, %fd1, %fd52; - bra.uni BB8_89; +BB8_52: + setp.eq.s32 %p64, %r6, 2; + @%p64 bra BB8_53; + bra.uni BB8_93; -BB8_71: - setp.le.f64 %p71, %fd1, %fd52; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p71; - bra.uni BB8_89; +BB8_53: + mul.f64 %fd76, %fd1, %fd52; + bra.uni BB8_93; -BB8_59: - setp.eq.s32 %p59, %r6, 8; - @%p59 bra BB8_60; - bra.uni BB8_89; +BB8_73: + setp.le.f64 %p69, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p69; + bra.uni BB8_93; -BB8_60: - setp.ge.f64 %p69, %fd1, %fd52; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69; - bra.uni BB8_89; +BB8_61: + setp.eq.s32 %p57, %r6, 8; + @%p57 bra BB8_62; + bra.uni BB8_93; -BB8_86: - div.rn.f64 %fd74, %fd1, %fd52; - bra.uni BB8_89; +BB8_62: + setp.ge.f64 %p67, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67; + bra.uni BB8_93; -BB8_54: - setp.eq.s32 %p63, %r6, 5; - @%p63 bra BB8_55; - bra.uni BB8_89; +BB8_90: + div.rn.f64 %fd76, %fd1, %fd52; + bra.uni BB8_93; -BB8_55: - setp.lt.f64 %p72, %fd1, %fd52; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p72; - bra.uni BB8_89; +BB8_56: + setp.eq.s32 %p61, %r6, 5; + @%p61 bra BB8_57; + bra.uni BB8_93; -BB8_63: - setp.eq.s32 %p56, %r6, 10; - @%p56 bra BB8_64; - bra.uni BB8_89; +BB8_57: + setp.lt.f64 %p70, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p70; + bra.uni BB8_93; -BB8_64: - setp.neu.f64 %p67, %fd1, %fd52; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67; - bra.uni BB8_89; +BB8_65: + setp.eq.s32 %p54, %r6, 10; + @%p54 bra BB8_66; + bra.uni BB8_93; BB8_66: - setp.ne.s32 %p54, %r6, 12; - @%p54 bra BB8_89; + setp.neu.f64 %p65, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65; + bra.uni BB8_93; - max.f64 %fd74, %fd1, %fd52; - bra.uni BB8_89; +BB8_68: + setp.ne.s32 %p52, %r6, 12; + @%p52 bra BB8_93; + + max.f64 %fd76, %fd1, %fd52; + bra.uni BB8_93; BB8_31: setp.gt.s32 %p31, %r2, -1; @@ -1214,10 +1224,10 @@ BB8_31: cvt.rzi.f64.f64 %fd54, %fd1; setp.neu.f64 %p32, %fd54, %fd1; - selp.f64 %fd64, 0dFFF8000000000000, %fd64, %p32; + selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p32; BB8_34: - mov.f64 %fd16, %fd64; + mov.f64 %fd16, %fd65; add.f64 %fd17, %fd1, %fd52; { .reg .b32 %temp; @@ -1225,157 +1235,161 @@ BB8_34: } and.b32 %r22, %r21, 2146435072; setp.ne.s32 %p35, %r22, 2146435072; - mov.f64 %fd63, %fd16; - @%p35 bra BB8_41; + mov.f64 %fd64, %fd16; + @%p35 bra BB8_43; setp.gtu.f64 %p36, %fd10, 0d7FF0000000000000; - mov.f64 %fd63, %fd17; - @%p36 bra BB8_41; + mov.f64 %fd64, %fd17; + @%p36 bra BB8_43; abs.f64 %fd55, %fd1; setp.gtu.f64 %p37, %fd55, 0d7FF0000000000000; - mov.f64 %fd62, %fd17; - mov.f64 %fd63, %fd62; - @%p37 bra BB8_41; + mov.f64 %fd63, %fd17; + mov.f64 %fd64, %fd63; + @%p37 bra BB8_43; + + and.b32 %r23, %r3, 2147483647; 
+ setp.ne.s32 %p38, %r23, 2146435072; + @%p38 bra BB8_39; { .reg .b32 %temp; - mov.b64 {%r23, %temp}, %fd1; + mov.b64 {%r24, %temp}, %fd1; } - and.b32 %r24, %r3, 2147483647; - setp.eq.s32 %p38, %r24, 2146435072; - setp.eq.s32 %p39, %r23, 0; - and.pred %p40, %p38, %p39; - @%p40 bra BB8_40; - bra.uni BB8_38; + setp.eq.s32 %p39, %r24, 0; + @%p39 bra BB8_42; -BB8_40: - setp.gt.f64 %p44, %fd10, 0d3FF0000000000000; - selp.b32 %r32, 2146435072, 0, %p44; - xor.b32 %r33, %r32, 2146435072; - setp.lt.s32 %p45, %r3, 0; - selp.b32 %r34, %r33, %r32, %p45; - setp.eq.f64 %p46, %fd52, 0dBFF0000000000000; - selp.b32 %r35, 1072693248, %r34, %p46; - mov.u32 %r36, 0; - mov.b64 %fd63, {%r36, %r35}; - bra.uni BB8_41; +BB8_39: + and.b32 %r25, %r2, 2147483647; + setp.ne.s32 %p40, %r25, 2146435072; + mov.f64 %fd61, %fd16; + mov.f64 %fd64, %fd61; + @%p40 bra BB8_43; -BB8_75: - setp.gt.s32 %p76, %r4, -1; - @%p76 bra BB8_78; + { + .reg .b32 %temp; + mov.b64 {%r26, %temp}, %fd52; + } + setp.ne.s32 %p41, %r26, 0; + mov.f64 %fd64, %fd16; + @%p41 bra BB8_43; + + shr.s32 %r27, %r3, 31; + and.b32 %r28, %r27, -2146435072; + add.s32 %r29, %r28, 2146435072; + or.b32 %r30, %r29, -2147483648; + selp.b32 %r31, %r30, %r29, %p1; + mov.u32 %r32, 0; + mov.b64 %fd64, {%r32, %r31}; + bra.uni BB8_43; + +BB8_77: + setp.gt.s32 %p74, %r4, -1; + @%p74 bra BB8_80; cvt.rzi.f64.f64 %fd57, %fd52; - setp.neu.f64 %p77, %fd57, %fd52; - selp.f64 %fd72, 0dFFF8000000000000, %fd72, %p77; + setp.neu.f64 %p75, %fd57, %fd52; + selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p75; -BB8_78: - mov.f64 %fd41, %fd72; +BB8_80: + mov.f64 %fd41, %fd74; add.f64 %fd42, %fd1, %fd52; { .reg .b32 %temp; - mov.b64 {%temp, %r46}, %fd42; + mov.b64 {%temp, %r47}, %fd42; } - and.b32 %r47, %r46, 2146435072; - setp.ne.s32 %p80, %r47, 2146435072; - mov.f64 %fd71, %fd41; - @%p80 bra BB8_85; + and.b32 %r48, %r47, 2146435072; + setp.ne.s32 %p78, %r48, 2146435072; + mov.f64 %fd73, %fd41; + @%p78 bra BB8_89; - setp.gtu.f64 %p81, %fd35, 0d7FF0000000000000; - mov.f64 %fd71, %fd42; - @%p81 bra BB8_85; + setp.gtu.f64 %p79, %fd35, 0d7FF0000000000000; + mov.f64 %fd73, %fd42; + @%p79 bra BB8_89; abs.f64 %fd58, %fd52; - setp.gtu.f64 %p82, %fd58, 0d7FF0000000000000; - mov.f64 %fd70, %fd42; - mov.f64 %fd71, %fd70; - @%p82 bra BB8_85; + setp.gtu.f64 %p80, %fd58, 0d7FF0000000000000; + mov.f64 %fd72, %fd42; + mov.f64 %fd73, %fd72; + @%p80 bra BB8_89; + + and.b32 %r49, %r5, 2147483647; + setp.ne.s32 %p81, %r49, 2146435072; + @%p81 bra BB8_85; { .reg .b32 %temp; - mov.b64 {%r48, %temp}, %fd52; + mov.b64 {%r50, %temp}, %fd52; } - and.b32 %r49, %r5, 2147483647; - setp.eq.s32 %p83, %r49, 2146435072; - setp.eq.s32 %p84, %r48, 0; - and.pred %p85, %p83, %p84; - @%p85 bra BB8_84; - bra.uni BB8_82; - -BB8_84: - setp.gt.f64 %p89, %fd35, 0d3FF0000000000000; - selp.b32 %r57, 2146435072, 0, %p89; - xor.b32 %r58, %r57, 2146435072; - setp.lt.s32 %p90, %r5, 0; - selp.b32 %r59, %r58, %r57, %p90; - setp.eq.f64 %p91, %fd1, 0dBFF0000000000000; - selp.b32 %r60, 1072693248, %r59, %p91; - mov.u32 %r61, 0; - mov.b64 %fd71, {%r61, %r60}; - bra.uni BB8_85; - -BB8_38: - { - .reg .b32 %temp; - mov.b64 {%r25, %temp}, %fd52; - } - and.b32 %r26, %r2, 2147483647; - setp.eq.s32 %p41, %r26, 2146435072; - setp.eq.s32 %p42, %r25, 0; - and.pred %p43, %p41, %p42; - mov.f64 %fd63, %fd16; - @!%p43 bra BB8_41; - bra.uni BB8_39; - -BB8_39: - shr.s32 %r27, %r3, 31; - and.b32 %r28, %r27, -2146435072; - selp.b32 %r29, -1048576, 2146435072, %p1; - add.s32 %r30, %r29, %r28; - mov.u32 %r31, 0; - mov.b64 %fd63, {%r31, %r30}; + setp.eq.s32 %p82, 
%r50, 0; + @%p82 bra BB8_88; -BB8_41: - setp.eq.f64 %p47, %fd1, 0d0000000000000000; - setp.eq.f64 %p48, %fd52, 0d3FF0000000000000; - or.pred %p49, %p48, %p47; - selp.f64 %fd66, 0d3FF0000000000000, %fd63, %p49; - -BB8_45: - st.global.f64 [%rd1], %fd66; - bra.uni BB8_90; +BB8_85: + and.b32 %r51, %r4, 2147483647; + setp.ne.s32 %p83, %r51, 2146435072; + mov.f64 %fd70, %fd41; + mov.f64 %fd73, %fd70; + @%p83 bra BB8_89; -BB8_82: { .reg .b32 %temp; - mov.b64 {%r50, %temp}, %fd1; + mov.b64 {%r52, %temp}, %fd1; } - and.b32 %r51, %r4, 2147483647; - setp.eq.s32 %p86, %r51, 2146435072; - setp.eq.s32 %p87, %r50, 0; - and.pred %p88, %p86, %p87; - mov.f64 %fd71, %fd41; - @!%p88 bra BB8_85; - bra.uni BB8_83; - -BB8_83: - shr.s32 %r52, %r5, 31; - and.b32 %r53, %r52, -2146435072; - selp.b32 %r54, -1048576, 2146435072, %p2; - add.s32 %r55, %r54, %r53; - mov.u32 %r56, 0; - mov.b64 %fd71, {%r56, %r55}; + setp.ne.s32 %p84, %r52, 0; + mov.f64 %fd73, %fd41; + @%p84 bra BB8_89; + + shr.s32 %r53, %r5, 31; + and.b32 %r54, %r53, -2146435072; + add.s32 %r55, %r54, 2146435072; + or.b32 %r56, %r55, -2147483648; + selp.b32 %r57, %r56, %r55, %p2; + mov.u32 %r58, 0; + mov.b64 %fd73, {%r58, %r57}; + bra.uni BB8_89; -BB8_85: - setp.eq.f64 %p92, %fd52, 0d0000000000000000; - setp.eq.f64 %p93, %fd1, 0d3FF0000000000000; - or.pred %p94, %p93, %p92; - selp.f64 %fd74, 0d3FF0000000000000, %fd71, %p94; +BB8_42: + setp.gt.f64 %p42, %fd10, 0d3FF0000000000000; + selp.b32 %r33, 2146435072, 0, %p42; + xor.b32 %r34, %r33, 2146435072; + setp.lt.s32 %p43, %r3, 0; + selp.b32 %r35, %r34, %r33, %p43; + setp.eq.f64 %p44, %fd52, 0dBFF0000000000000; + selp.b32 %r36, 1072693248, %r35, %p44; + mov.u32 %r37, 0; + mov.b64 %fd64, {%r37, %r36}; + +BB8_43: + setp.eq.f64 %p45, %fd1, 0d0000000000000000; + setp.eq.f64 %p46, %fd52, 0d3FF0000000000000; + or.pred %p47, %p46, %p45; + selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p47; + +BB8_47: + st.global.f64 [%rd1], %fd67; + bra.uni BB8_94; + +BB8_88: + setp.gt.f64 %p85, %fd35, 0d3FF0000000000000; + selp.b32 %r59, 2146435072, 0, %p85; + xor.b32 %r60, %r59, 2146435072; + setp.lt.s32 %p86, %r5, 0; + selp.b32 %r61, %r60, %r59, %p86; + setp.eq.f64 %p87, %fd1, 0dBFF0000000000000; + selp.b32 %r62, 1072693248, %r61, %p87; + mov.u32 %r63, 0; + mov.b64 %fd73, {%r63, %r62}; BB8_89: - st.global.f64 [%rd1], %fd74; + setp.eq.f64 %p88, %fd52, 0d0000000000000000; + setp.eq.f64 %p89, %fd1, 0d3FF0000000000000; + or.pred %p90, %p89, %p88; + selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p90; -BB8_90: +BB8_93: + st.global.f64 [%rd1], %fd76; + +BB8_94: bar.sync 0; ret; } @@ -2928,7 +2942,7 @@ BB19_35: .reg .pred %p<20>; .reg .b32 %r<39>; .reg .f64 %fd<76>; - .reg .b64 %rd<42>; + .reg .b64 %rd<43>; ld.param.u64 %rd1, [reduce_row_mean_param_0]; @@ -3095,12 +3109,13 @@ BB20_33: @%p19 bra BB20_35; ld.shared.f64 %fd40, [sdata]; - cvt.rn.f64.s32 %fd41, %r4; + cvt.u64.u32 %rd39, %r4; + cvt.rn.f64.s64 %fd41, %rd39; div.rn.f64 %fd42, %fd40, %fd41; - cvta.to.global.u64 %rd39, %rd2; - mul.wide.u32 %rd40, %r6, 8; - add.s64 %rd41, %rd39, %rd40; - st.global.f64 [%rd41], %fd42; + cvta.to.global.u64 %rd40, %rd2; + mul.wide.u32 %rd41, %r6, 8; + add.s64 %rd42, %rd40, %rd41; + st.global.f64 [%rd42], %fd42; BB20_35: ret; @@ -3117,7 +3132,7 @@ BB20_35: .reg .pred %p<4>; .reg .b32 %r<11>; .reg .f64 %fd<12>; - .reg .b64 %rd<9>; + .reg .b64 %rd<10>; ld.param.u64 %rd2, [reduce_col_mean_param_0]; @@ -3154,11 +3169,12 @@ BB21_3: BB21_4: cvta.to.global.u64 %rd6, %rd3; - cvt.rn.f64.s32 %fd7, %r5; + cvt.u64.u32 %rd7, %r5; + cvt.rn.f64.s64 %fd7, %rd7; div.rn.f64 
%fd8, %fd10, %fd7; - mul.wide.u32 %rd7, %r1, 8; - add.s64 %rd8, %rd6, %rd7; - st.global.f64 [%rd8], %fd8; + mul.wide.u32 %rd8, %r1, 8; + add.s64 %rd9, %rd6, %rd8; + st.global.f64 [%rd9], %fd8; BB21_5: ret; @@ -3277,82 +3293,1638 @@ BB22_5: ret; } -.func (.param .b64 func_retval0) __internal_accurate_pow( - .param .b64 __internal_accurate_pow_param_0, - .param .b64 __internal_accurate_pow_param_1 + // .globl matrix_sqrt +.visible .entry matrix_sqrt( + .param .u64 matrix_sqrt_param_0, + .param .u64 matrix_sqrt_param_1, + .param .u32 matrix_sqrt_param_2 ) { - .reg .pred %p<10>; - .reg .f32 %f<3>; - .reg .b32 %r<52>; - .reg .f64 %fd<134>; + .reg .pred %p<2>; + .reg .b32 %r<6>; + .reg .f64 %fd<3>; + .reg .b64 %rd<8>; - ld.param.f64 %fd12, [__internal_accurate_pow_param_0]; - ld.param.f64 %fd13, [__internal_accurate_pow_param_1]; + ld.param.u64 %rd1, [matrix_sqrt_param_0]; + ld.param.u64 %rd2, [matrix_sqrt_param_1]; + ld.param.u32 %r2, [matrix_sqrt_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.u32 %p1, %r1, %r2; + @%p1 bra BB23_2; + + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, [%rd5]; + sqrt.rn.f64 %fd2, %fd1; + cvta.to.global.u64 %rd6, %rd2; + add.s64 %rd7, %rd6, %rd4; + st.global.f64 [%rd7], %fd2; + +BB23_2: + ret; +} + + // .globl matrix_round +.visible .entry matrix_round( + .param .u64 matrix_round_param_0, + .param .u64 matrix_round_param_1, + .param .u32 matrix_round_param_2 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<11>; + .reg .f64 %fd<10>; + .reg .b64 %rd<11>; + + + ld.param.u64 %rd2, [matrix_round_param_0]; + ld.param.u64 %rd3, [matrix_round_param_1]; + ld.param.u32 %r2, [matrix_round_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.u32 %p1, %r1, %r2; + @%p1 bra BB24_4; + + cvta.to.global.u64 %rd4, %rd2; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd9, [%rd6]; + abs.f64 %fd2, %fd9; + setp.ge.f64 %p2, %fd2, 0d4330000000000000; + @%p2 bra BB24_3; + + add.f64 %fd5, %fd2, 0d3FE0000000000000; + cvt.rzi.f64.f64 %fd6, %fd5; + setp.lt.f64 %p3, %fd2, 0d3FE0000000000000; + selp.f64 %fd7, 0d0000000000000000, %fd6, %p3; { .reg .b32 %temp; - mov.b64 {%temp, %r49}, %fd12; + mov.b64 {%r6, %temp}, %fd7; } { .reg .b32 %temp; - mov.b64 {%r48, %temp}, %fd12; + mov.b64 {%temp, %r7}, %fd7; } - shr.u32 %r50, %r49, 20; - setp.ne.s32 %p1, %r50, 0; - @%p1 bra BB23_2; + { + .reg .b32 %temp; + mov.b64 {%temp, %r8}, %fd9; + } + and.b32 %r9, %r8, -2147483648; + or.b32 %r10, %r7, %r9; + mov.b64 %fd9, {%r6, %r10}; - mul.f64 %fd14, %fd12, 0d4350000000000000; +BB24_3: + cvta.to.global.u64 %rd7, %rd3; + cvt.rzi.s64.f64 %rd8, %fd9; + cvt.rn.f64.s64 %fd8, %rd8; + shl.b64 %rd9, %rd1, 3; + add.s64 %rd10, %rd7, %rd9; + st.global.f64 [%rd10], %fd8; + +BB24_4: + ret; +} + + // .globl matrix_abs +.visible .entry matrix_abs( + .param .u64 matrix_abs_param_0, + .param .u64 matrix_abs_param_1, + .param .u32 matrix_abs_param_2 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<6>; + .reg .f64 %fd<3>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [matrix_abs_param_0]; + ld.param.u64 %rd2, [matrix_abs_param_1]; + ld.param.u32 %r2, [matrix_abs_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.u32 %p1, %r1, %r2; + @%p1 bra BB25_2; + + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 8; + 
add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, [%rd5]; + abs.f64 %fd2, %fd1; + cvta.to.global.u64 %rd6, %rd2; + add.s64 %rd7, %rd6, %rd4; + st.global.f64 [%rd7], %fd2; + +BB25_2: + ret; +} + + // .globl matrix_log +.visible .entry matrix_log( + .param .u64 matrix_log_param_0, + .param .u64 matrix_log_param_1, + .param .u32 matrix_log_param_2 +) +{ + .reg .pred %p<6>; + .reg .f32 %f<2>; + .reg .b32 %r<33>; + .reg .f64 %fd<59>; + .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [matrix_log_param_0]; + ld.param.u64 %rd3, [matrix_log_param_1]; + ld.param.u32 %r12, [matrix_log_param_2]; + mov.u32 %r13, %ctaid.x; + mov.u32 %r14, %ntid.x; + mov.u32 %r15, %tid.x; + mad.lo.s32 %r1, %r14, %r13, %r15; + setp.ge.u32 %p1, %r1, %r12; + @%p1 bra BB26_9; + + cvta.to.global.u64 %rd4, %rd2; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd56, [%rd6]; { .reg .b32 %temp; - mov.b64 {%temp, %r49}, %fd14; + mov.b64 {%temp, %r29}, %fd56; } { .reg .b32 %temp; - mov.b64 {%r48, %temp}, %fd14; + mov.b64 {%r30, %temp}, %fd56; } - shr.u32 %r16, %r49, 20; - add.s32 %r50, %r16, -54; + mov.u32 %r31, -1023; + setp.gt.s32 %p2, %r29, 1048575; + @%p2 bra BB26_3; -BB23_2: - add.s32 %r51, %r50, -1023; - and.b32 %r17, %r49, -2146435073; - or.b32 %r18, %r17, 1072693248; - mov.b64 %fd132, {%r48, %r18}; - setp.lt.u32 %p2, %r18, 1073127583; - @%p2 bra BB23_4; + mul.f64 %fd56, %fd56, 0d4350000000000000; + { + .reg .b32 %temp; + mov.b64 {%temp, %r29}, %fd56; + } + { + .reg .b32 %temp; + mov.b64 {%r30, %temp}, %fd56; + } + mov.u32 %r31, -1077; + +BB26_3: + add.s32 %r18, %r29, -1; + setp.lt.u32 %p3, %r18, 2146435071; + @%p3 bra BB26_5; + bra.uni BB26_4; + +BB26_5: + shr.u32 %r20, %r29, 20; + add.s32 %r32, %r31, %r20; + and.b32 %r21, %r29, -2146435073; + or.b32 %r22, %r21, 1072693248; + mov.b64 %fd57, {%r30, %r22}; + setp.lt.s32 %p5, %r22, 1073127583; + @%p5 bra BB26_7; { .reg .b32 %temp; - mov.b64 {%r19, %temp}, %fd132; + mov.b64 {%r23, %temp}, %fd57; } { .reg .b32 %temp; - mov.b64 {%temp, %r20}, %fd132; + mov.b64 {%temp, %r24}, %fd57; } - add.s32 %r21, %r20, -1048576; - mov.b64 %fd132, {%r19, %r21}; - add.s32 %r51, %r50, -1022; + add.s32 %r25, %r24, -1048576; + mov.b64 %fd57, {%r23, %r25}; + add.s32 %r32, %r32, 1; -BB23_4: - add.f64 %fd16, %fd132, 0d3FF0000000000000; +BB26_7: + add.f64 %fd13, %fd57, 0d3FF0000000000000; // inline asm - rcp.approx.ftz.f64 %fd15,%fd16; + rcp.approx.ftz.f64 %fd12,%fd13; // inline asm - neg.f64 %fd17, %fd16; - mov.f64 %fd18, 0d3FF0000000000000; - fma.rn.f64 %fd19, %fd17, %fd15, %fd18; - fma.rn.f64 %fd20, %fd19, %fd19, %fd19; - fma.rn.f64 %fd21, %fd20, %fd15, %fd15; - add.f64 %fd22, %fd132, 0dBFF0000000000000; - mul.f64 %fd23, %fd22, %fd21; - fma.rn.f64 %fd24, %fd22, %fd21, %fd23; - mul.f64 %fd25, %fd24, %fd24; - mov.f64 %fd26, 0d3ED0F5D241AD3B5A; - mov.f64 %fd27, 0d3EB0F5FF7D2CAFE2; - fma.rn.f64 %fd28, %fd27, %fd25, %fd26; - mov.f64 %fd29, 0d3EF3B20A75488A3F; - fma.rn.f64 %fd30, %fd28, %fd25, %fd29; + neg.f64 %fd14, %fd13; + mov.f64 %fd15, 0d3FF0000000000000; + fma.rn.f64 %fd16, %fd14, %fd12, %fd15; + fma.rn.f64 %fd17, %fd16, %fd16, %fd16; + fma.rn.f64 %fd18, %fd17, %fd12, %fd12; + add.f64 %fd19, %fd57, 0dBFF0000000000000; + mul.f64 %fd20, %fd19, %fd18; + fma.rn.f64 %fd21, %fd19, %fd18, %fd20; + mul.f64 %fd22, %fd21, %fd21; + mov.f64 %fd23, 0d3ED0EE258B7A8B04; + mov.f64 %fd24, 0d3EB1380B3AE80F1E; + fma.rn.f64 %fd25, %fd24, %fd22, %fd23; + mov.f64 %fd26, 0d3EF3B2669F02676F; + fma.rn.f64 %fd27, %fd25, %fd22, %fd26; + mov.f64 %fd28, 0d3F1745CBA9AB0956; + 
fma.rn.f64 %fd29, %fd27, %fd22, %fd28; + mov.f64 %fd30, 0d3F3C71C72D1B5154; + fma.rn.f64 %fd31, %fd29, %fd22, %fd30; + mov.f64 %fd32, 0d3F624924923BE72D; + fma.rn.f64 %fd33, %fd31, %fd22, %fd32; + mov.f64 %fd34, 0d3F8999999999A3C4; + fma.rn.f64 %fd35, %fd33, %fd22, %fd34; + mov.f64 %fd36, 0d3FB5555555555554; + fma.rn.f64 %fd37, %fd35, %fd22, %fd36; + sub.f64 %fd38, %fd19, %fd21; + add.f64 %fd39, %fd38, %fd38; + neg.f64 %fd40, %fd21; + fma.rn.f64 %fd41, %fd40, %fd19, %fd39; + mul.f64 %fd42, %fd18, %fd41; + mul.f64 %fd43, %fd22, %fd37; + fma.rn.f64 %fd44, %fd43, %fd21, %fd42; + xor.b32 %r26, %r32, -2147483648; + mov.u32 %r27, 1127219200; + mov.b64 %fd45, {%r26, %r27}; + mov.u32 %r28, -2147483648; + mov.b64 %fd46, {%r28, %r27}; + sub.f64 %fd47, %fd45, %fd46; + mov.f64 %fd48, 0d3FE62E42FEFA39EF; + fma.rn.f64 %fd49, %fd47, %fd48, %fd21; + neg.f64 %fd50, %fd47; + fma.rn.f64 %fd51, %fd50, %fd48, %fd49; + sub.f64 %fd52, %fd51, %fd21; + sub.f64 %fd53, %fd44, %fd52; + mov.f64 %fd54, 0d3C7ABC9E3B39803F; + fma.rn.f64 %fd55, %fd47, %fd54, %fd53; + add.f64 %fd58, %fd49, %fd55; + bra.uni BB26_8; + +BB26_4: + mov.f64 %fd10, 0d7FF0000000000000; + fma.rn.f64 %fd11, %fd56, %fd10, %fd10; + { + .reg .b32 %temp; + mov.b64 {%temp, %r19}, %fd56; + } + mov.b32 %f1, %r19; + setp.eq.f32 %p4, %f1, 0f00000000; + selp.f64 %fd58, 0dFFF0000000000000, %fd11, %p4; + +BB26_8: + cvta.to.global.u64 %rd7, %rd3; + shl.b64 %rd8, %rd1, 3; + add.s64 %rd9, %rd7, %rd8; + st.global.f64 [%rd9], %fd58; + +BB26_9: + ret; +} + + // .globl matrix_floor +.visible .entry matrix_floor( + .param .u64 matrix_floor_param_0, + .param .u64 matrix_floor_param_1, + .param .u32 matrix_floor_param_2 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<6>; + .reg .f64 %fd<3>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [matrix_floor_param_0]; + ld.param.u64 %rd2, [matrix_floor_param_1]; + ld.param.u32 %r2, [matrix_floor_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.u32 %p1, %r1, %r2; + @%p1 bra BB27_2; + + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, [%rd5]; + cvt.rmi.f64.f64 %fd2, %fd1; + cvta.to.global.u64 %rd6, %rd2; + add.s64 %rd7, %rd6, %rd4; + st.global.f64 [%rd7], %fd2; + +BB27_2: + ret; +} + + // .globl matrix_ceil +.visible .entry matrix_ceil( + .param .u64 matrix_ceil_param_0, + .param .u64 matrix_ceil_param_1, + .param .u32 matrix_ceil_param_2 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<6>; + .reg .f64 %fd<3>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [matrix_ceil_param_0]; + ld.param.u64 %rd2, [matrix_ceil_param_1]; + ld.param.u32 %r2, [matrix_ceil_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.u32 %p1, %r1, %r2; + @%p1 bra BB28_2; + + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, [%rd5]; + cvt.rpi.f64.f64 %fd2, %fd1; + cvta.to.global.u64 %rd6, %rd2; + add.s64 %rd7, %rd6, %rd4; + st.global.f64 [%rd7], %fd2; + +BB28_2: + ret; +} + + // .globl matrix_sin +.visible .entry matrix_sin( + .param .u64 matrix_sin_param_0, + .param .u64 matrix_sin_param_1, + .param .u32 matrix_sin_param_2 +) +{ + .local .align 4 .b8 __local_depot29[4]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<7>; + .reg .b32 %r<18>; + .reg .f64 %fd<41>; + .reg .b64 %rd<17>; + + + mov.u64 %rd16, __local_depot29; + cvta.local.u64 %SP, %rd16; + ld.param.u64 %rd3, [matrix_sin_param_0]; + ld.param.u64 %rd4, 
[matrix_sin_param_1]; + ld.param.u32 %r5, [matrix_sin_param_2]; + add.u64 %rd5, %SP, 0; + cvta.to.local.u64 %rd1, %rd5; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r6, %r7, %r8; + setp.ge.u32 %p1, %r1, %r5; + @%p1 bra BB29_11; + + cvta.to.global.u64 %rd6, %rd3; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd7, %r1, 8; + add.s64 %rd8, %rd6, %rd7; + ld.global.f64 %fd38, [%rd8]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r9}, %fd38; + } + and.b32 %r10, %r9, 2147483647; + setp.ne.s32 %p2, %r10, 2146435072; + @%p2 bra BB29_4; + + { + .reg .b32 %temp; + mov.b64 {%r11, %temp}, %fd38; + } + setp.ne.s32 %p3, %r11, 0; + @%p3 bra BB29_4; + + mov.f64 %fd14, 0d0000000000000000; + mul.rn.f64 %fd38, %fd38, %fd14; + +BB29_4: + mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; + cvt.rni.s32.f64 %r17, %fd15; + st.local.u32 [%rd1], %r17; + cvt.rn.f64.s32 %fd16, %r17; + neg.f64 %fd17, %fd16; + mov.f64 %fd18, 0d3FF921FB54442D18; + fma.rn.f64 %fd19, %fd17, %fd18, %fd38; + mov.f64 %fd20, 0d3C91A62633145C00; + fma.rn.f64 %fd21, %fd17, %fd20, %fd19; + mov.f64 %fd22, 0d397B839A252049C0; + fma.rn.f64 %fd39, %fd17, %fd22, %fd21; + { + .reg .b32 %temp; + mov.b64 {%temp, %r12}, %fd38; + } + and.b32 %r13, %r12, 2145386496; + setp.lt.u32 %p4, %r13, 1105199104; + @%p4 bra BB29_6; + + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd38; + .param .b64 param1; + st.param.b64 [param1+0], %rd5; + .param .b64 retval0; + call.uni (retval0), + __internal_trig_reduction_slowpathd, + ( + param0, + param1 + ); + ld.param.f64 %fd39, [retval0+0]; + + //{ + }// Callseq End 3 + ld.local.u32 %r17, [%rd1]; + +BB29_6: + and.b32 %r14, %r17, 1; + shl.b32 %r15, %r14, 3; + setp.eq.s32 %p5, %r14, 0; + selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5; + mul.wide.u32 %rd10, %r15, 8; + mov.u64 %rd11, __cudart_sin_cos_coeffs; + add.s64 %rd12, %rd10, %rd11; + ld.const.f64 %fd24, [%rd12+8]; + mul.rn.f64 %fd7, %fd39, %fd39; + fma.rn.f64 %fd25, %fd23, %fd7, %fd24; + ld.const.f64 %fd26, [%rd12+16]; + fma.rn.f64 %fd27, %fd25, %fd7, %fd26; + ld.const.f64 %fd28, [%rd12+24]; + fma.rn.f64 %fd29, %fd27, %fd7, %fd28; + ld.const.f64 %fd30, [%rd12+32]; + fma.rn.f64 %fd31, %fd29, %fd7, %fd30; + ld.const.f64 %fd32, [%rd12+40]; + fma.rn.f64 %fd33, %fd31, %fd7, %fd32; + ld.const.f64 %fd34, [%rd12+48]; + fma.rn.f64 %fd8, %fd33, %fd7, %fd34; + fma.rn.f64 %fd40, %fd8, %fd39, %fd39; + @%p5 bra BB29_8; + + mov.f64 %fd35, 0d3FF0000000000000; + fma.rn.f64 %fd40, %fd8, %fd7, %fd35; + +BB29_8: + and.b32 %r16, %r17, 2; + setp.eq.s32 %p6, %r16, 0; + @%p6 bra BB29_10; + + mov.f64 %fd36, 0d0000000000000000; + mov.f64 %fd37, 0dBFF0000000000000; + fma.rn.f64 %fd40, %fd40, %fd37, %fd36; + +BB29_10: + cvta.to.global.u64 %rd13, %rd4; + shl.b64 %rd14, %rd2, 3; + add.s64 %rd15, %rd13, %rd14; + st.global.f64 [%rd15], %fd40; + +BB29_11: + ret; +} + + // .globl matrix_cos +.visible .entry matrix_cos( + .param .u64 matrix_cos_param_0, + .param .u64 matrix_cos_param_1, + .param .u32 matrix_cos_param_2 +) +{ + .local .align 4 .b8 __local_depot30[4]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<7>; + .reg .b32 %r<19>; + .reg .f64 %fd<41>; + .reg .b64 %rd<17>; + + + mov.u64 %rd16, __local_depot30; + cvta.local.u64 %SP, %rd16; + ld.param.u64 %rd3, [matrix_cos_param_0]; + ld.param.u64 %rd4, [matrix_cos_param_1]; + ld.param.u32 %r6, [matrix_cos_param_2]; + add.u64 %rd5, %SP, 0; + cvta.to.local.u64 %rd1, %rd5; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %ctaid.x; + mov.u32 %r9, %tid.x; + 
mad.lo.s32 %r1, %r7, %r8, %r9; + setp.ge.u32 %p1, %r1, %r6; + @%p1 bra BB30_11; + + cvta.to.global.u64 %rd6, %rd3; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd7, %r1, 8; + add.s64 %rd8, %rd6, %rd7; + ld.global.f64 %fd38, [%rd8]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r10}, %fd38; + } + and.b32 %r11, %r10, 2147483647; + setp.ne.s32 %p2, %r11, 2146435072; + @%p2 bra BB30_4; + + { + .reg .b32 %temp; + mov.b64 {%r12, %temp}, %fd38; + } + setp.ne.s32 %p3, %r12, 0; + @%p3 bra BB30_4; + + mov.f64 %fd14, 0d0000000000000000; + mul.rn.f64 %fd38, %fd38, %fd14; + +BB30_4: + mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; + cvt.rni.s32.f64 %r18, %fd15; + st.local.u32 [%rd1], %r18; + cvt.rn.f64.s32 %fd16, %r18; + neg.f64 %fd17, %fd16; + mov.f64 %fd18, 0d3FF921FB54442D18; + fma.rn.f64 %fd19, %fd17, %fd18, %fd38; + mov.f64 %fd20, 0d3C91A62633145C00; + fma.rn.f64 %fd21, %fd17, %fd20, %fd19; + mov.f64 %fd22, 0d397B839A252049C0; + fma.rn.f64 %fd39, %fd17, %fd22, %fd21; + { + .reg .b32 %temp; + mov.b64 {%temp, %r13}, %fd38; + } + and.b32 %r14, %r13, 2145386496; + setp.lt.u32 %p4, %r14, 1105199104; + @%p4 bra BB30_6; + + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd38; + .param .b64 param1; + st.param.b64 [param1+0], %rd5; + .param .b64 retval0; + call.uni (retval0), + __internal_trig_reduction_slowpathd, + ( + param0, + param1 + ); + ld.param.f64 %fd39, [retval0+0]; + + //{ + }// Callseq End 4 + ld.local.u32 %r18, [%rd1]; + +BB30_6: + add.s32 %r5, %r18, 1; + and.b32 %r15, %r5, 1; + shl.b32 %r16, %r15, 3; + setp.eq.s32 %p5, %r15, 0; + selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5; + mul.wide.u32 %rd10, %r16, 8; + mov.u64 %rd11, __cudart_sin_cos_coeffs; + add.s64 %rd12, %rd10, %rd11; + ld.const.f64 %fd24, [%rd12+8]; + mul.rn.f64 %fd7, %fd39, %fd39; + fma.rn.f64 %fd25, %fd23, %fd7, %fd24; + ld.const.f64 %fd26, [%rd12+16]; + fma.rn.f64 %fd27, %fd25, %fd7, %fd26; + ld.const.f64 %fd28, [%rd12+24]; + fma.rn.f64 %fd29, %fd27, %fd7, %fd28; + ld.const.f64 %fd30, [%rd12+32]; + fma.rn.f64 %fd31, %fd29, %fd7, %fd30; + ld.const.f64 %fd32, [%rd12+40]; + fma.rn.f64 %fd33, %fd31, %fd7, %fd32; + ld.const.f64 %fd34, [%rd12+48]; + fma.rn.f64 %fd8, %fd33, %fd7, %fd34; + fma.rn.f64 %fd40, %fd8, %fd39, %fd39; + @%p5 bra BB30_8; + + mov.f64 %fd35, 0d3FF0000000000000; + fma.rn.f64 %fd40, %fd8, %fd7, %fd35; + +BB30_8: + and.b32 %r17, %r5, 2; + setp.eq.s32 %p6, %r17, 0; + @%p6 bra BB30_10; + + mov.f64 %fd36, 0d0000000000000000; + mov.f64 %fd37, 0dBFF0000000000000; + fma.rn.f64 %fd40, %fd40, %fd37, %fd36; + +BB30_10: + cvta.to.global.u64 %rd13, %rd4; + shl.b64 %rd14, %rd2, 3; + add.s64 %rd15, %rd13, %rd14; + st.global.f64 [%rd15], %fd40; + +BB30_11: + ret; +} + + // .globl matrix_tan +.visible .entry matrix_tan( + .param .u64 matrix_tan_param_0, + .param .u64 matrix_tan_param_1, + .param .u32 matrix_tan_param_2 +) +{ + .local .align 4 .b8 __local_depot31[4]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<6>; + .reg .b32 %r<16>; + .reg .f64 %fd<66>; + .reg .b64 %rd<14>; + + + mov.u64 %rd13, __local_depot31; + cvta.local.u64 %SP, %rd13; + ld.param.u64 %rd3, [matrix_tan_param_0]; + ld.param.u64 %rd4, [matrix_tan_param_1]; + ld.param.u32 %r5, [matrix_tan_param_2]; + add.u64 %rd5, %SP, 0; + cvta.to.local.u64 %rd1, %rd5; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r6, %r7, %r8; + setp.ge.u32 %p1, %r1, %r5; + @%p1 bra BB31_9; + + cvta.to.global.u64 %rd6, %rd3; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd7, %r1, 8; + 
add.s64 %rd8, %rd6, %rd7; + ld.global.f64 %fd63, [%rd8]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r9}, %fd63; + } + and.b32 %r10, %r9, 2147483647; + setp.ne.s32 %p2, %r10, 2146435072; + @%p2 bra BB31_4; + + { + .reg .b32 %temp; + mov.b64 {%r11, %temp}, %fd63; + } + setp.ne.s32 %p3, %r11, 0; + @%p3 bra BB31_4; + + mov.f64 %fd11, 0d0000000000000000; + mul.rn.f64 %fd63, %fd63, %fd11; + +BB31_4: + mul.f64 %fd12, %fd63, 0d3FE45F306DC9C883; + cvt.rni.s32.f64 %r15, %fd12; + st.local.u32 [%rd1], %r15; + cvt.rn.f64.s32 %fd13, %r15; + neg.f64 %fd14, %fd13; + mov.f64 %fd15, 0d3FF921FB54442D18; + fma.rn.f64 %fd16, %fd14, %fd15, %fd63; + mov.f64 %fd17, 0d3C91A62633145C00; + fma.rn.f64 %fd18, %fd14, %fd17, %fd16; + mov.f64 %fd19, 0d397B839A252049C0; + fma.rn.f64 %fd64, %fd14, %fd19, %fd18; + { + .reg .b32 %temp; + mov.b64 {%temp, %r12}, %fd63; + } + and.b32 %r13, %r12, 2145386496; + setp.lt.u32 %p4, %r13, 1105199104; + @%p4 bra BB31_6; + + // Callseq Start 5 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd63; + .param .b64 param1; + st.param.b64 [param1+0], %rd5; + .param .b64 retval0; + call.uni (retval0), + __internal_trig_reduction_slowpathd, + ( + param0, + param1 + ); + ld.param.f64 %fd64, [retval0+0]; + + //{ + }// Callseq End 5 + ld.local.u32 %r15, [%rd1]; + +BB31_6: + mul.f64 %fd20, %fd64, %fd64; + mov.f64 %fd21, 0dBEF9757C5B27EBB1; + mov.f64 %fd22, 0d3EE48DAC2799BCB9; + fma.rn.f64 %fd23, %fd22, %fd20, %fd21; + mov.f64 %fd24, 0d3F0980E90FD91E04; + fma.rn.f64 %fd25, %fd23, %fd20, %fd24; + mov.f64 %fd26, 0dBEFAE2B0417D7E1D; + fma.rn.f64 %fd27, %fd25, %fd20, %fd26; + mov.f64 %fd28, 0d3F119F5341BFBA57; + fma.rn.f64 %fd29, %fd27, %fd20, %fd28; + mov.f64 %fd30, 0d3F15E791A00F6919; + fma.rn.f64 %fd31, %fd29, %fd20, %fd30; + mov.f64 %fd32, 0d3F2FF2E7FADEC73A; + fma.rn.f64 %fd33, %fd31, %fd20, %fd32; + mov.f64 %fd34, 0d3F434BC1B206DA62; + fma.rn.f64 %fd35, %fd33, %fd20, %fd34; + mov.f64 %fd36, 0d3F57DB18EF2F83F9; + fma.rn.f64 %fd37, %fd35, %fd20, %fd36; + mov.f64 %fd38, 0d3F6D6D2E7AE49FBC; + fma.rn.f64 %fd39, %fd37, %fd20, %fd38; + mov.f64 %fd40, 0d3F8226E3A816A776; + fma.rn.f64 %fd41, %fd39, %fd20, %fd40; + mov.f64 %fd42, 0d3F9664F485D25660; + fma.rn.f64 %fd43, %fd41, %fd20, %fd42; + mov.f64 %fd44, 0d3FABA1BA1BABF31D; + fma.rn.f64 %fd45, %fd43, %fd20, %fd44; + mov.f64 %fd46, 0d3FC11111111105D2; + fma.rn.f64 %fd47, %fd45, %fd20, %fd46; + mov.f64 %fd48, 0d3FD555555555555E; + fma.rn.f64 %fd49, %fd47, %fd20, %fd48; + mul.f64 %fd7, %fd20, %fd49; + fma.rn.f64 %fd65, %fd7, %fd64, %fd64; + and.b32 %r14, %r15, 1; + setp.eq.b32 %p5, %r14, 1; + @!%p5 bra BB31_8; + bra.uni BB31_7; + +BB31_7: + sub.f64 %fd52, %fd65, %fd64; + neg.f64 %fd53, %fd52; + fma.rn.f64 %fd54, %fd7, %fd64, %fd53; + // inline asm + rcp.approx.ftz.f64 %fd50,%fd65; + // inline asm + neg.f64 %fd55, %fd65; + mov.f64 %fd56, 0d3FF0000000000000; + fma.rn.f64 %fd57, %fd55, %fd50, %fd56; + fma.rn.f64 %fd58, %fd57, %fd57, %fd57; + fma.rn.f64 %fd59, %fd58, %fd50, %fd50; + neg.f64 %fd60, %fd59; + fma.rn.f64 %fd61, %fd65, %fd60, %fd56; + fma.rn.f64 %fd62, %fd60, %fd54, %fd61; + fma.rn.f64 %fd65, %fd62, %fd60, %fd60; + +BB31_8: + cvta.to.global.u64 %rd10, %rd4; + shl.b64 %rd11, %rd2, 3; + add.s64 %rd12, %rd10, %rd11; + st.global.f64 [%rd12], %fd65; + +BB31_9: + ret; +} + + // .globl matrix_asin +.visible .entry matrix_asin( + .param .u64 matrix_asin_param_0, + .param .u64 matrix_asin_param_1, + .param .u32 matrix_asin_param_2 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<3>; + .reg .b32 %r<15>; + .reg .f64 %fd<83>; 
+ .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [matrix_asin_param_0]; + ld.param.u64 %rd3, [matrix_asin_param_1]; + ld.param.u32 %r3, [matrix_asin_param_2]; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r5, %r4, %r6; + setp.ge.u32 %p1, %r1, %r3; + @%p1 bra BB32_5; + + cvta.to.global.u64 %rd4, %rd2; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd1, [%rd6]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd1; + } + mov.b32 %f1, %r2; + abs.f32 %f2, %f1; + setp.lt.f32 %p2, %f2, 0f3FE26666; + @%p2 bra BB32_3; + bra.uni BB32_2; + +BB32_3: + mul.f64 %fd55, %fd1, %fd1; + mov.f64 %fd56, 0dBFB3823B180754AF; + mov.f64 %fd57, 0d3FB0066BDC1895E9; + fma.rn.f64 %fd58, %fd57, %fd55, %fd56; + mov.f64 %fd59, 0d3FB11E52CC2F79AE; + fma.rn.f64 %fd60, %fd58, %fd55, %fd59; + mov.f64 %fd61, 0dBF924EAF3526861B; + fma.rn.f64 %fd62, %fd60, %fd55, %fd61; + mov.f64 %fd63, 0d3F91DF02A31E6CB7; + fma.rn.f64 %fd64, %fd62, %fd55, %fd63; + mov.f64 %fd65, 0d3F847D18B0EEC6CC; + fma.rn.f64 %fd66, %fd64, %fd55, %fd65; + mov.f64 %fd67, 0d3F8D0AF961BA53B0; + fma.rn.f64 %fd68, %fd66, %fd55, %fd67; + mov.f64 %fd69, 0d3F91BF7734CF1C48; + fma.rn.f64 %fd70, %fd68, %fd55, %fd69; + mov.f64 %fd71, 0d3F96E91483144EF7; + fma.rn.f64 %fd72, %fd70, %fd55, %fd71; + mov.f64 %fd73, 0d3F9F1C6E0A4F9F81; + fma.rn.f64 %fd74, %fd72, %fd55, %fd73; + mov.f64 %fd75, 0d3FA6DB6DC27FA92B; + fma.rn.f64 %fd76, %fd74, %fd55, %fd75; + mov.f64 %fd77, 0d3FB333333320F91B; + fma.rn.f64 %fd78, %fd76, %fd55, %fd77; + mov.f64 %fd79, 0d3FC5555555555F4D; + fma.rn.f64 %fd80, %fd78, %fd55, %fd79; + mul.f64 %fd81, %fd55, %fd80; + fma.rn.f64 %fd82, %fd81, %fd1, %fd1; + bra.uni BB32_4; + +BB32_2: + abs.f64 %fd7, %fd1; + mov.f64 %fd8, 0d3FE0000000000000; + mov.f64 %fd9, 0dBFE0000000000000; + fma.rn.f64 %fd6, %fd9, %fd7, %fd8; + // inline asm + rsqrt.approx.ftz.f64 %fd5, %fd6; + // inline asm + { + .reg .b32 %temp; + mov.b64 {%r7, %temp}, %fd5; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r8}, %fd5; + } + add.s32 %r9, %r8, -1048576; + mov.b64 %fd10, {%r7, %r9}; + mul.f64 %fd11, %fd6, %fd5; + neg.f64 %fd12, %fd11; + fma.rn.f64 %fd13, %fd11, %fd12, %fd6; + fma.rn.f64 %fd14, %fd13, %fd10, %fd11; + neg.f64 %fd15, %fd14; + mov.f64 %fd16, 0d3FF0000000000000; + fma.rn.f64 %fd17, %fd5, %fd15, %fd16; + fma.rn.f64 %fd18, %fd17, %fd10, %fd10; + fma.rn.f64 %fd19, %fd14, %fd15, %fd6; + fma.rn.f64 %fd20, %fd19, %fd18, %fd14; + { + .reg .b32 %temp; + mov.b64 {%temp, %r10}, %fd6; + } + setp.lt.s32 %p3, %r10, 0; + selp.f64 %fd21, 0dFFF8000000000000, %fd20, %p3; + setp.equ.f64 %p4, %fd6, 0d0000000000000000; + selp.f64 %fd22, %fd6, %fd21, %p4; + mov.f64 %fd23, 0dBFB3823B180754AF; + mov.f64 %fd24, 0d3FB0066BDC1895E9; + fma.rn.f64 %fd25, %fd24, %fd6, %fd23; + mov.f64 %fd26, 0d3FB11E52CC2F79AE; + fma.rn.f64 %fd27, %fd25, %fd6, %fd26; + mov.f64 %fd28, 0dBF924EAF3526861B; + fma.rn.f64 %fd29, %fd27, %fd6, %fd28; + mov.f64 %fd30, 0d3F91DF02A31E6CB7; + fma.rn.f64 %fd31, %fd29, %fd6, %fd30; + mov.f64 %fd32, 0d3F847D18B0EEC6CC; + fma.rn.f64 %fd33, %fd31, %fd6, %fd32; + mov.f64 %fd34, 0d3F8D0AF961BA53B0; + fma.rn.f64 %fd35, %fd33, %fd6, %fd34; + mov.f64 %fd36, 0d3F91BF7734CF1C48; + fma.rn.f64 %fd37, %fd35, %fd6, %fd36; + mov.f64 %fd38, 0d3F96E91483144EF7; + fma.rn.f64 %fd39, %fd37, %fd6, %fd38; + mov.f64 %fd40, 0d3F9F1C6E0A4F9F81; + fma.rn.f64 %fd41, %fd39, %fd6, %fd40; + mov.f64 %fd42, 0d3FA6DB6DC27FA92B; + fma.rn.f64 %fd43, %fd41, %fd6, %fd42; + mov.f64 %fd44, 0d3FB333333320F91B; + fma.rn.f64 %fd45, %fd43, 
%fd6, %fd44; + mov.f64 %fd46, 0d3FC5555555555F4D; + fma.rn.f64 %fd47, %fd45, %fd6, %fd46; + mul.f64 %fd48, %fd6, %fd47; + mul.f64 %fd49, %fd22, 0dC000000000000000; + mov.f64 %fd50, 0d3C91A62633145C07; + fma.rn.f64 %fd51, %fd49, %fd48, %fd50; + add.f64 %fd52, %fd49, 0d3FE921FB54442D18; + add.f64 %fd53, %fd52, %fd51; + add.f64 %fd54, %fd53, 0d3FE921FB54442D18; + { + .reg .b32 %temp; + mov.b64 {%r11, %temp}, %fd54; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r12}, %fd54; + } + and.b32 %r13, %r2, -2147483648; + or.b32 %r14, %r12, %r13; + mov.b64 %fd82, {%r11, %r14}; + +BB32_4: + cvta.to.global.u64 %rd7, %rd3; + shl.b64 %rd8, %rd1, 3; + add.s64 %rd9, %rd7, %rd8; + st.global.f64 [%rd9], %fd82; + +BB32_5: + ret; +} + + // .globl matrix_acos +.visible .entry matrix_acos( + .param .u64 matrix_acos_param_0, + .param .u64 matrix_acos_param_1, + .param .u32 matrix_acos_param_2 +) +{ + .reg .pred %p<7>; + .reg .b32 %r<17>; + .reg .f64 %fd<95>; + .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [matrix_acos_param_0]; + ld.param.u64 %rd3, [matrix_acos_param_1]; + ld.param.u32 %r4, [matrix_acos_param_2]; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r6, %r5, %r7; + setp.ge.u32 %p1, %r1, %r4; + @%p1 bra BB33_14; + + cvta.to.global.u64 %rd4, %rd2; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd16, [%rd6]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd16; + } + abs.f64 %fd1, %fd16; + { + .reg .b32 %temp; + mov.b64 {%temp, %r8}, %fd1; + } + setp.lt.s32 %p2, %r8, 1071801958; + @%p2 bra BB33_9; + bra.uni BB33_2; + +BB33_9: + mul.f64 %fd62, %fd1, %fd1; + mov.f64 %fd63, 0dBFB3823B180754AF; + mov.f64 %fd64, 0d3FB0066BDC1895E9; + fma.rn.f64 %fd65, %fd64, %fd62, %fd63; + mov.f64 %fd66, 0d3FB11E52CC2F79AE; + fma.rn.f64 %fd67, %fd65, %fd62, %fd66; + mov.f64 %fd68, 0dBF924EAF3526861B; + fma.rn.f64 %fd69, %fd67, %fd62, %fd68; + mov.f64 %fd70, 0d3F91DF02A31E6CB7; + fma.rn.f64 %fd71, %fd69, %fd62, %fd70; + mov.f64 %fd72, 0d3F847D18B0EEC6CC; + fma.rn.f64 %fd73, %fd71, %fd62, %fd72; + mov.f64 %fd74, 0d3F8D0AF961BA53B0; + fma.rn.f64 %fd75, %fd73, %fd62, %fd74; + mov.f64 %fd76, 0d3F91BF7734CF1C48; + fma.rn.f64 %fd77, %fd75, %fd62, %fd76; + mov.f64 %fd78, 0d3F96E91483144EF7; + fma.rn.f64 %fd79, %fd77, %fd62, %fd78; + mov.f64 %fd80, 0d3F9F1C6E0A4F9F81; + fma.rn.f64 %fd81, %fd79, %fd62, %fd80; + mov.f64 %fd82, 0d3FA6DB6DC27FA92B; + fma.rn.f64 %fd83, %fd81, %fd62, %fd82; + mov.f64 %fd84, 0d3FB333333320F91B; + fma.rn.f64 %fd85, %fd83, %fd62, %fd84; + mov.f64 %fd86, 0d3FC5555555555F4D; + fma.rn.f64 %fd87, %fd85, %fd62, %fd86; + mul.f64 %fd88, %fd62, %fd87; + fma.rn.f64 %fd10, %fd88, %fd1, %fd1; + setp.lt.s32 %p6, %r2, 0; + @%p6 bra BB33_11; + + mov.f64 %fd89, 0dBC91A62633145C07; + add.rn.f64 %fd90, %fd10, %fd89; + neg.f64 %fd93, %fd90; + bra.uni BB33_12; + +BB33_2: + mov.f64 %fd19, 0d3FF0000000000000; + sub.f64 %fd2, %fd19, %fd1; + { + .reg .b32 %temp; + mov.b64 {%r9, %temp}, %fd2; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r3}, %fd2; + } + add.s32 %r10, %r3, -1048576; + mov.b64 %fd18, {%r9, %r10}; + // inline asm + rsqrt.approx.ftz.f64 %fd17, %fd18; + // inline asm + { + .reg .b32 %temp; + mov.b64 {%r11, %temp}, %fd17; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r12}, %fd17; + } + add.s32 %r13, %r12, -1048576; + mov.b64 %fd20, {%r11, %r13}; + mul.f64 %fd21, %fd18, %fd17; + neg.f64 %fd22, %fd21; + fma.rn.f64 %fd23, %fd21, %fd22, %fd18; + fma.rn.f64 %fd24, %fd23, %fd20, %fd21; + neg.f64 %fd25, %fd24; + fma.rn.f64 %fd26, %fd17, 
%fd25, %fd19; + fma.rn.f64 %fd27, %fd26, %fd20, %fd20; + fma.rn.f64 %fd28, %fd24, %fd25, %fd18; + fma.rn.f64 %fd3, %fd28, %fd27, %fd24; + setp.lt.s32 %p3, %r3, 1; + @%p3 bra BB33_4; + + { + .reg .b32 %temp; + mov.b64 {%r14, %temp}, %fd3; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r15}, %fd3; + } + add.s32 %r16, %r15, 1048576; + mov.b64 %fd29, {%r14, %r16}; + mov.f64 %fd30, 0dBEBAC2FE66FAAC4B; + mov.f64 %fd31, 0d3EC715B371155F70; + fma.rn.f64 %fd32, %fd31, %fd2, %fd30; + mov.f64 %fd33, 0d3ED9A9B88EFCD9B8; + fma.rn.f64 %fd34, %fd32, %fd2, %fd33; + mov.f64 %fd35, 0d3EDD0F40A8A0C4C3; + fma.rn.f64 %fd36, %fd34, %fd2, %fd35; + mov.f64 %fd37, 0d3EF46D4CFA9E0E1F; + fma.rn.f64 %fd38, %fd36, %fd2, %fd37; + mov.f64 %fd39, 0d3F079C168D1E2422; + fma.rn.f64 %fd40, %fd38, %fd2, %fd39; + mov.f64 %fd41, 0d3F1C9A88C3BCA540; + fma.rn.f64 %fd42, %fd40, %fd2, %fd41; + mov.f64 %fd43, 0d3F31C4E64BD476DF; + fma.rn.f64 %fd44, %fd42, %fd2, %fd43; + mov.f64 %fd45, 0d3F46E8BA60009C8F; + fma.rn.f64 %fd46, %fd44, %fd2, %fd45; + mov.f64 %fd47, 0d3F5F1C71C62B05A2; + fma.rn.f64 %fd48, %fd46, %fd2, %fd47; + mov.f64 %fd49, 0d3F76DB6DB6DC9F2C; + fma.rn.f64 %fd50, %fd48, %fd2, %fd49; + mov.f64 %fd51, 0d3F9333333333329C; + fma.rn.f64 %fd52, %fd50, %fd2, %fd51; + mov.f64 %fd53, 0d3FB5555555555555; + fma.rn.f64 %fd54, %fd52, %fd2, %fd53; + mul.f64 %fd55, %fd2, %fd54; + fma.rn.f64 %fd94, %fd55, %fd29, %fd29; + bra.uni BB33_5; + +BB33_11: + mov.f64 %fd91, 0d3C91A62633145C07; + add.rn.f64 %fd93, %fd10, %fd91; + +BB33_12: + mov.f64 %fd92, 0d3FF921FB54442D18; + add.rn.f64 %fd94, %fd92, %fd93; + bra.uni BB33_13; + +BB33_4: + mov.f64 %fd56, 0d0000000000000000; + mul.rn.f64 %fd94, %fd1, %fd56; + +BB33_5: + setp.gt.s32 %p4, %r3, -1; + @%p4 bra BB33_7; + + mov.f64 %fd57, 0d7FF0000000000000; + mul.rn.f64 %fd94, %fd94, %fd57; + +BB33_7: + setp.gt.s32 %p5, %r2, -1; + @%p5 bra BB33_13; + + mov.f64 %fd58, 0dBCA1A62633145C07; + add.rn.f64 %fd59, %fd94, %fd58; + neg.f64 %fd60, %fd59; + mov.f64 %fd61, 0d400921FB54442D18; + add.rn.f64 %fd94, %fd61, %fd60; + +BB33_13: + cvta.to.global.u64 %rd7, %rd3; + shl.b64 %rd8, %rd1, 3; + add.s64 %rd9, %rd7, %rd8; + st.global.f64 [%rd9], %fd94; + +BB33_14: + ret; +} + + // .globl matrix_atan +.visible .entry matrix_atan( + .param .u64 matrix_atan_param_0, + .param .u64 matrix_atan_param_1, + .param .u32 matrix_atan_param_2 +) +{ + .reg .pred %p<5>; + .reg .b32 %r<11>; + .reg .f64 %fd<57>; + .reg .b64 %rd<10>; + + + ld.param.u64 %rd2, [matrix_atan_param_0]; + ld.param.u64 %rd3, [matrix_atan_param_1]; + ld.param.u32 %r2, [matrix_atan_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.u32 %p1, %r1, %r2; + @%p1 bra BB34_4; + + cvta.to.global.u64 %rd4, %rd2; + cvt.s64.s32 %rd1, %r1; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd1, [%rd6]; + abs.f64 %fd2, %fd1; + setp.leu.f64 %p2, %fd2, 0d3FF0000000000000; + mov.f64 %fd56, %fd2; + @%p2 bra BB34_3; + + // inline asm + rcp.approx.ftz.f64 %fd5,%fd2; + // inline asm + neg.f64 %fd7, %fd2; + mov.f64 %fd8, 0d3FF0000000000000; + fma.rn.f64 %fd9, %fd7, %fd5, %fd8; + fma.rn.f64 %fd10, %fd9, %fd9, %fd9; + fma.rn.f64 %fd11, %fd10, %fd5, %fd5; + setp.eq.f64 %p3, %fd2, 0d7FF0000000000000; + selp.f64 %fd3, 0d0000000000000000, %fd11, %p3; + mov.f64 %fd56, %fd3; + +BB34_3: + mov.f64 %fd4, %fd56; + cvta.to.global.u64 %rd7, %rd3; + mul.f64 %fd12, %fd4, %fd4; + mov.f64 %fd13, 0d3F2D3B63DBB65B49; + mov.f64 %fd14, 0dBEF53E1D2A25FF7E; + fma.rn.f64 %fd15, %fd14, %fd12, %fd13; + 
mov.f64 %fd16, 0dBF5312788DDE082E; + fma.rn.f64 %fd17, %fd15, %fd12, %fd16; + mov.f64 %fd18, 0d3F6F9690C8249315; + fma.rn.f64 %fd19, %fd17, %fd12, %fd18; + mov.f64 %fd20, 0dBF82CF5AABC7CF0D; + fma.rn.f64 %fd21, %fd19, %fd12, %fd20; + mov.f64 %fd22, 0d3F9162B0B2A3BFDE; + fma.rn.f64 %fd23, %fd21, %fd12, %fd22; + mov.f64 %fd24, 0dBF9A7256FEB6FC6B; + fma.rn.f64 %fd25, %fd23, %fd12, %fd24; + mov.f64 %fd26, 0d3FA171560CE4A489; + fma.rn.f64 %fd27, %fd25, %fd12, %fd26; + mov.f64 %fd28, 0dBFA4F44D841450E4; + fma.rn.f64 %fd29, %fd27, %fd12, %fd28; + mov.f64 %fd30, 0d3FA7EE3D3F36BB95; + fma.rn.f64 %fd31, %fd29, %fd12, %fd30; + mov.f64 %fd32, 0dBFAAD32AE04A9FD1; + fma.rn.f64 %fd33, %fd31, %fd12, %fd32; + mov.f64 %fd34, 0d3FAE17813D66954F; + fma.rn.f64 %fd35, %fd33, %fd12, %fd34; + mov.f64 %fd36, 0dBFB11089CA9A5BCD; + fma.rn.f64 %fd37, %fd35, %fd12, %fd36; + mov.f64 %fd38, 0d3FB3B12B2DB51738; + fma.rn.f64 %fd39, %fd37, %fd12, %fd38; + mov.f64 %fd40, 0dBFB745D022F8DC5C; + fma.rn.f64 %fd41, %fd39, %fd12, %fd40; + mov.f64 %fd42, 0d3FBC71C709DFE927; + fma.rn.f64 %fd43, %fd41, %fd12, %fd42; + mov.f64 %fd44, 0dBFC2492491FA1744; + fma.rn.f64 %fd45, %fd43, %fd12, %fd44; + mov.f64 %fd46, 0d3FC99999999840D2; + fma.rn.f64 %fd47, %fd45, %fd12, %fd46; + mov.f64 %fd48, 0dBFD555555555544C; + fma.rn.f64 %fd49, %fd47, %fd12, %fd48; + mul.f64 %fd50, %fd12, %fd49; + fma.rn.f64 %fd51, %fd50, %fd4, %fd4; + mov.f64 %fd52, 0d3FF921FB54442D18; + sub.f64 %fd53, %fd52, %fd51; + setp.gt.f64 %p4, %fd2, 0d3FF0000000000000; + selp.f64 %fd54, %fd53, %fd51, %p4; + { + .reg .b32 %temp; + mov.b64 {%r6, %temp}, %fd54; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r7}, %fd54; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r8}, %fd1; + } + and.b32 %r9, %r8, -2147483648; + or.b32 %r10, %r7, %r9; + mov.b64 %fd55, {%r6, %r10}; + shl.b64 %rd8, %rd1, 3; + add.s64 %rd9, %rd7, %rd8; + st.global.f64 [%rd9], %fd55; + +BB34_4: + ret; +} + + // .globl matrix_sign +.visible .entry matrix_sign( + .param .u64 matrix_sign_param_0, + .param .u64 matrix_sign_param_1, + .param .u32 matrix_sign_param_2 +) +{ + .reg .pred %p<3>; + .reg .b32 %r<12>; + .reg .f64 %fd<4>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd2, [matrix_sign_param_0]; + ld.param.u64 %rd3, [matrix_sign_param_1]; + ld.param.u32 %r2, [matrix_sign_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.u32 %p1, %r1, %r2; + @%p1 bra BB35_4; + + cvta.to.global.u64 %rd4, %rd2; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd1, [%rd6]; + setp.eq.f64 %p2, %fd1, 0d0000000000000000; + cvta.to.global.u64 %rd7, %rd3; + add.s64 %rd1, %rd7, %rd5; + @%p2 bra BB35_3; + bra.uni BB35_2; + +BB35_3: + mov.u64 %rd8, 0; + st.global.u64 [%rd1], %rd8; + bra.uni BB35_4; + +BB35_2: + { + .reg .b32 %temp; + mov.b64 {%temp, %r6}, %fd1; + } + and.b32 %r7, %r6, -2147483648; + mov.f64 %fd2, 0d3FF0000000000000; + { + .reg .b32 %temp; + mov.b64 {%temp, %r8}, %fd2; + } + and.b32 %r9, %r8, 2147483647; + or.b32 %r10, %r9, %r7; + { + .reg .b32 %temp; + mov.b64 {%r11, %temp}, %fd2; + } + mov.b64 %fd3, {%r11, %r10}; + st.global.f64 [%rd1], %fd3; + +BB35_4: + ret; +} + +.func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd( + .param .b64 __internal_trig_reduction_slowpathd_param_0, + .param .b64 __internal_trig_reduction_slowpathd_param_1 +) +{ + .local .align 8 .b8 __local_depot36[40]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<9>; + .reg .b32 %r<42>; + .reg .f64 %fd<5>; + .reg .b64 %rd<101>; + + + mov.u64 %rd100, 
__local_depot36; + cvta.local.u64 %SP, %rd100; + ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0]; + ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1]; + add.u64 %rd38, %SP, 0; + cvta.to.local.u64 %rd1, %rd38; + { + .reg .b32 %temp; + mov.b64 {%temp, %r1}, %fd4; + } + and.b32 %r40, %r1, -2147483648; + shr.u32 %r3, %r1, 20; + bfe.u32 %r4, %r1, 20, 11; + setp.eq.s32 %p1, %r4, 2047; + @%p1 bra BB36_13; + + add.s32 %r16, %r4, -1024; + shr.u32 %r17, %r16, 6; + mov.u32 %r18, 16; + sub.s32 %r5, %r18, %r17; + mov.u32 %r19, 19; + sub.s32 %r20, %r19, %r17; + mov.u32 %r21, 18; + min.s32 %r6, %r21, %r20; + setp.gt.s32 %p2, %r5, %r6; + mov.u64 %rd94, 0; + mov.u64 %rd93, %rd1; + @%p2 bra BB36_4; + + mov.b64 %rd41, %fd4; + shl.b64 %rd42, %rd41, 11; + or.b64 %rd3, %rd42, -9223372036854775808; + add.s32 %r7, %r5, -1; + mov.u64 %rd92, %rd1; + bfe.u32 %r22, %r1, 20, 11; + add.s32 %r23, %r22, -1024; + shr.u32 %r24, %r23, 6; + neg.s32 %r25, %r24; + mul.wide.s32 %rd43, %r25, 8; + mov.u64 %rd44, __cudart_i2opi_d; + add.s64 %rd45, %rd43, %rd44; + add.s64 %rd90, %rd45, 120; + mov.u64 %rd94, 0; + mov.u64 %rd91, %rd1; + mov.u32 %r39, %r7; + +BB36_3: + .pragma "nounroll"; + mov.u32 %r8, %r39; + mov.u64 %rd7, %rd91; + ld.const.u64 %rd48, [%rd90]; + // inline asm + { + .reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi; + mov.b64 {alo,ahi}, %rd48; + mov.b64 {blo,bhi}, %rd3; + mov.b64 {clo,chi}, %rd94; + mad.lo.cc.u32 r0, alo, blo, clo; + madc.hi.cc.u32 r1, alo, blo, chi; + madc.hi.u32 r2, alo, bhi, 0; + mad.lo.cc.u32 r1, alo, bhi, r1; + madc.hi.cc.u32 r2, ahi, blo, r2; + madc.hi.u32 r3, ahi, bhi, 0; + mad.lo.cc.u32 r1, ahi, blo, r1; + madc.lo.cc.u32 r2, ahi, bhi, r2; + addc.u32 r3, r3, 0; + mov.b64 %rd46, {r0,r1}; + mov.b64 %rd94, {r2,r3}; + } + // inline asm + st.local.u64 [%rd92], %rd46; + add.s32 %r9, %r8, 1; + sub.s32 %r26, %r9, %r7; + mul.wide.s32 %rd51, %r26, 8; + add.s64 %rd92, %rd1, %rd51; + add.s64 %rd13, %rd7, 8; + mov.u64 %rd93, %rd13; + add.s64 %rd90, %rd90, 8; + setp.lt.s32 %p3, %r9, %r6; + mov.u64 %rd91, %rd13; + mov.u32 %r39, %r9; + @%p3 bra BB36_3; + +BB36_4: + st.local.u64 [%rd93], %rd94; + ld.local.u64 %rd95, [%rd1+16]; + ld.local.u64 %rd96, [%rd1+24]; + and.b32 %r10, %r3, 63; + setp.eq.s32 %p4, %r10, 0; + @%p4 bra BB36_6; + + mov.u32 %r27, 64; + sub.s32 %r28, %r27, %r10; + shl.b64 %rd52, %rd96, %r10; + shr.u64 %rd53, %rd95, %r28; + or.b64 %rd96, %rd52, %rd53; + shl.b64 %rd54, %rd95, %r10; + ld.local.u64 %rd55, [%rd1+8]; + shr.u64 %rd56, %rd55, %r28; + or.b64 %rd95, %rd56, %rd54; + +BB36_6: + cvta.to.local.u64 %rd57, %rd37; + shr.u64 %rd58, %rd96, 62; + cvt.u32.u64 %r29, %rd58; + shr.u64 %rd59, %rd95, 62; + shl.b64 %rd60, %rd96, 2; + or.b64 %rd98, %rd60, %rd59; + shl.b64 %rd97, %rd95, 2; + shr.u64 %rd61, %rd96, 61; + cvt.u32.u64 %r30, %rd61; + and.b32 %r31, %r30, 1; + add.s32 %r32, %r31, %r29; + neg.s32 %r33, %r32; + setp.eq.s32 %p5, %r40, 0; + selp.b32 %r34, %r32, %r33, %p5; + st.local.u32 [%rd57], %r34; + setp.eq.s32 %p6, %r31, 0; + @%p6 bra BB36_8; + + mov.u64 %rd65, 0; + // inline asm + { + .reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3; + mov.b64 {a0,a1}, %rd65; + mov.b64 {a2,a3}, %rd65; + mov.b64 {b0,b1}, %rd97; + mov.b64 {b2,b3}, %rd98; + sub.cc.u32 r0, a0, b0; + subc.cc.u32 r1, a1, b1; + subc.cc.u32 r2, a2, b2; + subc.u32 r3, a3, b3; + mov.b64 %rd97, {r0,r1}; + mov.b64 %rd98, {r2,r3}; + } + // inline asm + xor.b32 %r40, %r40, -2147483648; + +BB36_8: + clz.b64 %r41, %rd98; + setp.eq.s32 %p7, %r41, 0; + @%p7 bra BB36_10; + + shl.b64 %rd68, %rd98, 
%r41; + mov.u32 %r35, 64; + sub.s32 %r36, %r35, %r41; + shr.u64 %rd69, %rd97, %r36; + or.b64 %rd98, %rd69, %rd68; + +BB36_10: + mov.u64 %rd73, -3958705157555305931; + // inline asm + { + .reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi; + mov.b64 {alo,ahi}, %rd98; + mov.b64 {blo,bhi}, %rd73; + mul.lo.u32 r0, alo, blo; + mul.hi.u32 r1, alo, blo; + mad.lo.cc.u32 r1, alo, bhi, r1; + madc.hi.u32 r2, alo, bhi, 0; + mad.lo.cc.u32 r1, ahi, blo, r1; + madc.hi.cc.u32 r2, ahi, blo, r2; + madc.hi.u32 r3, ahi, bhi, 0; + mad.lo.cc.u32 r2, ahi, bhi, r2; + addc.u32 r3, r3, 0; + mov.b64 %rd70, {r0,r1}; + mov.b64 %rd99, {r2,r3}; + } + // inline asm + setp.lt.s64 %p8, %rd99, 1; + @%p8 bra BB36_12; + + // inline asm + { + .reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3; + mov.b64 {a0,a1}, %rd70; + mov.b64 {a2,a3}, %rd99; + mov.b64 {b0,b1}, %rd70; + mov.b64 {b2,b3}, %rd99; + add.cc.u32 r0, a0, b0; + addc.cc.u32 r1, a1, b1; + addc.cc.u32 r2, a2, b2; + addc.u32 r3, a3, b3; + mov.b64 %rd74, {r0,r1}; + mov.b64 %rd99, {r2,r3}; + } + // inline asm + add.s32 %r41, %r41, 1; + +BB36_12: + cvt.u64.u32 %rd80, %r40; + shl.b64 %rd81, %rd80, 32; + mov.u32 %r37, 1022; + sub.s32 %r38, %r37, %r41; + cvt.u64.u32 %rd82, %r38; + shl.b64 %rd83, %rd82, 52; + add.s64 %rd84, %rd99, 1; + shr.u64 %rd85, %rd84, 10; + add.s64 %rd86, %rd85, 1; + shr.u64 %rd87, %rd86, 1; + add.s64 %rd88, %rd87, %rd83; + or.b64 %rd89, %rd88, %rd81; + mov.b64 %fd4, %rd89; + +BB36_13: + st.param.f64 [func_retval0+0], %fd4; + ret; +} + +.func (.param .b64 func_retval0) __internal_accurate_pow( + .param .b64 __internal_accurate_pow_param_0, + .param .b64 __internal_accurate_pow_param_1 +) +{ + .reg .pred %p<9>; + .reg .f32 %f<3>; + .reg .b32 %r<52>; + .reg .f64 %fd<134>; + + + ld.param.f64 %fd12, [__internal_accurate_pow_param_0]; + ld.param.f64 %fd13, [__internal_accurate_pow_param_1]; + { + .reg .b32 %temp; + mov.b64 {%temp, %r49}, %fd12; + } + { + .reg .b32 %temp; + mov.b64 {%r48, %temp}, %fd12; + } + shr.u32 %r50, %r49, 20; + setp.ne.s32 %p1, %r50, 0; + @%p1 bra BB37_2; + + mul.f64 %fd14, %fd12, 0d4350000000000000; + { + .reg .b32 %temp; + mov.b64 {%temp, %r49}, %fd14; + } + { + .reg .b32 %temp; + mov.b64 {%r48, %temp}, %fd14; + } + shr.u32 %r16, %r49, 20; + add.s32 %r50, %r16, -54; + +BB37_2: + add.s32 %r51, %r50, -1023; + and.b32 %r17, %r49, -2146435073; + or.b32 %r18, %r17, 1072693248; + mov.b64 %fd132, {%r48, %r18}; + setp.lt.u32 %p2, %r18, 1073127583; + @%p2 bra BB37_4; + + { + .reg .b32 %temp; + mov.b64 {%r19, %temp}, %fd132; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r20}, %fd132; + } + add.s32 %r21, %r20, -1048576; + mov.b64 %fd132, {%r19, %r21}; + add.s32 %r51, %r50, -1022; + +BB37_4: + add.f64 %fd16, %fd132, 0d3FF0000000000000; + // inline asm + rcp.approx.ftz.f64 %fd15,%fd16; + // inline asm + neg.f64 %fd17, %fd16; + mov.f64 %fd18, 0d3FF0000000000000; + fma.rn.f64 %fd19, %fd17, %fd15, %fd18; + fma.rn.f64 %fd20, %fd19, %fd19, %fd19; + fma.rn.f64 %fd21, %fd20, %fd15, %fd15; + add.f64 %fd22, %fd132, 0dBFF0000000000000; + mul.f64 %fd23, %fd22, %fd21; + fma.rn.f64 %fd24, %fd22, %fd21, %fd23; + mul.f64 %fd25, %fd24, %fd24; + mov.f64 %fd26, 0d3ED0F5D241AD3B5A; + mov.f64 %fd27, 0d3EB0F5FF7D2CAFE2; + fma.rn.f64 %fd28, %fd27, %fd25, %fd26; + mov.f64 %fd29, 0d3EF3B20A75488A3F; + fma.rn.f64 %fd30, %fd28, %fd25, %fd29; mov.f64 %fd31, 0d3F1745CDE4FAECD5; fma.rn.f64 %fd32, %fd30, %fd25, %fd31; mov.f64 %fd33, 0d3F3C71C7258A578B; @@ -3499,13 +5071,13 @@ BB23_4: mov.b32 %f2, %r35; abs.f32 %f1, %f2; setp.lt.f32 %p4, %f1, 0f4086232B; - @%p4 bra BB23_7; + 
@%p4 bra BB37_7; setp.lt.f64 %p5, %fd4, 0d0000000000000000; add.f64 %fd129, %fd4, 0d7FF0000000000000; selp.f64 %fd133, 0d0000000000000000, %fd129, %p5; setp.geu.f32 %p6, %f1, 0f40874800; - @%p6 bra BB23_7; + @%p6 bra BB37_7; shr.u32 %r36, %r13, 31; add.s32 %r37, %r13, %r36; @@ -3520,26 +5092,26 @@ BB23_4: mov.b64 %fd131, {%r44, %r43}; mul.f64 %fd133, %fd130, %fd131; -BB23_7: +BB37_7: { .reg .b32 %temp; mov.b64 {%temp, %r45}, %fd133; } and.b32 %r46, %r45, 2147483647; setp.ne.s32 %p7, %r46, 2146435072; + @%p7 bra BB37_9; + { .reg .b32 %temp; mov.b64 {%r47, %temp}, %fd133; } - setp.ne.s32 %p8, %r47, 0; - or.pred %p9, %p8, %p7; - @!%p9 bra BB23_9; - bra.uni BB23_8; + setp.eq.s32 %p8, %r47, 0; + @%p8 bra BB37_10; -BB23_8: +BB37_9: fma.rn.f64 %fd133, %fd133, %fd5, %fd133; -BB23_9: +BB37_10: st.param.f64 [func_retval0+0], %fd133; ret; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/hops/UnaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java index c75d0e0..451960b 100644 --- a/src/main/java/org/apache/sysml/hops/UnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java @@ -157,8 +157,14 @@ public class UnaryOp extends Hop implements MultiThreadedHop else //default unary { int k = isCumulativeUnaryOperation() ? OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1; - if(_op == OpOp1.SELP || _op == OpOp1.EXP) { - et = findGPUExecTypeByMemEstimate(et); + switch(_op) { + case SELP:case EXP:case SQRT:case LOG:case ABS: + case ROUND:case FLOOR:case CEIL: + case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN: + case SIGN: + et = findGPUExecTypeByMemEstimate(et); + break; + default: } Unary unary1 = new Unary(input.constructLops(), HopsOpOp1LopsU.get(_op), getDataType(), getValueType(), et, k); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 4a45521..443d0eb 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -69,12 +69,27 @@ public class GPUInstructionParser extends InstructionParser String2GPUInstructionType.put( "^2" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case String2GPUInstructionType.put( "*2" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case String2GPUInstructionType.put( "-nz" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case - String2GPUInstructionType.put( "+*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); - String2GPUInstructionType.put( "-*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); + String2GPUInstructionType.put( "+*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); + String2GPUInstructionType.put( "-*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); // Builtin functions String2GPUInstructionType.put( "sel+" , GPUINSTRUCTION_TYPE.BuiltinUnary); String2GPUInstructionType.put( "exp" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "log" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "abs" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( 
"sqrt" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "round" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "floor" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "ceil" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "sin" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "cos" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "tan" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "asin" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "acos" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "atan" , GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "sign" , GPUINSTRUCTION_TYPE.BuiltinUnary); + + String2GPUInstructionType.put( "solve" , GPUINSTRUCTION_TYPE.BuiltinBinary);

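All of these kernels assume a one-thread-per-element launch and rely on the size guard (the setp.ge.u32 / bra test at the top of every PTX entry) to make surplus threads exit immediately. In SystemML the launches are issued from the Java runtime; the stand-alone CUDA C++ helper below is only a hypothetical illustration of that launch contract, and the block size of 256 as well as the name sign_on_gpu are assumptions for the example, not names from this commit:

    #include <cuda_runtime.h>

    // Kernel sketched above; assumed to be compiled with relocatable device code
    // so it can be declared here and defined elsewhere.
    extern "C" __global__ void matrix_sign(double *A, double *C, unsigned int size);

    // Hypothetical host-side helper (not part of the commit): launch enough threads
    // to cover every element; threads with index >= size return via the guard.
    void sign_on_gpu(double *dA, double *dC, unsigned int size) {
      const unsigned int threadsPerBlock = 256;  // illustrative choice
      const unsigned int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;
      matrix_sign<<<blocks, threadsPerBlock>>>(dA, dC, size);
      cudaDeviceSynchronize();                   // wait until the result is in dC
    }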