[SYSTEMML-1344] sqrt,round,abs,log,floor,ceil,trig funcs & sign for GPU

Closes #503


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/1fc764b9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/1fc764b9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/1fc764b9

Branch: refs/heads/master
Commit: 1fc764b9b099271822056a82e248acdbb785dc63
Parents: 0d553e3
Author: Nakul Jindal <[email protected]>
Authored: Wed May 17 10:55:51 2017 -0700
Committer: Nakul Jindal <[email protected]>
Committed: Wed May 17 10:55:51 2017 -0700

----------------------------------------------------------------------
 src/main/cpp/kernels/Makefile                   |   28 +
 src/main/cpp/kernels/SystemML.cu                |  187 ++
 src/main/cpp/kernels/SystemML.ptx               | 2506 ++++++++++++++----
 .../java/org/apache/sysml/hops/UnaryOp.java     |   10 +-
 .../instructions/GPUInstructionParser.java      |   19 +-
 .../instructions/gpu/GPUInstruction.java        |   72 +-
 .../gpu/MatrixBuiltinGPUInstruction.java        |   41 +-
 .../instructions/gpu/context/CSRPointer.java    |    2 +-
 .../instructions/gpu/context/GPUObject.java     |    4 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  226 +-
 10 files changed, 2577 insertions(+), 518 deletions(-)
----------------------------------------------------------------------
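
All thirteen kernels added to SystemML.cu below share the same 1-D
elementwise pattern: one thread per cell, guarded by a bounds check. A
minimal host-side sketch of that pattern follows; the matrix_sqrt body is
copied from this commit, while the helper name, the buffer names, and the
256-thread block size are illustrative assumptions (the real launches
happen on the Java side, in LibMatrixCUDA.java, presumably via JCuda).

    // Sketch only -- not part of this commit.
    #include <cuda_runtime.h>

    extern "C"
    __global__ void matrix_sqrt(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            C[index] = sqrt(A[index]);
        }
    }

    // One thread per element: ceil(size / threads) blocks of `threads` threads.
    void launch_matrix_sqrt(double *d_in, double *d_out, unsigned int size) {
        const unsigned int threads = 256;
        const unsigned int blocks = (size + threads - 1) / threads;
        matrix_sqrt<<<blocks, threads>>>(d_in, d_out, size);
    }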


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/Makefile
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/Makefile b/src/main/cpp/kernels/Makefile
new file mode 100644
index 0000000..0b003f3
--- /dev/null
+++ b/src/main/cpp/kernels/Makefile
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+NVCC=nvcc
+CUDAFLAGS= -ptx -arch=sm_30
+
+SystemML.ptx: SystemML.cu
+       $(NVCC) $(CUDAFLAGS) SystemML.cu
+
+all: SystemML.ptx
+       ;
+
+clean:
+       rm -f SystemML.ptx
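
Note on the build rule: nvcc's -ptx flag stops compilation after PTX
generation, so the build artifact is the checked-in SystemML.ptx below
rather than a device object file. -arch=sm_30 pins the virtual
architecture the PTX targets; since the CUDA driver JIT-compiles PTX at
load time, the same file should still run on GPUs newer than sm_30.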

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 2651e4a..5b4574e 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -656,3 +656,190 @@ __global__ void matrix_exp(double *A, double *C, unsigned int size) {
         C[index] = exp(A[index]);
     }
 }
+
+/**
+ * Do a sqrt over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sqrt(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = sqrt(A[index]);
+    }
+}
+
+/**
+ * Do a round over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_round(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = (double)llround(A[index]);
+    }
+}
+
+/**
+ * Do an abs over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_abs(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = (double)fabs(A[index]);
+    }
+}
+
+/**
+ * Do a log over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_log(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = log(A[index]);
+    }
+}
+
+/**
+ * Do a floor over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_floor(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = floor(A[index]);
+    }
+}
+
+/**
+ * Do a ceil over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_ceil(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = ceil(A[index]);
+    }
+}
+
+/**
+ * Do a sin over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sin(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = sin(A[index]);
+    }
+}
+
+/**
+ * Do a cos over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_cos(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = cos(A[index]);
+    }
+}
+
+/**
+ * Do a tan over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_tan(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = tan(A[index]);
+    }
+}
+
+/**
+ * Do an asin over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_asin(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = asin(A[index]);
+    }
+}
+
+/**
+ * Do an acos over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_acos(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = acos(A[index]);
+    }
+}
+
+/**
+ * Do an atan over all the elements of a matrix
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_atan(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        C[index] = atan(A[index]);
+    }
+}
+
+/**
+ * Do a sign over all the elements of a matrix
+ * Assigns -1, 0, or 1 depending on whether the element is negative, zero, or positive
+ * @param A the input matrix (of length = size)
+ * @param C the pre-allocated output matrix (of length = size)
+ * @param size the length of the input and output matrices
+ */
+extern "C"
+__global__ void matrix_sign(double *A, double *C, unsigned int size) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < size){
+        if (A[index] == 0.0) {
+            C[index] = 0.0;
+        } else {
+            C[index] = copysign(1.0, A[index]);
+        }
+    }
+}
\ No newline at end of file
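
Two of the new kernels warrant a semantics note. matrix_round goes through
llround, which rounds halfway cases away from zero (0.5 -> 1, -0.5 -> -1)
rather than to even as rint would, with the integral result cast back to
double. matrix_sign tests for zero explicitly, so sign(0.0) and sign(-0.0)
both produce 0, and copysign(1.0, x) then yields +/-1 for everything else,
including NaN inputs, which come out as +/-1 according to their sign bit.
A branchless variant is sketched below purely for comparison; it is not
what this commit ships, and it maps NaN to 0 instead:

    // Assumed alternative, not the commit's kernel: both comparisons are
    // false for NaN, so (x > 0) - (x < 0) yields 0 for NaN as well as zero.
    extern "C"
    __global__ void matrix_sign_branchless(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            C[index] = (double)((A[index] > 0.0) - (A[index] < 0.0));
        }
    }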

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 50002f5..3229581 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21112126
-// Cuda compilation tools, release 8.0, V8.0.43
+// Compiler Build ID: CL-21554848
+// Cuda compilation tools, release 8.0, V8.0.61
 // Based on LLVM 3.4svn
 //
 
@@ -11,6 +11,12 @@
 .address_size 64
 
        // .globl       copy_u2l_dense
+.func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
+(
+       .param .b64 __internal_trig_reduction_slowpathd_param_0,
+       .param .b64 __internal_trig_reduction_slowpathd_param_1
+)
+;
 .func  (.param .b64 func_retval0) __internal_accurate_pow
 (
        .param .b64 __internal_accurate_pow_param_0,
@@ -18,6 +24,8 @@
 )
 ;
 .extern .shared .align 8 .b8 sdata[];
+.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
+.const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
 
 .visible .entry copy_u2l_dense(
        .param .u64 copy_u2l_dense_param_0,
@@ -442,9 +450,9 @@ BB6_6:
        .param .u32 matrix_matrix_cellwise_op_param_7
 )
 {
-       .reg .pred      %p<54>;
-       .reg .b32       %r<55>;
-       .reg .f64       %fd<39>;
+       .reg .pred      %p<52>;
+       .reg .b32       %r<56>;
+       .reg .f64       %fd<40>;
        .reg .b64       %rd<15>;
 
 
@@ -467,40 +475,40 @@ BB6_6:
        setp.lt.s32     %p2, %r1, %r14;
        setp.lt.s32     %p3, %r2, %r10;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB7_53;
+       @!%p4 bra       BB7_55;
        bra.uni         BB7_1;
 
 BB7_1:
        mad.lo.s32      %r3, %r1, %r10, %r2;
        setp.eq.s32     %p5, %r11, 1;
-       mov.u32         %r53, %r1;
+       mov.u32         %r54, %r1;
        @%p5 bra        BB7_5;
 
        setp.ne.s32     %p6, %r11, 2;
-       mov.u32         %r54, %r3;
+       mov.u32         %r55, %r3;
        @%p6 bra        BB7_4;
 
-       mov.u32         %r54, %r2;
+       mov.u32         %r55, %r2;
 
 BB7_4:
-       mov.u32         %r48, %r54;
-       mov.u32         %r4, %r48;
-       mov.u32         %r53, %r4;
+       mov.u32         %r49, %r55;
+       mov.u32         %r4, %r49;
+       mov.u32         %r54, %r4;
 
 BB7_5:
-       mov.u32         %r5, %r53;
+       mov.u32         %r5, %r54;
        setp.eq.s32     %p7, %r12, 1;
-       mov.u32         %r51, %r1;
+       mov.u32         %r52, %r1;
        @%p7 bra        BB7_9;
 
        setp.ne.s32     %p8, %r12, 2;
-       mov.u32         %r52, %r3;
+       mov.u32         %r53, %r3;
        @%p8 bra        BB7_8;
 
-       mov.u32         %r52, %r2;
+       mov.u32         %r53, %r2;
 
 BB7_8:
-       mov.u32         %r51, %r52;
+       mov.u32         %r52, %r53;
 
 BB7_9:
        cvta.to.global.u64      %rd5, %rd3;
@@ -508,10 +516,10 @@ BB7_9:
        mul.wide.s32    %rd7, %r5, 8;
        add.s64         %rd8, %rd6, %rd7;
        ld.global.f64   %fd1, [%rd8];
-       mul.wide.s32    %rd9, %r51, 8;
+       mul.wide.s32    %rd9, %r52, 8;
        add.s64         %rd10, %rd5, %rd9;
        ld.global.f64   %fd2, [%rd10];
-       mov.f64         %fd38, 0d7FEFFFFFFFFFFFFF;
+       mov.f64         %fd39, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p9, %r13, 5;
        @%p9 bra        BB7_19;
 
@@ -519,15 +527,15 @@ BB7_9:
        @%p19 bra       BB7_15;
 
        setp.eq.s32     %p23, %r13, 0;
-       @%p23 bra       BB7_51;
+       @%p23 bra       BB7_53;
 
        setp.eq.s32     %p24, %r13, 1;
-       @%p24 bra       BB7_50;
+       @%p24 bra       BB7_52;
        bra.uni         BB7_13;
 
-BB7_50:
-       sub.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+BB7_52:
+       sub.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
 BB7_19:
        setp.gt.s32     %p10, %r13, 8;
@@ -542,12 +550,12 @@ BB7_19:
 
 BB7_33:
        setp.gt.f64     %p29, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
-       bra.uni         BB7_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
+       bra.uni         BB7_54;
 
 BB7_15:
        setp.eq.s32     %p20, %r13, 3;
-       @%p20 bra       BB7_49;
+       @%p20 bra       BB7_51;
 
        setp.eq.s32     %p21, %r13, 4;
        @%p21 bra       BB7_35;
@@ -583,7 +591,7 @@ BB7_35:
        param0, 
        param1
        );
-       ld.param.f64    %fd37, [retval0+0];
+       ld.param.f64    %fd38, [retval0+0];
        
        //{
        }// Callseq End 0
@@ -595,17 +603,17 @@ BB7_35:
 BB7_36:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r23}, %fd37;
+       mov.b64         {%temp, %r23}, %fd38;
        }
        xor.b32         %r24, %r23, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r25, %temp}, %fd37;
+       mov.b64         {%r25, %temp}, %fd38;
        }
-       mov.b64         %fd37, {%r25, %r24};
+       mov.b64         %fd38, {%r25, %r24};
 
 BB7_37:
-       mov.f64         %fd36, %fd37;
+       mov.f64         %fd37, %fd38;
        setp.eq.f64     %p34, %fd1, 0d0000000000000000;
        @%p34 bra       BB7_40;
        bra.uni         BB7_38;
@@ -616,7 +624,7 @@ BB7_40:
        setp.lt.s32     %p38, %r9, 0;
        selp.b32        %r28, %r27, %r26, %p38;
        mov.u32         %r29, 0;
-       mov.b64         %fd36, {%r29, %r28};
+       mov.b64         %fd37, {%r29, %r28};
        bra.uni         BB7_41;
 
 BB7_24:
@@ -629,8 +637,8 @@ BB7_24:
 
 BB7_32:
        setp.eq.f64     %p27, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB7_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
+       bra.uni         BB7_54;
 
 BB7_28:
        setp.eq.s32     %p12, %r13, 11;
@@ -638,67 +646,67 @@ BB7_28:
        bra.uni         BB7_29;
 
 BB7_31:
-       min.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+       min.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
-BB7_51:
-       add.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+BB7_53:
+       add.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
 BB7_13:
        setp.eq.s32     %p25, %r13, 2;
        @%p25 bra       BB7_14;
-       bra.uni         BB7_52;
+       bra.uni         BB7_54;
 
 BB7_14:
-       mul.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+       mul.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
 BB7_34:
        setp.le.f64     %p30, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
-       bra.uni         BB7_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
+       bra.uni         BB7_54;
 
 BB7_22:
        setp.eq.s32     %p18, %r13, 8;
        @%p18 bra       BB7_23;
-       bra.uni         BB7_52;
+       bra.uni         BB7_54;
 
 BB7_23:
        setp.ge.f64     %p28, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
-       bra.uni         BB7_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
+       bra.uni         BB7_54;
 
-BB7_49:
-       div.rn.f64      %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+BB7_51:
+       div.rn.f64      %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
 BB7_17:
        setp.eq.s32     %p22, %r13, 5;
        @%p22 bra       BB7_18;
-       bra.uni         BB7_52;
+       bra.uni         BB7_54;
 
 BB7_18:
        setp.lt.f64     %p31, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
-       bra.uni         BB7_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
+       bra.uni         BB7_54;
 
 BB7_26:
        setp.eq.s32     %p15, %r13, 10;
        @%p15 bra       BB7_27;
-       bra.uni         BB7_52;
+       bra.uni         BB7_54;
 
 BB7_27:
        setp.neu.f64    %p26, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB7_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
+       bra.uni         BB7_54;
 
 BB7_29:
        setp.ne.s32     %p13, %r13, 12;
-       @%p13 bra       BB7_52;
+       @%p13 bra       BB7_54;
 
-       max.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+       max.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
 BB7_38:
        setp.gt.s32     %p35, %r8, -1;
@@ -706,10 +714,10 @@ BB7_38:
 
        cvt.rzi.f64.f64 %fd29, %fd2;
        setp.neu.f64    %p36, %fd29, %fd2;
-       selp.f64        %fd36, 0dFFF8000000000000, %fd36, %p36;
+       selp.f64        %fd37, 0dFFF8000000000000, %fd37, %p36;
 
 BB7_41:
-       mov.f64         %fd17, %fd36;
+       mov.f64         %fd17, %fd37;
        add.f64         %fd18, %fd1, %fd2;
        {
        .reg .b32 %temp; 
@@ -717,77 +725,79 @@ BB7_41:
        }
        and.b32         %r31, %r30, 2146435072;
        setp.ne.s32     %p39, %r31, 2146435072;
-       mov.f64         %fd35, %fd17;
-       @%p39 bra       BB7_48;
+       mov.f64         %fd36, %fd17;
+       @%p39 bra       BB7_50;
 
        setp.gtu.f64    %p40, %fd11, 0d7FF0000000000000;
-       mov.f64         %fd35, %fd18;
-       @%p40 bra       BB7_48;
+       mov.f64         %fd36, %fd18;
+       @%p40 bra       BB7_50;
 
        abs.f64         %fd30, %fd2;
        setp.gtu.f64    %p41, %fd30, 0d7FF0000000000000;
-       mov.f64         %fd34, %fd18;
-       mov.f64         %fd35, %fd34;
-       @%p41 bra       BB7_48;
+       mov.f64         %fd35, %fd18;
+       mov.f64         %fd36, %fd35;
+       @%p41 bra       BB7_50;
+
+       and.b32         %r32, %r9, 2147483647;
+       setp.ne.s32     %p42, %r32, 2146435072;
+       @%p42 bra       BB7_46;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r32, %temp}, %fd2;
+       mov.b64         {%r33, %temp}, %fd2;
        }
-       and.b32         %r33, %r9, 2147483647;
-       setp.eq.s32     %p42, %r33, 2146435072;
-       setp.eq.s32     %p43, %r32, 0;
-       and.pred        %p44, %p42, %p43;
-       @%p44 bra       BB7_47;
-       bra.uni         BB7_45;
+       setp.eq.s32     %p43, %r33, 0;
+       @%p43 bra       BB7_49;
 
-BB7_47:
-       setp.gt.f64     %p48, %fd11, 0d3FF0000000000000;
-       selp.b32        %r41, 2146435072, 0, %p48;
-       xor.b32         %r42, %r41, 2146435072;
-       setp.lt.s32     %p49, %r9, 0;
-       selp.b32        %r43, %r42, %r41, %p49;
-       setp.eq.f64     %p50, %fd1, 0dBFF0000000000000;
-       selp.b32        %r44, 1072693248, %r43, %p50;
-       mov.u32         %r45, 0;
-       mov.b64         %fd35, {%r45, %r44};
-       bra.uni         BB7_48;
+BB7_46:
+       and.b32         %r34, %r8, 2147483647;
+       setp.ne.s32     %p44, %r34, 2146435072;
+       mov.f64         %fd33, %fd17;
+       mov.f64         %fd36, %fd33;
+       @%p44 bra       BB7_50;
 
-BB7_45:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r34, %temp}, %fd1;
+       mov.b64         {%r35, %temp}, %fd1;
        }
-       and.b32         %r35, %r8, 2147483647;
-       setp.eq.s32     %p45, %r35, 2146435072;
-       setp.eq.s32     %p46, %r34, 0;
-       and.pred        %p47, %p45, %p46;
-       mov.f64         %fd35, %fd17;
-       @!%p47 bra      BB7_48;
-       bra.uni         BB7_46;
+       setp.ne.s32     %p45, %r35, 0;
+       mov.f64         %fd36, %fd17;
+       @%p45 bra       BB7_50;
 
-BB7_46:
        shr.s32         %r36, %r9, 31;
        and.b32         %r37, %r36, -2146435072;
-       selp.b32        %r38, -1048576, 2146435072, %p1;
-       add.s32         %r39, %r38, %r37;
-       mov.u32         %r40, 0;
-       mov.b64         %fd35, {%r40, %r39};
+       add.s32         %r38, %r37, 2146435072;
+       or.b32          %r39, %r38, -2147483648;
+       selp.b32        %r40, %r39, %r38, %p1;
+       mov.u32         %r41, 0;
+       mov.b64         %fd36, {%r41, %r40};
+       bra.uni         BB7_50;
 
-BB7_48:
-       setp.eq.f64     %p51, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p52, %fd1, 0d3FF0000000000000;
-       or.pred         %p53, %p52, %p51;
-       selp.f64        %fd38, 0d3FF0000000000000, %fd35, %p53;
+BB7_49:
+       setp.gt.f64     %p46, %fd11, 0d3FF0000000000000;
+       selp.b32        %r42, 2146435072, 0, %p46;
+       xor.b32         %r43, %r42, 2146435072;
+       setp.lt.s32     %p47, %r9, 0;
+       selp.b32        %r44, %r43, %r42, %p47;
+       setp.eq.f64     %p48, %fd1, 0dBFF0000000000000;
+       selp.b32        %r45, 1072693248, %r44, %p48;
+       mov.u32         %r46, 0;
+       mov.b64         %fd36, {%r46, %r45};
 
-BB7_52:
+BB7_50:
+       setp.eq.f64     %p49, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p50, %fd1, 0d3FF0000000000000;
+       or.pred         %p51, %p50, %p49;
+       selp.f64        %fd39, 0d3FF0000000000000, %fd36, %p51;
+
+BB7_54:
        cvta.to.global.u64      %rd12, %rd4;
        mul.wide.s32    %rd13, %r3, 8;
        add.s64         %rd14, %rd12, %rd13;
-       st.global.f64   [%rd14], %fd38;
+       st.global.f64   [%rd14], %fd39;
        bar.sync        0;
 
-BB7_53:
+BB7_55:
        ret;
 }
 
@@ -801,9 +811,9 @@ BB7_53:
        .param .u32 matrix_scalar_op_param_5
 )
 {
-       .reg .pred      %p<95>;
-       .reg .b32       %r<62>;
-       .reg .f64       %fd<75>;
+       .reg .pred      %p<91>;
+       .reg .b32       %r<64>;
+       .reg .f64       %fd<77>;
        .reg .b64       %rd<12>;
 
 
@@ -818,7 +828,7 @@ BB7_53:
        mov.u32         %r11, %tid.x;
        mad.lo.s32      %r1, %r10, %r9, %r11;
        setp.ge.s32     %p3, %r1, %r8;
-       @%p3 bra        BB8_90;
+       @%p3 bra        BB8_94;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -827,9 +837,9 @@ BB7_53:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB8_46;
+       @%p4 bra        BB8_48;
 
-       mov.f64         %fd66, 0d7FEFFFFFFFFFFFFF;
+       mov.f64         %fd67, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p5, %r6, 5;
        @%p5 bra        BB8_12;
 
@@ -837,34 +847,34 @@ BB7_53:
        @%p15 bra       BB8_8;
 
        setp.eq.s32     %p19, %r6, 0;
-       @%p19 bra       BB8_44;
+       @%p19 bra       BB8_46;
 
        setp.eq.s32     %p20, %r6, 1;
-       @%p20 bra       BB8_43;
+       @%p20 bra       BB8_45;
        bra.uni         BB8_6;
 
-BB8_43:
-       sub.f64         %fd66, %fd52, %fd1;
-       bra.uni         BB8_45;
+BB8_45:
+       sub.f64         %fd67, %fd52, %fd1;
+       bra.uni         BB8_47;
 
-BB8_46:
-       mov.f64         %fd74, 0d7FEFFFFFFFFFFFFF;
-       setp.gt.s32     %p50, %r6, 5;
-       @%p50 bra       BB8_56;
+BB8_48:
+       mov.f64         %fd76, 0d7FEFFFFFFFFFFFFF;
+       setp.gt.s32     %p48, %r6, 5;
+       @%p48 bra       BB8_58;
 
-       setp.gt.s32     %p60, %r6, 2;
-       @%p60 bra       BB8_52;
+       setp.gt.s32     %p58, %r6, 2;
+       @%p58 bra       BB8_54;
 
-       setp.eq.s32     %p64, %r6, 0;
-       @%p64 bra       BB8_88;
+       setp.eq.s32     %p62, %r6, 0;
+       @%p62 bra       BB8_92;
 
-       setp.eq.s32     %p65, %r6, 1;
-       @%p65 bra       BB8_87;
-       bra.uni         BB8_50;
+       setp.eq.s32     %p63, %r6, 1;
+       @%p63 bra       BB8_91;
+       bra.uni         BB8_52;
 
-BB8_87:
-       sub.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB8_89;
+BB8_91:
+       sub.f64         %fd76, %fd1, %fd52;
+       bra.uni         BB8_93;
 
 BB8_12:
        setp.gt.s32     %p6, %r6, 8;
@@ -879,28 +889,28 @@ BB8_12:
 
 BB8_26:
        setp.lt.f64     %p25, %fd1, %fd52;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
-       bra.uni         BB8_45;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
+       bra.uni         BB8_47;
 
-BB8_56:
-       setp.gt.s32     %p51, %r6, 8;
-       @%p51 bra       BB8_61;
+BB8_58:
+       setp.gt.s32     %p49, %r6, 8;
+       @%p49 bra       BB8_63;
 
-       setp.eq.s32     %p57, %r6, 6;
-       @%p57 bra       BB8_71;
+       setp.eq.s32     %p55, %r6, 6;
+       @%p55 bra       BB8_73;
 
-       setp.eq.s32     %p58, %r6, 7;
-       @%p58 bra       BB8_70;
-       bra.uni         BB8_59;
+       setp.eq.s32     %p56, %r6, 7;
+       @%p56 bra       BB8_72;
+       bra.uni         BB8_61;
 
-BB8_70:
-       setp.gt.f64     %p70, %fd1, %fd52;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70;
-       bra.uni         BB8_89;
+BB8_72:
+       setp.gt.f64     %p68, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
+       bra.uni         BB8_93;
 
 BB8_8:
        setp.eq.s32     %p16, %r6, 3;
-       @%p16 bra       BB8_42;
+       @%p16 bra       BB8_44;
 
        setp.eq.s32     %p17, %r6, 4;
        @%p17 bra       BB8_28;
@@ -936,7 +946,7 @@ BB8_28:
        param0, 
        param1
        );
-       ld.param.f64    %fd65, [retval0+0];
+       ld.param.f64    %fd66, [retval0+0];
        
        //{
        }// Callseq End 1
@@ -948,17 +958,17 @@ BB8_28:
 BB8_29:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r14}, %fd65;
+       mov.b64         {%temp, %r14}, %fd66;
        }
        xor.b32         %r15, %r14, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r16, %temp}, %fd65;
+       mov.b64         {%r16, %temp}, %fd66;
        }
-       mov.b64         %fd65, {%r16, %r15};
+       mov.b64         %fd66, {%r16, %r15};
 
 BB8_30:
-       mov.f64         %fd64, %fd65;
+       mov.f64         %fd65, %fd66;
        setp.eq.f64     %p30, %fd52, 0d0000000000000000;
        @%p30 bra       BB8_33;
        bra.uni         BB8_31;
@@ -969,7 +979,7 @@ BB8_33:
        setp.lt.s32     %p34, %r3, 0;
        selp.b32        %r19, %r18, %r17, %p34;
        mov.u32         %r20, 0;
-       mov.b64         %fd64, {%r20, %r19};
+       mov.b64         %fd65, {%r20, %r19};
        bra.uni         BB8_34;
 
 BB8_17:
@@ -982,18 +992,18 @@ BB8_17:
 
 BB8_25:
        setp.eq.f64     %p23, %fd1, %fd52;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
-       bra.uni         BB8_45;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
+       bra.uni         BB8_47;
 
-BB8_52:
-       setp.eq.s32     %p61, %r6, 3;
-       @%p61 bra       BB8_86;
+BB8_54:
+       setp.eq.s32     %p59, %r6, 3;
+       @%p59 bra       BB8_90;
 
-       setp.eq.s32     %p62, %r6, 4;
-       @%p62 bra       BB8_72;
-       bra.uni         BB8_54;
+       setp.eq.s32     %p60, %r6, 4;
+       @%p60 bra       BB8_74;
+       bra.uni         BB8_56;
 
-BB8_72:
+BB8_74:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -1002,11 +1012,11 @@ BB8_72:
        .reg .b32 %temp; 
        mov.b64         {%temp, %r5}, %fd52;
        }
-       bfe.u32         %r37, %r5, 20, 11;
-       add.s32         %r38, %r37, -1012;
+       bfe.u32         %r38, %r5, 20, 11;
+       add.s32         %r39, %r38, -1012;
        mov.b64          %rd11, %fd52;
-       shl.b64         %rd3, %rd11, %r38;
-       setp.eq.s64     %p73, %rd3, -9223372036854775808;
+       shl.b64         %rd3, %rd11, %r39;
+       setp.eq.s64     %p71, %rd3, -9223372036854775808;
        abs.f64         %fd35, %fd1;
        // Callseq Start 2
        {
@@ -1023,54 +1033,54 @@ BB8_72:
        param0, 
        param1
        );
-       ld.param.f64    %fd73, [retval0+0];
+       ld.param.f64    %fd75, [retval0+0];
        
        //{
        }// Callseq End 2
-       setp.lt.s32     %p74, %r4, 0;
-       and.pred        %p2, %p74, %p73;
-       @!%p2 bra       BB8_74;
-       bra.uni         BB8_73;
+       setp.lt.s32     %p72, %r4, 0;
+       and.pred        %p2, %p72, %p71;
+       @!%p2 bra       BB8_76;
+       bra.uni         BB8_75;
 
-BB8_73:
+BB8_75:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r39}, %fd73;
+       mov.b64         {%temp, %r40}, %fd75;
        }
-       xor.b32         %r40, %r39, -2147483648;
+       xor.b32         %r41, %r40, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r41, %temp}, %fd73;
+       mov.b64         {%r42, %temp}, %fd75;
        }
-       mov.b64         %fd73, {%r41, %r40};
-
-BB8_74:
-       mov.f64         %fd72, %fd73;
-       setp.eq.f64     %p75, %fd1, 0d0000000000000000;
-       @%p75 bra       BB8_77;
-       bra.uni         BB8_75;
+       mov.b64         %fd75, {%r42, %r41};
 
-BB8_77:
-       selp.b32        %r42, %r4, 0, %p73;
-       or.b32          %r43, %r42, 2146435072;
-       setp.lt.s32     %p79, %r5, 0;
-       selp.b32        %r44, %r43, %r42, %p79;
-       mov.u32         %r45, 0;
-       mov.b64         %fd72, {%r45, %r44};
-       bra.uni         BB8_78;
+BB8_76:
+       mov.f64         %fd74, %fd75;
+       setp.eq.f64     %p73, %fd1, 0d0000000000000000;
+       @%p73 bra       BB8_79;
+       bra.uni         BB8_77;
+
+BB8_79:
+       selp.b32        %r43, %r4, 0, %p71;
+       or.b32          %r44, %r43, 2146435072;
+       setp.lt.s32     %p77, %r5, 0;
+       selp.b32        %r45, %r44, %r43, %p77;
+       mov.u32         %r46, 0;
+       mov.b64         %fd74, {%r46, %r45};
+       bra.uni         BB8_80;
 
-BB8_61:
-       setp.gt.s32     %p52, %r6, 10;
-       @%p52 bra       BB8_65;
+BB8_63:
+       setp.gt.s32     %p50, %r6, 10;
+       @%p50 bra       BB8_67;
 
-       setp.eq.s32     %p55, %r6, 9;
-       @%p55 bra       BB8_69;
-       bra.uni         BB8_63;
+       setp.eq.s32     %p53, %r6, 9;
+       @%p53 bra       BB8_71;
+       bra.uni         BB8_65;
 
-BB8_69:
-       setp.eq.f64     %p68, %fd1, %fd52;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68;
-       bra.uni         BB8_89;
+BB8_71:
+       setp.eq.f64     %p66, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
+       bra.uni         BB8_93;
 
 BB8_21:
        setp.eq.s32     %p8, %r6, 11;
@@ -1078,135 +1088,135 @@ BB8_21:
        bra.uni         BB8_22;
 
 BB8_24:
-       min.f64         %fd66, %fd52, %fd1;
-       bra.uni         BB8_45;
+       min.f64         %fd67, %fd52, %fd1;
+       bra.uni         BB8_47;
 
-BB8_44:
-       add.f64         %fd66, %fd1, %fd52;
-       bra.uni         BB8_45;
+BB8_46:
+       add.f64         %fd67, %fd1, %fd52;
+       bra.uni         BB8_47;
 
 BB8_6:
        setp.eq.s32     %p21, %r6, 2;
        @%p21 bra       BB8_7;
-       bra.uni         BB8_45;
+       bra.uni         BB8_47;
 
 BB8_7:
-       mul.f64         %fd66, %fd1, %fd52;
-       bra.uni         BB8_45;
+       mul.f64         %fd67, %fd1, %fd52;
+       bra.uni         BB8_47;
 
 BB8_27:
        setp.ge.f64     %p26, %fd1, %fd52;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB8_45;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
+       bra.uni         BB8_47;
 
 BB8_15:
        setp.eq.s32     %p14, %r6, 8;
        @%p14 bra       BB8_16;
-       bra.uni         BB8_45;
+       bra.uni         BB8_47;
 
 BB8_16:
        setp.le.f64     %p24, %fd1, %fd52;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
-       bra.uni         BB8_45;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
+       bra.uni         BB8_47;
 
-BB8_42:
-       div.rn.f64      %fd66, %fd52, %fd1;
-       bra.uni         BB8_45;
+BB8_44:
+       div.rn.f64      %fd67, %fd52, %fd1;
+       bra.uni         BB8_47;
 
 BB8_10:
        setp.eq.s32     %p18, %r6, 5;
        @%p18 bra       BB8_11;
-       bra.uni         BB8_45;
+       bra.uni         BB8_47;
 
 BB8_11:
        setp.gt.f64     %p27, %fd1, %fd52;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB8_45;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p27;
+       bra.uni         BB8_47;
 
-BB8_65:
-       setp.eq.s32     %p53, %r6, 11;
-       @%p53 bra       BB8_68;
-       bra.uni         BB8_66;
+BB8_67:
+       setp.eq.s32     %p51, %r6, 11;
+       @%p51 bra       BB8_70;
+       bra.uni         BB8_68;
 
-BB8_68:
-       min.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB8_89;
+BB8_70:
+       min.f64         %fd76, %fd1, %fd52;
+       bra.uni         BB8_93;
 
 BB8_19:
        setp.eq.s32     %p11, %r6, 10;
        @%p11 bra       BB8_20;
-       bra.uni         BB8_45;
+       bra.uni         BB8_47;
 
 BB8_20:
        setp.neu.f64    %p22, %fd1, %fd52;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
-       bra.uni         BB8_45;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
+       bra.uni         BB8_47;
 
 BB8_22:
        setp.ne.s32     %p9, %r6, 12;
-       @%p9 bra        BB8_45;
-
-       max.f64         %fd66, %fd52, %fd1;
-       bra.uni         BB8_45;
+       @%p9 bra        BB8_47;
 
-BB8_88:
-       add.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB8_89;
+       max.f64         %fd67, %fd52, %fd1;
+       bra.uni         BB8_47;
 
-BB8_50:
-       setp.eq.s32     %p66, %r6, 2;
-       @%p66 bra       BB8_51;
-       bra.uni         BB8_89;
+BB8_92:
+       add.f64         %fd76, %fd1, %fd52;
+       bra.uni         BB8_93;
 
-BB8_51:
-       mul.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB8_89;
+BB8_52:
+       setp.eq.s32     %p64, %r6, 2;
+       @%p64 bra       BB8_53;
+       bra.uni         BB8_93;
 
-BB8_71:
-       setp.le.f64     %p71, %fd1, %fd52;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p71;
-       bra.uni         BB8_89;
+BB8_53:
+       mul.f64         %fd76, %fd1, %fd52;
+       bra.uni         BB8_93;
 
-BB8_59:
-       setp.eq.s32     %p59, %r6, 8;
-       @%p59 bra       BB8_60;
-       bra.uni         BB8_89;
+BB8_73:
+       setp.le.f64     %p69, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p69;
+       bra.uni         BB8_93;
 
-BB8_60:
-       setp.ge.f64     %p69, %fd1, %fd52;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69;
-       bra.uni         BB8_89;
+BB8_61:
+       setp.eq.s32     %p57, %r6, 8;
+       @%p57 bra       BB8_62;
+       bra.uni         BB8_93;
 
-BB8_86:
-       div.rn.f64      %fd74, %fd1, %fd52;
-       bra.uni         BB8_89;
+BB8_62:
+       setp.ge.f64     %p67, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
+       bra.uni         BB8_93;
 
-BB8_54:
-       setp.eq.s32     %p63, %r6, 5;
-       @%p63 bra       BB8_55;
-       bra.uni         BB8_89;
+BB8_90:
+       div.rn.f64      %fd76, %fd1, %fd52;
+       bra.uni         BB8_93;
 
-BB8_55:
-       setp.lt.f64     %p72, %fd1, %fd52;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p72;
-       bra.uni         BB8_89;
+BB8_56:
+       setp.eq.s32     %p61, %r6, 5;
+       @%p61 bra       BB8_57;
+       bra.uni         BB8_93;
 
-BB8_63:
-       setp.eq.s32     %p56, %r6, 10;
-       @%p56 bra       BB8_64;
-       bra.uni         BB8_89;
+BB8_57:
+       setp.lt.f64     %p70, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p70;
+       bra.uni         BB8_93;
 
-BB8_64:
-       setp.neu.f64    %p67, %fd1, %fd52;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67;
-       bra.uni         BB8_89;
+BB8_65:
+       setp.eq.s32     %p54, %r6, 10;
+       @%p54 bra       BB8_66;
+       bra.uni         BB8_93;
 
 BB8_66:
-       setp.ne.s32     %p54, %r6, 12;
-       @%p54 bra       BB8_89;
+       setp.neu.f64    %p65, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
+       bra.uni         BB8_93;
 
-       max.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB8_89;
+BB8_68:
+       setp.ne.s32     %p52, %r6, 12;
+       @%p52 bra       BB8_93;
+
+       max.f64         %fd76, %fd1, %fd52;
+       bra.uni         BB8_93;
 
 BB8_31:
        setp.gt.s32     %p31, %r2, -1;
@@ -1214,10 +1224,10 @@ BB8_31:
 
        cvt.rzi.f64.f64 %fd54, %fd1;
        setp.neu.f64    %p32, %fd54, %fd1;
-       selp.f64        %fd64, 0dFFF8000000000000, %fd64, %p32;
+       selp.f64        %fd65, 0dFFF8000000000000, %fd65, %p32;
 
 BB8_34:
-       mov.f64         %fd16, %fd64;
+       mov.f64         %fd16, %fd65;
        add.f64         %fd17, %fd1, %fd52;
        {
        .reg .b32 %temp; 
@@ -1225,157 +1235,161 @@ BB8_34:
        }
        and.b32         %r22, %r21, 2146435072;
        setp.ne.s32     %p35, %r22, 2146435072;
-       mov.f64         %fd63, %fd16;
-       @%p35 bra       BB8_41;
+       mov.f64         %fd64, %fd16;
+       @%p35 bra       BB8_43;
 
        setp.gtu.f64    %p36, %fd10, 0d7FF0000000000000;
-       mov.f64         %fd63, %fd17;
-       @%p36 bra       BB8_41;
+       mov.f64         %fd64, %fd17;
+       @%p36 bra       BB8_43;
 
        abs.f64         %fd55, %fd1;
        setp.gtu.f64    %p37, %fd55, 0d7FF0000000000000;
-       mov.f64         %fd62, %fd17;
-       mov.f64         %fd63, %fd62;
-       @%p37 bra       BB8_41;
+       mov.f64         %fd63, %fd17;
+       mov.f64         %fd64, %fd63;
+       @%p37 bra       BB8_43;
+
+       and.b32         %r23, %r3, 2147483647;
+       setp.ne.s32     %p38, %r23, 2146435072;
+       @%p38 bra       BB8_39;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r23, %temp}, %fd1;
+       mov.b64         {%r24, %temp}, %fd1;
        }
-       and.b32         %r24, %r3, 2147483647;
-       setp.eq.s32     %p38, %r24, 2146435072;
-       setp.eq.s32     %p39, %r23, 0;
-       and.pred        %p40, %p38, %p39;
-       @%p40 bra       BB8_40;
-       bra.uni         BB8_38;
+       setp.eq.s32     %p39, %r24, 0;
+       @%p39 bra       BB8_42;
 
-BB8_40:
-       setp.gt.f64     %p44, %fd10, 0d3FF0000000000000;
-       selp.b32        %r32, 2146435072, 0, %p44;
-       xor.b32         %r33, %r32, 2146435072;
-       setp.lt.s32     %p45, %r3, 0;
-       selp.b32        %r34, %r33, %r32, %p45;
-       setp.eq.f64     %p46, %fd52, 0dBFF0000000000000;
-       selp.b32        %r35, 1072693248, %r34, %p46;
-       mov.u32         %r36, 0;
-       mov.b64         %fd63, {%r36, %r35};
-       bra.uni         BB8_41;
+BB8_39:
+       and.b32         %r25, %r2, 2147483647;
+       setp.ne.s32     %p40, %r25, 2146435072;
+       mov.f64         %fd61, %fd16;
+       mov.f64         %fd64, %fd61;
+       @%p40 bra       BB8_43;
 
-BB8_75:
-       setp.gt.s32     %p76, %r4, -1;
-       @%p76 bra       BB8_78;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r26, %temp}, %fd52;
+       }
+       setp.ne.s32     %p41, %r26, 0;
+       mov.f64         %fd64, %fd16;
+       @%p41 bra       BB8_43;
+
+       shr.s32         %r27, %r3, 31;
+       and.b32         %r28, %r27, -2146435072;
+       add.s32         %r29, %r28, 2146435072;
+       or.b32          %r30, %r29, -2147483648;
+       selp.b32        %r31, %r30, %r29, %p1;
+       mov.u32         %r32, 0;
+       mov.b64         %fd64, {%r32, %r31};
+       bra.uni         BB8_43;
+
+BB8_77:
+       setp.gt.s32     %p74, %r4, -1;
+       @%p74 bra       BB8_80;
 
        cvt.rzi.f64.f64 %fd57, %fd52;
-       setp.neu.f64    %p77, %fd57, %fd52;
-       selp.f64        %fd72, 0dFFF8000000000000, %fd72, %p77;
+       setp.neu.f64    %p75, %fd57, %fd52;
+       selp.f64        %fd74, 0dFFF8000000000000, %fd74, %p75;
 
-BB8_78:
-       mov.f64         %fd41, %fd72;
+BB8_80:
+       mov.f64         %fd41, %fd74;
        add.f64         %fd42, %fd1, %fd52;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r46}, %fd42;
+       mov.b64         {%temp, %r47}, %fd42;
        }
-       and.b32         %r47, %r46, 2146435072;
-       setp.ne.s32     %p80, %r47, 2146435072;
-       mov.f64         %fd71, %fd41;
-       @%p80 bra       BB8_85;
+       and.b32         %r48, %r47, 2146435072;
+       setp.ne.s32     %p78, %r48, 2146435072;
+       mov.f64         %fd73, %fd41;
+       @%p78 bra       BB8_89;
 
-       setp.gtu.f64    %p81, %fd35, 0d7FF0000000000000;
-       mov.f64         %fd71, %fd42;
-       @%p81 bra       BB8_85;
+       setp.gtu.f64    %p79, %fd35, 0d7FF0000000000000;
+       mov.f64         %fd73, %fd42;
+       @%p79 bra       BB8_89;
 
        abs.f64         %fd58, %fd52;
-       setp.gtu.f64    %p82, %fd58, 0d7FF0000000000000;
-       mov.f64         %fd70, %fd42;
-       mov.f64         %fd71, %fd70;
-       @%p82 bra       BB8_85;
+       setp.gtu.f64    %p80, %fd58, 0d7FF0000000000000;
+       mov.f64         %fd72, %fd42;
+       mov.f64         %fd73, %fd72;
+       @%p80 bra       BB8_89;
+
+       and.b32         %r49, %r5, 2147483647;
+       setp.ne.s32     %p81, %r49, 2146435072;
+       @%p81 bra       BB8_85;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r48, %temp}, %fd52;
+       mov.b64         {%r50, %temp}, %fd52;
        }
-       and.b32         %r49, %r5, 2147483647;
-       setp.eq.s32     %p83, %r49, 2146435072;
-       setp.eq.s32     %p84, %r48, 0;
-       and.pred        %p85, %p83, %p84;
-       @%p85 bra       BB8_84;
-       bra.uni         BB8_82;
-
-BB8_84:
-       setp.gt.f64     %p89, %fd35, 0d3FF0000000000000;
-       selp.b32        %r57, 2146435072, 0, %p89;
-       xor.b32         %r58, %r57, 2146435072;
-       setp.lt.s32     %p90, %r5, 0;
-       selp.b32        %r59, %r58, %r57, %p90;
-       setp.eq.f64     %p91, %fd1, 0dBFF0000000000000;
-       selp.b32        %r60, 1072693248, %r59, %p91;
-       mov.u32         %r61, 0;
-       mov.b64         %fd71, {%r61, %r60};
-       bra.uni         BB8_85;
-
-BB8_38:
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%r25, %temp}, %fd52;
-       }
-       and.b32         %r26, %r2, 2147483647;
-       setp.eq.s32     %p41, %r26, 2146435072;
-       setp.eq.s32     %p42, %r25, 0;
-       and.pred        %p43, %p41, %p42;
-       mov.f64         %fd63, %fd16;
-       @!%p43 bra      BB8_41;
-       bra.uni         BB8_39;
-
-BB8_39:
-       shr.s32         %r27, %r3, 31;
-       and.b32         %r28, %r27, -2146435072;
-       selp.b32        %r29, -1048576, 2146435072, %p1;
-       add.s32         %r30, %r29, %r28;
-       mov.u32         %r31, 0;
-       mov.b64         %fd63, {%r31, %r30};
+       setp.eq.s32     %p82, %r50, 0;
+       @%p82 bra       BB8_88;
 
-BB8_41:
-       setp.eq.f64     %p47, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p48, %fd52, 0d3FF0000000000000;
-       or.pred         %p49, %p48, %p47;
-       selp.f64        %fd66, 0d3FF0000000000000, %fd63, %p49;
-
-BB8_45:
-       st.global.f64   [%rd1], %fd66;
-       bra.uni         BB8_90;
+BB8_85:
+       and.b32         %r51, %r4, 2147483647;
+       setp.ne.s32     %p83, %r51, 2146435072;
+       mov.f64         %fd70, %fd41;
+       mov.f64         %fd73, %fd70;
+       @%p83 bra       BB8_89;
 
-BB8_82:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r50, %temp}, %fd1;
+       mov.b64         {%r52, %temp}, %fd1;
        }
-       and.b32         %r51, %r4, 2147483647;
-       setp.eq.s32     %p86, %r51, 2146435072;
-       setp.eq.s32     %p87, %r50, 0;
-       and.pred        %p88, %p86, %p87;
-       mov.f64         %fd71, %fd41;
-       @!%p88 bra      BB8_85;
-       bra.uni         BB8_83;
-
-BB8_83:
-       shr.s32         %r52, %r5, 31;
-       and.b32         %r53, %r52, -2146435072;
-       selp.b32        %r54, -1048576, 2146435072, %p2;
-       add.s32         %r55, %r54, %r53;
-       mov.u32         %r56, 0;
-       mov.b64         %fd71, {%r56, %r55};
+       setp.ne.s32     %p84, %r52, 0;
+       mov.f64         %fd73, %fd41;
+       @%p84 bra       BB8_89;
+
+       shr.s32         %r53, %r5, 31;
+       and.b32         %r54, %r53, -2146435072;
+       add.s32         %r55, %r54, 2146435072;
+       or.b32          %r56, %r55, -2147483648;
+       selp.b32        %r57, %r56, %r55, %p2;
+       mov.u32         %r58, 0;
+       mov.b64         %fd73, {%r58, %r57};
+       bra.uni         BB8_89;
 
-BB8_85:
-       setp.eq.f64     %p92, %fd52, 0d0000000000000000;
-       setp.eq.f64     %p93, %fd1, 0d3FF0000000000000;
-       or.pred         %p94, %p93, %p92;
-       selp.f64        %fd74, 0d3FF0000000000000, %fd71, %p94;
+BB8_42:
+       setp.gt.f64     %p42, %fd10, 0d3FF0000000000000;
+       selp.b32        %r33, 2146435072, 0, %p42;
+       xor.b32         %r34, %r33, 2146435072;
+       setp.lt.s32     %p43, %r3, 0;
+       selp.b32        %r35, %r34, %r33, %p43;
+       setp.eq.f64     %p44, %fd52, 0dBFF0000000000000;
+       selp.b32        %r36, 1072693248, %r35, %p44;
+       mov.u32         %r37, 0;
+       mov.b64         %fd64, {%r37, %r36};
+
+BB8_43:
+       setp.eq.f64     %p45, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p46, %fd52, 0d3FF0000000000000;
+       or.pred         %p47, %p46, %p45;
+       selp.f64        %fd67, 0d3FF0000000000000, %fd64, %p47;
+
+BB8_47:
+       st.global.f64   [%rd1], %fd67;
+       bra.uni         BB8_94;
+
+BB8_88:
+       setp.gt.f64     %p85, %fd35, 0d3FF0000000000000;
+       selp.b32        %r59, 2146435072, 0, %p85;
+       xor.b32         %r60, %r59, 2146435072;
+       setp.lt.s32     %p86, %r5, 0;
+       selp.b32        %r61, %r60, %r59, %p86;
+       setp.eq.f64     %p87, %fd1, 0dBFF0000000000000;
+       selp.b32        %r62, 1072693248, %r61, %p87;
+       mov.u32         %r63, 0;
+       mov.b64         %fd73, {%r63, %r62};
 
 BB8_89:
-       st.global.f64   [%rd1], %fd74;
+       setp.eq.f64     %p88, %fd52, 0d0000000000000000;
+       setp.eq.f64     %p89, %fd1, 0d3FF0000000000000;
+       or.pred         %p90, %p89, %p88;
+       selp.f64        %fd76, 0d3FF0000000000000, %fd73, %p90;
 
-BB8_90:
+BB8_93:
+       st.global.f64   [%rd1], %fd76;
+
+BB8_94:
        bar.sync        0;
        ret;
 }
@@ -2928,7 +2942,7 @@ BB19_35:
        .reg .pred      %p<20>;
        .reg .b32       %r<39>;
        .reg .f64       %fd<76>;
-       .reg .b64       %rd<42>;
+       .reg .b64       %rd<43>;
 
 
        ld.param.u64    %rd1, [reduce_row_mean_param_0];
@@ -3095,12 +3109,13 @@ BB20_33:
        @%p19 bra       BB20_35;
 
        ld.shared.f64   %fd40, [sdata];
-       cvt.rn.f64.s32  %fd41, %r4;
+       cvt.u64.u32     %rd39, %r4;
+       cvt.rn.f64.s64  %fd41, %rd39;
        div.rn.f64      %fd42, %fd40, %fd41;
-       cvta.to.global.u64      %rd39, %rd2;
-       mul.wide.u32    %rd40, %r6, 8;
-       add.s64         %rd41, %rd39, %rd40;
-       st.global.f64   [%rd41], %fd42;
+       cvta.to.global.u64      %rd40, %rd2;
+       mul.wide.u32    %rd41, %r6, 8;
+       add.s64         %rd42, %rd40, %rd41;
+       st.global.f64   [%rd42], %fd42;
 
 BB20_35:
        ret;
@@ -3117,7 +3132,7 @@ BB20_35:
        .reg .pred      %p<4>;
        .reg .b32       %r<11>;
        .reg .f64       %fd<12>;
-       .reg .b64       %rd<9>;
+       .reg .b64       %rd<10>;
 
 
        ld.param.u64    %rd2, [reduce_col_mean_param_0];
@@ -3154,11 +3169,12 @@ BB21_3:
 
 BB21_4:
        cvta.to.global.u64      %rd6, %rd3;
-       cvt.rn.f64.s32  %fd7, %r5;
+       cvt.u64.u32     %rd7, %r5;
+       cvt.rn.f64.s64  %fd7, %rd7;
        div.rn.f64      %fd8, %fd10, %fd7;
-       mul.wide.u32    %rd7, %r1, 8;
-       add.s64         %rd8, %rd6, %rd7;
-       st.global.f64   [%rd8], %fd8;
+       mul.wide.u32    %rd8, %r1, 8;
+       add.s64         %rd9, %rd6, %rd8;
+       st.global.f64   [%rd9], %fd8;
 
 BB21_5:
        ret;
@@ -3277,82 +3293,1638 @@ BB22_5:
        ret;
 }
 
-.func  (.param .b64 func_retval0) __internal_accurate_pow(
-       .param .b64 __internal_accurate_pow_param_0,
-       .param .b64 __internal_accurate_pow_param_1
+       // .globl       matrix_sqrt
+.visible .entry matrix_sqrt(
+       .param .u64 matrix_sqrt_param_0,
+       .param .u64 matrix_sqrt_param_1,
+       .param .u32 matrix_sqrt_param_2
 )
 {
-       .reg .pred      %p<10>;
-       .reg .f32       %f<3>;
-       .reg .b32       %r<52>;
-       .reg .f64       %fd<134>;
+       .reg .pred      %p<2>;
+       .reg .b32       %r<6>;
+       .reg .f64       %fd<3>;
+       .reg .b64       %rd<8>;
 
 
-       ld.param.f64    %fd12, [__internal_accurate_pow_param_0];
-       ld.param.f64    %fd13, [__internal_accurate_pow_param_1];
+       ld.param.u64    %rd1, [matrix_sqrt_param_0];
+       ld.param.u64    %rd2, [matrix_sqrt_param_1];
+       ld.param.u32    %r2, [matrix_sqrt_param_2];
+       mov.u32         %r3, %ctaid.x;
+       mov.u32         %r4, %ntid.x;
+       mov.u32         %r5, %tid.x;
+       mad.lo.s32      %r1, %r4, %r3, %r5;
+       setp.ge.u32     %p1, %r1, %r2;
+       @%p1 bra        BB23_2;
+
+       cvta.to.global.u64      %rd3, %rd1;
+       mul.wide.s32    %rd4, %r1, 8;
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f64   %fd1, [%rd5];
+       sqrt.rn.f64     %fd2, %fd1;
+       cvta.to.global.u64      %rd6, %rd2;
+       add.s64         %rd7, %rd6, %rd4;
+       st.global.f64   [%rd7], %fd2;
+
+BB23_2:
+       ret;
+}
+
+       // .globl       matrix_round
+.visible .entry matrix_round(
+       .param .u64 matrix_round_param_0,
+       .param .u64 matrix_round_param_1,
+       .param .u32 matrix_round_param_2
+)
+{
+       .reg .pred      %p<4>;
+       .reg .b32       %r<11>;
+       .reg .f64       %fd<10>;
+       .reg .b64       %rd<11>;
+
+
+       ld.param.u64    %rd2, [matrix_round_param_0];
+       ld.param.u64    %rd3, [matrix_round_param_1];
+       ld.param.u32    %r2, [matrix_round_param_2];
+       mov.u32         %r3, %ctaid.x;
+       mov.u32         %r4, %ntid.x;
+       mov.u32         %r5, %tid.x;
+       mad.lo.s32      %r1, %r4, %r3, %r5;
+       setp.ge.u32     %p1, %r1, %r2;
+       @%p1 bra        BB24_4;
+
+       cvta.to.global.u64      %rd4, %rd2;
+       cvt.s64.s32     %rd1, %r1;
+       mul.wide.s32    %rd5, %r1, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f64   %fd9, [%rd6];
+       abs.f64         %fd2, %fd9;
+       setp.ge.f64     %p2, %fd2, 0d4330000000000000;
+       @%p2 bra        BB24_3;
+
+       add.f64         %fd5, %fd2, 0d3FE0000000000000;
+       cvt.rzi.f64.f64 %fd6, %fd5;
+       setp.lt.f64     %p3, %fd2, 0d3FE0000000000000;
+       selp.f64        %fd7, 0d0000000000000000, %fd6, %p3;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r49}, %fd12;
+       mov.b64         {%r6, %temp}, %fd7;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%r48, %temp}, %fd12;
+       mov.b64         {%temp, %r7}, %fd7;
        }
-       shr.u32         %r50, %r49, 20;
-       setp.ne.s32     %p1, %r50, 0;
-       @%p1 bra        BB23_2;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r8}, %fd9;
+       }
+       and.b32         %r9, %r8, -2147483648;
+       or.b32          %r10, %r7, %r9;
+       mov.b64         %fd9, {%r6, %r10};
 
-       mul.f64         %fd14, %fd12, 0d4350000000000000;
+BB24_3:
+       cvta.to.global.u64      %rd7, %rd3;
+       cvt.rzi.s64.f64 %rd8, %fd9;
+       cvt.rn.f64.s64  %fd8, %rd8;
+       shl.b64         %rd9, %rd1, 3;
+       add.s64         %rd10, %rd7, %rd9;
+       st.global.f64   [%rd10], %fd8;
+
+BB24_4:
+       ret;
+}
+
+       // .globl       matrix_abs
+.visible .entry matrix_abs(
+       .param .u64 matrix_abs_param_0,
+       .param .u64 matrix_abs_param_1,
+       .param .u32 matrix_abs_param_2
+)
+{
+       .reg .pred      %p<2>;
+       .reg .b32       %r<6>;
+       .reg .f64       %fd<3>;
+       .reg .b64       %rd<8>;
+
+
+       ld.param.u64    %rd1, [matrix_abs_param_0];
+       ld.param.u64    %rd2, [matrix_abs_param_1];
+       ld.param.u32    %r2, [matrix_abs_param_2];
+       mov.u32         %r3, %ctaid.x;
+       mov.u32         %r4, %ntid.x;
+       mov.u32         %r5, %tid.x;
+       mad.lo.s32      %r1, %r4, %r3, %r5;
+       setp.ge.u32     %p1, %r1, %r2;
+       @%p1 bra        BB25_2;
+
+       cvta.to.global.u64      %rd3, %rd1;
+       mul.wide.s32    %rd4, %r1, 8;
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f64   %fd1, [%rd5];
+       abs.f64         %fd2, %fd1;
+       cvta.to.global.u64      %rd6, %rd2;
+       add.s64         %rd7, %rd6, %rd4;
+       st.global.f64   [%rd7], %fd2;
+
+BB25_2:
+       ret;
+}
+
+       // .globl       matrix_log
+.visible .entry matrix_log(
+       .param .u64 matrix_log_param_0,
+       .param .u64 matrix_log_param_1,
+       .param .u32 matrix_log_param_2
+)
+{
+       .reg .pred      %p<6>;
+       .reg .f32       %f<2>;
+       .reg .b32       %r<33>;
+       .reg .f64       %fd<59>;
+       .reg .b64       %rd<10>;
+
+
+       ld.param.u64    %rd2, [matrix_log_param_0];
+       ld.param.u64    %rd3, [matrix_log_param_1];
+       ld.param.u32    %r12, [matrix_log_param_2];
+       mov.u32         %r13, %ctaid.x;
+       mov.u32         %r14, %ntid.x;
+       mov.u32         %r15, %tid.x;
+       mad.lo.s32      %r1, %r14, %r13, %r15;
+       setp.ge.u32     %p1, %r1, %r12;
+       @%p1 bra        BB26_9;
+
+       cvta.to.global.u64      %rd4, %rd2;
+       cvt.s64.s32     %rd1, %r1;
+       mul.wide.s32    %rd5, %r1, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f64   %fd56, [%rd6];
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r49}, %fd14;
+       mov.b64         {%temp, %r29}, %fd56;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%r48, %temp}, %fd14;
+       mov.b64         {%r30, %temp}, %fd56;
        }
-       shr.u32         %r16, %r49, 20;
-       add.s32         %r50, %r16, -54;
+       mov.u32         %r31, -1023;
+       setp.gt.s32     %p2, %r29, 1048575;
+       @%p2 bra        BB26_3;
 
-BB23_2:
-       add.s32         %r51, %r50, -1023;
-       and.b32         %r17, %r49, -2146435073;
-       or.b32          %r18, %r17, 1072693248;
-       mov.b64         %fd132, {%r48, %r18};
-       setp.lt.u32     %p2, %r18, 1073127583;
-       @%p2 bra        BB23_4;
+       mul.f64         %fd56, %fd56, 0d4350000000000000;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r29}, %fd56;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r30, %temp}, %fd56;
+       }
+       mov.u32         %r31, -1077;
+
+BB26_3:
+       add.s32         %r18, %r29, -1;
+       setp.lt.u32     %p3, %r18, 2146435071;
+       @%p3 bra        BB26_5;
+       bra.uni         BB26_4;
+
+BB26_5:
+       shr.u32         %r20, %r29, 20;
+       add.s32         %r32, %r31, %r20;
+       and.b32         %r21, %r29, -2146435073;
+       or.b32          %r22, %r21, 1072693248;
+       mov.b64         %fd57, {%r30, %r22};
+       setp.lt.s32     %p5, %r22, 1073127583;
+       @%p5 bra        BB26_7;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r19, %temp}, %fd132;
+       mov.b64         {%r23, %temp}, %fd57;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r20}, %fd132;
+       mov.b64         {%temp, %r24}, %fd57;
        }
-       add.s32         %r21, %r20, -1048576;
-       mov.b64         %fd132, {%r19, %r21};
-       add.s32         %r51, %r50, -1022;
+       add.s32         %r25, %r24, -1048576;
+       mov.b64         %fd57, {%r23, %r25};
+       add.s32         %r32, %r32, 1;
 
-BB23_4:
-       add.f64         %fd16, %fd132, 0d3FF0000000000000;
+BB26_7:
+       add.f64         %fd13, %fd57, 0d3FF0000000000000;
        // inline asm
-       rcp.approx.ftz.f64 %fd15,%fd16;
+       rcp.approx.ftz.f64 %fd12,%fd13;
        // inline asm
-       neg.f64         %fd17, %fd16;
-       mov.f64         %fd18, 0d3FF0000000000000;
-       fma.rn.f64      %fd19, %fd17, %fd15, %fd18;
-       fma.rn.f64      %fd20, %fd19, %fd19, %fd19;
-       fma.rn.f64      %fd21, %fd20, %fd15, %fd15;
-       add.f64         %fd22, %fd132, 0dBFF0000000000000;
-       mul.f64         %fd23, %fd22, %fd21;
-       fma.rn.f64      %fd24, %fd22, %fd21, %fd23;
-       mul.f64         %fd25, %fd24, %fd24;
-       mov.f64         %fd26, 0d3ED0F5D241AD3B5A;
-       mov.f64         %fd27, 0d3EB0F5FF7D2CAFE2;
-       fma.rn.f64      %fd28, %fd27, %fd25, %fd26;
-       mov.f64         %fd29, 0d3EF3B20A75488A3F;
-       fma.rn.f64      %fd30, %fd28, %fd25, %fd29;
+       neg.f64         %fd14, %fd13;
+       mov.f64         %fd15, 0d3FF0000000000000;
+       fma.rn.f64      %fd16, %fd14, %fd12, %fd15;
+       fma.rn.f64      %fd17, %fd16, %fd16, %fd16;
+       fma.rn.f64      %fd18, %fd17, %fd12, %fd12;
+       add.f64         %fd19, %fd57, 0dBFF0000000000000;
+       mul.f64         %fd20, %fd19, %fd18;
+       fma.rn.f64      %fd21, %fd19, %fd18, %fd20;
+       mul.f64         %fd22, %fd21, %fd21;
+       mov.f64         %fd23, 0d3ED0EE258B7A8B04;
+       mov.f64         %fd24, 0d3EB1380B3AE80F1E;
+       fma.rn.f64      %fd25, %fd24, %fd22, %fd23;
+       mov.f64         %fd26, 0d3EF3B2669F02676F;
+       fma.rn.f64      %fd27, %fd25, %fd22, %fd26;
+       mov.f64         %fd28, 0d3F1745CBA9AB0956;
+       fma.rn.f64      %fd29, %fd27, %fd22, %fd28;
+       mov.f64         %fd30, 0d3F3C71C72D1B5154;
+       fma.rn.f64      %fd31, %fd29, %fd22, %fd30;
+       mov.f64         %fd32, 0d3F624924923BE72D;
+       fma.rn.f64      %fd33, %fd31, %fd22, %fd32;
+       mov.f64         %fd34, 0d3F8999999999A3C4;
+       fma.rn.f64      %fd35, %fd33, %fd22, %fd34;
+       mov.f64         %fd36, 0d3FB5555555555554;
+       fma.rn.f64      %fd37, %fd35, %fd22, %fd36;
+       sub.f64         %fd38, %fd19, %fd21;
+       add.f64         %fd39, %fd38, %fd38;
+       neg.f64         %fd40, %fd21;
+       fma.rn.f64      %fd41, %fd40, %fd19, %fd39;
+       mul.f64         %fd42, %fd18, %fd41;
+       mul.f64         %fd43, %fd22, %fd37;
+       fma.rn.f64      %fd44, %fd43, %fd21, %fd42;
+       xor.b32         %r26, %r32, -2147483648;
+       mov.u32         %r27, 1127219200;
+       mov.b64         %fd45, {%r26, %r27};
+       mov.u32         %r28, -2147483648;
+       mov.b64         %fd46, {%r28, %r27};
+       sub.f64         %fd47, %fd45, %fd46;
+       mov.f64         %fd48, 0d3FE62E42FEFA39EF;
+       fma.rn.f64      %fd49, %fd47, %fd48, %fd21;
+       neg.f64         %fd50, %fd47;
+       fma.rn.f64      %fd51, %fd50, %fd48, %fd49;
+       sub.f64         %fd52, %fd51, %fd21;
+       sub.f64         %fd53, %fd44, %fd52;
+       mov.f64         %fd54, 0d3C7ABC9E3B39803F;
+       fma.rn.f64      %fd55, %fd47, %fd54, %fd53;
+       add.f64         %fd58, %fd49, %fd55;
+       bra.uni         BB26_8;
+
+BB26_4:
+       mov.f64         %fd10, 0d7FF0000000000000;
+       fma.rn.f64      %fd11, %fd56, %fd10, %fd10;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r19}, %fd56;
+       }
+       mov.b32          %f1, %r19;
+       setp.eq.f32     %p4, %f1, 0f00000000;
+       selp.f64        %fd58, 0dFFF0000000000000, %fd11, %p4;
+
+BB26_8:
+       cvta.to.global.u64      %rd7, %rd3;
+       shl.b64         %rd8, %rd1, 3;
+       add.s64         %rd9, %rd7, %rd8;
+       st.global.f64   [%rd9], %fd58;
+
+BB26_9:
+       ret;
+}
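
In contrast, log() inlines the full libm expansion: subnormal inputs are rescaled by 2^54 (0d4350000000000000) with the exponent bias adjusted from -1023 to -1077, the mantissa is normalized into [sqrt(2)/2, sqrt(2)), a polynomial in u^2 with u = 2(m-1)/(m+1) is evaluated with fused multiply-adds, and n*ln(2) is added back in high/low pieces (0d3FE62E42FEFA39EF and 0d3C7ABC9E3B39803F) to preserve accuracy. BB26_4 is the special-case path: -inf for zero, NaN for negative or NaN inputs, +inf for +inf. The source is presumably just:

    /* Sketch only; log() expands to the whole block above. */
    extern "C"
    __global__ void matrix_log(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            C[index] = log(A[index]);
        }
    }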
+
+       // .globl       matrix_floor
+.visible .entry matrix_floor(
+       .param .u64 matrix_floor_param_0,
+       .param .u64 matrix_floor_param_1,
+       .param .u32 matrix_floor_param_2
+)
+{
+       .reg .pred      %p<2>;
+       .reg .b32       %r<6>;
+       .reg .f64       %fd<3>;
+       .reg .b64       %rd<8>;
+
+
+       ld.param.u64    %rd1, [matrix_floor_param_0];
+       ld.param.u64    %rd2, [matrix_floor_param_1];
+       ld.param.u32    %r2, [matrix_floor_param_2];
+       mov.u32         %r3, %ctaid.x;
+       mov.u32         %r4, %ntid.x;
+       mov.u32         %r5, %tid.x;
+       mad.lo.s32      %r1, %r4, %r3, %r5;
+       setp.ge.u32     %p1, %r1, %r2;
+       @%p1 bra        BB27_2;
+
+       cvta.to.global.u64      %rd3, %rd1;
+       mul.wide.s32    %rd4, %r1, 8;
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f64   %fd1, [%rd5];
+       cvt.rmi.f64.f64 %fd2, %fd1;
+       cvta.to.global.u64      %rd6, %rd2;
+       add.s64         %rd7, %rd6, %rd4;
+       st.global.f64   [%rd7], %fd2;
+
+BB27_2:
+       ret;
+}
+
+       // .globl       matrix_ceil
+.visible .entry matrix_ceil(
+       .param .u64 matrix_ceil_param_0,
+       .param .u64 matrix_ceil_param_1,
+       .param .u32 matrix_ceil_param_2
+)
+{
+       .reg .pred      %p<2>;
+       .reg .b32       %r<6>;
+       .reg .f64       %fd<3>;
+       .reg .b64       %rd<8>;
+
+
+       ld.param.u64    %rd1, [matrix_ceil_param_0];
+       ld.param.u64    %rd2, [matrix_ceil_param_1];
+       ld.param.u32    %r2, [matrix_ceil_param_2];
+       mov.u32         %r3, %ctaid.x;
+       mov.u32         %r4, %ntid.x;
+       mov.u32         %r5, %tid.x;
+       mad.lo.s32      %r1, %r4, %r3, %r5;
+       setp.ge.u32     %p1, %r1, %r2;
+       @%p1 bra        BB28_2;
+
+       cvta.to.global.u64      %rd3, %rd1;
+       mul.wide.s32    %rd4, %r1, 8;
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f64   %fd1, [%rd5];
+       cvt.rpi.f64.f64 %fd2, %fd1;
+       cvta.to.global.u64      %rd6, %rd2;
+       add.s64         %rd7, %rd6, %rd4;
+       st.global.f64   [%rd7], %fd2;
+
+BB28_2:
+       ret;
+}
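
floor and ceil each cost exactly one instruction beyond the load/store scaffolding, because round-toward-minus-infinity (cvt.rmi.f64.f64) and round-toward-plus-infinity (cvt.rpi.f64.f64) are native conversion modes on the hardware. Hypothetical reconstructions:

    /* Sketch only. */
    extern "C"
    __global__ void matrix_floor(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size)
            C[index] = floor(A[index]);  // cvt.rmi.f64.f64
    }

    extern "C"
    __global__ void matrix_ceil(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size)
            C[index] = ceil(A[index]);   // cvt.rpi.f64.f64
    }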
+
+       // .globl       matrix_sin
+.visible .entry matrix_sin(
+       .param .u64 matrix_sin_param_0,
+       .param .u64 matrix_sin_param_1,
+       .param .u32 matrix_sin_param_2
+)
+{
+       .local .align 4 .b8     __local_depot29[4];
+       .reg .b64       %SP;
+       .reg .b64       %SPL;
+       .reg .pred      %p<7>;
+       .reg .b32       %r<18>;
+       .reg .f64       %fd<41>;
+       .reg .b64       %rd<17>;
+
+
+       mov.u64         %rd16, __local_depot29;
+       cvta.local.u64  %SP, %rd16;
+       ld.param.u64    %rd3, [matrix_sin_param_0];
+       ld.param.u64    %rd4, [matrix_sin_param_1];
+       ld.param.u32    %r5, [matrix_sin_param_2];
+       add.u64         %rd5, %SP, 0;
+       cvta.to.local.u64       %rd1, %rd5;
+       mov.u32         %r6, %ntid.x;
+       mov.u32         %r7, %ctaid.x;
+       mov.u32         %r8, %tid.x;
+       mad.lo.s32      %r1, %r6, %r7, %r8;
+       setp.ge.u32     %p1, %r1, %r5;
+       @%p1 bra        BB29_11;
+
+       cvta.to.global.u64      %rd6, %rd3;
+       cvt.s64.s32     %rd2, %r1;
+       mul.wide.s32    %rd7, %r1, 8;
+       add.s64         %rd8, %rd6, %rd7;
+       ld.global.f64   %fd38, [%rd8];
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r9}, %fd38;
+       }
+       and.b32         %r10, %r9, 2147483647;
+       setp.ne.s32     %p2, %r10, 2146435072;
+       @%p2 bra        BB29_4;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r11, %temp}, %fd38;
+       }
+       setp.ne.s32     %p3, %r11, 0;
+       @%p3 bra        BB29_4;
+
+       mov.f64         %fd14, 0d0000000000000000;
+       mul.rn.f64      %fd38, %fd38, %fd14;
+
+BB29_4:
+       mul.f64         %fd15, %fd38, 0d3FE45F306DC9C883;
+       cvt.rni.s32.f64 %r17, %fd15;
+       st.local.u32    [%rd1], %r17;
+       cvt.rn.f64.s32  %fd16, %r17;
+       neg.f64         %fd17, %fd16;
+       mov.f64         %fd18, 0d3FF921FB54442D18;
+       fma.rn.f64      %fd19, %fd17, %fd18, %fd38;
+       mov.f64         %fd20, 0d3C91A62633145C00;
+       fma.rn.f64      %fd21, %fd17, %fd20, %fd19;
+       mov.f64         %fd22, 0d397B839A252049C0;
+       fma.rn.f64      %fd39, %fd17, %fd22, %fd21;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r12}, %fd38;
+       }
+       and.b32         %r13, %r12, 2145386496;
+       setp.lt.u32     %p4, %r13, 1105199104;
+       @%p4 bra        BB29_6;
+
+       // Callseq Start 3
+       {
+       .reg .b32 temp_param_reg;
+       // <end>}
+       .param .b64 param0;
+       st.param.f64    [param0+0], %fd38;
+       .param .b64 param1;
+       st.param.b64    [param1+0], %rd5;
+       .param .b64 retval0;
+       call.uni (retval0), 
+       __internal_trig_reduction_slowpathd, 
+       (
+       param0, 
+       param1
+       );
+       ld.param.f64    %fd39, [retval0+0];
+       
+       //{
+       }// Callseq End 3
+       ld.local.u32    %r17, [%rd1];
+
+BB29_6:
+       and.b32         %r14, %r17, 1;
+       shl.b32         %r15, %r14, 3;
+       setp.eq.s32     %p5, %r14, 0;
+       selp.f64        %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+       mul.wide.u32    %rd10, %r15, 8;
+       mov.u64         %rd11, __cudart_sin_cos_coeffs;
+       add.s64         %rd12, %rd10, %rd11;
+       ld.const.f64    %fd24, [%rd12+8];
+       mul.rn.f64      %fd7, %fd39, %fd39;
+       fma.rn.f64      %fd25, %fd23, %fd7, %fd24;
+       ld.const.f64    %fd26, [%rd12+16];
+       fma.rn.f64      %fd27, %fd25, %fd7, %fd26;
+       ld.const.f64    %fd28, [%rd12+24];
+       fma.rn.f64      %fd29, %fd27, %fd7, %fd28;
+       ld.const.f64    %fd30, [%rd12+32];
+       fma.rn.f64      %fd31, %fd29, %fd7, %fd30;
+       ld.const.f64    %fd32, [%rd12+40];
+       fma.rn.f64      %fd33, %fd31, %fd7, %fd32;
+       ld.const.f64    %fd34, [%rd12+48];
+       fma.rn.f64      %fd8, %fd33, %fd7, %fd34;
+       fma.rn.f64      %fd40, %fd8, %fd39, %fd39;
+       @%p5 bra        BB29_8;
+
+       mov.f64         %fd35, 0d3FF0000000000000;
+       fma.rn.f64      %fd40, %fd8, %fd7, %fd35;
+
+BB29_8:
+       and.b32         %r16, %r17, 2;
+       setp.eq.s32     %p6, %r16, 0;
+       @%p6 bra        BB29_10;
+
+       mov.f64         %fd36, 0d0000000000000000;
+       mov.f64         %fd37, 0dBFF0000000000000;
+       fma.rn.f64      %fd40, %fd40, %fd37, %fd36;
+
+BB29_10:
+       cvta.to.global.u64      %rd13, %rd4;
+       shl.b64         %rd14, %rd2, 3;
+       add.s64         %rd15, %rd13, %rd14;
+       st.global.f64   [%rd15], %fd40;
+
+BB29_11:
+       ret;
+}
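
sin() brings in argument reduction, which is why this kernel carries a 4-byte local depot: the input is multiplied by 2/pi (0d3FE45F306DC9C883) to select a quadrant, the reduced argument is recovered against pi/2 split into three parts (0d3FF921FB54442D18 / 0d3C91A62633145C00 / 0d397B839A252049C0) for accuracy, and inputs with magnitude around 2^31 or larger take the __internal_trig_reduction_slowpathd call defined near the end of this file. The polynomial coefficients come from the shared __cudart_sin_cos_coeffs table; the quadrant's low bit selects the sin or cos row, and bit 1 flips the sign. The source is presumably:

    /* Sketch only. */
    extern "C"
    __global__ void matrix_sin(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            C[index] = sin(A[index]);
        }
    }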
+
+       // .globl       matrix_cos
+.visible .entry matrix_cos(
+       .param .u64 matrix_cos_param_0,
+       .param .u64 matrix_cos_param_1,
+       .param .u32 matrix_cos_param_2
+)
+{
+       .local .align 4 .b8     __local_depot30[4];
+       .reg .b64       %SP;
+       .reg .b64       %SPL;
+       .reg .pred      %p<7>;
+       .reg .b32       %r<19>;
+       .reg .f64       %fd<41>;
+       .reg .b64       %rd<17>;
+
+
+       mov.u64         %rd16, __local_depot30;
+       cvta.local.u64  %SP, %rd16;
+       ld.param.u64    %rd3, [matrix_cos_param_0];
+       ld.param.u64    %rd4, [matrix_cos_param_1];
+       ld.param.u32    %r6, [matrix_cos_param_2];
+       add.u64         %rd5, %SP, 0;
+       cvta.to.local.u64       %rd1, %rd5;
+       mov.u32         %r7, %ntid.x;
+       mov.u32         %r8, %ctaid.x;
+       mov.u32         %r9, %tid.x;
+       mad.lo.s32      %r1, %r7, %r8, %r9;
+       setp.ge.u32     %p1, %r1, %r6;
+       @%p1 bra        BB30_11;
+
+       cvta.to.global.u64      %rd6, %rd3;
+       cvt.s64.s32     %rd2, %r1;
+       mul.wide.s32    %rd7, %r1, 8;
+       add.s64         %rd8, %rd6, %rd7;
+       ld.global.f64   %fd38, [%rd8];
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r10}, %fd38;
+       }
+       and.b32         %r11, %r10, 2147483647;
+       setp.ne.s32     %p2, %r11, 2146435072;
+       @%p2 bra        BB30_4;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r12, %temp}, %fd38;
+       }
+       setp.ne.s32     %p3, %r12, 0;
+       @%p3 bra        BB30_4;
+
+       mov.f64         %fd14, 0d0000000000000000;
+       mul.rn.f64      %fd38, %fd38, %fd14;
+
+BB30_4:
+       mul.f64         %fd15, %fd38, 0d3FE45F306DC9C883;
+       cvt.rni.s32.f64 %r18, %fd15;
+       st.local.u32    [%rd1], %r18;
+       cvt.rn.f64.s32  %fd16, %r18;
+       neg.f64         %fd17, %fd16;
+       mov.f64         %fd18, 0d3FF921FB54442D18;
+       fma.rn.f64      %fd19, %fd17, %fd18, %fd38;
+       mov.f64         %fd20, 0d3C91A62633145C00;
+       fma.rn.f64      %fd21, %fd17, %fd20, %fd19;
+       mov.f64         %fd22, 0d397B839A252049C0;
+       fma.rn.f64      %fd39, %fd17, %fd22, %fd21;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r13}, %fd38;
+       }
+       and.b32         %r14, %r13, 2145386496;
+       setp.lt.u32     %p4, %r14, 1105199104;
+       @%p4 bra        BB30_6;
+
+       // Callseq Start 4
+       {
+       .reg .b32 temp_param_reg;
+       // <end>}
+       .param .b64 param0;
+       st.param.f64    [param0+0], %fd38;
+       .param .b64 param1;
+       st.param.b64    [param1+0], %rd5;
+       .param .b64 retval0;
+       call.uni (retval0), 
+       __internal_trig_reduction_slowpathd, 
+       (
+       param0, 
+       param1
+       );
+       ld.param.f64    %fd39, [retval0+0];
+       
+       //{
+       }// Callseq End 4
+       ld.local.u32    %r18, [%rd1];
+
+BB30_6:
+       add.s32         %r5, %r18, 1;
+       and.b32         %r15, %r5, 1;
+       shl.b32         %r16, %r15, 3;
+       setp.eq.s32     %p5, %r15, 0;
+       selp.f64        %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
+       mul.wide.u32    %rd10, %r16, 8;
+       mov.u64         %rd11, __cudart_sin_cos_coeffs;
+       add.s64         %rd12, %rd10, %rd11;
+       ld.const.f64    %fd24, [%rd12+8];
+       mul.rn.f64      %fd7, %fd39, %fd39;
+       fma.rn.f64      %fd25, %fd23, %fd7, %fd24;
+       ld.const.f64    %fd26, [%rd12+16];
+       fma.rn.f64      %fd27, %fd25, %fd7, %fd26;
+       ld.const.f64    %fd28, [%rd12+24];
+       fma.rn.f64      %fd29, %fd27, %fd7, %fd28;
+       ld.const.f64    %fd30, [%rd12+32];
+       fma.rn.f64      %fd31, %fd29, %fd7, %fd30;
+       ld.const.f64    %fd32, [%rd12+40];
+       fma.rn.f64      %fd33, %fd31, %fd7, %fd32;
+       ld.const.f64    %fd34, [%rd12+48];
+       fma.rn.f64      %fd8, %fd33, %fd7, %fd34;
+       fma.rn.f64      %fd40, %fd8, %fd39, %fd39;
+       @%p5 bra        BB30_8;
+
+       mov.f64         %fd35, 0d3FF0000000000000;
+       fma.rn.f64      %fd40, %fd8, %fd7, %fd35;
+
+BB30_8:
+       and.b32         %r17, %r5, 2;
+       setp.eq.s32     %p6, %r17, 0;
+       @%p6 bra        BB30_10;
+
+       mov.f64         %fd36, 0d0000000000000000;
+       mov.f64         %fd37, 0dBFF0000000000000;
+       fma.rn.f64      %fd40, %fd40, %fd37, %fd36;
+
+BB30_10:
+       cvta.to.global.u64      %rd13, %rd4;
+       shl.b64         %rd14, %rd2, 3;
+       add.s64         %rd15, %rd13, %rd14;
+       st.global.f64   [%rd15], %fd40;
+
+BB30_11:
+       ret;
+}
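
matrix_cos is the same machinery as matrix_sin except for one instruction: the add.s32 %r5, %r18, 1 in BB30_6 shifts the quadrant index by one, i.e. cos(x) = sin(x + pi/2). Presumably:

    /* Sketch only. */
    extern "C"
    __global__ void matrix_cos(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            C[index] = cos(A[index]);
        }
    }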
+
+       // .globl       matrix_tan
+.visible .entry matrix_tan(
+       .param .u64 matrix_tan_param_0,
+       .param .u64 matrix_tan_param_1,
+       .param .u32 matrix_tan_param_2
+)
+{
+       .local .align 4 .b8     __local_depot31[4];
+       .reg .b64       %SP;
+       .reg .b64       %SPL;
+       .reg .pred      %p<6>;
+       .reg .b32       %r<16>;
+       .reg .f64       %fd<66>;
+       .reg .b64       %rd<14>;
+
+
+       mov.u64         %rd13, __local_depot31;
+       cvta.local.u64  %SP, %rd13;
+       ld.param.u64    %rd3, [matrix_tan_param_0];
+       ld.param.u64    %rd4, [matrix_tan_param_1];
+       ld.param.u32    %r5, [matrix_tan_param_2];
+       add.u64         %rd5, %SP, 0;
+       cvta.to.local.u64       %rd1, %rd5;
+       mov.u32         %r6, %ntid.x;
+       mov.u32         %r7, %ctaid.x;
+       mov.u32         %r8, %tid.x;
+       mad.lo.s32      %r1, %r6, %r7, %r8;
+       setp.ge.u32     %p1, %r1, %r5;
+       @%p1 bra        BB31_9;
+
+       cvta.to.global.u64      %rd6, %rd3;
+       cvt.s64.s32     %rd2, %r1;
+       mul.wide.s32    %rd7, %r1, 8;
+       add.s64         %rd8, %rd6, %rd7;
+       ld.global.f64   %fd63, [%rd8];
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r9}, %fd63;
+       }
+       and.b32         %r10, %r9, 2147483647;
+       setp.ne.s32     %p2, %r10, 2146435072;
+       @%p2 bra        BB31_4;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r11, %temp}, %fd63;
+       }
+       setp.ne.s32     %p3, %r11, 0;
+       @%p3 bra        BB31_4;
+
+       mov.f64         %fd11, 0d0000000000000000;
+       mul.rn.f64      %fd63, %fd63, %fd11;
+
+BB31_4:
+       mul.f64         %fd12, %fd63, 0d3FE45F306DC9C883;
+       cvt.rni.s32.f64 %r15, %fd12;
+       st.local.u32    [%rd1], %r15;
+       cvt.rn.f64.s32  %fd13, %r15;
+       neg.f64         %fd14, %fd13;
+       mov.f64         %fd15, 0d3FF921FB54442D18;
+       fma.rn.f64      %fd16, %fd14, %fd15, %fd63;
+       mov.f64         %fd17, 0d3C91A62633145C00;
+       fma.rn.f64      %fd18, %fd14, %fd17, %fd16;
+       mov.f64         %fd19, 0d397B839A252049C0;
+       fma.rn.f64      %fd64, %fd14, %fd19, %fd18;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r12}, %fd63;
+       }
+       and.b32         %r13, %r12, 2145386496;
+       setp.lt.u32     %p4, %r13, 1105199104;
+       @%p4 bra        BB31_6;
+
+       // Callseq Start 5
+       {
+       .reg .b32 temp_param_reg;
+       // <end>}
+       .param .b64 param0;
+       st.param.f64    [param0+0], %fd63;
+       .param .b64 param1;
+       st.param.b64    [param1+0], %rd5;
+       .param .b64 retval0;
+       call.uni (retval0), 
+       __internal_trig_reduction_slowpathd, 
+       (
+       param0, 
+       param1
+       );
+       ld.param.f64    %fd64, [retval0+0];
+       
+       //{
+       }// Callseq End 5
+       ld.local.u32    %r15, [%rd1];
+
+BB31_6:
+       mul.f64         %fd20, %fd64, %fd64;
+       mov.f64         %fd21, 0dBEF9757C5B27EBB1;
+       mov.f64         %fd22, 0d3EE48DAC2799BCB9;
+       fma.rn.f64      %fd23, %fd22, %fd20, %fd21;
+       mov.f64         %fd24, 0d3F0980E90FD91E04;
+       fma.rn.f64      %fd25, %fd23, %fd20, %fd24;
+       mov.f64         %fd26, 0dBEFAE2B0417D7E1D;
+       fma.rn.f64      %fd27, %fd25, %fd20, %fd26;
+       mov.f64         %fd28, 0d3F119F5341BFBA57;
+       fma.rn.f64      %fd29, %fd27, %fd20, %fd28;
+       mov.f64         %fd30, 0d3F15E791A00F6919;
+       fma.rn.f64      %fd31, %fd29, %fd20, %fd30;
+       mov.f64         %fd32, 0d3F2FF2E7FADEC73A;
+       fma.rn.f64      %fd33, %fd31, %fd20, %fd32;
+       mov.f64         %fd34, 0d3F434BC1B206DA62;
+       fma.rn.f64      %fd35, %fd33, %fd20, %fd34;
+       mov.f64         %fd36, 0d3F57DB18EF2F83F9;
+       fma.rn.f64      %fd37, %fd35, %fd20, %fd36;
+       mov.f64         %fd38, 0d3F6D6D2E7AE49FBC;
+       fma.rn.f64      %fd39, %fd37, %fd20, %fd38;
+       mov.f64         %fd40, 0d3F8226E3A816A776;
+       fma.rn.f64      %fd41, %fd39, %fd20, %fd40;
+       mov.f64         %fd42, 0d3F9664F485D25660;
+       fma.rn.f64      %fd43, %fd41, %fd20, %fd42;
+       mov.f64         %fd44, 0d3FABA1BA1BABF31D;
+       fma.rn.f64      %fd45, %fd43, %fd20, %fd44;
+       mov.f64         %fd46, 0d3FC11111111105D2;
+       fma.rn.f64      %fd47, %fd45, %fd20, %fd46;
+       mov.f64         %fd48, 0d3FD555555555555E;
+       fma.rn.f64      %fd49, %fd47, %fd20, %fd48;
+       mul.f64         %fd7, %fd20, %fd49;
+       fma.rn.f64      %fd65, %fd7, %fd64, %fd64;
+       and.b32         %r14, %r15, 1;
+       setp.eq.b32     %p5, %r14, 1;
+       @!%p5 bra       BB31_8;
+       bra.uni         BB31_7;
+
+BB31_7:
+       sub.f64         %fd52, %fd65, %fd64;
+       neg.f64         %fd53, %fd52;
+       fma.rn.f64      %fd54, %fd7, %fd64, %fd53;
+       // inline asm
+       rcp.approx.ftz.f64 %fd50,%fd65;
+       // inline asm
+       neg.f64         %fd55, %fd65;
+       mov.f64         %fd56, 0d3FF0000000000000;
+       fma.rn.f64      %fd57, %fd55, %fd50, %fd56;
+       fma.rn.f64      %fd58, %fd57, %fd57, %fd57;
+       fma.rn.f64      %fd59, %fd58, %fd50, %fd50;
+       neg.f64         %fd60, %fd59;
+       fma.rn.f64      %fd61, %fd65, %fd60, %fd56;
+       fma.rn.f64      %fd62, %fd60, %fd54, %fd61;
+       fma.rn.f64      %fd65, %fd62, %fd60, %fd60;
+
+BB31_8:
+       cvta.to.global.u64      %rd10, %rd4;
+       shl.b64         %rd11, %rd2, 3;
+       add.s64         %rd12, %rd10, %rd11;
+       st.global.f64   [%rd12], %fd65;
+
+BB31_9:
+       ret;
+}
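
tan() reuses the same quadrant reduction, including the slow-path call, then evaluates a single long polynomial in the square of the reduced argument. For odd quadrants, rather than a second polynomial, BB31_7 computes -1/tan via rcp.approx.ftz.f64 plus Newton-style correction, using the identity tan(x + pi/2) = -1/tan(x). Presumably:

    /* Sketch only. */
    extern "C"
    __global__ void matrix_tan(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            C[index] = tan(A[index]);
        }
    }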
+
+       // .globl       matrix_asin
+.visible .entry matrix_asin(
+       .param .u64 matrix_asin_param_0,
+       .param .u64 matrix_asin_param_1,
+       .param .u32 matrix_asin_param_2
+)
+{
+       .reg .pred      %p<5>;
+       .reg .f32       %f<3>;
+       .reg .b32       %r<15>;
+       .reg .f64       %fd<83>;
+       .reg .b64       %rd<10>;
+
+
+       ld.param.u64    %rd2, [matrix_asin_param_0];
+       ld.param.u64    %rd3, [matrix_asin_param_1];
+       ld.param.u32    %r3, [matrix_asin_param_2];
+       mov.u32         %r4, %ctaid.x;
+       mov.u32         %r5, %ntid.x;
+       mov.u32         %r6, %tid.x;
+       mad.lo.s32      %r1, %r5, %r4, %r6;
+       setp.ge.u32     %p1, %r1, %r3;
+       @%p1 bra        BB32_5;
+
+       cvta.to.global.u64      %rd4, %rd2;
+       cvt.s64.s32     %rd1, %r1;
+       mul.wide.s32    %rd5, %r1, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f64   %fd1, [%rd6];
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r2}, %fd1;
+       }
+       mov.b32          %f1, %r2;
+       abs.f32         %f2, %f1;
+       setp.lt.f32     %p2, %f2, 0f3FE26666;
+       @%p2 bra        BB32_3;
+       bra.uni         BB32_2;
+
+BB32_3:
+       mul.f64         %fd55, %fd1, %fd1;
+       mov.f64         %fd56, 0dBFB3823B180754AF;
+       mov.f64         %fd57, 0d3FB0066BDC1895E9;
+       fma.rn.f64      %fd58, %fd57, %fd55, %fd56;
+       mov.f64         %fd59, 0d3FB11E52CC2F79AE;
+       fma.rn.f64      %fd60, %fd58, %fd55, %fd59;
+       mov.f64         %fd61, 0dBF924EAF3526861B;
+       fma.rn.f64      %fd62, %fd60, %fd55, %fd61;
+       mov.f64         %fd63, 0d3F91DF02A31E6CB7;
+       fma.rn.f64      %fd64, %fd62, %fd55, %fd63;
+       mov.f64         %fd65, 0d3F847D18B0EEC6CC;
+       fma.rn.f64      %fd66, %fd64, %fd55, %fd65;
+       mov.f64         %fd67, 0d3F8D0AF961BA53B0;
+       fma.rn.f64      %fd68, %fd66, %fd55, %fd67;
+       mov.f64         %fd69, 0d3F91BF7734CF1C48;
+       fma.rn.f64      %fd70, %fd68, %fd55, %fd69;
+       mov.f64         %fd71, 0d3F96E91483144EF7;
+       fma.rn.f64      %fd72, %fd70, %fd55, %fd71;
+       mov.f64         %fd73, 0d3F9F1C6E0A4F9F81;
+       fma.rn.f64      %fd74, %fd72, %fd55, %fd73;
+       mov.f64         %fd75, 0d3FA6DB6DC27FA92B;
+       fma.rn.f64      %fd76, %fd74, %fd55, %fd75;
+       mov.f64         %fd77, 0d3FB333333320F91B;
+       fma.rn.f64      %fd78, %fd76, %fd55, %fd77;
+       mov.f64         %fd79, 0d3FC5555555555F4D;
+       fma.rn.f64      %fd80, %fd78, %fd55, %fd79;
+       mul.f64         %fd81, %fd55, %fd80;
+       fma.rn.f64      %fd82, %fd81, %fd1, %fd1;
+       bra.uni         BB32_4;
+
+BB32_2:
+       abs.f64         %fd7, %fd1;
+       mov.f64         %fd8, 0d3FE0000000000000;
+       mov.f64         %fd9, 0dBFE0000000000000;
+       fma.rn.f64      %fd6, %fd9, %fd7, %fd8;
+       // inline asm
+       rsqrt.approx.ftz.f64 %fd5, %fd6;
+       // inline asm
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r7, %temp}, %fd5;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r8}, %fd5;
+       }
+       add.s32         %r9, %r8, -1048576;
+       mov.b64         %fd10, {%r7, %r9};
+       mul.f64         %fd11, %fd6, %fd5;
+       neg.f64         %fd12, %fd11;
+       fma.rn.f64      %fd13, %fd11, %fd12, %fd6;
+       fma.rn.f64      %fd14, %fd13, %fd10, %fd11;
+       neg.f64         %fd15, %fd14;
+       mov.f64         %fd16, 0d3FF0000000000000;
+       fma.rn.f64      %fd17, %fd5, %fd15, %fd16;
+       fma.rn.f64      %fd18, %fd17, %fd10, %fd10;
+       fma.rn.f64      %fd19, %fd14, %fd15, %fd6;
+       fma.rn.f64      %fd20, %fd19, %fd18, %fd14;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r10}, %fd6;
+       }
+       setp.lt.s32     %p3, %r10, 0;
+       selp.f64        %fd21, 0dFFF8000000000000, %fd20, %p3;
+       setp.equ.f64    %p4, %fd6, 0d0000000000000000;
+       selp.f64        %fd22, %fd6, %fd21, %p4;
+       mov.f64         %fd23, 0dBFB3823B180754AF;
+       mov.f64         %fd24, 0d3FB0066BDC1895E9;
+       fma.rn.f64      %fd25, %fd24, %fd6, %fd23;
+       mov.f64         %fd26, 0d3FB11E52CC2F79AE;
+       fma.rn.f64      %fd27, %fd25, %fd6, %fd26;
+       mov.f64         %fd28, 0dBF924EAF3526861B;
+       fma.rn.f64      %fd29, %fd27, %fd6, %fd28;
+       mov.f64         %fd30, 0d3F91DF02A31E6CB7;
+       fma.rn.f64      %fd31, %fd29, %fd6, %fd30;
+       mov.f64         %fd32, 0d3F847D18B0EEC6CC;
+       fma.rn.f64      %fd33, %fd31, %fd6, %fd32;
+       mov.f64         %fd34, 0d3F8D0AF961BA53B0;
+       fma.rn.f64      %fd35, %fd33, %fd6, %fd34;
+       mov.f64         %fd36, 0d3F91BF7734CF1C48;
+       fma.rn.f64      %fd37, %fd35, %fd6, %fd36;
+       mov.f64         %fd38, 0d3F96E91483144EF7;
+       fma.rn.f64      %fd39, %fd37, %fd6, %fd38;
+       mov.f64         %fd40, 0d3F9F1C6E0A4F9F81;
+       fma.rn.f64      %fd41, %fd39, %fd6, %fd40;
+       mov.f64         %fd42, 0d3FA6DB6DC27FA92B;
+       fma.rn.f64      %fd43, %fd41, %fd6, %fd42;
+       mov.f64         %fd44, 0d3FB333333320F91B;
+       fma.rn.f64      %fd45, %fd43, %fd6, %fd44;
+       mov.f64         %fd46, 0d3FC5555555555F4D;
+       fma.rn.f64      %fd47, %fd45, %fd6, %fd46;
+       mul.f64         %fd48, %fd6, %fd47;
+       mul.f64         %fd49, %fd22, 0dC000000000000000;
+       mov.f64         %fd50, 0d3C91A62633145C07;
+       fma.rn.f64      %fd51, %fd49, %fd48, %fd50;
+       add.f64         %fd52, %fd49, 0d3FE921FB54442D18;
+       add.f64         %fd53, %fd52, %fd51;
+       add.f64         %fd54, %fd53, 0d3FE921FB54442D18;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r11, %temp}, %fd54;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r12}, %fd54;
+       }
+       and.b32         %r13, %r2, -2147483648;
+       or.b32          %r14, %r12, %r13;
+       mov.b64         %fd82, {%r11, %r14};
+
+BB32_4:
+       cvta.to.global.u64      %rd7, %rd3;
+       shl.b64         %rd8, %rd1, 3;
+       add.s64         %rd9, %rd7, %rd8;
+       st.global.f64   [%rd9], %fd82;
+
+BB32_5:
+       ret;
+}
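
asin() branches on the input's magnitude by comparing the high word of the double, reinterpreted as a float, against 0f3FE26666, i.e. roughly |x| < 0.575: small inputs get a direct odd polynomial in x^2 (BB32_3); larger ones use the identity asin(|x|) = pi/2 - 2*asin(sqrt((1-|x|)/2)), with the square root built from rsqrt.approx.ftz.f64 plus refinement and the input's sign bit spliced back on at the end (BB32_2). Presumably:

    /* Sketch only. */
    extern "C"
    __global__ void matrix_asin(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            C[index] = asin(A[index]);
        }
    }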
+
+       // .globl       matrix_acos
+.visible .entry matrix_acos(
+       .param .u64 matrix_acos_param_0,
+       .param .u64 matrix_acos_param_1,
+       .param .u32 matrix_acos_param_2
+)
+{
+       .reg .pred      %p<7>;
+       .reg .b32       %r<17>;
+       .reg .f64       %fd<95>;
+       .reg .b64       %rd<10>;
+
+
+       ld.param.u64    %rd2, [matrix_acos_param_0];
+       ld.param.u64    %rd3, [matrix_acos_param_1];
+       ld.param.u32    %r4, [matrix_acos_param_2];
+       mov.u32         %r5, %ctaid.x;
+       mov.u32         %r6, %ntid.x;
+       mov.u32         %r7, %tid.x;
+       mad.lo.s32      %r1, %r6, %r5, %r7;
+       setp.ge.u32     %p1, %r1, %r4;
+       @%p1 bra        BB33_14;
+
+       cvta.to.global.u64      %rd4, %rd2;
+       cvt.s64.s32     %rd1, %r1;
+       mul.wide.s32    %rd5, %r1, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f64   %fd16, [%rd6];
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r2}, %fd16;
+       }
+       abs.f64         %fd1, %fd16;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r8}, %fd1;
+       }
+       setp.lt.s32     %p2, %r8, 1071801958;
+       @%p2 bra        BB33_9;
+       bra.uni         BB33_2;
+
+BB33_9:
+       mul.f64         %fd62, %fd1, %fd1;
+       mov.f64         %fd63, 0dBFB3823B180754AF;
+       mov.f64         %fd64, 0d3FB0066BDC1895E9;
+       fma.rn.f64      %fd65, %fd64, %fd62, %fd63;
+       mov.f64         %fd66, 0d3FB11E52CC2F79AE;
+       fma.rn.f64      %fd67, %fd65, %fd62, %fd66;
+       mov.f64         %fd68, 0dBF924EAF3526861B;
+       fma.rn.f64      %fd69, %fd67, %fd62, %fd68;
+       mov.f64         %fd70, 0d3F91DF02A31E6CB7;
+       fma.rn.f64      %fd71, %fd69, %fd62, %fd70;
+       mov.f64         %fd72, 0d3F847D18B0EEC6CC;
+       fma.rn.f64      %fd73, %fd71, %fd62, %fd72;
+       mov.f64         %fd74, 0d3F8D0AF961BA53B0;
+       fma.rn.f64      %fd75, %fd73, %fd62, %fd74;
+       mov.f64         %fd76, 0d3F91BF7734CF1C48;
+       fma.rn.f64      %fd77, %fd75, %fd62, %fd76;
+       mov.f64         %fd78, 0d3F96E91483144EF7;
+       fma.rn.f64      %fd79, %fd77, %fd62, %fd78;
+       mov.f64         %fd80, 0d3F9F1C6E0A4F9F81;
+       fma.rn.f64      %fd81, %fd79, %fd62, %fd80;
+       mov.f64         %fd82, 0d3FA6DB6DC27FA92B;
+       fma.rn.f64      %fd83, %fd81, %fd62, %fd82;
+       mov.f64         %fd84, 0d3FB333333320F91B;
+       fma.rn.f64      %fd85, %fd83, %fd62, %fd84;
+       mov.f64         %fd86, 0d3FC5555555555F4D;
+       fma.rn.f64      %fd87, %fd85, %fd62, %fd86;
+       mul.f64         %fd88, %fd62, %fd87;
+       fma.rn.f64      %fd10, %fd88, %fd1, %fd1;
+       setp.lt.s32     %p6, %r2, 0;
+       @%p6 bra        BB33_11;
+
+       mov.f64         %fd89, 0dBC91A62633145C07;
+       add.rn.f64      %fd90, %fd10, %fd89;
+       neg.f64         %fd93, %fd90;
+       bra.uni         BB33_12;
+
+BB33_2:
+       mov.f64         %fd19, 0d3FF0000000000000;
+       sub.f64         %fd2, %fd19, %fd1;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r9, %temp}, %fd2;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r3}, %fd2;
+       }
+       add.s32         %r10, %r3, -1048576;
+       mov.b64         %fd18, {%r9, %r10};
+       // inline asm
+       rsqrt.approx.ftz.f64 %fd17, %fd18;
+       // inline asm
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r11, %temp}, %fd17;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r12}, %fd17;
+       }
+       add.s32         %r13, %r12, -1048576;
+       mov.b64         %fd20, {%r11, %r13};
+       mul.f64         %fd21, %fd18, %fd17;
+       neg.f64         %fd22, %fd21;
+       fma.rn.f64      %fd23, %fd21, %fd22, %fd18;
+       fma.rn.f64      %fd24, %fd23, %fd20, %fd21;
+       neg.f64         %fd25, %fd24;
+       fma.rn.f64      %fd26, %fd17, %fd25, %fd19;
+       fma.rn.f64      %fd27, %fd26, %fd20, %fd20;
+       fma.rn.f64      %fd28, %fd24, %fd25, %fd18;
+       fma.rn.f64      %fd3, %fd28, %fd27, %fd24;
+       setp.lt.s32     %p3, %r3, 1;
+       @%p3 bra        BB33_4;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r14, %temp}, %fd3;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r15}, %fd3;
+       }
+       add.s32         %r16, %r15, 1048576;
+       mov.b64         %fd29, {%r14, %r16};
+       mov.f64         %fd30, 0dBEBAC2FE66FAAC4B;
+       mov.f64         %fd31, 0d3EC715B371155F70;
+       fma.rn.f64      %fd32, %fd31, %fd2, %fd30;
+       mov.f64         %fd33, 0d3ED9A9B88EFCD9B8;
+       fma.rn.f64      %fd34, %fd32, %fd2, %fd33;
+       mov.f64         %fd35, 0d3EDD0F40A8A0C4C3;
+       fma.rn.f64      %fd36, %fd34, %fd2, %fd35;
+       mov.f64         %fd37, 0d3EF46D4CFA9E0E1F;
+       fma.rn.f64      %fd38, %fd36, %fd2, %fd37;
+       mov.f64         %fd39, 0d3F079C168D1E2422;
+       fma.rn.f64      %fd40, %fd38, %fd2, %fd39;
+       mov.f64         %fd41, 0d3F1C9A88C3BCA540;
+       fma.rn.f64      %fd42, %fd40, %fd2, %fd41;
+       mov.f64         %fd43, 0d3F31C4E64BD476DF;
+       fma.rn.f64      %fd44, %fd42, %fd2, %fd43;
+       mov.f64         %fd45, 0d3F46E8BA60009C8F;
+       fma.rn.f64      %fd46, %fd44, %fd2, %fd45;
+       mov.f64         %fd47, 0d3F5F1C71C62B05A2;
+       fma.rn.f64      %fd48, %fd46, %fd2, %fd47;
+       mov.f64         %fd49, 0d3F76DB6DB6DC9F2C;
+       fma.rn.f64      %fd50, %fd48, %fd2, %fd49;
+       mov.f64         %fd51, 0d3F9333333333329C;
+       fma.rn.f64      %fd52, %fd50, %fd2, %fd51;
+       mov.f64         %fd53, 0d3FB5555555555555;
+       fma.rn.f64      %fd54, %fd52, %fd2, %fd53;
+       mul.f64         %fd55, %fd2, %fd54;
+       fma.rn.f64      %fd94, %fd55, %fd29, %fd29;
+       bra.uni         BB33_5;
+
+BB33_11:
+       mov.f64         %fd91, 0d3C91A62633145C07;
+       add.rn.f64      %fd93, %fd10, %fd91;
+
+BB33_12:
+       mov.f64         %fd92, 0d3FF921FB54442D18;
+       add.rn.f64      %fd94, %fd92, %fd93;
+       bra.uni         BB33_13;
+
+BB33_4:
+       mov.f64         %fd56, 0d0000000000000000;
+       mul.rn.f64      %fd94, %fd1, %fd56;
+
+BB33_5:
+       setp.gt.s32     %p4, %r3, -1;
+       @%p4 bra        BB33_7;
+
+       mov.f64         %fd57, 0d7FF0000000000000;
+       mul.rn.f64      %fd94, %fd94, %fd57;
+
+BB33_7:
+       setp.gt.s32     %p5, %r2, -1;
+       @%p5 bra        BB33_13;
+
+       mov.f64         %fd58, 0dBCA1A62633145C07;
+       add.rn.f64      %fd59, %fd94, %fd58;
+       neg.f64         %fd60, %fd59;
+       mov.f64         %fd61, 0d400921FB54442D18;
+       add.rn.f64      %fd94, %fd61, %fd60;
+
+BB33_13:
+       cvta.to.global.u64      %rd7, %rd3;
+       shl.b64         %rd8, %rd1, 3;
+       add.s64         %rd9, %rd7, %rd8;
+       st.global.f64   [%rd9], %fd94;
+
+BB33_14:
+       ret;
+}
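
acos() shares asin's polynomial verbatim and assembles the result around it: for roughly |x| < 0.54 (the high-word test against 1071801958), it computes pi/2 - asin(x) for positive inputs and pi/2 + asin(|x|) for negative ones (BB33_9 through BB33_12); for larger |x| it computes 2*asin(sqrt((1-|x|)/2)) directly and, for negative inputs, subtracts from pi (0d400921FB54442D18, BB33_7 onward). Presumably:

    /* Sketch only. */
    extern "C"
    __global__ void matrix_acos(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            C[index] = acos(A[index]);
        }
    }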
+
+       // .globl       matrix_atan
+.visible .entry matrix_atan(
+       .param .u64 matrix_atan_param_0,
+       .param .u64 matrix_atan_param_1,
+       .param .u32 matrix_atan_param_2
+)
+{
+       .reg .pred      %p<5>;
+       .reg .b32       %r<11>;
+       .reg .f64       %fd<57>;
+       .reg .b64       %rd<10>;
+
+
+       ld.param.u64    %rd2, [matrix_atan_param_0];
+       ld.param.u64    %rd3, [matrix_atan_param_1];
+       ld.param.u32    %r2, [matrix_atan_param_2];
+       mov.u32         %r3, %ctaid.x;
+       mov.u32         %r4, %ntid.x;
+       mov.u32         %r5, %tid.x;
+       mad.lo.s32      %r1, %r4, %r3, %r5;
+       setp.ge.u32     %p1, %r1, %r2;
+       @%p1 bra        BB34_4;
+
+       cvta.to.global.u64      %rd4, %rd2;
+       cvt.s64.s32     %rd1, %r1;
+       mul.wide.s32    %rd5, %r1, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f64   %fd1, [%rd6];
+       abs.f64         %fd2, %fd1;
+       setp.leu.f64    %p2, %fd2, 0d3FF0000000000000;
+       mov.f64         %fd56, %fd2;
+       @%p2 bra        BB34_3;
+
+       // inline asm
+       rcp.approx.ftz.f64 %fd5,%fd2;
+       // inline asm
+       neg.f64         %fd7, %fd2;
+       mov.f64         %fd8, 0d3FF0000000000000;
+       fma.rn.f64      %fd9, %fd7, %fd5, %fd8;
+       fma.rn.f64      %fd10, %fd9, %fd9, %fd9;
+       fma.rn.f64      %fd11, %fd10, %fd5, %fd5;
+       setp.eq.f64     %p3, %fd2, 0d7FF0000000000000;
+       selp.f64        %fd3, 0d0000000000000000, %fd11, %p3;
+       mov.f64         %fd56, %fd3;
+
+BB34_3:
+       mov.f64         %fd4, %fd56;
+       cvta.to.global.u64      %rd7, %rd3;
+       mul.f64         %fd12, %fd4, %fd4;
+       mov.f64         %fd13, 0d3F2D3B63DBB65B49;
+       mov.f64         %fd14, 0dBEF53E1D2A25FF7E;
+       fma.rn.f64      %fd15, %fd14, %fd12, %fd13;
+       mov.f64         %fd16, 0dBF5312788DDE082E;
+       fma.rn.f64      %fd17, %fd15, %fd12, %fd16;
+       mov.f64         %fd18, 0d3F6F9690C8249315;
+       fma.rn.f64      %fd19, %fd17, %fd12, %fd18;
+       mov.f64         %fd20, 0dBF82CF5AABC7CF0D;
+       fma.rn.f64      %fd21, %fd19, %fd12, %fd20;
+       mov.f64         %fd22, 0d3F9162B0B2A3BFDE;
+       fma.rn.f64      %fd23, %fd21, %fd12, %fd22;
+       mov.f64         %fd24, 0dBF9A7256FEB6FC6B;
+       fma.rn.f64      %fd25, %fd23, %fd12, %fd24;
+       mov.f64         %fd26, 0d3FA171560CE4A489;
+       fma.rn.f64      %fd27, %fd25, %fd12, %fd26;
+       mov.f64         %fd28, 0dBFA4F44D841450E4;
+       fma.rn.f64      %fd29, %fd27, %fd12, %fd28;
+       mov.f64         %fd30, 0d3FA7EE3D3F36BB95;
+       fma.rn.f64      %fd31, %fd29, %fd12, %fd30;
+       mov.f64         %fd32, 0dBFAAD32AE04A9FD1;
+       fma.rn.f64      %fd33, %fd31, %fd12, %fd32;
+       mov.f64         %fd34, 0d3FAE17813D66954F;
+       fma.rn.f64      %fd35, %fd33, %fd12, %fd34;
+       mov.f64         %fd36, 0dBFB11089CA9A5BCD;
+       fma.rn.f64      %fd37, %fd35, %fd12, %fd36;
+       mov.f64         %fd38, 0d3FB3B12B2DB51738;
+       fma.rn.f64      %fd39, %fd37, %fd12, %fd38;
+       mov.f64         %fd40, 0dBFB745D022F8DC5C;
+       fma.rn.f64      %fd41, %fd39, %fd12, %fd40;
+       mov.f64         %fd42, 0d3FBC71C709DFE927;
+       fma.rn.f64      %fd43, %fd41, %fd12, %fd42;
+       mov.f64         %fd44, 0dBFC2492491FA1744;
+       fma.rn.f64      %fd45, %fd43, %fd12, %fd44;
+       mov.f64         %fd46, 0d3FC99999999840D2;
+       fma.rn.f64      %fd47, %fd45, %fd12, %fd46;
+       mov.f64         %fd48, 0dBFD555555555544C;
+       fma.rn.f64      %fd49, %fd47, %fd12, %fd48;
+       mul.f64         %fd50, %fd12, %fd49;
+       fma.rn.f64      %fd51, %fd50, %fd4, %fd4;
+       mov.f64         %fd52, 0d3FF921FB54442D18;
+       sub.f64         %fd53, %fd52, %fd51;
+       setp.gt.f64     %p4, %fd2, 0d3FF0000000000000;
+       selp.f64        %fd54, %fd53, %fd51, %p4;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r6, %temp}, %fd54;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r7}, %fd54;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r8}, %fd1;
+       }
+       and.b32         %r9, %r8, -2147483648;
+       or.b32          %r10, %r7, %r9;
+       mov.b64         %fd55, {%r6, %r10};
+       shl.b64         %rd8, %rd1, 3;
+       add.s64         %rd9, %rd7, %rd8;
+       st.global.f64   [%rd9], %fd55;
+
+BB34_4:
+       ret;
+}
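
atan() reduces |x| > 1 through atan(x) = pi/2 - atan(1/x): the Newton-corrected rcp.approx.ftz.f64 at the top computes the reciprocal (with infinity mapped to 0 so that atan(inf) comes out as exactly pi/2), a polynomial in the square of the reduced argument does the rest, and the input's sign bit is copied onto the result. Presumably:

    /* Sketch only. */
    extern "C"
    __global__ void matrix_atan(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            C[index] = atan(A[index]);
        }
    }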
+
+       // .globl       matrix_sign
+.visible .entry matrix_sign(
+       .param .u64 matrix_sign_param_0,
+       .param .u64 matrix_sign_param_1,
+       .param .u32 matrix_sign_param_2
+)
+{
+       .reg .pred      %p<3>;
+       .reg .b32       %r<12>;
+       .reg .f64       %fd<4>;
+       .reg .b64       %rd<9>;
+
+
+       ld.param.u64    %rd2, [matrix_sign_param_0];
+       ld.param.u64    %rd3, [matrix_sign_param_1];
+       ld.param.u32    %r2, [matrix_sign_param_2];
+       mov.u32         %r3, %ctaid.x;
+       mov.u32         %r4, %ntid.x;
+       mov.u32         %r5, %tid.x;
+       mad.lo.s32      %r1, %r4, %r3, %r5;
+       setp.ge.u32     %p1, %r1, %r2;
+       @%p1 bra        BB35_4;
+
+       cvta.to.global.u64      %rd4, %rd2;
+       mul.wide.s32    %rd5, %r1, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f64   %fd1, [%rd6];
+       setp.eq.f64     %p2, %fd1, 0d0000000000000000;
+       cvta.to.global.u64      %rd7, %rd3;
+       add.s64         %rd1, %rd7, %rd5;
+       @%p2 bra        BB35_3;
+       bra.uni         BB35_2;
+
+BB35_3:
+       mov.u64         %rd8, 0;
+       st.global.u64   [%rd1], %rd8;
+       bra.uni         BB35_4;
+
+BB35_2:
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r6}, %fd1;
+       }
+       and.b32         %r7, %r6, -2147483648;
+       mov.f64         %fd2, 0d3FF0000000000000;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r8}, %fd2;
+       }
+       and.b32         %r9, %r8, 2147483647;
+       or.b32          %r10, %r9, %r7;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r11, %temp}, %fd2;
+       }
+       mov.b64         %fd3, {%r11, %r10};
+       st.global.f64   [%rd1], %fd3;
+
+BB35_4:
+       ret;
+}
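
matrix_sign is the one kernel here with no libm call behind it: zero inputs store a literal 0 (BB35_3), and everything else splices the input's sign bit onto the constant 1.0 (BB35_2), which is exactly copysign. A hypothetical reconstruction:

    /* Sketch only; matches the bit-splicing above. */
    extern "C"
    __global__ void matrix_sign(double *A, double *C, unsigned int size) {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < size) {
            double val = A[index];
            C[index] = (val == 0.0) ? 0.0 : copysign(1.0, val);
        }
    }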
+
+.func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd(
+       .param .b64 __internal_trig_reduction_slowpathd_param_0,
+       .param .b64 __internal_trig_reduction_slowpathd_param_1
+)
+{
+       .local .align 8 .b8     __local_depot36[40];
+       .reg .b64       %SP;
+       .reg .b64       %SPL;
+       .reg .pred      %p<9>;
+       .reg .b32       %r<42>;
+       .reg .f64       %fd<5>;
+       .reg .b64       %rd<101>;
+
+
+       mov.u64         %rd100, __local_depot36;
+       cvta.local.u64  %SP, %rd100;
+       ld.param.f64    %fd4, [__internal_trig_reduction_slowpathd_param_0];
+       ld.param.u64    %rd37, [__internal_trig_reduction_slowpathd_param_1];
+       add.u64         %rd38, %SP, 0;
+       cvta.to.local.u64       %rd1, %rd38;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r1}, %fd4;
+       }
+       and.b32         %r40, %r1, -2147483648;
+       shr.u32         %r3, %r1, 20;
+       bfe.u32         %r4, %r1, 20, 11;
+       setp.eq.s32     %p1, %r4, 2047;
+       @%p1 bra        BB36_13;
+
+       add.s32         %r16, %r4, -1024;
+       shr.u32         %r17, %r16, 6;
+       mov.u32         %r18, 16;
+       sub.s32         %r5, %r18, %r17;
+       mov.u32         %r19, 19;
+       sub.s32         %r20, %r19, %r17;
+       mov.u32         %r21, 18;
+       min.s32         %r6, %r21, %r20;
+       setp.gt.s32     %p2, %r5, %r6;
+       mov.u64         %rd94, 0;
+       mov.u64         %rd93, %rd1;
+       @%p2 bra        BB36_4;
+
+       mov.b64          %rd41, %fd4;
+       shl.b64         %rd42, %rd41, 11;
+       or.b64          %rd3, %rd42, -9223372036854775808;
+       add.s32         %r7, %r5, -1;
+       mov.u64         %rd92, %rd1;
+       bfe.u32         %r22, %r1, 20, 11;
+       add.s32         %r23, %r22, -1024;
+       shr.u32         %r24, %r23, 6;
+       neg.s32         %r25, %r24;
+       mul.wide.s32    %rd43, %r25, 8;
+       mov.u64         %rd44, __cudart_i2opi_d;
+       add.s64         %rd45, %rd43, %rd44;
+       add.s64         %rd90, %rd45, 120;
+       mov.u64         %rd94, 0;
+       mov.u64         %rd91, %rd1;
+       mov.u32         %r39, %r7;
+
+BB36_3:
+       .pragma "nounroll";
+       mov.u32         %r8, %r39;
+       mov.u64         %rd7, %rd91;
+       ld.const.u64    %rd48, [%rd90];
+       // inline asm
+       {
+       .reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
+       mov.b64         {alo,ahi}, %rd48;    
+       mov.b64         {blo,bhi}, %rd3;    
+       mov.b64         {clo,chi}, %rd94;    
+       mad.lo.cc.u32   r0, alo, blo, clo;
+       madc.hi.cc.u32  r1, alo, blo, chi;
+       madc.hi.u32     r2, alo, bhi,   0;
+       mad.lo.cc.u32   r1, alo, bhi,  r1;
+       madc.hi.cc.u32  r2, ahi, blo,  r2;
+       madc.hi.u32     r3, ahi, bhi,   0;
+       mad.lo.cc.u32   r1, ahi, blo,  r1;
+       madc.lo.cc.u32  r2, ahi, bhi,  r2;
+       addc.u32        r3,  r3,   0;     
+       mov.b64         %rd46, {r0,r1};      
+       mov.b64         %rd94, {r2,r3};      
+       }
+       // inline asm
+       st.local.u64    [%rd92], %rd46;
+       add.s32         %r9, %r8, 1;
+       sub.s32         %r26, %r9, %r7;
+       mul.wide.s32    %rd51, %r26, 8;
+       add.s64         %rd92, %rd1, %rd51;
+       add.s64         %rd13, %rd7, 8;
+       mov.u64         %rd93, %rd13;
+       add.s64         %rd90, %rd90, 8;
+       setp.lt.s32     %p3, %r9, %r6;
+       mov.u64         %rd91, %rd13;
+       mov.u32         %r39, %r9;
+       @%p3 bra        BB36_3;
+
+BB36_4:
+       st.local.u64    [%rd93], %rd94;
+       ld.local.u64    %rd95, [%rd1+16];
+       ld.local.u64    %rd96, [%rd1+24];
+       and.b32         %r10, %r3, 63;
+       setp.eq.s32     %p4, %r10, 0;
+       @%p4 bra        BB36_6;
+
+       mov.u32         %r27, 64;
+       sub.s32         %r28, %r27, %r10;
+       shl.b64         %rd52, %rd96, %r10;
+       shr.u64         %rd53, %rd95, %r28;
+       or.b64          %rd96, %rd52, %rd53;
+       shl.b64         %rd54, %rd95, %r10;
+       ld.local.u64    %rd55, [%rd1+8];
+       shr.u64         %rd56, %rd55, %r28;
+       or.b64          %rd95, %rd56, %rd54;
+
+BB36_6:
+       cvta.to.local.u64       %rd57, %rd37;
+       shr.u64         %rd58, %rd96, 62;
+       cvt.u32.u64     %r29, %rd58;
+       shr.u64         %rd59, %rd95, 62;
+       shl.b64         %rd60, %rd96, 2;
+       or.b64          %rd98, %rd60, %rd59;
+       shl.b64         %rd97, %rd95, 2;
+       shr.u64         %rd61, %rd96, 61;
+       cvt.u32.u64     %r30, %rd61;
+       and.b32         %r31, %r30, 1;
+       add.s32         %r32, %r31, %r29;
+       neg.s32         %r33, %r32;
+       setp.eq.s32     %p5, %r40, 0;
+       selp.b32        %r34, %r32, %r33, %p5;
+       st.local.u32    [%rd57], %r34;
+       setp.eq.s32     %p6, %r31, 0;
+       @%p6 bra        BB36_8;
+
+       mov.u64         %rd65, 0;
+       // inline asm
+       {
+       .reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
+       mov.b64         {a0,a1}, %rd65;
+       mov.b64         {a2,a3}, %rd65;
+       mov.b64         {b0,b1}, %rd97;
+       mov.b64         {b2,b3}, %rd98;
+       sub.cc.u32      r0, a0, b0; 
+       subc.cc.u32     r1, a1, b1; 
+       subc.cc.u32     r2, a2, b2; 
+       subc.u32        r3, a3, b3; 
+       mov.b64         %rd97, {r0,r1};
+       mov.b64         %rd98, {r2,r3};
+       }
+       // inline asm
+       xor.b32         %r40, %r40, -2147483648;
+
+BB36_8:
+       clz.b64         %r41, %rd98;
+       setp.eq.s32     %p7, %r41, 0;
+       @%p7 bra        BB36_10;
+
+       shl.b64         %rd68, %rd98, %r41;
+       mov.u32         %r35, 64;
+       sub.s32         %r36, %r35, %r41;
+       shr.u64         %rd69, %rd97, %r36;
+       or.b64          %rd98, %rd69, %rd68;
+
+BB36_10:
+       mov.u64         %rd73, -3958705157555305931;
+       // inline asm
+       {
+       .reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
+       mov.b64         {alo,ahi}, %rd98;   
+       mov.b64         {blo,bhi}, %rd73;   
+       mul.lo.u32      r0, alo, blo;    
+       mul.hi.u32      r1, alo, blo;    
+       mad.lo.cc.u32   r1, alo, bhi, r1;
+       madc.hi.u32     r2, alo, bhi,  0;
+       mad.lo.cc.u32   r1, ahi, blo, r1;
+       madc.hi.cc.u32  r2, ahi, blo, r2;
+       madc.hi.u32     r3, ahi, bhi,  0;
+       mad.lo.cc.u32   r2, ahi, bhi, r2;
+       addc.u32        r3, r3,  0;      
+       mov.b64         %rd70, {r0,r1};     
+       mov.b64         %rd99, {r2,r3};     
+       }
+       // inline asm
+       setp.lt.s64     %p8, %rd99, 1;
+       @%p8 bra        BB36_12;
+
+       // inline asm
+       {
+       .reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
+       mov.b64         {a0,a1}, %rd70;
+       mov.b64         {a2,a3}, %rd99;
+       mov.b64         {b0,b1}, %rd70;
+       mov.b64         {b2,b3}, %rd99;
+       add.cc.u32      r0, a0, b0; 
+       addc.cc.u32     r1, a1, b1; 
+       addc.cc.u32     r2, a2, b2; 
+       addc.u32        r3, a3, b3; 
+       mov.b64         %rd74, {r0,r1};
+       mov.b64         %rd99, {r2,r3};
+       }
+       // inline asm
+       add.s32         %r41, %r41, 1;
+
+BB36_12:
+       cvt.u64.u32     %rd80, %r40;
+       shl.b64         %rd81, %rd80, 32;
+       mov.u32         %r37, 1022;
+       sub.s32         %r38, %r37, %r41;
+       cvt.u64.u32     %rd82, %r38;
+       shl.b64         %rd83, %rd82, 52;
+       add.s64         %rd84, %rd99, 1;
+       shr.u64         %rd85, %rd84, 10;
+       add.s64         %rd86, %rd85, 1;
+       shr.u64         %rd87, %rd86, 1;
+       add.s64         %rd88, %rd87, %rd83;
+       or.b64          %rd89, %rd88, %rd81;
+       mov.b64          %fd4, %rd89;
+
+BB36_13:
+       st.param.f64    [func_retval0+0], %fd4;
+       ret;
+}
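
__internal_trig_reduction_slowpathd is not authored by this commit; nvcc emits it alongside any use of double-precision trig. It is a Payne-Hanek style reduction: the input's 53-bit significand is multiplied against a wide fixed-point table of 2/pi (__cudart_i2opi_d) using the 128-bit inline-asm multiply-accumulate chains above, the quadrant comes from the top bits of the product, and the fractional part is renormalized (clz.b64 plus shifts) back into a double with exponent 1022 minus the shift. Conceptually, though without the precision loss a naive remainder would incur for huge arguments:

    /* Conceptual sketch only -- the real slow path carries hundreds of bits of 2/pi. */
    #include <math.h>
    __device__ double reduce_arg(double x, int *quadrant) {
        // r = x - q*(pi/2), with the low bits of q (mod 4) selecting the quadrant
        return remquo(x, M_PI_2, quadrant);
    }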
+
+.func  (.param .b64 func_retval0) __internal_accurate_pow(
+       .param .b64 __internal_accurate_pow_param_0,
+       .param .b64 __internal_accurate_pow_param_1
+)
+{
+       .reg .pred      %p<9>;
+       .reg .f32       %f<3>;
+       .reg .b32       %r<52>;
+       .reg .f64       %fd<134>;
+
+
+       ld.param.f64    %fd12, [__internal_accurate_pow_param_0];
+       ld.param.f64    %fd13, [__internal_accurate_pow_param_1];
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r49}, %fd12;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r48, %temp}, %fd12;
+       }
+       shr.u32         %r50, %r49, 20;
+       setp.ne.s32     %p1, %r50, 0;
+       @%p1 bra        BB37_2;
+
+       mul.f64         %fd14, %fd12, 0d4350000000000000;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r49}, %fd14;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r48, %temp}, %fd14;
+       }
+       shr.u32         %r16, %r49, 20;
+       add.s32         %r50, %r16, -54;
+
+BB37_2:
+       add.s32         %r51, %r50, -1023;
+       and.b32         %r17, %r49, -2146435073;
+       or.b32          %r18, %r17, 1072693248;
+       mov.b64         %fd132, {%r48, %r18};
+       setp.lt.u32     %p2, %r18, 1073127583;
+       @%p2 bra        BB37_4;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r19, %temp}, %fd132;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r20}, %fd132;
+       }
+       add.s32         %r21, %r20, -1048576;
+       mov.b64         %fd132, {%r19, %r21};
+       add.s32         %r51, %r50, -1022;
+
+BB37_4:
+       add.f64         %fd16, %fd132, 0d3FF0000000000000;
+       // inline asm
+       rcp.approx.ftz.f64 %fd15,%fd16;
+       // inline asm
+       neg.f64         %fd17, %fd16;
+       mov.f64         %fd18, 0d3FF0000000000000;
+       fma.rn.f64      %fd19, %fd17, %fd15, %fd18;
+       fma.rn.f64      %fd20, %fd19, %fd19, %fd19;
+       fma.rn.f64      %fd21, %fd20, %fd15, %fd15;
+       add.f64         %fd22, %fd132, 0dBFF0000000000000;
+       mul.f64         %fd23, %fd22, %fd21;
+       fma.rn.f64      %fd24, %fd22, %fd21, %fd23;
+       mul.f64         %fd25, %fd24, %fd24;
+       mov.f64         %fd26, 0d3ED0F5D241AD3B5A;
+       mov.f64         %fd27, 0d3EB0F5FF7D2CAFE2;
+       fma.rn.f64      %fd28, %fd27, %fd25, %fd26;
+       mov.f64         %fd29, 0d3EF3B20A75488A3F;
+       fma.rn.f64      %fd30, %fd28, %fd25, %fd29;
        mov.f64         %fd31, 0d3F1745CDE4FAECD5;
        fma.rn.f64      %fd32, %fd30, %fd25, %fd31;
        mov.f64         %fd33, 0d3F3C71C7258A578B;
@@ -3499,13 +5071,13 @@ BB23_4:
        mov.b32          %f2, %r35;
        abs.f32         %f1, %f2;
        setp.lt.f32     %p4, %f1, 0f4086232B;
-       @%p4 bra        BB23_7;
+       @%p4 bra        BB37_7;
 
        setp.lt.f64     %p5, %fd4, 0d0000000000000000;
        add.f64         %fd129, %fd4, 0d7FF0000000000000;
        selp.f64        %fd133, 0d0000000000000000, %fd129, %p5;
        setp.geu.f32    %p6, %f1, 0f40874800;
-       @%p6 bra        BB23_7;
+       @%p6 bra        BB37_7;
 
        shr.u32         %r36, %r13, 31;
        add.s32         %r37, %r13, %r36;
@@ -3520,26 +5092,26 @@ BB23_4:
        mov.b64         %fd131, {%r44, %r43};
        mul.f64         %fd133, %fd130, %fd131;
 
-BB23_7:
+BB37_7:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r45}, %fd133;
        }
        and.b32         %r46, %r45, 2147483647;
        setp.ne.s32     %p7, %r46, 2146435072;
+       @%p7 bra        BB37_9;
+
        {
        .reg .b32 %temp; 
        mov.b64         {%r47, %temp}, %fd133;
        }
-       setp.ne.s32     %p8, %r47, 0;
-       or.pred         %p9, %p8, %p7;
-       @!%p9 bra       BB23_9;
-       bra.uni         BB23_8;
+       setp.eq.s32     %p8, %r47, 0;
+       @%p8 bra        BB37_10;
 
-BB23_8:
+BB37_9:
        fma.rn.f64      %fd133, %fd133, %fd5, %fd133;
 
-BB23_9:
+BB37_10:
        st.param.f64    [func_retval0+0], %fd133;
        ret;
 }
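
__internal_accurate_pow is functionally unchanged by this commit: the hunks touching it mostly rename its labels from BB23_* to BB37_*, because the new kernels shifted nvcc's basic-block numbering, plus a small restructuring of the final NaN/inf check (the or.pred collapsed into two branches) from regenerating the PTX. It is the extended-precision exp(y*ln(x)) core that the module's power operations call into; its shape, though not its literal implementation, is:

    /* Conceptual sketch only. */
    __device__ double accurate_pow(double x, double y) {
        // the PTX above carries ln(x) and y*ln(x) in double-double
        // precision before the final exponential
        return exp(y * log(x));
    }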

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/hops/UnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index c75d0e0..451960b 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -157,8 +157,14 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 				else //default unary 
 				{
 					int k = isCumulativeUnaryOperation() ? OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1;
-					if(_op == OpOp1.SELP || _op == OpOp1.EXP) {
-						et = findGPUExecTypeByMemEstimate(et);
+					switch(_op) {
+						case SELP: case EXP: case SQRT: case LOG: case ABS:
+						case ROUND: case FLOOR: case CEIL:
+						case SIN: case COS: case TAN: case ASIN: case ACOS: case ATAN:
+						case SIGN:
+							et = findGPUExecTypeByMemEstimate(et);
+							break;
+						default:
 					}
 					Unary unary1 = new Unary(input.constructLops(), HopsOpOp1LopsU.get(_op), 
 							getDataType(), getValueType(), et, k);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1fc764b9/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 4a45521..443d0eb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -69,12 +69,27 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( "^2"   , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
 		String2GPUInstructionType.put( "*2"   , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
 		String2GPUInstructionType.put( "-nz"  , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
-		String2GPUInstructionType.put( "+*"     , GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "-*"     , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "+*"   , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "-*"   , GPUINSTRUCTION_TYPE.ArithmeticBinary);
 		
 		// Builtin functions
 		String2GPUInstructionType.put( "sel+"   , GPUINSTRUCTION_TYPE.BuiltinUnary);
 		String2GPUInstructionType.put( "exp"    , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "log"    , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "abs"    , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "sqrt"   , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "round"  , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "floor"  , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "ceil"   , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "sin"    , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "cos"    , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "tan"    , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "asin"   , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "acos"   , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "atan"   , GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "sign"   , GPUINSTRUCTION_TYPE.BuiltinUnary);
+
+
 
 		String2GPUInstructionType.put( "solve"  , GPUINSTRUCTION_TYPE.BuiltinBinary);
 

