Continued from question #209709 
(https://answers.launchpad.net/pocl/+question/209709)

The disassembly of kernel_linked.bc follows:

; ModuleID = 'kernel_linked.bc'
; Data layout string: "e" = little-endian, "p:32:32:32" = 32-bit pointers,
; "n32" = native 32-bit integer width (see the LLVM Language Reference).
target datalayout = 
"e-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32"
target triple = "mipsel"

; External i32 globals: declared here, defined outside this module. They are
; read by @matrix_transpose, @get_group_id and @get_local_id below.
@_group_id_x = external global i32
@_group_id_y = external global i32
@_group_id_z = external global i32
@_local_id_x = external global i32
@_local_id_y = external global i32
@_local_id_z = external global i32

; Kernel listed in !opencl.kernels. Copies floats from %input into %tile, calls
; @pocl.barrier, then copies floats from %tile into %output with the two tile
; coordinates swapped. Both phases are fully unrolled into 16 load/store pairs.
;   %output : float addrspace(3)* buffer, only written (after the barrier)
;   %input  : float addrspace(3)* buffer, only read (before the barrier)
;   %tile   : float* scratch buffer, written before the barrier and read after
define void @matrix_transpose(float addrspace(3)* nocapture %output, 
float addrspace(3)* nocapture %input, float* %tile) nounwind {
entry:
   ; Only the x/y group ids and the x local id are read by this kernel.
   %0 = load i32* @_group_id_x, align 4
   %1 = load i32* @_group_id_y, align 4
   %2 = load i32* @_local_id_x, align 4
   ; Split the local id: %and = local_id_x & 31, %shr = local_id_x >> 5.
   %and = and i32 %2, 31
   %shr = lshr i32 %2, 5
   ; Tile indices use a multiplier of 33:
   ;   %add.i15 = %shr*33 + %and  (index used for the tile stores)
   ;   %add.i13 = %and*33 + %shr  (index used for the tile loads, coordinates swapped)
   %mul.i14 = mul nsw i32 %shr, 33
   %add.i15 = add nsw i32 %mul.i14, %and
   %mul.i12 = mul nsw i32 %and, 33
   %add.i13 = add nsw i32 %mul.i12, %shr
   ; Input index: %add.i7 = ((group_id_y*32 + %shr) << 8) + (group_id_x*32 | %and).
   %mul.i10 = shl nsw i32 %0, 5
   %add.i11 = or i32 %mul.i10, %and
   %mul.i8 = shl nsw i32 %1, 5
   %add.i9 = add nsw i32 %mul.i8, %shr
   %mul.i6 = shl nsw i32 %add.i9, 8
   %add.i7 = add nsw i32 %mul.i6, %add.i11
   ; Output index: %add.i = (group_id_x*32 + %shr) * 4128 + (group_id_y*32 | %and).
   ; NOTE(review): the output multiplier (4128) differs from the input one
   ; (<< 8, i.e. 256) -- confirm against the kernel source that this is intended.
   %add.i5 = or i32 %mul.i8, %and
   %add.i3 = add nsw i32 %mul.i10, %shr
   %mul.i = mul nsw i32 %add.i3, 4128
   %add.i = add nsw i32 %mul.i, %add.i5
   ; Phase 1: 16 copies %input -> %tile. Each step advances the input index by
   ; 512 (0 .. 7680) and the tile index by 66 (0 .. 990).
   %arrayidx = getelementptr inbounds float addrspace(3)* %input, i32 
%add.i7
   %3 = load float addrspace(3)* %arrayidx, align 4, !tbaa !1
   %arrayidx12 = getelementptr inbounds float* %tile, i32 %add.i15
   store float %3, float* %arrayidx12, align 4, !tbaa !1
   %add = add nsw i32 %add.i15, 66
   %add13 = add nsw i32 %add.i7, 512
   %arrayidx14 = getelementptr inbounds float addrspace(3)* %input, i32 
%add13
   %4 = load float addrspace(3)* %arrayidx14, align 4, !tbaa !1
   %arrayidx15 = getelementptr inbounds float* %tile, i32 %add
   store float %4, float* %arrayidx15, align 4, !tbaa !1
   %add16 = add nsw i32 %add.i15, 132
   %add17 = add nsw i32 %add.i7, 1024
   %arrayidx18 = getelementptr inbounds float addrspace(3)* %input, i32 
%add17
   %5 = load float addrspace(3)* %arrayidx18, align 4, !tbaa !1
   %arrayidx19 = getelementptr inbounds float* %tile, i32 %add16
   store float %5, float* %arrayidx19, align 4, !tbaa !1
   %add20 = add nsw i32 %add.i15, 198
   %add21 = add nsw i32 %add.i7, 1536
   %arrayidx22 = getelementptr inbounds float addrspace(3)* %input, i32 
%add21
   %6 = load float addrspace(3)* %arrayidx22, align 4, !tbaa !1
   %arrayidx23 = getelementptr inbounds float* %tile, i32 %add20
   store float %6, float* %arrayidx23, align 4, !tbaa !1
   %add24 = add nsw i32 %add.i15, 264
   %add25 = add nsw i32 %add.i7, 2048
   %arrayidx26 = getelementptr inbounds float addrspace(3)* %input, i32 
%add25
   %7 = load float addrspace(3)* %arrayidx26, align 4, !tbaa !1
   %arrayidx27 = getelementptr inbounds float* %tile, i32 %add24
   store float %7, float* %arrayidx27, align 4, !tbaa !1
   %add28 = add nsw i32 %add.i15, 330
   %add29 = add nsw i32 %add.i7, 2560
   %arrayidx30 = getelementptr inbounds float addrspace(3)* %input, i32 
%add29
   %8 = load float addrspace(3)* %arrayidx30, align 4, !tbaa !1
   %arrayidx31 = getelementptr inbounds float* %tile, i32 %add28
   store float %8, float* %arrayidx31, align 4, !tbaa !1
   %add32 = add nsw i32 %add.i15, 396
   %add33 = add nsw i32 %add.i7, 3072
   %arrayidx34 = getelementptr inbounds float addrspace(3)* %input, i32 
%add33
   %9 = load float addrspace(3)* %arrayidx34, align 4, !tbaa !1
   %arrayidx35 = getelementptr inbounds float* %tile, i32 %add32
   store float %9, float* %arrayidx35, align 4, !tbaa !1
   %add36 = add nsw i32 %add.i15, 462
   %add37 = add nsw i32 %add.i7, 3584
   %arrayidx38 = getelementptr inbounds float addrspace(3)* %input, i32 
%add37
   %10 = load float addrspace(3)* %arrayidx38, align 4, !tbaa !1
   %arrayidx39 = getelementptr inbounds float* %tile, i32 %add36
   store float %10, float* %arrayidx39, align 4, !tbaa !1
   %add40 = add nsw i32 %add.i15, 528
   %add41 = add nsw i32 %add.i7, 4096
   %arrayidx42 = getelementptr inbounds float addrspace(3)* %input, i32 
%add41
   %11 = load float addrspace(3)* %arrayidx42, align 4, !tbaa !1
   %arrayidx43 = getelementptr inbounds float* %tile, i32 %add40
   store float %11, float* %arrayidx43, align 4, !tbaa !1
   %add44 = add nsw i32 %add.i15, 594
   %add45 = add nsw i32 %add.i7, 4608
   %arrayidx46 = getelementptr inbounds float addrspace(3)* %input, i32 
%add45
   %12 = load float addrspace(3)* %arrayidx46, align 4, !tbaa !1
   %arrayidx47 = getelementptr inbounds float* %tile, i32 %add44
   store float %12, float* %arrayidx47, align 4, !tbaa !1
   %add48 = add nsw i32 %add.i15, 660
   %add49 = add nsw i32 %add.i7, 5120
   %arrayidx50 = getelementptr inbounds float addrspace(3)* %input, i32 
%add49
   %13 = load float addrspace(3)* %arrayidx50, align 4, !tbaa !1
   %arrayidx51 = getelementptr inbounds float* %tile, i32 %add48
   store float %13, float* %arrayidx51, align 4, !tbaa !1
   %add52 = add nsw i32 %add.i15, 726
   %add53 = add nsw i32 %add.i7, 5632
   %arrayidx54 = getelementptr inbounds float addrspace(3)* %input, i32 
%add53
   %14 = load float addrspace(3)* %arrayidx54, align 4, !tbaa !1
   %arrayidx55 = getelementptr inbounds float* %tile, i32 %add52
   store float %14, float* %arrayidx55, align 4, !tbaa !1
   %add56 = add nsw i32 %add.i15, 792
   %add57 = add nsw i32 %add.i7, 6144
   %arrayidx58 = getelementptr inbounds float addrspace(3)* %input, i32 
%add57
   %15 = load float addrspace(3)* %arrayidx58, align 4, !tbaa !1
   %arrayidx59 = getelementptr inbounds float* %tile, i32 %add56
   store float %15, float* %arrayidx59, align 4, !tbaa !1
   %add60 = add nsw i32 %add.i15, 858
   %add61 = add nsw i32 %add.i7, 6656
   %arrayidx62 = getelementptr inbounds float addrspace(3)* %input, i32 
%add61
   %16 = load float addrspace(3)* %arrayidx62, align 4, !tbaa !1
   %arrayidx63 = getelementptr inbounds float* %tile, i32 %add60
   store float %16, float* %arrayidx63, align 4, !tbaa !1
   %add64 = add nsw i32 %add.i15, 924
   %add65 = add nsw i32 %add.i7, 7168
   %arrayidx66 = getelementptr inbounds float addrspace(3)* %input, i32 
%add65
   %17 = load float addrspace(3)* %arrayidx66, align 4, !tbaa !1
   %arrayidx67 = getelementptr inbounds float* %tile, i32 %add64
   store float %17, float* %arrayidx67, align 4, !tbaa !1
   %add68 = add nsw i32 %add.i15, 990
   %add69 = add nsw i32 %add.i7, 7680
   %arrayidx70 = getelementptr inbounds float addrspace(3)* %input, i32 
%add69
   %18 = load float addrspace(3)* %arrayidx70, align 4, !tbaa !1
   %arrayidx71 = getelementptr inbounds float* %tile, i32 %add68
   store float %18, float* %arrayidx71, align 4, !tbaa !1
   ; Every %tile store is above this call and every %tile load is below it.
   call void @pocl.barrier() nounwind
   ; Phase 2: 16 copies %tile -> %output. Each step advances the tile index by
   ; 2 (0 .. 30) and the output index by 8256 (0 .. 123840).
   %arrayidx72 = getelementptr inbounds float* %tile, i32 %add.i13
   %19 = load float* %arrayidx72, align 4, !tbaa !1
   %arrayidx73 = getelementptr inbounds float addrspace(3)* %output, i32 
%add.i
   store float %19, float addrspace(3)* %arrayidx73, align 4, !tbaa !1
   %add74 = add nsw i32 %add.i13, 2
   %add75 = add nsw i32 %add.i, 8256
   %arrayidx76 = getelementptr inbounds float* %tile, i32 %add74
   %20 = load float* %arrayidx76, align 4, !tbaa !1
   %arrayidx77 = getelementptr inbounds float addrspace(3)* %output, i32 
%add75
   store float %20, float addrspace(3)* %arrayidx77, align 4, !tbaa !1
   %add78 = add nsw i32 %add.i13, 4
   %add79 = add nsw i32 %add.i, 16512
   %arrayidx80 = getelementptr inbounds float* %tile, i32 %add78
   %21 = load float* %arrayidx80, align 4, !tbaa !1
   %arrayidx81 = getelementptr inbounds float addrspace(3)* %output, i32 
%add79
   store float %21, float addrspace(3)* %arrayidx81, align 4, !tbaa !1
   %add82 = add nsw i32 %add.i13, 6
   %add83 = add nsw i32 %add.i, 24768
   %arrayidx84 = getelementptr inbounds float* %tile, i32 %add82
   %22 = load float* %arrayidx84, align 4, !tbaa !1
   %arrayidx85 = getelementptr inbounds float addrspace(3)* %output, i32 
%add83
   store float %22, float addrspace(3)* %arrayidx85, align 4, !tbaa !1
   %add86 = add nsw i32 %add.i13, 8
   %add87 = add nsw i32 %add.i, 33024
   %arrayidx88 = getelementptr inbounds float* %tile, i32 %add86
   %23 = load float* %arrayidx88, align 4, !tbaa !1
   %arrayidx89 = getelementptr inbounds float addrspace(3)* %output, i32 
%add87
   store float %23, float addrspace(3)* %arrayidx89, align 4, !tbaa !1
   %add90 = add nsw i32 %add.i13, 10
   %add91 = add nsw i32 %add.i, 41280
   %arrayidx92 = getelementptr inbounds float* %tile, i32 %add90
   %24 = load float* %arrayidx92, align 4, !tbaa !1
   %arrayidx93 = getelementptr inbounds float addrspace(3)* %output, i32 
%add91
   store float %24, float addrspace(3)* %arrayidx93, align 4, !tbaa !1
   %add94 = add nsw i32 %add.i13, 12
   %add95 = add nsw i32 %add.i, 49536
   %arrayidx96 = getelementptr inbounds float* %tile, i32 %add94
   %25 = load float* %arrayidx96, align 4, !tbaa !1
   %arrayidx97 = getelementptr inbounds float addrspace(3)* %output, i32 
%add95
   store float %25, float addrspace(3)* %arrayidx97, align 4, !tbaa !1
   %add98 = add nsw i32 %add.i13, 14
   %add99 = add nsw i32 %add.i, 57792
   %arrayidx100 = getelementptr inbounds float* %tile, i32 %add98
   %26 = load float* %arrayidx100, align 4, !tbaa !1
   %arrayidx101 = getelementptr inbounds float addrspace(3)* %output, 
i32 %add99
   store float %26, float addrspace(3)* %arrayidx101, align 4, !tbaa !1
   %add102 = add nsw i32 %add.i13, 16
   %add103 = add nsw i32 %add.i, 66048
   %arrayidx104 = getelementptr inbounds float* %tile, i32 %add102
   %27 = load float* %arrayidx104, align 4, !tbaa !1
   %arrayidx105 = getelementptr inbounds float addrspace(3)* %output, 
i32 %add103
   store float %27, float addrspace(3)* %arrayidx105, align 4, !tbaa !1
   %add106 = add nsw i32 %add.i13, 18
   %add107 = add nsw i32 %add.i, 74304
   %arrayidx108 = getelementptr inbounds float* %tile, i32 %add106
   %28 = load float* %arrayidx108, align 4, !tbaa !1
   %arrayidx109 = getelementptr inbounds float addrspace(3)* %output, 
i32 %add107
   store float %28, float addrspace(3)* %arrayidx109, align 4, !tbaa !1
   %add110 = add nsw i32 %add.i13, 20
   %add111 = add nsw i32 %add.i, 82560
   %arrayidx112 = getelementptr inbounds float* %tile, i32 %add110
   %29 = load float* %arrayidx112, align 4, !tbaa !1
   %arrayidx113 = getelementptr inbounds float addrspace(3)* %output, 
i32 %add111
   store float %29, float addrspace(3)* %arrayidx113, align 4, !tbaa !1
   %add114 = add nsw i32 %add.i13, 22
   %add115 = add nsw i32 %add.i, 90816
   %arrayidx116 = getelementptr inbounds float* %tile, i32 %add114
   %30 = load float* %arrayidx116, align 4, !tbaa !1
   %arrayidx117 = getelementptr inbounds float addrspace(3)* %output, 
i32 %add115
   store float %30, float addrspace(3)* %arrayidx117, align 4, !tbaa !1
   %add118 = add nsw i32 %add.i13, 24
   %add119 = add nsw i32 %add.i, 99072
   %arrayidx120 = getelementptr inbounds float* %tile, i32 %add118
   %31 = load float* %arrayidx120, align 4, !tbaa !1
   %arrayidx121 = getelementptr inbounds float addrspace(3)* %output, 
i32 %add119
   store float %31, float addrspace(3)* %arrayidx121, align 4, !tbaa !1
   %add122 = add nsw i32 %add.i13, 26
   %add123 = add nsw i32 %add.i, 107328
   %arrayidx124 = getelementptr inbounds float* %tile, i32 %add122
   %32 = load float* %arrayidx124, align 4, !tbaa !1
   %arrayidx125 = getelementptr inbounds float addrspace(3)* %output, 
i32 %add123
   store float %32, float addrspace(3)* %arrayidx125, align 4, !tbaa !1
   %add126 = add nsw i32 %add.i13, 28
   %add127 = add nsw i32 %add.i, 115584
   %arrayidx128 = getelementptr inbounds float* %tile, i32 %add126
   %33 = load float* %arrayidx128, align 4, !tbaa !1
   %arrayidx129 = getelementptr inbounds float addrspace(3)* %output, 
i32 %add127
   store float %33, float addrspace(3)* %arrayidx129, align 4, !tbaa !1
   %add130 = add nsw i32 %add.i13, 30
   %add131 = add nsw i32 %add.i, 123840
   %arrayidx132 = getelementptr inbounds float* %tile, i32 %add130
   %34 = load float* %arrayidx132, align 4, !tbaa !1
   %arrayidx133 = getelementptr inbounds float addrspace(3)* %output, 
i32 %add131
   store float %34, float addrspace(3)* %arrayidx133, align 4, !tbaa !1
   ret void
}

; mad24(int, int, int) per the Itanium-mangled name.
; Returns (%a * %b) + %c as a plain 32-bit multiply-add; both operations carry
; the nsw flag and no 24-bit masking is applied.
define i32 @_Z5mad24iii(i32 %a, i32 %b, i32 %c) nounwind readnone {
body:
   %product = mul nsw i32 %b, %a
   %sum = add nsw i32 %product, %c
   ret i32 %sum
}

; mad24 on 2-element int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <2 x i32> @_Z5mad24Dv2_iS_S_(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone {
body:
   %product = mul <2 x i32> %a, %b
   %sum = add <2 x i32> %product, %c
   ret <2 x i32> %sum
}

; mad24 on 3-element int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <3 x i32> @_Z5mad24Dv3_iS_S_(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) nounwind readnone {
body:
   %product = mul <3 x i32> %a, %b
   %sum = add <3 x i32> %product, %c
   ret <3 x i32> %sum
}

; mad24 on 4-element int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <4 x i32> @_Z5mad24Dv4_iS_S_(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone {
body:
   %product = mul <4 x i32> %a, %b
   %sum = add <4 x i32> %product, %c
   ret <4 x i32> %sum
}

; mad24 on 8-element int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <8 x i32> @_Z5mad24Dv8_iS_S_(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) nounwind readnone {
body:
   %product = mul <8 x i32> %a, %b
   %sum = add <8 x i32> %product, %c
   ret <8 x i32> %sum
}

; mad24 on 16-element int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <16 x i32> @_Z5mad24Dv16_iS_S_(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c) nounwind readnone {
body:
   %product = mul <16 x i32> %a, %b
   %sum = add <16 x i32> %product, %c
   ret <16 x i32> %sum
}

; mad24(unsigned, unsigned, unsigned) per the Itanium-mangled name.
; Returns (%a * %b) + %c as a plain wrapping 32-bit multiply-add; no 24-bit
; masking is applied.
define i32 @_Z5mad24jjj(i32 %a, i32 %b, i32 %c) nounwind readnone {
body:
   %product = mul i32 %b, %a
   %sum = add i32 %product, %c
   ret i32 %sum
}

; mad24 on 2-element unsigned int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <2 x i32> @_Z5mad24Dv2_jS_S_(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone {
body:
   %product = mul <2 x i32> %a, %b
   %sum = add <2 x i32> %product, %c
   ret <2 x i32> %sum
}

; mad24 on 3-element unsigned int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <3 x i32> @_Z5mad24Dv3_jS_S_(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) nounwind readnone {
body:
   %product = mul <3 x i32> %a, %b
   %sum = add <3 x i32> %product, %c
   ret <3 x i32> %sum
}

; mad24 on 4-element unsigned int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <4 x i32> @_Z5mad24Dv4_jS_S_(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone {
body:
   %product = mul <4 x i32> %a, %b
   %sum = add <4 x i32> %product, %c
   ret <4 x i32> %sum
}

; mad24 on 8-element unsigned int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <8 x i32> @_Z5mad24Dv8_jS_S_(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) nounwind readnone {
body:
   %product = mul <8 x i32> %a, %b
   %sum = add <8 x i32> %product, %c
   ret <8 x i32> %sum
}

; mad24 on 16-element unsigned int vectors (per the Itanium-mangled name).
; Returns the lane-wise value (%a * %b) + %c using ordinary wrapping i32 math.
define <16 x i32> @_Z5mad24Dv16_jS_S_(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c) nounwind readnone {
body:
   %product = mul <16 x i32> %a, %b
   %sum = add <16 x i32> %product, %c
   ret <16 x i32> %sum
}

; Declaration only: no body for @pocl.barrier exists in this module. It is
; called from @matrix_transpose and from the @barrier wrapper.
declare void @pocl.barrier()

; Thin wrapper around @pocl.barrier.
; %flags is accepted but never read; the call is forwarded unconditionally.
define void @barrier(i32 %flags) {
forward:
   call void @pocl.barrier()
   ret void
}

; Returns the value of @_group_id_x, @_group_id_y or @_group_id_z for
; %dimindx = 0, 1 or 2 respectively, and 0 for any other %dimindx.
;
; Fix: the "; preds = ..." comment on the return block had been wrapped by the
; mail client, leaving "%sw.bb1, %sw.bb, %entry" outside the comment where it
; is a syntax error. The comment (and the wrapped phi) are back on one line.
define i32 @get_group_id(i32 %dimindx) nounwind readonly {
entry:
   ; Any value other than 0/1/2 branches straight to %return.
   switch i32 %dimindx, label %return [
     i32 0, label %sw.bb
     i32 1, label %sw.bb1
     i32 2, label %sw.bb2
   ]

sw.bb:                                            ; preds = %entry
   %0 = load i32* @_group_id_x, align 4
   br label %return

sw.bb1:                                           ; preds = %entry
   %1 = load i32* @_group_id_y, align 4
   br label %return

sw.bb2:                                           ; preds = %entry
   %2 = load i32* @_group_id_z, align 4
   br label %return

return:                                           ; preds = %sw.bb2, %sw.bb1, %sw.bb, %entry
   ; The [ 0, %entry ] operand supplies the default result for the switch.
   %retval.0 = phi i32 [ %2, %sw.bb2 ], [ %1, %sw.bb1 ], [ %0, %sw.bb ], [ 0, %entry ]
   ret i32 %retval.0
}

; Returns the value of @_local_id_x, @_local_id_y or @_local_id_z for
; %dimindx = 0, 1 or 2 respectively, and 0 for any other %dimindx.
;
; Fix: the "; preds = ..." comment on the return block had been wrapped by the
; mail client, leaving "%sw.bb1, %sw.bb, %entry" outside the comment where it
; is a syntax error. The comment (and the wrapped phi) are back on one line.
define i32 @get_local_id(i32 %dimindx) nounwind readonly {
entry:
   ; Any value other than 0/1/2 branches straight to %return.
   switch i32 %dimindx, label %return [
     i32 0, label %sw.bb
     i32 1, label %sw.bb1
     i32 2, label %sw.bb2
   ]

sw.bb:                                            ; preds = %entry
   %0 = load i32* @_local_id_x, align 4
   br label %return

sw.bb1:                                           ; preds = %entry
   %1 = load i32* @_local_id_y, align 4
   br label %return

sw.bb2:                                           ; preds = %entry
   %2 = load i32* @_local_id_z, align 4
   br label %return

return:                                           ; preds = %sw.bb2, %sw.bb1, %sw.bb, %entry
   ; The [ 0, %entry ] operand supplies the default result for the switch.
   %retval.0 = phi i32 [ %2, %sw.bb2 ], [ %1, %sw.bb1 ], [ %0, %sw.bb ], [ 0, %entry ]
   ret i32 %retval.0
}

; Named metadata node listing the module's kernel entries (here only !0).
!opencl.kernels = !{!0}

; !0 references @matrix_transpose together with its function pointer type.
!0 = metadata !{void (float addrspace(3)*, float addrspace(3)*, 
float*)* @matrix_transpose}
; !1 is the node named by the "!tbaa !1" tags on the float loads/stores in
; @matrix_transpose; it chains "float" -> "omnipotent char" -> "Simple C/C++ TBAA".
!1 = metadata !{metadata !"float", metadata !2}
!2 = metadata !{metadata !"omnipotent char", metadata !3}
!3 = metadata !{metadata !"Simple C/C++ TBAA"}





------------------------------------------------------------------------------
Everyone hates slow websites. So do we.
Make your web apps faster with AppDynamics
Download AppDynamics Lite for free today:
http://ad.doubleclick.net/clk;258768047;13503038;j?
http://info.appdynamics.com/FreeJavaPerformanceDownload.html
_______________________________________________
pocl-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/pocl-devel

Reply via email to