Continued from question #209709
(https://answers.launchpad.net/pocl/+question/209709)
The disassembly of kernel_linked.bc follows:
; ModuleID = 'kernel_linked.bc'
; Target description. In the layout string the leading "e" selects
; little-endian byte order, "p:32:32:32" gives 32-bit pointers with 32-bit
; alignment, and "n32" names 32 bits as the native integer width.
target datalayout =
"e-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32"
target triple = "mipsel"
; Per-dimension group-id words. They are declared `external`, so this module
; only reads them (in @matrix_transpose and @get_group_id); their storage and
; values come from outside this module.
@_group_id_x = external global i32
@_group_id_y = external global i32
@_group_id_z = external global i32
; Per-dimension local-id words, likewise external. Read by @matrix_transpose
; (x only) and @get_local_id.
@_local_id_x = external global i32
@_local_id_y = external global i32
@_local_id_z = external global i32
; matrix_transpose(output, input, tile)
;
; Kernel entry point (it is the function listed under !opencl.kernels).
; Straight-line body in two phases separated by a call to @pocl.barrier:
;   phase 1: sixteen float copies from %input into %tile
;   phase 2: sixteen float copies from %tile into %output, reading %tile with
;            the two index components swapped relative to phase 1
; %output and %input are addrspace(3) pointers; %tile is an addrspace(0)
; pointer. Only @_group_id_x, @_group_id_y and @_local_id_x are read.
; NOTE(review): the 16 repeated groups in each phase look like a fully
; unrolled loop, and 33 looks like a padded row stride for a 32-wide tile --
; confirm against the OpenCL C source of the kernel.
define void @matrix_transpose(float addrspace(3)* nocapture %output,
float addrspace(3)* nocapture %input, float* %tile) nounwind {
entry:
; %0 = group id x, %1 = group id y, %2 = local id x.
%0 = load i32* @_group_id_x, align 4
%1 = load i32* @_group_id_y, align 4
%2 = load i32* @_local_id_x, align 4
; Split the local id: %and = low 5 bits (0..31), %shr = the remaining high bits.
%and = and i32 %2, 31
%shr = lshr i32 %2, 5
; Tile index used for the phase-1 stores: %shr * 33 + %and.
%mul.i14 = mul nsw i32 %shr, 33
%add.i15 = add nsw i32 %mul.i14, %and
; Tile index used for the phase-2 loads (components swapped): %and * 33 + %shr.
%mul.i12 = mul nsw i32 %and, 33
%add.i13 = add nsw i32 %mul.i12, %shr
; %mul.i10 = group_id_x * 32. Its low 5 bits are zero and %and <= 31, so the
; `or` below produces the same value as an add.
%mul.i10 = shl nsw i32 %0, 5
%add.i11 = or i32 %mul.i10, %and
; %add.i9 = group_id_y * 32 + %shr.
%mul.i8 = shl nsw i32 %1, 5
%add.i9 = add nsw i32 %mul.i8, %shr
; Base input index: %add.i9 * 256 + (group_id_x * 32 + %and).
%mul.i6 = shl nsw i32 %add.i9, 8
%add.i7 = add nsw i32 %mul.i6, %add.i11
; Base output index: (group_id_x * 32 + %shr) * 4128 + (group_id_y * 32 + %and).
; The `or` is again equivalent to an add because %mul.i8 has zero low 5 bits.
%add.i5 = or i32 %mul.i8, %and
%add.i3 = add nsw i32 %mul.i10, %shr
%mul.i = mul nsw i32 %add.i3, 4128
%add.i = add nsw i32 %mul.i, %add.i5
; ---- Phase 1: input -> tile ------------------------------------------------
; Copy k (k = 0..15) loads input[%add.i7 + 512*k] and stores it to
; tile[%add.i15 + 66*k].
; k = 0
%arrayidx = getelementptr inbounds float addrspace(3)* %input, i32
%add.i7
%3 = load float addrspace(3)* %arrayidx, align 4, !tbaa !1
%arrayidx12 = getelementptr inbounds float* %tile, i32 %add.i15
store float %3, float* %arrayidx12, align 4, !tbaa !1
; k = 1
%add = add nsw i32 %add.i15, 66
%add13 = add nsw i32 %add.i7, 512
%arrayidx14 = getelementptr inbounds float addrspace(3)* %input, i32
%add13
%4 = load float addrspace(3)* %arrayidx14, align 4, !tbaa !1
%arrayidx15 = getelementptr inbounds float* %tile, i32 %add
store float %4, float* %arrayidx15, align 4, !tbaa !1
; k = 2
%add16 = add nsw i32 %add.i15, 132
%add17 = add nsw i32 %add.i7, 1024
%arrayidx18 = getelementptr inbounds float addrspace(3)* %input, i32
%add17
%5 = load float addrspace(3)* %arrayidx18, align 4, !tbaa !1
%arrayidx19 = getelementptr inbounds float* %tile, i32 %add16
store float %5, float* %arrayidx19, align 4, !tbaa !1
; k = 3
%add20 = add nsw i32 %add.i15, 198
%add21 = add nsw i32 %add.i7, 1536
%arrayidx22 = getelementptr inbounds float addrspace(3)* %input, i32
%add21
%6 = load float addrspace(3)* %arrayidx22, align 4, !tbaa !1
%arrayidx23 = getelementptr inbounds float* %tile, i32 %add20
store float %6, float* %arrayidx23, align 4, !tbaa !1
; k = 4
%add24 = add nsw i32 %add.i15, 264
%add25 = add nsw i32 %add.i7, 2048
%arrayidx26 = getelementptr inbounds float addrspace(3)* %input, i32
%add25
%7 = load float addrspace(3)* %arrayidx26, align 4, !tbaa !1
%arrayidx27 = getelementptr inbounds float* %tile, i32 %add24
store float %7, float* %arrayidx27, align 4, !tbaa !1
; k = 5
%add28 = add nsw i32 %add.i15, 330
%add29 = add nsw i32 %add.i7, 2560
%arrayidx30 = getelementptr inbounds float addrspace(3)* %input, i32
%add29
%8 = load float addrspace(3)* %arrayidx30, align 4, !tbaa !1
%arrayidx31 = getelementptr inbounds float* %tile, i32 %add28
store float %8, float* %arrayidx31, align 4, !tbaa !1
; k = 6
%add32 = add nsw i32 %add.i15, 396
%add33 = add nsw i32 %add.i7, 3072
%arrayidx34 = getelementptr inbounds float addrspace(3)* %input, i32
%add33
%9 = load float addrspace(3)* %arrayidx34, align 4, !tbaa !1
%arrayidx35 = getelementptr inbounds float* %tile, i32 %add32
store float %9, float* %arrayidx35, align 4, !tbaa !1
; k = 7
%add36 = add nsw i32 %add.i15, 462
%add37 = add nsw i32 %add.i7, 3584
%arrayidx38 = getelementptr inbounds float addrspace(3)* %input, i32
%add37
%10 = load float addrspace(3)* %arrayidx38, align 4, !tbaa !1
%arrayidx39 = getelementptr inbounds float* %tile, i32 %add36
store float %10, float* %arrayidx39, align 4, !tbaa !1
; k = 8
%add40 = add nsw i32 %add.i15, 528
%add41 = add nsw i32 %add.i7, 4096
%arrayidx42 = getelementptr inbounds float addrspace(3)* %input, i32
%add41
%11 = load float addrspace(3)* %arrayidx42, align 4, !tbaa !1
%arrayidx43 = getelementptr inbounds float* %tile, i32 %add40
store float %11, float* %arrayidx43, align 4, !tbaa !1
; k = 9
%add44 = add nsw i32 %add.i15, 594
%add45 = add nsw i32 %add.i7, 4608
%arrayidx46 = getelementptr inbounds float addrspace(3)* %input, i32
%add45
%12 = load float addrspace(3)* %arrayidx46, align 4, !tbaa !1
%arrayidx47 = getelementptr inbounds float* %tile, i32 %add44
store float %12, float* %arrayidx47, align 4, !tbaa !1
; k = 10
%add48 = add nsw i32 %add.i15, 660
%add49 = add nsw i32 %add.i7, 5120
%arrayidx50 = getelementptr inbounds float addrspace(3)* %input, i32
%add49
%13 = load float addrspace(3)* %arrayidx50, align 4, !tbaa !1
%arrayidx51 = getelementptr inbounds float* %tile, i32 %add48
store float %13, float* %arrayidx51, align 4, !tbaa !1
; k = 11
%add52 = add nsw i32 %add.i15, 726
%add53 = add nsw i32 %add.i7, 5632
%arrayidx54 = getelementptr inbounds float addrspace(3)* %input, i32
%add53
%14 = load float addrspace(3)* %arrayidx54, align 4, !tbaa !1
%arrayidx55 = getelementptr inbounds float* %tile, i32 %add52
store float %14, float* %arrayidx55, align 4, !tbaa !1
; k = 12
%add56 = add nsw i32 %add.i15, 792
%add57 = add nsw i32 %add.i7, 6144
%arrayidx58 = getelementptr inbounds float addrspace(3)* %input, i32
%add57
%15 = load float addrspace(3)* %arrayidx58, align 4, !tbaa !1
%arrayidx59 = getelementptr inbounds float* %tile, i32 %add56
store float %15, float* %arrayidx59, align 4, !tbaa !1
; k = 13
%add60 = add nsw i32 %add.i15, 858
%add61 = add nsw i32 %add.i7, 6656
%arrayidx62 = getelementptr inbounds float addrspace(3)* %input, i32
%add61
%16 = load float addrspace(3)* %arrayidx62, align 4, !tbaa !1
%arrayidx63 = getelementptr inbounds float* %tile, i32 %add60
store float %16, float* %arrayidx63, align 4, !tbaa !1
; k = 14
%add64 = add nsw i32 %add.i15, 924
%add65 = add nsw i32 %add.i7, 7168
%arrayidx66 = getelementptr inbounds float addrspace(3)* %input, i32
%add65
%17 = load float addrspace(3)* %arrayidx66, align 4, !tbaa !1
%arrayidx67 = getelementptr inbounds float* %tile, i32 %add64
store float %17, float* %arrayidx67, align 4, !tbaa !1
; k = 15
%add68 = add nsw i32 %add.i15, 990
%add69 = add nsw i32 %add.i7, 7680
%arrayidx70 = getelementptr inbounds float addrspace(3)* %input, i32
%add69
%18 = load float addrspace(3)* %arrayidx70, align 4, !tbaa !1
%arrayidx71 = getelementptr inbounds float* %tile, i32 %add68
store float %18, float* %arrayidx71, align 4, !tbaa !1
; Every store to %tile is above this call and every load from %tile is below it.
; NOTE(review): presumably this is the work-group barrier that makes other
; work-items' tile stores visible -- @pocl.barrier is only declared here.
call void @pocl.barrier() nounwind
; ---- Phase 2: tile -> output -----------------------------------------------
; Copy k (k = 0..15) loads tile[%add.i13 + 2*k] and stores it to
; output[%add.i + 8256*k]  (8256 = 2 * 4128).
; k = 0
%arrayidx72 = getelementptr inbounds float* %tile, i32 %add.i13
%19 = load float* %arrayidx72, align 4, !tbaa !1
%arrayidx73 = getelementptr inbounds float addrspace(3)* %output, i32
%add.i
store float %19, float addrspace(3)* %arrayidx73, align 4, !tbaa !1
; k = 1
%add74 = add nsw i32 %add.i13, 2
%add75 = add nsw i32 %add.i, 8256
%arrayidx76 = getelementptr inbounds float* %tile, i32 %add74
%20 = load float* %arrayidx76, align 4, !tbaa !1
%arrayidx77 = getelementptr inbounds float addrspace(3)* %output, i32
%add75
store float %20, float addrspace(3)* %arrayidx77, align 4, !tbaa !1
; k = 2
%add78 = add nsw i32 %add.i13, 4
%add79 = add nsw i32 %add.i, 16512
%arrayidx80 = getelementptr inbounds float* %tile, i32 %add78
%21 = load float* %arrayidx80, align 4, !tbaa !1
%arrayidx81 = getelementptr inbounds float addrspace(3)* %output, i32
%add79
store float %21, float addrspace(3)* %arrayidx81, align 4, !tbaa !1
; k = 3
%add82 = add nsw i32 %add.i13, 6
%add83 = add nsw i32 %add.i, 24768
%arrayidx84 = getelementptr inbounds float* %tile, i32 %add82
%22 = load float* %arrayidx84, align 4, !tbaa !1
%arrayidx85 = getelementptr inbounds float addrspace(3)* %output, i32
%add83
store float %22, float addrspace(3)* %arrayidx85, align 4, !tbaa !1
; k = 4
%add86 = add nsw i32 %add.i13, 8
%add87 = add nsw i32 %add.i, 33024
%arrayidx88 = getelementptr inbounds float* %tile, i32 %add86
%23 = load float* %arrayidx88, align 4, !tbaa !1
%arrayidx89 = getelementptr inbounds float addrspace(3)* %output, i32
%add87
store float %23, float addrspace(3)* %arrayidx89, align 4, !tbaa !1
; k = 5
%add90 = add nsw i32 %add.i13, 10
%add91 = add nsw i32 %add.i, 41280
%arrayidx92 = getelementptr inbounds float* %tile, i32 %add90
%24 = load float* %arrayidx92, align 4, !tbaa !1
%arrayidx93 = getelementptr inbounds float addrspace(3)* %output, i32
%add91
store float %24, float addrspace(3)* %arrayidx93, align 4, !tbaa !1
; k = 6
%add94 = add nsw i32 %add.i13, 12
%add95 = add nsw i32 %add.i, 49536
%arrayidx96 = getelementptr inbounds float* %tile, i32 %add94
%25 = load float* %arrayidx96, align 4, !tbaa !1
%arrayidx97 = getelementptr inbounds float addrspace(3)* %output, i32
%add95
store float %25, float addrspace(3)* %arrayidx97, align 4, !tbaa !1
; k = 7
%add98 = add nsw i32 %add.i13, 14
%add99 = add nsw i32 %add.i, 57792
%arrayidx100 = getelementptr inbounds float* %tile, i32 %add98
%26 = load float* %arrayidx100, align 4, !tbaa !1
%arrayidx101 = getelementptr inbounds float addrspace(3)* %output,
i32 %add99
store float %26, float addrspace(3)* %arrayidx101, align 4, !tbaa !1
; k = 8
%add102 = add nsw i32 %add.i13, 16
%add103 = add nsw i32 %add.i, 66048
%arrayidx104 = getelementptr inbounds float* %tile, i32 %add102
%27 = load float* %arrayidx104, align 4, !tbaa !1
%arrayidx105 = getelementptr inbounds float addrspace(3)* %output,
i32 %add103
store float %27, float addrspace(3)* %arrayidx105, align 4, !tbaa !1
; k = 9
%add106 = add nsw i32 %add.i13, 18
%add107 = add nsw i32 %add.i, 74304
%arrayidx108 = getelementptr inbounds float* %tile, i32 %add106
%28 = load float* %arrayidx108, align 4, !tbaa !1
%arrayidx109 = getelementptr inbounds float addrspace(3)* %output,
i32 %add107
store float %28, float addrspace(3)* %arrayidx109, align 4, !tbaa !1
; k = 10
%add110 = add nsw i32 %add.i13, 20
%add111 = add nsw i32 %add.i, 82560
%arrayidx112 = getelementptr inbounds float* %tile, i32 %add110
%29 = load float* %arrayidx112, align 4, !tbaa !1
%arrayidx113 = getelementptr inbounds float addrspace(3)* %output,
i32 %add111
store float %29, float addrspace(3)* %arrayidx113, align 4, !tbaa !1
; k = 11
%add114 = add nsw i32 %add.i13, 22
%add115 = add nsw i32 %add.i, 90816
%arrayidx116 = getelementptr inbounds float* %tile, i32 %add114
%30 = load float* %arrayidx116, align 4, !tbaa !1
%arrayidx117 = getelementptr inbounds float addrspace(3)* %output,
i32 %add115
store float %30, float addrspace(3)* %arrayidx117, align 4, !tbaa !1
; k = 12
%add118 = add nsw i32 %add.i13, 24
%add119 = add nsw i32 %add.i, 99072
%arrayidx120 = getelementptr inbounds float* %tile, i32 %add118
%31 = load float* %arrayidx120, align 4, !tbaa !1
%arrayidx121 = getelementptr inbounds float addrspace(3)* %output,
i32 %add119
store float %31, float addrspace(3)* %arrayidx121, align 4, !tbaa !1
; k = 13
%add122 = add nsw i32 %add.i13, 26
%add123 = add nsw i32 %add.i, 107328
%arrayidx124 = getelementptr inbounds float* %tile, i32 %add122
%32 = load float* %arrayidx124, align 4, !tbaa !1
%arrayidx125 = getelementptr inbounds float addrspace(3)* %output,
i32 %add123
store float %32, float addrspace(3)* %arrayidx125, align 4, !tbaa !1
; k = 14
%add126 = add nsw i32 %add.i13, 28
%add127 = add nsw i32 %add.i, 115584
%arrayidx128 = getelementptr inbounds float* %tile, i32 %add126
%33 = load float* %arrayidx128, align 4, !tbaa !1
%arrayidx129 = getelementptr inbounds float addrspace(3)* %output,
i32 %add127
store float %33, float addrspace(3)* %arrayidx129, align 4, !tbaa !1
; k = 15
%add130 = add nsw i32 %add.i13, 30
%add131 = add nsw i32 %add.i, 123840
%arrayidx132 = getelementptr inbounds float* %tile, i32 %add130
%34 = load float* %arrayidx132, align 4, !tbaa !1
%arrayidx133 = getelementptr inbounds float addrspace(3)* %output,
i32 %add131
store float %34, float addrspace(3)* %arrayidx133, align 4, !tbaa !1
ret void
}
; mad24(int, int, int) -- scalar signed overload (Itanium-mangled name).
; Returns a * b + c computed with plain 32-bit operations; no masking to
; 24 bits is performed. Both operations are tagged `nsw`, so the result is
; not defined when the signed multiply or add overflows.
define i32 @_Z5mad24iii(i32 %a, i32 %b, i32 %c) nounwind readnone {
entry:
  %product = mul nsw i32 %b, %a
  %sum = add nsw i32 %product, %c
  ret i32 %sum
}
; mad24(int2, int2, int2) -- 2-lane signed overload (Itanium-mangled name).
; Returns a * b + c lane by lane using full 32-bit arithmetic; no `nsw`
; flags, so each lane wraps modulo 2^32.
define <2 x i32> @_Z5mad24Dv2_iS_S_(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone {
entry:
  %product = mul <2 x i32> %a, %b
  %sum = add <2 x i32> %product, %c
  ret <2 x i32> %sum
}
; mad24(int3, int3, int3) -- 3-lane signed overload (Itanium-mangled name).
; Returns a * b + c lane by lane using full 32-bit arithmetic; no `nsw`
; flags, so each lane wraps modulo 2^32.
define <3 x i32> @_Z5mad24Dv3_iS_S_(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) nounwind readnone {
entry:
  %product = mul <3 x i32> %a, %b
  %sum = add <3 x i32> %product, %c
  ret <3 x i32> %sum
}
; mad24(int4, int4, int4) -- 4-lane signed overload (Itanium-mangled name).
; Returns a * b + c lane by lane using full 32-bit arithmetic; no `nsw`
; flags, so each lane wraps modulo 2^32.
define <4 x i32> @_Z5mad24Dv4_iS_S_(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone {
entry:
  %product = mul <4 x i32> %a, %b
  %sum = add <4 x i32> %product, %c
  ret <4 x i32> %sum
}
; mad24(int8, int8, int8) -- 8-lane signed overload (Itanium-mangled name).
; Returns a * b + c lane by lane using full 32-bit arithmetic; no `nsw`
; flags, so each lane wraps modulo 2^32.
define <8 x i32> @_Z5mad24Dv8_iS_S_(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) nounwind readnone {
entry:
  %product = mul <8 x i32> %a, %b
  %sum = add <8 x i32> %product, %c
  ret <8 x i32> %sum
}
; mad24(int16, int16, int16) -- 16-lane signed overload (Itanium-mangled name).
; Returns a * b + c lane by lane using full 32-bit arithmetic; no `nsw`
; flags, so each lane wraps modulo 2^32.
define <16 x i32> @_Z5mad24Dv16_iS_S_(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c) nounwind readnone {
entry:
  %product = mul <16 x i32> %a, %b
  %sum = add <16 x i32> %product, %c
  ret <16 x i32> %sum
}
; mad24(uint, uint, uint) -- scalar unsigned overload (Itanium-mangled name).
; Returns a * b + c using full 32-bit arithmetic with no wrap flags, so the
; result wraps modulo 2^32. No masking to 24 bits is performed.
define i32 @_Z5mad24jjj(i32 %a, i32 %b, i32 %c) nounwind readnone {
entry:
  %product = mul i32 %b, %a
  %sum = add i32 %product, %c
  ret i32 %sum
}
; mad24(uint2, uint2, uint2) -- 2-lane unsigned overload (Itanium-mangled name).
; Returns a * b + c lane by lane; each lane wraps modulo 2^32.
define <2 x i32> @_Z5mad24Dv2_jS_S_(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone {
entry:
  %product = mul <2 x i32> %a, %b
  %sum = add <2 x i32> %product, %c
  ret <2 x i32> %sum
}
; mad24(uint3, uint3, uint3) -- 3-lane unsigned overload (Itanium-mangled name).
; Returns a * b + c lane by lane; each lane wraps modulo 2^32.
define <3 x i32> @_Z5mad24Dv3_jS_S_(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) nounwind readnone {
entry:
  %product = mul <3 x i32> %a, %b
  %sum = add <3 x i32> %product, %c
  ret <3 x i32> %sum
}
; mad24(uint4, uint4, uint4) -- 4-lane unsigned overload (Itanium-mangled name).
; Returns a * b + c lane by lane; each lane wraps modulo 2^32.
define <4 x i32> @_Z5mad24Dv4_jS_S_(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone {
entry:
  %product = mul <4 x i32> %a, %b
  %sum = add <4 x i32> %product, %c
  ret <4 x i32> %sum
}
; mad24(uint8, uint8, uint8) -- 8-lane unsigned overload (Itanium-mangled name).
; Returns a * b + c lane by lane; each lane wraps modulo 2^32.
define <8 x i32> @_Z5mad24Dv8_jS_S_(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) nounwind readnone {
entry:
  %product = mul <8 x i32> %a, %b
  %sum = add <8 x i32> %product, %c
  ret <8 x i32> %sum
}
; mad24(uint16, uint16, uint16) -- 16-lane unsigned overload (Itanium-mangled name).
; Returns a * b + c lane by lane; each lane wraps modulo 2^32.
define <16 x i32> @_Z5mad24Dv16_jS_S_(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c) nounwind readnone {
entry:
  %product = mul <16 x i32> %a, %b
  %sum = add <16 x i32> %product, %c
  ret <16 x i32> %sum
}
; Declaration only: @pocl.barrier has no body in this module. It is called
; from @matrix_transpose and from the @barrier wrapper.
declare void @pocl.barrier()
; barrier(flags): thin wrapper that forwards to @pocl.barrier.
; The %flags argument is accepted but never read.
define void @barrier(i32 %flags) {
entry:
  call void @pocl.barrier()
  ret void
}
; get_group_id(dimindx): returns the group-id word for one dimension.
;   dimindx == 0 -> value loaded from @_group_id_x
;   dimindx == 1 -> value loaded from @_group_id_y
;   dimindx == 2 -> value loaded from @_group_id_z
;   any other    -> 0 (the switch default branches straight to %return)
; The function only reads memory (`readonly`).
define i32 @get_group_id(i32 %dimindx) nounwind readonly {
entry:
  switch i32 %dimindx, label %return [
    i32 0, label %sw.bb
    i32 1, label %sw.bb1
    i32 2, label %sw.bb2
  ]

sw.bb:                                            ; preds = %entry
  %0 = load i32* @_group_id_x, align 4
  br label %return

sw.bb1:                                           ; preds = %entry
  %1 = load i32* @_group_id_y, align 4
  br label %return

sw.bb2:                                           ; preds = %entry
  %2 = load i32* @_group_id_z, align 4
  br label %return

; The predecessor list must stay on one line: a `;` comment ends at the
; newline, so a wrapped continuation would be parsed as code.
return:                                           ; preds = %sw.bb2, %sw.bb1, %sw.bb, %entry
  ; Select the value produced by whichever predecessor ran; 0 for the default.
  %retval.0 = phi i32 [ %2, %sw.bb2 ], [ %1, %sw.bb1 ], [ %0, %sw.bb ], [ 0, %entry ]
  ret i32 %retval.0
}
; get_local_id(dimindx): returns the local-id word for one dimension.
;   dimindx == 0 -> value loaded from @_local_id_x
;   dimindx == 1 -> value loaded from @_local_id_y
;   dimindx == 2 -> value loaded from @_local_id_z
;   any other    -> 0 (the switch default branches straight to %return)
; The function only reads memory (`readonly`).
define i32 @get_local_id(i32 %dimindx) nounwind readonly {
entry:
  switch i32 %dimindx, label %return [
    i32 0, label %sw.bb
    i32 1, label %sw.bb1
    i32 2, label %sw.bb2
  ]

sw.bb:                                            ; preds = %entry
  %0 = load i32* @_local_id_x, align 4
  br label %return

sw.bb1:                                           ; preds = %entry
  %1 = load i32* @_local_id_y, align 4
  br label %return

sw.bb2:                                           ; preds = %entry
  %2 = load i32* @_local_id_z, align 4
  br label %return

; The predecessor list must stay on one line: a `;` comment ends at the
; newline, so a wrapped continuation would be parsed as code.
return:                                           ; preds = %sw.bb2, %sw.bb1, %sw.bb, %entry
  ; Select the value produced by whichever predecessor ran; 0 for the default.
  %retval.0 = phi i32 [ %2, %sw.bb2 ], [ %1, %sw.bb1 ], [ %0, %sw.bb ], [ 0, %entry ]
  ret i32 %retval.0
}
; Named metadata listing the module's kernel entry points; the single entry
; !0 refers to @matrix_transpose by its function-pointer type.
!opencl.kernels = !{!0}
!0 = metadata !{void (float addrspace(3)*, float addrspace(3)*,
float*)* @matrix_transpose}
; TBAA type nodes used by the `!tbaa !1` attachments on the float loads and
; stores: "float" (!1) -> "omnipotent char" (!2) -> "Simple C/C++ TBAA" (!3).
!1 = metadata !{metadata !"float", metadata !2}
!2 = metadata !{metadata !"omnipotent char", metadata !3}
!3 = metadata !{metadata !"Simple C/C++ TBAA"}
------------------------------------------------------------------------------
Everyone hates slow websites. So do we.
Make your web apps faster with AppDynamics
Download AppDynamics Lite for free today:
http://ad.doubleclick.net/clk;258768047;13503038;j?
http://info.appdynamics.com/FreeJavaPerformanceDownload.html
_______________________________________________
pocl-devel mailing list
pocl-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/pocl-devel