[LLVMbugs] [Bug 17002] New: LLVM produces poor code for 8-vectors split into two 4-vectors when using select.

bugzilla-daemon Mon, 26 Aug 2013 07:39:44 -0700

http://llvm.org/bugs/show_bug.cgi?id=17002


            Bug ID: 17002
           Summary: LLVM produces poor code for 8-vectors split into two
                    4-vectors when using select.
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: normal
          Priority: P
         Component: Common Code Generator Code
          Assignee: [email protected]
          Reporter: [email protected]
                CC: [email protected]
    Classification: Unclassified

I'm on a non-AVX box, and am trying to write a min() function that produces the
min of two 8-vectors.
The select gets turned into poor code (see the code below).
Ideally, the generated code would be two loads, two MINPS instructions, and
then two stores.
Instead, we get a lot of shuffling and shifting.
The problem seems to be caused by the splitting of the select node in the
selection DAG.
This happens with LLVM trunk on Windows x64.


Optimised IR:

; ModuleID = 'WinterModule'
target datalayout =
"e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"

; Reproducer: lane-wise minimum of two <8 x float> values, expressed as an
; fcmp olt followed by a select on the resulting <8 x i1> mask.
; The result is written through the sret pointer %ret; %a and %b are the two
; inputs. %hidden is not referenced anywhere in the body.
; Function Attrs: nounwind
define void @"main(Float8Struct, Float8Struct)"({ <8 x float> }* noalias
nocapture sret %ret, { <8 x float> }* noalias nocapture readonly %a, { <8 x
float> }* noalias nocapture readonly %b, i32* nocapture readnone %hidden) #0 {
entry:
  ; Load the single <8 x float> field of each input struct (32-byte aligned).
  %a.idx = getelementptr { <8 x float> }* %a, i64 0, i32 0
  %a.idx.val = load <8 x float>* %a.idx, align 32
  %b.idx = getelementptr { <8 x float> }* %b, i64 0, i32 0
  %b.idx.val = load <8 x float>* %b.idx, align 32
  ; Per-lane mask: true where a < b (ordered compare).
  %0 = fcmp olt <8 x float> %a.idx.val, %b.idx.val
  ; Per-lane pick: a where the mask is set, otherwise b. This vector select is
  ; the node the report identifies as being lowered poorly.
  %1 = select <8 x i1> %0, <8 x float> %a.idx.val, <8 x float> %b.idx.val
  ; Wrap the result vector back into the one-field struct and store it to %ret.
  %.fca.0.insert.i = insertvalue { <8 x float> } undef, <8 x float> %1, 0
  store { <8 x float> } %.fca.0.insert.i, { <8 x float> }* %ret, align 32
  ret void
}

attributes #0 = { nounwind }




Resulting code:

Function Live Ins: %RCX in %vreg0, %RDX in %vreg1, %R8 in %vreg2

BB#0: derived from LLVM BB %entry
    Live Ins: %RCX %RDX %R8
        %XMM2<def> = MOVAPSrm %R8, 1, %noreg, 0, %noreg;
mem:LD16[%b.idx2](align=32)
        %XMM3<def> = MOVAPSrm %R8<kill>, 1, %noreg, 16, %noreg;
mem:LD16[%b.idx2(align=32)+16](align=16)
        %XMM4<def> = MOVAPSrm %RDX, 1, %noreg, 0, %noreg;
mem:LD16[%a.idx1](align=32)
        %XMM5<def> = MOVAPSrm %RDX<kill>, 1, %noreg, 16, %noreg;
mem:LD16[%a.idx1(align=32)+16](align=16)
        %XMM0<def> = MOVAPSrr %XMM5
        %XMM0<def,tied1> = CMPPSrri %XMM0<kill,tied0>, %XMM3, 1
        %RAX<def> = MOV64ri <cp#0>
        %XMM0<def,tied1> = PSHUFBrm %XMM0<kill,tied0>, %RAX<kill>, 1, %noreg,
0, %noreg; mem:LD16[ConstantPool]
        %XMM1<def> = MOVAPSrr %XMM4
        %XMM1<def,tied1> = CMPPSrri %XMM1<kill,tied0>, %XMM2, 1
        %RAX<def> = MOV64ri <cp#1>
        %XMM1<def,tied1> = PSHUFBrm %XMM1<kill,tied0>, %RAX<kill>, 1, %noreg,
0, %noreg; mem:LD16[ConstantPool]
        %XMM1<def,tied1> = PORrr %XMM1<kill,tied0>, %XMM0<kill>
        %XMM1<def,tied1> = PSLLWri %XMM1<kill,tied0>, 15
        %XMM1<def,tied1> = PSRAWri %XMM1<kill,tied0>, 15
        %XMM0<def> = MOVDQArr %XMM1
        %XMM0<def,tied1> = PUNPCKHBWrr %XMM0<kill,tied0>, %XMM0<undef>
        %XMM0<def,tied1> = PSLLDri %XMM0<kill,tied0>, 31
        %XMM3<def,tied1> = BLENDVPSrr0 %XMM3<kill,tied0>, %XMM5<kill>,
%XMM0<imp-use>
        MOVAPSmr %RCX, 1, %noreg, 16, %noreg, %XMM3<kill>;
mem:ST16[%ret(align=32)+16](align=16)
        %XMM1<def,tied1> = PUNPCKLBWrr %XMM1<kill,tied0>, %XMM0<undef>
        %XMM1<def,tied1> = PSLLDri %XMM1<kill,tied0>, 31
        %XMM0<def> = MOVDQArr %XMM1<kill>
        %XMM2<def,tied1> = BLENDVPSrr0 %XMM2<kill,tied0>, %XMM4<kill>,
%XMM0<imp-use>
        MOVAPSmr %RCX, 1, %noreg, 0, %noreg, %XMM2<kill>;
mem:ST16[%ret](align=32)
        %RAX<def> = MOV64rr %RCX<kill>
        RET %RAX

# End machine code for function main(Float8Struct, Float8Struct).

-- 
You are receiving this mail because:
You are on the CC list for the bug.

_______________________________________________
LLVMbugs mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/llvmbugs

[LLVMbugs] [Bug 17002] New: LLVM produces poor code for 8-vectors split into two 4-vectors when using select.

Reply via email to