https://bugs.llvm.org/show_bug.cgi?id=36696

            Bug ID: 36696
           Summary: X86 reassociate generates add chain instead of tree
                    for ILP
           Product: tools
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: llc
          Assignee: unassignedb...@nondot.org
          Reporter: a...@azul.com
                CC: llvm-bugs@lists.llvm.org

Given an IR like the one below:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
target triple = "x86_64-unknown-linux-gnu"

define i32 @test(i8 addrspace(1)* %tmp, i32 %init) {
entry:
  br label %loop

loop:
  %sum = phi i32 [0, %entry], [%csum, %loop]
  %iv = phi i32 [%init, %entry], [ %ivnext, %loop]
  %c1 = getelementptr inbounds i8, i8 addrspace(1)* %tmp, i64 8
  %addr = bitcast i8 addrspace(1)* %c1 to i32 addrspace(1)*
  %c2 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c3 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c4 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c5 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c6 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c7 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c8 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c9 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c14 = add i32 %c3, %c2 
  %c13 = add i32 %c14, %c4 
  %c15 = add i32 %c13, %c5 
  %c11 = add i32 %c15, %c6 
  %c10 = add i32 %c11, %c7 
  %c12 = add i32 %c10, %c8 
  %c16 = add i32 %c12, %c9 
  %csum = add i32 %sum, %c16
  %ivnext = add nsw nuw i32 %iv, 1
  %cond = icmp ult i32 %ivnext, 100 
  br i1 %cond, label %loop, label %exit

exit:
  ret i32 %sum
}

!0 = !{!"-mcpu=haswell"}
!1 = !{!"-mattr=+sse2,+cx16,+sahf,+avx"}

The llc machine combiner, which is based on registers and critical path, is not
able to convert this add chain into add-tree form.

@Sanjay, could we extend https://reviews.llvm.org/D10321 to handle this case? I
tried reducing the adds to just 3 to see whether some sort of register
pressure is preventing ILP identification. I also tried the same IR as
straight-line code, but no luck. AVX support is enabled and this IR is run on a
Haswell machine.

The assembly generated with llc -O3 is:
  movl  %ecx, %eax
  movl  8(%rdi), %ecx
  #MEMBARRIER
  movl  8(%rdi), %edx
  #MEMBARRIER
  movl  8(%rdi), %r8d
  #MEMBARRIER
  movl  8(%rdi), %r9d
  #MEMBARRIER
  movl  8(%rdi), %r10d
  #MEMBARRIER
  movl  8(%rdi), %r11d
  #MEMBARRIER
  movl  8(%rdi), %ebx
  #MEMBARRIER
  movl  8(%rdi), %ebp
  #MEMBARRIER
  addl  %eax, %ecx <-- we do add-chaining here.
  addl  %edx, %ecx
  addl  %r8d, %ecx
  addl  %r9d, %ecx
  addl  %r10d, %ecx
  addl  %r11d, %ecx
  addl  %ebx, %ecx
  addl  %ebp, %ecx
  incl  %esi


We do add chaining instead of something like this:

  movl  %ecx, %eax
  movl  8(%rdi), %r8d
  #MEMBARRIER
  movl  8(%rdi), %r10d
  #MEMBARRIER
  movl  8(%rdi), %r9d
  #MEMBARRIER
  movl  8(%rdi), %edx
  #MEMBARRIER
  movl  8(%rdi), %r11d
  #MEMBARRIER
  movl  8(%rdi), %ebx
  #MEMBARRIER
  movl  8(%rdi), %ebp
  #MEMBARRIER
  movl  8(%rdi), %ecx
  #MEMBARRIER
  addl  %r8d, %r10d  <-- add-tree form instead of chaining.
  addl  %r9d, %edx
  addl  %r10d, %edx
  addl  %r11d, %ebx
  addl  %ebp, %ecx
  addl  %ebx, %ecx
  addl  %edx, %ecx
  addl  %eax, %ecx
  incl  %esi


I generated the above assembly from the following add-tree IR:
define i32 @test(i8 addrspace(1)* %tmp, i32 %init) {
entry:
  br label %loop

loop:
  %sum = phi i32 [0, %entry], [%csum, %loop]
  %iv = phi i32 [%init, %entry], [ %ivnext, %loop]
  %c1 = getelementptr inbounds i8, i8 addrspace(1)* %tmp, i64 8
  %addr = bitcast i8 addrspace(1)* %c1 to i32 addrspace(1)*
  %c2 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c3 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c4 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c5 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c6 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c7 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c8 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c9 = load i32, i32 addrspace(1)* %addr, align 8
  fence acquire
  %c10 = add i32 %c2, %c3 
  %c11 = add i32 %c4, %c5 
  %c12 = add i32 %c10, %c11
  %c13 = add i32 %c6, %c7 
  %c14 = add i32 %c8, %c9 
  %c15 = add i32 %c13, %c14
  %c16 = add i32 %c12, %c15
  %csum = add i32 %sum, %c16
  %ivnext = add nsw nuw i32 %iv, 1
  %cond = icmp ult i32 %ivnext, 100 
  br i1 %cond, label %loop, label %exit

exit:
  ret i32 %sum
}

-- 
You are receiving this mail because:
You are on the CC list for the bug.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to