Issue 164785
Summary [X86] Poor LZCNT/TZCNT expansion of very large integers
Labels backend:X86, missed-optimization, llvm:SelectionDAG
Assignees
Reporter RKSimon
    Split off from #164275
```ll
define i64 @lzcnt128(ptr %p0) {
  %a0 = load i128, ptr %p0, align 8
  %cnt = tail call i128 @llvm.ctlz.i128(i128 %a0, i1 true)
  %res = trunc i128 %cnt to i64
  ret i64 %res
}

define i64 @lzcnt1024(ptr %p0) {
  %a0 = load i1024, ptr %p0, align 8
  %cnt = tail call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 true)
  %res = trunc i1024 %cnt to i64
  ret i64 %res
}
```
The i128 case:
```asm
lzcnt128: # @lzcnt128
  movq 8(%rdi), %rcx
  lzcntq %rcx, %rdx
  lzcntq (%rdi), %rax
  addq $64, %rax
  testq %rcx, %rcx
  cmovneq %rdx, %rax
  retq
```
Could be:
```asm
lzcnt128: # @lzcnt128
  lzcntq (%rdi), %rax
  addq $64, %rax ; <--- leaq 64(%rax), %rax could be used to avoid EFLAGS contamination as well
  lzcntq 8(%rdi), %rdx
  cmovnbq %rdx, %rax
  retq
```
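For reference, a minimal C model of the select that the lzcnt/cmovnb pair above computes (illustration only; `lzcnt128_model` is a made-up name, and the value is assumed nonzero to match `i1 true`, so `__builtin_clzll` never sees a zero argument):
```c
#include <stdint.h>

/* Illustrative model, not the actual DAG expansion: if the high half is
   nonzero its count wins, otherwise the result is 64 + clz(lo).  The cmovnbq
   implements exactly this select off the CF flag that lzcnt sets when its
   source is zero. */
static uint64_t lzcnt128_model(uint64_t lo, uint64_t hi) {
  return hi ? (uint64_t)__builtin_clzll(hi)
            : 64 + (uint64_t)__builtin_clzll(lo);
}
```
e.g. `lzcnt128_model(1, 0)` is 127 and `lzcnt128_model(0, 1)` is 63.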
The i1024 case:
```asm
lzcnt1024: # @lzcnt1024
  pushq %rbp
  pushq %r15
  pushq %r14
  pushq %r13
  pushq %r12
  pushq %rbx
  movq 32(%rdi), %r14
  movq 48(%rdi), %rbp
  movq 64(%rdi), %r11
  movq 72(%rdi), %r10
  movq 80(%rdi), %rdx
  movq 88(%rdi), %rbx
  movq 96(%rdi), %rsi
  movq 104(%rdi), %r9
  movq 112(%rdi), %r8
  movq 120(%rdi), %r15
  lzcntq %r15, %rax
  lzcntq %r8, %rcx
  addq $64, %rcx
  testq %r15, %r15
  cmovneq %rax, %rcx
  lzcntq %r9, %r12
  lzcntq %rsi, %rax
  movq %rsi, -16(%rsp) # 8-byte Spill
  addq $64, %rax
  testq %r9, %r9
  movq %r9, -32(%rsp) # 8-byte Spill
  cmovneq %r12, %rax
  subq $-128, %rax
  movq %r8, %r12
  movq %r8, -24(%rsp) # 8-byte Spill
  orq %r15, %r12
  cmovneq %rcx, %rax
  lzcntq %rbx, %rcx
  movq %rdx, -8(%rsp) # 8-byte Spill
  lzcntq %rdx, %r13
  addq $64, %r13
  testq %rbx, %rbx
  cmovneq %rcx, %r13
  lzcntq %r10, %rcx
  lzcntq %r11, %r12
  addq $64, %r12
  testq %r10, %r10
  cmovneq %rcx, %r12
  subq $-128, %r12
  movq %rdx, %rcx
  orq %rbx, %rcx
  cmovneq %r13, %r12
  addq $256, %r12 # imm = 0x100
  movq %r9, %rcx
  orq %r15, %rcx
  orq %r8, %rsi
  orq %rcx, %rsi
  movq 56(%rdi), %r13
  cmovneq %rax, %r12
  lzcntq %r13, %rcx
  movq %rbp, %rsi
  movq %rbp, -40(%rsp) # 8-byte Spill
  lzcntq %rbp, %rax
  addq $64, %rax
  testq %r13, %r13
  cmovneq %rcx, %rax
  lzcntq %r14, %rbp
  addq $64, %rbp
  movq 40(%rdi), %r8
  lzcntq %r8, %rdx
  testq %r8, %r8
  cmovneq %rdx, %rbp
  subq $-128, %rbp
  movq %rsi, %rdx
  orq %r13, %rdx
  cmovneq %rax, %rbp
  movq 16(%rdi), %r9
  lzcntq %r9, %rcx
  addq $64, %rcx
  movq 24(%rdi), %rsi
  lzcntq %rsi, %rax
  testq %rsi, %rsi
  cmovneq %rax, %rcx
  movq 8(%rdi), %rdx
  lzcntq (%rdi), %rax
  addq $64, %rax
  lzcntq %rdx, %rdi
  testq %rdx, %rdx
  cmovneq %rdi, %rax
  subq $-128, %rax
  orq %rsi, %r9
  cmovneq %rcx, %rax
  orq %r13, %r8
  orq -40(%rsp), %r14 # 8-byte Folded Reload
  addq $256, %rax # imm = 0x100
  orq %r8, %r14
  cmovneq %rbp, %rax
  orq %r15, %rbx
  orq -32(%rsp), %r10 # 8-byte Folded Reload
  orq %rbx, %r10
  movq -8(%rsp), %rcx # 8-byte Reload
  orq -24(%rsp), %rcx # 8-byte Folded Reload
  orq -16(%rsp), %r11 # 8-byte Folded Reload
  orq %rcx, %r11
  addq $512, %rax # imm = 0x200
  orq %r10, %r11
  cmovneq %r12, %rax
  popq %rbx
  popq %r12
  popq %r13
  popq %r14
  popq %r15
  popq %rbp
  retq
```
This suffers because ExpandIntRes_CTLZ/CTTZ recursively halves the type (1024 -> 2 x 512 -> 4 x 256 -> 8 x 128 -> 16 x 64). If we made more use of the EFLAGS chain we could end up with something more sequential, like:
```asm
lzcnt1024: # @lzcnt1024
  lzcntq (%rdi), %rax
  addq $960, %rax
  
  lzcntq 8(%rdi), %rdx
  leaq 896(%rdx), %rdx
  cmovnbq %rdx, %rax

  lzcntq 16(%rdi), %rdx
  leaq 832(%rdx), %rdx
  cmovnbq %rdx, %rax
  ...
  lzcntq 112(%rdi), %rdx
  leaq 64(%rdx), %rdx
  cmovnbq %rdx, %rax
  lzcntq 120(%rdi), %rdx
  cmovnbq %rdx, %rax
  retq
```
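A portable sketch of that limb-scan, for illustration only (the helper name `ctlz_wide_model` is made up here; the limbs are the 64-bit qwords in little-endian order, and the input is assumed nonzero to match `i1 true`):
```c
#include <stdint.h>

/* Illustrative model of the sequential chain sketched above: walk the 64-bit
   limbs from least to most significant; every nonzero limb overwrites the
   running result, so the most significant nonzero limb provides the final
   count.  Each iteration corresponds to one lzcnt/leaq/cmovnbq group, with
   64*(nlimbs-1-i) playing the role of the leaq displacement. */
static uint64_t ctlz_wide_model(const uint64_t *limbs, unsigned nlimbs) {
  uint64_t result = 0; /* never survives: the input is assumed nonzero */
  for (unsigned i = 0; i != nlimbs; ++i)
    if (limbs[i]) /* the cmovnbq: only a nonzero limb updates the result */
      result = (uint64_t)__builtin_clzll(limbs[i]) + 64u * (nlimbs - 1 - i);
  return result;
}
```
For `lzcnt1024` this would be called with `nlimbs = 16`, so the lowest qword gets a displacement of 64 * 15 = 960, matching the `addq $960, %rax` above.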