| Issue | 164785 |
| --- | --- |
| Summary | [X86] Poor LZCNT/TZCNT expansion of very large integers |
| Labels | backend:X86, missed-optimization, llvm:SelectionDAG |
| Assignees | |
| Reporter | RKSimon |

Split off from #164275:
```ll
define i64 @lzcnt128(ptr %p0) {
%a0 = load i128, ptr %p0, align 8
%cnt = tail call i128 @llvm.ctlz.i128(i128 %a0, i1 true)
%res = trunc i128 %cnt to i64
ret i64 %res
}
define i64 @lzcnt1024(ptr %p0) {
%a0 = load i1024, ptr %p0, align 8
%cnt = tail call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 true)
%res = trunc i1024 %cnt to i64
ret i64 %res
}
```
The i128 case:
```asm
lzcnt128: # @lzcnt128
movq 8(%rdi), %rcx
lzcntq %rcx, %rdx
lzcntq (%rdi), %rax
addq $64, %rax
testq %rcx, %rcx
cmovneq %rdx, %rax
retq
```
Could be:
```asm
lzcnt128: # @lzcnt128
lzcntq (%rdi), %rax
addq $64, %rax # <-- leaq 64(%rax), %rax could be used here as well to avoid clobbering EFLAGS
lzcntq 8(%rdi), %rdx
cmovnbq %rdx, %rax
retq
```
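For reference, a minimal C sketch of the logic the suggested sequence implements (the `lzcnt64` helper and `lzcnt128_sketch` names are illustrative, not from the codebase); CMOVNB works here because LZCNT sets CF exactly when its source operand is zero:
```c
#include <stdint.h>

/* Models LZCNT semantics: __builtin_clzll is undefined for 0, so return 64 explicitly. */
static uint64_t lzcnt64(uint64_t x) {
    return x ? (uint64_t)__builtin_clzll(x) : 64;
}

/* i128 ctlz over two little-endian qwords: assume the high qword is zero,
 * then let it override the result when it is nonzero (the CMOVNB above). */
uint64_t lzcnt128_sketch(const uint64_t *p) {
    uint64_t cnt = lzcnt64(p[0]) + 64; /* low qword + 64, valid when p[1] == 0 */
    if (p[1] != 0)                     /* CF == 0 case: high qword is nonzero  */
        cnt = lzcnt64(p[1]);
    return cnt;
}
```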
The i1024 case:
```asm
lzcnt1024: # @lzcnt1024
pushq %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
movq 32(%rdi), %r14
movq 48(%rdi), %rbp
movq 64(%rdi), %r11
movq 72(%rdi), %r10
movq 80(%rdi), %rdx
movq 88(%rdi), %rbx
movq 96(%rdi), %rsi
movq 104(%rdi), %r9
movq 112(%rdi), %r8
movq 120(%rdi), %r15
lzcntq %r15, %rax
lzcntq %r8, %rcx
addq $64, %rcx
testq %r15, %r15
cmovneq %rax, %rcx
lzcntq %r9, %r12
lzcntq %rsi, %rax
movq %rsi, -16(%rsp) # 8-byte Spill
addq $64, %rax
testq %r9, %r9
movq %r9, -32(%rsp) # 8-byte Spill
cmovneq %r12, %rax
subq $-128, %rax
movq %r8, %r12
movq %r8, -24(%rsp) # 8-byte Spill
orq %r15, %r12
cmovneq %rcx, %rax
lzcntq %rbx, %rcx
movq %rdx, -8(%rsp) # 8-byte Spill
lzcntq %rdx, %r13
addq $64, %r13
testq %rbx, %rbx
cmovneq %rcx, %r13
lzcntq %r10, %rcx
lzcntq %r11, %r12
addq $64, %r12
testq %r10, %r10
cmovneq %rcx, %r12
subq $-128, %r12
movq %rdx, %rcx
orq %rbx, %rcx
cmovneq %r13, %r12
addq $256, %r12 # imm = 0x100
movq %r9, %rcx
orq %r15, %rcx
orq %r8, %rsi
orq %rcx, %rsi
movq 56(%rdi), %r13
cmovneq %rax, %r12
lzcntq %r13, %rcx
movq %rbp, %rsi
movq %rbp, -40(%rsp) # 8-byte Spill
lzcntq %rbp, %rax
addq $64, %rax
testq %r13, %r13
cmovneq %rcx, %rax
lzcntq %r14, %rbp
addq $64, %rbp
movq 40(%rdi), %r8
lzcntq %r8, %rdx
testq %r8, %r8
cmovneq %rdx, %rbp
subq $-128, %rbp
movq %rsi, %rdx
orq %r13, %rdx
cmovneq %rax, %rbp
movq 16(%rdi), %r9
lzcntq %r9, %rcx
addq $64, %rcx
movq 24(%rdi), %rsi
lzcntq %rsi, %rax
testq %rsi, %rsi
cmovneq %rax, %rcx
movq 8(%rdi), %rdx
lzcntq (%rdi), %rax
addq $64, %rax
lzcntq %rdx, %rdi
testq %rdx, %rdx
cmovneq %rdi, %rax
subq $-128, %rax
orq %rsi, %r9
cmovneq %rcx, %rax
orq %r13, %r8
orq -40(%rsp), %r14 # 8-byte Folded Reload
addq $256, %rax # imm = 0x100
orq %r8, %r14
cmovneq %rbp, %rax
orq %r15, %rbx
orq -32(%rsp), %r10 # 8-byte Folded Reload
orq %rbx, %r10
movq -8(%rsp), %rcx # 8-byte Reload
orq -24(%rsp), %rcx # 8-byte Folded Reload
orq -16(%rsp), %r11 # 8-byte Folded Reload
orq %rcx, %r11
addq $512, %rax # imm = 0x200
orq %r10, %r11
cmovneq %r12, %rax
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
```
This suffers because ExpandIntRes_CTLZ (and likewise ExpandIntRes_CTTZ) repeatedly halves the type, splitting 1024 -> 2 x 512 -> 4 x 256 -> 8 x 128 -> 16 x 64 and merging each pair as `hi != 0 ? ctlz(hi) : N + ctlz(lo)`, which builds a tree of TEST/CMOV/OR nodes. If we made more use of the EFLAGS (CF) chain that LZCNT provides, we could end up with something more sequential like:
```asm
lzcnt1024: # @lzcnt1024
lzcntq (%rdi), %rax
addq $960, %rax
lzcntq 8(%rdi), %rdx
leaq 896(%rdx), %rdx
cmovnbq %rdx, %rax
lzcntq 16(%rdi), %rdx
leaq 832(%rdx), %rdx
cmovnbq %rdx, %rax
...
lzcntq 112(%rdi), %rdx
leaq 64(%rdx), %rdx
cmovnbq %rdx, %rax
lzcntq 120(%rdi), %rdx
cmovnbq %rdx, %rax
retq
```
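Generalised to i1024 (16 little-endian qwords), that sequential chain is equivalent to the following hedged C sketch, reusing the same illustrative `lzcnt64` helper: each more significant nonzero word simply overrides the running count, which is what each LZCNT/LEAQ/CMOVNB triple does via CF.
```c
#include <stdint.h>

static uint64_t lzcnt64(uint64_t x) {
    return x ? (uint64_t)__builtin_clzll(x) : 64; /* LZCNT: 64 for a zero input */
}

/* Sequential i1024 ctlz: start from the "all higher words are zero" answer
 * for word 0, then let each higher (more significant) nonzero word override
 * it - one LZCNT/LEAQ/CMOVNB per word, no TEST/OR tree and no spills. */
uint64_t lzcnt1024_sketch(const uint64_t *p) {
    uint64_t cnt = lzcnt64(p[0]) + 960;            /* 960 = 15 * 64 */
    for (unsigned i = 1; i < 16; ++i) {
        uint64_t c = lzcnt64(p[i]) + 64 * (15 - i);
        if (p[i] != 0)                             /* CMOVNB: CF == 0 */
            cnt = c;
    }
    return cnt;
}
```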