| Issue |
103946
|
| Summary |
[opt / isel] 66% more unnecessary instructions generated due to bad pattern or missed sbb/icmp/cmov fusion
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
mratsim
|
I'm not sure who to blame between `opt` or the code generator:
- either opt generates IR that uses a pattern that the codegenerator doesn't recognize and fuse correctly
- or the codegenerator should recognize the pattern
In any case the following code has a good version with 12 instructions in the critical path (add/adc, sub/sbb and cmov) and the bad version .v2 has 20 instructions in the critical path.
Both are from the same IR except that one has a constant inlined by `opt`.
This is similar to https://github.com/llvm/llvm-project/issues/102868 but even worse.
https://alive2.llvm.org/ce/z/KzsXa9
## Original IR
```llvm
define void @_modadd_mayo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%M = load i256, ptr %3, align 4
%ax = zext i256 %a to i320
%bx = zext i256 %b to i320
%a_plus_b = add i320 %ax, %bx
%mx = zext i256 %M to i320
%apb_minus_M = sub i320 %a_plus_b, %mx
%5 = lshr i320 %apb_minus_M, 319
%6 = trunc i320 %5 to i1
%7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
%8 = trunc i320 %7 to i256
store i256 %8, ptr %0, align 4
ret void
}
@secp256k1_fp_mod = constant i256 -4294968273, section "ctt.secp256k1_fp.constants", align 64
; Function Attrs: hot
define internal fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%M = load i256, ptr %3, align 4
%ax = zext i256 %a to i320
%bx = zext i256 %b to i320
%a_plus_b = add i320 %ax, %bx
%mx = zext i256 %M to i320
%apb_minus_M = sub i320 %a_plus_b, %mx
%5 = lshr i320 %apb_minus_M, 319
%6 = trunc i320 %5 to i1
%7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
%8 = trunc i320 %7 to i256
store i256 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
define void @secp256k1_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.secp256k1_fp" {
call fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr @secp256k1_fp_mod)
ret void
}
attributes #2 = { hot }
```
## opt -O3
```llvm
define void @_modadd_mayo.u64x4(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2, ptr nocapture readonly %3) local_unnamed_addr #0 section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%M = load i256, ptr %3, align 4
%ax = zext i256 %a to i320
%bx = zext i256 %b to i320
%a_plus_b = add nuw nsw i320 %bx, %ax
%mx = zext i256 %M to i320
%apb_minus_M = sub nsw i320 %a_plus_b, %mx
%.not1 = icmp slt i320 %apb_minus_M, 0
%5 = select i1 %.not1, i320 %a_plus_b, i320 %apb_minus_M
%6 = trunc i320 %5 to i256
store i256 %6, ptr %0, align 4
ret void
}
define void @secp256k1_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #1 section "ctt.secp256k1_fp" {
%.val = load i256, ptr %1, align 4
%.val1 = load i256, ptr %2, align 4
%ax.i = zext i256 %.val to i320
%bx.i = zext i256 %.val1 to i320
%a_plus_b.i = add nuw nsw i320 %bx.i, %ax.i
%apb_minus_M.i = add nuw nsw i320 %a_plus_b.i, 4294968273
%.not.i = icmp ugt i320 %a_plus_b.i, 115792089237316195423570985008687907853269984665640564039457584007908834671662
%4 = select i1 %.not.i, i320 %apb_minus_M.i, i320 %a_plus_b.i
%5 = trunc i320 %4 to i256
store i256 %5, ptr %0, align 4
ret void
}
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
```
## Codegen
```asm
_modadd_mayo.u64x4: # @_modadd_mayo.u64x4
push r14
push rbx
mov r9, qword ptr [rdx + 24]
mov r8, qword ptr [rdx + 16]
mov rax, qword ptr [rdx]
mov rdx, qword ptr [rdx + 8]
add rax, qword ptr [rsi]
adc rdx, qword ptr [rsi + 8]
adc r8, qword ptr [rsi + 16]
adc r9, qword ptr [rsi + 24]
setb sil
movzx esi, sil
mov r10, rax
sub r10, qword ptr [rcx]
mov r11, rdx
sbb r11, qword ptr [rcx + 8]
mov rbx, r8
sbb rbx, qword ptr [rcx + 16]
mov r14, r9
sbb r14, qword ptr [rcx + 24]
sbb rsi, 0
sar rsi, 63
cmovs r11, rdx
cmovs r14, r9
cmovs rbx, r8
cmovs r10, rax
mov qword ptr [rdi + 16], rbx
mov qword ptr [rdi + 24], r14
mov qword ptr [rdi], r10
mov qword ptr [rdi + 8], r11
pop rbx
pop r14
ret
secp256k1_fp_add: # @secp256k1_fp_add
push r15
push r14
push r12
push rbx
mov r8, qword ptr [rdx + 24]
mov rcx, qword ptr [rdx + 16]
mov rax, qword ptr [rdx]
mov rdx, qword ptr [rdx + 8]
add rax, qword ptr [rsi]
adc rdx, qword ptr [rsi + 8]
adc rcx, qword ptr [rsi + 16]
adc r8, qword ptr [rsi + 24]
setb sil
movzx r11d, sil
movabs rsi, 4294968273
add rsi, rax
mov r9, rdx
adc r9, 0
mov r10, rcx
adc r10, 0
mov rbx, r8
adc rbx, 0
xor r14d, r14d
movabs r15, -4294968274
cmp r15, rax
mov r15, -1
mov r12, -1
sbb r12, rdx
mov r12, -1
sbb r12, rcx
sbb r15, r8
mov r15d, 0
sbb r15, r11
mov r11d, 0
sbb r11, r11
mov r11d, 0
sbb r11, r11
sbb r14, r14
cmovae r9, rdx
cmovae rbx, r8
cmovae r10, rcx
cmovae rsi, rax
mov qword ptr [rdi + 16], r10
mov qword ptr [rdi + 24], rbx
mov qword ptr [rdi], rsi
mov qword ptr [rdi + 8], r9
pop rbx
pop r12
pop r14
pop r15
ret
```
## Analysis
In the "good" version, the extra SAR when using i320 is unnecessary and has been already reported separately in https://github.com/llvm/llvm-project/issues/103841
In the bad version, the following is equivalent to substractiong by M
```asm
movabs rsi, 4294968273
add rsi, rax
mov r9, rdx
adc r9, 0
mov r10, rcx
adc r10, 0
mov rbx, r8
adc rbx, 0
```
The optimizer misses that there can't be carries here, but that's not the main issue.
Then there is this problematic extra 12 instructions as a followup
```asm
movabs r15, -4294968274
cmp r15, rax
mov r15, -1
mov r12, -1
sbb r12, rdx
mov r12, -1
sbb r12, rcx
sbb r15, r8
mov r15d, 0
sbb r15, r11
mov r11d, 0
sbb r11, r11
mov r11d, 0
sbb r11, r11
sbb r14, r14
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs