Issue 103946
Summary [opt / isel] 66% more unnecessary instructions generated due to bad pattern or missed sbb/icmp/cmov fusion
Labels new issue
Assignees
Reporter mratsim
    I'm not sure who to blame between `opt` or the code generator:
- either opt generates IR that uses a pattern that the codegenerator doesn't recognize and fuse correctly
- or the codegenerator should recognize the pattern

In any case the following code has a good version with 12 instructions in the critical path (add/adc, sub/sbb and cmov) and the bad version .v2 has 20 instructions in the critical path.

Both are from the same IR except that one has a constant inlined by `opt`.

This is similar to https://github.com/llvm/llvm-project/issues/102868 but even worse.

https://alive2.llvm.org/ce/z/KzsXa9

## Original IR

```llvm
define void @_modadd_mayo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %ax = zext i256 %a to i320
  %bx = zext i256 %b to i320
  %a_plus_b = add i320 %ax, %bx
  %mx = zext i256 %M to i320
  %apb_minus_M = sub i320 %a_plus_b, %mx
  %5 = lshr i320 %apb_minus_M, 319
  %6 = trunc i320 %5 to i1
  %7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
  %8 = trunc i320 %7 to i256
  store i256 %8, ptr %0, align 4
  ret void
}

@secp256k1_fp_mod = constant i256 -4294968273, section "ctt.secp256k1_fp.constants", align 64

; Function Attrs: hot
define internal fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %ax = zext i256 %a to i320
  %bx = zext i256 %b to i320
 %a_plus_b = add i320 %ax, %bx
  %mx = zext i256 %M to i320
 %apb_minus_M = sub i320 %a_plus_b, %mx
  %5 = lshr i320 %apb_minus_M, 319
  %6 = trunc i320 %5 to i1
  %7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
  %8 = trunc i320 %7 to i256
  store i256 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
define void @secp256k1_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.secp256k1_fp" {
  call fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr @secp256k1_fp_mod)
  ret void
}

attributes #2 = { hot }
```

## opt -O3
```llvm
define void @_modadd_mayo.u64x4(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2, ptr nocapture readonly %3) local_unnamed_addr #0 section "ctt.fields" {
 %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %ax = zext i256 %a to i320
  %bx = zext i256 %b to i320
  %a_plus_b = add nuw nsw i320 %bx, %ax
  %mx = zext i256 %M to i320
  %apb_minus_M = sub nsw i320 %a_plus_b, %mx
  %.not1 = icmp slt i320 %apb_minus_M, 0
  %5 = select i1 %.not1, i320 %a_plus_b, i320 %apb_minus_M
  %6 = trunc i320 %5 to i256
  store i256 %6, ptr %0, align 4
  ret void
}

define void @secp256k1_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #1 section "ctt.secp256k1_fp" {
  %.val = load i256, ptr %1, align 4
  %.val1 = load i256, ptr %2, align 4
  %ax.i = zext i256 %.val to i320
  %bx.i = zext i256 %.val1 to i320
  %a_plus_b.i = add nuw nsw i320 %bx.i, %ax.i
  %apb_minus_M.i = add nuw nsw i320 %a_plus_b.i, 4294968273
  %.not.i = icmp ugt i320 %a_plus_b.i, 115792089237316195423570985008687907853269984665640564039457584007908834671662
 %4 = select i1 %.not.i, i320 %apb_minus_M.i, i320 %a_plus_b.i
  %5 = trunc i320 %4 to i256
  store i256 %5, ptr %0, align 4
  ret void
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
```

## Codegen

```asm
_modadd_mayo.u64x4: # @_modadd_mayo.u64x4
        push    r14
        push rbx
        mov     r9, qword ptr [rdx + 24]
        mov     r8, qword ptr [rdx + 16]
        mov     rax, qword ptr [rdx]
        mov rdx, qword ptr [rdx + 8]
        add     rax, qword ptr [rsi]
 adc     rdx, qword ptr [rsi + 8]
        adc     r8, qword ptr [rsi + 16]
        adc     r9, qword ptr [rsi + 24]
        setb    sil
 movzx   esi, sil
        mov     r10, rax
        sub     r10, qword ptr [rcx]
        mov     r11, rdx
        sbb     r11, qword ptr [rcx + 8]
        mov     rbx, r8
        sbb     rbx, qword ptr [rcx + 16]
        mov     r14, r9
        sbb     r14, qword ptr [rcx + 24]
        sbb     rsi, 0
        sar     rsi, 63
        cmovs r11, rdx
        cmovs   r14, r9
        cmovs   rbx, r8
 cmovs   r10, rax
        mov     qword ptr [rdi + 16], rbx
        mov qword ptr [rdi + 24], r14
        mov     qword ptr [rdi], r10
 mov     qword ptr [rdi + 8], r11
        pop     rbx
        pop r14
        ret
secp256k1_fp_add:                       # @secp256k1_fp_add
        push    r15
        push    r14
 push    r12
        push    rbx
        mov     r8, qword ptr [rdx + 24]
        mov     rcx, qword ptr [rdx + 16]
        mov     rax, qword ptr [rdx]
        mov     rdx, qword ptr [rdx + 8]
        add rax, qword ptr [rsi]
        adc     rdx, qword ptr [rsi + 8]
 adc     rcx, qword ptr [rsi + 16]
        adc     r8, qword ptr [rsi + 24]
        setb    sil
        movzx   r11d, sil
        movabs rsi, 4294968273
        add     rsi, rax
        mov     r9, rdx
 adc     r9, 0
        mov     r10, rcx
        adc     r10, 0
 mov     rbx, r8
        adc     rbx, 0
        xor     r14d, r14d
        movabs  r15, -4294968274
        cmp     r15, rax
 mov     r15, -1
        mov     r12, -1
        sbb     r12, rdx
 mov     r12, -1
        sbb     r12, rcx
        sbb     r15, r8
        mov     r15d, 0
        sbb     r15, r11
        mov r11d, 0
        sbb     r11, r11
        mov     r11d, 0
        sbb r11, r11
        sbb     r14, r14
        cmovae  r9, rdx
 cmovae  rbx, r8
        cmovae  r10, rcx
        cmovae  rsi, rax
 mov     qword ptr [rdi + 16], r10
        mov     qword ptr [rdi + 24], rbx
        mov     qword ptr [rdi], rsi
        mov     qword ptr [rdi + 8], r9
        pop     rbx
        pop     r12
        pop r14
        pop     r15
        ret
```

## Analysis

In the "good" version, the extra SAR when using i320 is unnecessary and has been already reported separately in https://github.com/llvm/llvm-project/issues/103841

In the bad version, the following is equivalent to substractiong by M
```asm
        movabs rsi, 4294968273
        add     rsi, rax
        mov     r9, rdx
 adc     r9, 0
        mov     r10, rcx
        adc     r10, 0
 mov     rbx, r8
        adc     rbx, 0
```
The optimizer misses that there can't be carries here, but that's not the main issue.

Then there is this problematic extra 12 instructions as a followup
```asm
        movabs  r15, -4294968274
        cmp r15, rax
        mov     r15, -1
        mov     r12, -1
        sbb r12, rdx
        mov     r12, -1
        sbb     r12, rcx
 sbb     r15, r8
        mov     r15d, 0
        sbb     r15, r11
 mov     r11d, 0
        sbb     r11, r11
        mov     r11d, 0
        sbb     r11, r11
        sbb     r14, r14
```

_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to