Issue 124309
Summary x86-64 backend refuses to put i128 in SIMD registers when only bitops are used
Labels new issue
Assignees
Reporter orlp
    Consider these two Rust functions:

```rust
#[unsafe(no_mangle)]
pub fn num(x: &mut [u128; 4]) {
    let a = x[0];
    let b = x[1];
    let c = x[2];
    x[3] = (a & b) | (!a & c);
}

#[unsafe(no_mangle)]
pub fn autovec(x: &mut [[u8; 16]; 4]) {
    let a = x[0];
    let b = x[1];
    let c = x[2];
    x[3] = std::array::from_fn(|i| (a[i] & b[i]) | (!a[i] & c[i]));
}
```

They compile to this LLVM IR:

```llvm
define void @num(ptr noalias nocapture noundef align 16 dereferenceable(64) %x) unnamed_addr {
start:
  %a = load i128, ptr %x, align 16
  %0 = getelementptr inbounds i8, ptr %x, i64 16
  %b = load i128, ptr %0, align 16
  %1 = getelementptr inbounds i8, ptr %x, i64 32
  %c = load i128, ptr %1, align 16
  %_5 = and i128 %b, %a
  %_7 = xor i128 %a, -1
  %_6 = and i128 %c, %_7
  %2 = getelementptr inbounds i8, ptr %x, i64 48
  %3 = or disjoint i128 %_6, %_5
  store i128 %3, ptr %2, align 16
  ret void
}

define void @autovec(ptr noalias nocapture noundef align 1 dereferenceable(64) %x) unnamed_addr personality ptr @rust_eh_personality {
start:
  %0 = getelementptr inbounds i8, ptr %x, i64 16
  %1 = getelementptr inbounds i8, ptr %x, i64 32
  %2 = getelementptr inbounds i8, ptr %x, i64 48
  %3 = load <16 x i8>, ptr %x, align 1
  %4 = load <16 x i8>, ptr %0, align 1
  %5 = load <16 x i8>, ptr %1, align 1
  %6 = and <16 x i8> %4, %3
  %7 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %8 = and <16 x i8> %5, %7
  %9 = or disjoint <16 x i8> %8, %6
  store <16 x i8> %9, ptr %2, align 1
  ret void
}
```

So far so good. However, we get this x86-64-v4 output assembly:

```asm
num:
        mov     rax, qword ptr [rdi]
        mov     rcx, qword ptr [rdi + 8]
        mov     rdx, qword ptr [rdi + 24]
        and     rdx, rcx
        mov     rsi, qword ptr [rdi + 16]
        and     rsi, rax
        andn    rcx, rcx, qword ptr [rdi + 40]
        or      rcx, rdx
        andn    rax, rax, qword ptr [rdi + 32]
        or      rax, rsi
        mov     qword ptr [rdi + 56], rcx
        mov     qword ptr [rdi + 48], rax
        ret

autovec:
        vmovdqu xmm0, xmmword ptr [rdi]
        vmovdqu xmm1, xmmword ptr [rdi + 16]
        vpternlogq      xmm1, xmm0, xmmword ptr [rdi + 32], 226
        vmovdqu xmmword ptr [rdi + 48], xmm1
        ret
```

I don't understand why they're not identical, and why the autovec code is so much better.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to