| Issue |
124309
|
| Summary |
x86-64 backend refuses to put i128 in SIMD registers when only bitops are used
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
orlp
|
Consider these two Rust functions:
```rust
#[unsafe(no_mangle)]
pub fn num(x: &mut [u128; 4]) {
let a = x[0];
let b = x[1];
let c = x[2];
x[3] = (a & b) | (!a & c);
}
#[unsafe(no_mangle)]
pub fn autovec(x: &mut [[u8; 16]; 4]) {
let a = x[0];
let b = x[1];
let c = x[2];
x[3] = std::array::from_fn(|i| (a[i] & b[i]) | (!a[i] & c[i]));
}
```
They compile to this LLVM IR:
```llvm
define void @num(ptr noalias nocapture noundef align 16 dereferenceable(64) %x) unnamed_addr {
start:
%a = load i128, ptr %x, align 16
%0 = getelementptr inbounds i8, ptr %x, i64 16
%b = load i128, ptr %0, align 16
%1 = getelementptr inbounds i8, ptr %x, i64 32
%c = load i128, ptr %1, align 16
%_5 = and i128 %b, %a
%_7 = xor i128 %a, -1
%_6 = and i128 %c, %_7
%2 = getelementptr inbounds i8, ptr %x, i64 48
%3 = or disjoint i128 %_6, %_5
store i128 %3, ptr %2, align 16
ret void
}
define void @autovec(ptr noalias nocapture noundef align 1 dereferenceable(64) %x) unnamed_addr personality ptr @rust_eh_personality {
start:
%0 = getelementptr inbounds i8, ptr %x, i64 16
%1 = getelementptr inbounds i8, ptr %x, i64 32
%2 = getelementptr inbounds i8, ptr %x, i64 48
%3 = load <16 x i8>, ptr %x, align 1
%4 = load <16 x i8>, ptr %0, align 1
%5 = load <16 x i8>, ptr %1, align 1
%6 = and <16 x i8> %4, %3
%7 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%8 = and <16 x i8> %5, %7
%9 = or disjoint <16 x i8> %8, %6
store <16 x i8> %9, ptr %2, align 1
ret void
}
```
So far so good. However, when targeting x86-64-v4 we get this assembly output:
```asm
num:
mov rax, qword ptr [rdi]
mov rcx, qword ptr [rdi + 8]
mov rdx, qword ptr [rdi + 24]
and rdx, rcx
mov rsi, qword ptr [rdi + 16]
and rsi, rax
andn rcx, rcx, qword ptr [rdi + 40]
or rcx, rdx
andn rax, rax, qword ptr [rdi + 32]
or rax, rsi
mov qword ptr [rdi + 56], rcx
mov qword ptr [rdi + 48], rax
ret
autovec:
vmovdqu xmm0, xmmword ptr [rdi]
vmovdqu xmm1, xmmword ptr [rdi + 16]
vpternlogq xmm1, xmm0, xmmword ptr [rdi + 32], 226
vmovdqu xmmword ptr [rdi + 48], xmm1
ret
```
I don't understand why the two are not identical, i.e. why the code generated for `autovec` is so much better than the code generated for `num`.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs