| Issue |
107438
|
| Summary |
[Aarch64] Bad register scheduling (`bsl` -> `st4` might have to do with it?)
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
Validark
|
[Godbolt link](https://zig.godbolt.org/z/8PK7vEqfb)
```zig
const std = @import("std");
const builtin = @import("builtin");
export fn foo(data: @Vector(64, u8), prefix_sums: @Vector(64, u8), dest: [*]u8) void {
const shifted_left = shiftInterleavedElementsLeft(prefix_sums, 1, u32);
const shifted_compressed_data = shiftInterleavedElementsLeft(data, 1, u32);
st4(dest, BSL(shifted_left, shifted_compressed_data, data));
}
const HAS_ARM_NEON = switch (builtin.cpu.arch) {
.aarch64, .aarch64_be => std.Target.aarch64.featureSetHas(builtin.cpu.features, .neon),
.arm, .armeb => std.Target.arm.featureSetHas(builtin.cpu.features, .neon),
else => false,
};
fn BSL(selector: @Vector(64, u8), a: @Vector(64, u8), b: @Vector(64, u8)) @Vector(64, u8) {
return (a & selector) | (b & ~selector);
}
fn st4(ptr: [*]u8, vec: @Vector(64, u8)) void {
const chunks: [4]@Vector(16, u8) = @bitCast(vec);
if (!HAS_ARM_NEON or @inComptime()) {
ptr[0..64].* = std.simd.interlace(chunks);
} else struct {
extern fn @"llvm.aarch64.neon.st4.v16i8.p0"(@Vector(16, u8), @Vector(16, u8), @Vector(16, u8), @Vector(16, u8), [*]u8) void;
}.@"llvm.aarch64.neon.st4.v16i8.p0"(chunks[0], chunks[1], chunks[2], chunks[3], ptr);
}
fn shiftElementsLeft(vec: @Vector(16, u8), comptime amount: std.simd.VectorCount(@Vector(64, u8)), comptime boundary: type) @Vector(16, u8) {
return if (boundary == u128)
std.simd.shiftElementsLeft(vec, amount, 0)
else
@bitCast(@as(@Vector(16 / @sizeOf(boundary), boundary), @bitCast(vec)) >> @splat(8*amount));
}
fn shiftInterleavedElementsLeft(vecs: @Vector(64, u8), comptime amount: std.simd.VectorCount(@Vector(64, u8)), boundary: type) @Vector(64, u8) {
var new_vecs: [4]@Vector(16, u8) = @bitCast(vecs);
if ((amount & 1) == 1) {
const n = shiftElementsLeft(new_vecs[0], 1, boundary);
new_vecs[0] = new_vecs[1];
new_vecs[1] = new_vecs[2];
new_vecs[2] = new_vecs[3];
new_vecs[3] = n;
}
if ((amount & 2) == 2) {
const n0 = shiftElementsLeft(new_vecs[0], 1, boundary);
const n1 = shiftElementsLeft(new_vecs[1], 1, boundary);
new_vecs[0] = new_vecs[2];
new_vecs[1] = new_vecs[3];
new_vecs[2] = n0;
new_vecs[3] = n1;
}
const left_amt = amount >> 2;
if (left_amt > 0) {
new_vecs = .{
shiftElementsLeft(new_vecs[0], left_amt, boundary),
shiftElementsLeft(new_vecs[1], left_amt, boundary),
shiftElementsLeft(new_vecs[2], left_amt, boundary),
shiftElementsLeft(new_vecs[3], left_amt, boundary)
};
}
return @bitCast(new_vecs);
}
```
This gives me 3 vector-to-vector `mov` instructions for the Apple M3:
```diff
foo:
ldp q0, q1, [x0, #32]
ldp q3, q2, [x0]
ldp q5, q4, [x1]
ushr v5.4s, v5.4s, #8
ushr v6.4s, v3.4s, #8
and v6.16b, v5.16b, v6.16b
bic v5.16b, v1.16b, v5.16b
ldp q7, q16, [x1, #32] ; what's the point of loading into q7 and q16?
- mov v18.16b, v16.16b
bsl v18.16b, v1.16b, v0.16b
- mov v17.16b, v7.16b
bsl v17.16b, v0.16b, v2.16b
- mov v16.16b, v4.16b
bsl v16.16b, v2.16b, v3.16b
orr v19.16b, v5.16b, v6.16b
st4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x2]
ret
```
LLVM IR dump emitted by Zig before LLVM's optimization passes run (via `zig build-obj ./src/llvm_code.zig -O ReleaseFast -target aarch64-linux -mcpu apple_latest --verbose-llvm-ir -fstrip >llvm_code.ll 2>&1`):
```llvm
; ModuleID = 'llvm_code'
source_filename = "llvm_code"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-musl"
%Target.Cpu.Feature.Set = type { [5 x i64] }
%Target.Cpu.Model = type { { ptr, i64 }, { ptr, i64 }, %Target.Cpu.Feature.Set }
%Target.Cpu = type { ptr, %Target.Cpu.Feature.Set, i6, [7 x i8] }
@builtin.zig_backend = internal unnamed_addr constant i64 2, align 8
@Target.Cpu.Feature.Set.empty = internal unnamed_addr constant %Target.Cpu.Feature.Set zeroinitializer, align 8
@Target.aarch64.cpu.apple_latest = internal unnamed_addr constant %Target.Cpu.Model { { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_227, i64 0), i64 12 }, { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_230, i64 0), i64 12 }, %Target.Cpu.Feature.Set { [5 x i64] [i64 158329674598400, i64 2251799830972420, i64 1125900041060352, i64 12885032960, i64 0] } }, align 8
@__anon_227 = internal unnamed_addr constant [13 x i8] c"apple_latest\00", align 1
@__anon_230 = internal unnamed_addr constant [13 x i8] c"apple-latest\00", align 1
@builtin.cpu = internal unnamed_addr constant %Target.Cpu { ptr getelementptr inbounds (i8, ptr @Target.aarch64.cpu.apple_latest, i64 0), %Target.Cpu.Feature.Set { [5 x i64] [i64 -6882457295353816576, i64 3831332528523300484, i64 4612917471702155264, i64 47783866528, i64 0] }, i6 6, [7 x i8] undef }, align 8
@start.simplified_logic = internal unnamed_addr constant i1 false, align 1
@builtin.output_mode = internal unnamed_addr constant i2 -2, align 1
@llvm_code.HAS_ARM_NEON = internal unnamed_addr constant i1 true, align 1
; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local void @foo(ptr noundef %0, ptr noundef %1, ptr align 1 nonnull %2) #0 {
3:
%4 = load <64 x i8>, ptr %0, align 64
%5 = load <64 x i8>, ptr %1, align 64
%6 = call fastcc <64 x i8> @llvm_code.shiftInterleavedElementsLeft__anon_1505(<64 x i8> %5)
%7 = call fastcc <64 x i8> @llvm_code.shiftInterleavedElementsLeft__anon_1505(<64 x i8> %4)
%8 = call fastcc <64 x i8> @llvm_code.BSL(<64 x i8> %6, <64 x i8> %7, <64 x i8> %4)
call fastcc void @llvm_code.st4(ptr align 1 nonnull %2, <64 x i8> %8)
ret void
}
; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <64 x i8> @llvm_code.shiftInterleavedElementsLeft__anon_1505(<64 x i8> %0) unnamed_addr #0 {
1:
%2 = alloca [64 x i8], align 16
%3 = alloca [64 x i8], align 16
store <64 x i8> %0, ptr %2, align 16
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %3, ptr align 16 %2, i64 64, i1 false)
%4 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
%5 = load <16 x i8>, ptr %4
%6 = call fastcc <16 x i8> @llvm_code.shiftElementsLeft__anon_1523(<16 x i8> %5)
%7 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
%8 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
%9 = load <16 x i8>, ptr %8
store <16 x i8> %9, ptr %7, align 16
%10 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
%11 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
%12 = load <16 x i8>, ptr %11
store <16 x i8> %12, ptr %10, align 16
%13 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
%14 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
%15 = load <16 x i8>, ptr %14
store <16 x i8> %15, ptr %13, align 16
%16 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
store <16 x i8> %6, ptr %16, align 16
%17 = load <64 x i8>, ptr %3, align 16
ret <64 x i8> %17
}
; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <64 x i8> @llvm_code.BSL(<64 x i8> %0, <64 x i8> %1, <64 x i8> %2) unnamed_addr #0 {
3:
%4 = and <64 x i8> %1, %0
%5 = xor <64 x i8> %0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%6 = and <64 x i8> %2, %5
%7 = or <64 x i8> %4, %6
ret <64 x i8> %7
}
; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc void @llvm_code.st4(ptr align 1 nonnull %0, <64 x i8> %1) unnamed_addr #0 {
2:
%3 = alloca [64 x i8], align 16
store <64 x i8> %1, ptr %3, align 16
%4 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
%5 = load <16 x i8>, ptr %4
%6 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
%7 = load <16 x i8>, ptr %6
%8 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
%9 = load <16 x i8>, ptr %8
%10 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
%11 = load <16 x i8>, ptr %10
call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %5, <16 x i8> %7, <16 x i8> %9, <16 x i8> %11, ptr align 1 nonnull %0)
ret void
}
; Function Attrs: nounwind willreturn nofree nocallback memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly %0, ptr noalias nocapture readonly %1, i64 %2, i1 immarg %3) #1
; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <16 x i8> @llvm_code.shiftElementsLeft__anon_1523(<16 x i8> %0) unnamed_addr #0 {
1:
%2 = bitcast <16 x i8> %0 to <4 x i32>
%3 = zext <4 x i5> <i5 8, i5 8, i5 8, i5 8> to <4 x i32>
%4 = lshr <4 x i32> %2, %3
%5 = bitcast <4 x i32> %4 to <16 x i8>
ret <16 x i8> %5
}
; Function Attrs: nounwind uwtable
declare void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, ptr align 1 nonnull %4) #2
attributes #0 = { nounwind uwtable nosanitize_coverage skipprofile "frame-pointer"="none" "target-cpu"="apple-latest" "target-features"="-a510,-a520,-a65,-a710,-a720,-a76,-a78,-a78c,-addr-lsl-fast,+aes,-aggressive-fma,+alternate-sextload-cvt-f32-pattern,+altnzcv,-alu-lsl-fast,+am,+amvs,+arith-bcc-fusion,+arith-cbz-fusion,-ascend-store-address,-b16b16,-balance-fp-ops,+bf16,-brbe,+bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,+ccdp,+ccidx,+ccpp,-chk,-clrbhb,-cmp-bcc-fusion,+complxnum,+CONTEXTIDREL2,-cortex-r82,-cpa,+crc,+crypto,-cssc,-d128,+disable-latency-sched-heuristic,-disable-ldp,-disable-stp,+dit,+dotprod,+ecv,+el2vmsa,+el3,-enable-select-opt,-ete,-exynos-cheap-as-move,-f32mm,-f64mm,-faminmax,+fgt,-fix-cortex-a53-835769,+flagm,-fmv,-force-32bit-jump-tables,+fp16fml,-fp8,-fp8dot2,-fp8dot4,-fp8fma,+fp-armv8,-fpmr,+fptoint,+fullfp16,+fuse-address,-fuse-addsub-2reg-const1,-fuse-adrp-add,+fuse-aes,+fuse-arith-logic,+fuse-crypto-eor,+fuse-csel,+fuse-literals,-gcs,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,+hcx,+i8mm,-ite,+jsconv,-ldp-aligned-only,+lor,-ls64,+lse,-lse128,+lse2,-lut,-mec,-mops,+mpam,-mte,+neon,-nmi,-no-bti-at-return-twice,-no-neg-immediates,-no-sve-fp-ld1r,-no-zcz-fp,+nv,-outline-atomics,+pan,+pan-rwv,+pauth,-pauth-lr,+perfmon,-predictable-select-expensive,+predres,-prfm-slc-target,-rand,+ras,-rasv2,+rcpc,-rcpc3,+rcpc-immo,+rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,+sb,+sel2,+sha2,+sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme2,-sme2p1,-sme-f16f16,-sme-f64f64,-sme-f8f16,-sme-f8f32,-sme-fa64,-sme-i16i64,-sme-lutv2,-spe,-spe-eef,-specres2,+specrestrict,+ssbs,-ssve-fp8dot2,-ssve-fp8dot4,-ssve-fp8fma,+store-pair-suppress,-stp-aligned-only,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-sve2p1,-tagged-globals,-the,+tlb-rmi,-tlbiw,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tpidrro-el0,+tracev8.4,-trbe,+uaops,-use-experimental-zeroing-pseudos,-use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,-v8.7a,-v8.8a,-v8.9a,+v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9.4a,-v9.5a,-v9a,+vh,-wfxt,-xs,+zcm,+zcz,-zcz-fp-workaround,+zcz-gp" }
attributes #1 = { nounwind willreturn nofree nocallback memory(argmem: readwrite) }
attributes #2 = { nounwind uwtable "frame-pointer"="none" "target-cpu"="apple-latest" "target-features"="-a510,-a520,-a65,-a710,-a720,-a76,-a78,-a78c,-addr-lsl-fast,+aes,-aggressive-fma,+alternate-sextload-cvt-f32-pattern,+altnzcv,-alu-lsl-fast,+am,+amvs,+arith-bcc-fusion,+arith-cbz-fusion,-ascend-store-address,-b16b16,-balance-fp-ops,+bf16,-brbe,+bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,+ccdp,+ccidx,+ccpp,-chk,-clrbhb,-cmp-bcc-fusion,+complxnum,+CONTEXTIDREL2,-cortex-r82,-cpa,+crc,+crypto,-cssc,-d128,+disable-latency-sched-heuristic,-disable-ldp,-disable-stp,+dit,+dotprod,+ecv,+el2vmsa,+el3,-enable-select-opt,-ete,-exynos-cheap-as-move,-f32mm,-f64mm,-faminmax,+fgt,-fix-cortex-a53-835769,+flagm,-fmv,-force-32bit-jump-tables,+fp16fml,-fp8,-fp8dot2,-fp8dot4,-fp8fma,+fp-armv8,-fpmr,+fptoint,+fullfp16,+fuse-address,-fuse-addsub-2reg-const1,-fuse-adrp-add,+fuse-aes,+fuse-arith-logic,+fuse-crypto-eor,+fuse-csel,+fuse-literals,-gcs,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,+hcx,+i8mm,-ite,+jsconv,-ldp-aligned-only,+lor,-ls64,+lse,-lse128,+lse2,-lut,-mec,-mops,+mpam,-mte,+neon,-nmi,-no-bti-at-return-twice,-no-neg-immediates,-no-sve-fp-ld1r,-no-zcz-fp,+nv,-outline-atomics,+pan,+pan-rwv,+pauth,-pauth-lr,+perfmon,-predictable-select-expensive,+predres,-prfm-slc-target,-rand,+ras,-rasv2,+rcpc,-rcpc3,+rcpc-immo,+rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,+sb,+sel2,+sha2,+sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme2,-sme2p1,-sme-f16f16,-sme-f64f64,-sme-f8f16,-sme-f8f32,-sme-fa64,-sme-i16i64,-sme-lutv2,-spe,-spe-eef,-specres2,+specrestrict,+ssbs,-ssve-fp8dot2,-ssve-fp8dot4,-ssve-fp8fma,+store-pair-suppress,-stp-aligned-only,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-sve2p1,-tagged-globals,-the,+tlb-rmi,-tlbiw,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tpidrro-el0,+tracev8.4,-trbe,+uaops,-use-experimental-zeroing-pseudos,-use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,-v8.7a,-v8.8a,-v8.9a,+v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9.4a,-v9.5a,-v9a,+vh,-wfxt,-xs,+zcm,+zcz,-zcz-fp-workaround,+zcz-gp" }
!llvm.module.flags = !{}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs