Issue 107438
Summary [Aarch64] Bad register scheduling (`bsl` -> `st4` might have to do with it?)
Labels new issue
Assignees
Reporter Validark
    [Godbolt link](https://zig.godbolt.org/z/8PK7vEqfb)

```zig
const std = @import("std");
const builtin = @import("builtin");

export fn foo(data: @Vector(64, u8), prefix_sums: @Vector(64, u8), dest: [*]u8) void {
    const shifted_left = shiftInterleavedElementsLeft(prefix_sums, 1, u32);
    const shifted_compressed_data = shiftInterleavedElementsLeft(data, 1, u32);
    st4(dest, BSL(shifted_left, shifted_compressed_data, data));
}

const HAS_ARM_NEON = switch (builtin.cpu.arch) {
    .aarch64, .aarch64_be => std.Target.aarch64.featureSetHas(builtin.cpu.features, .neon),
    .arm, .armeb => std.Target.arm.featureSetHas(builtin.cpu.features, .neon),
 else => false,
};

fn BSL(selector: @Vector(64, u8), a: @Vector(64, u8), b: @Vector(64, u8)) @Vector(64, u8) {
    return (a & selector) | (b & ~selector);
}

fn st4(ptr: [*]u8, vec: @Vector(64, u8)) void {
 const chunks: [4]@Vector(16, u8) = @bitCast(vec);
    if (!HAS_ARM_NEON or @inComptime()) {
        ptr[0..64].* = std.simd.interlace(chunks);
 } else struct {
        extern fn @"llvm.aarch64.neon.st4.v16i8.p0"(@Vector(16, u8), @Vector(16, u8), @Vector(16, u8), @Vector(16, u8), [*]u8) void;
 }.@"llvm.aarch64.neon.st4.v16i8.p0"(chunks[0], chunks[1], chunks[2], chunks[3], ptr);
}

fn shiftElementsLeft(vec: @Vector(16, u8), comptime amount: std.simd.VectorCount(@Vector(64, u8)), comptime boundary: type) @Vector(16, u8) {
    return if (boundary == u128)
 std.simd.shiftElementsLeft(vec, amount, 0)
    else
 @bitCast(@as(@Vector(16 / @sizeOf(boundary), boundary), @bitCast(vec)) >> @splat(8*amount));
}

fn shiftInterleavedElementsLeft(vecs: @Vector(64, u8), comptime amount: std.simd.VectorCount(@Vector(64, u8)), boundary: type) @Vector(64, u8) {
    var new_vecs: [4]@Vector(16, u8) = @bitCast(vecs);

    if ((amount & 1) == 1) {
        const n = shiftElementsLeft(new_vecs[0], 1, boundary);
        new_vecs[0] = new_vecs[1];
        new_vecs[1] = new_vecs[2];
        new_vecs[2] = new_vecs[3];
        new_vecs[3] = n;
    }

    if ((amount & 2) == 2) {
        const n0 = shiftElementsLeft(new_vecs[0], 1, boundary);
        const n1 = shiftElementsLeft(new_vecs[1], 1, boundary);
        new_vecs[0] = new_vecs[2];
        new_vecs[1] = new_vecs[3];
        new_vecs[2] = n0;
        new_vecs[3] = n1;
 }

    const left_amt = amount >> 2;

    if (left_amt > 0) {
 new_vecs = .{
            shiftElementsLeft(new_vecs[0], left_amt, boundary),
            shiftElementsLeft(new_vecs[1], left_amt, boundary),
            shiftElementsLeft(new_vecs[2], left_amt, boundary),
            shiftElementsLeft(new_vecs[3], left_amt, boundary)
        };
    }

    return @bitCast(new_vecs);
}
```

This gives me 3 vector-to-vector `mov` instructions for the Apple M3:

```diff
foo:
        ldp     q0, q1, [x0, #32]
        ldp     q3, q2, [x0]
        ldp     q5, q4, [x1]
        ushr    v5.4s, v5.4s, #8
        ushr    v6.4s, v3.4s, #8
        and     v6.16b, v5.16b, v6.16b
        bic     v5.16b, v1.16b, v5.16b
        ldp     q7, q16, [x1, #32] ; what's the point of loading into q7 and q16?
-       mov     v18.16b, v16.16b
        bsl v18.16b, v1.16b, v0.16b
-       mov     v17.16b, v7.16b
        bsl v17.16b, v0.16b, v2.16b
-       mov     v16.16b, v4.16b
        bsl v16.16b, v2.16b, v3.16b
        orr     v19.16b, v5.16b, v6.16b
 st4     { v16.16b, v17.16b, v18.16b, v19.16b }, [x2]
 ret
```

Unoptimized LLVM IR as emitted by Zig, before LLVM's optimization passes run (via `zig build-obj ./src/llvm_code.zig -O ReleaseFast -target aarch64-linux -mcpu apple_latest --verbose-llvm-ir -fstrip >llvm_code.ll 2>&1`):

```llvm
; ModuleID = 'llvm_code'
source_filename = "llvm_code"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-musl"

%Target.Cpu.Feature.Set = type { [5 x i64] }
%Target.Cpu.Model = type { { ptr, i64 }, { ptr, i64 }, %Target.Cpu.Feature.Set }
%Target.Cpu = type { ptr, %Target.Cpu.Feature.Set, i6, [7 x i8] }

@builtin.zig_backend = internal unnamed_addr constant i64 2, align 8
@Target.Cpu.Feature.Set.empty = internal unnamed_addr constant %Target.Cpu.Feature.Set zeroinitializer, align 8
@Target.aarch64.cpu.apple_latest = internal unnamed_addr constant %Target.Cpu.Model { { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_227, i64 0), i64 12 }, { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_230, i64 0), i64 12 }, %Target.Cpu.Feature.Set { [5 x i64] [i64 158329674598400, i64 2251799830972420, i64 1125900041060352, i64 12885032960, i64 0] } }, align 8
@__anon_227 = internal unnamed_addr constant [13 x i8] c"apple_latest\00", align 1
@__anon_230 = internal unnamed_addr constant [13 x i8] c"apple-latest\00", align 1
@builtin.cpu = internal unnamed_addr constant %Target.Cpu { ptr getelementptr inbounds (i8, ptr @Target.aarch64.cpu.apple_latest, i64 0), %Target.Cpu.Feature.Set { [5 x i64] [i64 -6882457295353816576, i64 3831332528523300484, i64 4612917471702155264, i64 47783866528, i64 0] }, i6 6, [7 x i8] undef }, align 8
@start.simplified_logic = internal unnamed_addr constant i1 false, align 1
@builtin.output_mode = internal unnamed_addr constant i2 -2, align 1
@llvm_code.HAS_ARM_NEON = internal unnamed_addr constant i1 true, align 1

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local void @foo(ptr noundef %0, ptr noundef %1, ptr align 1 nonnull %2) #0 {
3:
  %4 = load <64 x i8>, ptr %0, align 64
 %5 = load <64 x i8>, ptr %1, align 64
  %6 = call fastcc <64 x i8> @llvm_code.shiftInterleavedElementsLeft__anon_1505(<64 x i8> %5)
  %7 = call fastcc <64 x i8> @llvm_code.shiftInterleavedElementsLeft__anon_1505(<64 x i8> %4)
  %8 = call fastcc <64 x i8> @llvm_code.BSL(<64 x i8> %6, <64 x i8> %7, <64 x i8> %4)
  call fastcc void @llvm_code.st4(ptr align 1 nonnull %2, <64 x i8> %8)
  ret void
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <64 x i8> @llvm_code.shiftInterleavedElementsLeft__anon_1505(<64 x i8> %0) unnamed_addr #0 {
1:
  %2 = alloca [64 x i8], align 16
  %3 = alloca [64 x i8], align 16
  store <64 x i8> %0, ptr %2, align 16
  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %3, ptr align 16 %2, i64 64, i1 false)
  %4 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
  %5 = load <16 x i8>, ptr %4
  %6 = call fastcc <16 x i8> @llvm_code.shiftElementsLeft__anon_1523(<16 x i8> %5)
  %7 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
  %8 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
  %9 = load <16 x i8>, ptr %8
  store <16 x i8> %9, ptr %7, align 16
  %10 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
  %11 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
  %12 = load <16 x i8>, ptr %11
  store <16 x i8> %12, ptr %10, align 16
  %13 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
  %14 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
  %15 = load <16 x i8>, ptr %14
  store <16 x i8> %15, ptr %13, align 16
  %16 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
  store <16 x i8> %6, ptr %16, align 16
  %17 = load <64 x i8>, ptr %3, align 16
  ret <64 x i8> %17
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <64 x i8> @llvm_code.BSL(<64 x i8> %0, <64 x i8> %1, <64 x i8> %2) unnamed_addr #0 {
3:
  %4 = and <64 x i8> %1, %0
  %5 = xor <64 x i8> %0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %6 = and <64 x i8> %2, %5
  %7 = or <64 x i8> %4, %6
  ret <64 x i8> %7
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc void @llvm_code.st4(ptr align 1 nonnull %0, <64 x i8> %1) unnamed_addr #0 {
2:
  %3 = alloca [64 x i8], align 16
  store <64 x i8> %1, ptr %3, align 16
  %4 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
  %5 = load <16 x i8>, ptr %4
  %6 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
  %7 = load <16 x i8>, ptr %6
  %8 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
  %9 = load <16 x i8>, ptr %8
  %10 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
  %11 = load <16 x i8>, ptr %10
  call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %5, <16 x i8> %7, <16 x i8> %9, <16 x i8> %11, ptr align 1 nonnull %0)
  ret void
}

; Function Attrs: nounwind willreturn nofree nocallback memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly %0, ptr noalias nocapture readonly %1, i64 %2, i1 immarg %3) #1

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <16 x i8> @llvm_code.shiftElementsLeft__anon_1523(<16 x i8> %0) unnamed_addr #0 {
1:
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = zext <4 x i5> <i5 8, i5 8, i5 8, i5 8> to <4 x i32>
  %4 = lshr <4 x i32> %2, %3
  %5 = bitcast <4 x i32> %4 to <16 x i8>
  ret <16 x i8> %5
}

; Function Attrs: nounwind uwtable
declare void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, ptr align 1 nonnull %4) #2

attributes #0 = { nounwind uwtable nosanitize_coverage skipprofile "frame-pointer"="none" "target-cpu"="apple-latest" "target-features"="-a510,-a520,-a65,-a710,-a720,-a76,-a78,-a78c,-addr-lsl-fast,+aes,-aggressive-fma,+alternate-sextload-cvt-f32-pattern,+altnzcv,-alu-lsl-fast,+am,+amvs,+arith-bcc-fusion,+arith-cbz-fusion,-ascend-store-address,-b16b16,-balance-fp-ops,+bf16,-brbe,+bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,+ccdp,+ccidx,+ccpp,-chk,-clrbhb,-cmp-bcc-fusion,+complxnum,+CONTEXTIDREL2,-cortex-r82,-cpa,+crc,+crypto,-cssc,-d128,+disable-latency-sched-heuristic,-disable-ldp,-disable-stp,+dit,+dotprod,+ecv,+el2vmsa,+el3,-enable-select-opt,-ete,-exynos-cheap-as-move,-f32mm,-f64mm,-faminmax,+fgt,-fix-cortex-a53-835769,+flagm,-fmv,-force-32bit-jump-tables,+fp16fml,-fp8,-fp8dot2,-fp8dot4,-fp8fma,+fp-armv8,-fpmr,+fptoint,+fullfp16,+fuse-address,-fuse-addsub-2reg-const1,-fuse-adrp-add,+fuse-aes,+fuse-arith-logic,+fuse-crypto-eor,+fuse-csel,+fuse-literals,-gcs,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,+hcx,+i8mm,-ite,+jsconv,-ldp-aligned-only,+lor,-ls64,+lse,-lse128,+lse2,-lut,-mec,-mops,+mpam,-mte,+neon,-nmi,-no-bti-at-return-twice,-no-neg-immediates,-no-sve-fp-ld1r,-no-zcz-fp,+nv,-outline-atomics,+pan,+pan-rwv,+pauth,-pauth-lr,+perfmon,-predictable-select-expensive,+predres,-prfm-slc-target,-rand,+ras,-rasv2,+rcpc,-rcpc3,+rcpc-immo,+rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,+sb,+sel2,+sha2,+sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme2,-sme2p1,-sme-f16f16,-sme-f64f64,-sme-f8f16,-sme-f8f32,-sme-fa64,-sme-i16i64,-sme-lutv2,-spe,-spe-e
ef,-specres2,+specrestrict,+ssbs,-ssve-fp8dot2,-ssve-fp8dot4,-ssve-fp8fma,+store-pair-suppress,-stp-aligned-only,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-sve2p1,-tagged-globals,-the,+tlb-rmi,-tlbiw,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tpidrro-el0,+tracev8.4,-trbe,+uaops,-use-experimental-zeroing-pseudos,-use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,-v8.7a,-v8.8a,-v8.9a,+v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9.4a,-v9.5a,-v9a,+vh,-wfxt,-xs,+zcm,+zcz,-zcz-fp-workaround,+zcz-gp" }
attributes #1 = { nounwind willreturn nofree nocallback memory(argmem: readwrite) }
attributes #2 = { nounwind uwtable "frame-pointer"="none" "target-cpu"="apple-latest" "target-features"="-a510,-a520,-a65,-a710,-a720,-a76,-a78,-a78c,-addr-lsl-fast,+aes,-aggressive-fma,+alternate-sextload-cvt-f32-pattern,+altnzcv,-alu-lsl-fast,+am,+amvs,+arith-bcc-fusion,+arith-cbz-fusion,-ascend-store-address,-b16b16,-balance-fp-ops,+bf16,-brbe,+bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,+ccdp,+ccidx,+ccpp,-chk,-clrbhb,-cmp-bcc-fusion,+complxnum,+CONTEXTIDREL2,-cortex-r82,-cpa,+crc,+crypto,-cssc,-d128,+disable-latency-sched-heuristic,-disable-ldp,-disable-stp,+dit,+dotprod,+ecv,+el2vmsa,+el3,-enable-select-opt,-ete,-exynos-cheap-as-move,-f32mm,-f64mm,-faminmax,+fgt,-fix-cortex-a53-835769,+flagm,-fmv,-force-32bit-jump-tables,+fp16fml,-fp8,-fp8dot2,-fp8dot4,-fp8fma,+fp-armv8,-fpmr,+fptoint,+fullfp16,+fuse-address,-fuse-addsub-2reg-const1,-fuse-adrp-add,+fuse-aes,+fuse-arith-logic,+fuse-crypto-eor,+fuse-csel,+fuse-literals,-gcs,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,+hcx,+i8mm,-ite,+jsconv,-ldp-aligned-only,+lor,-ls64,+lse,-lse128,+lse2,-lut,-mec,-mops,+mpam,-mte,+neon,-nmi,-no-bti-at-return-twice,-no-neg-immediates,-no-sve-fp-ld1r,-no-zcz-fp,+nv,-outline-atomics,+pan,+pan-rwv,+pauth,-pauth-lr,+perfmon,-predictable-select-expensive,+predres,-prfm-slc-target,-rand,+ras,-rasv2,+rcpc,-rcpc3,+rcpc-immo,+rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,+sb,+sel2,+sha2,+sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme2,-sme2p1,-sme-f16f16,-sme-f64f64,-sme-f8f16,-sme-f8f32,-sme-fa64,-sme-i16i64,-sme-lutv2,-spe,-spe-eef,-specres2,+specrestrict,+ssbs
,-ssve-fp8dot2,-ssve-fp8dot4,-ssve-fp8fma,+store-pair-suppress,-stp-aligned-only,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-sve2p1,-tagged-globals,-the,+tlb-rmi,-tlbiw,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tpidrro-el0,+tracev8.4,-trbe,+uaops,-use-experimental-zeroing-pseudos,-use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,-v8.7a,-v8.8a,-v8.9a,+v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9.4a,-v9.5a,-v9a,+vh,-wfxt,-xs,+zcm,+zcz,-zcz-fp-workaround,+zcz-gp" }

!llvm.module.flags = !{}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to