https://github.com/Amichaxx updated https://github.com/llvm/llvm-project/pull/182835
>From 855acfc0c5e8600f67673a6ab8a3736abac96df0 Mon Sep 17 00:00:00 2001 From: Amichaxx <[email protected]> Date: Thu, 19 Feb 2026 17:48:12 +0000 Subject: [PATCH 1/2] [AArch64] Fold zero-high vector inserts in MI peephole optimisation Summary This patch follows on from #178227. The previous ISel fold lowers the 64-bit case to: fmov d0, x0 fmov d0, d0 which is not ideal and could be fmov d0, x0. A redundant copy comes from the INSERT_SUBREG/INSvi64lane. This peephole detects <2 x i64> vectors made of a zeroed upper and low lane produced by FMOVXDr/FMOVDr, then removes the redundant copy. Further updated tests and added MIR tests. --- .../Target/AArch64/AArch64MIPeepholeOpt.cpp | 51 +++++++++++++++++-- llvm/test/CodeGen/AArch64/aarch64-addv.ll | 3 -- .../AArch64/aarch64-matrix-umull-smull.ll | 15 +++--- llvm/test/CodeGen/AArch64/bitcast-extend.ll | 1 - llvm/test/CodeGen/AArch64/ctpop.ll | 1 - llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll | 48 ++++++++--------- .../AArch64/neon-lowhalf128-optimisation.ll | 1 - .../CodeGen/AArch64/peephole-insvigpr.mir | 51 +++++++++++++++++++ 8 files changed, 129 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 21ff921da9b8a..398273babe1b1 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -687,14 +687,57 @@ bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) { } // All instructions that set a FPR64 will implicitly zero the top bits of the -// register. +// register. When the def is expressed as a COPY from a GPR, turn it into an +// explicit FMOV so it cannot be elided later in further passes. static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI, - MachineRegisterInfo *MRI) { + MachineRegisterInfo *MRI, + const AArch64InstrInfo *TII) { if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef()) return false; const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg()); if (RC != &AArch64::FPR64RegClass) return false; + if (MI->getOpcode() == TargetOpcode::COPY) { + MachineOperand &SrcOp = MI->getOperand(1); + if (!SrcOp.isReg()) + return false; + if (SrcOp.getSubReg()) + return false; + Register SrcReg = SrcOp.getReg(); + auto IsGPR64Like = [&]() -> bool { + if (SrcReg.isVirtual()) + return AArch64::GPR64allRegClass.hasSubClassEq( + MRI->getRegClass(SrcReg)); + return AArch64::GPR64allRegClass.contains(SrcReg); + }; + if (!IsGPR64Like()) + return false; + assert(TII && "Expected InstrInfo when materializing COPYs"); + // FMOVXDr insists on strict GPR64 operands, so fix up the COPY source. + MachineOperand &SrcMO = MI->getOperand(1); + bool SrcKill = SrcMO.isKill(); + if (SrcReg.isVirtual()) { + if (MRI->getRegClass(SrcReg) != &AArch64::GPR64RegClass) { + // Pass the value through a temporary GPR64 vreg to satisfy the + // verifier. + Register NewSrc = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), NewSrc) + .addReg(SrcReg, getKillRegState(SrcKill)); + SrcReg = NewSrc; + SrcKill = true; + } + } else if (!AArch64::GPR64RegClass.contains(SrcReg)) { + return false; + } + SrcMO.setReg(SrcReg); + SrcMO.setSubReg(0); + SrcMO.setIsKill(SrcKill); + // Replace the COPY with an explicit FMOV so the zeroing behaviour stays + // visible. + MI->setDesc(TII->get(AArch64::FMOVXDr)); + return true; + } return MI->getOpcode() > TargetOpcode::GENERIC_OP_END; } @@ -710,7 +753,7 @@ bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) { if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG) return false; Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg()); - if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI)) + if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI, TII)) return false; // Check there is `mov 0` MI for high 64-bits. @@ -751,7 +794,7 @@ bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) { bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) { // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR. MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg()); - if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI)) + if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI, TII)) return false; // Let's remove MIs for high 64-bits. diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index d8aeeff79b936..de68a79824eb3 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -545,7 +545,6 @@ define i8 @addv_zero_lanes_v16i8(ptr %arr) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: ldrb w8, [x0] ; CHECK-SD-NEXT: fmov d0, x8 -; CHECK-SD-NEXT: fmov d0, d0 ; CHECK-SD-NEXT: addv b0, v0.16b ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret @@ -570,7 +569,6 @@ define i16 @addv_zero_lanes_v8i16(ptr %arr) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: ldrh w8, [x0] ; CHECK-SD-NEXT: fmov d0, x8 -; CHECK-SD-NEXT: fmov d0, d0 ; CHECK-SD-NEXT: addv h0, v0.8h ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret @@ -595,7 +593,6 @@ define i32 @addv_zero_lanes_v4i32(ptr %arr) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: ldr w8, [x0] ; CHECK-SD-NEXT: fmov d0, x8 -; CHECK-SD-NEXT: fmov d0, d0 ; CHECK-SD-NEXT: addv s0, v0.4s ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index fa982ce27c7d0..ff2d5c68af531 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -823,14 +823,13 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-SD-NEXT: cbz x11, .LBB6_13 ; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph ; CHECK-SD-NEXT: mov w11, w1 -; CHECK-SD-NEXT: fmov d0, x8 -; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 -; CHECK-SD-NEXT: sxtb x8, w11 +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 ; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff +; CHECK-SD-NEXT: sxtb x11, w11 +; CHECK-SD-NEXT: fmov d2, x8 +; CHECK-SD-NEXT: dup v1.2s, w11 ; CHECK-SD-NEXT: mov x11, x10 ; CHECK-SD-NEXT: and x10, x9, #0xfffffffc -; CHECK-SD-NEXT: fmov d0, d0 -; CHECK-SD-NEXT: dup v2.2s, w8 ; CHECK-SD-NEXT: sub x8, x11, x10 ; CHECK-SD-NEXT: add x11, x0, x11 ; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body @@ -845,11 +844,11 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b ; CHECK-SD-NEXT: xtn v5.2s, v5.2d ; CHECK-SD-NEXT: xtn v4.2s, v4.2d -; CHECK-SD-NEXT: smlal v1.2d, v2.2s, v4.2s -; CHECK-SD-NEXT: smlal v0.2d, v2.2s, v5.2s +; CHECK-SD-NEXT: smlal v0.2d, v1.2s, v4.2s +; CHECK-SD-NEXT: smlal v2.2d, v1.2s, v5.2s ; CHECK-SD-NEXT: b.ne .LBB6_11 ; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block -; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: add v0.2d, v2.2d, v0.2d ; CHECK-SD-NEXT: cmp x10, x9 ; CHECK-SD-NEXT: addp d0, v0.2d ; CHECK-SD-NEXT: fmov x8, d0 diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll index 2bd91a8dc9a7d..b981c1701725a 100644 --- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll @@ -341,7 +341,6 @@ define <16 x i8> @load_zext_v16i8(ptr %p) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: ldr w8, [x0] ; CHECK-SD-NEXT: fmov d0, x8 -; CHECK-SD-NEXT: fmov d0, d0 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: load_zext_v16i8: diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index df817afb12368..84984c23f129e 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -602,7 +602,6 @@ define i128 @i128_mask(i128 %x) { ; CHECK-SD-NEXT: and x8, x0, #0xff ; CHECK-SD-NEXT: mov x1, xzr ; CHECK-SD-NEXT: fmov d0, x8 -; CHECK-SD-NEXT: fmov d0, d0 ; CHECK-SD-NEXT: cnt v0.16b, v0.16b ; CHECK-SD-NEXT: addv b0, v0.16b ; CHECK-SD-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll index fd7c869fe2f92..137a7feb1a85c 100644 --- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll @@ -829,9 +829,9 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) { ; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-CVT-SD-NEXT: cmp x20, #0 -; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-CVT-SD-NEXT: fmov d0, x8 -; CHECK-CVT-SD-NEXT: fmov d1, x9 +; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-CVT-SD-NEXT: fmov d1, x8 ; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-CVT-SD-NEXT: add sp, sp, #48 @@ -858,9 +858,9 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) { ; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-FP16-SD-NEXT: cmp x20, #0 -; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-FP16-SD-NEXT: fmov d0, x8 -; CHECK-FP16-SD-NEXT: fmov d1, x9 +; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-FP16-SD-NEXT: fmov d1, x8 ; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-FP16-SD-NEXT: add sp, sp, #48 @@ -1296,9 +1296,9 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) { ; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-CVT-SD-NEXT: cmp x20, #0 -; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-CVT-SD-NEXT: fmov d0, x8 -; CHECK-CVT-SD-NEXT: fmov d1, x9 +; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-CVT-SD-NEXT: fmov d1, x8 ; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-CVT-SD-NEXT: add sp, sp, #48 @@ -1326,9 +1326,9 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) { ; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-FP16-SD-NEXT: cmp x20, #0 -; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-FP16-SD-NEXT: fmov d0, x8 -; CHECK-FP16-SD-NEXT: fmov d1, x9 +; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-FP16-SD-NEXT: fmov d1, x8 ; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-FP16-SD-NEXT: add sp, sp, #48 @@ -1748,9 +1748,9 @@ define <2 x i64> @utest_f16i64(<2 x half> %x) { ; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-CVT-SD-NEXT: cmp x20, #0 -; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-CVT-SD-NEXT: fmov d0, x8 -; CHECK-CVT-SD-NEXT: fmov d1, x9 +; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-CVT-SD-NEXT: fmov d1, x8 ; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-CVT-SD-NEXT: add sp, sp, #48 @@ -1778,9 +1778,9 @@ define <2 x i64> @utest_f16i64(<2 x half> %x) { ; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-FP16-SD-NEXT: cmp x20, #0 -; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-FP16-SD-NEXT: fmov d0, x8 -; CHECK-FP16-SD-NEXT: fmov d1, x9 +; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-FP16-SD-NEXT: fmov d1, x8 ; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-FP16-SD-NEXT: add sp, sp, #48 @@ -2774,9 +2774,9 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) { ; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-CVT-SD-NEXT: cmp x20, #0 -; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-CVT-SD-NEXT: fmov d0, x8 -; CHECK-CVT-SD-NEXT: fmov d1, x9 +; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-CVT-SD-NEXT: fmov d1, x8 ; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-CVT-SD-NEXT: add sp, sp, #48 @@ -2803,9 +2803,9 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) { ; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-FP16-SD-NEXT: cmp x20, #0 -; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-FP16-SD-NEXT: fmov d0, x8 -; CHECK-FP16-SD-NEXT: fmov d1, x9 +; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-FP16-SD-NEXT: fmov d1, x8 ; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-FP16-SD-NEXT: add sp, sp, #48 @@ -3232,9 +3232,9 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) { ; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-CVT-SD-NEXT: cmp x20, #0 -; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-CVT-SD-NEXT: fmov d0, x8 -; CHECK-CVT-SD-NEXT: fmov d1, x9 +; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-CVT-SD-NEXT: fmov d1, x8 ; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-CVT-SD-NEXT: add sp, sp, #48 @@ -3262,9 +3262,9 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) { ; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-FP16-SD-NEXT: cmp x20, #0 -; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-FP16-SD-NEXT: fmov d0, x8 -; CHECK-FP16-SD-NEXT: fmov d1, x9 +; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-FP16-SD-NEXT: fmov d1, x8 ; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-FP16-SD-NEXT: add sp, sp, #48 @@ -3675,9 +3675,9 @@ define <2 x i64> @utest_f16i64_mm(<2 x half> %x) { ; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-CVT-SD-NEXT: cmp x20, #0 -; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-CVT-SD-NEXT: fmov d0, x8 -; CHECK-CVT-SD-NEXT: fmov d1, x9 +; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-CVT-SD-NEXT: fmov d1, x8 ; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-CVT-SD-NEXT: add sp, sp, #48 @@ -3705,9 +3705,9 @@ define <2 x i64> @utest_f16i64_mm(<2 x half> %x) { ; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload ; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq ; CHECK-FP16-SD-NEXT: cmp x20, #0 -; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq ; CHECK-FP16-SD-NEXT: fmov d0, x8 -; CHECK-FP16-SD-NEXT: fmov d1, x9 +; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq +; CHECK-FP16-SD-NEXT: fmov d1, x8 ; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-FP16-SD-NEXT: add sp, sp, #48 diff --git a/llvm/test/CodeGen/AArch64/neon-lowhalf128-optimisation.ll b/llvm/test/CodeGen/AArch64/neon-lowhalf128-optimisation.ll index 4e30813187fec..38be2992c8211 100644 --- a/llvm/test/CodeGen/AArch64/neon-lowhalf128-optimisation.ll +++ b/llvm/test/CodeGen/AArch64/neon-lowhalf128-optimisation.ll @@ -5,7 +5,6 @@ define <2 x i64> @low_vector_splat_v2i64_from_i64(i64 %0){ ; CHECK-LABEL: low_vector_splat_v2i64_from_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: fmov d0, d0 ; CHECK-NEXT: ret %2 = insertelement <1 x i64> poison, i64 %0, i64 0 %3 = shufflevector <1 x i64> %2, <1 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1> diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir index aef01e42ed7cc..a68eda11d5ca1 100644 --- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir +++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir @@ -41,6 +41,11 @@ ret void } + define void @insert_vec_from_gpr64_zero_high(i64 %v, ptr %dst) { + entry: + ret void + } + attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } ... @@ -521,4 +526,50 @@ body: | STRSui killed %16, %0, 0 :: (store (s32) into %ir.hist) RET_ReallyLR +--- +name: insert_vec_from_gpr64_zero_high +tracksRegLiveness: true +registers: + - { id: 0, class: gpr64common, preferred-register: '' } + - { id: 1, class: gpr64common, preferred-register: '' } + - { id: 2, class: fpr64, preferred-register: '' } + - { id: 3, class: fpr128, preferred-register: '' } + - { id: 4, class: fpr128, preferred-register: '' } + - { id: 5, class: fpr64, preferred-register: '' } + - { id: 6, class: fpr128, preferred-register: '' } + - { id: 7, class: fpr128, preferred-register: '' } + - { id: 8, class: fpr128, preferred-register: '' } +liveins: + - { reg: '$x0', virtual-reg: '%0' } + - { reg: '$x1', virtual-reg: '%1' } +body: | + bb.0.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: insert_vec_from_gpr64_zero_high + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PTR:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[VAL:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[GPR:%[0-9]+]]:gpr64 = COPY [[VAL]] + ; CHECK-NEXT: [[FMOV:%[0-9]+]]:fpr64 = FMOVXDr killed [[GPR]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_LOW:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[FMOV]], %subreg.dsub + ; CHECK-NEXT: [[MOVID:%[0-9]+]]:fpr64 = MOVID 0 + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_ZERO:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], killed [[MOVID]], %subreg.dsub + ; CHECK-NEXT: STRQui killed [[INSERT_LOW]], [[PTR]], 0 :: (store (s128) into %ir.dst, align 8) + ; CHECK-NEXT: RET_ReallyLR + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:fpr64 = COPY %1 + %4:fpr128 = IMPLICIT_DEF + %3:fpr128 = INSERT_SUBREG %4, %2, %subreg.dsub + %5:fpr64 = MOVID 0 + %7:fpr128 = IMPLICIT_DEF + %6:fpr128 = INSERT_SUBREG %7, killed %5, %subreg.dsub + %8:fpr128 = INSvi64lane %3, 1, killed %6, 0 + STRQui killed %8, %0, 0 :: (store (s128) into %ir.dst, align 8) + RET_ReallyLR + ... >From b9f5d7091c6790a85ef808bb33f0ee61c1b5353f Mon Sep 17 00:00:00 2001 From: Amichaxx <[email protected]> Date: Mon, 23 Feb 2026 12:18:15 +0000 Subject: [PATCH 2/2] Compile time regression test --- .../aarch64-neonvector-tensorflow-regression.ll | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll diff --git a/llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll b/llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll new file mode 100644 index 0000000000000..716508fb78246 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -start-before=aarch64-isel %s -o /dev/null +; Regression test for AArch64 compile-time regression, referring to PR #166962. + +define fastcc <2 x i64> @_ZN10tensorflow12_GLOBAL__N_125ComputeXWeightsAndIndicesERKNS_17ImageResizerStateEbPNSt3__u6vectorINS0_17WeightsAndIndicesENS4_9allocatorIS6_EEEE(<2 x i64> %0) { +entry: + %1 = tail call <2 x i64> @llvm.smin.v2i64(<2 x i64> %0, <2 x i64> <i64 -1, i64 0>) + ret <2 x i64> %1 +} + +; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) +declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) #0 + +attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
