kmclaughlin updated this revision to Diff 239303.
kmclaughlin added a comment.
- Rebased patch after changes made to parent revision

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D73025/new/

https://reviews.llvm.org/D73025

Files:
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  llvm/lib/Target/AArch64/AArch64ISelLowering.h
  llvm/lib/Target/AArch64/AArch64InstrInfo.td
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve-intrinsics-loads-ff.ll
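For context, the new intrinsic takes a governing predicate and a base pointer and performs a contiguous first-faulting load. A minimal IR-level sketch of what the patch enables (the function name @example is illustrative; the intrinsic signature is taken from the declarations in the test file below):

  ; Sketch: a first-faulting load of 16 bytes under predicate %pg,
  ; expected to select to: ldff1b { z0.b }, p0/z, [x0]
  define <vscale x 16 x i8> @example(<vscale x 16 x i1> %pg, i8* %base) {
    %v = call <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
    ret <vscale x 16 x i8> %v
  }

  declare <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1>, i8*)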
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads-ff.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads-ff.ll
@@ -0,0 +1,220 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LDFF1B
+;
+
+define <vscale x 16 x i8> @ldff1b(<vscale x 16 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldff1b:
+; CHECK: ldff1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1> %pg, i8* %a)
+  ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 8 x i16> @ldff1b_h(<vscale x 8 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldff1b_h:
+; CHECK: ldff1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
+  %res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @ldff1b_s(<vscale x 4 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldff1b_s:
+; CHECK: ldff1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
+  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @ldff1b_d(<vscale x 2 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldff1b_d:
+; CHECK: ldff1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
+  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+;
+; LDFF1SB
+;
+
+define <vscale x 8 x i16> @ldff1sb_h(<vscale x 8 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldff1sb_h:
+; CHECK: ldff1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
+  %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @ldff1sb_s(<vscale x 4 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldff1sb_s:
+; CHECK: ldff1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
+  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @ldff1sb_d(<vscale x 2 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldff1sb_d:
+; CHECK: ldff1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+;
+; LDFF1H
+;
+
+define <vscale x 8 x i16> @ldff1h(<vscale x 8 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldff1h:
+; CHECK: ldff1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1> %pg, i16* %a)
+  ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 4 x i32> @ldff1h_s(<vscale x 4 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldff1h_s:
+; CHECK: ldff1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
+  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @ldff1h_d(<vscale x 2 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldff1h_d:
+; CHECK: ldff1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x half> @ldff1h_f16(<vscale x 8 x i1> %pg, half* %a) {
+; CHECK-LABEL: ldff1h_f16:
+; CHECK: ldff1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1> %pg, half* %a)
+  ret <vscale x 8 x half> %load
+}
+
+;
+; LDFF1SH
+;
+
+define <vscale x 4 x i32> @ldff1sh_s(<vscale x 4 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldff1sh_s:
+; CHECK: ldff1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
+  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @ldff1sh_d(<vscale x 2 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldff1sh_d:
+; CHECK: ldff1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+;
+; LDFF1W
+;
+
+define <vscale x 4 x i32> @ldff1w(<vscale x 4 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldff1w:
+; CHECK: ldff1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.nxv4i32(<vscale x 4 x i1> %pg, i32* %a)
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @ldff1w_d(<vscale x 2 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldff1w_d:
+; CHECK: ldff1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x float> @ldff1w_f32(<vscale x 4 x i1> %pg, float* %a) {
+; CHECK-LABEL: ldff1w_f32:
+; CHECK: ldff1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.nxv4f32(<vscale x 4 x i1> %pg, float* %a)
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x float> @ldff1w_2f32(<vscale x 2 x i1> %pg, float* %a) {
+; CHECK-LABEL: ldff1w_2f32:
+; CHECK: ldff1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x float> @llvm.aarch64.sve.ldff1.nxv2f32(<vscale x 2 x i1> %pg, float* %a)
+  ret <vscale x 2 x float> %load
+}
+
+;
+; LDFF1SW
+;
+
+define <vscale x 2 x i64> @ldff1sw_d(<vscale x 2 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldff1sw_d:
+; CHECK: ldff1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+;
+; LDFF1D
+;
+
+define <vscale x 2 x i64> @ldff1d(<vscale x 2 x i1> %pg, i64* %a) {
+; CHECK-LABEL: ldff1d:
+; CHECK: ldff1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.nxv2i64(<vscale x 2 x i1> %pg, i64* %a)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @ldff1d_f64(<vscale x 2 x i1> %pg, double* %a) {
+; CHECK-LABEL: ldff1d_f64:
+; CHECK: ldff1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.nxv2f64(<vscale x 2 x i1> %pg, double* %a)
+  ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1>, i8*)
+
+declare <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1>, i8*)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1>, half*)
+
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1>, i8*)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1>, i16*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 2 x float> @llvm.aarch64.sve.ldff1.nxv2f32(<vscale x 2 x i1>, float*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.nxv4f32(<vscale x 4 x i1>, float*)
+
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.nxv2i8(<vscale x 2 x i1>, i8*)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.nxv2i16(<vscale x 2 x i1>, i16*)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.nxv2i32(<vscale x 2 x i1>, i32*)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.nxv2i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.nxv2f64(<vscale x 2 x i1>, double*)
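A note on semantics: a first-faulting load only takes a fault for the first active element. Later elements that would fault are not loaded; instead, the corresponding lanes of the first-fault register (FFR) are cleared, and consumers read the FFR to discover which lanes are valid. A sketch of that usage, assuming a companion @llvm.aarch64.sve.rdffr intrinsic that is not part of this revision:

  ; Sketch (assumed intrinsic @llvm.aarch64.sve.rdffr, not added here):
  ; after the first-faulting load, the FFR holds a 1 for each lane
  ; that loaded successfully.
  define <vscale x 16 x i1> @valid_lanes(<vscale x 16 x i1> %pg, i8* %base) {
    %v = call <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
    %ffr = call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr()
    ret <vscale x 16 x i1> %ffr
  }

  declare <vscale x 16 x i1> @llvm.aarch64.sve.rdffr()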
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5780,6 +5780,13 @@
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
                   (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
+
+  // We need a layer of indirection because early machine code passes balk at
+  // physical register (i.e. FFR) uses that have no previous definition.
+  let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+    def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>,
+             PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>;
+  }
 }

 multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
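On the pseudo above: LDFF1* instructions implicitly read and write the physical FFR register, and the expected programming model initializes the FFR before the load. Without the pseudo layer, that implicit FFR use can look like a read of a never-defined physical register to early machine passes. A sketch of the intended initialize-then-load pattern, assuming a companion @llvm.aarch64.sve.setffr intrinsic that is not part of this revision:

  ; Sketch (assumed intrinsic @llvm.aarch64.sve.setffr, not added here):
  ; set all FFR lanes to valid before the first-faulting load.
  define <vscale x 16 x i8> @init_then_load(<vscale x 16 x i1> %pg, i8* %base) {
    call void @llvm.aarch64.sve.setffr()
    %v = call <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
    ret <vscale x 16 x i8> %v
  }

  declare void @llvm.aarch64.sve.setffr()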
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -47,6 +47,7 @@
 def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ldnf1s : SDNode<"AArch64ISD::LDNF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1s : SDNode<"AArch64ISD::LDFF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -1266,6 +1267,40 @@
   // 16-element contiguous non-faulting loads
   defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;
+
+  multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
+    // Add more complex addressing modes here as required.
+    // Base
+    def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
+              (I PPR:$gp, GPR64sp:$base, XZR)>;
+  }
+
+  // 2-element contiguous first faulting loads
+  defm : ldff1<LDFF1B_D,  nxv2i64, AArch64ldff1,  nxv2i1, nxv2i8>;
+  defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i8>;
+  defm : ldff1<LDFF1H_D,  nxv2i64, AArch64ldff1,  nxv2i1, nxv2i16>;
+  defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i16>;
+  defm : ldff1<LDFF1W_D,  nxv2i64, AArch64ldff1,  nxv2i1, nxv2i32>;
+  defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i32>;
+  defm : ldff1<LDFF1D,    nxv2i64, AArch64ldff1,  nxv2i1, nxv2i64>;
+  defm : ldff1<LDFF1W_D,  nxv2f32, AArch64ldff1,  nxv2i1, nxv2f32>;
+  defm : ldff1<LDFF1D,    nxv2f64, AArch64ldff1,  nxv2i1, nxv2f64>;
+
+  // 4-element contiguous first faulting loads
+  defm : ldff1<LDFF1B_S,  nxv4i32, AArch64ldff1,  nxv4i1, nxv4i8>;
+  defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i8>;
+  defm : ldff1<LDFF1H_S,  nxv4i32, AArch64ldff1,  nxv4i1, nxv4i16>;
+  defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i16>;
+  defm : ldff1<LDFF1W,    nxv4i32, AArch64ldff1,  nxv4i1, nxv4i32>;
+  defm : ldff1<LDFF1W,    nxv4f32, AArch64ldff1,  nxv4i1, nxv4f32>;
+
+  // 8-element contiguous first faulting loads
+  defm : ldff1<LDFF1B_H,  nxv8i16, AArch64ldff1,  nxv8i1, nxv8i8>;
+  defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s, nxv8i1, nxv8i8>;
+  defm : ldff1<LDFF1H,    nxv8i16, AArch64ldff1,  nxv8i1, nxv8i16>;
+  defm : ldff1<LDFF1H,    nxv8f16, AArch64ldff1,  nxv8i1, nxv8f16>;
+
+  // 16-element contiguous first faulting loads
+  defm : ldff1<LDFF1B,    nxv16i8, AArch64ldff1,  nxv16i1, nxv16i8>;
 }

 let Predicates = [HasSVE2] in {
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -553,6 +553,7 @@
 ]>;

 def AArch64ldnf1 : SDNode<"AArch64ISD::LDNF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldff1 : SDNode<"AArch64ISD::LDFF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;

 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -217,6 +217,8 @@
   LDNF1,
   LDNF1S,
+  LDFF1,
+  LDFF1S,

   // Unsigned gather loads.
   GLD1,
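The separate LDFF1S node mirrors the LDNF1/LDNF1S split: it lets the sign-extend combine in AArch64ISelLowering.cpp (below) fold a sext of an extending first-faulting load into a single sign-extending instruction. The IR shape it targets, taken from the ldff1sb_h test above:

  ; A sext user of a narrow ldff1 result folds into ldff1sb;
  ; the zext form selects ldff1b instead.
  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
  %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>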
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1361,6 +1361,8 @@
   case AArch64ISD::PTRUE:             return "AArch64ISD::PTRUE";
   case AArch64ISD::LDNF1:             return "AArch64ISD::LDNF1";
   case AArch64ISD::LDNF1S:            return "AArch64ISD::LDNF1S";
+  case AArch64ISD::LDFF1:             return "AArch64ISD::LDFF1";
+  case AArch64ISD::LDFF1S:            return "AArch64ISD::LDFF1S";
   case AArch64ISD::GLD1:              return "AArch64ISD::GLD1";
   case AArch64ISD::GLD1_SCALED:       return "AArch64ISD::GLD1_SCALED";
   case AArch64ISD::GLD1_SXTW:         return "AArch64ISD::GLD1_SXTW";
@@ -10181,6 +10183,7 @@
   // perfect candidates for combining.
   switch (Src->getOpcode()) {
   case AArch64ISD::LDNF1:
+  case AArch64ISD::LDFF1:
     MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
     break;
   case AArch64ISD::GLD1:
@@ -11242,7 +11245,7 @@
                      ISD::UNINDEXED, false, false);
 }

-static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG, bool isFF) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
@@ -11259,7 +11262,8 @@
                    N->getOperand(3), // Base
                    DAG.getValueType(VT) };

-  SDValue Load = DAG.getNode(AArch64ISD::LDNF1, DL, VTs, Ops);
+  unsigned Opc = isFF ? AArch64ISD::LDFF1 : AArch64ISD::LDNF1;
+  SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
   SDValue LoadChain = SDValue(Load.getNode(), 1);

   if (ContainerVT.isInteger() && (VT != ContainerVT))
@@ -12515,6 +12519,10 @@
     NewOpc = AArch64ISD::LDNF1S;
     MemVTOpNum = 3;
     break;
+  case AArch64ISD::LDFF1:
+    NewOpc = AArch64ISD::LDFF1S;
+    MemVTOpNum = 3;
+    break;
   case AArch64ISD::GLD1:
     NewOpc = AArch64ISD::GLD1S;
     break;
@@ -12650,7 +12658,9 @@
   case Intrinsic::aarch64_sve_ldnt1:
     return performLDNT1Combine(N, DAG);
   case Intrinsic::aarch64_sve_ldnf1:
-    return performLDNF1Combine(N, DAG);
+    return performLDNF1Combine(N, DAG, false);
+  case Intrinsic::aarch64_sve_ldff1:
+    return performLDNF1Combine(N, DAG, true);
   case Intrinsic::aarch64_sve_stnt1:
     return performSTNT1Combine(N, DAG);
   case Intrinsic::aarch64_sve_ld1_gather:
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1176,6 +1176,7 @@

 def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
+def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;

 //
 // Stores