kmclaughlin created this revision.
kmclaughlin added reviewers: sdesmalen, efriedma, andwar, dancgr, mgudim.
Herald added subscribers: psnobl, rkruppe, hiraditya, kristof.beyls, tschuett.
Herald added a reviewer: rengolin.
Herald added a project: LLVM.

This patch adds the llvm.aarch64.sve.ldnf1 intrinsic, plus DAG combine rules
that fold sign- and zero-extends of non-faulting loads into the corresponding
extending load instructions.

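For illustration, here is a minimal sketch distilled from the new tests (the
@example_* function names are hypothetical). The explicit zext is subsumed by
the implicit zero-extension that ldnf1h already performs, while the sext is
rewritten to the new LDNF1S node and selected as ldnf1sh; either way, each
function lowers to a single load:

  ; Expected codegen: ldnf1h { z0.s }, p0/z, [x0]
  define <vscale x 4 x i32> @example_zext(<vscale x 4 x i1> %pg, i16* %a) {
    %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
    %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
    ret <vscale x 4 x i32> %res
  }

  ; Expected codegen: ldnf1sh { z0.s }, p0/z, [x0]
  define <vscale x 4 x i32> @example_sext(<vscale x 4 x i1> %pg, i16* %a) {
    %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
    %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
    ret <vscale x 4 x i32> %res
  }

  declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1>, i16*)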

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D71698

Files:
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  llvm/lib/Target/AArch64/AArch64ISelLowering.h
  llvm/lib/Target/AArch64/AArch64InstrInfo.td
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll

Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
@@ -0,0 +1,182 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @ldnf1b(<vscale x 16 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b:
+; CHECK: ldnf1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1> %pg, i8* %a)
+  ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 8 x i16> @ldnf1b_h(<vscale x 8 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b_h:
+; CHECK: ldnf1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
+  %res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @ldnf1sb_h(<vscale x 8 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1sb_h:
+; CHECK: ldnf1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
+  %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @ldnf1h(<vscale x 8 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1h:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1> %pg, i16* %a)
+  ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 8 x half> @ldnf1h_f16(<vscale x 8 x i1> %pg, half* %a) {
+; CHECK-LABEL: ldnf1h_f16:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1> %pg, half* %a)
+  ret <vscale x 8 x half> %load
+}
+
+define <vscale x 4 x i32> @ldnf1b_s(<vscale x 4 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b_s:
+; CHECK: ldnf1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
+  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1sb_s(<vscale x 4 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1sb_s:
+; CHECK: ldnf1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
+  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1h_s(<vscale x 4 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1h_s:
+; CHECK: ldnf1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
+  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1sh_s(<vscale x 4 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1sh_s:
+; CHECK: ldnf1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
+  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @ldnf1w(<vscale x 4 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldnf1w:
+; CHECK: ldnf1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> %pg, i32* %a)
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @ldnf1w_f32(<vscale x 4 x i1> %pg, float* %a) {
+; CHECK-LABEL: ldnf1w_f32:
+; CHECK: ldnf1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnf1.nxv4f32(<vscale x 4 x i1> %pg, float* %a)
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x i64> @ldnf1b_d(<vscale x 2 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1b_d:
+; CHECK: ldnf1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
+  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1sb_d(<vscale x 2 x i1> %pg, i8* %a) {
+; CHECK-LABEL: ldnf1sb_d:
+; CHECK: ldnf1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1h_d(<vscale x 2 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1h_d:
+; CHECK: ldnf1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1sh_d(<vscale x 2 x i1> %pg, i16* %a) {
+; CHECK-LABEL: ldnf1sh_d:
+; CHECK: ldnf1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1w_d(<vscale x 2 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldnf1w_d:
+; CHECK: ldnf1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1sw_d(<vscale x 2 x i1> %pg, i32* %a) {
+; CHECK-LABEL: ldnf1sw_d:
+; CHECK: ldnf1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @ldnf1d(<vscale x 2 x i1> %pg, i64* %a) {
+; CHECK-LABEL: ldnf1d:
+; CHECK: ldnf1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1> %pg, i64* %a)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @ldnf1d_f64(<vscale x 2 x i1> %pg, double* %a) {
+; CHECK-LABEL: ldnf1d_f64:
+; CHECK: ldnf1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1> %pg, double* %a)
+  ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1>, i8*)
+
+declare <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1>, i8*)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1>, half*)
+
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1>, i8*)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1>, i16*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnf1.nxv4f32(<vscale x 4 x i1>, float*)
+
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1>, i8*)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1>, i16*)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1>, i32*)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1>, double*)
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5320,14 +5320,21 @@
 
 multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
                                RegisterOperand listty, ZPRRegOp zprty> {
-  def "" : sve_mem_cld_si_base<dtype, nf, asm, listty>;
+  def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
 
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
-                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+                  (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
-                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+                  (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
-                  (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+                  (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+
+  // We need a layer of indirection because early machine code passes balk at
+  // physical register (i.e. FFR) uses that have no previous definition.
+  let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
+  def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
+           PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>;
+  }
 }
 
 multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -46,6 +46,7 @@
 def AArch64ld1_gather_sxtw_scaled    : SDNode<"AArch64ISD::GLD1_SXTW_SCALED",   SDT_AArch64_GLD1,     [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1_gather_imm            : SDNode<"AArch64ISD::GLD1_IMM",           SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 
+def AArch64ldnf1s                    : SDNode<"AArch64ISD::LDNF1S",             SDT_AArch64_LDNF1,    [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1s_gather               : SDNode<"AArch64ISD::GLD1S",              SDT_AArch64_GLD1,     [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1s_gather_scaled        : SDNode<"AArch64ISD::GLD1S_SCALED",       SDT_AArch64_GLD1,     [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1s_gather_uxtw          : SDNode<"AArch64ISD::GLD1S_UXTW",         SDT_AArch64_GLD1,     [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -1210,6 +1211,39 @@
   defm : pred_store<nxv8i16, nxv8i1,  non_temporal_store, STNT1H_ZRI>;
   defm : pred_store<nxv4i32, nxv4i1,  non_temporal_store, STNT1W_ZRI>;
   defm : pred_store<nxv2i64, nxv2i1,  non_temporal_store, STNT1D_ZRI>;
+
+  multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
+    // base
+    def : Pat<(Ty (Load  (PredTy PPR:$gp), GPR64:$base, MemVT)),
+              (I PPR:$gp, GPR64sp:$base, (i64 0))>;
+  }
+
+  // 2-element contiguous non-faulting loads
+  defm : ldnf1<LDNF1B_D_IMM,  nxv2i64, AArch64ldnf1,  nxv2i1, nxv2i8>;
+  defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i8>;
+  defm : ldnf1<LDNF1H_D_IMM,  nxv2i64, AArch64ldnf1,  nxv2i1, nxv2i16>;
+  defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i16>;
+  defm : ldnf1<LDNF1W_D_IMM,  nxv2i64, AArch64ldnf1,  nxv2i1, nxv2i32>;
+  defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i32>;
+  defm : ldnf1<LDNF1D_IMM,    nxv2i64, AArch64ldnf1,  nxv2i1, nxv2i64>;
+  defm : ldnf1<LDNF1D_IMM,    nxv2f64, AArch64ldnf1,  nxv2i1, nxv2f64>;
+
+  // 4-element contiguous non-faulting loads
+  defm : ldnf1<LDNF1B_S_IMM,  nxv4i32, AArch64ldnf1,  nxv4i1, nxv4i8>;
+  defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i8>;
+  defm : ldnf1<LDNF1H_S_IMM,  nxv4i32, AArch64ldnf1,  nxv4i1, nxv4i16>;
+  defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i16>;
+  defm : ldnf1<LDNF1W_IMM,    nxv4i32, AArch64ldnf1,  nxv4i1, nxv4i32>;
+  defm : ldnf1<LDNF1W_IMM,    nxv4f32, AArch64ldnf1,  nxv4i1, nxv4f32>;
+
+  // 8-element contiguous non-faulting loads
+  defm : ldnf1<LDNF1B_H_IMM,  nxv8i16, AArch64ldnf1,  nxv8i1, nxv8i8>;
+  defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s, nxv8i1, nxv8i8>;
+  defm : ldnf1<LDNF1H_IMM,    nxv8i16, AArch64ldnf1,  nxv8i1, nxv8i16>;
+  defm : ldnf1<LDNF1H_IMM,    nxv8f16, AArch64ldnf1,  nxv8i1, nxv8f16>;
+
+  // 16-element contiguous non-faulting loads
+  defm : ldnf1<LDNF1B_IMM,    nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;
 }
 
 let Predicates = [HasSVE2] in {
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -541,6 +541,13 @@
 def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def SDT_AArch64_LDNF1 : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
+  SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64ldnf1  : SDNode<"AArch64ISD::LDNF1",  SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -206,6 +206,9 @@
 
   INSR,
 
+  LDNF1,
+  LDNF1S,
+
   // Unsigned gather loads.
   GLD1,
   GLD1_SCALED,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1347,6 +1347,8 @@
   case AArch64ISD::UUNPKHI:           return "AArch64ISD::UUNPKHI";
   case AArch64ISD::UUNPKLO:           return "AArch64ISD::UUNPKLO";
   case AArch64ISD::INSR:              return "AArch64ISD::INSR";
+  case AArch64ISD::LDNF1:             return "AArch64ISD::LDNF1";
+  case AArch64ISD::LDNF1S:            return "AArch64ISD::LDNF1S";
   case AArch64ISD::GLD1:              return "AArch64ISD::GLD1";
   case AArch64ISD::GLD1_SCALED:       return "AArch64ISD::GLD1_SCALED";
   case AArch64ISD::GLD1_SXTW:         return "AArch64ISD::GLD1_SXTW";
@@ -9991,9 +9993,14 @@
   if (!Src.hasOneUse())
     return SDValue();
 
+  EVT MemVT;
+
   // GLD1* instructions perform an implicit zero-extend, which makes them
   // perfect candidates for combining.
   switch (Src->getOpcode()) {
+  case AArch64ISD::LDNF1:
+    MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
+    break;
   case AArch64ISD::GLD1:
   case AArch64ISD::GLD1_SCALED:
   case AArch64ISD::GLD1_SXTW:
@@ -10001,13 +10008,12 @@
   case AArch64ISD::GLD1_UXTW:
   case AArch64ISD::GLD1_UXTW_SCALED:
   case AArch64ISD::GLD1_IMM:
+    MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
     break;
   default:
     return SDValue();
   }
 
-  EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
-
   if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
     return Src;
 
@@ -11033,6 +11039,39 @@
                             ISD::UNINDEXED, false, false);
 }
 
+static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  EVT ContainerVT = VT;
+  if (ContainerVT.isInteger()) {
+    switch (VT.getVectorNumElements()) {
+    default: return SDValue();
+    case 16: ContainerVT = MVT::nxv16i8; break;
+    case  8: ContainerVT = MVT::nxv8i16; break;
+    case  4: ContainerVT = MVT::nxv4i32; break;
+    case  2: ContainerVT = MVT::nxv2i64; break;
+    }
+  }
+
+  SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
+  SDValue Ops[] = { N->getOperand(0), // Chain
+                    N->getOperand(2), // Pg
+                    N->getOperand(3), // Base
+                    DAG.getValueType(VT) };
+
+  SDValue Load = DAG.getNode(AArch64ISD::LDNF1, DL, VTs, Ops);
+  SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+  if (ContainerVT.isInteger() && (VT != ContainerVT))
+    Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
+
+  return DAG.getMergeValues({ Load, LoadChain }, DL);
+}
+
 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.  The
 /// load store optimizer pass will merge them to store pair stores.  This should
 /// be better than a movi to create the vector zero followed by a vector store
@@ -12246,6 +12285,9 @@
   // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
   unsigned NewOpc;
   switch (Opc) {
+  case AArch64ISD::LDNF1:
+    NewOpc = AArch64ISD::LDNF1S;
+    break;
   case AArch64ISD::GLD1:
     NewOpc = AArch64ISD::GLD1S;
     break;
@@ -12272,15 +12314,23 @@
   }
 
   EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
-  EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
 
-  if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse())
+  unsigned OpNum = NewOpc == AArch64ISD::LDNF1S ? 3 : 4;
+  EVT LD1SrcMemVT = cast<VTSDNode>(Src->getOperand(OpNum))->getVT();
+
+  if ((SignExtSrcVT != LD1SrcMemVT) || !Src.hasOneUse())
     return SDValue();
 
   EVT DstVT = N->getValueType(0);
   SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
-  SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2),
-                   Src->getOperand(3), Src->getOperand(4)};
+
+  SmallVector<SDValue, 4> Ops;
+  Ops.push_back(Src->getOperand(0));
+  Ops.push_back(Src->getOperand(1));
+  Ops.push_back(Src->getOperand(2));
+  Ops.push_back(Src->getOperand(3));
+  if (NewOpc != AArch64ISD::LDNF1S)
+    Ops.push_back(Src->getOperand(4));
 
   SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
   DCI.CombineTo(N, ExtLoad);
@@ -12380,6 +12430,8 @@
       return performNEONPostLDSTCombine(N, DCI, DAG);
     case Intrinsic::aarch64_sve_ldnt1:
       return performLDNT1Combine(N, DAG);
+    case Intrinsic::aarch64_sve_ldnf1:
+      return performLDNF1Combine(N, DAG);
     case Intrinsic::aarch64_sve_stnt1:
       return performSTNT1Combine(N, DAG);
     case Intrinsic::aarch64_sve_ld1_gather:
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -775,6 +775,12 @@
                  LLVMPointerTo<0>],
                 [IntrReadMem, IntrArgMemOnly]>;
 
+  class AdvSIMD_1Vec_PredFaultingLoad_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                 LLVMPointerToElt<0>],
+                [IntrReadMem, IntrArgMemOnly]>;
+
   class AdvSIMD_1Vec_PredStore_Intrinsic
     : Intrinsic<[],
                 [llvm_anyvector_ty,
@@ -1116,6 +1122,8 @@
 
 def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 
+def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
+
 //
 // Stores
 //