[llvm-branch-commits] [llvm] AMDGPU: Fix buffer intrinsic store of bfloat (PR #95377)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/95377

None
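
For context, a minimal reproducer for the case this patch fixes, mirroring the
test added below (the function name is illustrative):

declare void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat, ptr addrspace(8), i32, i32, i32)

; Storing a single bfloat through the raw buffer store intrinsic; before this
; change, bf16 was missing from the custom-lowered store types.
define amdgpu_ps void @store_one_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) {
  call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
  ret void
}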

>From 520d91d73339d8bea65f2e30e2a4d7fd0eb3d92b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 9 Jun 2024 22:54:35 +0200
Subject: [PATCH] AMDGPU: Fix buffer intrinsic store of bfloat

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  4 +-
 .../llvm.amdgcn.raw.ptr.buffer.store.bf16.ll  | 37 ---
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4946129c65a95..81098201e9c0f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -874,7 +874,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
  {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
   MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
   MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
-  MVT::f16, MVT::i16, MVT::i8, MVT::i128},
+  MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
  Custom);
 
   setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
@@ -9973,7 +9973,7 @@ SDValue 
SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
   EVT VDataType, SDLoc DL,
   SDValue Ops[],
   MemSDNode *M) const {
-  if (VDataType == MVT::f16)
+  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
 
   SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
index f7f3742a90633..82dd35ab4c240 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -5,11 +5,38 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 
%s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | 
FileCheck --check-prefixes=GFX11 %s
 
-; FIXME
-; define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, 
bfloat %data, i32 %offset) {
-;   call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr 
addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
-;   ret void
-; }
+define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat 
%data, i32 %offset) {
+; GFX7-LABEL: buffer_store_bf16:
+; GFX7:   ; %bb.0:
+; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:s_endpgm
+;
+; GFX8-LABEL: buffer_store_bf16:
+; GFX8:   ; %bb.0:
+; GFX8-NEXT:buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX8-NEXT:s_endpgm
+;
+; GFX9-LABEL: buffer_store_bf16:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX9-NEXT:s_endpgm
+;
+; GFX10-LABEL: buffer_store_bf16:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:s_endpgm
+;
+; GFX11-LABEL: buffer_store_bf16:
+; GFX11:   ; %bb.0:
+; GFX11-NEXT:buffer_store_b16 v0, v1, s[0:3], 0 offen
+; GFX11-NEXT:s_nop 0
+; GFX11-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr 
addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
 
 define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x 
bfloat> %data, i32 %offset) {
 ; GFX7-LABEL: buffer_store_v2bf16:



[llvm-branch-commits] [llvm] AMDGPU: Fix buffer load/store of pointers (PR #95379)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/95379

Make sure we test all the address spaces since this support isn't
free in gisel.
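
A sketch of the cases under test: a buffer load of a flat (p0) pointer, as in
the diff below; the patch adds analogous coverage for the other address spaces
(the function name is illustrative):

declare ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8), i32, i32, i32)

define ptr @load_ptr_from_buffer(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
  ; The add folds into the instruction's immediate offset (offset:60 in the
  ; checks below).
  %off = add i32 %voffset, 60
  %p = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) %rsrc, i32 %off, i32 0, i32 0)
  ret ptr %p
}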

>From b05179ed684e289ce31f7aee8b57939c7bf2809c Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 12 Jun 2024 10:10:20 +0200
Subject: [PATCH] AMDGPU: Fix buffer load/store of pointers

Make sure we test all the address spaces since this support isn't
free in gisel.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  31 +-
 .../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll | 596 ++
 .../llvm.amdgcn.raw.ptr.buffer.store.ll   | 144 +
 3 files changed, 759 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 81098201e9c0f..7a36c88b892c8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1112,29 +1112,33 @@ unsigned 
SITargetLowering::getVectorTypeBreakdownForCallingConv(
 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
 }
 
-static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
+ const DataLayout &DL, Type *Ty,
+ unsigned MaxNumLanes) {
   assert(MaxNumLanes != 0);
 
+  LLVMContext &Ctx = Ty->getContext();
if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
-return EVT::getVectorVT(Ty->getContext(),
-EVT::getEVT(VT->getElementType()),
+return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
 NumElts);
   }
 
-  return EVT::getEVT(Ty);
+  return TLI.getValueType(DL, Ty);
 }
 
 // Peek through TFE struct returns to only use the data size.
-static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
+   const DataLayout &DL, Type *Ty,
+   unsigned MaxNumLanes) {
auto *ST = dyn_cast<StructType>(Ty);
   if (!ST)
-return memVTFromLoadIntrData(Ty, MaxNumLanes);
+return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
 
   // TFE intrinsics return an aggregate type.
   assert(ST->getNumContainedTypes() == 2 &&
  ST->getContainedType(1)->isIntegerTy(32));
-  return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
+  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
 }
 
 /// Map address space 7 to MVT::v5i32 because that's its in-memory
@@ -1219,10 +1223,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
 }
 
-Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
+Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
+ CI.getType(), MaxNumLanes);
   } else {
-Info.memVT = memVTFromLoadIntrReturn(
-CI.getType(), std::numeric_limits<unsigned>::max());
+Info.memVT =
+memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+std::numeric_limits<unsigned>::max());
   }
 
   // FIXME: What does alignment mean for an image?
@@ -1235,9 +1241,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   if (RsrcIntr->IsImage) {
 unsigned DMask = 
cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
-Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
+Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
+   DMaskLanes);
   } else
-Info.memVT = EVT::getEVT(DataTy);
+Info.memVT = getValueType(MF.getDataLayout(), DataTy);
 
   Info.flags |= MachineMemOperand::MOStore;
 } else {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 3e3371091ef72..4d557c76dc4d0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -1280,6 +1280,602 @@ define <2 x i64> @buffer_load_v2i64__voffset_add(ptr 
addrspace(8) inreg %rsrc, i
   ret <2 x i64> %data
 }
 
+define ptr @buffer_load_p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 
%voffset) {
+; PREGFX10-LABEL: buffer_load_p0__voffset_add:
+; PREGFX10:   ; %bb.0:
+; PREGFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p0__voffset_add:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expc

[llvm-branch-commits] [llvm] AMDGPU: Cleanup selection patterns for buffer loads (PR #95378)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/95378

We should just support these for all register types.
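
One practical effect, sketched in IR under the assumption that the new
Reg64Types entries feed the same MUBUF patterns (the overload and function
name are illustrative):

declare <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32)

; v4f16 is a 64-bit value, so it is now covered by the Reg64Types foreach
; rather than needing its own hand-written pattern.
define <4 x half> @load_v4f16(ptr addrspace(8) inreg %rsrc, i32 %off) {
  %v = call <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %off, i32 0, i32 0)
  ret <4 x half> %v
}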

>From 46c7f8b4529827204e5273472ea5b642ecb7266e Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 9 Jun 2024 23:12:31 +0200
Subject: [PATCH] AMDGPU: Cleanup selection patterns for buffer loads

We should just support these for all register types.
---
 llvm/lib/Target/AMDGPU/BUFInstructions.td | 72 ++-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 16 ++---
 2 files changed, 39 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 94dd45f1333b0..2f52edb7f917a 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1421,27 +1421,21 @@ let OtherPredicates = [HasPackedD16VMem] in {
   defm : MUBUF_LoadIntrinsicPat;
 } // End HasPackedD16VMem.
 
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_LoadIntrinsicPat;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_LoadIntrinsicPat;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_LoadIntrinsicPat;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_LoadIntrinsicPat;
+}
 
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
@@ -1532,27 +1526,21 @@ let OtherPredicates = [HasPackedD16VMem] in {
   defm : MUBUF_StoreIntrinsicPat;
 } // End HasPackedD16VMem.
 
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_StoreIntrinsicPat;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_StoreIntrinsicPat;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_StoreIntrinsicPat;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_StoreIntrinsicPat;
+}
 
 defm : MUBUF_StoreIntrinsicPat;
 defm : MUBUF_StoreIntrinsicPat;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td 
b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index caac7126068ef..a8efe2b2ba35e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -586,7 +586,9 @@ class RegisterTypes<list<ValueType> reg_types> {
 
 def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
 def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, 
p6]>;
-def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, 
v4bf16]>;
+def Reg96Types : RegisterTypes<[v3i32, v3f32]>;
+def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, 
v8bf16]>;
 
 let HasVGPR = 1 in {
 // VOP3 and VINTERP can access 256 lo and 256 hi registers.
@@ -744,7 +746,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, 
i16, f16, bf16, v2i16,
   let BaseClassOrder = 1;
 }
 
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, 
v8f16, v8bf16], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32,
   (add PRIVATE_RSRC_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
@@ -815,7 +817,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, 
i16, f16, bf16, v2i16, v
   let HasSGPR = 1;
 }
 
-def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, 
v4bf16], 32,
+def SGPR_64 : SIRegisterClass<"AMDGPU", Reg64Types.types, 32,
 (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 1;
@@ -905,8 +907,8 @@ multiclass SRegClass
-defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], 
SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>;
+defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>;

[llvm-branch-commits] [llvm] AMDGPU: Fix buffer load/store of pointers (PR #95379)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/95379
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#95379** 👈 (this PR)
* **#95378**
* **#95377**
* **#95376**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/



https://github.com/llvm/llvm-project/pull/95379


[llvm-branch-commits] [llvm] AMDGPU: Fix buffer intrinsic store of bfloat (PR #95377)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/95377
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#95379**
* **#95378**
* **#95377** 👈 (this PR)
* **#95376**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/



https://github.com/llvm/llvm-project/pull/95377


[llvm-branch-commits] [llvm] AMDGPU: Cleanup selection patterns for buffer loads (PR #95378)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/95378
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#95379**
* **#95378** 👈 (this PR)
* **#95377**
* **#95376**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/



https://github.com/llvm/llvm-project/pull/95378


[llvm-branch-commits] [llvm] AMDGPU: Cleanup selection patterns for buffer loads (PR #95378)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/95378


[llvm-branch-commits] [llvm] AMDGPU: Fix buffer load/store of pointers (PR #95379)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/95379


[llvm-branch-commits] [llvm] AMDGPU: Fix buffer intrinsic store of bfloat (PR #95377)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/95377


[llvm-branch-commits] [llvm] AMDGPU: Cleanup selection patterns for buffer loads (PR #95378)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95378

>From 1dfcc0961e82bbe656faded0c38e694da0d76c9b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 9 Jun 2024 23:12:31 +0200
Subject: [PATCH] AMDGPU: Cleanup selection patterns for buffer loads

We should just support these for all register types.
---
 llvm/lib/Target/AMDGPU/BUFInstructions.td | 72 ++-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 16 ++---
 2 files changed, 39 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 50e62788c5eac..978d261f5a662 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1421,27 +1421,21 @@ let OtherPredicates = [HasPackedD16VMem] in {
   defm : MUBUF_LoadIntrinsicPat;
 } // End HasPackedD16VMem.
 
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_LoadIntrinsicPat;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_LoadIntrinsicPat;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_LoadIntrinsicPat;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_LoadIntrinsicPat;
+}
 
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
@@ -1532,27 +1526,21 @@ let OtherPredicates = [HasPackedD16VMem] in {
   defm : MUBUF_StoreIntrinsicPat;
 } // End HasPackedD16VMem.
 
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
-defm : MUBUF_StoreIntrinsicPat;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_StoreIntrinsicPat;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_StoreIntrinsicPat;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_StoreIntrinsicPat;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_StoreIntrinsicPat;
+}
 
 defm : MUBUF_StoreIntrinsicPat;
 defm : MUBUF_StoreIntrinsicPat;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td 
b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index caac7126068ef..a8efe2b2ba35e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -586,7 +586,9 @@ class RegisterTypes<list<ValueType> reg_types> {
 
 def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
 def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, 
p6]>;
-def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, 
v4bf16]>;
+def Reg96Types : RegisterTypes<[v3i32, v3f32]>;
+def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, 
v8bf16]>;
 
 let HasVGPR = 1 in {
 // VOP3 and VINTERP can access 256 lo and 256 hi registers.
@@ -744,7 +746,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, 
i16, f16, bf16, v2i16,
   let BaseClassOrder = 1;
 }
 
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, 
v8f16, v8bf16], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32,
   (add PRIVATE_RSRC_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
@@ -815,7 +817,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, 
i16, f16, bf16, v2i16, v
   let HasSGPR = 1;
 }
 
-def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, 
v4bf16], 32,
+def SGPR_64 : SIRegisterClass<"AMDGPU", Reg64Types.types, 32,
 (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 1;
@@ -905,8 +907,8 @@ multiclass SRegClass
-defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], 
SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>;
+defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>;
 defm "" : SRegClass<5, [v5i32

[llvm-branch-commits] [llvm] AMDGPU: Fix buffer load/store of pointers (PR #95379)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95379

>From 14695322d92821374dd6599d8f0f76d212e50169 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 12 Jun 2024 10:10:20 +0200
Subject: [PATCH] AMDGPU: Fix buffer load/store of pointers

Make sure we test all the address spaces since this support isn't
free in gisel.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  31 +-
 .../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll | 596 ++
 .../llvm.amdgcn.raw.ptr.buffer.store.ll   | 456 ++
 3 files changed, 1071 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 81098201e9c0f..7a36c88b892c8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1112,29 +1112,33 @@ unsigned 
SITargetLowering::getVectorTypeBreakdownForCallingConv(
 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
 }
 
-static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
+ const DataLayout &DL, Type *Ty,
+ unsigned MaxNumLanes) {
   assert(MaxNumLanes != 0);
 
+  LLVMContext &Ctx = Ty->getContext();
if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
-return EVT::getVectorVT(Ty->getContext(),
-EVT::getEVT(VT->getElementType()),
+return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
 NumElts);
   }
 
-  return EVT::getEVT(Ty);
+  return TLI.getValueType(DL, Ty);
 }
 
 // Peek through TFE struct returns to only use the data size.
-static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
+   const DataLayout &DL, Type *Ty,
+   unsigned MaxNumLanes) {
auto *ST = dyn_cast<StructType>(Ty);
   if (!ST)
-return memVTFromLoadIntrData(Ty, MaxNumLanes);
+return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
 
   // TFE intrinsics return an aggregate type.
   assert(ST->getNumContainedTypes() == 2 &&
  ST->getContainedType(1)->isIntegerTy(32));
-  return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
+  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
 }
 
 /// Map address space 7 to MVT::v5i32 because that's its in-memory
@@ -1219,10 +1223,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
 }
 
-Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
+Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
+ CI.getType(), MaxNumLanes);
   } else {
-Info.memVT = memVTFromLoadIntrReturn(
-CI.getType(), std::numeric_limits<unsigned>::max());
+Info.memVT =
+memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+std::numeric_limits<unsigned>::max());
   }
 
   // FIXME: What does alignment mean for an image?
@@ -1235,9 +1241,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   if (RsrcIntr->IsImage) {
 unsigned DMask = 
cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
-Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
+Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
+   DMaskLanes);
   } else
-Info.memVT = EVT::getEVT(DataTy);
+Info.memVT = getValueType(MF.getDataLayout(), DataTy);
 
   Info.flags |= MachineMemOperand::MOStore;
 } else {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 3e3371091ef72..4d557c76dc4d0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -1280,6 +1280,602 @@ define <2 x i64> @buffer_load_v2i64__voffset_add(ptr 
addrspace(8) inreg %rsrc, i
   ret <2 x i64> %data
 }
 
+define ptr @buffer_load_p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 
%voffset) {
+; PREGFX10-LABEL: buffer_load_p0__voffset_add:
+; PREGFX10:   ; %bb.0:
+; PREGFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p0__voffset_add:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:buffer_load_dwordx2 v[0:1], v0, s[4:7

[llvm-branch-commits] [llvm] AMDGPU: Cleanup selection patterns for buffer loads (PR #95378)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits


@@ -1421,27 +1421,21 @@ let OtherPredicates = [HasPackedD16VMem] in {
   defm : MUBUF_LoadIntrinsicPat;
 } // End HasPackedD16VMem.
 
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
-defm : MUBUF_LoadIntrinsicPat;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_LoadIntrinsicPat;
+}

arsenm wrote:

I'm not a big fan of omitting the braces, especially in tablegen. If we're
going to delete the braces, the lines should at least be indented.

https://github.com/llvm/llvm-project/pull/95378


[llvm-branch-commits] [llvm] AMDGPU: Fix buffer load/store of pointers (PR #95379)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

That's what we've traditionally done, and I think we should stop. We currently
skip inserting the casts if the type is legal.

It introduces extra bitcasts, which have a cost and increase pattern-match
complexity. We have a bunch of patterns that don't bother to look through the
casts for a load/store.
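
In IR terms, the difference looks roughly like this (a hypothetical sketch of
the two lowering strategies, not compiler output):

declare i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8), i32, i32, i32)
declare ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8), i32, i32, i32)

define ptr @load_via_cast(ptr addrspace(8) inreg %rsrc, i32 %off) {
  ; Cast-based lowering: an integer load plus a conversion that every
  ; downstream pattern has to look through.
  %i = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) %rsrc, i32 %off, i32 0, i32 0)
  %p = inttoptr i64 %i to ptr
  ret ptr %p
}

define ptr @load_direct(ptr addrspace(8) inreg %rsrc, i32 %off) {
  ; Direct selection: no intermediate cast to match through.
  %p = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) %rsrc, i32 %off, i32 0, i32 0)
  ret ptr %p
}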

https://github.com/llvm/llvm-project/pull/95379


[llvm-branch-commits] [llvm] AMDGPU: Fix buffer load/store of pointers (PR #95379)

2024-06-13 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

I don't think we should be trying to handle the unreasonable illegal types in 
the intrinsics themselves. Theoretically the intrinsic should correspond to 
direct support.

We would handle the ugly types in the fat pointer lowering in terms of the 
intrinsics. 

https://github.com/llvm/llvm-project/pull/95379


[llvm-branch-commits] [llvm] AMDGPU: Fix buffer load/store of pointers (PR #95379)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> On the other hand, it's a lot easier to handle ugly types down in instruction 
> selection, where you get to play much more fast and loose with types.

I think it's mostly easier to do this in the IR 

> 
> And there are buffer uses that don't fit into the fat pointer use case
> where we'd still want them to work. For example, both
> `struct.ptr.buffer.load.v6f16` and `struct.ptr.buffer.load.v3f32` should be a
> `buffer_load_dwordx3`, but I'm pretty sure 6 x half isn't a register type.

Yes, we should just fix this one

> 
> The load and store intrinsics are already overloaded to handle various {8, 
> 16, ..., 128}-bit types, and it seems much cleaner to let it support any type 
> of those lengths. It's just a load/store with somewhat weird indexing 
> semantics, is all.

Splitting is pretty ugly too, especially for a truly arbitrary type in 
legalization.
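
For reference, the two loads from the quoted example would be declared like
this (a sketch; both are 96-bit loads that would ideally select to
buffer_load_dwordx3):

declare <6 x half> @llvm.amdgcn.struct.ptr.buffer.load.v6f16(ptr addrspace(8), i32, i32, i32, i32)
declare <3 x float> @llvm.amdgcn.struct.ptr.buffer.load.v3f32(ptr addrspace(8), i32, i32, i32, i32)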



https://github.com/llvm/llvm-project/pull/95379


[llvm-branch-commits] [llvm] AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat (PR #95394)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits


@@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace 
<"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat
 }
 
 let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
+// FIXME: These do not have signed offsets

arsenm wrote:

Yes, but I was planning on copying the pre-existing bug and fixing them both 
together later (assuming this is actually a bug and there's not some special 
case I haven't found documentation for)

https://github.com/llvm/llvm-project/pull/95394


[llvm-branch-commits] [llvm] AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat (PR #95394)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits


@@ -15931,6 +15931,26 @@ static OptimizationRemark 
emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
  << " operation at memory scope " << MemScope;
 }
 
+static bool isHalf2OrBFloat2(Type *Ty) {

arsenm wrote:

Both instructions were added together. The currently defined feature is 
HasAtomicFlatPkAdd16Insts, so this mirrors that 
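
In IR terms, the two packed cases the predicate matches look like this (a
sketch; the function name is illustrative):

define void @pk_atomic_fadd(ptr %p, <2 x half> %h, <2 x bfloat> %b) {
  ; Both forms map to the packed 16-bit atomic-add instructions gated by
  ; HasAtomicFlatPkAdd16Insts.
  %old.h = atomicrmw fadd ptr %p, <2 x half> %h seq_cst
  %old.b = atomicrmw fadd ptr %p, <2 x bfloat> %b seq_cst
  ret void
}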

https://github.com/llvm/llvm-project/pull/95394


[llvm-branch-commits] [clang] [llvm] clang/AMDGPU: Emit atomicrmw from ds_fadd builtins (PR #95395)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95395

>From b6fa394408069d850c2e074cec64eef8028d7737 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 10 Jun 2024 19:40:59 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from ds_fadd builtins

We should have done this for the f32/f64 case a long time ago. Now that
codegen handles atomicrmw selection for the v2f16/v2bf16 case, start emitting
it instead.

This also does upgrade the behavior to respect a volatile qualified pointer,
which was previously ignored (for the cases that don't have an explicit
volatile argument).
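
In IR terms, the change is roughly the following (a sketch assuming the
seq_cst/system-scope defaults the builtin lowering uses; not exact clang
output):

declare float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3), float, i32, i32, i1)

define float @ds_fadd_lowering(ptr addrspace(3) %p, float %v) {
  ; Before: the target intrinsic, with hardcoded ordering/scope/volatile bits.
  %old.intrin = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %p, float %v, i32 0, i32 0, i1 false)
  ; After: a plain atomicrmw, which codegen now selects.
  %old.rmw = atomicrmw fadd ptr addrspace(3) %p, float %v seq_cst
  ret float %old.rmw
}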
---
 clang/lib/CodeGen/CGBuiltin.cpp   | 113 +++---
 clang/test/CodeGenCUDA/builtins-amdgcn.cu |   2 +-
 .../test/CodeGenCUDA/builtins-spirv-amdgcn.cu |   2 +-
 .../builtins-unsafe-atomics-gfx90a.cu |   5 +-
 ...tins-unsafe-atomics-spirv-amdgcn-gfx90a.cu |   2 +-
 .../test/CodeGenOpenCL/builtins-amdgcn-vi.cl  |  37 +-
 .../builtins-fp-atomics-gfx12.cl  |  14 ++-
 .../CodeGenOpenCL/builtins-fp-atomics-gfx8.cl |   9 +-
 .../builtins-fp-atomics-gfx90a.cl |   4 +-
 .../builtins-fp-atomics-gfx940.cl |  10 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   3 +-
 11 files changed, 139 insertions(+), 62 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 511e1fd4016d7..d81cf40c912de 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18140,9 +18140,35 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value 
*Order, Value *Scope,
 break;
   }
 
+  // Some of the atomic builtins take the scope as a string name.
   StringRef scp;
-  llvm::getConstantStringInfo(Scope, scp);
-  SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
+  if (llvm::getConstantStringInfo(Scope, scp)) {
+SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
+return;
+  }
+
+  // Older builtins had an enum argument for the memory scope.
+  int scope = cast<ConstantInt>(Scope)->getZExtValue();
+  switch (scope) {
+  case 0: // __MEMORY_SCOPE_SYSTEM
+SSID = llvm::SyncScope::System;
+break;
+  case 1: // __MEMORY_SCOPE_DEVICE
+SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+break;
+  case 2: // __MEMORY_SCOPE_WRKGRP
+SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
+break;
+  case 3: // __MEMORY_SCOPE_WVFRNT
+SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+break;
+  case 4: // __MEMORY_SCOPE_SINGLE
+SSID = llvm::SyncScope::SingleThread;
+break;
+  default:
+SSID = llvm::SyncScope::System;
+break;
+  }
 }
 
 llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned 
ICEArguments,
@@ -18558,14 +18584,10 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
 Intrinsic::ID Intrin;
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_ds_faddf:
-  Intrin = Intrinsic::amdgcn_ds_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
   Intrin = Intrinsic::amdgcn_ds_fmin;
   break;
@@ -18656,35 +18678,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
 return Builder.CreateCall(F, {Addr, Val});
   }
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
-Intrinsic::ID IID;
-llvm::Type *ArgTy;
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_ds_fadd;
-  break;
-case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
-  ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_ds_fadd;
-  break;
-case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_ds_fadd;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
-llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
-llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
-llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
-llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
-return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr

[llvm-branch-commits] [llvm] AMDGPU: Create pseudo to real mapping for flat/buffer atomic fmin/fmax (PR #95591)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/95591

The global/flat/buffer atomic fmin/fmax situation is a mess. These
instructions have been renamed 3 times. We currently have
separate pseudos defined for the same opcodes with the different names
(e.g. GLOBAL_ATOMIC_MIN_F64 from gfx90a and GLOBAL_ATOMIC_FMIN_X2 from gfx10).

Use the _FMIN versions as the canonical name for the f32 versions. Use the
_MIN_F64 style as the canonical name for the f64 case. This is because
gfx90a has the most sensible names, but does not have the f32 versions.

Wire through the pseudo to use for the instruction properties vs. the assembly
name, as in other cases. This will simplify directly selecting these from
atomicrmw.
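
In IR terms, the end goal is to select these operations directly (a sketch;
the function name is illustrative):

define double @global_fmin_rmw(ptr addrspace(1) %p, double %v) {
  %old = atomicrmw fmin ptr addrspace(1) %p, double %v seq_cst
  ret double %old
}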

>From b00ad0dab49ad96c160a16c062f35d7788bd77c8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 13 Jun 2024 18:33:11 +0200
Subject: [PATCH] AMDGPU: Create pseudo to real mapping for flat/buffer atomic
 fmin/fmax

The global/flat/buffer atomic fmin/fmax situation is a mess. These
instructions have been renamed 3 times. We currently have
separate pseudos defined for the same opcodes with the different names
(e.g. GLOBAL_ATOMIC_MIN_F64 from gfx90a and GLOBAL_ATOMIC_FMIN_X2 from gfx10).

Use the _FMIN versions as the canonical name for the f32 versions. Use the
_MIN_F64 style as the canonical name for the f64 case. This is because
gfx90a has the most sensible names, but does not have the f32 versions.

Wire through the pseudo to use for the instruction properties vs. the assembly
name, as in other cases. This will simplify directly selecting these from
atomicrmw.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  |   4 +-
 llvm/lib/Target/AMDGPU/BUFInstructions.td | 103 +
 llvm/lib/Target/AMDGPU/FLATInstructions.td| 107 +-
 .../AMDGPU/fp-atomic-to-s_denormmode.mir  |  40 +++
 4 files changed, 129 insertions(+), 125 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d0d7a9dc17247..0a1550ccb53c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1864,7 +1864,9 @@ def HasFlatAddressSpace : 
Predicate<"Subtarget->hasFlatAddressSpace()">,
 
 def HasBufferFlatGlobalAtomicsF64 :
   Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
+  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+
 def HasLdsAtomicAddF64 :
   Predicate<"Subtarget->hasLdsAtomicAddF64()">,
   AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43e5434ea2700..9d21f93a957cc 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1163,12 +1163,6 @@ let SubtargetPredicate = isGFX6GFX7GFX10 in {
 defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
   "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
 >;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmin_x2", VReg_64, f64, null_frag
->;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmax_x2", VReg_64, f64, null_frag
->;
 
 }
 
@@ -1318,6 +1312,9 @@ let SubtargetPredicate = isGFX90APlus in {
 
 let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+
+  // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
+  // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
 } // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
@@ -1763,8 +1760,8 @@ let OtherPredicates = [isGFX6GFX7GFX10Plus] in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
 }
 let SubtargetPredicate = isGFX6GFX7GFX10 in {
-  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_FMIN_X2">;
-  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, 
"BUFFER_ATOMIC_FMAX_X2">;
+  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
+  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, 
"BUFFER_ATOMIC_MAX_F64">;
 }
 
 class NoUseBufferAtomic : PatFrag <
@@ -2315,6 +2312,12 @@ let OtherPredicates = [HasPackedD16VMem] in {
 // Target-specific instruction encodings.
 
//===--===//
 
+// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the
+// specific pseudo (bothen in this case) since

[llvm-branch-commits] [llvm] AMDGPU: Create pseudo to real mapping for flat/buffer atomic fmin/fmax (PR #95591)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/95591
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#95592**
* **#95591** 👈 (this PR)
* **#95590**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/



https://github.com/llvm/llvm-project/pull/95591


[llvm-branch-commits] [llvm] AMDGPU: Start selecting flat/global atomicrmw fmin/fmax. (PR #95592)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/95592
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#95592** 👈 (this PR)
* **#95591**
* **#95590**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/



https://github.com/llvm/llvm-project/pull/95592


[llvm-branch-commits] [llvm] AMDGPU: Start selecting buffer fat pointer atomicrmw fmin/fmax (PR #95593)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/95593
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#95593** 👈 (this PR)
* **#95592**
* **#95591**
* **#95590**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/



https://github.com/llvm/llvm-project/pull/95593


[llvm-branch-commits] [llvm] AMDGPU: Create pseudo to real mapping for flat/buffer atomic fmin/fmax (PR #95591)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/95591


[llvm-branch-commits] [llvm] AMDGPU: Start selecting flat/global atomicrmw fmin/fmax. (PR #95592)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/95592


[llvm-branch-commits] [llvm] AMDGPU: Start selecting buffer fat pointer atomicrmw fmin/fmax (PR #95593)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/95593


[llvm-branch-commits] [clang] [llvm] clang/AMDGPU: Emit atomicrmw from ds_fadd builtins (PR #95395)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95395

>From 0bfa259e0ec5f98261a7f84a8f0fe8248cd0e2fe Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 10 Jun 2024 19:40:59 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from ds_fadd builtins

We should have done this for the f32/f64 case a long time ago. Now that
codegen handles atomicrmw selection for the v2f16/v2bf16 case, start emitting
it instead.

This also does upgrade the behavior to respect a volatile qualified pointer,
which was previously ignored (for the cases that don't have an explicit
volatile argument).
---
 clang/lib/CodeGen/CGBuiltin.cpp   | 113 +++---
 clang/test/CodeGenCUDA/builtins-amdgcn.cu |   2 +-
 .../test/CodeGenCUDA/builtins-spirv-amdgcn.cu |   2 +-
 .../builtins-unsafe-atomics-gfx90a.cu |   5 +-
 ...tins-unsafe-atomics-spirv-amdgcn-gfx90a.cu |   2 +-
 .../test/CodeGenOpenCL/builtins-amdgcn-vi.cl  |  37 +-
 .../builtins-fp-atomics-gfx12.cl  |  14 ++-
 .../CodeGenOpenCL/builtins-fp-atomics-gfx8.cl |   9 +-
 .../builtins-fp-atomics-gfx90a.cl |   4 +-
 .../builtins-fp-atomics-gfx940.cl |  10 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   3 +-
 11 files changed, 139 insertions(+), 62 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 511e1fd4016d7..d81cf40c912de 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18140,9 +18140,35 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value 
*Order, Value *Scope,
 break;
   }
 
+  // Some of the atomic builtins take the scope as a string name.
   StringRef scp;
-  llvm::getConstantStringInfo(Scope, scp);
-  SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
+  if (llvm::getConstantStringInfo(Scope, scp)) {
+SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
+return;
+  }
+
+  // Older builtins had an enum argument for the memory scope.
+  int scope = cast<ConstantInt>(Scope)->getZExtValue();
+  switch (scope) {
+  case 0: // __MEMORY_SCOPE_SYSTEM
+SSID = llvm::SyncScope::System;
+break;
+  case 1: // __MEMORY_SCOPE_DEVICE
+SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+break;
+  case 2: // __MEMORY_SCOPE_WRKGRP
+SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
+break;
+  case 3: // __MEMORY_SCOPE_WVFRNT
+SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+break;
+  case 4: // __MEMORY_SCOPE_SINGLE
+SSID = llvm::SyncScope::SingleThread;
+break;
+  default:
+SSID = llvm::SyncScope::System;
+break;
+  }
 }
 
 llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned 
ICEArguments,
@@ -18558,14 +18584,10 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
 Intrinsic::ID Intrin;
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_ds_faddf:
-  Intrin = Intrinsic::amdgcn_ds_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
   Intrin = Intrinsic::amdgcn_ds_fmin;
   break;
@@ -18656,35 +18678,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
 return Builder.CreateCall(F, {Addr, Val});
   }
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
-Intrinsic::ID IID;
-llvm::Type *ArgTy;
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_ds_fadd;
-  break;
-case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
-  ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_ds_fadd;
-  break;
-case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_ds_fadd;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
-llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
-llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
-llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
-llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
-return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr

[llvm-branch-commits] [clang] [llvm] AMDGPU: Remove ds atomic fadd intrinsics (PR #95396)

2024-06-14 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95396

>From 0ef98ac6c1858ec0e35cb0f1c293d5934f96b3ad Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 10 Jun 2024 19:48:13 +0200
Subject: [PATCH] AMDGPU: Remove ds atomic fadd intrinsics

These have been replaced with atomicrmw fadd
---
 clang/lib/CodeGen/CGBuiltin.cpp   |   2 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   5 -
 llvm/lib/IR/AutoUpgrade.cpp   |  92 --
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   1 -
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 -
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   3 +-
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 -
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp  |   3 -
 llvm/lib/Target/AMDGPU/DSInstructions.td  |  10 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  15 -
 llvm/test/Bitcode/amdgcn-atomic.ll| 136 +
 .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll|  55 
 .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll  | 125 +---
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll  | 279 --
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 102 ---
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll |  97 --
 .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 125 +---
 llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll   |  25 --
 18 files changed, 232 insertions(+), 848 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d81cf40c912de..34d7e59ca45fd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19084,7 +19084,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_fadd_* builtins do not have syncscope/order arguments.
+  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
   SSID = llvm::SyncScope::System;
   AO = AtomicOrdering::SequentiallyConsistent;
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 1d5b360742059..0a4dd7a4725db 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -571,7 +571,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
 
-def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
 def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
 def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
 
@@ -2970,10 +2969,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic {
 // bf16 atomics use v2i16 argument since there is no bf16 data type in the 
llvm.
 def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
 def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
-    [llvm_v2i16_ty],
-    [LLVMQualPointerType<3>, llvm_v2i16_ty],
-    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 
 defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
 def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 2f4b8351e747a..29310ad79ef70 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1033,6 +1033,12 @@ static bool upgradeIntrinsicFunction1(Function *F, 
Function *&NewFn,
 break; // No other 'amdgcn.atomic.*'
   }
 
+  if (Name.starts_with("ds.fadd")) {
+    // Replaced with atomicrmw fadd, so there's no new declaration.
+    NewFn = nullptr;
+    return true;
+  }
+
   if (Name.starts_with("ldexp.")) {
 // Target specific intrinsic became redundant
 NewFn = Intrinsic::getDeclaration(
@@ -2331,40 +2337,74 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, 
CallBase *CI, Function *F,
   llvm_unreachable("Unknown function for ARM CallBase upgrade.");
 }
 
+// These are expected to have the arguments:
+// atomic.intrin (ptr, rmw_value, ordering, scope, isVolatile)
+//
+// Except for int_amdgcn_ds_fadd_v2bf16 which only has (ptr, rmw_value).
+//
 static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
  Function *F, IRBuilder<> &Builder) {
-  const bool IsInc = Name.starts_with("atomic.inc.");
-  if (IsInc || Name.starts_with("atomic.dec.")) {
-    if (CI->getNumOperands() != 6) // Malformed bitcode.
-      return nullptr;
+  AtomicRMWInst::BinOp RMWOp =
+      StringSwitch<AtomicRMWInst::BinOp>(Name)
+          .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
+          .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
+          .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
+
+  unsigned NumOperands = CI->getNumOperands();
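
The net effect of this upgrade path, sketched in LLVM IR (the operand values
are illustrative; the real code decodes the trailing ordering/scope/volatile
operands as described in the comment above):

  ; Before: the removed intrinsic form
  %r0 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %p, float %v,
                                            i32 0, i32 0, i1 false)
  ; After: a plain LDS atomicrmw (the ordering shown is illustrative)
  %r1 = atomicrmw fadd ptr addrspace(3) %p, float %v monotonic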

[llvm-branch-commits] [llvm] AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat (PR #95394)

2024-06-15 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Jun 15, 3:51 AM EDT**: @arsenm started a stack merge that includes this 
pull request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/95394).


https://github.com/llvm/llvm-project/pull/95394
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Fix buffer load/store of pointers (PR #95379)

2024-06-15 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95379

>From c895288fc5ba347b5be14dae8802073f6037e59b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 12 Jun 2024 10:10:20 +0200
Subject: [PATCH] AMDGPU: Fix buffer load/store of pointers

Make sure we test all the address spaces since this support isn't
free in gisel.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  31 +-
 .../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll | 596 ++
 .../llvm.amdgcn.raw.ptr.buffer.store.ll   | 456 ++
 3 files changed, 1071 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ba9259541310a..bcc6122c84beb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1112,29 +1112,33 @@ unsigned 
SITargetLowering::getVectorTypeBreakdownForCallingConv(
 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
 }
 
-static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
+                                 const DataLayout &DL, Type *Ty,
+                                 unsigned MaxNumLanes) {
   assert(MaxNumLanes != 0);
 
+  LLVMContext &Ctx = Ty->getContext();
   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
     unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
-    return EVT::getVectorVT(Ty->getContext(),
-                            EVT::getEVT(VT->getElementType()),
+    return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
                             NumElts);
   }
 
-  return EVT::getEVT(Ty);
+  return TLI.getValueType(DL, Ty);
 }
 
 // Peek through TFE struct returns to only use the data size.
-static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
+                                   const DataLayout &DL, Type *Ty,
+                                   unsigned MaxNumLanes) {
   auto *ST = dyn_cast<StructType>(Ty);
   if (!ST)
-    return memVTFromLoadIntrData(Ty, MaxNumLanes);
+    return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
 
   // TFE intrinsics return an aggregate type.
   assert(ST->getNumContainedTypes() == 2 &&
  ST->getContainedType(1)->isIntegerTy(32));
-  return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
+  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
 }
 
 /// Map address space 7 to MVT::v5i32 because that's its in-memory
@@ -1219,10 +1223,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
 }
 
-      Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
+      Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
+                                           CI.getType(), MaxNumLanes);
     } else {
-      Info.memVT = memVTFromLoadIntrReturn(
-          CI.getType(), std::numeric_limits<unsigned>::max());
+      Info.memVT =
+          memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+                                  std::numeric_limits<unsigned>::max());
   }
 
   // FIXME: What does alignment mean for an image?
@@ -1235,9 +1241,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   if (RsrcIntr->IsImage) {
       unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
       unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
-      Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
+      Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
+                                         DMaskLanes);
     } else
-      Info.memVT = EVT::getEVT(DataTy);
+      Info.memVT = getValueType(MF.getDataLayout(), DataTy);
 
   Info.flags |= MachineMemOperand::MOStore;
 } else {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 3e3371091ef72..4d557c76dc4d0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -1280,6 +1280,602 @@ define <2 x i64> @buffer_load_v2i64__voffset_add(ptr 
addrspace(8) inreg %rsrc, i
   ret <2 x i64> %data
 }
 
+define ptr @buffer_load_p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 
%voffset) {
+; PREGFX10-LABEL: buffer_load_p0__voffset_add:
+; PREGFX10:   ; %bb.0:
+; PREGFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p0__voffset_add:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:buffer_load_dwordx2 v[0:1], v0, s[4:7
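
For reference, the new coverage corresponds to IR along these lines (a
sketch; the intrinsic signature follows the existing raw.ptr.buffer.load
overloads):

  declare ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8), i32, i32, i32 immarg)

  define ptr @load_p0(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
    %p = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) %rsrc,
                                                      i32 %voffset, i32 0, i32 0)
    ret ptr %p
  }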

[llvm-branch-commits] [llvm] AMDGPU: Start selecting flat/global atomicrmw fmin/fmax. (PR #95592)

2024-06-15 Thread Matt Arsenault via llvm-branch-commits


@@ -1582,33 +1603,33 @@ let OtherPredicates = [isGFX12Plus] in {
   }
 }
 
-let OtherPredicates = [isGFX10Plus] in {
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = 
[HasFlatGlobalInsts] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", 
f32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", 
f32>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
-}
-
-let OtherPredicates = [isGFX10GFX11] in {
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", 
"int_amdgcn_global_atomic_fmin", f32>;
 defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", 
"int_amdgcn_global_atomic_fmax", f32>;
+}
 
+let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", 
"int_amdgcn_flat_atomic_fmin", f32>;
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", 
"int_amdgcn_flat_atomic_fmax", f32>;
 }
 
-let OtherPredicates = [isGFX10Only] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", 
"atomic_load_fmin_global", f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", 
"atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", 
"int_amdgcn_global_atomic_fmin", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", 
"int_amdgcn_global_atomic_fmax", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", 
f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", 
f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", 
"int_amdgcn_flat_atomic_fmin", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", 
"int_amdgcn_flat_atomic_fmax", f64>;
-}
+// let OtherPredicates = [isGFX10Only] in { // fixme

arsenm wrote:

We had duplicated pseudos and handling on different sub targets, I unified 
these in #95591 so this should be removable 

https://github.com/llvm/llvm-project/pull/95592
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Create pseudo to real mapping for flat/buffer atomic fmin/fmax (PR #95591)

2024-06-17 Thread Matt Arsenault via llvm-branch-commits


@@ -1608,14 +1598,14 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", 
"int_amdgcn_flat_atomic_fmax
 }
 
 let OtherPredicates = [isGFX10Only] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", 
"atomic_load_fmin_global", f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", 
"atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", 
"int_amdgcn_global_atomic_fmin", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", 
"int_amdgcn_global_atomic_fmax", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN_X2", "atomic_load_fmin_flat", 
f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX_X2", "atomic_load_fmax_flat", 
f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", 
"int_amdgcn_flat_atomic_fmin", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", 
"int_amdgcn_flat_atomic_fmax", f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", 
"atomic_load_fmin_global", f64>;

arsenm wrote:

Yes, there's a lot of predicate junk here that needs to be cleaned up. Most of 
these atomics are imprecisely guarded with isGFX10/11/12 checks even though
they are also available on a subset of gfx9

https://github.com/llvm/llvm-project/pull/95591
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Create pseudo to real mapping for flat/buffer atomic fmin/fmax (PR #95591)

2024-06-17 Thread Matt Arsenault via llvm-branch-commits


@@ -1608,14 +1598,14 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", 
"int_amdgcn_flat_atomic_fmax
 }
 
 let OtherPredicates = [isGFX10Only] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", 
"atomic_load_fmin_global", f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", 
"atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", 
"int_amdgcn_global_atomic_fmin", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", 
"int_amdgcn_global_atomic_fmax", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN_X2", "atomic_load_fmin_flat", 
f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX_X2", "atomic_load_fmax_flat", 
f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", 
"int_amdgcn_flat_atomic_fmin", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", 
"int_amdgcn_flat_atomic_fmax", f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", 
"atomic_load_fmin_global", f64>;

arsenm wrote:

This specific cleanup requires the next patch in the series to start 
decomposing the features to the granularity we need 

https://github.com/llvm/llvm-project/pull/95591
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Create pseudo to real mapping for flat/buffer atomic fmin/fmax (PR #95591)

2024-06-17 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95591

>From 71287fe3e9b31cf09dfc4a0a47e6bc02e40a1445 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 13 Jun 2024 18:33:11 +0200
Subject: [PATCH] AMDGPU: Create pseudo to real mapping for flat/buffer atomic
 fmin/fmax

The global/flat/buffer atomic fmin/fmax situation is a mess. These
instructions have been renamed 3 times. We currently have
separate pseudos defined for the same opcodes with the different names
(e.g. GLOBAL_ATOMIC_MIN_F64 from gfx90a and GLOBAL_ATOMIC_FMIN_X2 from gfx10).

Use the _FMIN versions as the canonical name for the f32 versions. Use the
_MIN_F64 style as the canonical name for the f64 case. This is because
gfx90a has the most sensible names, but does not have the f32 versions.

Wire through the pseudo to use for the instruction properties vs. the assembly
name like in other cases. This will simplify directly selecting these from
atomicrmw.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  |   4 +-
 llvm/lib/Target/AMDGPU/BUFInstructions.td | 103 +
 llvm/lib/Target/AMDGPU/FLATInstructions.td| 107 +-
 .../AMDGPU/fp-atomic-to-s_denormmode.mir  |  40 +++
 4 files changed, 129 insertions(+), 125 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d0d7a9dc17247..0a1550ccb53c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1864,7 +1864,9 @@ def HasFlatAddressSpace : 
Predicate<"Subtarget->hasFlatAddressSpace()">,
 
 def HasBufferFlatGlobalAtomicsF64 :
   Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
+  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+
 def HasLdsAtomicAddF64 :
   Predicate<"Subtarget->hasLdsAtomicAddF64()">,
   AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43e5434ea2700..9d21f93a957cc 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1163,12 +1163,6 @@ let SubtargetPredicate = isGFX6GFX7GFX10 in {
 defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
   "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
 >;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmin_x2", VReg_64, f64, null_frag
->;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmax_x2", VReg_64, f64, null_frag
->;
 
 }
 
@@ -1318,6 +1312,9 @@ let SubtargetPredicate = isGFX90APlus in {
 
 let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+
+  // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
+  // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
 } // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
@@ -1763,8 +1760,8 @@ let OtherPredicates = [isGFX6GFX7GFX10Plus] in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
 }
 let SubtargetPredicate = isGFX6GFX7GFX10 in {
-  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_FMIN_X2">;
-  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, 
"BUFFER_ATOMIC_FMAX_X2">;
+  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
+  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, 
"BUFFER_ATOMIC_MAX_F64">;
 }
 
 class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
@@ -2315,6 +2312,12 @@ let OtherPredicates = [HasPackedD16VMem] in {
 // Target-specific instruction encodings.
 
//===--===//
 
+// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the
+// specific pseudo (bothen in this case) since any of them will work.
+class get_BUF_ps<string name> {
+  string Mnemonic = !cast<BUF_Pseudo>(name # "_OFFSET").Mnemonic;
+}
+
 
//===--===//
 // Base ENC_MUBUF for GFX6, GFX7, GFX10, GFX11.
 
//===--===//
@@ -2346,8 +2349,8 @@ multiclass MUBUF_Real_gfx11 op, string real_name 
= !cast(N
   }
 }
 
-class Base_MUBUF_Real_gfx6_gfx7_gfx10 <bits<8> op, MUBUF_Pseudo ps, int ef> :
-  Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<op, ps, ef> {
+class Base_MUBUF_Real_gfx6_gfx7_gfx10 <bits<8> op, MUBUF_Pseudo ps, int ef, string asmName> :
+  Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<op, ps, ef, asmName> {
   let Inst{12}= ps.offen;
   let Inst{13}= ps.idxen;
   let Inst

[llvm-branch-commits] [llvm] AMDGPU: Fix buffer load/store of pointers (PR #95379)

2024-06-17 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95379

>From cad588c3cb951121a7a2ee4024839b0f40ecd147 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 12 Jun 2024 10:10:20 +0200
Subject: [PATCH] AMDGPU: Fix buffer load/store of pointers

Make sure we test all the address spaces since this support isn't
free in gisel.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  31 +-
 .../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll | 596 ++
 .../llvm.amdgcn.raw.ptr.buffer.store.ll   | 456 ++
 3 files changed, 1071 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d9a163ded6bab..1faccd7e061ce 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1112,29 +1112,33 @@ unsigned 
SITargetLowering::getVectorTypeBreakdownForCallingConv(
 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
 }
 
-static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
+                                 const DataLayout &DL, Type *Ty,
+                                 unsigned MaxNumLanes) {
   assert(MaxNumLanes != 0);
 
+  LLVMContext &Ctx = Ty->getContext();
   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
     unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
-    return EVT::getVectorVT(Ty->getContext(),
-                            EVT::getEVT(VT->getElementType()),
+    return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
                             NumElts);
   }
 
-  return EVT::getEVT(Ty);
+  return TLI.getValueType(DL, Ty);
 }
 
 // Peek through TFE struct returns to only use the data size.
-static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
+                                   const DataLayout &DL, Type *Ty,
+                                   unsigned MaxNumLanes) {
   auto *ST = dyn_cast<StructType>(Ty);
   if (!ST)
-    return memVTFromLoadIntrData(Ty, MaxNumLanes);
+    return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
 
   // TFE intrinsics return an aggregate type.
   assert(ST->getNumContainedTypes() == 2 &&
  ST->getContainedType(1)->isIntegerTy(32));
-  return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
+  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
 }
 
 /// Map address space 7 to MVT::v5i32 because that's its in-memory
@@ -1219,10 +1223,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
 }
 
-      Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
+      Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
+                                           CI.getType(), MaxNumLanes);
     } else {
-      Info.memVT = memVTFromLoadIntrReturn(
-          CI.getType(), std::numeric_limits<unsigned>::max());
+      Info.memVT =
+          memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+                                  std::numeric_limits<unsigned>::max());
   }
 
   // FIXME: What does alignment mean for an image?
@@ -1235,9 +1241,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
   if (RsrcIntr->IsImage) {
       unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
       unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
-      Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
+      Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
+                                         DMaskLanes);
     } else
-      Info.memVT = EVT::getEVT(DataTy);
+      Info.memVT = getValueType(MF.getDataLayout(), DataTy);
 
   Info.flags |= MachineMemOperand::MOStore;
 } else {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 3e3371091ef72..4d557c76dc4d0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -1280,6 +1280,602 @@ define <2 x i64> @buffer_load_v2i64__voffset_add(ptr 
addrspace(8) inreg %rsrc, i
   ret <2 x i64> %data
 }
 
+define ptr @buffer_load_p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 
%voffset) {
+; PREGFX10-LABEL: buffer_load_p0__voffset_add:
+; PREGFX10:   ; %bb.0:
+; PREGFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p0__voffset_add:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:buffer_load_dwordx2 v[0:1], v0, s[4:7

[llvm-branch-commits] [llvm] AMDGPU: Create pseudo to real mapping for flat/buffer atomic fmin/fmax (PR #95591)

2024-06-17 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95591

>From a5b973d9b4816fb5097ba005929601639347c657 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Thu, 13 Jun 2024 18:33:11 +0200
Subject: [PATCH] AMDGPU: Create pseudo to real mapping for flat/buffer atomic
 fmin/fmax

The global/flat/buffer atomic fmin/fmax situation is a mess. These
instructions have been renamed 3 times. We currently have
separate pseudos defined for the same opcodes with the different names
(e.g. GLOBAL_ATOMIC_MIN_F64 from gfx90a and GLOBAL_ATOMIC_FMIN_X2 from gfx10).

Use the _FMIN versions as the canonical name for the f32 versions. Use the
_MIN_F64 style as the canonical name for the f64 case. This is because
gfx90a has the most sensible names, but does not have the f32 versions.

Wire through the pseudo to use for the instruction properties vs. the assembly
name like in other cases. This will simplify directly selecting these from
atomicrmw.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  |   4 +-
 llvm/lib/Target/AMDGPU/BUFInstructions.td | 103 +
 llvm/lib/Target/AMDGPU/FLATInstructions.td| 107 +-
 .../AMDGPU/fp-atomic-to-s_denormmode.mir  |  40 +++
 4 files changed, 129 insertions(+), 125 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 65c4abef2bf8a..cb5ceb9959325 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1864,7 +1864,9 @@ def HasFlatAddressSpace : 
Predicate<"Subtarget->hasFlatAddressSpace()">,
 
 def HasBufferFlatGlobalAtomicsF64 :
   Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
+  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+
 def HasLdsAtomicAddF64 :
   Predicate<"Subtarget->hasLdsAtomicAddF64()">,
   AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43e5434ea2700..9d21f93a957cc 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1163,12 +1163,6 @@ let SubtargetPredicate = isGFX6GFX7GFX10 in {
 defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
   "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
 >;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmin_x2", VReg_64, f64, null_frag
->;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmax_x2", VReg_64, f64, null_frag
->;
 
 }
 
@@ -1318,6 +1312,9 @@ let SubtargetPredicate = isGFX90APlus in {
 
 let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+
+  // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
+  // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
 } // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
@@ -1763,8 +1760,8 @@ let OtherPredicates = [isGFX6GFX7GFX10Plus] in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
 }
 let SubtargetPredicate = isGFX6GFX7GFX10 in {
-  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_FMIN_X2">;
-  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, 
"BUFFER_ATOMIC_FMAX_X2">;
+  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
+  defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, 
"BUFFER_ATOMIC_MAX_F64">;
 }
 
 class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
@@ -2315,6 +2312,12 @@ let OtherPredicates = [HasPackedD16VMem] in {
 // Target-specific instruction encodings.
 
//===--===//
 
+// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the
+// specific pseudo (bothen in this case) since any of them will work.
+class get_BUF_ps<string name> {
+  string Mnemonic = !cast<BUF_Pseudo>(name # "_OFFSET").Mnemonic;
+}
+
 
//===--===//
 // Base ENC_MUBUF for GFX6, GFX7, GFX10, GFX11.
 
//===--===//
@@ -2346,8 +2349,8 @@ multiclass MUBUF_Real_gfx11 op, string real_name 
= !cast(N
   }
 }
 
-class Base_MUBUF_Real_gfx6_gfx7_gfx10 <bits<8> op, MUBUF_Pseudo ps, int ef> :
-  Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<op, ps, ef> {
+class Base_MUBUF_Real_gfx6_gfx7_gfx10 <bits<8> op, MUBUF_Pseudo ps, int ef, string asmName> :
+  Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<op, ps, ef, asmName> {
   let Inst{12}= ps.offen;
   let Inst{13}= ps.idxen;
   let Inst

[llvm-branch-commits] [llvm] AMDGPU: Support local atomicrmw fmin/fmax for float/double (PR #95590)

2024-06-17 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/95590
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Start selecting flat/global atomicrmw fmin/fmax. (PR #95592)

2024-06-17 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/95592
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [llvm] AMDGPU: Remove ds atomic fadd intrinsics (PR #95396)

2024-06-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/95396

>From 62be4ae64075ec326ed81e0905f3eb3979ad Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 10 Jun 2024 19:48:13 +0200
Subject: [PATCH] AMDGPU: Remove ds atomic fadd intrinsics

These have been replaced with atomicrmw fadd
---
 clang/lib/CodeGen/CGBuiltin.cpp   |   2 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   5 -
 llvm/lib/IR/AutoUpgrade.cpp   |  92 --
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   1 -
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 -
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   3 +-
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 -
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp  |   3 -
 llvm/lib/Target/AMDGPU/DSInstructions.td  |  10 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  15 -
 llvm/test/Bitcode/amdgcn-atomic.ll| 136 +
 .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll|  55 
 .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll  | 125 +---
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll  | 279 --
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 102 ---
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll |  13 +-
 .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll |  38 ++-
 llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll   |  25 --
 18 files changed, 255 insertions(+), 654 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d81cf40c912de..34d7e59ca45fd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19084,7 +19084,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_fadd_* builtins do not have syncscope/order arguments.
+  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
   SSID = llvm::SyncScope::System;
   AO = AtomicOrdering::SequentiallyConsistent;
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8a5566ae12020..7a5e919fe26e3 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -571,7 +571,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
 
-def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
 def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
 def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
 
@@ -2930,10 +2929,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic {
 // bf16 atomics use v2i16 argument since there is no bf16 data type in the 
llvm.
 def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
 def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
-    [llvm_v2i16_ty],
-    [LLVMQualPointerType<3>, llvm_v2i16_ty],
-    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 
 defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
 def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 2f4b8351e747a..29310ad79ef70 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1033,6 +1033,12 @@ static bool upgradeIntrinsicFunction1(Function *F, 
Function *&NewFn,
 break; // No other 'amdgcn.atomic.*'
   }
 
+  if (Name.starts_with("ds.fadd")) {
+    // Replaced with atomicrmw fadd, so there's no new declaration.
+    NewFn = nullptr;
+    return true;
+  }
+
   if (Name.starts_with("ldexp.")) {
 // Target specific intrinsic became redundant
 NewFn = Intrinsic::getDeclaration(
@@ -2331,40 +2337,74 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, 
CallBase *CI, Function *F,
   llvm_unreachable("Unknown function for ARM CallBase upgrade.");
 }
 
+// These are expected to have the arguments:
+// atomic.intrin (ptr, rmw_value, ordering, scope, isVolatile)
+//
+// Except for int_amdgcn_ds_fadd_v2bf16 which only has (ptr, rmw_value).
+//
 static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
  Function *F, IRBuilder<> &Builder) {
-  const bool IsInc = Name.starts_with("atomic.inc.");
-  if (IsInc || Name.starts_with("atomic.dec.")) {
-    if (CI->getNumOperands() != 6) // Malformed bitcode.
-      return nullptr;
+  AtomicRMWInst::BinOp RMWOp =
+      StringSwitch<AtomicRMWInst::BinOp>(Name)
+          .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
+          .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
+          .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
+
+  unsigned NumOperands = CI->getNumOperands();
+  if (N

[llvm-branch-commits] [llvm] AMDGPU: Handle legal v2bf16 atomicrmw fadd for gfx12 (PR #95930)

2024-06-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/95930

Annoyingly gfx90a/940 support this for global/flat but not buffer.

>From 3bf1ef74e41889bfc70f8795e0fadb6031122286 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 18 Jun 2024 15:48:20 +0200
Subject: [PATCH] AMDGPU: Handle legal v2bf16 atomicrmw fadd for gfx12

Annoyingly gfx90a/940 support this for global/flat but not buffer.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  |  12 +-
 llvm/lib/Target/AMDGPU/BUFInstructions.td |   7 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |   5 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   7 +-
 .../buffer-fat-pointer-atomicrmw-fadd.ll  | 147 ++
 5 files changed, 36 insertions(+), 142 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index cb5ceb9959325..060358cc310bb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -743,6 +743,12 @@ def FeatureAtomicGlobalPkAddBF16Inst : 
SubtargetFeature<"atomic-global-pk-add-bf
  [FeatureFlatGlobalInsts]
 >;
 
+def FeatureAtomicBufferPkAddBF16Inst : 
SubtargetFeature<"atomic-buffer-pk-add-bf16-inst",
+ "HasAtomicBufferPkAddBF16Inst",
+ "true",
+ "Has buffer_atomic_pk_add_bf16 instruction"
+>;
+
 def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts",
   "HasAtomicCSubNoRtnInsts",
   "true",
@@ -1560,6 +1566,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureAtomicFlatPkAdd16Insts,
FeatureAtomicBufferGlobalPkAddF16Insts,
FeatureAtomicGlobalPkAddBF16Inst,
+   FeatureAtomicBufferPkAddBF16Inst,
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeatureExtendedImageInsts,
@@ -2120,7 +2127,10 @@ def HasAtomicBufferGlobalPkAddF16Insts
   AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>;
 def HasAtomicGlobalPkAddBF16Inst
   : Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">,
-  AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>;
+AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>;
+def HasAtomicBufferPkAddBF16Inst
+  : Predicate<"Subtarget->hasAtomicBufferPkAddBF16Inst()">,
+AssemblerPredicate<(all_of FeatureAtomicBufferPkAddBF16Inst)>;
 def HasFlatAtomicFaddF32Inst
   : Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">,
   AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 21335e9b64647..1c0dce0990571 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1241,7 +1241,9 @@ let SubtargetPredicate = isGFX12Plus in {
 defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics <
   "buffer_atomic_cond_sub_u32", VGPR_32, i32
 >;
+}
 
+let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
 let FPAtomic = 1 in
 defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
   "buffer_atomic_pk_add_bf16", VGPR_32, v2bf16
@@ -1735,8 +1737,11 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, 
"BUFFER_ATOMIC_DEC_X2">;
 let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
 defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", 
["noret"]>;
 
-let SubtargetPredicate = isGFX12Plus in {
+let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
   defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2bf16, 
"BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
+}
+
+let SubtargetPredicate = isGFX12Plus in {
   defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, 
"BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>;
 
   let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index db5b467f22389..fb565fbf49804 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
   bool HasAtomicGlobalPkAddBF16Inst = false;
+  bool HasAtomicBufferPkAddBF16Inst = false;
   bool HasFlatAtomicFaddF32Inst = false;
   bool HasDefaultComponentZero = false;
   bool HasDefaultComponentBroadcast = false;
@@ -844,6 +845,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 return HasAtomicGlobalPkAddBF16Inst;
   }
 
+  bool hasAtomicBufferPkAddBF16Inst() const {
+return HasAtomicBufferPkAddBF16Inst;
+  }
+
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 777d4061da877..ef500a4836eb8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16037,9 +16037,10 @@ 
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *R
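
At the IR level, the feature gates direct selection of forms like the
following sketch (addrspace(7) is the buffer fat pointer address space; the
scope and ordering here are illustrative):

  %old = atomicrmw fadd ptr addrspace(7) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst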

[llvm-branch-commits] [llvm] AMDGPU: Handle legal v2bf16 atomicrmw fadd for gfx12 (PR #95930)

2024-06-18 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/95930).
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#95930** (this PR) 👈
* **#95929**
* `main`

This stack of pull requests is managed by Graphite (https://stacking.dev/).

https://github.com/llvm/llvm-project/pull/95930
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle legal v2bf16 atomicrmw fadd for gfx12 (PR #95930)

2024-06-18 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/95930
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Start selecting flat/global atomicrmw fmin/fmax. (PR #95592)

2024-06-18 Thread Matt Arsenault via llvm-branch-commits


@@ -1699,7 +1709,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, list<string> RtnModes = ["ret", "noret"]> {
-  let SubtargetPredicate = HasUnrestrictedSOffset in {
+  let OtherPredicates = [HasUnrestrictedSOffset] in {

arsenm wrote:

SubtargetPredicate is just one thing, and OtherPredicates is a list? The usage 
pattern consistency has been declining for a while 

https://github.com/llvm/llvm-project/pull/95592
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle legal v2bf16 atomicrmw fadd for gfx12 (PR #95930)

2024-06-18 Thread Matt Arsenault via llvm-branch-commits


@@ -1735,8 +1737,11 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, 
"BUFFER_ATOMIC_DEC_X2">;
 let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
 defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", 
["noret"]>;
 
-let SubtargetPredicate = isGFX12Plus in {
+let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
   defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2bf16, 
"BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;

arsenm wrote:

I think these should be inverted, the inner predicate should be the buffer 
thing 

https://github.com/llvm/llvm-project/pull/95930
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle legal v2bf16 atomicrmw fadd for gfx12 (PR #95930)

2024-06-18 Thread Matt Arsenault via llvm-branch-commits


@@ -1735,8 +1737,11 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, 
"BUFFER_ATOMIC_DEC_X2">;
 let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
 defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", 
["noret"]>;
 
-let SubtargetPredicate = isGFX12Plus in {
+let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
   defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2bf16, 
"BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;

arsenm wrote:

I'm not really sure what VBUFFER is, but as far as I can tell the predicates 
are broken anyway. If you do select the non-VBUFFER version, something else 
breaks later and we don't have the negated predicate on the not-VBUFFER 
patterns 

https://github.com/llvm/llvm-project/pull/95930
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads (PR #96162)

2024-06-20 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

I'm still not sure why we have so much in this pass. The load and store 
vectorization should have happened in the IR. This pass originally was for the 
multi offset DS instructions 

https://github.com/llvm/llvm-project/pull/96162
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads (PR #96162)

2024-06-20 Thread Matt Arsenault via llvm-branch-commits


@@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const 
CombineInfo &CI,
   return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
 }
   case S_LOAD_IMM:
-switch (Width) {
-default:
-  return 0;
-case 2:
-  return AMDGPU::S_LOAD_DWORDX2_IMM;
-case 3:
-  return AMDGPU::S_LOAD_DWORDX3_IMM;
-case 4:
-  return AMDGPU::S_LOAD_DWORDX4_IMM;
-case 8:
-  return AMDGPU::S_LOAD_DWORDX8_IMM;
+// For targets that support XNACK replay, use the constrained load opcode.
+if (STI && STI->hasXnackReplay()) {
+  switch (Width) {

arsenm wrote:

One switch and move the condition inside each size case?
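
A sketch of that shape (getSLoadImmOpcode is a hypothetical helper name; the
_ec opcode names are assumed from this PR's constrained-load variants):

  static unsigned getSLoadImmOpcode(const GCNSubtarget &STI, unsigned Width) {
    // One switch; the XNACK-replay condition is folded into each size case.
    bool Constrained = STI.hasXnackReplay();
    switch (Width) {
    default:
      return 0;
    case 2:
      return Constrained ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                         : AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3:
      return Constrained ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                         : AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:
      return Constrained ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                         : AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return Constrained ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                         : AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  }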

https://github.com/llvm/llvm-project/pull/96162
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads (PR #96162)

2024-06-20 Thread Matt Arsenault via llvm-branch-commits


@@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const 
CombineInfo &CI,
   return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
 }
   case S_LOAD_IMM:
-switch (Width) {
-default:
-  return 0;
-case 2:
-  return AMDGPU::S_LOAD_DWORDX2_IMM;
-case 3:
-  return AMDGPU::S_LOAD_DWORDX3_IMM;
-case 4:
-  return AMDGPU::S_LOAD_DWORDX4_IMM;
-case 8:
-  return AMDGPU::S_LOAD_DWORDX8_IMM;
+// For targets that support XNACK replay, use the constrained load opcode.
+if (STI && STI->hasXnackReplay()) {

arsenm wrote:

STI should never be null. The conservative default would be to assume ec if it 
were possible 

https://github.com/llvm/llvm-project/pull/96162
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Codegen support for constrained multi-dword sloads (PR #96163)

2024-06-21 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm commented:

In a separate patch, we should add a verifier check that you used the correct 
tied version depending on whether xnack is enabled or not 

https://github.com/llvm/llvm-project/pull/96163
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Codegen support for constrained multi-dword sloads (PR #96163)

2024-06-21 Thread Matt Arsenault via llvm-branch-commits


@@ -867,13 +867,104 @@ def SMRDBufferImm   : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
 
+class SMRDAlignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+  // Returns true if it is a naturally aligned multi-dword load.
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  unsigned Size = Ld->getMemoryVT().getStoreSize();
+  return (Size <= 4) || (Ld->getAlign().value() >= PowerOf2Ceil(Size));

arsenm wrote:

Shouldn't need the PowerOf2Ceil? 

https://github.com/llvm/llvm-project/pull/96163
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Codegen support for constrained multi-dword sloads (PR #96163)

2024-06-21 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm commented:

Another strategy that might simplify the patterns is to always select the _ec 
versions, and then later swap to the non-ec versions if xnack is disabled 

https://github.com/llvm/llvm-project/pull/96163
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Start selecting buffer fat pointer atomicrmw fmin/fmax (PR #95593)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **Jun 23, 4:06 AM EDT**: @arsenm started a stack merge that includes this 
pull request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/95593).


https://github.com/llvm/llvm-project/pull/95593
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add a subtarget feature for fine-grained remote memory support (PR #96442)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96442

Atomic access to fine-grained remote memory does not work on all
subtargets. Add a feature for targets where this is expected to work.

>From 2da0565165d97cded02896b18a9d7f8083474da9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 22:02:36 +0200
Subject: [PATCH] AMDGPU: Add a subtarget feature for fine-grained remote
 memory support

Atomic access to fine-grained remote memory does not work on all
subtargets. Add a feature for targets where this is expected to work.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 14 --
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  8 
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 5bc0fe8bba608..7ff861f5b144d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,14 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+  : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
+  "HasAgentScopeFineGrainedRemoteMemoryAtomics",
+  "true",
+  "Agent (device) scoped atomic operations not directly supported by "
+  "PCIe work for allocations in host or peer PCIe device memory"
+>;
+
 def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero",
   "HasDefaultComponentZero",
   "true",
@@ -1207,7 +1215,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
FeatureMaxHardClauseLength32,
-   FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
+   FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   ]
 >;
 
@@ -1415,7 +1424,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureBackOffBarrier,
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
-   FeatureAtomicFMinFMaxF64FlatInsts
+   FeatureAtomicFMinFMaxF64FlatInsts,
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 966708db4f37c..c40efbdcf7f0b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -174,6 +174,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicBufferPkAddBF16Inst = false;
   bool HasFlatAtomicFaddF32Inst = false;
   bool HasDefaultComponentZero = false;
+  bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
   bool HasDefaultComponentBroadcast = false;
   /// The maximum number of instructions that may be placed within an S_CLAUSE,
   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
@@ -871,6 +872,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if atomic operations targeting fine-grained memory work
+  /// correctly at device scope, in allocations in host or peer PCIe device
+  /// memory.
+  bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
+return HasAgentScopeFineGrainedRemoteMemoryAtomics;
+  }
+
   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
 
   bool hasDefaultComponentBroadcast() const {
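
One way the new query would be consumed (a hypothetical sketch, not part of
this patch) is in the atomic expansion decision, e.g. in
SITargetLowering::shouldExpandAtomicRMWInIR:

  // If the subtarget cannot perform agent-scope atomics on fine-grained
  // remote memory, conservatively expand the operation to a CAS loop
  // instead of selecting the hardware atomic.
  if (!Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
    return AtomicExpansionKind::CmpXChg;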

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96443

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.

>From f99d34b66486a17e2fe70d372d67fbabde82d5fb Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for global atomic fadd denormal
 support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 13 +++--
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7ff861f5b144d..5f798b4391704 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1425,7 +1432,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1628,7 +1636,8 @@ def FeatureISAVersion12 : FeatureSet<
FeatureVGPRSingleUseHintInsts,
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
-   FeatureMaxHardClauseLength32]>;
+   FeatureMaxHardClauseLength32,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index c40efbdcf7f0b..674d84422538f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.
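
For context (not part of the patch), the new bit can be exercised from a
test via the feature name given in the SubtargetFeature definition above; a
hypothetical RUN line for an .ll test:

; RUN: llc -mtriple=amdgcn -mcpu=gfx90a \
; RUN:   -mattr=+memory-atomic-fadd-f32-denormal-support < %s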

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/96443).
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96443** 👈 (this PR)
* **#96442**
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/



https://github.com/llvm/llvm-project/pull/96443
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add a subtarget feature for fine-grained remote memory support (PR #96442)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/96442).
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96443**
* **#96442** 👈 (this PR)
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/



https://github.com/llvm/llvm-project/pull/96442
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96444

None

>From a663c429ebe7a05754c771d67856a7cdb20cc11d Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 5f798b4391704..fe3cd75d81009 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFaddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasAtomicMemoryAtomicFaddF32DenormalSupport",
@@ -1388,7 +1395,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 674d84422538f..922435c5efaa6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -174,6 +174,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicGlobalPkAddBF16Inst = false;
   bool HasAtomicBufferPkAddBF16Inst = false;
   bool HasFlatAtomicFaddF32Inst = false;
+  bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
   bool HasDefaultComponentZero = false;
   bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
   bool HasDefaultComponentBroadcast = false;
@@ -873,6 +874,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target has flat, global, and buffer atomic fadd for
+  /// double.
+  bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
+return HasFlatBufferGlobalAtomicFaddF64Inst;
+  }
+
   /// \return true if the target's flat, global, and buffer atomic fadd for
   /// float supports denormal handling.
   bool hasMemoryAtomicFaddF32DenormalSupport() const {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index eec750e5b8251..6b5ba160d6402 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16028,7 +16028,7 @@ 
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   return AtomicExpansionKind::CmpXChg;
 
 // global and flat atomic fadd f64: gfx90a, gfx940.
-if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
+if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
   return ReportUnsafeHWInst(AtomicExpansionKind::None);
 
 if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
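
For illustration (a sketch of mine, not part of the patch), here is the
kind of IR the renamed query now gates, an f64 atomic fadd on a global
pointer:

define double @global_fadd_f64(ptr addrspace(1) %p, double %v) {
  ; Selected as a hardware f64 atomic add only when
  ; hasFlatBufferGlobalAtomicFaddF64Inst() holds (gfx90a, gfx940).
  %r = atomicrmw fadd ptr addrspace(1) %p, double %v syncscope("agent") monotonic, align 8
  ret double %r
}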

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/96444).
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96444** 👈 (this PR)
* **#96443**
* **#96442**
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/



https://github.com/llvm/llvm-project/pull/96444
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add a subtarget feature for fine-grained remote memory support (PR #96442)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96442
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96443
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96444
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add a subtarget feature for fine-grained remote memory support (PR #96442)

2024-06-23 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> Hmm, I might need to reparse this but I'm not entirely sure how this is 
> expected to work in general since PCI-E attachment is not always / entirely 
> knowable at compile time. Have I missed some preamble discussion here?

We do statically know for some of the targets (mostly gfx12 and gfx940) that
it's supposed to work. These are the "scope downgrade" vs. "nop" cases in the
atomic support table.
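
For illustration, a minimal IR sketch (mine, not from these patches) of the
kind of operation the feature describes: an agent-scope floating-point
atomic that may target a fine-grained host or peer-device allocation. The
names are made up.

define float @agent_scope_fadd(ptr %p, float %v) {
  ; Device ("agent") scope RMW; the feature asserts this still functions
  ; when %p points at fine-grained remote memory.
  %r = atomicrmw fadd ptr %p, float %v syncscope("agent") monotonic, align 4
  ret float %r
}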

https://github.com/llvm/llvm-project/pull/96442
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add a subtarget feature for fine-grained remote memory support (PR #96442)

2024-06-24 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> Actually not, we do not know the bus. Moreover, we know this is opposite.

On gfx940/gfx12, we don't need to know the bus to handle the agent-scope case
correctly. The instructions still function, just always at device scope. We do
need to know the bus and/or location to handle the system scope correctly;
that information will come from the new metadata.
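
In IR terms, a short sketch of the distinction (illustrative, not from the
patch):

define void @scope_cases(ptr %p, float %v) {
  ; Agent scope: statically fine on gfx940/gfx12, regardless of the bus.
  %a = atomicrmw fadd ptr %p, float %v syncscope("agent") monotonic, align 4
  ; System (default) scope: correctness may depend on where %p lives, which
  ; is what the planned metadata is meant to convey.
  %s = atomicrmw fadd ptr %p, float %v monotonic, align 4
  ret void
}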

https://github.com/llvm/llvm-project/pull/96442
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-24 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96443

>From f99d34b66486a17e2fe70d372d67fbabde82d5fb Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH 1/2] AMDGPU: Add subtarget feature for global atomic fadd
 denormal support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 13 +++--
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7ff861f5b144d..5f798b4391704 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1425,7 +1432,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1628,7 +1636,8 @@ def FeatureISAVersion12 : FeatureSet<
FeatureVGPRSingleUseHintInsts,
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
-   FeatureMaxHardClauseLength32]>;
+   FeatureMaxHardClauseLength32,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index c40efbdcf7f0b..674d84422538f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.

>From 45941357740eb44a7c5828bf7c534aef65448226 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 12:10:37 +0200
Subject: [PATCH 2/2] Add to gfx11.

RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm"
though I'm not sure I trust it.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 5f798b4391704..0ec65f759bc35 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1545,7 +1545,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-24 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From 80c3f71f03d3b2ccbcd418d76d417f2a243fdbe4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH 1/2] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 0ec65f759bc35..028c54d8d94d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFaddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasAtomicMemoryAtomicFaddF32DenormalSupport",
@@ -1388,7 +1395,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 674d84422538f..922435c5efaa6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -174,6 +174,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicGlobalPkAddBF16Inst = false;
   bool HasAtomicBufferPkAddBF16Inst = false;
   bool HasFlatAtomicFaddF32Inst = false;
+  bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
   bool HasDefaultComponentZero = false;
   bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
   bool HasDefaultComponentBroadcast = false;
@@ -873,6 +874,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target has flat, global, and buffer atomic fadd for
+  /// double.
+  bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
+return HasFlatBufferGlobalAtomicFaddF64Inst;
+  }
+
   /// \return true if the target's flat, global, and buffer atomic fadd for
   /// float supports denormal handling.
   bool hasMemoryAtomicFaddF32DenormalSupport() const {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index eec750e5b8251..6b5ba160d6402 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16028,7 +16028,7 @@ 
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   return AtomicExpansionKind::CmpXChg;
 
 // global and flat atomic fadd f64: gfx90a, gfx940.
-if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
+if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
   return ReportUnsafeHWInst(AtomicExpansionKind::None);
 
 if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {

>From c1354032fc55234ffddf9136f17f5ee400c01c16 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 15:42:17 +0200
Subject: [PATCH 2/2] Add to gfx940

---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 028c54d8d94d2..3ed68a259ca15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1441,7 +1441,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFaddF32DenormalSupport
+   FeatureMemoryAtomicFaddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add a subtarget feature for fine-grained remote memory support (PR #96442)

2024-06-24 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

No, the string description mentions this is for the non-PCIe-supported cases.

https://github.com/llvm/llvm-project/pull/96442
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add a subtarget feature for fine-grained remote memory support (PR #96442)

2024-06-24 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> need some tests

This does nothing as it is. The real use patches have thousands of tests.

https://github.com/llvm/llvm-project/pull/96442
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-24 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> It is worse than that. It behaves differently depending on where atomic is 
> executed. There is no single answer if this instruction supports denorms or 
> not.

That doesn't matter. The flat case that sometimes flushes is just a "no".
Flushing is never a guarantee; we only need to know that a flush may happen.
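
As a sketch of why "may flush" is the conservative answer (my example; the
attribute spelling follows LLVM's denormal-fp-math-f32 convention):

define void @needs_ieee_denormals(ptr addrspace(1) %p, float %v)
    "denormal-fp-math-f32"="ieee,ieee" {
  ; A hardware atomic that may flush denormals cannot implement this fadd,
  ; so it can only be selected on targets that guarantee no flush.
  %r = atomicrmw fadd ptr addrspace(1) %p, float %v monotonic, align 4
  ret void
}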

https://github.com/llvm/llvm-project/pull/96443
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-24 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From baaf96125e8f913a161f1c13216618a3de128182 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 21 ++---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td |  6 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 10 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 0ec65f759bc35..9aaeaf73287d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFaddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasAtomicMemoryAtomicFaddF32DenormalSupport",
@@ -1388,7 +1395,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
@@ -1433,7 +1441,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFaddF32DenormalSupport
+   FeatureMemoryAtomicFaddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1928,11 +1937,9 @@ def isGFX12Plus :
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
-
-def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
-  Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
-  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasFlatBufferGlobalAtomicFaddF64Inst :
+  Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
+  AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
 
 def HasAtomicFMinFMaxF32GlobalInsts :
   Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 3b8d94b744000..a904c8483dbf5 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in {
   }
 } // End SubtargetPredicate = isGFX90APlus
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+}
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
   let SubtargetPredicate = isGFX940Plus;
@@ -1836,9 +1838,9 @@ let SubtargetPredicate = 
HasAtomicBufferGlobalPkAddF16Insts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
 } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, 
"BUFFER_ATOMIC_ADD_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..89946a4719557 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructi

[llvm-branch-commits] [llvm] AMDGPU: Add a subtarget feature for fine-grained remote memory support (PR #96442)

2024-06-25 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96442

>From 1a441c05eb510f3310604594b2687ddf90e884fe Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 22:02:36 +0200
Subject: [PATCH] AMDGPU: Add a subtarget feature for fine-grained remote
 memory support

Atomic access to fine-grained remote memory does not work on all
subtargets. Add a feature for targets where this is expected to work.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 16 ++--
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  8 
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 5bc0fe8bba608..4d2faacaa915b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,16 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+  : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
+  "HasAgentScopeFineGrainedRemoteMemoryAtomics",
+  "true",
+  "Agent (device) scoped atomic operations, excluding those directly "
+  "supported by PCIe (i.e. integer atomic add, exchange, and "
+  "compare-and-swap), are functional for allocations in host or peer "
+  "device memory."
+>;
+
 def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero",
   "HasDefaultComponentZero",
   "true",
@@ -1207,7 +1217,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
FeatureMaxHardClauseLength32,
-   FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
+   FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   ]
 >;
 
@@ -1415,7 +1426,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureBackOffBarrier,
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
-   FeatureAtomicFMinFMaxF64FlatInsts
+   FeatureAtomicFMinFMaxF64FlatInsts,
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 966708db4f37c..c40efbdcf7f0b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -174,6 +174,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicBufferPkAddBF16Inst = false;
   bool HasFlatAtomicFaddF32Inst = false;
   bool HasDefaultComponentZero = false;
+  bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
   bool HasDefaultComponentBroadcast = false;
   /// The maximum number of instructions that may be placed within an S_CLAUSE,
   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
@@ -871,6 +872,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if atomic operations targeting fine-grained memory work
+  /// correctly at device scope, in allocations in host or peer PCIe device
+  /// memory.
+  bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
+return HasAgentScopeFineGrainedRemoteMemoryAtomics;
+  }
+
   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
 
   bool hasDefaultComponentBroadcast() const {

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-25 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96443

>From 975c64c4becd802f9d0038fa79f46e4e94623691 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH 1/2] AMDGPU: Add subtarget feature for global atomic fadd
 denormal support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 13 +++--
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 4d2faacaa915b..9e8c36af97cf2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1630,7 +1638,8 @@ def FeatureISAVersion12 : FeatureSet<
FeatureVGPRSingleUseHintInsts,
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
-   FeatureMaxHardClauseLength32]>;
+   FeatureMaxHardClauseLength32,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index c40efbdcf7f0b..674d84422538f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.

>From 3ec4e64a5cb5a3fe7904fa608c20a774bdc11b2b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 12:10:37 +0200
Subject: [PATCH 2/2] Add to gfx11.

RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm"
though I'm not sure I trust it.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 9e8c36af97cf2..3d0d18e59a8c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-25 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From e948fe9cd65450f109ff02a7f1a033afeb04 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 21 ++---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td |  6 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 10 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 3d0d18e59a8c3..c6d2645e48b2b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFaddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasAtomicMemoryAtomicFaddF32DenormalSupport",
@@ -1390,7 +1397,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
@@ -1435,7 +1443,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFaddF32DenormalSupport
+   FeatureMemoryAtomicFaddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1930,11 +1939,9 @@ def isGFX12Plus :
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
-
-def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
-  Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
-  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasFlatBufferGlobalAtomicFaddF64Inst :
+  Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
+  AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
 
 def HasAtomicFMinFMaxF32GlobalInsts :
   Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 3b8d94b744000..a904c8483dbf5 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in {
   }
 } // End SubtargetPredicate = isGFX90APlus
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+}
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
   let SubtargetPredicate = isGFX940Plus;
@@ -1836,9 +1838,9 @@ let SubtargetPredicate = 
HasAtomicBufferGlobalPkAddF16Insts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
 } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, 
"BUFFER_ATOMIC_ADD_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..89946a4719557 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructi

[llvm-branch-commits] [llvm] AMDGPU: Add a subtarget feature for fine-grained remote memory support (PR #96442)

2024-06-25 Thread Matt Arsenault via llvm-branch-commits


@@ -788,6 +788,14 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+  : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
+  "HasAgentScopeFineGrainedRemoteMemoryAtomics",
+  "true",
+  "Agent (device) scoped atomic operations not directly supported by "

arsenm wrote:

I dropped the last PCIe reference since XGMI should also work.

https://github.com/llvm/llvm-project/pull/96442
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-25 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96443

>From 78edc216186854e3320ec5e16b78a26af19dee66 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH 1/2] AMDGPU: Add subtarget feature for global atomic fadd
 denormal support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 14 --
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 56ec5e9c4cfc2..6b212e1b2af03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1631,7 +1639,9 @@ def FeatureISAVersion12 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
-   Feature1_5xVGPRs]>;
+   Feature1_5xVGPRs,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   ]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 9e2a316a9ed28..db0b2b67a0388 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.

>From 47017c26844bc49a9842b2c40056392184119943 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 12:10:37 +0200
Subject: [PATCH 2/2] Add to gfx11.

RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm"
though I'm not sure I trust it.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 6b212e1b2af03..39a1d629a4aea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen
@@ -1640,7 +1641,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
Feature1_5xVGPRs,
-   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-25 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From 7648917e1a8f14940f31add840201d3413abbbdb Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 21 ++---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td |  6 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 10 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 39a1d629a4aea..97dd45df91da2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFaddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasAtomicMemoryAtomicFaddF32DenormalSupport",
@@ -1390,7 +1397,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
@@ -1435,7 +1443,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFaddF32DenormalSupport
+   FeatureMemoryAtomicFaddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1932,11 +1941,9 @@ def isGFX12Plus :
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
-
-def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
-  Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
-  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasFlatBufferGlobalAtomicFaddF64Inst :
+  Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
+  AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
 
 def HasAtomicFMinFMaxF32GlobalInsts :
   Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 3b8d94b744000..a904c8483dbf5 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in {
   }
 } // End SubtargetPredicate = isGFX90APlus
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+}
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
   let SubtargetPredicate = isGFX940Plus;
@@ -1836,9 +1838,9 @@ let SubtargetPredicate = 
HasAtomicBufferGlobalPkAddF16Insts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
 } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, 
"BUFFER_ATOMIC_ADD_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..89946a4719557 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructi

[llvm-branch-commits] [llvm] AMDGPU: Remove ds_fmin/ds_fmax intrinsics (PR #96739)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96739

These have been replaced with atomicrmw.
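
For reference, a sketch of the migration, following the AutoUpgrade mapping
in the patch (the intrinsic spelling and names here are illustrative):

; Before, with the removed intrinsic (signature per the old
; AMDGPULDSIntrin class):
;   %old = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %p,
;            float %v, i32 0, i32 0, i1 false)
; After: an atomicrmw on an LDS (addrspace(3)) pointer.
define float @lds_fmin(ptr addrspace(3) %p, float %v) {
  %r = atomicrmw fmin ptr addrspace(3) %p, float %v monotonic, align 4
  ret float %r
}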

>From e95c252f91dea9dbb89711eb3b851fcfe6555f7c Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 11 Jun 2024 11:46:15 +0200
Subject: [PATCH] AMDGPU: Remove ds_fmin/ds_fmax intrinsics

These have been replaced with atomicrmw.
---
 llvm/docs/ReleaseNotes.rst|  5 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 14 -
 llvm/lib/IR/AutoUpgrade.cpp   |  8 ++-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 32 
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |  3 --
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |  2 -
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp  | 20 +--
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 15 +-
 llvm/test/Bitcode/amdgcn-atomic.ll| 52 +++
 9 files changed, 65 insertions(+), 86 deletions(-)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 76356dd76f1d2..7644da2b78bd7 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -132,6 +132,11 @@ Changes to the AMDGPU Backend
 
 * Implemented :ref:`llvm.get.rounding ` and 
:ref:`llvm.set.rounding `
 
+* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and
+  ``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the
+  :ref:`atomicrmw ` instruction with `fadd`, `fmin` and
+  `fmax` with addrspace(3) instead.
+
 Changes to the ARM Backend
 --
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 11662ccc1a695..2aa52ef99aaf8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -523,17 +523,6 @@ def int_amdgcn_fmad_ftz :
 [IntrNoMem, IntrSpeculatable]
 >;
 
-class AMDGPULDSIntrin :
-  Intrinsic<[llvm_any_ty],
-[LLVMQualPointerType<3>,
-LLVMMatchType<0>,
-llvm_i32_ty, // ordering
-llvm_i32_ty, // scope
-llvm_i1_ty], // isVolatile
-[IntrArgMemOnly, IntrWillReturn, NoCapture>,
- ImmArg>, ImmArg>, ImmArg>, 
IntrNoCallback, IntrNoFree]
->;
-
 // FIXME: The m0 argument should be moved after the normal arguments
 class AMDGPUDSOrderedIntrinsic : Intrinsic<
   [llvm_i32_ty],
@@ -571,9 +560,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
 
-def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
-def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
-
 } // TargetPrefix = "amdgcn"
 
 // New-style image intrinsics
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index d7825d9b3e3e5..32076a07d30e7 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1033,8 +1033,10 @@ static bool upgradeIntrinsicFunction1(Function *F, 
Function *&NewFn,
 break; // No other 'amdgcn.atomic.*'
   }
 
-  if (Name.starts_with("ds.fadd")) {
-// Replaced with atomicrmw fadd, so there's no new declaration.
+  if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
+  Name.starts_with("ds.fmax")) {
+// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
+// declaration.
 NewFn = nullptr;
 return true;
   }
@@ -2347,6 +2349,8 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, 
CallBase *CI,
   AtomicRMWInst::BinOp RMWOp =
   StringSwitch(Name)
   .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
+  .StartsWith("ds.fmin", AtomicRMWInst::FMin)
+  .StartsWith("ds.fmax", AtomicRMWInst::FMax)
   .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
   .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 4b48091b7143e..83a5933ceaed6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5401,35 +5401,6 @@ bool 
AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
   return true;
 }
 
-static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
-  switch (IID) {
-  case Intrinsic::amdgcn_ds_fmin:
-return AMDGPU::G_ATOMICRMW_FMIN;
-  case Intrinsic::amdgcn_ds_fmax:
-return AMDGPU::G_ATOMICRMW_FMAX;
-  default:
-llvm_unreachable("not a DS FP intrinsic");
-  }
-}
-
-bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
-  MachineInstr &MI,
-  Intrinsic::ID IID) const 
{
-  GISelChangeObserver &Observer = Helper.Observer;
-  Observer.changingInstr(MI);
-
-  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
-
-  // The remaining operands were used to set fields in the MemOperand on
-  // construction.
-  for (int I = 6; I > 3; --I
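
For users migrating off the removed intrinsics, here is a minimal LLVM IR
sketch of the replacement described in the release note above (a sketch, not
the exact auto-upgrade output; the ordering and syncscope that fit depend on
the original code, and %p/%v are illustrative names):

  ; Before: removed DS intrinsic
  %r0 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %p, float %v, i32 0, i32 0, i1 false)

  ; After: plain atomicrmw on an addrspace(3) pointer
  %r1 = atomicrmw fmin ptr addrspace(3) %p, float %v syncscope("workgroup") seq_cst, align 4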

[llvm-branch-commits] [llvm] AMDGPU: Remove ds_fmin/ds_fmax intrinsics (PR #96739)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite:
> https://app.graphite.dev/github/pr/llvm/llvm-project/96739?utm_source=stack-comment-downstack-mergeability-warning
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96739** 👈
* **#96738**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/96739
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Remove ds_fmin/ds_fmax intrinsics (PR #96739)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96739
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96443

>From 78edc216186854e3320ec5e16b78a26af19dee66 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH 1/3] AMDGPU: Add subtarget feature for global atomic fadd
 denormal support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 14 --
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 56ec5e9c4cfc2..6b212e1b2af03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1631,7 +1639,9 @@ def FeatureISAVersion12 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
-   Feature1_5xVGPRs]>;
+   Feature1_5xVGPRs,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   ]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 9e2a316a9ed28..db0b2b67a0388 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.

>From 47017c26844bc49a9842b2c40056392184119943 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 12:10:37 +0200
Subject: [PATCH 2/3] Add to gfx11.

RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm"
though I'm not sure I trust it.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 6b212e1b2af03..39a1d629a4aea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen
@@ -1640,7 +1641,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
Feature1_5xVGPRs,
-   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<

>From 23ec97c971fb5a93a39908da6e652899830dcb4e Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 11:30:51 +0200
Subject: [PATCH 3/3] Rename

---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 39a1d629a4aea..34c6f6ff19bff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -78
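
As a sketch of how the new subtarget feature surfaces at the IR level, a
function can opt in through the usual "target-features" attribute, which the
atomicrmw fadd lowering can then consult. The feature string is from the
patch above; the function and value names are made up:

  define void @fadd_f32_example(ptr addrspace(1) %p, float %v) #0 {
    %unused = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") monotonic, align 4
    ret void
  }
  attributes #0 = { "target-features"="+memory-atomic-fadd-f32-denormal-support" }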

[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From db519863301bd95fe0d50b56d74584b0f7f2fbf6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 21 ++---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td |  6 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 10 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 34c6f6ff19bff..84ea040477763 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFAddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasMemoryAtomicFaddF32DenormalSupport",
@@ -1390,7 +1397,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
@@ -1435,7 +1443,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFAddF32DenormalSupport
+   FeatureMemoryAtomicFAddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1932,11 +1941,9 @@ def isGFX12Plus :
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
-
-def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
-  Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
-  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasFlatBufferGlobalAtomicFaddF64Inst :
+  Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
+  AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
 
 def HasAtomicFMinFMaxF32GlobalInsts :
   Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 3b8d94b744000..a904c8483dbf5 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in {
   }
 } // End SubtargetPredicate = isGFX90APlus
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+}
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
   let SubtargetPredicate = isGFX940Plus;
@@ -1836,9 +1838,9 @@ let SubtargetPredicate = 
HasAtomicBufferGlobalPkAddF16Insts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
 } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, 
"BUFFER_ATOMIC_ADD_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..89946a4719557 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td

[llvm-branch-commits] [llvm] AMDGPU: Handle new atomicrmw metadata for fadd case (PR #96760)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite:
> https://app.graphite.dev/github/pr/llvm/llvm-project/96760?utm_source=stack-comment-downstack-mergeability-warning
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96760** 👈
* **#96759**
* **#96444**
* **#96443**
* **#96442**
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/96760
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering (PR #96759)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite:
> https://app.graphite.dev/github/pr/llvm/llvm-project/96759?utm_source=stack-comment-downstack-mergeability-warning
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96760**
* **#96759** 👈
* **#96444**
* **#96443**
* **#96442**
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/96759
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering (PR #96759)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96759
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle new atomicrmw metadata for fadd case (PR #96760)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96760
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Remove ds_fmin/ds_fmax intrinsics (PR #96739)

2024-06-26 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96739

>From 401d82fb69592c8715e6ffa367ffdedd923746ae Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 11 Jun 2024 11:46:15 +0200
Subject: [PATCH] AMDGPU: Remove ds_fmin/ds_fmax intrinsics

These have been replaced with atomicrmw.
---
 llvm/docs/ReleaseNotes.rst|  5 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 14 -
 llvm/lib/IR/AutoUpgrade.cpp   |  8 ++-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 32 
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |  3 --
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |  2 -
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp  | 20 +--
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 15 +-
 llvm/test/Bitcode/amdgcn-atomic.ll| 52 +++
 9 files changed, 65 insertions(+), 86 deletions(-)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 76356dd76f1d2..7644da2b78bd7 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -132,6 +132,11 @@ Changes to the AMDGPU Backend
 
 * Implemented :ref:`llvm.get.rounding ` and 
:ref:`llvm.set.rounding `
 
+* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and
+  ``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the
+  :ref:`atomicrmw ` instruction with `fadd`, `fmin`, or
+  `fmax` on addrspace(3) pointers instead.
+
 Changes to the ARM Backend
 --
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 11662ccc1a695..2aa52ef99aaf8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -523,17 +523,6 @@ def int_amdgcn_fmad_ftz :
 [IntrNoMem, IntrSpeculatable]
 >;
 
-class AMDGPULDSIntrin :
-  Intrinsic<[llvm_any_ty],
-[LLVMQualPointerType<3>,
-LLVMMatchType<0>,
-llvm_i32_ty, // ordering
-llvm_i32_ty, // scope
-llvm_i1_ty], // isVolatile
-[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
- ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]
->;
-
 // FIXME: The m0 argument should be moved after the normal arguments
 class AMDGPUDSOrderedIntrinsic : Intrinsic<
   [llvm_i32_ty],
@@ -571,9 +560,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
 
-def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
-def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
-
 } // TargetPrefix = "amdgcn"
 
 // New-style image intrinsics
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index d7825d9b3e3e5..32076a07d30e7 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1033,8 +1033,10 @@ static bool upgradeIntrinsicFunction1(Function *F, 
Function *&NewFn,
 break; // No other 'amdgcn.atomic.*'
   }
 
-  if (Name.starts_with("ds.fadd")) {
-// Replaced with atomicrmw fadd, so there's no new declaration.
+  if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
+  Name.starts_with("ds.fmax")) {
+// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
+// declaration.
 NewFn = nullptr;
 return true;
   }
@@ -2347,6 +2349,8 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, 
CallBase *CI,
   AtomicRMWInst::BinOp RMWOp =
   StringSwitch<AtomicRMWInst::BinOp>(Name)
   .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
+  .StartsWith("ds.fmin", AtomicRMWInst::FMin)
+  .StartsWith("ds.fmax", AtomicRMWInst::FMax)
   .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
   .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 4b48091b7143e..83a5933ceaed6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5401,35 +5401,6 @@ bool 
AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
   return true;
 }
 
-static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
-  switch (IID) {
-  case Intrinsic::amdgcn_ds_fmin:
-return AMDGPU::G_ATOMICRMW_FMIN;
-  case Intrinsic::amdgcn_ds_fmax:
-return AMDGPU::G_ATOMICRMW_FMAX;
-  default:
-llvm_unreachable("not a DS FP intrinsic");
-  }
-}
-
-bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
-  MachineInstr &MI,
-  Intrinsic::ID IID) const 
{
-  GISelChangeObserver &Observer = Helper.Observer;
-  Observer.changingInstr(MI);
-
-  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
-
-  // The remaining operands were used to set fields in the MemOperand on
-  // construction.
-  for (int I = 6; I > 3; --I)
-MI.removeOperand(I);
-
-  MI.remove

[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96443

>From f29955ee4dfb3319d0ea99187d2cc24587c9e716 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH 1/3] AMDGPU: Add subtarget feature for global atomic fadd
 denormal support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 14 --
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 56ec5e9c4cfc2..6b212e1b2af03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1631,7 +1639,9 @@ def FeatureISAVersion12 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
-   Feature1_5xVGPRs]>;
+   Feature1_5xVGPRs,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   ]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 9e2a316a9ed28..db0b2b67a0388 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.

>From 2eed4079db56e5983166e5fb0e55ae3d80594f19 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 12:10:37 +0200
Subject: [PATCH 2/3] Add to gfx11.

RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm"
though I'm not sure I trust it.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 6b212e1b2af03..39a1d629a4aea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen
@@ -1640,7 +1641,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
Feature1_5xVGPRs,
-   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<

>From b57b67e84ee062f993d39fc33d248d2cb73e2c6a Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 11:30:51 +0200
Subject: [PATCH 3/3] Rename

---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 39a1d629a4aea..34c6f6ff19bff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -78

[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From 36cbbdfaa31c6313c96a9c908bade1e6f7debc5b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 21 ++---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td |  6 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 10 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 34c6f6ff19bff..84ea040477763 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFAddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasMemoryAtomicFaddF32DenormalSupport",
@@ -1390,7 +1397,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
@@ -1435,7 +1443,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFAddF32DenormalSupport
+   FeatureMemoryAtomicFAddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1932,11 +1941,9 @@ def isGFX12Plus :
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
-
-def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
-  Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
-  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasFlatBufferGlobalAtomicFaddF64Inst :
+  Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
+  AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
 
 def HasAtomicFMinFMaxF32GlobalInsts :
   Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 3b8d94b744000..a904c8483dbf5 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in {
   }
 } // End SubtargetPredicate = isGFX90APlus
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+}
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
   let SubtargetPredicate = isGFX940Plus;
@@ -1836,9 +1838,9 @@ let SubtargetPredicate = 
HasAtomicBufferGlobalPkAddF16Insts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
 } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, 
"BUFFER_ATOMIC_ADD_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..89946a4719557 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td

[llvm-branch-commits] [llvm] AMDGPU: Remove ds_fmin/ds_fmax intrinsics (PR #96739)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96739

>From 864e3bbfc5f40bfb1e87f7689ede0d5f33aa42da Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 11 Jun 2024 11:46:15 +0200
Subject: [PATCH] AMDGPU: Remove ds_fmin/ds_fmax intrinsics

These have been replaced with atomicrmw.
---
 llvm/docs/ReleaseNotes.rst|5 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |   14 -
 llvm/lib/IR/AutoUpgrade.cpp   |8 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   32 -
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |3 -
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |2 -
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp  |   20 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   15 +-
 llvm/test/Bitcode/amdgcn-atomic.ll|   52 +
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll  |  371 -
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll  |  279 
 .../CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll| 1418 -
 12 files changed, 65 insertions(+), 2154 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 416b3952f1ac4..ed7d252668850 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -132,6 +132,11 @@ Changes to the AMDGPU Backend
 
 * Implemented :ref:`llvm.get.rounding ` and 
:ref:`llvm.set.rounding `
 
+* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and
+  ``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the
+  :ref:`atomicrmw ` instruction with `fadd`, `fmin`, or
+  `fmax` on addrspace(3) pointers instead.
+
 Changes to the ARM Backend
 --
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d040aa8f38278..71b1e832bde3c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -523,17 +523,6 @@ def int_amdgcn_fmad_ftz :
 [IntrNoMem, IntrSpeculatable]
 >;
 
-class AMDGPULDSIntrin :
-  Intrinsic<[llvm_any_ty],
-[LLVMQualPointerType<3>,
-LLVMMatchType<0>,
-llvm_i32_ty, // ordering
-llvm_i32_ty, // scope
-llvm_i1_ty], // isVolatile
-[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
- ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]
->;
-
 // FIXME: The m0 argument should be moved after the normal arguments
 class AMDGPUDSOrderedIntrinsic : Intrinsic<
   [llvm_i32_ty],
@@ -571,9 +560,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
 
-def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
-def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
-
 } // TargetPrefix = "amdgcn"
 
 // New-style image intrinsics
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index d7825d9b3e3e5..32076a07d30e7 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1033,8 +1033,10 @@ static bool upgradeIntrinsicFunction1(Function *F, 
Function *&NewFn,
 break; // No other 'amdgcn.atomic.*'
   }
 
-  if (Name.starts_with("ds.fadd")) {
-// Replaced with atomicrmw fadd, so there's no new declaration.
+  if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
+  Name.starts_with("ds.fmax")) {
+// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
+// declaration.
 NewFn = nullptr;
 return true;
   }
@@ -2347,6 +2349,8 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, 
CallBase *CI,
   AtomicRMWInst::BinOp RMWOp =
   StringSwitch<AtomicRMWInst::BinOp>(Name)
   .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
+  .StartsWith("ds.fmin", AtomicRMWInst::FMin)
+  .StartsWith("ds.fmax", AtomicRMWInst::FMax)
   .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
   .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index f1254b2e9e1d2..dc165d65fa6ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5401,35 +5401,6 @@ bool 
AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
   return true;
 }
 
-static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
-  switch (IID) {
-  case Intrinsic::amdgcn_ds_fmin:
-return AMDGPU::G_ATOMICRMW_FMIN;
-  case Intrinsic::amdgcn_ds_fmax:
-return AMDGPU::G_ATOMICRMW_FMAX;
-  default:
-llvm_unreachable("not a DS FP intrinsic");
-  }
-}
-
-bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
-  MachineInstr &MI,
- 

[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for global atomic fadd denormal support (PR #96443)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96443

>From eaa00157741d5e4f134df22ed27a80fe3d853e6e Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 16:44:08 +0200
Subject: [PATCH 1/3] AMDGPU: Add subtarget feature for global atomic fadd
 denormal support

Not sure what the behavior for gfx90a is. The SPG says it always flushes.
The instruction documentation says it does not.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 14 --
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  7 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 56ec5e9c4cfc2..6b212e1b2af03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureMemoryAtomicFaddF32DenormalSupport
+  : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
+  "HasAtomicMemoryAtomicFaddF32DenormalSupport",
+  "true",
+  "global/flat/buffer atomic fadd for float supports denormal handling"
+>;
+
 def FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics",
   "HasAgentScopeFineGrainedRemoteMemoryAtomics",
@@ -1427,7 +1434,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureKernargPreload,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
-   FeatureAgentScopeFineGrainedRemoteMemoryAtomics
+   FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1631,7 +1639,9 @@ def FeatureISAVersion12 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
-   Feature1_5xVGPRs]>;
+   Feature1_5xVGPRs,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   ]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 9e2a316a9ed28..db0b2b67a0388 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
+  bool HasAtomicMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
@@ -872,6 +873,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  /// \return true if the target's flat, global, and buffer atomic fadd for
+  /// float supports denormal handling.
+  bool hasMemoryAtomicFaddF32DenormalSupport() const {
+return HasAtomicMemoryAtomicFaddF32DenormalSupport;
+  }
+
   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.

>From 84c8e017f521236c51a75a275c24f87dc919fd4b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Mon, 24 Jun 2024 12:10:37 +0200
Subject: [PATCH 2/3] Add to gfx11.

RDNA 3 manual says "Floating-point addition handles NAN/INF/denorm"
though I'm not sure I trust it.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 6b212e1b2af03..39a1d629a4aea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1547,7 +1547,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureMemoryAtomicFaddF32DenormalSupport]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen
@@ -1640,7 +1641,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureMaxHardClauseLength32,
Feature1_5xVGPRs,
-   FeatureMemoryAtomicFaddF32DenormalSupport]>;
+   FeatureMemoryAtomicFaddF32DenormalSupport
]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<

>From 5a627920d5c77a3b1d9b9ec1ddef1aa31fa1cf09 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 11:30:51 +0200
Subject: [PATCH 3/3] Rename

---
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 10 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 39a1d629a4aea..34c6f6ff19bff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -78

[llvm-branch-commits] [llvm] AMDGPU: Add subtarget feature for memory atomic fadd f64 (PR #96444)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/96444

>From 0381e27b091f0cb6558fb9b4bf3e5359655acab0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sun, 23 Jun 2024 17:07:53 +0200
Subject: [PATCH] AMDGPU: Add subtarget feature for memory atomic fadd f64

---
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 21 ++---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 10 ++
 llvm/lib/Target/AMDGPU/FLATInstructions.td |  6 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 10 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 34c6f6ff19bff..84ea040477763 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -788,6 +788,13 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureFlatBufferGlobalAtomicFaddF64Inst
+  : SubtargetFeature<"flat-buffer-global-fadd-f64-inst",
+  "HasFlatBufferGlobalAtomicFaddF64Inst",
+  "true",
+  "Has flat, buffer, and global instructions for f64 atomic fadd"
+>;
+
 def FeatureMemoryAtomicFAddF32DenormalSupport
   : SubtargetFeature<"memory-atomic-fadd-f32-denormal-support",
   "HasMemoryAtomicFaddF32DenormalSupport",
@@ -1390,7 +1397,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
  FeatureBackOffBarrier,
  FeatureKernargPreload,
  FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureFlatBufferGlobalAtomicFaddF64Inst
  ])>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
@@ -1435,7 +1443,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics,
-   FeatureMemoryAtomicFAddF32DenormalSupport
+   FeatureMemoryAtomicFAddF32DenormalSupport,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
@@ -1932,11 +1941,9 @@ def isGFX12Plus :
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
-
-def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
-  Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
-  // FIXME: This is too coarse, and working around using pseudo's predicates 
on real instruction.
-  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, 
FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasFlatBufferGlobalAtomicFaddF64Inst :
+  Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">,
+  AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>;
 
 def HasAtomicFMinFMaxF32GlobalInsts :
   Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 3b8d94b744000..a904c8483dbf5 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1312,14 +1312,16 @@ let SubtargetPredicate = isGFX90APlus in {
   }
 } // End SubtargetPredicate = isGFX90APlus
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", 
VReg_64, f64>;
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
   defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", 
VReg_64, f64>;
   defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", 
VReg_64, f64>;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+}
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
   let SubtargetPredicate = isGFX940Plus;
@@ -1836,9 +1838,9 @@ let SubtargetPredicate = 
HasAtomicBufferGlobalPkAddF16Insts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
 } // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
 
-let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
+let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, 
"BUFFER_ATOMIC_ADD_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..89946a4719557 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96872

Need to emit syncscope and new metadata to get the native instruction,
most of the time.

>From bd298a4cb7aaa7f287da0654c8a530e378f0362a Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Tue, 11 Jun 2024 10:58:44 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for
 __builtin_amdgcn_global_atomic_fadd_{f32|f64}

Need to emit syncscope and new metadata to get the native instruction,
most of the time.
---
 clang/lib/CodeGen/CGBuiltin.cpp   | 39 +--
 .../CodeGenOpenCL/builtins-amdgcn-gfx11.cl|  2 +-
 .../builtins-fp-atomics-gfx12.cl  |  4 +-
 .../builtins-fp-atomics-gfx90a.cl |  4 +-
 .../builtins-fp-atomics-gfx940.cl |  4 +-
 5 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 96dcf6283f9f8..d90762748d925 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18654,8 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18667,18 +18666,11 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   ArgTy = llvm::FixedVectorType::get(
   llvm::Type::getHalfTy(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -19091,7 +19083,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19107,6 +19101,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
@@ -19133,8 +19129,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
   EmitScalarExpr(E->getArg(3)), AO, SSID);
 } else {
-  // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-  SSID = llvm::SyncScope::System;
+  // Most of the builtins do not have syncscope/order arguments. For DS
+  // atomics the scope doesn't really matter, as they implicitly operate at
+  // workgroup scope.
+  //
+  // The global/flat cases need to use agent scope to consistently produce
+  // the native instruction instead of a cmpxchg expansion.
+  SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19149,6 +19150,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
 if (Volatile)
   RMW->setVolatile(true);
+
+unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+if (AddrSpace != llvm::AMDGPUAS::L
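
Concretely, the f32/f64 global fadd builtins now emit IR along these lines,
a sketch based on the CHECK lines in the updated tests (the metadata node
number and value names are illustrative):

  %r = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0

  !0 = !{}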

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins (PR #96873)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96873

None

>From 65a690d80cf39df132cacff510371c9dcb1b97fd Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:12:59 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from
 {global|flat}_atomic_fadd_v2f16 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 20 ++-
 .../builtins-fp-atomics-gfx12.cl  |  9 ++---
 .../builtins-fp-atomics-gfx90a.cl |  2 +-
 .../builtins-fp-atomics-gfx940.cl |  3 ++-
 4 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d90762748d925..2a1861e4413fd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18655,22 +18655,15 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_global_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmin;
   break;
@@ -18690,11 +18683,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   ArgTy = llvm::Type::getFloatTy(getLLVMContext());
   IID = Intrinsic::amdgcn_flat_atomic_fadd;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 2);
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19085,7 +19073,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19103,6 +19093,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 21c1c38bc78dc..cf304d7b0818a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -48,7 +48,8 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr 
%{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX12-LABEL:  test_flat_add_2f16
 // GFX12: flat_atomic_pk_add_f16
 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
@@ -64,7 +65,8 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_global_add_half2
-// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr 
addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> 
%{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory 
!{{[0-9]+$}}
+
 // GFX12-LABEL:  test_global_add_half2
 // GFX12:  global_atomic_pk_add_f16 v2, v[0:1], v2, off th
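
For context, a minimal sketch of what this change means for users of the
v2f16 builtins (the function name is illustrative, not from the patch; the
IR shape follows the updated CHECK lines above):

  // Sketch only, OpenCL C. Assumes the half2 typedef used in the test file.
  typedef half __attribute__((ext_vector_type(2))) half2;

  half2 sketch_flat_add_2f16(__generic half2 *addr, half2 x) {
    // Now lowers through the generic atomicrmw path instead of a call to
    // the amdgcn_flat_atomic_fadd intrinsic. Expected IR, per the tests:
    //   %r = atomicrmw fadd ptr %addr, <2 x half> %x
    //        syncscope("agent") seq_cst, align 4,
    //        !amdgpu.no.fine.grained.memory !N
    return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x);
  }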

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64} builtins (PR #96874)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96874

None

>From 9347154207e5a8d75755b11813b870b207fd125a Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:15:26 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64}
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp | 17 ++---
 .../CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl |  6 --
 .../CodeGenOpenCL/builtins-fp-atomics-gfx940.cl |  3 ++-
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2a1861e4413fd..54e363d6fd0e8 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18657,10 +18657,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   }
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 Intrinsic::ID IID;
 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
 switch (BuiltinID) {
@@ -18670,19 +18668,12 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_global_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmin;
   break;
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
   IID = Intrinsic::amdgcn_flat_atomic_fmax;
   break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-  IID = Intrinsic::amdgcn_flat_atomic_fadd;
-  break;
 }
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
 llvm::Value *Val = EmitScalarExpr(E->getArg(1));
@@ -19075,7 +19066,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19095,6 +19088,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 4980c44215743..60a3033a36c17 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -45,7 +45,8 @@ void test_global_max_f64(__global double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_add_local_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr 
addrspace(3) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8{{$}}
+
 // GFX90A-LABEL:  test_flat_add_local_f64$local
 // GFX90A:  ds_add_rtn_f64
 void test_flat_add_local_f64(__local double *addr, double x){
@@ -54,7 +55,8 @@ void test_flat_add_local_f64(__local double *addr, double x){
 }
 
 // CHECK-LABEL: test_flat_global_add_f64
-// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_flat_global_add_f64$local
 // GFX90A:  global_atomic_add_f64
 void test_flat_global_add_f64(__global double *addr, double x){
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index a4f438bea33a6..2618e2809fbbf 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -10,7 +10,8 @@ typedef half  __attribute__((ext_vector_type(2)))
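
For context, a minimal sketch of the new lowering for the flat f32/f64 fadd
builtins (function name illustrative; the IR shape follows the gfx90a test
expectations above, with the address space and metadata depending on the
inferred pointer):

  // Sketch only, OpenCL C: both builtins now reach the atomicrmw path.
  double sketch_flat_add_f64(__generic double *addr, double x) {
    // Expected IR, per the pattern in the tests above:
    //   %r = atomicrmw fadd ptr %addr, double %x
    //        syncscope("agent") seq_cst, align 8,
    //        !amdgpu.no.fine.grained.memory !N
    return __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
  }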

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/96872
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96873**
* **#96872** 👈
* **#96760**
* **#96759**
* **#96444**
* **#96443**
* **#96442**
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/96872


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (PR #96875)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96875

None

>From 94d04eb6576b811e11175ca36a340649a63bf007 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 19:34:43 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16
 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 26 ++-
 .../builtins-fp-atomics-gfx12.cl  | 18 ++---
 .../builtins-fp-atomics-gfx940.cl | 10 +--
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 54e363d6fd0e8..4bbb4375ee997 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18681,22 +18681,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
 return Builder.CreateCall(F, {Addr, Val});
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
-Intrinsic::ID IID;
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
-  IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19068,7 +19052,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19090,6 +19076,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
 }
@@ -19126,7 +19114,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   AO = AtomicOrdering::SequentiallyConsistent;
 
   // The v2bf16 builtin uses i16 instead of a natural bfloat type.
-  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
+  if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
+  BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
 llvm::Type *V2BF16Ty = FixedVectorType::get(
 llvm::Type::getBFloatTy(Builder.getContext()), 2);
 Val = Builder.CreateBitCast(Val, V2BF16Ty);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index cf304d7b0818a..8d6bb948b0a7a 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
 
 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> 
[[BC0]] syncscope("agent") seq_cst, align 4
+// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x 
bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
 
 // GFX12-LABEL:  test_local_add_2bf16
@@ -57,7 +57,10 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_flat_add_2bf16
-// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, 
<2 x i16> %{{.*}})
+// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC]] 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
+
 // GFX12-LABEL:  test_flat_add_2bf16
 // GFX12: flat_atomic_pk_
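
Because the v2bf16 builtins are declared with <2 x i16> rather than a native
bfloat vector type, the emitted sequence bitcasts around the atomicrmw, as in
this sketch (function name illustrative; shape per the updated CHECK lines
above):

  // Sketch only, OpenCL C. short2 stands in for <2 x i16>, as in the tests.
  typedef short __attribute__((ext_vector_type(2))) short2;

  short2 sketch_flat_add_2bf16(__generic short2 *addr, short2 x) {
    // Expected IR, per the tests above:
    //   %bc  = bitcast <2 x i16> %x to <2 x bfloat>
    //   %rmw = atomicrmw fadd ptr %addr, <2 x bfloat> %bc
    //          syncscope("agent") seq_cst, align 4,
    //          !amdgpu.no.fine.grained.memory !N
    //   %res = bitcast <2 x bfloat> %rmw to <2 x i16>
    return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
  }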

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from flat_atomic_{f32|f64} builtins (PR #96874)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/96874
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96874** 👈
* **#96873**
* **#96872**
* **#96760**
* **#96759**
* **#96444**
* **#96443**
* **#96442**
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/96874


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins (PR #96873)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/96873
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96874**
* **#96873** 👈
* **#96872**
* **#96760**
* **#96759**
* **#96444**
* **#96443**
* **#96442**
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/96873


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/96876

None

>From 0e174a54c24c70343a0e28c6ca053ab4bbbae3d2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Wed, 26 Jun 2024 23:18:32 +0200
Subject: [PATCH] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max
 f64 builtins

---
 clang/lib/CodeGen/CGBuiltin.cpp   | 42 +++
 .../builtins-fp-atomics-gfx90a.cl | 18 +---
 2 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4bbb4375ee997..a3115bfa4d230 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18655,32 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
 return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
-Intrinsic::ID IID;
-llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
-switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_global_atomic_fmax;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmin;
-  break;
-case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
-  IID = Intrinsic::amdgcn_flat_atomic_fmax;
-  break;
-}
-llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Value *Val = EmitScalarExpr(E->getArg(1));
-llvm::Function *F =
-CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
-return Builder.CreateCall(F, {Addr, Val});
-  }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19054,7 +19028,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
-  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
 llvm::AtomicRMWInst::BinOp BinOp;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19080,6 +19058,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
   BinOp = llvm::AtomicRMWInst::FAdd;
   break;
+case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  BinOp = llvm::AtomicRMWInst::FMin;
+  break;
+case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
+case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  BinOp = llvm::AtomicRMWInst::FMax;
+  break;
 }
 
 Address Ptr = CheckAtomicAlignment(*this, E);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl 
b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 60a3033a36c17..cfc5adc57bf5e 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -27,7 +27,8 @@ void test_global_add_half2(__global half2 *addr, half2 x) {
 }
 
 // CHECK-LABEL: test_global_global_min_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmin ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_global_min_f64$local
 // GFX90A:  global_atomic_min_f64
 void test_global_global_min_f64(__global double *addr, double x){
@@ -36,7 +37,8 @@ void test_global_global_min_f64(__global double *addr, double 
x){
 }
 
 // CHECK-LABEL: test_global_max_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr 
addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fmax ptr addrspace(1) {{.+}}, double %{{.+}} 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
 // GFX90A-LABEL:  test_global_max_f64$local
 // GFX90A:  global_atomic_max_f64
 void tes
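
With this change the f64 min/max builtins map onto atomicrmw fmin/fmax
rather than the target intrinsics. A minimal sketch (function name
illustrative; IR shape per the updated gfx90a CHECK lines above):

  // Sketch only, OpenCL C: assumes the __global double * signature used
  // in the tests above.
  double sketch_global_max_f64(__global double *addr, double x) {
    // Expected IR, per the tests above:
    //   %r = atomicrmw fmax ptr addrspace(1) %addr, double %x
    //        syncscope("agent") seq_cst, align 8,
    //        !amdgpu.no.fine.grained.memory !N
    return __builtin_amdgcn_global_atomic_fmax_f64(addr, x);
  }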

[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins (PR #96875)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/96875
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96876**
* **#96875** 👈
* **#96874**
* **#96873**
* **#96872**
* **#96760**
* **#96759**
* **#96444**
* **#96443**
* **#96442**
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/96875


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins (PR #96876)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/96876
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#96876** 👈
* **#96875**
* **#96874**
* **#96873**
* **#96872**
* **#96760**
* **#96759**
* **#96444**
* **#96443**
* **#96442**
* **#95930**
* **#95929**
* `main`

This stack of pull requests is managed by Graphite. Learn more about
stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/96876


[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)

2024-06-27 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/96872


  1   2   3   4   5   6   7   >