[PATCH] D87947: [AMDGPU] Make ds fp atomics overloadable

Stanislav Mekhanoshin via Phabricator via cfe-commits Wed, 23 Sep 2020 11:41:24 -0700

This revision was automatically updated to reflect the committed changes.
Closed by commit rG59691dc8740c: [AMDGPU] Make ds fp atomics overloadable 
(authored by rampitec).
Herald added a project: clang.
Herald added a subscriber: cfe-commits.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D87947/new/

https://reviews.llvm.org/D87947

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenCUDA/builtins-amdgcn.cu
  clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll

Index: llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll
+++ llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
-declare float @llvm.amdgcn.ds.fadd(float addrspace(3)* nocapture, float, i32, i32, i1)
-declare float @llvm.amdgcn.ds.fmin(float addrspace(3)* nocapture, float, i32, i32, i1)
-declare float @llvm.amdgcn.ds.fmax(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
 
 ; GCN-LABEL: {{^}}lds_ds_fadd:
 ; VI-DAG: s_mov_b32 m0
@@ -19,9 +19,9 @@
   %shl1 = shl i32 %idx.add, 4
   %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
   %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  %a1 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
   store float %a3, float addrspace(1)* %out
   ret void
 }
@@ -40,9 +40,9 @@
   %shl1 = shl i32 %idx.add, 4
   %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
   %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  %a1 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
   store float %a3, float addrspace(1)* %out
   ret void
 }
@@ -61,9 +61,9 @@
   %shl1 = shl i32 %idx.add, 4
   %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
   %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  %a1 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
   store float %a3, float addrspace(1)* %out
   ret void
 }
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -397,11 +397,10 @@
 def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
 def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
 
-class AMDGPULDSF32Intrin<string clang_builtin> :
-  GCCBuiltin<clang_builtin>,
-  Intrinsic<[llvm_float_ty],
-    [LLVMQualPointerType<llvm_float_ty, 3>,
-    llvm_float_ty,
+class AMDGPULDSIntrin :
+  Intrinsic<[llvm_any_ty],
+    [LLVMQualPointerType<LLVMMatchType<0>, 3>,
+    LLVMMatchType<0>,
     llvm_i32_ty, // ordering
     llvm_i32_ty, // scope
     llvm_i1_ty], // isVolatile
@@ -446,9 +445,9 @@
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
 
-def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
-def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
-def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
+def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
+def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
+def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
 
 } // TargetPrefix = "amdgcn"
 
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -114,19 +114,19 @@
 }
 
 // CHECK-LABEL: @test_ds_fadd
-// CHECK: call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
 void test_ds_faddf(local float *out, float src) {
   *out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fmin
-// CHECK: call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
 void test_ds_fminf(local float *out, float src) {
   *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fmax
-// CHECK: call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
 void test_ds_fmaxf(local float *out, float src) {
   *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
 }
Index: clang/test/CodeGenCUDA/builtins-amdgcn.cu
===================================================================
--- clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -10,7 +10,7 @@
 }
 
 // CHECK-LABEL: @_Z12test_ds_fmaxf(
-// CHECK: call contract float @llvm.amdgcn.ds.fmax(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
+// CHECK: call contract float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
 __global__
 void test_ds_fmax(float src) {
   __shared__ float shared;
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -14746,6 +14746,32 @@
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
+  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+    Intrinsic::ID Intrin;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+      Intrin = Intrinsic::amdgcn_ds_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+      Intrin = Intrinsic::amdgcn_ds_fmin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+      Intrin = Intrinsic::amdgcn_ds_fmax;
+      break;
+    }
+    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
+    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
+    llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
+    llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
+    llvm::FunctionType *FTy = F->getFunctionType();
+    llvm::Type *PTy = FTy->getParamType(0);
+    Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
+    return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
+  }
   case AMDGPU::BI__builtin_amdgcn_read_exec: {
     CallInst *CI = cast<CallInst>(
       EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec"));

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D87947: [AMDGPU] Make ds fp atomics overloadable

Reply via email to