llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang

Author: Alex Voicu (AlexVlx)

<details>
<summary>Changes</summary>

This exposes Clang builtins corresponding to the `amdgcn.wave.shuffle` and
`amdgcn.wave.id` LLVM intrinsics. The shuffle implementation is a bit more
verbose because it uses a single overloaded builtin rather than multiple
type-suffixed standalone variants, which is the approach wave_reduce took.

---
Full diff: https://github.com/llvm/llvm-project/pull/179492.diff


6 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.td (+9) 
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+4) 
- (modified) clang/lib/Sema/SemaAMDGPU.cpp (+25) 
- (modified) clang/lib/Sema/SemaChecking.cpp (+3-1) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl (+16) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn.cl (+21) 


``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td 
b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 1950757097fc6..a9acc1544ad53 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -67,6 +67,8 @@ def __builtin_amdgcn_mbcnt_lo : AMDGPUBuiltin<"unsigned 
int(unsigned int, unsign
 
 def __builtin_amdgcn_s_memtime : AMDGPUBuiltin<"uint64_t()", [], 
"s-memtime-inst">;
 
+def __builtin_amdgcn_wave_id : AMDGPUBuiltin<"int32_t()", [Const], 
"architected-sgprs">;
+
 
//===----------------------------------------------------------------------===//
 // Instruction builtins.
 
//===----------------------------------------------------------------------===//
@@ -413,6 +415,13 @@ def __builtin_amdgcn_wave_reduce_fsub_f64 : 
AMDGPUBuiltin<"double(double, _Const
 def __builtin_amdgcn_wave_reduce_fmin_f64 : AMDGPUBuiltin<"double(double, 
_Constant int32_t)", [Const]>;
 def __builtin_amdgcn_wave_reduce_fmax_f64 : AMDGPUBuiltin<"double(double, 
_Constant int32_t)", [Const]>;
 
+//===----------------------------------------------------------------------===//
+// Wave Shuffle builtins.
+//===----------------------------------------------------------------------===//
+
+// This is an overloaded builtin modelled after the atomic ones
+def __builtin_amdgcn_wave_shuffle : AMDGPUBuiltin<"void(...)", [Const, 
CustomTypeChecking]>;
+
 
//===----------------------------------------------------------------------===//
 // R600-NI only builtins.
 
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index a096ed27a788e..619c9b4be9090 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -449,6 +449,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
     return Builder.CreateCall(F, {Value, Strategy});
   }
+  case AMDGPU::BI__builtin_amdgcn_wave_shuffle:
+    // TODO: can we unify this with wave_reduce?
+    return emitBuiltinWithOneOverloadedType<2>(*this, E,
+                                               Intrinsic::amdgcn_wave_shuffle);
   case AMDGPU::BI__builtin_amdgcn_div_scale:
   case AMDGPU::BI__builtin_amdgcn_div_scalef: {
     // Translate from the intrinsics's struct return to the builtin's out
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 4261e1849133f..d5403f22eb7bb 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -296,6 +296,31 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned 
BuiltinID,
     }
     return false;
   }
+  case AMDGPU::BI__builtin_amdgcn_wave_shuffle: {
+    Expr *Val = TheCall->getArg(0);
+    QualType ValTy = Val->getType();
+
+    if ((!ValTy->isIntegerType() && !ValTy->isFloatingType()) ||
+        SemaRef.getASTContext().getTypeSize(ValTy) > 32)
+      return Diag(Val->getExprLoc(), diag::err_builtin_invalid_arg_type)
+          << Val << /*scalar=*/1 << /*'int'=*/4 << /*floating point=*/2
+          << ValTy;
+
+    Expr *Idx = TheCall->getArg(1);
+    QualType IdxTy = Idx->getType();
+    if (!IdxTy->isIntegerType())
+      return Diag(Idx->getExprLoc(), diag::err_typecheck_expect_int) << IdxTy;
+    if (SemaRef.getASTContext().getTypeSize(IdxTy) > 32)
+      return Diag(Idx->getExprLoc(), diag::err_builtin_invalid_arg_type)
+          << Idx << /*scalar=*/1 << /*'int'=*/4 << /*floating point=*/0
+          << IdxTy;
+
+    // Resolve the overload here, now that we know that the invocation is
+    // correct: the intrinsic returns the type of the value argument.
+    TheCall->setType(ValTy);
+
+    return false;
+  }
   default:
     return false;
   }
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index e2e1b37572364..9858264aa042d 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2100,8 +2100,10 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo 
&TI, unsigned BuiltinID,
   case llvm::Triple::spirv:
   case llvm::Triple::spirv32:
   case llvm::Triple::spirv64:
-    if (TI.getTriple().getOS() != llvm::Triple::OSType::AMDHSA)
+    if (TI.getTriple().getVendor() != llvm::Triple::VendorType::AMD)
       return SPIRV().CheckSPIRVBuiltinFunctionCall(TI, BuiltinID, TheCall);
+    else
+      return AMDGPU().CheckAMDGCNBuiltinFunctionCall(BuiltinID, TheCall);
     return false;
   case llvm::Triple::systemz:
     return SystemZ().CheckSystemZBuiltinFunctionCall(BuiltinID, TheCall);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 8c02616780182..d39c4180178ad 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -317,3 +317,19 @@ void test_ds_bpermute_fi_b32(global int* out, int a, int b)
 {
   *out = __builtin_amdgcn_ds_bpermute_fi_b32(a, b);
 }
+
+__attribute__((target("architected-sgprs")))
+// CHECK-LABEL: @test_wave_id(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)
+// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.wave.id()
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i32 [[TMP0]], ptr addrspace(1) [[TMP1]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_wave_id(global int* out)
+{
+  *out = __builtin_amdgcn_wave_id();
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 376105cb6594c..4755cd32a2e2c 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -937,6 +937,27 @@ void test_wave_reduce_max_u64_dpp(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_max_u64(in, 2);
 }
 
+// CHECK-LABEL: @test_wave_shuffle_u32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.shuffle.i32
+void test_wave_shuffle_u32(global unsigned* out, unsigned in, int idx)
+{
+  *out = __builtin_amdgcn_wave_shuffle(in, idx);
+}
+
+// CHECK-LABEL: @test_wave_shuffle_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.shuffle.i32
+void test_wave_shuffle_i32(global int* out, int in, int idx)
+{
+  *out = __builtin_amdgcn_wave_shuffle(in, idx);
+}
+
+// CHECK-LABEL: @test_wave_shuffle_f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.shuffle.f32
+void test_wave_shuffle_f32(global float* out, float in, int idx)
+{
+  *out = __builtin_amdgcn_wave_shuffle(in, idx);
+}
+
 // CHECK-LABEL: @test_s_barrier
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier(
 void test_s_barrier()

``````````

</details>


https://github.com/llvm/llvm-project/pull/179492
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to