[clang] [llvm] [AMDGPU] Add support for `v_rsq_bf16` on gfx1250 (PR #149194)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/149194 ___ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Add support for `v_rsq_bf16` on gfx1250 (PR #149194)
github-actions[bot] wrote:
:warning: undef deprecator found issues in your code. :warning:
You can test this locally with the following command:
```bash
git diff -U0 --pickaxe-regex -S \
  '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD \
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll \
  clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
```
The following files introduce new uses of undef:
- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll
[Undef](https://llvm.org/docs/LangRef.html#undefined-values) is now deprecated
and should only be used in the rare cases where no replacement is possible. For
example, a load of uninitialized memory yields `undef`. You should use `poison`
values for placeholders instead.
In tests, avoid using `undef` and having tests that trigger undefined behavior.
If you need an operand with some unimportant value, you can add a new argument
to the function and use that instead.
For example, this is considered a bad practice:
```llvm
define void @fn() {
...
br i1 undef, ...
}
```
Please use the following instead:
```llvm
define void @fn(i1 %cond) {
...
br i1 %cond, ...
}
```
Please refer to the [Undefined Behavior
Manual](https://llvm.org/docs/UndefinedBehavior.html) for more information.
https://github.com/llvm/llvm-project/pull/149194
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Add support for `v_rsq_bf16` on gfx1250 (PR #149194)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) Changes Co-authored-by: Mekhanoshin, Stanislav--- Patch is 69.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149194.diff 23 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1) - (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+1) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+19) - (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+2) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll (+95) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s (+45) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s (+48) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s (+56) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s (+60) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s (+12) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s (+16) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s (+45) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s (+48) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s (+56) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s (+60) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s (+16) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s (+20) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt (+63) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt (+59) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt (+15) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt (+64) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt (+60) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt (+20) ``diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 313c0e640d240..a80f571140666 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -671,6 +671,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index dcfdea648e93c..8d227a5f957c8 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -421,6 +421,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_rsq: case AMDGPU::BI__builtin_amdgcn_rsqf: case AMDGPU::BI__builtin_amdgcn_rsqh: + case AMDGPU::BI__builtin_amdgcn_rsq_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq); case AMDGPU::BI__builtin_amdgcn_rsq_clamp: case AMDGPU::BI__builtin_amdgcn_rsq_clampf: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50f02ad27357..8b7ec143a2e00 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -80,6 +80,25 @@ void test_rcp_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_rcp_bf16(a); } +// CHECK-LABEL: @test_rsq_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to 
ptr +// CHECK-NEXT:[[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT:[[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT:[[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]]) +// CHECK-NEXT:[[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT:ret void +// +void test_rsq_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_rsq_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
[clang] [llvm] [AMDGPU] Add support for `v_rsq_bf16` on gfx1250 (PR #149194)
llvmbot wrote: @llvm/pr-subscribers-mc Author: Shilei Tian (shiltian) Changes Co-authored-by: Mekhanoshin, Stanislav--- Patch is 69.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149194.diff 23 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1) - (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+1) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+19) - (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+2) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll (+95) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s (+45) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s (+48) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s (+56) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s (+60) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s (+12) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s (+16) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s (+45) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s (+48) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s (+56) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s (+60) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s (+16) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s (+20) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt (+63) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt (+59) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt (+15) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt (+64) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt (+60) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt (+20) ``diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 313c0e640d240..a80f571140666 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -671,6 +671,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index dcfdea648e93c..8d227a5f957c8 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -421,6 +421,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_rsq: case AMDGPU::BI__builtin_amdgcn_rsqf: case AMDGPU::BI__builtin_amdgcn_rsqh: + case AMDGPU::BI__builtin_amdgcn_rsq_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq); case AMDGPU::BI__builtin_amdgcn_rsq_clamp: case AMDGPU::BI__builtin_amdgcn_rsq_clampf: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50f02ad27357..8b7ec143a2e00 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -80,6 +80,25 @@ void test_rcp_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_rcp_bf16(a); } +// CHECK-LABEL: @test_rsq_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to 
ptr +// CHECK-NEXT:[[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT:[[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT:[[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]]) +// CHECK-NEXT:[[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT:ret void +// +void test_rsq_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_rsq_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git
[clang] [llvm] [AMDGPU] Add support for `v_rsq_bf16` on gfx1250 (PR #149194)
llvmbot wrote: @llvm/pr-subscribers-clang Author: Shilei Tian (shiltian) Changes Co-authored-by: Mekhanoshin, Stanislav--- Patch is 69.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149194.diff 23 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1) - (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+1) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+19) - (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+2) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll (+95) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s (+45) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s (+48) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s (+56) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s (+60) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s (+12) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s (+16) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s (+45) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s (+48) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s (+56) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s (+60) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s (+16) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s (+20) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt (+63) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt (+59) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt (+15) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt (+64) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt (+60) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt (+20) ``diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 313c0e640d240..a80f571140666 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -671,6 +671,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index dcfdea648e93c..8d227a5f957c8 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -421,6 +421,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_rsq: case AMDGPU::BI__builtin_amdgcn_rsqf: case AMDGPU::BI__builtin_amdgcn_rsqh: + case AMDGPU::BI__builtin_amdgcn_rsq_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq); case AMDGPU::BI__builtin_amdgcn_rsq_clamp: case AMDGPU::BI__builtin_amdgcn_rsq_clampf: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50f02ad27357..8b7ec143a2e00 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -80,6 +80,25 @@ void test_rcp_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_rcp_bf16(a); } +// CHECK-LABEL: @test_rsq_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to 
ptr +// CHECK-NEXT:[[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT:[[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT:[[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]]) +// CHECK-NEXT:[[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT:store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT:ret void +// +void test_rsq_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_rsq_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --g
[clang] [llvm] [AMDGPU] Add support for `v_rsq_bf16` on gfx1250 (PR #149194)
shiltian wrote: * **#149194** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/149194?utm_source=stack-comment-icon) 👈 [(View in Graphite)](https://app.graphite.dev/github/pr/llvm/llvm-project/149194?utm_source=stack-comment-view-in-graphite) * `main` This stack of pull requests is managed by [Graphite](https://graphite.dev?utm-source=stack-comment). Learn more about [stacking](https://stacking.dev/?utm_source=stack-comment). https://github.com/llvm/llvm-project/pull/149194 ___ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Add support for `v_rsq_bf16` on gfx1250 (PR #149194)
https://github.com/shiltian created
https://github.com/llvm/llvm-project/pull/149194
Co-authored-by: Mekhanoshin, Stanislav
From 296077854b4bcad36f9b924da1dbfe4376d8b4d5 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Wed, 16 Jul 2025 17:31:42 -0400
Subject: [PATCH] [AMDGPU] Add support for `v_rsq_bf16` on gfx1250
Co-authored-by: Mekhanoshin, Stanislav
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 +
.../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19
llvm/lib/Target/AMDGPU/VOP1Instructions.td| 2 +
.../CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll| 95 +++
llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 45 +
llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s| 48 ++
.../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 56 +++
llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 60
.../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 +++
llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 16
.../gfx1250_asm_vop3_from_vop1-fake16.s | 45 +
.../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s| 48 ++
.../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 56 +++
.../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 60
.../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 16
.../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 20
.../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 63
.../AMDGPU/gfx1250_dasm_vop1_dpp16.txt| 59
.../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 15 +++
.../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt| 64 +
.../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 60
.../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 20
23 files changed, 881 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 313c0e640d240..a80f571140666 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -671,6 +671,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs",
"n", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index dcfdea648e93c..8d227a5f957c8 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -421,6 +421,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned
BuiltinID,
case AMDGPU::BI__builtin_amdgcn_rsq:
case AMDGPU::BI__builtin_amdgcn_rsqf:
case AMDGPU::BI__builtin_amdgcn_rsqh:
+ case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_rsq);
case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50f02ad27357..8b7ec143a2e00 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -80,6 +80,25 @@ void test_rcp_bf16(global __bf16* out, __bf16 a)
*out = __builtin_amdgcn_rcp_bf16(a);
}
+// CHECK-LABEL: @test_rsq_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8,
addrspace(5)
+// CHECK-NEXT:[[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT:[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[OUT_ADDR]] to ptr
+// CHECK-NEXT:[[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5)
[[A_ADDR]] to ptr
+// CHECK-NEXT:store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]],
align 8
+// CHECK-NEXT:store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:[[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:[[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat
[[TMP0]])
+// CHECK-NEXT:[[TMP2:%.*]] = load ptr addrspace(1), ptr
[[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+// CHECK-NEXT:ret void
+//
+void test_rsq_bf16(global __bf16* out, __bf16 a)
+{
+ *out = __builtin_amdgcn_rsq_bf16(a);
+}
+
// CHECK-LABEL: @test_cvt_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8,
addrspace(5)
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index e2f371079179d..6f8437e82700e 100644
