[clang] [llvm] [AMDGPU][Clang] Builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
@@ -240,6 +240,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3UiiUi", "t", "gfx940-insts") arsenm wrote: clang should really be enforcing the valid immediate values for the size https://github.com/llvm/llvm-project/pull/92962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][Clang] Builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
https://github.com/shiltian closed https://github.com/llvm/llvm-project/pull/92962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][Clang] Builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/92962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU][Clang] Builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/92962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/92962 >From 539c48a5a44392821547335bdfc3942cc03e1540 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 21 May 2024 18:26:25 -0400 Subject: [PATCH] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on MI3XX Fixes: SWDEV-459212 --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx940.cl | 52 +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 31 +-- 3 files changed, 67 insertions(+), 17 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 3e21a2fe2ac6b..efa652eee9901 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -240,6 +240,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3UiiUi", "t", "gfx940-insts") //===--===// // Deep learning builtins. 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl new file mode 100644 index 0..fc5649d8a41f7 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl @@ -0,0 +1,52 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +// CHECK-LABEL: @test_global_load_lds_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:[[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0) +// CHECK-NEXT:ret void +// +void test_global_load_lds_u32(global u32* src, local u32 *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_global_load_lds_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// 
CHECK-NEXT:[[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0) +// CHECK-NEXT:ret void +// +void test_global_load_lds_u16(global u16* src, local u16 *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_global_load_lds_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:[[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0) +// CHECK-NEXT:ret void +// +void test_global_load_lds_u8(global u8* src, local u8 *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
[clang] [llvm] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/92962 >From a9e9fa27022738b6f7b8c5c2cbba0fb9900cd993 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 21 May 2024 18:25:38 -0400 Subject: [PATCH] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on MI3XX Fixes: SWDEV-459212 --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx940.cl | 52 +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 31 +-- 3 files changed, 67 insertions(+), 17 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 3e21a2fe2ac6b..efa652eee9901 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -240,6 +240,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3UiiUi", "t", "gfx940-insts") //===--===// // Deep learning builtins. 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl new file mode 100644 index 0..fc5649d8a41f7 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl @@ -0,0 +1,52 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +// CHECK-LABEL: @test_global_load_lds_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:[[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0) +// CHECK-NEXT:ret void +// +void test_global_load_lds_u32(global u32* src, local u32 *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_global_load_lds_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// 
CHECK-NEXT:[[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0) +// CHECK-NEXT:ret void +// +void test_global_load_lds_u16(global u16* src, local u16 *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_global_load_lds_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:[[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0) +// CHECK-NEXT:ret void +// +void test_global_load_lds_u8(global u8* src, local u8 *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
[clang] [llvm] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
@@ -2466,23 +2466,20 @@ def int_amdgcn_perm : // GFX9 Intrinsics //===--===// -class AMDGPUGlobalLoadLDS : Intrinsic < - [], - [LLVMQualPointerType<1>, // Base global pointer to load from - LLVMQualPointerType<3>, // LDS base pointer to store to - llvm_i32_ty,// Data byte size: 1/2/4 - llvm_i32_ty,// imm offset (applied to both global and LDS address) - llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, - // bit 1 = slc/sc1, - // bit 2 = dlc on gfx10/gfx11)) - // bit 4 = scc/nt on gfx90a+)) - // gfx12+: - // cachepolicy (bits [0-2] = th, - // bits [3-4] = scope) - // swizzled buffer (bit 6 = swz), - [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, - ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], - "", [SDNPMemOperand]>; +class AMDGPUGlobalLoadLDS : + ClangBuiltin<"__builtin_amdgcn_global_load_lds">, + Intrinsic < +[], +[LLVMQualPointerType<1>,// Base global pointer to load from + LLVMQualPointerType<3>,// LDS base pointer to store to + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) + llvm_i32_ty, // imm offset (applied to both global and LDS address) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, +// bit 1 = slc/sc1, +// bit 4 = scc/nt on gfx90a+)) rampitec wrote: Just sc0, sc1 and scc. It does not exist on gfx90a. https://github.com/llvm/llvm-project/pull/92962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
@@ -2466,23 +2466,24 @@ def int_amdgcn_perm : // GFX9 Intrinsics //===--===// -class AMDGPUGlobalLoadLDS : Intrinsic < - [], - [LLVMQualPointerType<1>, // Base global pointer to load from - LLVMQualPointerType<3>, // LDS base pointer to store to - llvm_i32_ty,// Data byte size: 1/2/4 - llvm_i32_ty,// imm offset (applied to both global and LDS address) - llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, - // bit 1 = slc/sc1, - // bit 2 = dlc on gfx10/gfx11)) - // bit 4 = scc/nt on gfx90a+)) - // gfx12+: - // cachepolicy (bits [0-2] = th, - // bits [3-4] = scope) - // swizzled buffer (bit 6 = swz), - [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, - ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], - "", [SDNPMemOperand]>; +class AMDGPUGlobalLoadLDS : + ClangBuiltin<"__builtin_amdgcn_global_load_lds">, + Intrinsic < +[], +[LLVMQualPointerType<1>,// Base global pointer to load from + LLVMQualPointerType<3>,// LDS base pointer to store to + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) + llvm_i32_ty, // imm offset (applied to both global and LDS address) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, shiltian wrote: My bad. Forgot to remove all of them. https://github.com/llvm/llvm-project/pull/92962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/92962 >From 741d71e456825c156d4a34020c6f7d8f28e11a1d Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 21 May 2024 16:57:22 -0400 Subject: [PATCH] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on MI3XX Fixes: SWDEV-459212 --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx940.cl | 52 +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 31 +-- 3 files changed, 67 insertions(+), 17 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 3e21a2fe2ac6b..efa652eee9901 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -240,6 +240,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3UiiUi", "t", "gfx940-insts") //===--===// // Deep learning builtins. 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl new file mode 100644 index 0..fc5649d8a41f7 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl @@ -0,0 +1,52 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +// CHECK-LABEL: @test_global_load_lds_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:[[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0) +// CHECK-NEXT:ret void +// +void test_global_load_lds_u32(global u32* src, local u32 *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_global_load_lds_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// 
CHECK-NEXT:[[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0) +// CHECK-NEXT:ret void +// +void test_global_load_lds_u16(global u16* src, local u16 *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_global_load_lds_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT:[[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT:store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT:[[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0) +// CHECK-NEXT:ret void +// +void test_global_load_lds_u8(global u8* src, local u8 *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
[clang] [llvm] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
@@ -2466,23 +2466,24 @@ def int_amdgcn_perm : // GFX9 Intrinsics //===--===// -class AMDGPUGlobalLoadLDS : Intrinsic < - [], - [LLVMQualPointerType<1>, // Base global pointer to load from - LLVMQualPointerType<3>, // LDS base pointer to store to - llvm_i32_ty,// Data byte size: 1/2/4 - llvm_i32_ty,// imm offset (applied to both global and LDS address) - llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, - // bit 1 = slc/sc1, - // bit 2 = dlc on gfx10/gfx11)) - // bit 4 = scc/nt on gfx90a+)) - // gfx12+: - // cachepolicy (bits [0-2] = th, - // bits [3-4] = scope) - // swizzled buffer (bit 6 = swz), - [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, - ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], - "", [SDNPMemOperand]>; +class AMDGPUGlobalLoadLDS : + ClangBuiltin<"__builtin_amdgcn_global_load_lds">, + Intrinsic < +[], +[LLVMQualPointerType<1>,// Base global pointer to load from + LLVMQualPointerType<3>,// LDS base pointer to store to + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) + llvm_i32_ty, // imm offset (applied to both global and LDS address) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, rampitec wrote: Keep description of only sc0, sc1, and scc? It is not supported except on gfx940 anyway. https://github.com/llvm/llvm-project/pull/92962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Clang builtin for GLOBAL_LOAD_LDS on GFX940 (PR #92962)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/92962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits