https://github.com/ranapratap55 updated https://github.com/llvm/llvm-project/pull/187064
>From 10653ad2b7ed5370e8f962053efb6ca1567c37c5 Mon Sep 17 00:00:00 2001 From: ranapratap55 <[email protected]> Date: Tue, 17 Mar 2026 22:24:52 +0530 Subject: [PATCH 1/2] [Clang][AMDGPU] Add clang builtins for buffer format load/store intrinsics --- clang/include/clang/Basic/BuiltinsAMDGPU.td | 9 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 29 +++ .../builtins-amdgcn-buffer-format.hip | 208 ++++++++++++++++++ .../builtins-amdgcn-raw-buffer-load-format.cl | 44 ++++ ...builtins-amdgcn-raw-buffer-store-format.cl | 44 ++++ ...iltins-amdgcn-struct-buffer-load-format.cl | 35 +++ ...ltins-amdgcn-struct-buffer-store-format.cl | 35 +++ .../SemaHIP/builtins-amdgcn-buffer-format.hip | 46 ++++ ...builtins-amdgcn-raw-buffer-format-error.cl | 24 ++ ...ltins-amdgcn-struct-buffer-format-error.cl | 23 ++ 10 files changed, 497 insertions(+) create mode 100644 clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-format.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store-format.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-load-format.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-store-format.cl create mode 100644 clang/test/SemaHIP/builtins-amdgcn-buffer-format.hip create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-format-error.cl create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-struct-buffer-format-error.cl diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index acd0a34a79253..664655a1d4bfc 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -252,6 +252,15 @@ def __builtin_amdgcn_raw_buffer_load_b64 : AMDGPUBuiltin<"_ExtVector<2, unsigned def __builtin_amdgcn_raw_buffer_load_b96 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">; def __builtin_amdgcn_raw_buffer_load_b128 : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">; +def __builtin_amdgcn_raw_buffer_load_format_v4f32 : AMDGPUBuiltin<"_ExtVector<4, float>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">; +def __builtin_amdgcn_raw_buffer_load_format_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">; +def __builtin_amdgcn_raw_buffer_store_format_v4f32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">; +def __builtin_amdgcn_raw_buffer_store_format_v4f16 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">; +def __builtin_amdgcn_struct_buffer_load_format_v4f32 : AMDGPUBuiltin<"_ExtVector<4, float>(__amdgpu_buffer_rsrc_t, int, int, int, _Constant int)">; +def __builtin_amdgcn_struct_buffer_load_format_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(__amdgpu_buffer_rsrc_t, int, int, int, _Constant int)">; +def __builtin_amdgcn_struct_buffer_store_format_v4f32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, __amdgpu_buffer_rsrc_t, int, int, int, _Constant int)">; +def __builtin_amdgcn_struct_buffer_store_format_v4f16 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, __amdgpu_buffer_rsrc_t, int, int, int, _Constant int)">; + def __builtin_amdgcn_raw_ptr_buffer_atomic_add_i32 : AMDGPUBuiltin<"int(int, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">; def __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32 : AMDGPUBuiltin<"float(float, __amdgpu_buffer_rsrc_t, int, int, _Constant int)", [], "atomic-fadd-rtn-insts">; diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 0d572d37ab972..13236a177b398 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -2028,6 +2028,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128: return emitBuiltinWithOneOverloadedType<5>( *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store); + case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_format_v4f32: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_format_v4f16: + return emitBuiltinWithOneOverloadedType<5>( + *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store_format); case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8: case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16: case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32: @@ -2061,6 +2065,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))}); } + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_format_v4f32: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_format_v4f16: { + llvm::Type *RetTy = ConvertType(E->getType()); + Function *F = + CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load_format, {RetTy}); + + return Builder.CreateCall( + F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))}); + } + case AMDGPU::BI__builtin_amdgcn_struct_buffer_store_format_v4f32: + case AMDGPU::BI__builtin_amdgcn_struct_buffer_store_format_v4f16: + return emitBuiltinWithOneOverloadedType<6>( + *this, E, Intrinsic::amdgcn_struct_ptr_buffer_store_format); + case AMDGPU::BI__builtin_amdgcn_struct_buffer_load_format_v4f32: + case AMDGPU::BI__builtin_amdgcn_struct_buffer_load_format_v4f16: { + llvm::Type *RetTy = ConvertType(E->getType()); + Function *F = CGM.getIntrinsic( + Intrinsic::amdgcn_struct_ptr_buffer_load_format, {RetTy}); + + return Builder.CreateCall( + F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3)), + EmitScalarExpr(E->getArg(4))}); + } case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32: return emitBuiltinWithOneOverloadedType<5>( *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_add); diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip b/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip new file mode 100644 index 0000000000000..1534d2dd4810c --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu verde -emit-llvm -disable-llvm-optzns -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef float v4f32 __attribute__((ext_vector_type(4))); +typedef _Float16 v4f16 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: @_Z33test_raw_buffer_load_format_v4f32u22__amdgpu_buffer_rsrc_tii( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) +// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr +// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr +// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call contract <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +__device__ v4f32 test_raw_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, offset, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @_Z33test_raw_buffer_load_format_v4f16u22__amdgpu_buffer_rsrc_tii( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) +// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr +// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr +// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call contract <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0) +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// +__device__ v4f16 test_raw_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_format_v4f16(rsrc, offset, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @_Z34test_raw_buffer_store_format_v4f32Dv4_fu22__amdgpu_buffer_rsrc_tii( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) +// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr +// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr +// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr +// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VDATA_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0) +// CHECK-NEXT: ret void +// +__device__ void test_raw_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata, rsrc, offset, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @_Z34test_raw_buffer_store_format_v4f16Dv4_DF16_u22__amdgpu_buffer_rsrc_tii( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) +// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr +// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr +// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr +// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr +// CHECK-NEXT: store <4 x half> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[VDATA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0) +// CHECK-NEXT: ret void +// +__device__ void test_raw_buffer_store_format_v4f16(v4f16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_store_format_v4f16(vdata, rsrc, offset, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @_Z36test_struct_buffer_load_format_v4f32u22__amdgpu_buffer_rsrc_tiii( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) +// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr +// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr +// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr +// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP4]] +// +__device__ v4f32 test_struct_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) { + return __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex, offset, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @_Z36test_struct_buffer_load_format_v4f16u22__amdgpu_buffer_rsrc_tiii( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) +// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr +// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr +// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr +// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0) +// CHECK-NEXT: ret <4 x half> [[TMP4]] +// +__device__ v4f16 test_struct_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) { + return __builtin_amdgcn_struct_buffer_load_format_v4f16(rsrc, vindex, offset, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @_Z37test_struct_buffer_store_format_v4f32Dv4_fu22__amdgpu_buffer_rsrc_tiii( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) +// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr +// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr +// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr +// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr +// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VDATA_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], i32 0) +// CHECK-NEXT: ret void +// +__device__ void test_struct_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) { + __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata, rsrc, vindex, offset, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @_Z37test_struct_buffer_store_format_v4f16Dv4_DF16_u22__amdgpu_buffer_rsrc_tiii( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) +// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr +// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr +// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr +// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr +// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr +// CHECK-NEXT: store <4 x half> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[VDATA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f16(<4 x half> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], i32 0) +// CHECK-NEXT: ret void +// +__device__ void test_struct_buffer_store_format_v4f16(v4f16 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) { + __builtin_amdgcn_struct_buffer_store_format_v4f16(vdata, rsrc, vindex, offset, soffset, /*aux=*/0); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-format.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-format.cl new file mode 100644 index 0000000000000..5c2e3e1a24862 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-format.cl @@ -0,0 +1,44 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef float v4f32 __attribute__((ext_vector_type(4))); +typedef half v4f16 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: @test_raw_buffer_load_format_v4f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +v4f32 test_raw_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc) { + return __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, 0, 0, 0); +} + +// CHECK-LABEL: @test_raw_buffer_load_format_v4f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// +v4f16 test_raw_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc) { + return __builtin_amdgcn_raw_buffer_load_format_v4f16(rsrc, 0, 0, 0); +} + +// CHECK-LABEL: @test_raw_buffer_load_format_v4f32_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +v4f32 test_raw_buffer_load_format_v4f32_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset) { + return __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, offset, 0, 0); +} + +// CHECK-LABEL: @test_raw_buffer_load_format_v4f32_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +v4f32 test_raw_buffer_load_format_v4f32_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int soffset) { + return __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, 0, soffset, 0); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store-format.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store-format.cl new file mode 100644 index 0000000000000..b10c6d59635f4 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-store-format.cl @@ -0,0 +1,44 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef float v4f32 __attribute__((ext_vector_type(4))); +typedef half v4f16 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: @test_raw_buffer_store_format_v4f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_raw_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc) { + __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata, rsrc,0, 0, 0); +} + +// CHECK-LABEL: @test_raw_buffer_store_format_v4f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_raw_buffer_store_format_v4f16(v4f16 vdata, __amdgpu_buffer_rsrc_t rsrc) { + __builtin_amdgcn_raw_buffer_store_format_v4f16(vdata, rsrc,0, 0, 0); +} + +// CHECK-LABEL: @test_raw_buffer_store_format_v4f32_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_raw_buffer_store_format_v4f32_non_const_offset(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset) { + __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata, rsrc, offset, 0, 0); +} + +// CHECK-LABEL: @test_raw_buffer_store_format_v4f32_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret void +// +void test_raw_buffer_store_format_v4f32_non_const_soffset(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int soffset) { + __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata, rsrc,0, soffset, 0); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-load-format.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-load-format.cl new file mode 100644 index 0000000000000..c31c6ed82b82f --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-load-format.cl @@ -0,0 +1,35 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef float v4f32 __attribute__((ext_vector_type(4))); +typedef half v4f16 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: @test_struct_buffer_load_format_v4f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +v4f32 test_struct_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc, int vindex) { + return __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex,0, 0, 0); +} + +// CHECK-LABEL: @test_struct_buffer_load_format_v4f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// +v4f16 test_struct_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc, int vindex) { + return __builtin_amdgcn_struct_buffer_load_format_v4f16(rsrc, vindex,0, 0, 0); +} + +// CHECK-LABEL: @test_struct_buffer_load_format_v4f32_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +v4f32 test_struct_buffer_load_format_v4f32_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int vindex, int offset) { + return __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex, offset, 0, 0); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-store-format.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-store-format.cl new file mode 100644 index 0000000000000..b30a46eb78f32 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-struct-buffer-store-format.cl @@ -0,0 +1,35 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef float v4f32 __attribute__((ext_vector_type(4))); +typedef half v4f16 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: @test_struct_buffer_store_format_v4f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_struct_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex) { + __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata, rsrc, vindex,0, 0, 0); +} + +// CHECK-LABEL: @test_struct_buffer_store_format_v4f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f16(<4 x half> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_struct_buffer_store_format_v4f16(v4f16 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex) { + __builtin_amdgcn_struct_buffer_store_format_v4f16(vdata, rsrc, vindex,0, 0, 0); +} + +// CHECK-LABEL: @test_struct_buffer_store_format_v4f32_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_struct_buffer_store_format_v4f32_non_const_offset(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex, int offset) { + __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata, rsrc, vindex, offset, 0, 0); +} diff --git a/clang/test/SemaHIP/builtins-amdgcn-buffer-format.hip b/clang/test/SemaHIP/builtins-amdgcn-buffer-format.hip new file mode 100644 index 0000000000000..15f02f821b0ba --- /dev/null +++ b/clang/test/SemaHIP/builtins-amdgcn-buffer-format.hip @@ -0,0 +1,46 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu verde -verify %s -fcuda-is-device +// REQUIRES: amdgpu-registered-target + +#define __device__ __attribute__((device)) + +typedef float v4f32 __attribute__((ext_vector_type(4))); +typedef _Float16 v4f16 __attribute__((ext_vector_type(4))); + +__device__ void test_raw_buffer_format(__amdgpu_buffer_rsrc_t rsrc, v4f32 vdata_f32, v4f16 vdata_f16, int offset, int soffset) { + v4f32 ld_f32 = __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, offset, soffset, 0); + v4f16 ld_f16 = __builtin_amdgcn_raw_buffer_load_format_v4f16(rsrc, offset, soffset, 0); + __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata_f32, rsrc, offset, soffset, 0); + __builtin_amdgcn_raw_buffer_store_format_v4f16(vdata_f16, rsrc, offset, soffset, 0); +} + +__device__ void test_struct_buffer_format(__amdgpu_buffer_rsrc_t rsrc, v4f32 vdata_f32, v4f16 vdata_f16, int vindex, int offset, int soffset) { + v4f32 ld_f32 = __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex, offset, soffset, 0); + v4f16 ld_f16 = __builtin_amdgcn_struct_buffer_load_format_v4f16(rsrc, vindex, offset, soffset, 0); + __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata_f32, rsrc, vindex, offset, soffset, 0); + __builtin_amdgcn_struct_buffer_store_format_v4f16(vdata_f16, rsrc, vindex, offset, soffset, 0); +} + +__device__ void test_raw_buffer_format_err(__amdgpu_buffer_rsrc_t rsrc, v4f32 vdata_f32, v4f16 vdata_f16, int offset, int soffset, int aux) { + __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, offset, soffset, aux); // expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_format_v4f32' must be a constant integer}} + __builtin_amdgcn_raw_buffer_load_format_v4f16(rsrc, offset, soffset, aux); // expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_format_v4f16' must be a constant integer}} + __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata_f32, rsrc, offset, soffset, aux); // expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_format_v4f32' must be a constant integer}} + __builtin_amdgcn_raw_buffer_store_format_v4f16(vdata_f16, rsrc, offset, soffset, aux); // expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_format_v4f16' must be a constant integer}} +} + +__device__ void test_struct_buffer_format_err(__amdgpu_buffer_rsrc_t rsrc, v4f32 vdata_f32, v4f16 vdata_f16, int vindex, int offset, int soffset, int aux) { + __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex, offset, soffset, aux); // expected-error{{argument to '__builtin_amdgcn_struct_buffer_load_format_v4f32' must be a constant integer}} + __builtin_amdgcn_struct_buffer_load_format_v4f16(rsrc, vindex, offset, soffset, aux); // expected-error{{argument to '__builtin_amdgcn_struct_buffer_load_format_v4f16' must be a constant integer}} + __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata_f32, rsrc, vindex, offset, soffset, aux); // expected-error{{argument to '__builtin_amdgcn_struct_buffer_store_format_v4f32' must be a constant integer}} + __builtin_amdgcn_struct_buffer_store_format_v4f16(vdata_f16, rsrc, vindex, offset, soffset, aux); // expected-error{{argument to '__builtin_amdgcn_struct_buffer_store_format_v4f16' must be a constant integer}} +} + +__device__ void test_raw_buffer_format_too_many_args(__amdgpu_buffer_rsrc_t rsrc, v4f32 vdata, int offset, int soffset) { + __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}} + __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}} +} + +__device__ void test_struct_buffer_format_too_many_args(__amdgpu_buffer_rsrc_t rsrc, v4f32 vdata, int vindex, int offset, int soffset) { + __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}} + __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata, rsrc, vindex, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-format-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-format-error.cl new file mode 100644 index 0000000000000..74f855c8b0067 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-format-error.cl @@ -0,0 +1,24 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef float v4f32 __attribute__((ext_vector_type(4))); +typedef half v4f16 __attribute__((ext_vector_type(4))); + +v4f32 test_raw_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + return __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, offset, soffset, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_format_v4f32' must be a constant integer}} +} + +v4f16 test_raw_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + return __builtin_amdgcn_raw_buffer_load_format_v4f16(rsrc, offset, soffset, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_format_v4f16' must be a constant integer}} +} + +void test_raw_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata, rsrc, offset, soffset, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_format_v4f32' must be a constant integer}} +} + +void test_raw_buffer_store_format_v4f16(v4f16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + __builtin_amdgcn_raw_buffer_store_format_v4f16(vdata, rsrc, offset, soffset, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_store_format_v4f16' must be a constant integer}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-struct-buffer-format-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-struct-buffer-format-error.cl new file mode 100644 index 0000000000000..0e858a576b1c2 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-struct-buffer-format-error.cl @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef float v4f32 __attribute__((ext_vector_type(4))); +typedef half v4f16 __attribute__((ext_vector_type(4))); + +v4f32 test_struct_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset, int aux) { + return __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex, offset, soffset, aux); //expected-error{{argument to '__builtin_amdgcn_struct_buffer_load_format_v4f32' must be a constant integer}} +} + +v4f16 test_struct_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset, int aux) { + return __builtin_amdgcn_struct_buffer_load_format_v4f16(rsrc, vindex, offset, soffset, aux); //expected-error{{argument to '__builtin_amdgcn_struct_buffer_load_format_v4f16' must be a constant integer}} +} + +void test_struct_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset, int aux) { + __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata, rsrc, vindex, offset, soffset, aux); //expected-error{{argument to '__builtin_amdgcn_struct_buffer_store_format_v4f32' must be a constant integer}} +} + +void test_struct_buffer_store_format_v4f16(v4f16 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset, int aux) { + __builtin_amdgcn_struct_buffer_store_format_v4f16(vdata, rsrc, vindex, offset, soffset, aux); //expected-error{{argument to '__builtin_amdgcn_struct_buffer_store_format_v4f16' must be a constant integer}} +} >From b5ddd3a1815a5fd314626635eddf00d7d2787837 Mon Sep 17 00:00:00 2001 From: ranapratap55 <[email protected]> Date: Wed, 18 Mar 2026 11:28:40 +0530 Subject: [PATCH 2/2] [Clang][AMDGPU][test] Added -O1 to map builtin:intrinsic map and removed extra allocas in CHECK lines --- .../builtins-amdgcn-buffer-format.hip | 170 +++--------------- 1 file changed, 21 insertions(+), 149 deletions(-) diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip b/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip index 1534d2dd4810c..603e6522cd38c 100644 --- a/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip +++ b/clang/test/CodeGenHIP/builtins-amdgcn-buffer-format.hip @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu verde -emit-llvm -disable-llvm-optzns -fcuda-is-device -o - %s | FileCheck %s +// RUN: %clang_cc1 -O1 -triple amdgcn-amd-amdhsa -target-cpu verde -emit-llvm -fcuda-is-device -o - %s | FileCheck %s #define __device__ __attribute__((device)) @@ -9,200 +9,72 @@ typedef _Float16 v4f16 __attribute__((ext_vector_type(4))); // CHECK-LABEL: @_Z33test_raw_buffer_load_format_v4f32u22__amdgpu_buffer_rsrc_tii( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) -// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr -// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr -// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call contract <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0) -// CHECK-NEXT: ret <4 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call contract <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] // __device__ v4f32 test_raw_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { - return __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, offset, soffset, /*aux=*/0); + return __builtin_amdgcn_raw_buffer_load_format_v4f32(rsrc, offset, soffset, 0); } // CHECK-LABEL: @_Z33test_raw_buffer_load_format_v4f16u22__amdgpu_buffer_rsrc_tii( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) -// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr -// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr -// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call contract <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0) -// CHECK-NEXT: ret <4 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call contract <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret <4 x half> [[TMP0]] // __device__ v4f16 test_raw_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { - return __builtin_amdgcn_raw_buffer_load_format_v4f16(rsrc, offset, soffset, /*aux=*/0); + return __builtin_amdgcn_raw_buffer_load_format_v4f16(rsrc, offset, soffset, 0); } // CHECK-LABEL: @_Z34test_raw_buffer_store_format_v4f32Dv4_fu22__amdgpu_buffer_rsrc_tii( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) -// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) -// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr -// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr -// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr -// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr -// CHECK-NEXT: store <4 x float> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VDATA_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0) +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 0) // CHECK-NEXT: ret void // __device__ void test_raw_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { - __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata, rsrc, offset, soffset, /*aux=*/0); + __builtin_amdgcn_raw_buffer_store_format_v4f32(vdata, rsrc, offset, soffset, 0); } // CHECK-LABEL: @_Z34test_raw_buffer_store_format_v4f16Dv4_DF16_u22__amdgpu_buffer_rsrc_tii( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) -// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) -// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr -// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr -// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr -// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr -// CHECK-NEXT: store <4 x half> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[VDATA_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0) +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 0) // CHECK-NEXT: ret void // __device__ void test_raw_buffer_store_format_v4f16(v4f16 vdata, __amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { - __builtin_amdgcn_raw_buffer_store_format_v4f16(vdata, rsrc, offset, soffset, /*aux=*/0); + __builtin_amdgcn_raw_buffer_store_format_v4f16(vdata, rsrc, offset, soffset, 0); } // CHECK-LABEL: @_Z36test_struct_buffer_load_format_v4f32u22__amdgpu_buffer_rsrc_tiii( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) -// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr -// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr -// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr -// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0) -// CHECK-NEXT: ret <4 x float> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call contract <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] // __device__ v4f32 test_struct_buffer_load_format_v4f32(__amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) { - return __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex, offset, soffset, /*aux=*/0); + return __builtin_amdgcn_struct_buffer_load_format_v4f32(rsrc, vindex, offset, soffset, 0); } // CHECK-LABEL: @_Z36test_struct_buffer_load_format_v4f16u22__amdgpu_buffer_rsrc_tiii( // CHECK-NEXT: entry: -// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) -// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr -// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr -// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr -// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 0) -// CHECK-NEXT: ret <4 x half> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call contract <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f16(ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret <4 x half> [[TMP0]] // __device__ v4f16 test_struct_buffer_load_format_v4f16(__amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) { - return __builtin_amdgcn_struct_buffer_load_format_v4f16(rsrc, vindex, offset, soffset, /*aux=*/0); + return __builtin_amdgcn_struct_buffer_load_format_v4f16(rsrc, vindex, offset, soffset, 0); } // CHECK-LABEL: @_Z37test_struct_buffer_store_format_v4f32Dv4_fu22__amdgpu_buffer_rsrc_tiii( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) -// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) -// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr -// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr -// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr -// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr -// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr -// CHECK-NEXT: store <4 x float> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VDATA_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], i32 0) +// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 0) // CHECK-NEXT: ret void // __device__ void test_struct_buffer_store_format_v4f32(v4f32 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) { - __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata, rsrc, vindex, offset, soffset, /*aux=*/0); + __builtin_amdgcn_struct_buffer_store_format_v4f32(vdata, rsrc, vindex, offset, soffset, 0); } // CHECK-LABEL: @_Z37test_struct_buffer_store_format_v4f16Dv4_DF16_u22__amdgpu_buffer_rsrc_tiii( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VDATA_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) -// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5) -// CHECK-NEXT: [[VINDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[SOFFSET_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[VDATA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VDATA_ADDR]] to ptr -// CHECK-NEXT: [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr -// CHECK-NEXT: [[VINDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VINDEX_ADDR]] to ptr -// CHECK-NEXT: [[OFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OFFSET_ADDR]] to ptr -// CHECK-NEXT: [[SOFFSET_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SOFFSET_ADDR]] to ptr -// CHECK-NEXT: store <4 x half> [[VDATA:%.*]], ptr [[VDATA_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[VINDEX:%.*]], ptr [[VINDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[OFFSET:%.*]], ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[SOFFSET:%.*]], ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[VDATA_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VINDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[OFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SOFFSET_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f16(<4 x half> [[TMP0]], ptr addrspace(8) [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], i32 0) +// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f16(<4 x half> [[VDATA:%.*]], ptr addrspace(8) [[RSRC:%.*]], i32 [[VINDEX:%.*]], i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 0) // CHECK-NEXT: ret void // __device__ void test_struct_buffer_store_format_v4f16(v4f16 vdata, __amdgpu_buffer_rsrc_t rsrc, int vindex, int offset, int soffset) { - __builtin_amdgcn_struct_buffer_store_format_v4f16(vdata, rsrc, vindex, offset, soffset, /*aux=*/0); + __builtin_amdgcn_struct_buffer_store_format_v4f16(vdata, rsrc, vindex, offset, soffset, 0); } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
