Author: Sameer Sahasrabuddhe Date: 2026-06-09T12:39:50Z New Revision: 8b0223858b1e15a4310de18445d972e7648b740f
URL: https://github.com/llvm/llvm-project/commit/8b0223858b1e15a4310de18445d972e7648b740f DIFF: https://github.com/llvm/llvm-project/commit/8b0223858b1e15a4310de18445d972e7648b740f.diff LOG: [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128 (#199176) These builtins allow the program to request store-available and load-visible accesses as described in #191246. Each of them takes a __MEMORY_SCOPE_* operand that is then translated to target-specific cache policy bits. This patch was extracted from #172090. Co-authored-by: macurtis-amd <[email protected]> Assisted-by: Claude Opus 4.6 --------- Co-authored-by: macurtis-amd <[email protected]> Added: clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl clang/test/SemaHIP/amdgpu-av-load-store.hip clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl Modified: clang/docs/LanguageExtensions.rst clang/include/clang/Basic/BuiltinsAMDGPU.td clang/include/clang/Basic/BuiltinsAMDGPUDocs.td clang/include/clang/Sema/SemaAMDGPU.h clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp clang/lib/Sema/SemaAMDGPU.cpp clang/test/CodeGen/amdgpu-builtin-is-invocable.c clang/test/CodeGen/amdgpu-builtin-processor-is.c clang/test/CodeGen/link-builtin-bitcode.c clang/test/CodeGenCXX/dynamic-cast-address-space.cpp llvm/lib/TargetParser/AMDGPUTargetParser.cpp Removed: ################################################################################ diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index fbb9947f39d3e..9fa39c09b1f6c 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -5261,6 +5261,8 @@ builtin function, and are named with a ``__opencl_`` prefix. The macros and ``__OPENCL_MEMORY_SCOPE_SUB_GROUP`` are provided, with values corresponding to the enumerators of OpenCL's ``memory_scope`` enumeration.) +.. 
_langext-__scoped_atomic: + __scoped_atomic builtins ------------------------ @@ -5756,6 +5758,32 @@ returns the bit at the position of the current lane. It is almost equivalent to ``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if the given mask has the same value for all active lanes of the current wave. + +__builtin_amdgcn_av_{load,store}_b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Signature: + +.. code-block:: c + + typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u; + + v4u __builtin_amdgcn_av_load_b128(v4u *src, int scope); + + void __builtin_amdgcn_av_store_b128(v4u *dst, v4u data, int scope); + +Load or store a vector of 4 unsigned integers from or to memory with cache +behavior specified by ``scope``, which is one of the ``__MEMORY_SCOPE_*`` macros +defined for :ref:`scoped atomic builtins<langext-__scoped_atomic>`. + +The pointer argument must point to the global or generic address space. + +These builtins are supported on gfx9, gfx10, gfx11, and gfx12 targets. + +They map to the LLVM intrinsics ``llvm.amdgcn.av.load.b128`` and +``llvm.amdgcn.av.store.b128`` documented in `User Guide for AMDGPU Backend +<https://llvm.org/docs/AMDGPUUsage.html>`_. 
+ ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index d8020bdcc8458..8eed188b0f4b2 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -280,6 +280,21 @@ def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgp def __builtin_amdgcn_struct_ptr_buffer_load_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">; def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">; +//===----------------------------------------------------------------------===// +// Global Available/Visible memory accesses. +//===----------------------------------------------------------------------===// + +def __builtin_amdgcn_av_load_b128 + : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "flat-global-insts"> { + let Documentation = [DocAVLoadB128]; + let ArgNames = ["ptr", "scope"]; +} +def __builtin_amdgcn_av_store_b128 + : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "flat-global-insts"> { + let Documentation = [DocAVStoreB128]; + let ArgNames = ["ptr", "data", "scope"]; +} + //===----------------------------------------------------------------------===// // Async mark builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td index a92b85d75d902..293431c5de7e8 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td @@ -573,6 +573,35 @@ WMMA with per-operand scale factors applied during the computation. }]; } +//===----------------------------------------------------------------------===// +// Global Available/Visible Memory Access Builtins +//===----------------------------------------------------------------------===// + +def DocCatAVLoadStore : DocumentationCategory<"Available/Visible Memory Access Builtins"> { + let Content = [{ +These builtins perform 128-bit global or flat memory loads and stores with +available/visible (AV) semantics. +}]; +} + +def DocAVLoadB128 : Documentation { + let Category = DocCatAVLoadStore; + let Content = [{ +Loads 128 bits (4 x i32) from the pointer ``ptr``. The pointer must be in +the global or generic address space. The ``scope`` argument specifies the +synchronization scope using a ``__MEMORY_SCOPE_*`` constant. +}]; +} + +def DocAVStoreB128 : Documentation { + let Category = DocCatAVLoadStore; + let Content = [{ +Stores 128 bits (4 x i32) of ``data`` to the pointer ``ptr``. The pointer +must be in the global or generic address space. The ``scope`` argument +specifies the synchronization scope using a ``__MEMORY_SCOPE_*`` constant. 
+}]; +} + //===----------------------------------------------------------------------===// // Wave Data Exchange Builtins //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index d520f3df544f4..a6205534e0de3 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -38,6 +38,7 @@ class SemaAMDGPU : public SemaBase { bool checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, bool MayStore); bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore); + bool checkAVLoadStore(CallExpr *TheCall, bool IsStore); bool checkAtomicMonitorLoad(CallExpr *TheCall); bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs, diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index a88dbb71b3ddf..21f32b12c4fd1 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -1009,6 +1009,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()}); return Builder.CreateCall(F, {Args}); } + case AMDGPU::BI__builtin_amdgcn_av_load_b128: + case AMDGPU::BI__builtin_amdgcn_av_store_b128: { + const bool IsStore = BuiltinID == AMDGPU::BI__builtin_amdgcn_av_store_b128; + SmallVector<Value *, 5> Args = {EmitScalarExpr(E->getArg(0))}; // addr + if (IsStore) + Args.push_back(EmitScalarExpr(E->getArg(1))); // data + const unsigned ScopeIdx = E->getNumArgs() - 1; + auto *ScopeExpr = + cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(ScopeIdx))); + Args.push_back(emitScopeMD(*this, ScopeExpr->getZExtValue())); + llvm::Function *F = + CGM.getIntrinsic(IsStore ? 
Intrinsic::amdgcn_av_store_b128 + : Intrinsic::amdgcn_av_load_b128, + {Args[0]->getType()}); + return Builder.CreateCall(F, Args); + } case AMDGPU::BI__builtin_amdgcn_get_fpenv: { Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, {llvm::Type::getInt64Ty(getLLVMContext())}); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 60f74fd15226f..757cdfbf20819 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -149,6 +149,10 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6: case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6: return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 15); + case AMDGPU::BI__builtin_amdgcn_av_load_b128: + return checkAVLoadStore(TheCall, /*IsStore=*/false); + case AMDGPU::BI__builtin_amdgcn_av_store_b128: + return checkAVLoadStore(TheCall, /*IsStore=*/true); case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B: case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B: case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B: @@ -482,6 +486,14 @@ static bool checkScopeAsInt(SemaAMDGPU &S, Expr *Scope) { return false; } +bool SemaAMDGPU::checkAVLoadStore(CallExpr *TheCall, bool IsStore) { + if (checkGlobalOrFlatPointerArg(*this, TheCall)) + return true; + + Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1); + return checkScopeAsInt(*this, Scope); +} + bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) { bool Fail = checkGlobalOrFlatPointerArg(*this, TheCall); diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c index 3f988cbdf7cee..8de4b0ec8bc20 100644 --- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c +++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c @@ -54,7 +54,7 @@ void foo() { // AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" } // AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } //. -// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" } +// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" } // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind } // AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } //. diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c index 308eec1d212a5..8dfbb55566598 100644 --- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c +++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c @@ -63,7 +63,7 @@ void foo() { //. 
// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" } //. -// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" } +// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" } // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind } // AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } //. 
diff --git a/clang/test/CodeGen/link-builtin-bitcode.c b/clang/test/CodeGen/link-builtin-bitcode.c index ed5abca7f34b0..8849ee735ce04 100644 --- a/clang/test/CodeGen/link-builtin-bitcode.c +++ b/clang/test/CodeGen/link-builtin-bitcode.c @@ -44,6 +44,6 @@ int bar() { return no_attr() + attr_in_target() + attr_not_in_target() + attr_in // CHECK-SAME: () #[[ATTR_INCOMPATIBLE:[0-9]+]] { // CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" } -// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" } +// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-global-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+flat-global-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-global-insts,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" } diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp index dbca787e26da2..8252a511d6b44 100644 --- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp +++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp @@ -107,9 +107,9 @@ const B& f(A *a) { // CHECK: attributes #[[ATTR3]] = { nounwind } // CHECK: attributes #[[ATTR4]] = { noreturn } //. -// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" } +// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) } -// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" } +// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR3]] = { nounwind } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn } //. 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl new file mode 100644 index 0000000000000..63d7fcac16874 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl @@ -0,0 +1,250 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals smart +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; + +//------------------------------------------------------------------------------ +// Global Load +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_global_load_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META7:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_wave(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META8:![0-9]+]]) +// CHECK-NEXT: ret <4 
x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_workgroup(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META9:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_device(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META10:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_system(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META11:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_single(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META12:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_cluster(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR); +} + +//------------------------------------------------------------------------------ +// Global Store +//------------------------------------------------------------------------------ + +// CHECK-LABEL: 
@test_amdgcn_global_store_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_wave(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_workgroup(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_device(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_system(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_single(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: 
@test_amdgcn_global_store_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_cluster(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR); +} + +//------------------------------------------------------------------------------ +// Flat Load +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META7]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_wave(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META8]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_workgroup(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META9]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_device(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META10]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_system(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, 
__MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META11]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_single(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META12]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_cluster(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR); +} + +//------------------------------------------------------------------------------ +// Flat Store +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_wave(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_workgroup(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_device(v4u32 * ptr, v4u32 data) { + 
__builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_system(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_single(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_cluster(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR); +} +//. +// CHECK: [[META7]] = !{!"wavefront"} +// CHECK: [[META8]] = !{!"workgroup"} +// CHECK: [[META9]] = !{!"agent"} +// CHECK: [[META10]] = !{!""} +// CHECK: [[META11]] = !{!"singlethread"} +// CHECK: [[META12]] = !{!"cluster"} +//. 
diff --git a/clang/test/SemaHIP/amdgpu-av-load-store.hip b/clang/test/SemaHIP/amdgpu-av-load-store.hip new file mode 100644 index 0000000000000..1e9688e891228 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-av-load-store.hip @@ -0,0 +1,20 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu gfx950 -verify %s -fcuda-is-device +// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s + +// expected-no-diagnostics + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; + +__device__ void test_av_load_store_device(v4u32 *ptr, v4u32 data) { + v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0); + __builtin_amdgcn_av_store_b128(ptr, data, 0); +} + +__global__ void test_av_load_store_kernel(v4u32 *ptr, v4u32 data) { + v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0); + __builtin_amdgcn_av_store_b128(ptr, data, 0); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl new file mode 100644 index 0000000000000..b2f7b46547632 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; +typedef v4u32 __private *private_ptr_to_v4u32; + +void test_amdgcn_av_store_b128_bad_ptr(private_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}} +} + +void test_amdgcn_av_store_b128_bad_scope(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, 42); 
//expected-error{{synchronization scope argument to atomic operation is invalid}} +} + +v4u32 test_amdgcn_av_load_b128_bad_ptr(private_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}} +} + +v4u32 test_amdgcn_av_load_b128_bad_scope(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, 42); //expected-error{{synchronization scope argument to atomic operation is invalid}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl new file mode 100644 index 0000000000000..9a61513cdc05b --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl @@ -0,0 +1,24 @@ +// RUN: split-file %s %t +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -S -verify -o - %t/load.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -S -verify -o - %t/load.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -S -verify -o - %t/load.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -S -verify -o - %t/store.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -S -verify -o - %t/store.cl +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -S -verify -o - %t/store.cl +// REQUIRES: amdgpu-registered-target + +//--- load.cl +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; + +v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature flat-global-insts}} +} + +//--- store.cl +typedef __attribute__((__vector_size__(4 * sizeof(unsigned 
int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; + +void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature flat-global-insts}} +} diff --git a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp index 756b7c2154ca2..24e6ece329c4c 100644 --- a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp +++ b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp @@ -226,6 +226,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; @@ -280,6 +281,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; @@ -313,6 +315,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; @@ -354,6 +357,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; @@ -391,6 +395,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; 
Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["image-insts"] = true; @@ -427,6 +432,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dpp"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx10-insts"] = true; Features["image-insts"] = true; Features["s-memrealtime"] = true; @@ -486,6 +492,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, Features["dot7-insts"] = true; Features["dot10-insts"] = true; Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["gfx8-insts"] = true; Features["16-bit-insts"] = true; Features["dpp"] = true; @@ -532,6 +539,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, case GK_GFX900: case GK_GFX9_GENERIC: Features["gfx9-insts"] = true; + Features["flat-global-insts"] = true; Features["vmem-to-lds-load-insts"] = true; [[fallthrough]]; case GK_GFX810: _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
