[clang] 8b02238 - [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128 (#199176)

via cfe-commits Tue, 09 Jun 2026 05:40:04 -0700

Author: Sameer Sahasrabuddhe
Date: 2026-06-09T12:39:50Z
New Revision: 8b0223858b1e15a4310de18445d972e7648b740f


URL: 
https://github.com/llvm/llvm-project/commit/8b0223858b1e15a4310de18445d972e7648b740f
DIFF: 
https://github.com/llvm/llvm-project/commit/8b0223858b1e15a4310de18445d972e7648b740f.diff

LOG: [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128 (#199176)

These builtins allow the program to request store-available and
load-visible accesses as described in #191246. Each of them takes a
__MEMORY_SCOPE_* operand that is then translated to target-specific
cache policy bits.

This patch was extracted from #172090.

Co-authored-by: macurtis-amd <[email protected]>
Assisted-by: Claude Opus 4.6

---------

Co-authored-by: macurtis-amd <[email protected]>

Added: 
    clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
    clang/test/SemaHIP/amdgpu-av-load-store.hip
    clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
    clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl

Modified: 
    clang/docs/LanguageExtensions.rst
    clang/include/clang/Basic/BuiltinsAMDGPU.td
    clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
    clang/include/clang/Sema/SemaAMDGPU.h
    clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
    clang/lib/Sema/SemaAMDGPU.cpp
    clang/test/CodeGen/amdgpu-builtin-is-invocable.c
    clang/test/CodeGen/amdgpu-builtin-processor-is.c
    clang/test/CodeGen/link-builtin-bitcode.c
    clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
    llvm/lib/TargetParser/AMDGPUTargetParser.cpp

Removed: 
    


################################################################################
diff  --git a/clang/docs/LanguageExtensions.rst 
b/clang/docs/LanguageExtensions.rst
index fbb9947f39d3e..9fa39c09b1f6c 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5261,6 +5261,8 @@ builtin function, and are named with a ``__opencl_`` 
prefix. The macros
 and ``__OPENCL_MEMORY_SCOPE_SUB_GROUP`` are provided, with values
 corresponding to the enumerators of OpenCL's ``memory_scope`` enumeration.)
 
+.. _langext-__scoped_atomic:
+
 __scoped_atomic builtins
 ------------------------
 
@@ -5756,6 +5758,32 @@ returns the bit at the position of the current lane. It 
is almost equivalent to
 ``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if
 the given mask has the same value for all active lanes of the current wave.
 
+
+__builtin_amdgcn_av_{load,store}_b128
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Signature:
+
+.. code-block:: c
+
+    typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) 
unsigned int v4u;
+
+    v4u __builtin_amdgcn_av_load_b128(v4u *src, int scope);
+
+    void __builtin_amdgcn_av_store_b128(v4u *dst, v4u data, int scope);
+
+Load or store a vector of 4 unsigned integers from or to memory with cache
+behavior specified by ``scope``, which is one of the ``__MEMORY_SCOPE_*`` 
macros
+defined for :ref:`scoped atomic builtins<langext-__c11_atomic>`.
+
+The pointer argument must point to the global or generic address space.
+
+These builtins are supported on gfx9, gfx10, gfx11, and gfx12 targets.
+
+They map to the LLVM intrinsics ``llvm.amdgcn.av.load.b128`` and
+``llvm.amdgcn.av.store.b128`` documented in `User Guide for AMDGPU Backend
+<https://llvm.org/docs/AMDGPUUsage.html>`_.
+
 ARM/AArch64 Language Extensions
 -------------------------------
 

diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td 
b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index d8020bdcc8458..8eed188b0f4b2 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -280,6 +280,21 @@ def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : 
AMDGPUBuiltin<"void(__amdgp
 def __builtin_amdgcn_struct_ptr_buffer_load_lds : 
AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant 
unsigned int, int, int, int, _Constant int, _Constant int)", [], 
"vmem-to-lds-load-insts">;
 def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : 
AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant 
unsigned int, int, int, int, _Constant int, _Constant int)", [], 
"vmem-to-lds-load-insts">;
 
+//===----------------------------------------------------------------------===//
+// Global Available/Visible memory accesses.
+//===----------------------------------------------------------------------===//
+
+def __builtin_amdgcn_av_load_b128
+    : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> 
*, int)", [], "flat-global-insts"> {
+  let Documentation = [DocAVLoadB128];
+  let ArgNames = ["ptr", "scope"];
+}
+def __builtin_amdgcn_av_store_b128
+    : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, 
unsigned int>, int)", [], "flat-global-insts"> {
+  let Documentation = [DocAVStoreB128];
+  let ArgNames = ["ptr", "data", "scope"];
+}
+
 
//===----------------------------------------------------------------------===//
 // Async mark builtins.
 
//===----------------------------------------------------------------------===//

diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td 
b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
index a92b85d75d902..293431c5de7e8 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
@@ -573,6 +573,35 @@ WMMA with per-operand scale factors applied during the 
computation.
 }];
 }
 
+//===----------------------------------------------------------------------===//
+// Global Available/Visible Memory Access Builtins
+//===----------------------------------------------------------------------===//
+
+def DocCatAVLoadStore : DocumentationCategory<"Available/Visible Memory Access 
Builtins"> {
+  let Content = [{
+These builtins perform 128-bit global or flat memory loads and stores with
+available/visible (AV) semantics.
+}];
+}
+
+def DocAVLoadB128 : Documentation {
+  let Category = DocCatAVLoadStore;
+  let Content = [{
+Loads 128 bits (4 x i32) from the pointer ``ptr``. The pointer must be in
+the global or generic address space. The ``scope`` argument specifies the
+synchronization scope using a ``__MEMORY_SCOPE_*`` constant.
+}];
+}
+
+def DocAVStoreB128 : Documentation {
+  let Category = DocCatAVLoadStore;
+  let Content = [{
+Stores 128 bits (4 x i32) of ``data`` to the pointer ``ptr``. The pointer
+must be in the global or generic address space. The ``scope`` argument
+specifies the synchronization scope using a ``__MEMORY_SCOPE_*`` constant.
+}];
+}
+
 
//===----------------------------------------------------------------------===//
 // Wave Data Exchange Builtins
 
//===----------------------------------------------------------------------===//

diff  --git a/clang/include/clang/Sema/SemaAMDGPU.h 
b/clang/include/clang/Sema/SemaAMDGPU.h
index d520f3df544f4..a6205534e0de3 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -38,6 +38,7 @@ class SemaAMDGPU : public SemaBase {
   bool checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, bool MayStore);
 
   bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore);
+  bool checkAVLoadStore(CallExpr *TheCall, bool IsStore);
   bool checkAtomicMonitorLoad(CallExpr *TheCall);
 
   bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,

diff  --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index a88dbb71b3ddf..21f32b12c4fd1 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1009,6 +1009,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
     return Builder.CreateCall(F, {Args});
   }
+  case AMDGPU::BI__builtin_amdgcn_av_load_b128:
+  case AMDGPU::BI__builtin_amdgcn_av_store_b128: {
+    const bool IsStore = BuiltinID == AMDGPU::BI__builtin_amdgcn_av_store_b128;
+    SmallVector<Value *, 5> Args = {EmitScalarExpr(E->getArg(0))}; // addr
+    if (IsStore)
+      Args.push_back(EmitScalarExpr(E->getArg(1))); // data
+    const unsigned ScopeIdx = E->getNumArgs() - 1;
+    auto *ScopeExpr =
+        cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(ScopeIdx)));
+    Args.push_back(emitScopeMD(*this, ScopeExpr->getZExtValue()));
+    llvm::Function *F =
+        CGM.getIntrinsic(IsStore ? Intrinsic::amdgcn_av_store_b128
+                                 : Intrinsic::amdgcn_av_load_b128,
+                         {Args[0]->getType()});
+    return Builder.CreateCall(F, Args);
+  }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
     Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                    {llvm::Type::getInt64Ty(getLLVMContext())});

diff  --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 60f74fd15226f..757cdfbf20819 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -149,6 +149,10 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6:
   case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6:
     return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 15);
+  case AMDGPU::BI__builtin_amdgcn_av_load_b128:
+    return checkAVLoadStore(TheCall, /*IsStore=*/false);
+  case AMDGPU::BI__builtin_amdgcn_av_store_b128:
+    return checkAVLoadStore(TheCall, /*IsStore=*/true);
   case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
   case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
   case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
@@ -482,6 +486,14 @@ static bool checkScopeAsInt(SemaAMDGPU &S, Expr *Scope) {
   return false;
 }
 
+bool SemaAMDGPU::checkAVLoadStore(CallExpr *TheCall, bool IsStore) {
+  if (checkGlobalOrFlatPointerArg(*this, TheCall))
+    return true;
+
+  Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1);
+  return checkScopeAsInt(*this, Scope);
+}
+
 bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
   bool Fail = checkGlobalOrFlatPointerArg(*this, TheCall);
 

diff  --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c 
b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
index 3f988cbdf7cee..8de4b0ec8bc20 100644
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -54,7 +54,7 @@ void foo() {
 // AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-cpu"="gfx1010" }
 // AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind 
memory(inaccessiblemem: write) }
 //.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts"
 }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts"
 }
 // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind }
 // AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind 
memory(inaccessiblemem: write) }
 //.

diff  --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c 
b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
index 308eec1d212a5..8dfbb55566598 100644
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -63,7 +63,7 @@ void foo() {
 //.
 // AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-cpu"="gfx1010" }
 //.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts"
 }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts"
 }
 // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind }
 // AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind 
memory(inaccessiblemem: write) }
 //.

diff  --git a/clang/test/CodeGen/link-builtin-bitcode.c 
b/clang/test/CodeGen/link-builtin-bitcode.c
index ed5abca7f34b0..8849ee735ce04 100644
--- a/clang/test/CodeGen/link-builtin-bitcode.c
+++ b/clang/test/CodeGen/link-builtin-bitcode.c
@@ -44,6 +44,6 @@ int bar() { return no_attr() + attr_in_target() + 
attr_not_in_target() + attr_in
 // CHECK-SAME: () #[[ATTR_INCOMPATIBLE:[0-9]+]] {
 
 // CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" }
-// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64"
 }
-// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64"
 }
-// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts"
 }
+// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-global-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64"
 }
+// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+flat-global-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64"
 }
+// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-global-insts,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts"
 }

diff  --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp 
b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
index dbca787e26da2..8252a511d6b44 100644
--- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
+++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
@@ -107,9 +107,9 @@ const B& f(A *a) {
 // CHECK: attributes #[[ATTR3]] = { nounwind }
 // CHECK: attributes #[[ATTR4]] = { noreturn }
 //.
-// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts"
 }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts"
 }
 // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind 
willreturn memory(read) }
-// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts"
 }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts"
 }
 // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR3]] = { nounwind }
 // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn }
 //.

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
new file mode 100644
index 0000000000000..63d7fcac16874
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl
@@ -0,0 +1,250 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --check-globals smart
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx900  -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx1010 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx1100 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx1200 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx1250 -emit-llvm -o - %s | FileCheck %s
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned 
int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+//------------------------------------------------------------------------------
+// Global Load
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_wave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata 
[[META7:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_wave(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_workgroup(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata 
[[META8:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_workgroup(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_device(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata 
[[META9:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_device(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_system(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata 
[[META10:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_system(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_single(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata 
[[META11:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_single(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_load_b128_cluster(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata 
[[META12:![0-9]+]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_cluster(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Global Store
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_wave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr 
addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_wave(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_workgroup(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr 
addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_workgroup(global_ptr_to_v4u32 ptr, v4u32 
data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_device(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr 
addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_device(global_ptr_to_v4u32 ptr, v4u32 data) 
{
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_system(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr 
addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_system(global_ptr_to_v4u32 ptr, v4u32 data) 
{
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_single(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr 
addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_single(global_ptr_to_v4u32 ptr, v4u32 data) 
{
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_global_store_b128_cluster(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p1(ptr 
addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_cluster(global_ptr_to_v4u32 ptr, v4u32 
data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Flat Load
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_wave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META7]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_wave(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_workgroup(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META8]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_workgroup(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_device(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META9]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_device(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_system(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META10]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_system(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_single(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META11]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_single(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_load_b128_cluster(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META12]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_flat_load_b128_cluster(v4u32 * ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR);
+}
+
+//------------------------------------------------------------------------------
+// Flat Store
+//------------------------------------------------------------------------------
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_wave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr 
[[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_wave(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_workgroup(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr 
[[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_workgroup(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_device(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr 
[[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_device(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_system(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr 
[[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_system(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_single(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr 
[[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_single(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE);
+}
+
+// CHECK-LABEL: @test_amdgcn_flat_store_b128_cluster(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.amdgcn.av.store.b128.p0(ptr 
[[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_flat_store_b128_cluster(v4u32 * ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR);
+}
+//.
+// CHECK: [[META7]] = !{!"wavefront"}
+// CHECK: [[META8]] = !{!"workgroup"}
+// CHECK: [[META9]] = !{!"agent"}
+// CHECK: [[META10]] = !{!""}
+// CHECK: [[META11]] = !{!"singlethread"}
+// CHECK: [[META12]] = !{!"cluster"}
+//.

diff  --git a/clang/test/SemaHIP/amdgpu-av-load-store.hip 
b/clang/test/SemaHIP/amdgpu-av-load-store.hip
new file mode 100644
index 0000000000000..1e9688e891228
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-av-load-store.hip
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu gfx950 -verify %s 
-fcuda-is-device
+// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned 
int v4u32;
+
+__device__ void test_av_load_store_device(v4u32 *ptr, v4u32 data) {
+  v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0);
+  __builtin_amdgcn_av_store_b128(ptr, data, 0);
+}
+
+__global__ void test_av_load_store_kernel(v4u32 *ptr, v4u32 data) {
+  v4u32 res = __builtin_amdgcn_av_load_b128(ptr, 0);
+  __builtin_amdgcn_av_store_b128(ptr, data, 0);
+}

diff  --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
new file mode 100644
index 0000000000000..b2f7b46547632
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx950         -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned 
int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+typedef v4u32 __private *private_ptr_to_v4u32;
+
+void test_amdgcn_av_store_b128_bad_ptr(private_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM);  
//expected-error{{builtin requires a global or generic pointer}}
+}
+
+void test_amdgcn_av_store_b128_bad_scope(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, 42);  
//expected-error{{synchronization scope argument to atomic operation is 
invalid}}
+}
+
+v4u32 test_amdgcn_av_load_b128_bad_ptr(private_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM);  
//expected-error{{builtin requires a global or generic pointer}}
+}
+
+v4u32 test_amdgcn_av_load_b128_bad_scope(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, 42);  
//expected-error{{synchronization scope argument to atomic operation is 
invalid}}
+}

diff  --git 
a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
new file mode 100644
index 0000000000000..9a61513cdc05b
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl
@@ -0,0 +1,24 @@
+// RUN: split-file %s %t
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx602 -S -verify -o - %t/load.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx705 -S -verify -o - %t/load.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx810 -S -verify -o - %t/load.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx602 -S -verify -o - %t/store.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx705 -S -verify -o - %t/store.cl
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu 
gfx810 -S -verify -o - %t/store.cl
+// REQUIRES: amdgpu-registered-target
+
+//--- load.cl
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned 
int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_av_load_b128(ptr, 0); // 
expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature 
flat-global-insts}}
+}
+
+//--- store.cl
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned 
int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_av_store_b128(ptr, data, 0); // 
expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature 
flat-global-insts}}
+}

diff  --git a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp 
b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
index 756b7c2154ca2..24e6ece329c4c 100644
--- a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
+++ b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
@@ -226,6 +226,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const 
Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["gfx11-insts"] = true;
@@ -280,6 +281,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const 
Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["gfx11-insts"] = true;
@@ -313,6 +315,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const 
Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["gfx11-insts"] = true;
@@ -354,6 +357,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const 
Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["gfx11-insts"] = true;
@@ -391,6 +395,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const 
Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["gfx10-3-insts"] = true;
     Features["image-insts"] = true;
@@ -427,6 +432,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const 
Triple &T,
     Features["dpp"] = true;
     Features["gfx8-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx10-insts"] = true;
     Features["image-insts"] = true;
     Features["s-memrealtime"] = true;
@@ -486,6 +492,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const 
Triple &T,
     Features["dot7-insts"] = true;
     Features["dot10-insts"] = true;
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["gfx8-insts"] = true;
     Features["16-bit-insts"] = true;
     Features["dpp"] = true;
@@ -532,6 +539,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const 
Triple &T,
   case GK_GFX900:
   case GK_GFX9_GENERIC:
     Features["gfx9-insts"] = true;
+    Features["flat-global-insts"] = true;
     Features["vmem-to-lds-load-insts"] = true;
     [[fallthrough]];
   case GK_GFX810:


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] 8b02238 - [AMDGPU][Clang] add __builtin_amdgcn_av_(load|store)_b128 (#199176)

Reply via email to