[clang] [Clang][AMDGPU] Add __builtin_amdgcn_s_buffer_load_<T> (PR #203352)

via cfe-commits Thu, 11 Jun 2026 10:56:52 -0700

https://github.com/macurtis-amd updated 
https://github.com/llvm/llvm-project/pull/203352


>From 953485cedcc86698e00119674783d81470291d98 Mon Sep 17 00:00:00 2001
From: Matthew Curtis <[email protected]>
Date: Thu, 11 Jun 2026 12:53:12 -0500
Subject: [PATCH] [Clang][AMDGPU] Add __builtin_amdgcn_s_buffer_load_<T>

Expose llvm.amdgcn.s.buffer.load intrinsic overloads for OpenCL via
__builtin_amdgcn_s_buffer_load_* builtins.

Function prototypes:

  <return-type>
  __builtin_amdgcn_s_buffer_load_<suffix>(
      v4i32 rsrc,  // 4-dword buffer resource in SGPRs
      int offset,  // byte offset from the base of the buffer
      int aux);    // cache-policy, control flags. Must be compile-time const.

Where:

  return-type       suffix
  ----------------  ------
  char              i8
  unsigned char     u8
  short             i16
  unsigned short    u16
  int               i32
  half              f16
  float             f32

  v2i8              v2i8
  v3i8              v3i8
  v4i8              v4i8

  v2i32             v2i32
  v3i32             v3i32
  v4i32             v4i32
  v8i32             v8i32
  v16i32            v16i32

  v2f16             v2f16
  v3f16             v3f16
  v4f16             v4f16

  v2f32             v2f32
  v3f32             v3f32
  v4f32             v4f32
  v8f32             v8f32
  v16f32            v16f32
---
 clang/docs/AMDGPUSupport.rst                  |   7 +
 clang/include/clang/Basic/BuiltinsAMDGPU.td   |  30 +++
 .../include/clang/Basic/BuiltinsAMDGPUDocs.td |  39 +++
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  33 +++
 .../builtins-amdgcn-s-buffer-load.cl          | 238 ++++++++++++++++++
 .../builtins-amdgcn-s-buffer-load-error.cl    | 125 +++++++++
 6 files changed, 472 insertions(+)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-s-buffer-load.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-s-buffer-load-error.cl

diff --git a/clang/docs/AMDGPUSupport.rst b/clang/docs/AMDGPUSupport.rst
index 18e3de8abe92a..8ca537fa5d729 100644
--- a/clang/docs/AMDGPUSupport.rst
+++ b/clang/docs/AMDGPUSupport.rst
@@ -57,3 +57,10 @@ Predefined Macros
      - Defined if FP64 instruction is available (deprecated).
 
 Please note that the specific architecture and feature names will vary depending on the GPU. Also, some macros are deprecated and may be removed in future releases.
+
+
+Target-Specific Builtins
+========================
+
+Clang exposes AMDGPU hardware intrinsics as target-specific builtins with the
+``__builtin_amdgcn_`` prefix. These are documented in :doc:`AMDGPUBuiltinReference`.
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td 
b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 8eed188b0f4b2..b24a2403ecec2 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -242,6 +242,10 @@ def __builtin_amdgcn_qsad_pk_u16_u8 : 
AMDGPUBuiltin<"uint64_t(uint64_t, unsigned
 def __builtin_amdgcn_mqsad_pk_u16_u8 : AMDGPUBuiltin<"uint64_t(uint64_t, 
unsigned int, uint64_t)", [Const], "mqsad-pk-insts">;
 def __builtin_amdgcn_mqsad_u32_u8 : AMDGPUBuiltin<"_ExtVector<4, unsigned 
int>(uint64_t, unsigned int, _ExtVector<4, unsigned int>)", [Const], 
"mqsad-insts">;
 
+//===----------------------------------------------------------------------===//
+// Buffer builtins.
+//===----------------------------------------------------------------------===//
+
 def __builtin_amdgcn_make_buffer_rsrc : 
AMDGPUBuiltin<"__amdgpu_buffer_rsrc_t(void *, short, int64_t, int)", [Const]>;
 def __builtin_amdgcn_raw_buffer_store_b8 : AMDGPUBuiltin<"void(unsigned char, 
__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
 def __builtin_amdgcn_raw_buffer_store_b16 : AMDGPUBuiltin<"void(unsigned 
short, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
@@ -280,6 +284,32 @@ def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : 
AMDGPUBuiltin<"void(__amdgp
 def __builtin_amdgcn_struct_ptr_buffer_load_lds : 
AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant 
unsigned int, int, int, int, _Constant int, _Constant int)", [], 
"vmem-to-lds-load-insts">;
 def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : 
AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant 
unsigned int, int, int, int, _Constant int, _Constant int)", [], 
"vmem-to-lds-load-insts">;
 
+let Documentation = [DocSBufferLoad], ArgNames = ["rsrc", "offset", "aux"] in {
+def __builtin_amdgcn_s_buffer_load_i32 : AMDGPUBuiltin<"int(_Vector<4, int>, 
int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v2i32 : AMDGPUBuiltin<"_Vector<2, 
int>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v3i32 : AMDGPUBuiltin<"_Vector<3, 
int>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v4i32 : AMDGPUBuiltin<"_Vector<4, 
int>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v8i32 : AMDGPUBuiltin<"_Vector<8, 
int>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v16i32 : AMDGPUBuiltin<"_Vector<16, 
int>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_f32 : AMDGPUBuiltin<"float(_Vector<4, int>, 
int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v2f32 : AMDGPUBuiltin<"_Vector<2, 
float>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v3f32 : AMDGPUBuiltin<"_Vector<3, 
float>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v4f32 : AMDGPUBuiltin<"_Vector<4, 
float>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v8f32 : AMDGPUBuiltin<"_Vector<8, 
float>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v16f32 : AMDGPUBuiltin<"_Vector<16, 
float>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_i8 : AMDGPUBuiltin<"char(_Vector<4, int>, 
int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_u8 : AMDGPUBuiltin<"unsigned 
char(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_i16 : AMDGPUBuiltin<"short(_Vector<4, int>, 
int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_u16 : AMDGPUBuiltin<"unsigned 
short(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v2i8 : AMDGPUBuiltin<"_Vector<2, 
char>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v3i8 : AMDGPUBuiltin<"_Vector<3, 
char>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v4i8 : AMDGPUBuiltin<"_Vector<4, 
char>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_f16 : AMDGPUBuiltin<"_Float16(_Vector<4, 
int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v2f16 : AMDGPUBuiltin<"_Vector<2, 
_Float16>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v3f16 : AMDGPUBuiltin<"_Vector<3, 
_Float16>(_Vector<4, int>, int, _Constant int)", [Const]>;
+def __builtin_amdgcn_s_buffer_load_v4f16 : AMDGPUBuiltin<"_Vector<4, 
_Float16>(_Vector<4, int>, int, _Constant int)", [Const]>;
+}
+
 
//===----------------------------------------------------------------------===//
 // Global Available/Visible memory accesses.
 
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td 
b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
index 293431c5de7e8..d6bb56e08aa42 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
@@ -676,3 +676,42 @@ def DocTensorStoreFromLDS_GFX1250 : Documentation {
 Asynchronously copies a tensor from LDS into global memory.
 }];
 }
+
+//===----------------------------------------------------------------------===//
+// S-Buffer Load Builtins
+//===----------------------------------------------------------------------===//
+
+def DocCatSBufferLoad : DocumentationCategory<"S-Buffer Load Builtins"> {
+  let Content = [{
+These builtins lower to ``llvm.amdgcn.s.buffer.load`` and issue an
+``s_buffer_load`` when the byte offset is uniform across the wavefront.
+When the offset is divergent, the backend may lower to a ``buffer_load``.
+
+Unlike the ``__builtin_amdgcn_raw_buffer_load_*`` family, these builtins
+take the buffer resource as a 4-dword SGPR descriptor (``v4i32``) rather
+than ``__amdgpu_buffer_rsrc_t``.
+
+The return type selects the load width. Separate builtins are provided for
+each supported scalar and vector element type.
+}];
+}
+
+def DocSBufferLoad : Documentation {
+  let Category = DocCatSBufferLoad;
+  let Content = [{
+Loads data from a buffer using an SGPR buffer descriptor.
+
+- ``rsrc``: 4-dword buffer resource in SGPRs.
+- ``offset``: byte offset from the base of the buffer. May be variable; a
+  uniform offset enables ``s_buffer_load`` selection.
+- ``aux``: cache-policy and control flags. Must be a compile-time constant.
+  The encoding is target-dependent. Common fields include:
+
+  - Pre-GFX12: bit 0 = glc, bit 1 = slc, bit 2 = dlc (GFX10/GFX11), bit 3 =
+    swz, bit 4 = scc (GFX90a).
+  - GFX942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1.
+  - GFX12+: bits [0-2] = th, bits [3-4] = scope, bit 6 = swz.
+
+  The volatile bit is not permitted for this intrinsic.
+}];
+}
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 21f32b12c4fd1..b3f5322a93cb6 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -31,6 +31,15 @@ using namespace llvm;
 
 namespace {
 
+static Value *emitAMDGPUSBufferLoadBuiltin(CodeGenFunction &CGF,
+                                           const CallExpr *E) {
+  llvm::Type *RetTy = CGF.ConvertType(E->getType());
+  Function *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_s_buffer_load, RetTy);
+  return CGF.Builder.CreateCall(F, {CGF.EmitScalarExpr(E->getArg(0)),
+                                    CGF.EmitScalarExpr(E->getArg(1)),
+                                    CGF.EmitScalarExpr(E->getArg(2))});
+}
+
 // Has second type mangled argument.
 static Value *
 emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
@@ -2156,6 +2165,30 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64:
     return emitBuiltinWithOneOverloadedType<5>(
         *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax);
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_i32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v2i32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v3i32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v4i32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v8i32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v16i32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_f32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v2f32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v3f32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v4f32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v8f32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v16f32:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_i8:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_u8:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_i16:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_u16:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v2i8:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v3i8:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v4i8:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_f16:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v3f16:
+  case AMDGPU::BI__builtin_amdgcn_s_buffer_load_v4f16:
+    return emitAMDGPUSBufferLoadBuiltin(*this, E);
   case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
     return emitBuiltinWithOneOverloadedType<2>(
         *this, E, Intrinsic::amdgcn_s_prefetch_data);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-s-buffer-load.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-s-buffer-load.cl
new file mode 100644
index 0000000000000..0412c2cf4cc38
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-s-buffer-load.cl
@@ -0,0 +1,238 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm 
-o - %s | FileCheck %s
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+typedef int v4i32 __attribute__((ext_vector_type(4)));
+typedef int v2i32 __attribute__((ext_vector_type(2)));
+typedef int v3i32 __attribute__((ext_vector_type(3)));
+typedef int v8i32 __attribute__((ext_vector_type(8)));
+typedef int v16i32 __attribute__((ext_vector_type(16)));
+typedef float v2f32 __attribute__((ext_vector_type(2)));
+typedef float v3f32 __attribute__((ext_vector_type(3)));
+typedef float v4f32 __attribute__((ext_vector_type(4)));
+typedef float v8f32 __attribute__((ext_vector_type(8)));
+typedef float v16f32 __attribute__((ext_vector_type(16)));
+typedef char v2i8 __attribute__((ext_vector_type(2)));
+typedef char v3i8 __attribute__((ext_vector_type(3)));
+typedef char v4i8 __attribute__((ext_vector_type(4)));
+typedef half v2f16 __attribute__((ext_vector_type(2)));
+typedef half v3f16 __attribute__((ext_vector_type(3)));
+typedef half v4f16 __attribute__((ext_vector_type(4)));
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 
@llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int test_amdgcn_s_buffer_load_i32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_i32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_i32_non_const_offset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 
@llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 
0)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int test_amdgcn_s_buffer_load_i32_non_const_offset(v4i32 rsrc, int offset) {
+  return __builtin_amdgcn_s_buffer_load_i32(rsrc, offset, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v2i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> 
@llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <2 x i32> [[TMP0]]
+//
+v2i32 test_amdgcn_s_buffer_load_v2i32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v2i32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v3i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <3 x i32> 
@llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <3 x i32> [[TMP0]]
+//
+v3i32 test_amdgcn_s_buffer_load_v3i32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v3i32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v4i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> 
@llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4i32 test_amdgcn_s_buffer_load_v4i32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v4i32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v8i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <8 x i32> [[TMP0]]
+//
+v8i32 test_amdgcn_s_buffer_load_v8i32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v8i32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v16i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <16 x i32> 
@llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <16 x i32> [[TMP0]]
+//
+v16i32 test_amdgcn_s_buffer_load_v16i32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v16i32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call float 
@llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float test_amdgcn_s_buffer_load_f32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_f32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v2f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <2 x float> 
@llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <2 x float> [[TMP0]]
+//
+v2f32 test_amdgcn_s_buffer_load_v2f32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v2f32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v3f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <3 x float> 
@llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <3 x float> [[TMP0]]
+//
+v3f32 test_amdgcn_s_buffer_load_v3f32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v3f32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v4f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x float> 
@llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
+v4f32 test_amdgcn_s_buffer_load_v4f32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v4f32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v8f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <8 x float> [[TMP0]]
+//
+v8f32 test_amdgcn_s_buffer_load_v8f32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v8f32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v16f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <16 x float> 
@llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <16 x float> [[TMP0]]
+//
+v16f32 test_amdgcn_s_buffer_load_v16f32(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v16f32(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_i8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.amdgcn.s.buffer.load.i8(<4 
x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
+char test_amdgcn_s_buffer_load_i8(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_i8(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.amdgcn.s.buffer.load.i8(<4 
x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
+unsigned char test_amdgcn_s_buffer_load_u8(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_u8(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_i16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 
@llvm.amdgcn.s.buffer.load.i16(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
+short test_amdgcn_s_buffer_load_i16(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_i16(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 
@llvm.amdgcn.s.buffer.load.i16(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
+unsigned short test_amdgcn_s_buffer_load_u16(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_u16(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call half 
@llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret half [[TMP0]]
+//
+half test_amdgcn_s_buffer_load_f16(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_f16(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v2f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <2 x half> 
@llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <2 x half> [[TMP0]]
+//
+v2f16 test_amdgcn_s_buffer_load_v2f16(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v2f16(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v3f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <3 x half> 
@llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <3 x half> [[TMP0]]
+//
+v3f16 test_amdgcn_s_buffer_load_v3f16(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v3f16(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v4f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x half> 
@llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <4 x half> [[TMP0]]
+//
+v4f16 test_amdgcn_s_buffer_load_v4f16(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v4f16(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v2i8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <2 x i8> 
@llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <2 x i8> [[TMP0]]
+//
+v2i8 test_amdgcn_s_buffer_load_v2i8(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v2i8(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v3i8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <3 x i8> 
@llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <3 x i8> [[TMP0]]
+//
+v3i8 test_amdgcn_s_buffer_load_v3i8(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v3i8(rsrc, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_amdgcn_s_buffer_load_v4i8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i8> 
@llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> [[RSRC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    ret <4 x i8> [[TMP0]]
+//
+v4i8 test_amdgcn_s_buffer_load_v4i8(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_v4i8(rsrc, /*offset=*/0, /*aux=*/0);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-s-buffer-load-error.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-s-buffer-load-error.cl
new file mode 100644
index 0000000000000..0ef220872b449
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-s-buffer-load-error.cl
@@ -0,0 +1,125 @@
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -S -verify 
-o - %s
+// REQUIRES: amdgpu-registered-target
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+typedef int v4i32 __attribute__((ext_vector_type(4)));
+typedef int v2i32 __attribute__((ext_vector_type(2)));
+typedef int v3i32 __attribute__((ext_vector_type(3)));
+typedef int v8i32 __attribute__((ext_vector_type(8)));
+typedef int v16i32 __attribute__((ext_vector_type(16)));
+typedef float v2f32 __attribute__((ext_vector_type(2)));
+typedef float v3f32 __attribute__((ext_vector_type(3)));
+typedef float v4f32 __attribute__((ext_vector_type(4)));
+typedef float v8f32 __attribute__((ext_vector_type(8)));
+typedef float v16f32 __attribute__((ext_vector_type(16)));
+typedef char v2i8 __attribute__((ext_vector_type(2)));
+typedef char v3i8 __attribute__((ext_vector_type(3)));
+typedef char v4i8 __attribute__((ext_vector_type(4)));
+typedef half v2f16 __attribute__((ext_vector_type(2)));
+typedef half v3f16 __attribute__((ext_vector_type(3)));
+typedef half v4f16 __attribute__((ext_vector_type(4)));
+
+int test_amdgcn_s_buffer_load_i32_non_const_aux(v4i32 rsrc, int offset, int 
aux) {
+  return __builtin_amdgcn_s_buffer_load_i32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_i32' must be a 
constant integer}}
+}
+
+v2i32 test_amdgcn_s_buffer_load_v2i32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v2i32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v2i32' must be a 
constant integer}}
+}
+
+v3i32 test_amdgcn_s_buffer_load_v3i32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v3i32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v3i32' must be a 
constant integer}}
+}
+
+v4i32 test_amdgcn_s_buffer_load_v4i32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v4i32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v4i32' must be a 
constant integer}}
+}
+
+v8i32 test_amdgcn_s_buffer_load_v8i32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v8i32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v8i32' must be a 
constant integer}}
+}
+
+v16i32 test_amdgcn_s_buffer_load_v16i32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v16i32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v16i32' must be a 
constant integer}}
+}
+
+float test_amdgcn_s_buffer_load_f32_non_const_aux(v4i32 rsrc, int offset, int 
aux) {
+  return __builtin_amdgcn_s_buffer_load_f32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_f32' must be a 
constant integer}}
+}
+
+v2f32 test_amdgcn_s_buffer_load_v2f32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v2f32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v2f32' must be a 
constant integer}}
+}
+
+v3f32 test_amdgcn_s_buffer_load_v3f32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v3f32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v3f32' must be a 
constant integer}}
+}
+
+v4f32 test_amdgcn_s_buffer_load_v4f32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v4f32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v4f32' must be a 
constant integer}}
+}
+
+v8f32 test_amdgcn_s_buffer_load_v8f32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v8f32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v8f32' must be a 
constant integer}}
+}
+
+v16f32 test_amdgcn_s_buffer_load_v16f32_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v16f32(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v16f32' must be a 
constant integer}}
+}
+
+char test_amdgcn_s_buffer_load_i8_non_const_aux(v4i32 rsrc, int offset, int 
aux) {
+  return __builtin_amdgcn_s_buffer_load_i8(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_i8' must be a 
constant integer}}
+}
+
+unsigned char test_amdgcn_s_buffer_load_u8_non_const_aux(v4i32 rsrc, int 
offset, int aux) {
+  return __builtin_amdgcn_s_buffer_load_u8(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_u8' must be a 
constant integer}}
+}
+
+short test_amdgcn_s_buffer_load_i16_non_const_aux(v4i32 rsrc, int offset, int 
aux) {
+  return __builtin_amdgcn_s_buffer_load_i16(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_i16' must be a 
constant integer}}
+}
+
+unsigned short test_amdgcn_s_buffer_load_u16_non_const_aux(v4i32 rsrc, int 
offset, int aux) {
+  return __builtin_amdgcn_s_buffer_load_u16(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_u16' must be a 
constant integer}}
+}
+
+v2i8 test_amdgcn_s_buffer_load_v2i8_non_const_aux(v4i32 rsrc, int offset, int 
aux) {
+  return __builtin_amdgcn_s_buffer_load_v2i8(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v2i8' must be a 
constant integer}}
+}
+
+v3i8 test_amdgcn_s_buffer_load_v3i8_non_const_aux(v4i32 rsrc, int offset, int 
aux) {
+  return __builtin_amdgcn_s_buffer_load_v3i8(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v3i8' must be a 
constant integer}}
+}
+
+v4i8 test_amdgcn_s_buffer_load_v4i8_non_const_aux(v4i32 rsrc, int offset, int 
aux) {
+  return __builtin_amdgcn_s_buffer_load_v4i8(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v4i8' must be a 
constant integer}}
+}
+
+half test_amdgcn_s_buffer_load_f16_non_const_aux(v4i32 rsrc, int offset, int 
aux) {
+  return __builtin_amdgcn_s_buffer_load_f16(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_f16' must be a 
constant integer}}
+}
+
+v2f16 test_amdgcn_s_buffer_load_v2f16_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v2f16(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v2f16' must be a 
constant integer}}
+}
+
+v3f16 test_amdgcn_s_buffer_load_v3f16_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v3f16(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v3f16' must be a 
constant integer}}
+}
+
+v4f16 test_amdgcn_s_buffer_load_v4f16_non_const_aux(v4i32 rsrc, int offset, 
int aux) {
+  return __builtin_amdgcn_s_buffer_load_v4f16(rsrc, offset, aux); 
//expected-error{{argument to '__builtin_amdgcn_s_buffer_load_v4f16' must be a 
constant integer}}
+}
+
+int test_amdgcn_s_buffer_load_i32_too_few_args(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_i32(rsrc, 0); //expected-error{{too 
few arguments to function call, expected 3, have 2}}
+}
+
+int test_amdgcn_s_buffer_load_i32_too_many_args(v4i32 rsrc) {
+  return __builtin_amdgcn_s_buffer_load_i32(rsrc, 0, 0, 0); 
//expected-error{{too many arguments to function call, expected 3, have 4}}
+}
+
+int test_amdgcn_s_buffer_load_i32_wrong_rsrc_type(__amdgpu_buffer_rsrc_t rsrc) 
{
+  return __builtin_amdgcn_s_buffer_load_i32(rsrc, 0, 0); 
//expected-error{{passing '__private __amdgpu_buffer_rsrc_t' to parameter of 
incompatible type '__attribute__((__vector_size__(4 * sizeof(int)))) int' 
(vector of 4 'int' values)}}
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [Clang][AMDGPU] Add __builtin_amdgcn_s_buffer_load_<T> (PR #203352)

Reply via email to