[clang] [Clang][HIP] Add LLVM vendored device headers for `*-llvm` triple (PR #203980)

Joseph Huber via cfe-commits Mon, 15 Jun 2026 12:46:39 -0700

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/203980


>From 8ec209d5f9452dcf7666777a44db56bb55fe478a Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Mon, 15 Jun 2026 11:47:19 -0500
Subject: [PATCH] [Clang][HIP] Add LLVM vendored device headers for `*-llvm`
 triple

Summary:
We already have the `*-llvm` vendor triple to imply a hermetic LLVM
toolchain for HIP. This adds the device portion of the headers that can
be used without a ROCm installation. These are vendored just like the
CUDA ones. Despite being HIP, these are actually generic because it onyl
uses clang builtins and the `gpuintrin` shims.
---
 clang/lib/Basic/Targets/AMDGPU.cpp            |   4 +
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |   7 +
 clang/lib/Headers/CMakeLists.txt              |   3 +
 clang/lib/Headers/__clang_hip_builtin_vars.h  |  57 +++
 .../Headers/__clang_hip_device_functions.h    | 357 ++++++++++++++++++
 clang/lib/Headers/__clang_hip_intrinsics.h    | 240 ++++++++++++
 .../Headers/__clang_hip_device_functions.hip  |  76 ++++
 7 files changed, 744 insertions(+)
 create mode 100644 clang/lib/Headers/__clang_hip_builtin_vars.h
 create mode 100644 clang/lib/Headers/__clang_hip_device_functions.h
 create mode 100644 clang/lib/Headers/__clang_hip_intrinsics.h
 create mode 100644 clang/test/Headers/__clang_hip_device_functions.hip

diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp 
b/clang/lib/Basic/Targets/AMDGPU.cpp
index bfa956fa9a4e3..906601850e353 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -252,6 +252,10 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   else
     Builder.defineMacro("__R600__");
 
+  // The 'llvm' environment selects the upstream headers.
+  if (getTriple().getEnvironment() == llvm::Triple::LLVM)
+    Builder.defineMacro("__AMDGCN_LLVM__");
+
   // TODO: __HAS_FMAF__, __HAS_LDEXPF__, __HAS_FP64__ are deprecated and will 
be
   // removed in the near future.
   if (hasFMAF())
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp 
b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 01cb23d0aa230..117cb013fe60c 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -331,6 +331,13 @@ void HIPAMDToolChain::AddIAMCUIncludeArgs(const ArgList 
&Args,
 
 void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args) const {
+  if (getTriple().getEnvironment() == llvm::Triple::LLVM) {
+    if (!DriverArgs.hasArg(options::OPT_nohipwrapperinc) &&
+        !DriverArgs.hasArg(options::OPT_nobuiltininc))
+      CC1Args.append({"-include", "__clang_hip_device_functions.h"});
+    return;
+  }
+
   RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args);
 }
 
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 439f2725168ba..4a60af68c1d23 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -86,6 +86,9 @@ set(hip_files
   __clang_hip_math.h
   __clang_hip_stdlib.h
   __clang_hip_runtime_wrapper.h
+  __clang_hip_builtin_vars.h
+  __clang_hip_device_functions.h
+  __clang_hip_intrinsics.h
   )
 
 set(hlsl_h
diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h 
b/clang/lib/Headers/__clang_hip_builtin_vars.h
new file mode 100644
index 0000000000000..f2e53ef93b38a
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_builtin_vars.h
@@ -0,0 +1,57 @@
+//===---- __clang_hip_builtin_vars.h - HIP built-in variables --------------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_BUILTIN_VARS_H__
+#define __CLANG_HIP_BUILTIN_VARS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#include <gpuintrin.h>
+
+// The warpSize is a runtime value rather than a compile-time constant.
+inline __attribute__((device)) const struct {
+  __attribute__((device, always_inline, const)) operator int() const noexcept {
+    return __gpu_num_lanes();
+  }
+} warpSize{};
+
+#pragma push_macro("__HIP_COORD_BUILTIN")
+#define __HIP_COORD_BUILTIN(__tag, __fx, __fy, __fz)                           
\
+  struct __tag {                                                               
\
+    __declspec(property(get = __get_x)) unsigned int x;                        
\
+    __declspec(property(get = __get_y)) unsigned int y;                        
\
+    __declspec(property(get = __get_z)) unsigned int z;                        
\
+    __attribute__((device, always_inline)) unsigned int __get_x() const {      
\
+      return __fx;                                                             
\
+    }                                                                          
\
+    __attribute__((device, always_inline)) unsigned int __get_y() const {      
\
+      return __fy;                                                             
\
+    }                                                                          
\
+    __attribute__((device, always_inline)) unsigned int __get_z() const {      
\
+      return __fz;                                                             
\
+    }                                                                          
\
+  }
+
+__HIP_COORD_BUILTIN(__hip_builtin_threadIdx_t, __gpu_thread_id_x(),
+                    __gpu_thread_id_y(), __gpu_thread_id_z());
+__HIP_COORD_BUILTIN(__hip_builtin_blockIdx_t, __gpu_block_id_x(),
+                    __gpu_block_id_y(), __gpu_block_id_z());
+__HIP_COORD_BUILTIN(__hip_builtin_blockDim_t, __gpu_num_threads_x(),
+                    __gpu_num_threads_y(), __gpu_num_threads_z());
+__HIP_COORD_BUILTIN(__hip_builtin_gridDim_t, __gpu_num_blocks_x(),
+                    __gpu_num_blocks_y(), __gpu_num_blocks_z());
+
+#pragma pop_macro("__HIP_COORD_BUILTIN")
+
+extern const __attribute__((device, weak)) __hip_builtin_threadIdx_t threadIdx;
+extern const __attribute__((device, weak)) __hip_builtin_blockIdx_t blockIdx;
+extern const __attribute__((device, weak)) __hip_builtin_blockDim_t blockDim;
+extern const __attribute__((device, weak)) __hip_builtin_gridDim_t gridDim;
+
+#endif // __HIP__ && (defined(__AMDGPU__) || defined(__SPIRV__))
+#endif // __CLANG_HIP_BUILTIN_VARS_H__
diff --git a/clang/lib/Headers/__clang_hip_device_functions.h 
b/clang/lib/Headers/__clang_hip_device_functions.h
new file mode 100644
index 0000000000000..fb95c3116813d
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_device_functions.h
@@ -0,0 +1,357 @@
+//===---- __clang_hip_device_functions.h - HIP device functions ------------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_DEVICE_FUNCTIONS_H__
+#define __CLANG_HIP_DEVICE_FUNCTIONS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#ifndef __device__
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#define __managed__ __attribute__((managed))
+#endif
+
+#include <gpuintrin.h>
+
+#pragma push_macro("__HIP_DEVICE__")
+#define __HIP_DEVICE__ static __inline__ __attribute__((device, always_inline))
+
+#pragma push_macro("MAYBE_UNDEF")
+#define MAYBE_UNDEF __attribute__((maybe_undef))
+
+// warpSize and the threadIdx/blockIdx/blockDim/gridDim coordinate variables.
+#include <__clang_hip_builtin_vars.h>
+
+//===----------------------------------------------------------------------===//
+// Integer intrinsics.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ unsigned int __popc(unsigned int __x) {
+  return __builtin_popcountg(__x);
+}
+__HIP_DEVICE__ unsigned int __popcll(unsigned long long __x) {
+  return __builtin_popcountg(__x);
+}
+
+__HIP_DEVICE__ int __clz(int __x) {
+  return __builtin_clzg((unsigned int)__x, 32);
+}
+__HIP_DEVICE__ int __clzll(long long __x) {
+  return __builtin_clzg((unsigned long long)__x, 64);
+}
+
+__HIP_DEVICE__ int __ffs(int __x) {
+  return __builtin_ctzg((unsigned int)__x, -1) + 1;
+}
+__HIP_DEVICE__ int __ffs(unsigned int __x) {
+  return __builtin_ctzg(__x, -1) + 1;
+}
+__HIP_DEVICE__ int __ffsll(long long __x) {
+  return __builtin_ctzg((unsigned long long)__x, -1) + 1;
+}
+__HIP_DEVICE__ int __ffsll(unsigned long long __x) {
+  return __builtin_ctzg(__x, -1) + 1;
+}
+
+__HIP_DEVICE__ unsigned int __brev(unsigned int __x) {
+  return __builtin_elementwise_bitreverse(__x);
+}
+__HIP_DEVICE__ unsigned long long __brevll(unsigned long long __x) {
+  return __builtin_elementwise_bitreverse(__x);
+}
+
+__HIP_DEVICE__ int __mul24(int __x, int __y) {
+  return (((int)((unsigned)__x << 8) >> 8)) *
+         (((int)((unsigned)__y << 8) >> 8));
+}
+__HIP_DEVICE__ int __umul24(unsigned int __x, unsigned int __y) {
+  return (int)((__x & 0x00ffffffu) * (__y & 0x00ffffffu));
+}
+
+__HIP_DEVICE__ int __mulhi(int __x, int __y) {
+  return (int)(((long long)__x * (long long)__y) >> 32);
+}
+__HIP_DEVICE__ unsigned int __umulhi(unsigned int __x, unsigned int __y) {
+  return (unsigned int)(((unsigned long long)__x * (unsigned long long)__y) >>
+                        32);
+}
+__HIP_DEVICE__ long long __mul64hi(long long __x, long long __y) {
+  return (long long)(((__int128)__x * (__int128)__y) >> 64);
+}
+__HIP_DEVICE__ unsigned long long __umul64hi(unsigned long long __x,
+                                             unsigned long long __y) {
+  return (
+      unsigned long long)(((unsigned __int128)__x * (unsigned __int128)__y) >>
+                          64);
+}
+
+__HIP_DEVICE__ unsigned int __sad(int __x, int __y, unsigned int __z) {
+  return __x > __y ? __x - __y + __z : __y - __x + __z;
+}
+__HIP_DEVICE__ unsigned int __usad(unsigned int __x, unsigned int __y,
+                                   unsigned int __z) {
+  return __x > __y ? __x - __y + __z : __y - __x + __z;
+}
+
+__HIP_DEVICE__ int __hadd(int __x, int __y) {
+  return (int)(((long long)__x + (long long)__y) >> 1);
+}
+__HIP_DEVICE__ int __rhadd(int __x, int __y) {
+  return (int)(((long long)__x + (long long)__y + 1) >> 1);
+}
+__HIP_DEVICE__ unsigned int __uhadd(unsigned int __x, unsigned int __y) {
+  return (unsigned int)(((unsigned long long)__x + (unsigned long long)__y) >>
+                        1);
+}
+__HIP_DEVICE__ unsigned int __urhadd(unsigned int __x, unsigned int __y) {
+  return (
+      unsigned int)(((unsigned long long)__x + (unsigned long long)__y + 1) >>
+                    1);
+}
+
+__HIP_DEVICE__ unsigned int __byte_perm(unsigned int __x, unsigned int __y,
+                                        unsigned int __s) {
+  unsigned long long __tmp = ((unsigned long long)__y << 32) | __x;
+  unsigned int __result = 0;
+  for (int __i = 0; __i < 4; ++__i) {
+    unsigned int __sel = (__s >> (__i * 4)) & 0x7u;
+    __result |= (unsigned int)((__tmp >> (__sel * 8)) & 0xffu) << (__i * 8);
+  }
+  return __result;
+}
+
+//===----------------------------------------------------------------------===//
+// Bitfield operations.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ unsigned int __lastbit_u32_u64(unsigned long long __x) {
+  return (unsigned int)__builtin_ctzg(__x, -1);
+}
+
+__HIP_DEVICE__ unsigned int __bitextract_u32(unsigned int __src,
+                                             unsigned int __offset,
+                                             unsigned int __width) {
+  unsigned int __o = __offset & 31u;
+  unsigned int __w = __width & 31u;
+  return __w == 0 ? 0u : (__src << (32u - __o - __w)) >> (32u - __w);
+}
+__HIP_DEVICE__ unsigned long long __bitextract_u64(unsigned long long __src,
+                                                   unsigned int __offset,
+                                                   unsigned int __width) {
+  unsigned long long __o = __offset & 63u;
+  unsigned long long __w = __width & 63u;
+  return __w == 0 ? 0ull : (__src << (64ull - __o - __w)) >> (64ull - __w);
+}
+
+__HIP_DEVICE__ unsigned int __bitinsert_u32(unsigned int __dst,
+                                            unsigned int __src,
+                                            unsigned int __offset,
+                                            unsigned int __width) {
+  unsigned int __o = __offset & 31u;
+  unsigned int __mask = (1u << (__width & 31u)) - 1u;
+  return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o);
+}
+__HIP_DEVICE__ unsigned long long __bitinsert_u64(unsigned long long __dst,
+                                                  unsigned long long __src,
+                                                  unsigned int __offset,
+                                                  unsigned int __width) {
+  unsigned long long __o = __offset & 63u;
+  unsigned long long __mask = (1ull << (__width & 63u)) - 1ull;
+  return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o);
+}
+
+//===----------------------------------------------------------------------===//
+// Type punning.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ int __float_as_int(float __x) {
+  return __builtin_bit_cast(int, __x);
+}
+__HIP_DEVICE__ unsigned int __float_as_uint(float __x) {
+  return __builtin_bit_cast(unsigned int, __x);
+}
+__HIP_DEVICE__ float __int_as_float(int __x) {
+  return __builtin_bit_cast(float, __x);
+}
+__HIP_DEVICE__ float __uint_as_float(unsigned int __x) {
+  return __builtin_bit_cast(float, __x);
+}
+__HIP_DEVICE__ long long __double_as_longlong(double __x) {
+  return __builtin_bit_cast(long long, __x);
+}
+__HIP_DEVICE__ double __longlong_as_double(long long __x) {
+  return __builtin_bit_cast(double, __x);
+}
+__HIP_DEVICE__ int __double2hiint(double __x) {
+  return (int)(__builtin_bit_cast(unsigned long long, __x) >> 32);
+}
+__HIP_DEVICE__ int __double2loint(double __x) {
+  return (int)__builtin_bit_cast(unsigned long long, __x);
+}
+__HIP_DEVICE__ double __hiloint2double(int __hi, int __lo) {
+  return __builtin_bit_cast(double,
+                            ((unsigned long long)(unsigned int)__hi << 32) |
+                                (unsigned long long)(unsigned int)__lo);
+}
+
+//===----------------------------------------------------------------------===//
+// Wavefront vote and lane identity.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ unsigned int __lane_id(void) { return __gpu_lane_id(); }
+
+__HIP_DEVICE__ unsigned long long __ballot(int __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred);
+}
+__HIP_DEVICE__ unsigned long long __ballot64(int __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred);
+}
+__HIP_DEVICE__ unsigned long long __activemask(void) {
+  return __gpu_ballot(__gpu_lane_mask(), 1);
+}
+
+__HIP_DEVICE__ int __all(int __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred) == __gpu_lane_mask();
+}
+__HIP_DEVICE__ int __any(int __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred) != 0ull;
+}
+
+template <typename __T>
+__HIP_DEVICE__ int __hip_fns_impl(__T __mask, unsigned int __base,
+                                  int __offset) {
+  const int __bits = (int)sizeof(__T) * 8;
+  __T __m = __mask;
+  int __off = __offset;
+  if (__offset == 0) {
+    __m &= ((__T)1 << __base);
+    __off = 1;
+  } else if (__offset < 0) {
+    __m = __builtin_elementwise_bitreverse(__mask);
+    __base = (unsigned int)(__bits - 1) - __base;
+    __off = -__offset;
+  }
+  __m &= (~(__T)0) << __base;
+  if (__builtin_popcountg(__m) < __off)
+    return -1;
+  int __total = 0;
+  for (int __i = __bits / 2; __i > 0; __i >>= 1) {
+    __T __lo = __m & (((__T)1 << __i) - 1);
+    int __pcnt = __builtin_popcountg(__lo);
+    if (__pcnt < __off) {
+      __m >>= __i;
+      __off -= __pcnt;
+      __total += __i;
+    } else {
+      __m = __lo;
+    }
+  }
+  return __offset < 0 ? (__bits - 1) - __total : __total;
+}
+__HIP_DEVICE__ int __fns64(unsigned long long __mask, unsigned int __base,
+                           int __offset) {
+  return __hip_fns_impl(__mask, __base, __offset);
+}
+__HIP_DEVICE__ int __fns32(unsigned long long __mask, unsigned int __base,
+                           int __offset) {
+  return __hip_fns_impl((unsigned int)__mask, __base, __offset);
+}
+__HIP_DEVICE__ int __fns(unsigned int __mask, unsigned int __base,
+                         int __offset) {
+  return __fns32(__mask, __base, __offset);
+}
+
+//===----------------------------------------------------------------------===//
+// Synchronization and fences
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ __attribute__((convergent)) void __syncthreads(void) {
+  __gpu_sync_threads();
+}
+
+template <typename __Fn>
+__HIP_DEVICE__ __attribute__((convergent)) int
+__hip_block_reduce_impl(int __val, int __init, __Fn __op) {
+  static __shared__ int __scratch[32];
+  unsigned int __lanes = __gpu_num_lanes();
+  unsigned int __nthreads = __gpu_num_threads(__GPU_X_DIM) *
+                            __gpu_num_threads(__GPU_Y_DIM) *
+                            __gpu_num_threads(__GPU_Z_DIM);
+  unsigned int __nwarps = (__nthreads + __lanes - 1) / __lanes;
+  unsigned int __tid =
+      (__gpu_thread_id(__GPU_Z_DIM) * __gpu_num_threads(__GPU_Y_DIM) +
+       __gpu_thread_id(__GPU_Y_DIM)) *
+          __gpu_num_threads(__GPU_X_DIM) +
+      __gpu_thread_id(__GPU_X_DIM);
+
+  if (__gpu_is_first_in_lane(__gpu_lane_mask()))
+    __scratch[__tid / __lanes] = __val;
+  __gpu_sync_threads();
+
+  int __acc = __init;
+  for (unsigned int __i = 0; __i < __nwarps; ++__i)
+    __acc = __op(__acc, __scratch[__i]);
+  __gpu_sync_threads();
+  return __acc;
+}
+
+__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_count(int __pred) 
{
+  unsigned long long __mask = __gpu_lane_mask();
+  int __val = __builtin_popcountg(__gpu_ballot(__mask, __pred));
+  return __hip_block_reduce_impl(__val, 0,
+                                 [](int __a, int __b) { return __a + __b; });
+}
+__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_and(int __pred) {
+  unsigned long long __mask = __gpu_lane_mask();
+  int __val = __gpu_ballot(__mask, __pred) == __mask;
+  return __hip_block_reduce_impl(__val, 1,
+                                 [](int __a, int __b) { return __a & __b; });
+}
+__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_or(int __pred) {
+  unsigned long long __mask = __gpu_lane_mask();
+  int __val = __gpu_ballot(__mask, __pred) != 0ull;
+  return __hip_block_reduce_impl(__val, 0,
+                                 [](int __a, int __b) { return __a | __b; });
+}
+
+__HIP_DEVICE__ void __threadfence(void) {
+  __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);
+}
+__HIP_DEVICE__ void __threadfence_block(void) {
+  __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_WRKGRP);
+}
+__HIP_DEVICE__ void __threadfence_system(void) {
+  __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM);
+}
+
+//===----------------------------------------------------------------------===//
+// Timers
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ long long __clock64(void) {
+  return (long long)__builtin_readcyclecounter();
+}
+__HIP_DEVICE__ long long __clock(void) { return __clock64(); }
+__HIP_DEVICE__ long long clock64(void) { return __clock64(); }
+__HIP_DEVICE__ long long clock(void) { return __clock(); }
+__HIP_DEVICE__ long long wall_clock64(void) {
+  return (long long)__builtin_readsteadycounter();
+}
+
+// Warp shuffle / synchronization / reduction intrinsics.
+#include <__clang_hip_intrinsics.h>
+
+#pragma pop_macro("MAYBE_UNDEF")
+#pragma pop_macro("__HIP_DEVICE__")
+
+#endif // __HIP__ && defined(__AMDGPU__)
+#endif // __CLANG_HIP_DEVICE_FUNCTIONS_H__
diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h 
b/clang/lib/Headers/__clang_hip_intrinsics.h
new file mode 100644
index 0000000000000..0ac117a8a3e3d
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_intrinsics.h
@@ -0,0 +1,240 @@
+//===--- __clang_hip_intrinsics.h - Device-side HIP intrinsic wrappers 
------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_INTRINSICS_H__
+#define __CLANG_HIP_INTRINSICS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#ifndef __HIP_DEVICE__
+#error                                                                         
\
+    "__clang_hip_intrinsics.h must be included via 
__clang_hip_device_functions.h"
+#endif
+
+//===----------------------------------------------------------------------===//
+// Wavefront shuffles
+//===----------------------------------------------------------------------===//
+
+template <typename __T>
+__HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, unsigned int __idx,
+                                          int __w) {
+  if constexpr (sizeof(__T) == sizeof(unsigned long long))
+    return __builtin_bit_cast(
+        __T, __gpu_shuffle_idx_u64(__gpu_lane_mask(), __idx,
+                                   __builtin_bit_cast(unsigned long long, __v),
+                                   (unsigned int)__w));
+  else
+    return __builtin_bit_cast(
+        __T, __gpu_shuffle_idx_u32(__gpu_lane_mask(), __idx,
+                                   __builtin_bit_cast(unsigned int, __v),
+                                   (unsigned int)__w));
+}
+
+template <typename __T>
+__HIP_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int __src_lane,
+                          int __width = warpSize) {
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__src_lane & (__width - 1)), __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, unsigned int __delta,
+                             int __width = warpSize) {
+  int __rel = (int)(__gpu_lane_id() & (unsigned int)(__width - 1));
+  int __tgt = __rel - (int)__delta;
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__tgt < 0 ? __rel : __tgt), __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_down(MAYBE_UNDEF __T __var, unsigned int __delta,
+                               int __width = warpSize) {
+  int __rel = (int)(__gpu_lane_id() & (unsigned int)(__width - 1));
+  int __tgt = __rel + (int)__delta;
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask,
+                              int __width = warpSize) {
+  int __rel = (int)(__gpu_lane_id() & (unsigned int)(__width - 1));
+  int __tgt = __rel ^ __lane_mask;
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width);
+}
+
+//===----------------------------------------------------------------------===//
+// Warp synchronization
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ __attribute__((convergent)) void
+__syncwarp(unsigned long long __mask = -1) {
+  __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT);
+  __gpu_sync_lane(__mask);
+  __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT);
+}
+
+//===----------------------------------------------------------------------===//
+// Wave syncrhonization sync aliases.
+//===----------------------------------------------------------------------===//
+
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned long long __ballot_sync(__MaskT __mask, int __pred) {
+  return __ballot(__pred) & (unsigned long long)__mask;
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __all_sync(__MaskT __mask, int __pred) {
+  return __ballot_sync(__mask, __pred) == (unsigned long long)__mask;
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __any_sync(__MaskT __mask, int __pred) {
+  return __ballot_sync(__mask, __pred) != 0ull;
+}
+
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                               int __src_lane, int __width = warpSize) {
+  (void)__mask;
+  return __shfl(__var, __src_lane, __width);
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_up_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                                  unsigned int __delta,
+                                  int __width = warpSize) {
+  (void)__mask;
+  return __shfl_up(__var, __delta, __width);
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_down_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                                    unsigned int __delta,
+                                    int __width = warpSize) {
+  (void)__mask;
+  return __shfl_down(__var, __delta, __width);
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                                   int __lane_mask, int __width = warpSize) {
+  (void)__mask;
+  return __shfl_xor(__var, __lane_mask, __width);
+}
+
+//===----------------------------------------------------------------------===//
+// Match primitives.
+//===----------------------------------------------------------------------===//
+
+template <typename __T>
+__HIP_DEVICE__ unsigned long long __match_any(__T __value) {
+  if constexpr (sizeof(__T) == sizeof(unsigned long long))
+    return __gpu_match_any_u64(__gpu_lane_mask(),
+                               __builtin_bit_cast(unsigned long long, 
__value));
+  else
+    return __gpu_match_any_u32(__gpu_lane_mask(),
+                               __builtin_bit_cast(unsigned int, __value));
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ unsigned long long __match_any_sync(__MaskT __mask,
+                                                   __T __value) {
+  return __match_any(__value) & (unsigned long long)__mask;
+}
+
+template <typename __T>
+__HIP_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) {
+  unsigned long long __r;
+  if constexpr (sizeof(__T) == sizeof(unsigned long long))
+    __r = __gpu_match_all_u64(__gpu_lane_mask(),
+                              __builtin_bit_cast(unsigned long long, __value));
+  else
+    __r = __gpu_match_all_u32(__gpu_lane_mask(),
+                              __builtin_bit_cast(unsigned int, __value));
+  *__pred = __r != 0;
+  return __r;
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value,
+                                                   int *__pred) {
+  (void)__mask;
+  return __match_all(__value, __pred);
+}
+
+//===----------------------------------------------------------------------===//
+// Wave reductions.
+//===----------------------------------------------------------------------===//
+
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_add_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_add_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __reduce_add_sync(__MaskT __mask, int __val) {
+  return (int)__gpu_lane_add_u32((unsigned long long)__mask,
+                                 (unsigned int)__val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_min_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_min_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __reduce_min_sync(__MaskT __mask, int __val) {
+  unsigned int __r = __gpu_lane_min_u32((unsigned long long)__mask,
+                                        (unsigned int)__val ^ 0x80000000u);
+  return (int)(__r ^ 0x80000000u);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_max_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_max_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __reduce_max_sync(__MaskT __mask, int __val) {
+  unsigned int __r = __gpu_lane_max_u32((unsigned long long)__mask,
+                                        (unsigned int)__val ^ 0x80000000u);
+  return (int)(__r ^ 0x80000000u);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_and_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_and_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_or_sync(__MaskT __mask,
+                                             unsigned int __val) {
+  return __gpu_lane_or_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_xor_u32((unsigned long long)__mask, __val);
+}
+
+//===----------------------------------------------------------------------===//
+// Funnel shifts.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ unsigned int
+__funnelshift_l(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
+  unsigned int __s = __shift & 31u;
+  return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 
32);
+}
+__HIP_DEVICE__ unsigned int
+__funnelshift_lc(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
+  unsigned int __s = __shift >= 32u ? 32u : __shift;
+  return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 
32);
+}
+__HIP_DEVICE__ unsigned int
+__funnelshift_r(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
+  unsigned int __s = __shift & 31u;
+  return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s);
+}
+__HIP_DEVICE__ unsigned int
+__funnelshift_rc(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
+  unsigned int __s = __shift >= 32u ? 32u : __shift;
+  return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s);
+}
+
+#endif // __HIP__ && defined(__AMDGPU__)
+#endif // __CLANG_HIP_INTRINSICS_H__
diff --git a/clang/test/Headers/__clang_hip_device_functions.hip 
b/clang/test/Headers/__clang_hip_device_functions.hip
new file mode 100644
index 0000000000000..472f1c97f66b1
--- /dev/null
+++ b/clang/test/Headers/__clang_hip_device_functions.hip
@@ -0,0 +1,76 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers \
+// RUN:   -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \
+// RUN:   -fcuda-is-device -target-cpu gfx90a -fsyntax-only -verify %s \
+// RUN:   -include __clang_hip_device_functions.h
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers \
+// RUN:   -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \
+// RUN:   -fcuda-is-device -target-cpu gfx1100 -fsyntax-only -verify %s \
+// RUN:   -include __clang_hip_device_functions.h
+
+// expected-no-diagnostics
+
+__global__ void test_kernel(int *p, float *f, double *d, unsigned *u) {
+  unsigned i = __popc(*u) + __popcll(*u);
+  i += __clz(*p) + __clzll(*p);
+  i += __ffs(*p) + __ffs(*u) + __ffsll((long long)*p) + __ffsll((unsigned long 
long)*u);
+  i += __brev(*u) + (unsigned)__brevll(*u);
+  i += __mul24(*p, *p) + __umul24(*u, *u) + __mulhi(*p, *p) + __umulhi(*u, *u);
+  i += (unsigned)__mul64hi(*p, *p) + (unsigned)__umul64hi(*u, *u);
+  i += __sad(*p, *p, *u) + __usad(*u, *u, *u);
+  i += __hadd(*p, *p) + __rhadd(*p, *p) + __uhadd(*u, *u) + __urhadd(*u, *u);
+  i += __byte_perm(*u, *u, *u);
+  i += __funnelshift_l(*u, *u, *u) + __funnelshift_lc(*u, *u, *u) +
+       __funnelshift_r(*u, *u, *u) + __funnelshift_rc(*u, *u, *u);
+
+  i += __lastbit_u32_u64(*u);
+  i += __bitextract_u32(*u, *u, *u) + (unsigned)__bitextract_u64(*u, *u, *u);
+  i += __bitinsert_u32(*u, *u, *u, *u) + (unsigned)__bitinsert_u64(*u, *u, *u, 
*u);
+
+  i += __float_as_int(*f) + __float_as_uint(*f);
+  *f = __int_as_float(*p) + __uint_as_float(*u);
+  i += (unsigned)__double_as_longlong(*d);
+  *d = __longlong_as_double((long long)*u);
+  i += __double2hiint(*d) + __double2loint(*d);
+  *d = __hiloint2double(*p, *p);
+
+  i += threadIdx.x + threadIdx.y + threadIdx.z;
+  i += blockIdx.x + blockIdx.y + blockIdx.z;
+  i += blockDim.x + blockDim.y + blockDim.z;
+  i += gridDim.x + gridDim.y + gridDim.z;
+  i += __lane_id();
+  unsigned long long m = __ballot(*p) + __ballot64(*p) + __activemask();
+  int v = __all(*p) + __any(*p);
+  v += __fns(m, 0, 1) + __fns32(m, 0, 1) + __fns64(m, __lane_id(), -1);
+
+  int s = __shfl(*p, 1) + __shfl_up(*p, 1) + __shfl_down(*p, 1) + 
__shfl_xor(*p, 1);
+  s += (int)__shfl(*u, 1, 32) + (int)__shfl_down(*f, 1) + (int)__shfl_xor(*d, 
1);
+  s += (int)__shfl((long long)*p, 0) + (int)__shfl((unsigned long long)*u, 0);
+
+  unsigned long long mask = ~0ull;
+  s += __shfl_sync(mask, *p, 1) + __shfl_up_sync(mask, *p, 1) +
+       __shfl_down_sync(mask, *p, 1) + __shfl_xor_sync(mask, *p, 1);
+  s += __all_sync(mask, *p) + __any_sync(mask, *p) + (int)__ballot_sync(mask, 
*p);
+  int pred;
+  s += (int)__match_any(*p) + (int)__match_any_sync(mask, *p) +
+       (int)__match_all_sync(mask, *p, &pred);
+  s += __reduce_add_sync(mask, *p) + __reduce_min_sync(mask, *p) +
+       __reduce_max_sync(mask, *p);
+  s += (int)(__reduce_add_sync(mask, *u) + __reduce_min_sync(mask, *u) +
+             __reduce_max_sync(mask, *u) + __reduce_and_sync(mask, *u) +
+             __reduce_or_sync(mask, *u) + __reduce_xor_sync(mask, *u));
+
+  __syncthreads();
+  s += __syncthreads_count(*p) + __syncthreads_and(*p) + __syncthreads_or(*p);
+  __syncwarp();
+  __syncwarp(mask);
+  __threadfence();
+  __threadfence_block();
+  __threadfence_system();
+  long long c = clock() + clock64() + __clock() + __clock64() + wall_clock64();
+
+  *p = (int)i + v + s + (int)c + warpSize;
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [Clang][HIP] Add LLVM vendored device headers for `*-llvm` triple (PR #203980)

Reply via email to