[clang] [Clang][HIP] Add LLVM vendored device headers for `*-llvm` triple (PR #203980)

Joseph Huber via cfe-commits Thu, 18 Jun 2026 16:02:50 -0700

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/203980


>From 8a2190f6c9085ffe930b51e9b64c9a7cde83895c Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Mon, 15 Jun 2026 11:47:19 -0500
Subject: [PATCH 1/6] [Clang][HIP] Add LLVM vendored device headers for
 `*-llvm` triple

Summary:
We already have the `*-llvm` vendor triple to imply a hermetic LLVM
toolchain for HIP. This adds the device portion of the headers that can
be used without a ROCm installation. These are vendored just like the
CUDA ones. Despite being HIP, these are actually generic because it onyl
uses clang builtins and the `gpuintrin` shims.
---
 clang/lib/Basic/Targets/AMDGPU.cpp            |   4 +
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |   7 +
 clang/lib/Headers/CMakeLists.txt              |   3 +
 clang/lib/Headers/__clang_hip_builtin_vars.h  |  57 +++
 .../Headers/__clang_hip_device_functions.h    | 356 ++++++++++++++++++
 clang/lib/Headers/__clang_hip_intrinsics.h    | 240 ++++++++++++
 .../Headers/__clang_hip_device_functions.hip  |  76 ++++
 7 files changed, 743 insertions(+)
 create mode 100644 clang/lib/Headers/__clang_hip_builtin_vars.h
 create mode 100644 clang/lib/Headers/__clang_hip_device_functions.h
 create mode 100644 clang/lib/Headers/__clang_hip_intrinsics.h
 create mode 100644 clang/test/Headers/__clang_hip_device_functions.hip

diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp 
b/clang/lib/Basic/Targets/AMDGPU.cpp
index bfa956fa9a4e3..906601850e353 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -252,6 +252,10 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   else
     Builder.defineMacro("__R600__");
 
+  // The 'llvm' environment selects the upstream headers.
+  if (getTriple().getEnvironment() == llvm::Triple::LLVM)
+    Builder.defineMacro("__AMDGCN_LLVM__");
+
   // TODO: __HAS_FMAF__, __HAS_LDEXPF__, __HAS_FP64__ are deprecated and will 
be
   // removed in the near future.
   if (hasFMAF())
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp 
b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 84664bcddbb94..2dbd8e9f670ce 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -331,6 +331,13 @@ void HIPAMDToolChain::AddIAMCUIncludeArgs(const ArgList 
&Args,
 
 void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args) const {
+  if (getTriple().getEnvironment() == llvm::Triple::LLVM) {
+    if (!DriverArgs.hasArg(options::OPT_nohipwrapperinc) &&
+        !DriverArgs.hasArg(options::OPT_nobuiltininc))
+      CC1Args.append({"-include", "__clang_hip_device_functions.h"});
+    return;
+  }
+
   RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args);
 }
 
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 439f2725168ba..4a60af68c1d23 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -86,6 +86,9 @@ set(hip_files
   __clang_hip_math.h
   __clang_hip_stdlib.h
   __clang_hip_runtime_wrapper.h
+  __clang_hip_builtin_vars.h
+  __clang_hip_device_functions.h
+  __clang_hip_intrinsics.h
   )
 
 set(hlsl_h
diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h 
b/clang/lib/Headers/__clang_hip_builtin_vars.h
new file mode 100644
index 0000000000000..f2e53ef93b38a
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_builtin_vars.h
@@ -0,0 +1,57 @@
+//===---- __clang_hip_builtin_vars.h - HIP built-in variables --------------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_BUILTIN_VARS_H__
+#define __CLANG_HIP_BUILTIN_VARS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#include <gpuintrin.h>
+
+// The warpSize is a runtime value rather than a compile-time constant.
+inline __attribute__((device)) const struct {
+  __attribute__((device, always_inline, const)) operator int() const noexcept {
+    return __gpu_num_lanes();
+  }
+} warpSize{};
+
+#pragma push_macro("__HIP_COORD_BUILTIN")
+#define __HIP_COORD_BUILTIN(__tag, __fx, __fy, __fz)                           
\
+  struct __tag {                                                               
\
+    __declspec(property(get = __get_x)) unsigned int x;                        
\
+    __declspec(property(get = __get_y)) unsigned int y;                        
\
+    __declspec(property(get = __get_z)) unsigned int z;                        
\
+    __attribute__((device, always_inline)) unsigned int __get_x() const {      
\
+      return __fx;                                                             
\
+    }                                                                          
\
+    __attribute__((device, always_inline)) unsigned int __get_y() const {      
\
+      return __fy;                                                             
\
+    }                                                                          
\
+    __attribute__((device, always_inline)) unsigned int __get_z() const {      
\
+      return __fz;                                                             
\
+    }                                                                          
\
+  }
+
+__HIP_COORD_BUILTIN(__hip_builtin_threadIdx_t, __gpu_thread_id_x(),
+                    __gpu_thread_id_y(), __gpu_thread_id_z());
+__HIP_COORD_BUILTIN(__hip_builtin_blockIdx_t, __gpu_block_id_x(),
+                    __gpu_block_id_y(), __gpu_block_id_z());
+__HIP_COORD_BUILTIN(__hip_builtin_blockDim_t, __gpu_num_threads_x(),
+                    __gpu_num_threads_y(), __gpu_num_threads_z());
+__HIP_COORD_BUILTIN(__hip_builtin_gridDim_t, __gpu_num_blocks_x(),
+                    __gpu_num_blocks_y(), __gpu_num_blocks_z());
+
+#pragma pop_macro("__HIP_COORD_BUILTIN")
+
+extern const __attribute__((device, weak)) __hip_builtin_threadIdx_t threadIdx;
+extern const __attribute__((device, weak)) __hip_builtin_blockIdx_t blockIdx;
+extern const __attribute__((device, weak)) __hip_builtin_blockDim_t blockDim;
+extern const __attribute__((device, weak)) __hip_builtin_gridDim_t gridDim;
+
+#endif // __HIP__ && (defined(__AMDGPU__) || defined(__SPIRV__))
+#endif // __CLANG_HIP_BUILTIN_VARS_H__
diff --git a/clang/lib/Headers/__clang_hip_device_functions.h 
b/clang/lib/Headers/__clang_hip_device_functions.h
new file mode 100644
index 0000000000000..14edf987c9207
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_device_functions.h
@@ -0,0 +1,356 @@
+//===---- __clang_hip_device_functions.h - HIP device functions ------------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_DEVICE_FUNCTIONS_H__
+#define __CLANG_HIP_DEVICE_FUNCTIONS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#ifndef __device__
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#define __managed__ __attribute__((managed))
+#endif
+
+#include <gpuintrin.h>
+
+#pragma push_macro("__HIP_DEVICE__")
+#define __HIP_DEVICE__ static __inline__ __attribute__((device, always_inline))
+
+#pragma push_macro("MAYBE_UNDEF")
+#define MAYBE_UNDEF __attribute__((maybe_undef))
+
+// warpSize and the threadIdx/blockIdx/blockDim/gridDim coordinate variables.
+#include <__clang_hip_builtin_vars.h>
+
+//===----------------------------------------------------------------------===//
+// Integer intrinsics.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ unsigned int __popc(unsigned int __x) {
+  return __builtin_popcountg(__x);
+}
+__HIP_DEVICE__ unsigned int __popcll(unsigned long long __x) {
+  return __builtin_popcountg(__x);
+}
+
+__HIP_DEVICE__ int __clz(int __x) {
+  return __builtin_clzg((unsigned int)__x, 32);
+}
+__HIP_DEVICE__ int __clzll(long long __x) {
+  return __builtin_clzg((unsigned long long)__x, 64);
+}
+
+__HIP_DEVICE__ int __ffs(int __x) {
+  return __builtin_ctzg((unsigned int)__x, -1) + 1;
+}
+__HIP_DEVICE__ int __ffs(unsigned int __x) {
+  return __builtin_ctzg(__x, -1) + 1;
+}
+__HIP_DEVICE__ int __ffsll(long long __x) {
+  return __builtin_ctzg((unsigned long long)__x, -1) + 1;
+}
+__HIP_DEVICE__ int __ffsll(unsigned long long __x) {
+  return __builtin_ctzg(__x, -1) + 1;
+}
+
+__HIP_DEVICE__ unsigned int __brev(unsigned int __x) {
+  return __builtin_elementwise_bitreverse(__x);
+}
+__HIP_DEVICE__ unsigned long long __brevll(unsigned long long __x) {
+  return __builtin_elementwise_bitreverse(__x);
+}
+
+__HIP_DEVICE__ int __mul24(int __x, int __y) {
+  return ((int(unsigned(__x) << 8) >> 8)) * ((int(unsigned(__y) << 8) >> 8));
+}
+__HIP_DEVICE__ int __umul24(unsigned int __x, unsigned int __y) {
+  return int((__x & 0x00ffffffu) * (__y & 0x00ffffffu));
+}
+
+__HIP_DEVICE__ int __mulhi(int __x, int __y) {
+  return int(((long long)__x * (long long)__y) >> 32);
+}
+__HIP_DEVICE__ unsigned int __umulhi(unsigned int __x, unsigned int __y) {
+  return (unsigned int)(((unsigned long long)__x * (unsigned long long)__y) >>
+                        32);
+}
+__HIP_DEVICE__ long long __mul64hi(long long __x, long long __y) {
+  return (long long)(((__int128)__x * (__int128)__y) >> 64);
+}
+__HIP_DEVICE__ unsigned long long __umul64hi(unsigned long long __x,
+                                             unsigned long long __y) {
+  return (
+      unsigned long long)(((unsigned __int128)__x * (unsigned __int128)__y) >>
+                          64);
+}
+
+__HIP_DEVICE__ unsigned int __sad(int __x, int __y, unsigned int __z) {
+  return __x > __y ? __x - __y + __z : __y - __x + __z;
+}
+__HIP_DEVICE__ unsigned int __usad(unsigned int __x, unsigned int __y,
+                                   unsigned int __z) {
+  return __x > __y ? __x - __y + __z : __y - __x + __z;
+}
+
+__HIP_DEVICE__ int __hadd(int __x, int __y) {
+  return int(((long long)__x + (long long)__y) >> 1);
+}
+__HIP_DEVICE__ int __rhadd(int __x, int __y) {
+  return int(((long long)__x + (long long)__y + 1) >> 1);
+}
+__HIP_DEVICE__ unsigned int __uhadd(unsigned int __x, unsigned int __y) {
+  return (unsigned int)(((unsigned long long)__x + (unsigned long long)__y) >>
+                        1);
+}
+__HIP_DEVICE__ unsigned int __urhadd(unsigned int __x, unsigned int __y) {
+  return (
+      unsigned int)(((unsigned long long)__x + (unsigned long long)__y + 1) >>
+                    1);
+}
+
+__HIP_DEVICE__ unsigned int __byte_perm(unsigned int __x, unsigned int __y,
+                                        unsigned int __s) {
+  unsigned long long __tmp = ((unsigned long long)__y << 32) | __x;
+  unsigned int __result = 0;
+  for (int __i = 0; __i < 4; ++__i) {
+    unsigned int __sel = (__s >> (__i * 4)) & 0x7u;
+    __result |= (unsigned int)((__tmp >> (__sel * 8)) & 0xffu) << (__i * 8);
+  }
+  return __result;
+}
+
+//===----------------------------------------------------------------------===//
+// Bitfield operations.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ unsigned int __lastbit_u32_u64(unsigned long long __x) {
+  return (unsigned int)__builtin_ctzg(__x, -1);
+}
+
+__HIP_DEVICE__ unsigned int __bitextract_u32(unsigned int __src,
+                                             unsigned int __offset,
+                                             unsigned int __width) {
+  unsigned int __o = __offset & 31u;
+  unsigned int __w = __width & 31u;
+  return __w == 0 ? 0u : (__src << (32u - __o - __w)) >> (32u - __w);
+}
+__HIP_DEVICE__ unsigned long long __bitextract_u64(unsigned long long __src,
+                                                   unsigned int __offset,
+                                                   unsigned int __width) {
+  unsigned long long __o = __offset & 63u;
+  unsigned long long __w = __width & 63u;
+  return __w == 0 ? 0ull : (__src << (64ull - __o - __w)) >> (64ull - __w);
+}
+
+__HIP_DEVICE__ unsigned int __bitinsert_u32(unsigned int __dst,
+                                            unsigned int __src,
+                                            unsigned int __offset,
+                                            unsigned int __width) {
+  unsigned int __o = __offset & 31u;
+  unsigned int __mask = (1u << (__width & 31u)) - 1u;
+  return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o);
+}
+__HIP_DEVICE__ unsigned long long __bitinsert_u64(unsigned long long __dst,
+                                                  unsigned long long __src,
+                                                  unsigned int __offset,
+                                                  unsigned int __width) {
+  unsigned long long __o = __offset & 63u;
+  unsigned long long __mask = (1ull << (__width & 63u)) - 1ull;
+  return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o);
+}
+
+//===----------------------------------------------------------------------===//
+// Type punning.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ int __float_as_int(float __x) {
+  return __builtin_bit_cast(int, __x);
+}
+__HIP_DEVICE__ unsigned int __float_as_uint(float __x) {
+  return __builtin_bit_cast(unsigned int, __x);
+}
+__HIP_DEVICE__ float __int_as_float(int __x) {
+  return __builtin_bit_cast(float, __x);
+}
+__HIP_DEVICE__ float __uint_as_float(unsigned int __x) {
+  return __builtin_bit_cast(float, __x);
+}
+__HIP_DEVICE__ long long __double_as_longlong(double __x) {
+  return __builtin_bit_cast(long long, __x);
+}
+__HIP_DEVICE__ double __longlong_as_double(long long __x) {
+  return __builtin_bit_cast(double, __x);
+}
+__HIP_DEVICE__ int __double2hiint(double __x) {
+  return int(__builtin_bit_cast(unsigned long long, __x) >> 32);
+}
+__HIP_DEVICE__ int __double2loint(double __x) {
+  return int(__builtin_bit_cast(unsigned long long, __x));
+}
+__HIP_DEVICE__ double __hiloint2double(int __hi, int __lo) {
+  return __builtin_bit_cast(double,
+                            ((unsigned long long)(unsigned int)__hi << 32) |
+                                (unsigned long long)(unsigned int)__lo);
+}
+
+//===----------------------------------------------------------------------===//
+// Wavefront vote and lane identity.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ unsigned int __lane_id(void) { return __gpu_lane_id(); }
+
+__HIP_DEVICE__ unsigned long long __ballot(int __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred);
+}
+__HIP_DEVICE__ unsigned long long __ballot64(int __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred);
+}
+__HIP_DEVICE__ unsigned long long __activemask(void) {
+  return __gpu_ballot(__gpu_lane_mask(), 1);
+}
+
+__HIP_DEVICE__ int __all(int __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred) == __gpu_lane_mask();
+}
+__HIP_DEVICE__ int __any(int __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred) != 0ull;
+}
+
+template <typename __T>
+__HIP_DEVICE__ int __hip_fns_impl(__T __mask, unsigned int __base,
+                                  int __offset) {
+  const int __bits = int(sizeof(__T)) * 8;
+  __T __m = __mask;
+  int __off = __offset;
+  if (__offset == 0) {
+    __m &= (__T(1) << __base);
+    __off = 1;
+  } else if (__offset < 0) {
+    __m = __builtin_elementwise_bitreverse(__mask);
+    __base = (unsigned int)(__bits - 1) - __base;
+    __off = -__offset;
+  }
+  __m &= (~__T(0)) << __base;
+  if (__builtin_popcountg(__m) < __off)
+    return -1;
+  int __total = 0;
+  for (int __i = __bits / 2; __i > 0; __i >>= 1) {
+    __T __lo = __m & ((__T(1) << __i) - 1);
+    int __pcnt = __builtin_popcountg(__lo);
+    if (__pcnt < __off) {
+      __m >>= __i;
+      __off -= __pcnt;
+      __total += __i;
+    } else {
+      __m = __lo;
+    }
+  }
+  return __offset < 0 ? (__bits - 1) - __total : __total;
+}
+__HIP_DEVICE__ int __fns64(unsigned long long __mask, unsigned int __base,
+                           int __offset) {
+  return __hip_fns_impl(__mask, __base, __offset);
+}
+__HIP_DEVICE__ int __fns32(unsigned long long __mask, unsigned int __base,
+                           int __offset) {
+  return __hip_fns_impl((unsigned int)__mask, __base, __offset);
+}
+__HIP_DEVICE__ int __fns(unsigned int __mask, unsigned int __base,
+                         int __offset) {
+  return __fns32(__mask, __base, __offset);
+}
+
+//===----------------------------------------------------------------------===//
+// Synchronization and fences
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ __attribute__((convergent)) void __syncthreads(void) {
+  __gpu_sync_threads();
+}
+
+template <typename __Fn>
+__HIP_DEVICE__ __attribute__((convergent)) int
+__hip_block_reduce_impl(int __val, int __init, __Fn __op) {
+  static __shared__ int __scratch[32];
+  unsigned int __lanes = __gpu_num_lanes();
+  unsigned int __nthreads = __gpu_num_threads(__GPU_X_DIM) *
+                            __gpu_num_threads(__GPU_Y_DIM) *
+                            __gpu_num_threads(__GPU_Z_DIM);
+  unsigned int __nwarps = (__nthreads + __lanes - 1) / __lanes;
+  unsigned int __tid =
+      (__gpu_thread_id(__GPU_Z_DIM) * __gpu_num_threads(__GPU_Y_DIM) +
+       __gpu_thread_id(__GPU_Y_DIM)) *
+          __gpu_num_threads(__GPU_X_DIM) +
+      __gpu_thread_id(__GPU_X_DIM);
+
+  if (__gpu_is_first_in_lane(__gpu_lane_mask()))
+    __scratch[__tid / __lanes] = __val;
+  __gpu_sync_threads();
+
+  int __acc = __init;
+  for (unsigned int __i = 0; __i < __nwarps; ++__i)
+    __acc = __op(__acc, __scratch[__i]);
+  __gpu_sync_threads();
+  return __acc;
+}
+
+__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_count(int __pred) 
{
+  unsigned long long __mask = __gpu_lane_mask();
+  int __val = __builtin_popcountg(__gpu_ballot(__mask, __pred));
+  return __hip_block_reduce_impl(__val, 0,
+                                 [](int __a, int __b) { return __a + __b; });
+}
+__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_and(int __pred) {
+  unsigned long long __mask = __gpu_lane_mask();
+  int __val = __gpu_ballot(__mask, __pred) == __mask;
+  return __hip_block_reduce_impl(__val, 1,
+                                 [](int __a, int __b) { return __a & __b; });
+}
+__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_or(int __pred) {
+  unsigned long long __mask = __gpu_lane_mask();
+  int __val = __gpu_ballot(__mask, __pred) != 0ull;
+  return __hip_block_reduce_impl(__val, 0,
+                                 [](int __a, int __b) { return __a | __b; });
+}
+
+__HIP_DEVICE__ void __threadfence(void) {
+  __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);
+}
+__HIP_DEVICE__ void __threadfence_block(void) {
+  __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_WRKGRP);
+}
+__HIP_DEVICE__ void __threadfence_system(void) {
+  __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM);
+}
+
+//===----------------------------------------------------------------------===//
+// Timers
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ long long __clock64(void) {
+  return (long long)__builtin_readcyclecounter();
+}
+__HIP_DEVICE__ long long __clock(void) { return __clock64(); }
+__HIP_DEVICE__ long long clock64(void) { return __clock64(); }
+__HIP_DEVICE__ long long clock(void) { return __clock(); }
+__HIP_DEVICE__ long long wall_clock64(void) {
+  return (long long)__builtin_readsteadycounter();
+}
+
+// Warp shuffle / synchronization / reduction intrinsics.
+#include <__clang_hip_intrinsics.h>
+
+#pragma pop_macro("MAYBE_UNDEF")
+#pragma pop_macro("__HIP_DEVICE__")
+
+#endif // __HIP__ && defined(__AMDGPU__)
+#endif // __CLANG_HIP_DEVICE_FUNCTIONS_H__
diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h 
b/clang/lib/Headers/__clang_hip_intrinsics.h
new file mode 100644
index 0000000000000..ef2bede1af070
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_intrinsics.h
@@ -0,0 +1,240 @@
+//===--- __clang_hip_intrinsics.h - Device-side HIP intrinsic wrappers 
------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_INTRINSICS_H__
+#define __CLANG_HIP_INTRINSICS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#ifndef __HIP_DEVICE__
+#error                                                                         
\
+    "__clang_hip_intrinsics.h must be included via 
__clang_hip_device_functions.h"
+#endif
+
+//===----------------------------------------------------------------------===//
+// Wavefront shuffles
+//===----------------------------------------------------------------------===//
+
+template <typename __T>
+__HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, unsigned int __idx,
+                                          int __w) {
+  if constexpr (sizeof(__T) == sizeof(unsigned long long))
+    return __builtin_bit_cast(
+        __T, __gpu_shuffle_idx_u64(__gpu_lane_mask(), __idx,
+                                   __builtin_bit_cast(unsigned long long, __v),
+                                   (unsigned int)__w));
+  else
+    return __builtin_bit_cast(
+        __T, __gpu_shuffle_idx_u32(__gpu_lane_mask(), __idx,
+                                   __builtin_bit_cast(unsigned int, __v),
+                                   (unsigned int)__w));
+}
+
+template <typename __T>
+__HIP_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int __src_lane,
+                          int __width = warpSize) {
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__src_lane & (__width - 1)), __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, unsigned int __delta,
+                             int __width = warpSize) {
+  int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1));
+  int __tgt = __rel - int(__delta);
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__tgt < 0 ? __rel : __tgt), __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_down(MAYBE_UNDEF __T __var, unsigned int __delta,
+                               int __width = warpSize) {
+  int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1));
+  int __tgt = __rel + int(__delta);
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask,
+                              int __width = warpSize) {
+  int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1));
+  int __tgt = __rel ^ __lane_mask;
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width);
+}
+
+//===----------------------------------------------------------------------===//
+// Warp synchronization
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ __attribute__((convergent)) void
+__syncwarp(unsigned long long __mask = -1) {
+  __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT);
+  __gpu_sync_lane(__mask);
+  __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT);
+}
+
+//===----------------------------------------------------------------------===//
+// Wave syncrhonization sync aliases.
+//===----------------------------------------------------------------------===//
+
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned long long __ballot_sync(__MaskT __mask, int __pred) {
+  return __ballot(__pred) & (unsigned long long)__mask;
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __all_sync(__MaskT __mask, int __pred) {
+  return __ballot_sync(__mask, __pred) == (unsigned long long)__mask;
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __any_sync(__MaskT __mask, int __pred) {
+  return __ballot_sync(__mask, __pred) != 0ull;
+}
+
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                               int __src_lane, int __width = warpSize) {
+  (void)__mask;
+  return __shfl(__var, __src_lane, __width);
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_up_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                                  unsigned int __delta,
+                                  int __width = warpSize) {
+  (void)__mask;
+  return __shfl_up(__var, __delta, __width);
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_down_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                                    unsigned int __delta,
+                                    int __width = warpSize) {
+  (void)__mask;
+  return __shfl_down(__var, __delta, __width);
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                                   int __lane_mask, int __width = warpSize) {
+  (void)__mask;
+  return __shfl_xor(__var, __lane_mask, __width);
+}
+
+//===----------------------------------------------------------------------===//
+// Match primitives.
+//===----------------------------------------------------------------------===//
+
+template <typename __T>
+__HIP_DEVICE__ unsigned long long __match_any(__T __value) {
+  if constexpr (sizeof(__T) == sizeof(unsigned long long))
+    return __gpu_match_any_u64(__gpu_lane_mask(),
+                               __builtin_bit_cast(unsigned long long, 
__value));
+  else
+    return __gpu_match_any_u32(__gpu_lane_mask(),
+                               __builtin_bit_cast(unsigned int, __value));
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ unsigned long long __match_any_sync(__MaskT __mask,
+                                                   __T __value) {
+  return __match_any(__value) & (unsigned long long)__mask;
+}
+
+template <typename __T>
+__HIP_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) {
+  unsigned long long __r;
+  if constexpr (sizeof(__T) == sizeof(unsigned long long))
+    __r = __gpu_match_all_u64(__gpu_lane_mask(),
+                              __builtin_bit_cast(unsigned long long, __value));
+  else
+    __r = __gpu_match_all_u32(__gpu_lane_mask(),
+                              __builtin_bit_cast(unsigned int, __value));
+  *__pred = __r != 0;
+  return __r;
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value,
+                                                   int *__pred) {
+  (void)__mask;
+  return __match_all(__value, __pred);
+}
+
+//===----------------------------------------------------------------------===//
+// Wave reductions.
+//===----------------------------------------------------------------------===//
+
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_add_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_add_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __reduce_add_sync(__MaskT __mask, int __val) {
+  return int(
+      __gpu_lane_add_u32((unsigned long long)__mask, (unsigned int)__val));
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_min_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_min_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __reduce_min_sync(__MaskT __mask, int __val) {
+  unsigned int __r = __gpu_lane_min_u32((unsigned long long)__mask,
+                                        (unsigned int)__val ^ 0x80000000u);
+  return int(__r ^ 0x80000000u);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_max_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_max_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __reduce_max_sync(__MaskT __mask, int __val) {
+  unsigned int __r = __gpu_lane_max_u32((unsigned long long)__mask,
+                                        (unsigned int)__val ^ 0x80000000u);
+  return int(__r ^ 0x80000000u);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_and_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_and_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_or_sync(__MaskT __mask,
+                                             unsigned int __val) {
+  return __gpu_lane_or_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_xor_u32((unsigned long long)__mask, __val);
+}
+
+//===----------------------------------------------------------------------===//
+// Funnel shifts.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ unsigned int
+__funnelshift_l(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
+  unsigned int __s = __shift & 31u;
+  return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 
32);
+}
+__HIP_DEVICE__ unsigned int
+__funnelshift_lc(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
+  unsigned int __s = __shift >= 32u ? 32u : __shift;
+  return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 
32);
+}
+__HIP_DEVICE__ unsigned int
+__funnelshift_r(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
+  unsigned int __s = __shift & 31u;
+  return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s);
+}
+__HIP_DEVICE__ unsigned int
+__funnelshift_rc(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
+  unsigned int __s = __shift >= 32u ? 32u : __shift;
+  return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s);
+}
+
+#endif // __HIP__ && defined(__AMDGPU__)
+#endif // __CLANG_HIP_INTRINSICS_H__
diff --git a/clang/test/Headers/__clang_hip_device_functions.hip 
b/clang/test/Headers/__clang_hip_device_functions.hip
new file mode 100644
index 0000000000000..472f1c97f66b1
--- /dev/null
+++ b/clang/test/Headers/__clang_hip_device_functions.hip
@@ -0,0 +1,76 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers \
+// RUN:   -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \
+// RUN:   -fcuda-is-device -target-cpu gfx90a -fsyntax-only -verify %s \
+// RUN:   -include __clang_hip_device_functions.h
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers \
+// RUN:   -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \
+// RUN:   -fcuda-is-device -target-cpu gfx1100 -fsyntax-only -verify %s \
+// RUN:   -include __clang_hip_device_functions.h
+
+// expected-no-diagnostics
+
+__global__ void test_kernel(int *p, float *f, double *d, unsigned *u) {
+  unsigned i = __popc(*u) + __popcll(*u);
+  i += __clz(*p) + __clzll(*p);
+  i += __ffs(*p) + __ffs(*u) + __ffsll((long long)*p) + __ffsll((unsigned long 
long)*u);
+  i += __brev(*u) + (unsigned)__brevll(*u);
+  i += __mul24(*p, *p) + __umul24(*u, *u) + __mulhi(*p, *p) + __umulhi(*u, *u);
+  i += (unsigned)__mul64hi(*p, *p) + (unsigned)__umul64hi(*u, *u);
+  i += __sad(*p, *p, *u) + __usad(*u, *u, *u);
+  i += __hadd(*p, *p) + __rhadd(*p, *p) + __uhadd(*u, *u) + __urhadd(*u, *u);
+  i += __byte_perm(*u, *u, *u);
+  i += __funnelshift_l(*u, *u, *u) + __funnelshift_lc(*u, *u, *u) +
+       __funnelshift_r(*u, *u, *u) + __funnelshift_rc(*u, *u, *u);
+
+  i += __lastbit_u32_u64(*u);
+  i += __bitextract_u32(*u, *u, *u) + (unsigned)__bitextract_u64(*u, *u, *u);
+  i += __bitinsert_u32(*u, *u, *u, *u) + (unsigned)__bitinsert_u64(*u, *u, *u, 
*u);
+
+  i += __float_as_int(*f) + __float_as_uint(*f);
+  *f = __int_as_float(*p) + __uint_as_float(*u);
+  i += (unsigned)__double_as_longlong(*d);
+  *d = __longlong_as_double((long long)*u);
+  i += __double2hiint(*d) + __double2loint(*d);
+  *d = __hiloint2double(*p, *p);
+
+  i += threadIdx.x + threadIdx.y + threadIdx.z;
+  i += blockIdx.x + blockIdx.y + blockIdx.z;
+  i += blockDim.x + blockDim.y + blockDim.z;
+  i += gridDim.x + gridDim.y + gridDim.z;
+  i += __lane_id();
+  unsigned long long m = __ballot(*p) + __ballot64(*p) + __activemask();
+  int v = __all(*p) + __any(*p);
+  v += __fns(m, 0, 1) + __fns32(m, 0, 1) + __fns64(m, __lane_id(), -1);
+
+  int s = __shfl(*p, 1) + __shfl_up(*p, 1) + __shfl_down(*p, 1) + 
__shfl_xor(*p, 1);
+  s += (int)__shfl(*u, 1, 32) + (int)__shfl_down(*f, 1) + (int)__shfl_xor(*d, 
1);
+  s += (int)__shfl((long long)*p, 0) + (int)__shfl((unsigned long long)*u, 0);
+
+  unsigned long long mask = ~0ull;
+  s += __shfl_sync(mask, *p, 1) + __shfl_up_sync(mask, *p, 1) +
+       __shfl_down_sync(mask, *p, 1) + __shfl_xor_sync(mask, *p, 1);
+  s += __all_sync(mask, *p) + __any_sync(mask, *p) + (int)__ballot_sync(mask, 
*p);
+  int pred;
+  s += (int)__match_any(*p) + (int)__match_any_sync(mask, *p) +
+       (int)__match_all_sync(mask, *p, &pred);
+  s += __reduce_add_sync(mask, *p) + __reduce_min_sync(mask, *p) +
+       __reduce_max_sync(mask, *p);
+  s += (int)(__reduce_add_sync(mask, *u) + __reduce_min_sync(mask, *u) +
+             __reduce_max_sync(mask, *u) + __reduce_and_sync(mask, *u) +
+             __reduce_or_sync(mask, *u) + __reduce_xor_sync(mask, *u));
+
+  __syncthreads();
+  s += __syncthreads_count(*p) + __syncthreads_and(*p) + __syncthreads_or(*p);
+  __syncwarp();
+  __syncwarp(mask);
+  __threadfence();
+  __threadfence_block();
+  __threadfence_system();
+  long long c = clock() + clock64() + __clock() + __clock64() + wall_clock64();
+
+  *p = (int)i + v + s + (int)c + warpSize;
+}

>From 8931b683132293db0fc0d910eb163d9454a0e7cf Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Tue, 16 Jun 2026 08:25:34 -0500
Subject: [PATCH 2/6] Move macro things, convergent, and simplify some casts

---
 clang/lib/Basic/Targets/AMDGPU.cpp            |  4 ----
 clang/lib/Headers/__clang_hip_builtin_vars.h  |  2 +-
 .../Headers/__clang_hip_device_functions.h    | 19 +++++++++----------
 clang/lib/Headers/__clang_hip_intrinsics.h    |  5 ++---
 4 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp 
b/clang/lib/Basic/Targets/AMDGPU.cpp
index 906601850e353..bfa956fa9a4e3 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -252,10 +252,6 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   else
     Builder.defineMacro("__R600__");
 
-  // The 'llvm' environment selects the upstream headers.
-  if (getTriple().getEnvironment() == llvm::Triple::LLVM)
-    Builder.defineMacro("__AMDGCN_LLVM__");
-
   // TODO: __HAS_FMAF__, __HAS_LDEXPF__, __HAS_FP64__ are deprecated and will 
be
   // removed in the near future.
   if (hasFMAF())
diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h 
b/clang/lib/Headers/__clang_hip_builtin_vars.h
index f2e53ef93b38a..83b42adc587d1 100644
--- a/clang/lib/Headers/__clang_hip_builtin_vars.h
+++ b/clang/lib/Headers/__clang_hip_builtin_vars.h
@@ -9,7 +9,7 @@
 #ifndef __CLANG_HIP_BUILTIN_VARS_H__
 #define __CLANG_HIP_BUILTIN_VARS_H__
 
-#if __HIP__ && (defined(__AMDGPU__))
+#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
 
 #include <gpuintrin.h>
 
diff --git a/clang/lib/Headers/__clang_hip_device_functions.h 
b/clang/lib/Headers/__clang_hip_device_functions.h
index 14edf987c9207..8105adcbcc501 100644
--- a/clang/lib/Headers/__clang_hip_device_functions.h
+++ b/clang/lib/Headers/__clang_hip_device_functions.h
@@ -9,7 +9,7 @@
 #ifndef __CLANG_HIP_DEVICE_FUNCTIONS_H__
 #define __CLANG_HIP_DEVICE_FUNCTIONS_H__
 
-#if __HIP__ && (defined(__AMDGPU__))
+#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
 
 #ifndef __device__
 #define __host__ __attribute__((host))
@@ -22,6 +22,8 @@
 
 #include <gpuintrin.h>
 
+#define __HIP_LLVM__ 1
+
 #pragma push_macro("__HIP_DEVICE__")
 #define __HIP_DEVICE__ static __inline__ __attribute__((device, always_inline))
 
@@ -273,14 +275,11 @@ __HIP_DEVICE__ int __fns(unsigned int __mask, unsigned 
int __base,
 // Synchronization and fences
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ __attribute__((convergent)) void __syncthreads(void) {
-  __gpu_sync_threads();
-}
+__HIP_DEVICE__ void __syncthreads(void) { __gpu_sync_threads(); }
 
 template <typename __Fn>
-__HIP_DEVICE__ __attribute__((convergent)) int
-__hip_block_reduce_impl(int __val, int __init, __Fn __op) {
-  static __shared__ int __scratch[32];
+__HIP_DEVICE__ int __hip_block_reduce_impl(int __val, int __init, __Fn __op) {
+  static __attribute__((shared)) int __scratch[32];
   unsigned int __lanes = __gpu_num_lanes();
   unsigned int __nthreads = __gpu_num_threads(__GPU_X_DIM) *
                             __gpu_num_threads(__GPU_Y_DIM) *
@@ -303,19 +302,19 @@ __hip_block_reduce_impl(int __val, int __init, __Fn __op) 
{
   return __acc;
 }
 
-__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_count(int __pred) 
{
+__HIP_DEVICE__ int __syncthreads_count(int __pred) {
   unsigned long long __mask = __gpu_lane_mask();
   int __val = __builtin_popcountg(__gpu_ballot(__mask, __pred));
   return __hip_block_reduce_impl(__val, 0,
                                  [](int __a, int __b) { return __a + __b; });
 }
-__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_and(int __pred) {
+__HIP_DEVICE__ int __syncthreads_and(int __pred) {
   unsigned long long __mask = __gpu_lane_mask();
   int __val = __gpu_ballot(__mask, __pred) == __mask;
   return __hip_block_reduce_impl(__val, 1,
                                  [](int __a, int __b) { return __a & __b; });
 }
-__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_or(int __pred) {
+__HIP_DEVICE__ int __syncthreads_or(int __pred) {
   unsigned long long __mask = __gpu_lane_mask();
   int __val = __gpu_ballot(__mask, __pred) != 0ull;
   return __hip_block_reduce_impl(__val, 0,
diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h 
b/clang/lib/Headers/__clang_hip_intrinsics.h
index ef2bede1af070..42fb3b40fb06f 100644
--- a/clang/lib/Headers/__clang_hip_intrinsics.h
+++ b/clang/lib/Headers/__clang_hip_intrinsics.h
@@ -9,7 +9,7 @@
 #ifndef __CLANG_HIP_INTRINSICS_H__
 #define __CLANG_HIP_INTRINSICS_H__
 
-#if __HIP__ && (defined(__AMDGPU__))
+#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
 
 #ifndef __HIP_DEVICE__
 #error                                                                         
\
@@ -70,8 +70,7 @@ __HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int 
__lane_mask,
 // Warp synchronization
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ __attribute__((convergent)) void
-__syncwarp(unsigned long long __mask = -1) {
+__HIP_DEVICE__ void __syncwarp(unsigned long long __mask = -1) {
   __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT);
   __gpu_sync_lane(__mask);
   __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT);

>From 55b4e03cd731cce16ecc74c77abad56b125ee363 Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Tue, 16 Jun 2026 09:25:34 -0500
Subject: [PATCH 3/6] test and tweaks

---
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |  4 +++-
 clang/lib/Headers/__clang_hip_builtin_vars.h  | 15 +++++++++++-
 .../Headers/__clang_hip_device_functions.h    |  2 +-
 clang/lib/Headers/__clang_hip_intrinsics.h    |  2 +-
 .../test/Driver/hip-device-libs-llvm-env.hip  | 23 +++++++++++++++++++
 5 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp 
b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 2dbd8e9f670ce..ed02877cbf944 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -332,7 +332,9 @@ void HIPAMDToolChain::AddIAMCUIncludeArgs(const ArgList 
&Args,
 void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args) const {
   if (getTriple().getEnvironment() == llvm::Triple::LLVM) {
-    if (!DriverArgs.hasArg(options::OPT_nohipwrapperinc) &&
+    if (DriverArgs.hasFlag(options::OPT_offload_inc,
+                           options::OPT_no_offload_inc, true) &&
+        !DriverArgs.hasArg(options::OPT_nohipwrapperinc) &&
         !DriverArgs.hasArg(options::OPT_nobuiltininc))
       CC1Args.append({"-include", "__clang_hip_device_functions.h"});
     return;
diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h 
b/clang/lib/Headers/__clang_hip_builtin_vars.h
index 83b42adc587d1..622201c261ae2 100644
--- a/clang/lib/Headers/__clang_hip_builtin_vars.h
+++ b/clang/lib/Headers/__clang_hip_builtin_vars.h
@@ -20,6 +20,15 @@ inline __attribute__((device)) const struct {
   }
 } warpSize{};
 
+// Make sure nobody can create instances of the coordinate types, take their
+// address, copy, or assign them.
+#pragma push_macro("__HIP_DISALLOW_BUILTINVAR_ACCESS")
+#define __HIP_DISALLOW_BUILTINVAR_ACCESS(__tag)                                
\
+  __attribute__((device)) __tag() = delete;                                    
\
+  __attribute__((device)) __tag(const __tag &) = delete;                       
\
+  __attribute__((device)) void operator=(const __tag &) const = delete;        
\
+  __attribute__((device)) __tag *operator&() const = delete
+
 #pragma push_macro("__HIP_COORD_BUILTIN")
 #define __HIP_COORD_BUILTIN(__tag, __fx, __fy, __fz)                           
\
   struct __tag {                                                               
\
@@ -35,6 +44,9 @@ inline __attribute__((device)) const struct {
     __attribute__((device, always_inline)) unsigned int __get_z() const {      
\
       return __fz;                                                             
\
     }                                                                          
\
+                                                                               
\
+  private:                                                                     
\
+    __HIP_DISALLOW_BUILTINVAR_ACCESS(__tag);                                   
\
   }
 
 __HIP_COORD_BUILTIN(__hip_builtin_threadIdx_t, __gpu_thread_id_x(),
@@ -47,11 +59,12 @@ __HIP_COORD_BUILTIN(__hip_builtin_gridDim_t, 
__gpu_num_blocks_x(),
                     __gpu_num_blocks_y(), __gpu_num_blocks_z());
 
 #pragma pop_macro("__HIP_COORD_BUILTIN")
+#pragma pop_macro("__HIP_DISALLOW_BUILTINVAR_ACCESS")
 
 extern const __attribute__((device, weak)) __hip_builtin_threadIdx_t threadIdx;
 extern const __attribute__((device, weak)) __hip_builtin_blockIdx_t blockIdx;
 extern const __attribute__((device, weak)) __hip_builtin_blockDim_t blockDim;
 extern const __attribute__((device, weak)) __hip_builtin_gridDim_t gridDim;
 
-#endif // __HIP__ && (defined(__AMDGPU__) || defined(__SPIRV__))
+#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
 #endif // __CLANG_HIP_BUILTIN_VARS_H__
diff --git a/clang/lib/Headers/__clang_hip_device_functions.h 
b/clang/lib/Headers/__clang_hip_device_functions.h
index 8105adcbcc501..4c949b0223d9f 100644
--- a/clang/lib/Headers/__clang_hip_device_functions.h
+++ b/clang/lib/Headers/__clang_hip_device_functions.h
@@ -351,5 +351,5 @@ __HIP_DEVICE__ long long wall_clock64(void) {
 #pragma pop_macro("MAYBE_UNDEF")
 #pragma pop_macro("__HIP_DEVICE__")
 
-#endif // __HIP__ && defined(__AMDGPU__)
+#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
 #endif // __CLANG_HIP_DEVICE_FUNCTIONS_H__
diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h 
b/clang/lib/Headers/__clang_hip_intrinsics.h
index 42fb3b40fb06f..330a80bf267ac 100644
--- a/clang/lib/Headers/__clang_hip_intrinsics.h
+++ b/clang/lib/Headers/__clang_hip_intrinsics.h
@@ -235,5 +235,5 @@ __funnelshift_rc(unsigned int __lo, unsigned int __hi, 
unsigned int __shift) {
   return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s);
 }
 
-#endif // __HIP__ && defined(__AMDGPU__)
+#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
 #endif // __CLANG_HIP_INTRINSICS_H__
diff --git a/clang/test/Driver/hip-device-libs-llvm-env.hip 
b/clang/test/Driver/hip-device-libs-llvm-env.hip
index 7f3c4c9b7af10..23354a2961beb 100644
--- a/clang/test/Driver/hip-device-libs-llvm-env.hip
+++ b/clang/test/Driver/hip-device-libs-llvm-env.hip
@@ -9,3 +9,26 @@
 // LLVMENV-NOT: oclc
 // LLVMENV-NOT: ocml
 // LLVMENV-NOT: ockl
+
+// RUN: %clang -### --target=x86_64-linux-gnu \
+// RUN:   --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \
+// RUN:   -resource-dir=%S/Inputs/rocm_resource_dir \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck --check-prefix=INC %s
+
+// INC: "-cc1" "-triple" "amdgcn-amd-amdhsa-llvm"
+// INC-SAME: "-include" "__clang_hip_device_functions.h"
+
+// RUN: %clang -### --target=x86_64-linux-gnu -nogpuinc \
+// RUN:   --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \
+// RUN:   -resource-dir=%S/Inputs/rocm_resource_dir \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck --check-prefix=NOINC %s
+
+// RUN: %clang -### --target=x86_64-linux-gnu -nohipwrapperinc \
+// RUN:   --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \
+// RUN:   -resource-dir=%S/Inputs/rocm_resource_dir \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck --check-prefix=NOINC %s
+
+// NOINC-NOT: __clang_hip_device_functions.h

>From 32c48d112b53e56a59b6de25af1648a3c9293899 Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Wed, 17 Jun 2026 08:50:53 -0500
Subject: [PATCH 4/6] Make generic

---
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |   2 +-
 clang/lib/Frontend/InitPreprocessor.cpp       |   2 +
 clang/lib/Headers/CMakeLists.txt              |   6 +-
 ...ltin_vars.h => __clang_gpu_builtin_vars.h} |  43 ++---
 ...tions.h => __clang_gpu_device_functions.h} | 158 +++++++++---------
 ..._intrinsics.h => __clang_gpu_intrinsics.h} |  85 +++++-----
 .../test/Driver/hip-device-libs-llvm-env.hip  |   4 +-
 ...functions.hip => gpu-device-functions.cpp} |  27 ++-
 clang/test/Preprocessor/predefined-macros.c   |   9 +
 9 files changed, 182 insertions(+), 154 deletions(-)
 rename clang/lib/Headers/{__clang_hip_builtin_vars.h => 
__clang_gpu_builtin_vars.h} (67%)
 rename clang/lib/Headers/{__clang_hip_device_functions.h => 
__clang_gpu_device_functions.h} (68%)
 rename clang/lib/Headers/{__clang_hip_intrinsics.h => 
__clang_gpu_intrinsics.h} (77%)
 rename clang/test/Headers/{__clang_hip_device_functions.hip => 
gpu-device-functions.cpp} (74%)

diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp 
b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index ed02877cbf944..e180fc9ad1adf 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -336,7 +336,7 @@ void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList 
&DriverArgs,
                            options::OPT_no_offload_inc, true) &&
         !DriverArgs.hasArg(options::OPT_nohipwrapperinc) &&
         !DriverArgs.hasArg(options::OPT_nobuiltininc))
-      CC1Args.append({"-include", "__clang_hip_device_functions.h"});
+      CC1Args.append({"-include", "__clang_gpu_device_functions.h"});
     return;
   }
 
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp 
b/clang/lib/Frontend/InitPreprocessor.cpp
index ec009211ec6de..4456c0e140c2c 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -604,6 +604,8 @@ static void InitializeStandardPredefinedMacros(const 
TargetInfo &TI,
     }
     if (LangOpts.CUDAIsDevice) {
       Builder.defineMacro("__HIP_DEVICE_COMPILE__");
+      if (TI.getTriple().getEnvironment() == llvm::Triple::LLVM)
+        Builder.defineMacro("__HIP_LLVM__");
       if (!TI.hasHIPImageSupport()) {
         Builder.defineMacro("__HIP_NO_IMAGE_SUPPORT__", "1");
         // Deprecated.
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 4a60af68c1d23..07cb92d6c51e3 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -86,9 +86,6 @@ set(hip_files
   __clang_hip_math.h
   __clang_hip_stdlib.h
   __clang_hip_runtime_wrapper.h
-  __clang_hip_builtin_vars.h
-  __clang_hip_device_functions.h
-  __clang_hip_intrinsics.h
   )
 
 set(hlsl_h
@@ -309,6 +306,9 @@ set(gpu_files
   nvptxintrin.h
   amdgpuintrin.h
   spirvintrin.h
+  __clang_gpu_builtin_vars.h
+  __clang_gpu_device_functions.h
+  __clang_gpu_intrinsics.h
   )
 
 set(windows_only_files
diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h 
b/clang/lib/Headers/__clang_gpu_builtin_vars.h
similarity index 67%
rename from clang/lib/Headers/__clang_hip_builtin_vars.h
rename to clang/lib/Headers/__clang_gpu_builtin_vars.h
index 622201c261ae2..43905b0ab4d86 100644
--- a/clang/lib/Headers/__clang_hip_builtin_vars.h
+++ b/clang/lib/Headers/__clang_gpu_builtin_vars.h
@@ -1,4 +1,4 @@
-//===---- __clang_hip_builtin_vars.h - HIP built-in variables --------------===
+//===---- __clang_gpu_builtin_vars.h - GPU built-in variables --------------===
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,11 @@
 //
 //===-----------------------------------------------------------------------===
 
-#ifndef __CLANG_HIP_BUILTIN_VARS_H__
-#define __CLANG_HIP_BUILTIN_VARS_H__
+#ifndef __CLANG_GPU_BUILTIN_VARS_H__
+#define __CLANG_GPU_BUILTIN_VARS_H__
 
-#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
+#if (defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)) ||                   
\
+    (defined(__CUDA__) && defined(__CUDA_ARCH__))
 
 #include <gpuintrin.h>
 
@@ -22,15 +23,15 @@ inline __attribute__((device)) const struct {
 
 // Make sure nobody can create instances of the coordinate types, take their
 // address, copy, or assign them.
-#pragma push_macro("__HIP_DISALLOW_BUILTINVAR_ACCESS")
-#define __HIP_DISALLOW_BUILTINVAR_ACCESS(__tag)                                
\
+#pragma push_macro("__GPU_DISALLOW_BUILTINVAR_ACCESS")
+#define __GPU_DISALLOW_BUILTINVAR_ACCESS(__tag)                                
\
   __attribute__((device)) __tag() = delete;                                    
\
   __attribute__((device)) __tag(const __tag &) = delete;                       
\
   __attribute__((device)) void operator=(const __tag &) const = delete;        
\
   __attribute__((device)) __tag *operator&() const = delete
 
-#pragma push_macro("__HIP_COORD_BUILTIN")
-#define __HIP_COORD_BUILTIN(__tag, __fx, __fy, __fz)                           
\
+#pragma push_macro("__GPU_COORD_BUILTIN")
+#define __GPU_COORD_BUILTIN(__tag, __fx, __fy, __fz)                           
\
   struct __tag {                                                               
\
     __declspec(property(get = __get_x)) unsigned int x;                        
\
     __declspec(property(get = __get_y)) unsigned int y;                        
\
@@ -46,25 +47,25 @@ inline __attribute__((device)) const struct {
     }                                                                          
\
                                                                                
\
   private:                                                                     
\
-    __HIP_DISALLOW_BUILTINVAR_ACCESS(__tag);                                   
\
+    __GPU_DISALLOW_BUILTINVAR_ACCESS(__tag);                                   
\
   }
 
-__HIP_COORD_BUILTIN(__hip_builtin_threadIdx_t, __gpu_thread_id_x(),
+__GPU_COORD_BUILTIN(__gpu_builtin_threadIdx_t, __gpu_thread_id_x(),
                     __gpu_thread_id_y(), __gpu_thread_id_z());
-__HIP_COORD_BUILTIN(__hip_builtin_blockIdx_t, __gpu_block_id_x(),
+__GPU_COORD_BUILTIN(__gpu_builtin_blockIdx_t, __gpu_block_id_x(),
                     __gpu_block_id_y(), __gpu_block_id_z());
-__HIP_COORD_BUILTIN(__hip_builtin_blockDim_t, __gpu_num_threads_x(),
+__GPU_COORD_BUILTIN(__gpu_builtin_blockDim_t, __gpu_num_threads_x(),
                     __gpu_num_threads_y(), __gpu_num_threads_z());
-__HIP_COORD_BUILTIN(__hip_builtin_gridDim_t, __gpu_num_blocks_x(),
+__GPU_COORD_BUILTIN(__gpu_builtin_gridDim_t, __gpu_num_blocks_x(),
                     __gpu_num_blocks_y(), __gpu_num_blocks_z());
 
-#pragma pop_macro("__HIP_COORD_BUILTIN")
-#pragma pop_macro("__HIP_DISALLOW_BUILTINVAR_ACCESS")
+#pragma pop_macro("__GPU_COORD_BUILTIN")
+#pragma pop_macro("__GPU_DISALLOW_BUILTINVAR_ACCESS")
 
-extern const __attribute__((device, weak)) __hip_builtin_threadIdx_t threadIdx;
-extern const __attribute__((device, weak)) __hip_builtin_blockIdx_t blockIdx;
-extern const __attribute__((device, weak)) __hip_builtin_blockDim_t blockDim;
-extern const __attribute__((device, weak)) __hip_builtin_gridDim_t gridDim;
+extern const __attribute__((device, weak)) __gpu_builtin_threadIdx_t threadIdx;
+extern const __attribute__((device, weak)) __gpu_builtin_blockIdx_t blockIdx;
+extern const __attribute__((device, weak)) __gpu_builtin_blockDim_t blockDim;
+extern const __attribute__((device, weak)) __gpu_builtin_gridDim_t gridDim;
 
-#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
-#endif // __CLANG_HIP_BUILTIN_VARS_H__
+#endif // device compile
+#endif // __CLANG_GPU_BUILTIN_VARS_H__
diff --git a/clang/lib/Headers/__clang_hip_device_functions.h 
b/clang/lib/Headers/__clang_gpu_device_functions.h
similarity index 68%
rename from clang/lib/Headers/__clang_hip_device_functions.h
rename to clang/lib/Headers/__clang_gpu_device_functions.h
index 4c949b0223d9f..3b068052d935b 100644
--- a/clang/lib/Headers/__clang_hip_device_functions.h
+++ b/clang/lib/Headers/__clang_gpu_device_functions.h
@@ -1,4 +1,4 @@
-//===---- __clang_hip_device_functions.h - HIP device functions ------------===
+//===---- __clang_gpu_device_functions.h - GPU device functions ------------===
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,11 @@
 //
 //===-----------------------------------------------------------------------===
 
-#ifndef __CLANG_HIP_DEVICE_FUNCTIONS_H__
-#define __CLANG_HIP_DEVICE_FUNCTIONS_H__
+#ifndef __CLANG_GPU_DEVICE_FUNCTIONS_H__
+#define __CLANG_GPU_DEVICE_FUNCTIONS_H__
 
-#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
+#if (defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)) ||                   
\
+    (defined(__CUDA__) && defined(__CUDA_ARCH__))
 
 #ifndef __device__
 #define __host__ __attribute__((host))
@@ -22,104 +23,102 @@
 
 #include <gpuintrin.h>
 
-#define __HIP_LLVM__ 1
-
-#pragma push_macro("__HIP_DEVICE__")
-#define __HIP_DEVICE__ static __inline__ __attribute__((device, always_inline))
+#pragma push_macro("__GPU_DEVICE__")
+#define __GPU_DEVICE__ static __inline__ __attribute__((device, always_inline))
 
 #pragma push_macro("MAYBE_UNDEF")
 #define MAYBE_UNDEF __attribute__((maybe_undef))
 
 // warpSize and the threadIdx/blockIdx/blockDim/gridDim coordinate variables.
-#include <__clang_hip_builtin_vars.h>
+#include <__clang_gpu_builtin_vars.h>
 
 
//===----------------------------------------------------------------------===//
 // Integer intrinsics.
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ unsigned int __popc(unsigned int __x) {
+__GPU_DEVICE__ unsigned int __popc(unsigned int __x) {
   return __builtin_popcountg(__x);
 }
-__HIP_DEVICE__ unsigned int __popcll(unsigned long long __x) {
+__GPU_DEVICE__ unsigned int __popcll(unsigned long long __x) {
   return __builtin_popcountg(__x);
 }
 
-__HIP_DEVICE__ int __clz(int __x) {
+__GPU_DEVICE__ int __clz(int __x) {
   return __builtin_clzg((unsigned int)__x, 32);
 }
-__HIP_DEVICE__ int __clzll(long long __x) {
+__GPU_DEVICE__ int __clzll(long long __x) {
   return __builtin_clzg((unsigned long long)__x, 64);
 }
 
-__HIP_DEVICE__ int __ffs(int __x) {
+__GPU_DEVICE__ int __ffs(int __x) {
   return __builtin_ctzg((unsigned int)__x, -1) + 1;
 }
-__HIP_DEVICE__ int __ffs(unsigned int __x) {
+__GPU_DEVICE__ int __ffs(unsigned int __x) {
   return __builtin_ctzg(__x, -1) + 1;
 }
-__HIP_DEVICE__ int __ffsll(long long __x) {
+__GPU_DEVICE__ int __ffsll(long long __x) {
   return __builtin_ctzg((unsigned long long)__x, -1) + 1;
 }
-__HIP_DEVICE__ int __ffsll(unsigned long long __x) {
+__GPU_DEVICE__ int __ffsll(unsigned long long __x) {
   return __builtin_ctzg(__x, -1) + 1;
 }
 
-__HIP_DEVICE__ unsigned int __brev(unsigned int __x) {
+__GPU_DEVICE__ unsigned int __brev(unsigned int __x) {
   return __builtin_elementwise_bitreverse(__x);
 }
-__HIP_DEVICE__ unsigned long long __brevll(unsigned long long __x) {
+__GPU_DEVICE__ unsigned long long __brevll(unsigned long long __x) {
   return __builtin_elementwise_bitreverse(__x);
 }
 
-__HIP_DEVICE__ int __mul24(int __x, int __y) {
+__GPU_DEVICE__ int __mul24(int __x, int __y) {
   return ((int(unsigned(__x) << 8) >> 8)) * ((int(unsigned(__y) << 8) >> 8));
 }
-__HIP_DEVICE__ int __umul24(unsigned int __x, unsigned int __y) {
+__GPU_DEVICE__ int __umul24(unsigned int __x, unsigned int __y) {
   return int((__x & 0x00ffffffu) * (__y & 0x00ffffffu));
 }
 
-__HIP_DEVICE__ int __mulhi(int __x, int __y) {
+__GPU_DEVICE__ int __mulhi(int __x, int __y) {
   return int(((long long)__x * (long long)__y) >> 32);
 }
-__HIP_DEVICE__ unsigned int __umulhi(unsigned int __x, unsigned int __y) {
+__GPU_DEVICE__ unsigned int __umulhi(unsigned int __x, unsigned int __y) {
   return (unsigned int)(((unsigned long long)__x * (unsigned long long)__y) >>
                         32);
 }
-__HIP_DEVICE__ long long __mul64hi(long long __x, long long __y) {
+__GPU_DEVICE__ long long __mul64hi(long long __x, long long __y) {
   return (long long)(((__int128)__x * (__int128)__y) >> 64);
 }
-__HIP_DEVICE__ unsigned long long __umul64hi(unsigned long long __x,
+__GPU_DEVICE__ unsigned long long __umul64hi(unsigned long long __x,
                                              unsigned long long __y) {
   return (
       unsigned long long)(((unsigned __int128)__x * (unsigned __int128)__y) >>
                           64);
 }
 
-__HIP_DEVICE__ unsigned int __sad(int __x, int __y, unsigned int __z) {
+__GPU_DEVICE__ unsigned int __sad(int __x, int __y, unsigned int __z) {
   return __x > __y ? __x - __y + __z : __y - __x + __z;
 }
-__HIP_DEVICE__ unsigned int __usad(unsigned int __x, unsigned int __y,
+__GPU_DEVICE__ unsigned int __usad(unsigned int __x, unsigned int __y,
                                    unsigned int __z) {
   return __x > __y ? __x - __y + __z : __y - __x + __z;
 }
 
-__HIP_DEVICE__ int __hadd(int __x, int __y) {
+__GPU_DEVICE__ int __hadd(int __x, int __y) {
   return int(((long long)__x + (long long)__y) >> 1);
 }
-__HIP_DEVICE__ int __rhadd(int __x, int __y) {
+__GPU_DEVICE__ int __rhadd(int __x, int __y) {
   return int(((long long)__x + (long long)__y + 1) >> 1);
 }
-__HIP_DEVICE__ unsigned int __uhadd(unsigned int __x, unsigned int __y) {
+__GPU_DEVICE__ unsigned int __uhadd(unsigned int __x, unsigned int __y) {
   return (unsigned int)(((unsigned long long)__x + (unsigned long long)__y) >>
                         1);
 }
-__HIP_DEVICE__ unsigned int __urhadd(unsigned int __x, unsigned int __y) {
+__GPU_DEVICE__ unsigned int __urhadd(unsigned int __x, unsigned int __y) {
   return (
       unsigned int)(((unsigned long long)__x + (unsigned long long)__y + 1) >>
                     1);
 }
 
-__HIP_DEVICE__ unsigned int __byte_perm(unsigned int __x, unsigned int __y,
+__GPU_DEVICE__ unsigned int __byte_perm(unsigned int __x, unsigned int __y,
                                         unsigned int __s) {
   unsigned long long __tmp = ((unsigned long long)__y << 32) | __x;
   unsigned int __result = 0;
@@ -134,18 +133,18 @@ __HIP_DEVICE__ unsigned int __byte_perm(unsigned int __x, 
unsigned int __y,
 // Bitfield operations.
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ unsigned int __lastbit_u32_u64(unsigned long long __x) {
+__GPU_DEVICE__ unsigned int __lastbit_u32_u64(unsigned long long __x) {
   return (unsigned int)__builtin_ctzg(__x, -1);
 }
 
-__HIP_DEVICE__ unsigned int __bitextract_u32(unsigned int __src,
+__GPU_DEVICE__ unsigned int __bitextract_u32(unsigned int __src,
                                              unsigned int __offset,
                                              unsigned int __width) {
   unsigned int __o = __offset & 31u;
   unsigned int __w = __width & 31u;
   return __w == 0 ? 0u : (__src << (32u - __o - __w)) >> (32u - __w);
 }
-__HIP_DEVICE__ unsigned long long __bitextract_u64(unsigned long long __src,
+__GPU_DEVICE__ unsigned long long __bitextract_u64(unsigned long long __src,
                                                    unsigned int __offset,
                                                    unsigned int __width) {
   unsigned long long __o = __offset & 63u;
@@ -153,7 +152,7 @@ __HIP_DEVICE__ unsigned long long __bitextract_u64(unsigned 
long long __src,
   return __w == 0 ? 0ull : (__src << (64ull - __o - __w)) >> (64ull - __w);
 }
 
-__HIP_DEVICE__ unsigned int __bitinsert_u32(unsigned int __dst,
+__GPU_DEVICE__ unsigned int __bitinsert_u32(unsigned int __dst,
                                             unsigned int __src,
                                             unsigned int __offset,
                                             unsigned int __width) {
@@ -161,7 +160,7 @@ __HIP_DEVICE__ unsigned int __bitinsert_u32(unsigned int 
__dst,
   unsigned int __mask = (1u << (__width & 31u)) - 1u;
   return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o);
 }
-__HIP_DEVICE__ unsigned long long __bitinsert_u64(unsigned long long __dst,
+__GPU_DEVICE__ unsigned long long __bitinsert_u64(unsigned long long __dst,
                                                   unsigned long long __src,
                                                   unsigned int __offset,
                                                   unsigned int __width) {
@@ -174,31 +173,31 @@ __HIP_DEVICE__ unsigned long long 
__bitinsert_u64(unsigned long long __dst,
 // Type punning.
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ int __float_as_int(float __x) {
+__GPU_DEVICE__ int __float_as_int(float __x) {
   return __builtin_bit_cast(int, __x);
 }
-__HIP_DEVICE__ unsigned int __float_as_uint(float __x) {
+__GPU_DEVICE__ unsigned int __float_as_uint(float __x) {
   return __builtin_bit_cast(unsigned int, __x);
 }
-__HIP_DEVICE__ float __int_as_float(int __x) {
+__GPU_DEVICE__ float __int_as_float(int __x) {
   return __builtin_bit_cast(float, __x);
 }
-__HIP_DEVICE__ float __uint_as_float(unsigned int __x) {
+__GPU_DEVICE__ float __uint_as_float(unsigned int __x) {
   return __builtin_bit_cast(float, __x);
 }
-__HIP_DEVICE__ long long __double_as_longlong(double __x) {
+__GPU_DEVICE__ long long __double_as_longlong(double __x) {
   return __builtin_bit_cast(long long, __x);
 }
-__HIP_DEVICE__ double __longlong_as_double(long long __x) {
+__GPU_DEVICE__ double __longlong_as_double(long long __x) {
   return __builtin_bit_cast(double, __x);
 }
-__HIP_DEVICE__ int __double2hiint(double __x) {
+__GPU_DEVICE__ int __double2hiint(double __x) {
   return int(__builtin_bit_cast(unsigned long long, __x) >> 32);
 }
-__HIP_DEVICE__ int __double2loint(double __x) {
+__GPU_DEVICE__ int __double2loint(double __x) {
   return int(__builtin_bit_cast(unsigned long long, __x));
 }
-__HIP_DEVICE__ double __hiloint2double(int __hi, int __lo) {
+__GPU_DEVICE__ double __hiloint2double(int __hi, int __lo) {
   return __builtin_bit_cast(double,
                             ((unsigned long long)(unsigned int)__hi << 32) |
                                 (unsigned long long)(unsigned int)__lo);
@@ -208,27 +207,27 @@ __HIP_DEVICE__ double __hiloint2double(int __hi, int 
__lo) {
 // Wavefront vote and lane identity.
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ unsigned int __lane_id(void) { return __gpu_lane_id(); }
+__GPU_DEVICE__ unsigned int __lane_id(void) { return __gpu_lane_id(); }
 
-__HIP_DEVICE__ unsigned long long __ballot(int __pred) {
+__GPU_DEVICE__ unsigned long long __ballot(int __pred) {
   return __gpu_ballot(__gpu_lane_mask(), __pred);
 }
-__HIP_DEVICE__ unsigned long long __ballot64(int __pred) {
+__GPU_DEVICE__ unsigned long long __ballot64(int __pred) {
   return __gpu_ballot(__gpu_lane_mask(), __pred);
 }
-__HIP_DEVICE__ unsigned long long __activemask(void) {
+__GPU_DEVICE__ unsigned long long __activemask(void) {
   return __gpu_ballot(__gpu_lane_mask(), 1);
 }
 
-__HIP_DEVICE__ int __all(int __pred) {
+__GPU_DEVICE__ int __all(int __pred) {
   return __gpu_ballot(__gpu_lane_mask(), __pred) == __gpu_lane_mask();
 }
-__HIP_DEVICE__ int __any(int __pred) {
+__GPU_DEVICE__ int __any(int __pred) {
   return __gpu_ballot(__gpu_lane_mask(), __pred) != 0ull;
 }
 
 template <typename __T>
-__HIP_DEVICE__ int __hip_fns_impl(__T __mask, unsigned int __base,
+__GPU_DEVICE__ int __gpu_fns_impl(__T __mask, unsigned int __base,
                                   int __offset) {
   const int __bits = int(sizeof(__T)) * 8;
   __T __m = __mask;
@@ -258,15 +257,15 @@ __HIP_DEVICE__ int __hip_fns_impl(__T __mask, unsigned 
int __base,
   }
   return __offset < 0 ? (__bits - 1) - __total : __total;
 }
-__HIP_DEVICE__ int __fns64(unsigned long long __mask, unsigned int __base,
+__GPU_DEVICE__ int __fns64(unsigned long long __mask, unsigned int __base,
                            int __offset) {
-  return __hip_fns_impl(__mask, __base, __offset);
+  return __gpu_fns_impl(__mask, __base, __offset);
 }
-__HIP_DEVICE__ int __fns32(unsigned long long __mask, unsigned int __base,
+__GPU_DEVICE__ int __fns32(unsigned long long __mask, unsigned int __base,
                            int __offset) {
-  return __hip_fns_impl((unsigned int)__mask, __base, __offset);
+  return __gpu_fns_impl((unsigned int)__mask, __base, __offset);
 }
-__HIP_DEVICE__ int __fns(unsigned int __mask, unsigned int __base,
+__GPU_DEVICE__ int __fns(unsigned int __mask, unsigned int __base,
                          int __offset) {
   return __fns32(__mask, __base, __offset);
 }
@@ -275,10 +274,13 @@ __HIP_DEVICE__ int __fns(unsigned int __mask, unsigned 
int __base,
 // Synchronization and fences
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ void __syncthreads(void) { __gpu_sync_threads(); }
+// Some targets (e.g. NVPTX) expose __syncthreads as a compiler builtin 
already.
+#if !__has_builtin(__syncthreads)
+__GPU_DEVICE__ void __syncthreads(void) { __gpu_sync_threads(); }
+#endif
 
 template <typename __Fn>
-__HIP_DEVICE__ int __hip_block_reduce_impl(int __val, int __init, __Fn __op) {
+__GPU_DEVICE__ int __gpu_block_reduce_impl(int __val, int __init, __Fn __op) {
   static __attribute__((shared)) int __scratch[32];
   unsigned int __lanes = __gpu_num_lanes();
   unsigned int __nthreads = __gpu_num_threads(__GPU_X_DIM) *
@@ -302,32 +304,32 @@ __HIP_DEVICE__ int __hip_block_reduce_impl(int __val, int 
__init, __Fn __op) {
   return __acc;
 }
 
-__HIP_DEVICE__ int __syncthreads_count(int __pred) {
+__GPU_DEVICE__ int __syncthreads_count(int __pred) {
   unsigned long long __mask = __gpu_lane_mask();
   int __val = __builtin_popcountg(__gpu_ballot(__mask, __pred));
-  return __hip_block_reduce_impl(__val, 0,
+  return __gpu_block_reduce_impl(__val, 0,
                                  [](int __a, int __b) { return __a + __b; });
 }
-__HIP_DEVICE__ int __syncthreads_and(int __pred) {
+__GPU_DEVICE__ int __syncthreads_and(int __pred) {
   unsigned long long __mask = __gpu_lane_mask();
   int __val = __gpu_ballot(__mask, __pred) == __mask;
-  return __hip_block_reduce_impl(__val, 1,
+  return __gpu_block_reduce_impl(__val, 1,
                                  [](int __a, int __b) { return __a & __b; });
 }
-__HIP_DEVICE__ int __syncthreads_or(int __pred) {
+__GPU_DEVICE__ int __syncthreads_or(int __pred) {
   unsigned long long __mask = __gpu_lane_mask();
   int __val = __gpu_ballot(__mask, __pred) != 0ull;
-  return __hip_block_reduce_impl(__val, 0,
+  return __gpu_block_reduce_impl(__val, 0,
                                  [](int __a, int __b) { return __a | __b; });
 }
 
-__HIP_DEVICE__ void __threadfence(void) {
+__GPU_DEVICE__ void __threadfence(void) {
   __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);
 }
-__HIP_DEVICE__ void __threadfence_block(void) {
+__GPU_DEVICE__ void __threadfence_block(void) {
   __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_WRKGRP);
 }
-__HIP_DEVICE__ void __threadfence_system(void) {
+__GPU_DEVICE__ void __threadfence_system(void) {
   __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM);
 }
 
@@ -335,21 +337,21 @@ __HIP_DEVICE__ void __threadfence_system(void) {
 // Timers
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ long long __clock64(void) {
+__GPU_DEVICE__ long long __clock64(void) {
   return (long long)__builtin_readcyclecounter();
 }
-__HIP_DEVICE__ long long __clock(void) { return __clock64(); }
-__HIP_DEVICE__ long long clock64(void) { return __clock64(); }
-__HIP_DEVICE__ long long clock(void) { return __clock(); }
-__HIP_DEVICE__ long long wall_clock64(void) {
+__GPU_DEVICE__ long long __clock(void) { return __clock64(); }
+__GPU_DEVICE__ long long clock64(void) { return __clock64(); }
+__GPU_DEVICE__ long long clock(void) { return __clock(); }
+__GPU_DEVICE__ long long wall_clock64(void) {
   return (long long)__builtin_readsteadycounter();
 }
 
 // Warp shuffle / synchronization / reduction intrinsics.
-#include <__clang_hip_intrinsics.h>
+#include <__clang_gpu_intrinsics.h>
 
 #pragma pop_macro("MAYBE_UNDEF")
-#pragma pop_macro("__HIP_DEVICE__")
+#pragma pop_macro("__GPU_DEVICE__")
 
-#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
-#endif // __CLANG_HIP_DEVICE_FUNCTIONS_H__
+#endif // device compile
+#endif // __CLANG_GPU_DEVICE_FUNCTIONS_H__
diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h 
b/clang/lib/Headers/__clang_gpu_intrinsics.h
similarity index 77%
rename from clang/lib/Headers/__clang_hip_intrinsics.h
rename to clang/lib/Headers/__clang_gpu_intrinsics.h
index 330a80bf267ac..552de46d72f0c 100644
--- a/clang/lib/Headers/__clang_hip_intrinsics.h
+++ b/clang/lib/Headers/__clang_gpu_intrinsics.h
@@ -1,4 +1,4 @@
-//===--- __clang_hip_intrinsics.h - Device-side HIP intrinsic wrappers 
------===
+//===--- __clang_gpu_intrinsics.h - Device-side GPU intrinsic wrappers 
------===
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,14 +6,15 @@
 //
 //===-----------------------------------------------------------------------===
 
-#ifndef __CLANG_HIP_INTRINSICS_H__
-#define __CLANG_HIP_INTRINSICS_H__
+#ifndef __CLANG_GPU_INTRINSICS_H__
+#define __CLANG_GPU_INTRINSICS_H__
 
-#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
+#if (defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)) ||                   
\
+    (defined(__CUDA__) && defined(__CUDA_ARCH__))
 
-#ifndef __HIP_DEVICE__
+#ifndef __GPU_DEVICE__
 #error                                                                         
\
-    "__clang_hip_intrinsics.h must be included via 
__clang_hip_device_functions.h"
+    "__clang_gpu_intrinsics.h must be included via 
__clang_gpu_device_functions.h"
 #endif
 
 
//===----------------------------------------------------------------------===//
@@ -21,7 +22,7 @@
 
//===----------------------------------------------------------------------===//
 
 template <typename __T>
-__HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, unsigned int __idx,
+__GPU_DEVICE__ __T __gpu_shuffle_idx_impl(__T __v, unsigned int __idx,
                                           int __w) {
   if constexpr (sizeof(__T) == sizeof(unsigned long long))
     return __builtin_bit_cast(
@@ -36,33 +37,33 @@ __HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, unsigned 
int __idx,
 }
 
 template <typename __T>
-__HIP_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int __src_lane,
+__GPU_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int __src_lane,
                           int __width = warpSize) {
-  return __hip_shuffle_idx_impl(
+  return __gpu_shuffle_idx_impl(
       __var, (unsigned int)(__src_lane & (__width - 1)), __width);
 }
 template <typename __T>
-__HIP_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, unsigned int __delta,
+__GPU_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, unsigned int __delta,
                              int __width = warpSize) {
   int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1));
   int __tgt = __rel - int(__delta);
-  return __hip_shuffle_idx_impl(
+  return __gpu_shuffle_idx_impl(
       __var, (unsigned int)(__tgt < 0 ? __rel : __tgt), __width);
 }
 template <typename __T>
-__HIP_DEVICE__ __T __shfl_down(MAYBE_UNDEF __T __var, unsigned int __delta,
+__GPU_DEVICE__ __T __shfl_down(MAYBE_UNDEF __T __var, unsigned int __delta,
                                int __width = warpSize) {
   int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1));
   int __tgt = __rel + int(__delta);
-  return __hip_shuffle_idx_impl(
+  return __gpu_shuffle_idx_impl(
       __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width);
 }
 template <typename __T>
-__HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask,
+__GPU_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask,
                               int __width = warpSize) {
   int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1));
   int __tgt = __rel ^ __lane_mask;
-  return __hip_shuffle_idx_impl(
+  return __gpu_shuffle_idx_impl(
       __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width);
 }
 
@@ -70,7 +71,7 @@ __HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int 
__lane_mask,
 // Warp synchronization
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ void __syncwarp(unsigned long long __mask = -1) {
+__GPU_DEVICE__ void __syncwarp(unsigned long long __mask = -1) {
   __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT);
   __gpu_sync_lane(__mask);
   __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT);
@@ -81,40 +82,40 @@ __HIP_DEVICE__ void __syncwarp(unsigned long long __mask = 
-1) {
 
//===----------------------------------------------------------------------===//
 
 template <typename __MaskT>
-__HIP_DEVICE__ unsigned long long __ballot_sync(__MaskT __mask, int __pred) {
+__GPU_DEVICE__ unsigned long long __ballot_sync(__MaskT __mask, int __pred) {
   return __ballot(__pred) & (unsigned long long)__mask;
 }
 template <typename __MaskT>
-__HIP_DEVICE__ int __all_sync(__MaskT __mask, int __pred) {
+__GPU_DEVICE__ int __all_sync(__MaskT __mask, int __pred) {
   return __ballot_sync(__mask, __pred) == (unsigned long long)__mask;
 }
 template <typename __MaskT>
-__HIP_DEVICE__ int __any_sync(__MaskT __mask, int __pred) {
+__GPU_DEVICE__ int __any_sync(__MaskT __mask, int __pred) {
   return __ballot_sync(__mask, __pred) != 0ull;
 }
 
 template <typename __MaskT, typename __T>
-__HIP_DEVICE__ __T __shfl_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+__GPU_DEVICE__ __T __shfl_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
                                int __src_lane, int __width = warpSize) {
   (void)__mask;
   return __shfl(__var, __src_lane, __width);
 }
 template <typename __MaskT, typename __T>
-__HIP_DEVICE__ __T __shfl_up_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+__GPU_DEVICE__ __T __shfl_up_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
                                   unsigned int __delta,
                                   int __width = warpSize) {
   (void)__mask;
   return __shfl_up(__var, __delta, __width);
 }
 template <typename __MaskT, typename __T>
-__HIP_DEVICE__ __T __shfl_down_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+__GPU_DEVICE__ __T __shfl_down_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
                                     unsigned int __delta,
                                     int __width = warpSize) {
   (void)__mask;
   return __shfl_down(__var, __delta, __width);
 }
 template <typename __MaskT, typename __T>
-__HIP_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+__GPU_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
                                    int __lane_mask, int __width = warpSize) {
   (void)__mask;
   return __shfl_xor(__var, __lane_mask, __width);
@@ -125,7 +126,7 @@ __HIP_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, 
MAYBE_UNDEF __T __var,
 
//===----------------------------------------------------------------------===//
 
 template <typename __T>
-__HIP_DEVICE__ unsigned long long __match_any(__T __value) {
+__GPU_DEVICE__ unsigned long long __match_any(__T __value) {
   if constexpr (sizeof(__T) == sizeof(unsigned long long))
     return __gpu_match_any_u64(__gpu_lane_mask(),
                                __builtin_bit_cast(unsigned long long, 
__value));
@@ -134,13 +135,13 @@ __HIP_DEVICE__ unsigned long long __match_any(__T 
__value) {
                                __builtin_bit_cast(unsigned int, __value));
 }
 template <typename __MaskT, typename __T>
-__HIP_DEVICE__ unsigned long long __match_any_sync(__MaskT __mask,
+__GPU_DEVICE__ unsigned long long __match_any_sync(__MaskT __mask,
                                                    __T __value) {
   return __match_any(__value) & (unsigned long long)__mask;
 }
 
 template <typename __T>
-__HIP_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) {
+__GPU_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) {
   unsigned long long __r;
   if constexpr (sizeof(__T) == sizeof(unsigned long long))
     __r = __gpu_match_all_u64(__gpu_lane_mask(),
@@ -152,7 +153,7 @@ __HIP_DEVICE__ unsigned long long __match_all(__T __value, 
int *__pred) {
   return __r;
 }
 template <typename __MaskT, typename __T>
-__HIP_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value,
+__GPU_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value,
                                                    int *__pred) {
   (void)__mask;
   return __match_all(__value, __pred);
@@ -163,49 +164,49 @@ __HIP_DEVICE__ unsigned long long 
__match_all_sync(__MaskT __mask, __T __value,
 
//===----------------------------------------------------------------------===//
 
 template <typename __MaskT>
-__HIP_DEVICE__ unsigned int __reduce_add_sync(__MaskT __mask,
+__GPU_DEVICE__ unsigned int __reduce_add_sync(__MaskT __mask,
                                               unsigned int __val) {
   return __gpu_lane_add_u32((unsigned long long)__mask, __val);
 }
 template <typename __MaskT>
-__HIP_DEVICE__ int __reduce_add_sync(__MaskT __mask, int __val) {
+__GPU_DEVICE__ int __reduce_add_sync(__MaskT __mask, int __val) {
   return int(
       __gpu_lane_add_u32((unsigned long long)__mask, (unsigned int)__val));
 }
 template <typename __MaskT>
-__HIP_DEVICE__ unsigned int __reduce_min_sync(__MaskT __mask,
+__GPU_DEVICE__ unsigned int __reduce_min_sync(__MaskT __mask,
                                               unsigned int __val) {
   return __gpu_lane_min_u32((unsigned long long)__mask, __val);
 }
 template <typename __MaskT>
-__HIP_DEVICE__ int __reduce_min_sync(__MaskT __mask, int __val) {
+__GPU_DEVICE__ int __reduce_min_sync(__MaskT __mask, int __val) {
   unsigned int __r = __gpu_lane_min_u32((unsigned long long)__mask,
                                         (unsigned int)__val ^ 0x80000000u);
   return int(__r ^ 0x80000000u);
 }
 template <typename __MaskT>
-__HIP_DEVICE__ unsigned int __reduce_max_sync(__MaskT __mask,
+__GPU_DEVICE__ unsigned int __reduce_max_sync(__MaskT __mask,
                                               unsigned int __val) {
   return __gpu_lane_max_u32((unsigned long long)__mask, __val);
 }
 template <typename __MaskT>
-__HIP_DEVICE__ int __reduce_max_sync(__MaskT __mask, int __val) {
+__GPU_DEVICE__ int __reduce_max_sync(__MaskT __mask, int __val) {
   unsigned int __r = __gpu_lane_max_u32((unsigned long long)__mask,
                                         (unsigned int)__val ^ 0x80000000u);
   return int(__r ^ 0x80000000u);
 }
 template <typename __MaskT>
-__HIP_DEVICE__ unsigned int __reduce_and_sync(__MaskT __mask,
+__GPU_DEVICE__ unsigned int __reduce_and_sync(__MaskT __mask,
                                               unsigned int __val) {
   return __gpu_lane_and_u32((unsigned long long)__mask, __val);
 }
 template <typename __MaskT>
-__HIP_DEVICE__ unsigned int __reduce_or_sync(__MaskT __mask,
+__GPU_DEVICE__ unsigned int __reduce_or_sync(__MaskT __mask,
                                              unsigned int __val) {
   return __gpu_lane_or_u32((unsigned long long)__mask, __val);
 }
 template <typename __MaskT>
-__HIP_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask,
+__GPU_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask,
                                               unsigned int __val) {
   return __gpu_lane_xor_u32((unsigned long long)__mask, __val);
 }
@@ -214,26 +215,26 @@ __HIP_DEVICE__ unsigned int __reduce_xor_sync(__MaskT 
__mask,
 // Funnel shifts.
 
//===----------------------------------------------------------------------===//
 
-__HIP_DEVICE__ unsigned int
+__GPU_DEVICE__ unsigned int
 __funnelshift_l(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
   unsigned int __s = __shift & 31u;
   return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 
32);
 }
-__HIP_DEVICE__ unsigned int
+__GPU_DEVICE__ unsigned int
 __funnelshift_lc(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
   unsigned int __s = __shift >= 32u ? 32u : __shift;
   return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 
32);
 }
-__HIP_DEVICE__ unsigned int
+__GPU_DEVICE__ unsigned int
 __funnelshift_r(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
   unsigned int __s = __shift & 31u;
   return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s);
 }
-__HIP_DEVICE__ unsigned int
+__GPU_DEVICE__ unsigned int
 __funnelshift_rc(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
   unsigned int __s = __shift >= 32u ? 32u : __shift;
   return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s);
 }
 
-#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__))
-#endif // __CLANG_HIP_INTRINSICS_H__
+#endif // device compile
+#endif // __CLANG_GPU_INTRINSICS_H__
diff --git a/clang/test/Driver/hip-device-libs-llvm-env.hip 
b/clang/test/Driver/hip-device-libs-llvm-env.hip
index 23354a2961beb..2d6f7f2479fbc 100644
--- a/clang/test/Driver/hip-device-libs-llvm-env.hip
+++ b/clang/test/Driver/hip-device-libs-llvm-env.hip
@@ -17,7 +17,7 @@
 // RUN: 2>&1 | FileCheck --check-prefix=INC %s
 
 // INC: "-cc1" "-triple" "amdgcn-amd-amdhsa-llvm"
-// INC-SAME: "-include" "__clang_hip_device_functions.h"
+// INC-SAME: "-include" "__clang_gpu_device_functions.h"
 
 // RUN: %clang -### --target=x86_64-linux-gnu -nogpuinc \
 // RUN:   --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \
@@ -31,4 +31,4 @@
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck --check-prefix=NOINC %s
 
-// NOINC-NOT: __clang_hip_device_functions.h
+// NOINC-NOT: __clang_gpu_device_functions.h
diff --git a/clang/test/Headers/__clang_hip_device_functions.hip 
b/clang/test/Headers/gpu-device-functions.cpp
similarity index 74%
rename from clang/test/Headers/__clang_hip_device_functions.hip
rename to clang/test/Headers/gpu-device-functions.cpp
index 472f1c97f66b1..e9541936adebd 100644
--- a/clang/test/Headers/__clang_hip_device_functions.hip
+++ b/clang/test/Headers/gpu-device-functions.cpp
@@ -1,15 +1,28 @@
-// REQUIRES: amdgpu-registered-target
+// HIP on AMDGPU.
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers \
+// RUN:   -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \
+// RUN:   -x hip -fcuda-is-device -target-cpu gfx90a -fsyntax-only -verify %s \
+// RUN:   -include __clang_gpu_device_functions.h
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers \
+// RUN:   -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \
+// RUN:   -x hip -fcuda-is-device -target-cpu gfx1100 -fsyntax-only -verify %s 
\
+// RUN:   -include __clang_gpu_device_functions.h
 
+// HIP on SPIR-V.
 // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
 // RUN:   -internal-isystem %S/../../lib/Headers \
-// RUN:   -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \
-// RUN:   -fcuda-is-device -target-cpu gfx90a -fsyntax-only -verify %s \
-// RUN:   -include __clang_hip_device_functions.h
+// RUN:   -triple spirv64-amd-amdhsa -aux-triple x86_64-unknown-unknown \
+// RUN:   -x hip -fcuda-is-device -fsyntax-only -verify %s \
+// RUN:   -include __clang_gpu_device_functions.h
+
+// CUDA on NVPTX.
 // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
 // RUN:   -internal-isystem %S/../../lib/Headers \
-// RUN:   -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \
-// RUN:   -fcuda-is-device -target-cpu gfx1100 -fsyntax-only -verify %s \
-// RUN:   -include __clang_hip_device_functions.h
+// RUN:   -triple nvptx64-nvidia-cuda -aux-triple x86_64-unknown-unknown \
+// RUN:   -x cuda -fcuda-is-device -target-cpu sm_70 -fsyntax-only -verify %s \
+// RUN:   -include __clang_gpu_device_functions.h
 
 // expected-no-diagnostics
 
diff --git a/clang/test/Preprocessor/predefined-macros.c 
b/clang/test/Preprocessor/predefined-macros.c
index 2dd4b78238a7b..dc7a884ec8ed8 100644
--- a/clang/test/Preprocessor/predefined-macros.c
+++ b/clang/test/Preprocessor/predefined-macros.c
@@ -297,6 +297,15 @@
 // CHECK-HIP-DEV: #define __HIPCC__ 1
 // CHECK-HIP-DEV: #define __HIP_DEVICE_COMPILE__ 1
 // CHECK-HIP-DEV: #define __HIP__ 1
+// CHECK-HIP-DEV-NOT: #define __HIP_LLVM__ 1
+
+// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa-llvm \
+// RUN:   -fcuda-is-device \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-HIP-LLVM
+// CHECK-HIP-LLVM: #define __HIPCC__ 1
+// CHECK-HIP-LLVM: #define __HIP_DEVICE_COMPILE__ 1
+// CHECK-HIP-LLVM: #define __HIP_LLVM__ 1
+// CHECK-HIP-LLVM: #define __HIP__ 1
 
 // RUN: %clang_cc1 %s -E -dM -o - -x cuda -triple nvptx \
 // RUN:   -fcuda-is-device \

>From 9f02eaed4bc8527a7f04d30ad849265b4d3563f3 Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Wed, 17 Jun 2026 11:51:46 -0500
Subject: [PATCH 5/6] Fix w/o ROCM

---
 clang/test/Driver/hip-device-libs-llvm-env.hip | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/test/Driver/hip-device-libs-llvm-env.hip 
b/clang/test/Driver/hip-device-libs-llvm-env.hip
index 2d6f7f2479fbc..6d7199c9b35c3 100644
--- a/clang/test/Driver/hip-device-libs-llvm-env.hip
+++ b/clang/test/Driver/hip-device-libs-llvm-env.hip
@@ -10,7 +10,7 @@
 // LLVMENV-NOT: ocml
 // LLVMENV-NOT: ockl
 
-// RUN: %clang -### --target=x86_64-linux-gnu \
+// RUN: %clang -### --target=x86_64-linux-gnu --rocm-path=%S/Inputs/rocm \
 // RUN:   --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \
 // RUN:   -resource-dir=%S/Inputs/rocm_resource_dir \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
@@ -19,13 +19,13 @@
 // INC: "-cc1" "-triple" "amdgcn-amd-amdhsa-llvm"
 // INC-SAME: "-include" "__clang_gpu_device_functions.h"
 
-// RUN: %clang -### --target=x86_64-linux-gnu -nogpuinc \
+// RUN: %clang -### --target=x86_64-linux-gnu -nogpuinc 
--rocm-path=%S/Inputs/rocm \
 // RUN:   --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \
 // RUN:   -resource-dir=%S/Inputs/rocm_resource_dir \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck --check-prefix=NOINC %s
 
-// RUN: %clang -### --target=x86_64-linux-gnu -nohipwrapperinc \
+// RUN: %clang -### --target=x86_64-linux-gnu -nohipwrapperinc 
--rocm-path=%S/Inputs/rocm \
 // RUN:   --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \
 // RUN:   -resource-dir=%S/Inputs/rocm_resource_dir \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \

>From 89a869dcaa36afa26672938623c921da2c4451d5 Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Thu, 18 Jun 2026 18:02:25 -0500
Subject: [PATCH 6/6] Fix stuff

---
 clang/lib/Headers/__clang_gpu_builtin_vars.h  | 11 +++++-----
 .../Headers/__clang_gpu_device_functions.h    |  5 ++---
 clang/lib/Headers/__clang_gpu_intrinsics.h    |  3 +--
 clang/test/Headers/gpu-device-functions.cpp   | 21 +++++++++++++++++++
 4 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/clang/lib/Headers/__clang_gpu_builtin_vars.h 
b/clang/lib/Headers/__clang_gpu_builtin_vars.h
index 43905b0ab4d86..c14cf82ca7c0f 100644
--- a/clang/lib/Headers/__clang_gpu_builtin_vars.h
+++ b/clang/lib/Headers/__clang_gpu_builtin_vars.h
@@ -9,8 +9,7 @@
 #ifndef __CLANG_GPU_BUILTIN_VARS_H__
 #define __CLANG_GPU_BUILTIN_VARS_H__
 
-#if (defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)) ||                   
\
-    (defined(__CUDA__) && defined(__CUDA_ARCH__))
+#if defined(__HIP__) || defined(__CUDA__)
 
 #include <gpuintrin.h>
 
@@ -62,10 +61,10 @@ __GPU_COORD_BUILTIN(__gpu_builtin_gridDim_t, 
__gpu_num_blocks_x(),
 #pragma pop_macro("__GPU_COORD_BUILTIN")
 #pragma pop_macro("__GPU_DISALLOW_BUILTINVAR_ACCESS")
 
-extern const __attribute__((device, weak)) __gpu_builtin_threadIdx_t threadIdx;
-extern const __attribute__((device, weak)) __gpu_builtin_blockIdx_t blockIdx;
-extern const __attribute__((device, weak)) __gpu_builtin_blockDim_t blockDim;
-extern const __attribute__((device, weak)) __gpu_builtin_gridDim_t gridDim;
+static inline const __attribute__((device)) __gpu_builtin_threadIdx_t 
threadIdx{};
+static inline const __attribute__((device)) __gpu_builtin_blockIdx_t 
blockIdx{};
+static inline const __attribute__((device)) __gpu_builtin_blockDim_t 
blockDim{};
+static inline const __attribute__((device)) __gpu_builtin_gridDim_t gridDim{};
 
 #endif // device compile
 #endif // __CLANG_GPU_BUILTIN_VARS_H__
diff --git a/clang/lib/Headers/__clang_gpu_device_functions.h 
b/clang/lib/Headers/__clang_gpu_device_functions.h
index 3b068052d935b..7dbc2548627ef 100644
--- a/clang/lib/Headers/__clang_gpu_device_functions.h
+++ b/clang/lib/Headers/__clang_gpu_device_functions.h
@@ -9,8 +9,7 @@
 #ifndef __CLANG_GPU_DEVICE_FUNCTIONS_H__
 #define __CLANG_GPU_DEVICE_FUNCTIONS_H__
 
-#if (defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)) ||                   
\
-    (defined(__CUDA__) && defined(__CUDA_ARCH__))
+#if defined(__HIP__) || defined(__CUDA__)
 
 #ifndef __device__
 #define __host__ __attribute__((host))
@@ -275,7 +274,7 @@ __GPU_DEVICE__ int __fns(unsigned int __mask, unsigned int 
__base,
 
//===----------------------------------------------------------------------===//
 
 // Some targets (e.g. NVPTX) expose __syncthreads as a compiler builtin 
already.
-#if !__has_builtin(__syncthreads)
+#if !defined(__NVPTX__) && !__has_builtin(__syncthreads)
 __GPU_DEVICE__ void __syncthreads(void) { __gpu_sync_threads(); }
 #endif
 
diff --git a/clang/lib/Headers/__clang_gpu_intrinsics.h 
b/clang/lib/Headers/__clang_gpu_intrinsics.h
index 552de46d72f0c..fc1b162cd4c65 100644
--- a/clang/lib/Headers/__clang_gpu_intrinsics.h
+++ b/clang/lib/Headers/__clang_gpu_intrinsics.h
@@ -9,8 +9,7 @@
 #ifndef __CLANG_GPU_INTRINSICS_H__
 #define __CLANG_GPU_INTRINSICS_H__
 
-#if (defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)) ||                   
\
-    (defined(__CUDA__) && defined(__CUDA_ARCH__))
+#if defined(__HIP__) || defined(__CUDA__)
 
 #ifndef __GPU_DEVICE__
 #error                                                                         
\
diff --git a/clang/test/Headers/gpu-device-functions.cpp 
b/clang/test/Headers/gpu-device-functions.cpp
index e9541936adebd..aa8c8b8605e08 100644
--- a/clang/test/Headers/gpu-device-functions.cpp
+++ b/clang/test/Headers/gpu-device-functions.cpp
@@ -24,6 +24,27 @@
 // RUN:   -x cuda -fcuda-is-device -target-cpu sm_70 -fsyntax-only -verify %s \
 // RUN:   -include __clang_gpu_device_functions.h
 
+// HIP host compilation.
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers \
+// RUN:   -triple x86_64-unknown-unknown -aux-triple amdgcn-amd-amdhsa \
+// RUN:   -aux-target-cpu gfx90a -x hip -fsyntax-only -verify %s \
+// RUN:   -include __clang_gpu_device_functions.h
+//
+// HIP host compilation with a SPIR-V device.
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers \
+// RUN:   -triple x86_64-unknown-unknown -aux-triple spirv64-amd-amdhsa \
+// RUN:   -x hip -fsyntax-only -verify %s \
+// RUN:   -include __clang_gpu_device_functions.h
+//
+// CUDA host compilation.
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers \
+// RUN:   -triple x86_64-unknown-unknown -aux-triple nvptx64-nvidia-cuda \
+// RUN:   -aux-target-cpu sm_70 -x cuda -fsyntax-only -verify %s \
+// RUN:   -include __clang_gpu_device_functions.h
+
 // expected-no-diagnostics
 
 __global__ void test_kernel(int *p, float *f, double *d, unsigned *u) {

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [Clang][HIP] Add LLVM vendored device headers for `*-llvm` triple (PR #203980)

Reply via email to