[clang] [Clang][HIP] Add LLVM vendored device headers for `*-llvm` triple (PR #203980)

Matt Arsenault via cfe-commits Mon, 15 Jun 2026 12:48:16 -0700

================
@@ -0,0 +1,240 @@
+//===--- __clang_hip_intrinsics.h - Device-side HIP intrinsic wrappers 
------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_INTRINSICS_H__
+#define __CLANG_HIP_INTRINSICS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#ifndef __HIP_DEVICE__
+#error                                                                         
\
+    "__clang_hip_intrinsics.h must be included via 
__clang_hip_device_functions.h"
+#endif
+
+//===----------------------------------------------------------------------===//
+// Wavefront shuffles
+//===----------------------------------------------------------------------===//
+
+template <typename __T>
+__HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, unsigned int __idx,
+                                          int __w) {
+  if constexpr (sizeof(__T) == sizeof(unsigned long long))
+    return __builtin_bit_cast(
+        __T, __gpu_shuffle_idx_u64(__gpu_lane_mask(), __idx,
+                                   __builtin_bit_cast(unsigned long long, __v),
+                                   (unsigned int)__w));
+  else
+    return __builtin_bit_cast(
+        __T, __gpu_shuffle_idx_u32(__gpu_lane_mask(), __idx,
+                                   __builtin_bit_cast(unsigned int, __v),
+                                   (unsigned int)__w));
+}
+
+template <typename __T>
+__HIP_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int __src_lane,
+                          int __width = warpSize) {
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__src_lane & (__width - 1)), __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, unsigned int __delta,
+                             int __width = warpSize) {
+  int __rel = (int)(__gpu_lane_id() & (unsigned int)(__width - 1));
+  int __tgt = __rel - (int)__delta;
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__tgt < 0 ? __rel : __tgt), __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_down(MAYBE_UNDEF __T __var, unsigned int __delta,
+                               int __width = warpSize) {
+  int __rel = (int)(__gpu_lane_id() & (unsigned int)(__width - 1));
+  int __tgt = __rel + (int)__delta;
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask,
+                              int __width = warpSize) {
+  int __rel = (int)(__gpu_lane_id() & (unsigned int)(__width - 1));
+  int __tgt = __rel ^ __lane_mask;
+  return __hip_shuffle_idx_impl(
+      __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width);
+}
+
+//===----------------------------------------------------------------------===//
+// Warp synchronization
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ __attribute__((convergent)) void
+__syncwarp(unsigned long long __mask = -1) {
+  __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT);
+  __gpu_sync_lane(__mask);
+  __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT);
+}
+
+//===----------------------------------------------------------------------===//
+// Wave syncrhonization sync aliases.
+//===----------------------------------------------------------------------===//
+
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned long long __ballot_sync(__MaskT __mask, int __pred) {
+  return __ballot(__pred) & (unsigned long long)__mask;
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __all_sync(__MaskT __mask, int __pred) {
+  return __ballot_sync(__mask, __pred) == (unsigned long long)__mask;
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __any_sync(__MaskT __mask, int __pred) {
+  return __ballot_sync(__mask, __pred) != 0ull;
+}
+
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                               int __src_lane, int __width = warpSize) {
+  (void)__mask;
+  return __shfl(__var, __src_lane, __width);
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_up_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                                  unsigned int __delta,
+                                  int __width = warpSize) {
+  (void)__mask;
+  return __shfl_up(__var, __delta, __width);
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_down_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                                    unsigned int __delta,
+                                    int __width = warpSize) {
+  (void)__mask;
+  return __shfl_down(__var, __delta, __width);
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var,
+                                   int __lane_mask, int __width = warpSize) {
+  (void)__mask;
+  return __shfl_xor(__var, __lane_mask, __width);
+}
+
+//===----------------------------------------------------------------------===//
+// Match primitives.
+//===----------------------------------------------------------------------===//
+
+template <typename __T>
+__HIP_DEVICE__ unsigned long long __match_any(__T __value) {
+  if constexpr (sizeof(__T) == sizeof(unsigned long long))
+    return __gpu_match_any_u64(__gpu_lane_mask(),
+                               __builtin_bit_cast(unsigned long long, 
__value));
+  else
+    return __gpu_match_any_u32(__gpu_lane_mask(),
+                               __builtin_bit_cast(unsigned int, __value));
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ unsigned long long __match_any_sync(__MaskT __mask,
+                                                   __T __value) {
+  return __match_any(__value) & (unsigned long long)__mask;
+}
+
+template <typename __T>
+__HIP_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) {
+  unsigned long long __r;
+  if constexpr (sizeof(__T) == sizeof(unsigned long long))
+    __r = __gpu_match_all_u64(__gpu_lane_mask(),
+                              __builtin_bit_cast(unsigned long long, __value));
+  else
+    __r = __gpu_match_all_u32(__gpu_lane_mask(),
+                              __builtin_bit_cast(unsigned int, __value));
+  *__pred = __r != 0;
+  return __r;
+}
+template <typename __MaskT, typename __T>
+__HIP_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value,
+                                                   int *__pred) {
+  (void)__mask;
+  return __match_all(__value, __pred);
+}
+
+//===----------------------------------------------------------------------===//
+// Wave reductions.
+//===----------------------------------------------------------------------===//
+
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_add_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_add_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __reduce_add_sync(__MaskT __mask, int __val) {
+  return (int)__gpu_lane_add_u32((unsigned long long)__mask,
+                                 (unsigned int)__val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_min_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_min_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __reduce_min_sync(__MaskT __mask, int __val) {
+  unsigned int __r = __gpu_lane_min_u32((unsigned long long)__mask,
+                                        (unsigned int)__val ^ 0x80000000u);
+  return (int)(__r ^ 0x80000000u);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_max_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_max_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ int __reduce_max_sync(__MaskT __mask, int __val) {
+  unsigned int __r = __gpu_lane_max_u32((unsigned long long)__mask,
+                                        (unsigned int)__val ^ 0x80000000u);
+  return (int)(__r ^ 0x80000000u);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_and_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_and_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_or_sync(__MaskT __mask,
+                                             unsigned int __val) {
+  return __gpu_lane_or_u32((unsigned long long)__mask, __val);
+}
+template <typename __MaskT>
+__HIP_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask,
+                                              unsigned int __val) {
+  return __gpu_lane_xor_u32((unsigned long long)__mask, __val);
+}
+
+//===----------------------------------------------------------------------===//
+// Funnel shifts.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ unsigned int
+__funnelshift_l(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
+  unsigned int __s = __shift & 31u;
+  return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 
32);
+}
+__HIP_DEVICE__ unsigned int
+__funnelshift_lc(unsigned int __lo, unsigned int __hi, unsigned int __shift) {
----------------
arsenm wrote:


__builtin_elementwish_fsh?

https://github.com/llvm/llvm-project/pull/203980
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [Clang][HIP] Add LLVM vendored device headers for `*-llvm` triple (PR #203980)

Reply via email to