llvmorg-github-actions[bot] wrote:

<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-clang

Author: Joseph Huber (jhuber6)

<details>
<summary>Changes</summary>

Summary:
We already have the `*-llvm` vendor triple to imply a hermetic LLVM
toolchain for HIP. This adds the device portion of the headers that can
be used without a ROCm installation. These are vendored just like the
CUDA ones. Despite being HIP, these are actually generic because it onyl
uses clang builtins and the `gpuintrin` shims.


---

Patch is 31.16 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/203980.diff


7 Files Affected:

- (modified) clang/lib/Basic/Targets/AMDGPU.cpp (+4) 
- (modified) clang/lib/Driver/ToolChains/HIPAMD.cpp (+7) 
- (modified) clang/lib/Headers/CMakeLists.txt (+3) 
- (added) clang/lib/Headers/__clang_hip_builtin_vars.h (+58) 
- (added) clang/lib/Headers/__clang_hip_device_functions.h (+340) 
- (added) clang/lib/Headers/__clang_hip_intrinsics.h (+231) 
- (added) clang/test/Headers/__clang_hip_device_functions.hip (+78) 


``````````diff
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp 
b/clang/lib/Basic/Targets/AMDGPU.cpp
index bfa956fa9a4e3..6d6f3007042dc 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -252,6 +252,10 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   else
     Builder.defineMacro("__R600__");
 
+  // The 'llvm' environment selects the upstream headers.
+  if (getTriple().getEnvironment() == llvm::Triple::LLVM)
+    Builder.defineMacro("__HIP_LLVM__");
+
   // TODO: __HAS_FMAF__, __HAS_LDEXPF__, __HAS_FP64__ are deprecated and will 
be
   // removed in the near future.
   if (hasFMAF())
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp 
b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 01cb23d0aa230..117cb013fe60c 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -331,6 +331,13 @@ void HIPAMDToolChain::AddIAMCUIncludeArgs(const ArgList 
&Args,
 
 void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args) const {
+  if (getTriple().getEnvironment() == llvm::Triple::LLVM) {
+    if (!DriverArgs.hasArg(options::OPT_nohipwrapperinc) &&
+        !DriverArgs.hasArg(options::OPT_nobuiltininc))
+      CC1Args.append({"-include", "__clang_hip_device_functions.h"});
+    return;
+  }
+
   RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args);
 }
 
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 439f2725168ba..4a60af68c1d23 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -86,6 +86,9 @@ set(hip_files
   __clang_hip_math.h
   __clang_hip_stdlib.h
   __clang_hip_runtime_wrapper.h
+  __clang_hip_builtin_vars.h
+  __clang_hip_device_functions.h
+  __clang_hip_intrinsics.h
   )
 
 set(hlsl_h
diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h 
b/clang/lib/Headers/__clang_hip_builtin_vars.h
new file mode 100644
index 0000000000000..7b2db46448374
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_builtin_vars.h
@@ -0,0 +1,58 @@
+//===---- __clang_hip_builtin_vars.h - HIP built-in variables --------------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_BUILTIN_VARS_H__
+#define __CLANG_HIP_BUILTIN_VARS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#include <gpuintrin.h>
+#include <stdint.h>
+
+// The warpSize is a runtime value rather than a compile-time constant.
+inline __attribute__((device)) const struct {
+  __attribute__((device, always_inline, const)) operator int() const noexcept {
+    return __gpu_num_lanes();
+  }
+} warpSize{};
+
+#pragma push_macro("__HIP_COORD_BUILTIN")
+#define __HIP_COORD_BUILTIN(__tag, __fx, __fy, __fz)                           
\
+  struct __tag {                                                               
\
+    __declspec(property(get = __get_x)) uint32_t x;                            
\
+    __declspec(property(get = __get_y)) uint32_t y;                            
\
+    __declspec(property(get = __get_z)) uint32_t z;                            
\
+    __attribute__((device, always_inline)) uint32_t __get_x() const {          
\
+      return __fx;                                                             
\
+    }                                                                          
\
+    __attribute__((device, always_inline)) uint32_t __get_y() const {          
\
+      return __fy;                                                             
\
+    }                                                                          
\
+    __attribute__((device, always_inline)) uint32_t __get_z() const {          
\
+      return __fz;                                                             
\
+    }                                                                          
\
+  }
+
+__HIP_COORD_BUILTIN(__hip_builtin_threadIdx_t, __gpu_thread_id_x(),
+                    __gpu_thread_id_y(), __gpu_thread_id_z());
+__HIP_COORD_BUILTIN(__hip_builtin_blockIdx_t, __gpu_block_id_x(),
+                    __gpu_block_id_y(), __gpu_block_id_z());
+__HIP_COORD_BUILTIN(__hip_builtin_blockDim_t, __gpu_num_threads_x(),
+                    __gpu_num_threads_y(), __gpu_num_threads_z());
+__HIP_COORD_BUILTIN(__hip_builtin_gridDim_t, __gpu_num_blocks_x(),
+                    __gpu_num_blocks_y(), __gpu_num_blocks_z());
+
+#pragma pop_macro("__HIP_COORD_BUILTIN")
+
+extern const __attribute__((device, weak)) __hip_builtin_threadIdx_t threadIdx;
+extern const __attribute__((device, weak)) __hip_builtin_blockIdx_t blockIdx;
+extern const __attribute__((device, weak)) __hip_builtin_blockDim_t blockDim;
+extern const __attribute__((device, weak)) __hip_builtin_gridDim_t gridDim;
+
+#endif // __HIP__ && (defined(__AMDGPU__) || defined(__SPIRV__))
+#endif // __CLANG_HIP_BUILTIN_VARS_H__
diff --git a/clang/lib/Headers/__clang_hip_device_functions.h 
b/clang/lib/Headers/__clang_hip_device_functions.h
new file mode 100644
index 0000000000000..3a287ca71a48f
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_device_functions.h
@@ -0,0 +1,340 @@
+//===---- __clang_hip_device_functions.h - HIP device functions ------------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_DEVICE_FUNCTIONS_H__
+#define __CLANG_HIP_DEVICE_FUNCTIONS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#ifndef __device__
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#define __managed__ __attribute__((managed))
+#endif
+
+#include <gpuintrin.h>
+#include <stdint.h>
+
+#pragma push_macro("__HIP_DEVICE__")
+#define __HIP_DEVICE__ static __inline__ __attribute__((device, always_inline))
+
+#pragma push_macro("MAYBE_UNDEF")
+#define MAYBE_UNDEF __attribute__((maybe_undef))
+
+// warpSize and the threadIdx/blockIdx/blockDim/gridDim coordinate variables.
+#include <__clang_hip_builtin_vars.h>
+
+//===----------------------------------------------------------------------===//
+// Integer intrinsics.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ uint32_t __popc(uint32_t __x) {
+  return __builtin_elementwise_popcount(__x);
+}
+__HIP_DEVICE__ uint32_t __popcll(uint64_t __x) {
+  return __builtin_elementwise_popcount(__x);
+}
+
+__HIP_DEVICE__ int32_t __clz(int32_t __x) {
+  return __builtin_elementwise_clzg((uint32_t)__x, (uint32_t)32);
+}
+__HIP_DEVICE__ int32_t __clzll(int64_t __x) {
+  return __builtin_elementwise_clzg((uint64_t)__x, (uint64_t)64);
+}
+
+__HIP_DEVICE__ int32_t __ffs(int32_t __x) {
+  return __builtin_elementwise_ctzg((uint32_t)__x, (uint32_t)-1) + 1;
+}
+__HIP_DEVICE__ int32_t __ffs(uint32_t __x) {
+  return __builtin_elementwise_ctzg(__x, (uint32_t)-1) + 1;
+}
+__HIP_DEVICE__ int32_t __ffsll(int64_t __x) {
+  return __builtin_elementwise_ctzg((uint64_t)__x, (uint64_t)-1) + 1;
+}
+__HIP_DEVICE__ int32_t __ffsll(uint64_t __x) {
+  return __builtin_elementwise_ctzg(__x, (uint64_t)-1) + 1;
+}
+
+__HIP_DEVICE__ uint32_t __brev(uint32_t __x) {
+  return __builtin_elementwise_bitreverse(__x);
+}
+__HIP_DEVICE__ uint64_t __brevll(uint64_t __x) {
+  return __builtin_elementwise_bitreverse(__x);
+}
+
+__HIP_DEVICE__ int32_t __mul24(int32_t __x, int32_t __y) {
+  return (((int32_t)((uint32_t)__x << 8) >> 8)) *
+         (((int32_t)((uint32_t)__y << 8) >> 8));
+}
+__HIP_DEVICE__ int32_t __umul24(uint32_t __x, uint32_t __y) {
+  return (int32_t)((__x & 0x00ffffffu) * (__y & 0x00ffffffu));
+}
+
+__HIP_DEVICE__ int32_t __mulhi(int32_t __x, int32_t __y) {
+  return (int32_t)(((int64_t)__x * (int64_t)__y) >> 32);
+}
+__HIP_DEVICE__ uint32_t __umulhi(uint32_t __x, uint32_t __y) {
+  return (uint32_t)(((uint64_t)__x * (uint64_t)__y) >> 32);
+}
+__HIP_DEVICE__ int64_t __mul64hi(int64_t __x, int64_t __y) {
+  return (int64_t)(((__int128)__x * (__int128)__y) >> 64);
+}
+__HIP_DEVICE__ uint64_t __umul64hi(uint64_t __x, uint64_t __y) {
+  return (uint64_t)(((unsigned __int128)__x * (unsigned __int128)__y) >> 64);
+}
+
+__HIP_DEVICE__ uint32_t __sad(int32_t __x, int32_t __y, uint32_t __z) {
+  return __x > __y ? __x - __y + __z : __y - __x + __z;
+}
+__HIP_DEVICE__ uint32_t __usad(uint32_t __x, uint32_t __y, uint32_t __z) {
+  return __x > __y ? __x - __y + __z : __y - __x + __z;
+}
+
+__HIP_DEVICE__ int32_t __hadd(int32_t __x, int32_t __y) {
+  return (int32_t)(((int64_t)__x + (int64_t)__y) >> 1);
+}
+__HIP_DEVICE__ int32_t __rhadd(int32_t __x, int32_t __y) {
+  return (int32_t)(((int64_t)__x + (int64_t)__y + 1) >> 1);
+}
+__HIP_DEVICE__ uint32_t __uhadd(uint32_t __x, uint32_t __y) {
+  return (uint32_t)(((uint64_t)__x + (uint64_t)__y) >> 1);
+}
+__HIP_DEVICE__ uint32_t __urhadd(uint32_t __x, uint32_t __y) {
+  return (uint32_t)(((uint64_t)__x + (uint64_t)__y + 1) >> 1);
+}
+
+__HIP_DEVICE__ uint32_t __byte_perm(uint32_t __x, uint32_t __y, uint32_t __s) {
+  uint64_t __tmp = ((uint64_t)__y << 32) | __x;
+  uint32_t __result = 0;
+  for (int32_t __i = 0; __i < 4; ++__i) {
+    uint32_t __sel = (__s >> (__i * 4)) & 0x7u;
+    __result |= (uint32_t)((__tmp >> (__sel * 8)) & 0xffu) << (__i * 8);
+  }
+  return __result;
+}
+
+//===----------------------------------------------------------------------===//
+// Bitfield operations.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ uint32_t __lastbit_u32_u64(uint64_t __x) {
+  return (uint32_t)__builtin_elementwise_ctzg(__x, (uint64_t)-1);
+}
+
+__HIP_DEVICE__ uint32_t __bitextract_u32(uint32_t __src, uint32_t __offset,
+                                         uint32_t __width) {
+  uint32_t __o = __offset & 31u;
+  uint32_t __w = __width & 31u;
+  return __w == 0 ? 0u : (__src << (32u - __o - __w)) >> (32u - __w);
+}
+__HIP_DEVICE__ uint64_t __bitextract_u64(uint64_t __src, uint32_t __offset,
+                                         uint32_t __width) {
+  uint64_t __o = __offset & 63u;
+  uint64_t __w = __width & 63u;
+  return __w == 0 ? 0ull : (__src << (64ull - __o - __w)) >> (64ull - __w);
+}
+
+__HIP_DEVICE__ uint32_t __bitinsert_u32(uint32_t __dst, uint32_t __src,
+                                        uint32_t __offset, uint32_t __width) {
+  uint32_t __o = __offset & 31u;
+  uint32_t __mask = (1u << (__width & 31u)) - 1u;
+  return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o);
+}
+__HIP_DEVICE__ uint64_t __bitinsert_u64(uint64_t __dst, uint64_t __src,
+                                        uint32_t __offset, uint32_t __width) {
+  uint64_t __o = __offset & 63u;
+  uint64_t __mask = (1ull << (__width & 63u)) - 1ull;
+  return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o);
+}
+
+//===----------------------------------------------------------------------===//
+// Type punning.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ int32_t __float_as_int(float __x) {
+  return __builtin_bit_cast(int32_t, __x);
+}
+__HIP_DEVICE__ uint32_t __float_as_uint(float __x) {
+  return __builtin_bit_cast(uint32_t, __x);
+}
+__HIP_DEVICE__ float __int_as_float(int32_t __x) {
+  return __builtin_bit_cast(float, __x);
+}
+__HIP_DEVICE__ float __uint_as_float(uint32_t __x) {
+  return __builtin_bit_cast(float, __x);
+}
+__HIP_DEVICE__ int64_t __double_as_longlong(double __x) {
+  return __builtin_bit_cast(int64_t, __x);
+}
+__HIP_DEVICE__ double __longlong_as_double(int64_t __x) {
+  return __builtin_bit_cast(double, __x);
+}
+__HIP_DEVICE__ int32_t __double2hiint(double __x) {
+  return (int32_t)(__builtin_bit_cast(uint64_t, __x) >> 32);
+}
+__HIP_DEVICE__ int32_t __double2loint(double __x) {
+  return (int32_t)__builtin_bit_cast(uint64_t, __x);
+}
+__HIP_DEVICE__ double __hiloint2double(int32_t __hi, int32_t __lo) {
+  return __builtin_bit_cast(double, ((uint64_t)(uint32_t)__hi << 32) |
+                                        (uint64_t)(uint32_t)__lo);
+}
+
+//===----------------------------------------------------------------------===//
+// Wavefront vote and lane identity.
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ uint32_t __lane_id(void) { return __gpu_lane_id(); }
+
+__HIP_DEVICE__ uint64_t __ballot(int32_t __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred);
+}
+__HIP_DEVICE__ uint64_t __ballot64(int32_t __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred);
+}
+__HIP_DEVICE__ uint64_t __activemask(void) {
+  return __gpu_ballot(__gpu_lane_mask(), 1);
+}
+
+__HIP_DEVICE__ int32_t __all(int32_t __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred) == __gpu_lane_mask();
+}
+__HIP_DEVICE__ int32_t __any(int32_t __pred) {
+  return __gpu_ballot(__gpu_lane_mask(), __pred) != 0ull;
+}
+
+template <typename __T>
+__HIP_DEVICE__ int32_t __hip_fns_impl(__T __mask, uint32_t __base,
+                                      int32_t __offset) {
+  const int32_t __bits = (int32_t)sizeof(__T) * 8;
+  __T __m = __mask;
+  int32_t __off = __offset;
+  if (__offset == 0) {
+    __m &= ((__T)1 << __base);
+    __off = 1;
+  } else if (__offset < 0) {
+    __m = __builtin_elementwise_bitreverse(__mask);
+    __base = (uint32_t)(__bits - 1) - __base;
+    __off = -__offset;
+  }
+  __m &= (~(__T)0) << __base;
+  if ((int32_t)__builtin_elementwise_popcount(__m) < __off)
+    return -1;
+  int32_t __total = 0;
+  for (int32_t __i = __bits / 2; __i > 0; __i >>= 1) {
+    __T __lo = __m & (((__T)1 << __i) - 1);
+    int32_t __pcnt = (int32_t)__builtin_elementwise_popcount(__lo);
+    if (__pcnt < __off) {
+      __m >>= __i;
+      __off -= __pcnt;
+      __total += __i;
+    } else {
+      __m = __lo;
+    }
+  }
+  return __offset < 0 ? (__bits - 1) - __total : __total;
+}
+__HIP_DEVICE__ int32_t __fns64(uint64_t __mask, uint32_t __base,
+                               int32_t __offset) {
+  return __hip_fns_impl(__mask, __base, __offset);
+}
+__HIP_DEVICE__ int32_t __fns32(uint64_t __mask, uint32_t __base,
+                               int32_t __offset) {
+  return __hip_fns_impl((uint32_t)__mask, __base, __offset);
+}
+__HIP_DEVICE__ int32_t __fns(uint32_t __mask, uint32_t __base,
+                             int32_t __offset) {
+  return __fns32(__mask, __base, __offset);
+}
+
+//===----------------------------------------------------------------------===//
+// Synchronization and fences
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ void __syncthreads(void) { __gpu_sync_threads(); }
+
+template <typename __Fn>
+__HIP_DEVICE__ int32_t __hip_block_reduce_impl(int32_t __val, int32_t __init,
+                                               __Fn __op) {
+  static __shared__ int32_t __scratch[32];
+  uint32_t __lanes = __gpu_num_lanes();
+  uint32_t __nthreads = __gpu_num_threads(__GPU_X_DIM) *
+                        __gpu_num_threads(__GPU_Y_DIM) *
+                        __gpu_num_threads(__GPU_Z_DIM);
+  uint32_t __nwarps = (__nthreads + __lanes - 1) / __lanes;
+  uint32_t __tid =
+      (__gpu_thread_id(__GPU_Z_DIM) * __gpu_num_threads(__GPU_Y_DIM) +
+       __gpu_thread_id(__GPU_Y_DIM)) *
+          __gpu_num_threads(__GPU_X_DIM) +
+      __gpu_thread_id(__GPU_X_DIM);
+
+  if (__gpu_is_first_in_lane(__gpu_lane_mask()))
+    __scratch[__tid / __lanes] = __val;
+  __gpu_sync_threads();
+
+  int32_t __acc = __init;
+  for (uint32_t __i = 0; __i < __nwarps; ++__i)
+    __acc = __op(__acc, __scratch[__i]);
+  __gpu_sync_threads();
+  return __acc;
+}
+
+__HIP_DEVICE__ int32_t __syncthreads_count(int32_t __pred) {
+  uint64_t __mask = __gpu_lane_mask();
+  int32_t __val = __builtin_elementwise_popcount(__gpu_ballot(__mask, __pred));
+  return __hip_block_reduce_impl(
+      __val, 0, [](int32_t __a, int32_t __b) { return __a + __b; });
+}
+__HIP_DEVICE__ int32_t __syncthreads_and(int32_t __pred) {
+  uint64_t __mask = __gpu_lane_mask();
+  int32_t __val = __gpu_ballot(__mask, __pred) == __mask;
+  return __hip_block_reduce_impl(
+      __val, 1, [](int32_t __a, int32_t __b) { return __a & __b; });
+}
+__HIP_DEVICE__ int32_t __syncthreads_or(int32_t __pred) {
+  uint64_t __mask = __gpu_lane_mask();
+  int32_t __val = __gpu_ballot(__mask, __pred) != 0ull;
+  return __hip_block_reduce_impl(
+      __val, 0, [](int32_t __a, int32_t __b) { return __a | __b; });
+}
+
+__HIP_DEVICE__ void __threadfence(void) {
+  __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);
+}
+__HIP_DEVICE__ void __threadfence_block(void) {
+  __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_WRKGRP);
+}
+__HIP_DEVICE__ void __threadfence_system(void) {
+  __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM);
+}
+
+//===----------------------------------------------------------------------===//
+// Timers
+//===----------------------------------------------------------------------===//
+
+__HIP_DEVICE__ int64_t __clock64(void) {
+  return (int64_t)__builtin_readcyclecounter();
+}
+__HIP_DEVICE__ int64_t __clock(void) { return __clock64(); }
+__HIP_DEVICE__ int64_t clock64(void) { return __clock64(); }
+__HIP_DEVICE__ int64_t clock(void) { return __clock(); }
+__HIP_DEVICE__ int64_t wall_clock64(void) {
+  return (int64_t)__builtin_readsteadycounter();
+}
+
+// Warp shuffle / synchronization / reduction intrinsics.
+#include <__clang_hip_intrinsics.h>
+
+#pragma pop_macro("MAYBE_UNDEF")
+#pragma pop_macro("__HIP_DEVICE__")
+
+#endif // __HIP__ && defined(__AMDGPU__)
+#endif // __CLANG_HIP_DEVICE_FUNCTIONS_H__
diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h 
b/clang/lib/Headers/__clang_hip_intrinsics.h
new file mode 100644
index 0000000000000..9d1b784dc92fe
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_intrinsics.h
@@ -0,0 +1,231 @@
+//===--- __clang_hip_intrinsics.h - Device-side HIP intrinsic wrappers 
------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_HIP_INTRINSICS_H__
+#define __CLANG_HIP_INTRINSICS_H__
+
+#if __HIP__ && (defined(__AMDGPU__))
+
+#ifndef __HIP_DEVICE__
+#error                                                                         
\
+    "__clang_hip_intrinsics.h must be included via 
__clang_hip_device_functions.h"
+#endif
+
+#include <stdint.h>
+
+//===----------------------------------------------------------------------===//
+// Wavefront shuffles
+//===----------------------------------------------------------------------===//
+
+template <typename __T>
+__HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, uint32_t __idx,
+                                          int32_t __w) {
+  if constexpr (sizeof(__T) == sizeof(uint64_t))
+    return __builtin_bit_cast(
+        __T, __gpu_shuffle_idx_u64(__gpu_lane_mask(), __idx,
+                                   __builtin_bit_cast(uint64_t, __v),
+                                   (uint32_t)__w));
+  else
+    return __builtin_bit_cast(
+        __T, __gpu_shuffle_idx_u32(__gpu_lane_mask(), __idx,
+                                   __builtin_bit_cast(uint32_t, __v),
+                                   (uint32_t)__w));
+}
+
+template <typename __T>
+__HIP_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int32_t __src_lane,
+                          int32_t __width = warpSize) {
+  return __hip_shuffle_idx_impl(__var, (uint32_t)(__src_lane & (__width - 1)),
+                                __width);
+}
+template <typename __T>
+__HIP_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, uint32_t __delta,
+                             int32_t __width = warpSize) {
+  int32_t __rel = (int32_t)(__gpu_lane_id() & (uint32_t)(__width - 1));
+  int32_t __tgt = __rel - (int32_t)__delta;
+  return __hip_shuffle_idx_impl(__var, (uint32_t)...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/203980
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to