https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/203980
>From 8ec209d5f9452dcf7666777a44db56bb55fe478a Mon Sep 17 00:00:00 2001 From: Joseph Huber <[email protected]> Date: Mon, 15 Jun 2026 11:47:19 -0500 Subject: [PATCH] [Clang][HIP] Add LLVM vendored device headers for `*-llvm` triple Summary: We already have the `*-llvm` vendor triple to imply a hermetic LLVM toolchain for HIP. This adds the device portion of the headers that can be used without a ROCm installation. These are vendored just like the CUDA ones. Despite being HIP, these are actually generic because it onyl uses clang builtins and the `gpuintrin` shims. --- clang/lib/Basic/Targets/AMDGPU.cpp | 4 + clang/lib/Driver/ToolChains/HIPAMD.cpp | 7 + clang/lib/Headers/CMakeLists.txt | 3 + clang/lib/Headers/__clang_hip_builtin_vars.h | 57 +++ .../Headers/__clang_hip_device_functions.h | 357 ++++++++++++++++++ clang/lib/Headers/__clang_hip_intrinsics.h | 240 ++++++++++++ .../Headers/__clang_hip_device_functions.hip | 76 ++++ 7 files changed, 744 insertions(+) create mode 100644 clang/lib/Headers/__clang_hip_builtin_vars.h create mode 100644 clang/lib/Headers/__clang_hip_device_functions.h create mode 100644 clang/lib/Headers/__clang_hip_intrinsics.h create mode 100644 clang/test/Headers/__clang_hip_device_functions.hip diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index bfa956fa9a4e3..906601850e353 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -252,6 +252,10 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts, else Builder.defineMacro("__R600__"); + // The 'llvm' environment selects the upstream headers. + if (getTriple().getEnvironment() == llvm::Triple::LLVM) + Builder.defineMacro("__AMDGCN_LLVM__"); + // TODO: __HAS_FMAF__, __HAS_LDEXPF__, __HAS_FP64__ are deprecated and will be // removed in the near future. if (hasFMAF()) diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 01cb23d0aa230..117cb013fe60c 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -331,6 +331,13 @@ void HIPAMDToolChain::AddIAMCUIncludeArgs(const ArgList &Args, void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { + if (getTriple().getEnvironment() == llvm::Triple::LLVM) { + if (!DriverArgs.hasArg(options::OPT_nohipwrapperinc) && + !DriverArgs.hasArg(options::OPT_nobuiltininc)) + CC1Args.append({"-include", "__clang_hip_device_functions.h"}); + return; + } + RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args); } diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 439f2725168ba..4a60af68c1d23 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -86,6 +86,9 @@ set(hip_files __clang_hip_math.h __clang_hip_stdlib.h __clang_hip_runtime_wrapper.h + __clang_hip_builtin_vars.h + __clang_hip_device_functions.h + __clang_hip_intrinsics.h ) set(hlsl_h diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h b/clang/lib/Headers/__clang_hip_builtin_vars.h new file mode 100644 index 0000000000000..f2e53ef93b38a --- /dev/null +++ b/clang/lib/Headers/__clang_hip_builtin_vars.h @@ -0,0 +1,57 @@ +//===---- __clang_hip_builtin_vars.h - HIP built-in variables --------------=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#ifndef __CLANG_HIP_BUILTIN_VARS_H__ +#define __CLANG_HIP_BUILTIN_VARS_H__ + +#if __HIP__ && (defined(__AMDGPU__)) + +#include <gpuintrin.h> + +// The warpSize is a runtime value rather than a compile-time constant. +inline __attribute__((device)) const struct { + __attribute__((device, always_inline, const)) operator int() const noexcept { + return __gpu_num_lanes(); + } +} warpSize{}; + +#pragma push_macro("__HIP_COORD_BUILTIN") +#define __HIP_COORD_BUILTIN(__tag, __fx, __fy, __fz) \ + struct __tag { \ + __declspec(property(get = __get_x)) unsigned int x; \ + __declspec(property(get = __get_y)) unsigned int y; \ + __declspec(property(get = __get_z)) unsigned int z; \ + __attribute__((device, always_inline)) unsigned int __get_x() const { \ + return __fx; \ + } \ + __attribute__((device, always_inline)) unsigned int __get_y() const { \ + return __fy; \ + } \ + __attribute__((device, always_inline)) unsigned int __get_z() const { \ + return __fz; \ + } \ + } + +__HIP_COORD_BUILTIN(__hip_builtin_threadIdx_t, __gpu_thread_id_x(), + __gpu_thread_id_y(), __gpu_thread_id_z()); +__HIP_COORD_BUILTIN(__hip_builtin_blockIdx_t, __gpu_block_id_x(), + __gpu_block_id_y(), __gpu_block_id_z()); +__HIP_COORD_BUILTIN(__hip_builtin_blockDim_t, __gpu_num_threads_x(), + __gpu_num_threads_y(), __gpu_num_threads_z()); +__HIP_COORD_BUILTIN(__hip_builtin_gridDim_t, __gpu_num_blocks_x(), + __gpu_num_blocks_y(), __gpu_num_blocks_z()); + +#pragma pop_macro("__HIP_COORD_BUILTIN") + +extern const __attribute__((device, weak)) __hip_builtin_threadIdx_t threadIdx; +extern const __attribute__((device, weak)) __hip_builtin_blockIdx_t blockIdx; +extern const __attribute__((device, weak)) __hip_builtin_blockDim_t blockDim; +extern const __attribute__((device, weak)) __hip_builtin_gridDim_t gridDim; + +#endif // __HIP__ && (defined(__AMDGPU__) || defined(__SPIRV__)) +#endif // __CLANG_HIP_BUILTIN_VARS_H__ diff --git a/clang/lib/Headers/__clang_hip_device_functions.h b/clang/lib/Headers/__clang_hip_device_functions.h new file mode 100644 index 0000000000000..fb95c3116813d --- /dev/null +++ b/clang/lib/Headers/__clang_hip_device_functions.h @@ -0,0 +1,357 @@ +//===---- __clang_hip_device_functions.h - HIP device functions ------------=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#ifndef __CLANG_HIP_DEVICE_FUNCTIONS_H__ +#define __CLANG_HIP_DEVICE_FUNCTIONS_H__ + +#if __HIP__ && (defined(__AMDGPU__)) + +#ifndef __device__ +#define __host__ __attribute__((host)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) +#define __managed__ __attribute__((managed)) +#endif + +#include <gpuintrin.h> + +#pragma push_macro("__HIP_DEVICE__") +#define __HIP_DEVICE__ static __inline__ __attribute__((device, always_inline)) + +#pragma push_macro("MAYBE_UNDEF") +#define MAYBE_UNDEF __attribute__((maybe_undef)) + +// warpSize and the threadIdx/blockIdx/blockDim/gridDim coordinate variables. +#include <__clang_hip_builtin_vars.h> + +//===----------------------------------------------------------------------===// +// Integer intrinsics. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ unsigned int __popc(unsigned int __x) { + return __builtin_popcountg(__x); +} +__HIP_DEVICE__ unsigned int __popcll(unsigned long long __x) { + return __builtin_popcountg(__x); +} + +__HIP_DEVICE__ int __clz(int __x) { + return __builtin_clzg((unsigned int)__x, 32); +} +__HIP_DEVICE__ int __clzll(long long __x) { + return __builtin_clzg((unsigned long long)__x, 64); +} + +__HIP_DEVICE__ int __ffs(int __x) { + return __builtin_ctzg((unsigned int)__x, -1) + 1; +} +__HIP_DEVICE__ int __ffs(unsigned int __x) { + return __builtin_ctzg(__x, -1) + 1; +} +__HIP_DEVICE__ int __ffsll(long long __x) { + return __builtin_ctzg((unsigned long long)__x, -1) + 1; +} +__HIP_DEVICE__ int __ffsll(unsigned long long __x) { + return __builtin_ctzg(__x, -1) + 1; +} + +__HIP_DEVICE__ unsigned int __brev(unsigned int __x) { + return __builtin_elementwise_bitreverse(__x); +} +__HIP_DEVICE__ unsigned long long __brevll(unsigned long long __x) { + return __builtin_elementwise_bitreverse(__x); +} + +__HIP_DEVICE__ int __mul24(int __x, int __y) { + return (((int)((unsigned)__x << 8) >> 8)) * + (((int)((unsigned)__y << 8) >> 8)); +} +__HIP_DEVICE__ int __umul24(unsigned int __x, unsigned int __y) { + return (int)((__x & 0x00ffffffu) * (__y & 0x00ffffffu)); +} + +__HIP_DEVICE__ int __mulhi(int __x, int __y) { + return (int)(((long long)__x * (long long)__y) >> 32); +} +__HIP_DEVICE__ unsigned int __umulhi(unsigned int __x, unsigned int __y) { + return (unsigned int)(((unsigned long long)__x * (unsigned long long)__y) >> + 32); +} +__HIP_DEVICE__ long long __mul64hi(long long __x, long long __y) { + return (long long)(((__int128)__x * (__int128)__y) >> 64); +} +__HIP_DEVICE__ unsigned long long __umul64hi(unsigned long long __x, + unsigned long long __y) { + return ( + unsigned long long)(((unsigned __int128)__x * (unsigned __int128)__y) >> + 64); +} + +__HIP_DEVICE__ unsigned int __sad(int __x, int __y, unsigned int __z) { + return __x > __y ? __x - __y + __z : __y - __x + __z; +} +__HIP_DEVICE__ unsigned int __usad(unsigned int __x, unsigned int __y, + unsigned int __z) { + return __x > __y ? __x - __y + __z : __y - __x + __z; +} + +__HIP_DEVICE__ int __hadd(int __x, int __y) { + return (int)(((long long)__x + (long long)__y) >> 1); +} +__HIP_DEVICE__ int __rhadd(int __x, int __y) { + return (int)(((long long)__x + (long long)__y + 1) >> 1); +} +__HIP_DEVICE__ unsigned int __uhadd(unsigned int __x, unsigned int __y) { + return (unsigned int)(((unsigned long long)__x + (unsigned long long)__y) >> + 1); +} +__HIP_DEVICE__ unsigned int __urhadd(unsigned int __x, unsigned int __y) { + return ( + unsigned int)(((unsigned long long)__x + (unsigned long long)__y + 1) >> + 1); +} + +__HIP_DEVICE__ unsigned int __byte_perm(unsigned int __x, unsigned int __y, + unsigned int __s) { + unsigned long long __tmp = ((unsigned long long)__y << 32) | __x; + unsigned int __result = 0; + for (int __i = 0; __i < 4; ++__i) { + unsigned int __sel = (__s >> (__i * 4)) & 0x7u; + __result |= (unsigned int)((__tmp >> (__sel * 8)) & 0xffu) << (__i * 8); + } + return __result; +} + +//===----------------------------------------------------------------------===// +// Bitfield operations. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ unsigned int __lastbit_u32_u64(unsigned long long __x) { + return (unsigned int)__builtin_ctzg(__x, -1); +} + +__HIP_DEVICE__ unsigned int __bitextract_u32(unsigned int __src, + unsigned int __offset, + unsigned int __width) { + unsigned int __o = __offset & 31u; + unsigned int __w = __width & 31u; + return __w == 0 ? 0u : (__src << (32u - __o - __w)) >> (32u - __w); +} +__HIP_DEVICE__ unsigned long long __bitextract_u64(unsigned long long __src, + unsigned int __offset, + unsigned int __width) { + unsigned long long __o = __offset & 63u; + unsigned long long __w = __width & 63u; + return __w == 0 ? 0ull : (__src << (64ull - __o - __w)) >> (64ull - __w); +} + +__HIP_DEVICE__ unsigned int __bitinsert_u32(unsigned int __dst, + unsigned int __src, + unsigned int __offset, + unsigned int __width) { + unsigned int __o = __offset & 31u; + unsigned int __mask = (1u << (__width & 31u)) - 1u; + return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o); +} +__HIP_DEVICE__ unsigned long long __bitinsert_u64(unsigned long long __dst, + unsigned long long __src, + unsigned int __offset, + unsigned int __width) { + unsigned long long __o = __offset & 63u; + unsigned long long __mask = (1ull << (__width & 63u)) - 1ull; + return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o); +} + +//===----------------------------------------------------------------------===// +// Type punning. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ int __float_as_int(float __x) { + return __builtin_bit_cast(int, __x); +} +__HIP_DEVICE__ unsigned int __float_as_uint(float __x) { + return __builtin_bit_cast(unsigned int, __x); +} +__HIP_DEVICE__ float __int_as_float(int __x) { + return __builtin_bit_cast(float, __x); +} +__HIP_DEVICE__ float __uint_as_float(unsigned int __x) { + return __builtin_bit_cast(float, __x); +} +__HIP_DEVICE__ long long __double_as_longlong(double __x) { + return __builtin_bit_cast(long long, __x); +} +__HIP_DEVICE__ double __longlong_as_double(long long __x) { + return __builtin_bit_cast(double, __x); +} +__HIP_DEVICE__ int __double2hiint(double __x) { + return (int)(__builtin_bit_cast(unsigned long long, __x) >> 32); +} +__HIP_DEVICE__ int __double2loint(double __x) { + return (int)__builtin_bit_cast(unsigned long long, __x); +} +__HIP_DEVICE__ double __hiloint2double(int __hi, int __lo) { + return __builtin_bit_cast(double, + ((unsigned long long)(unsigned int)__hi << 32) | + (unsigned long long)(unsigned int)__lo); +} + +//===----------------------------------------------------------------------===// +// Wavefront vote and lane identity. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ unsigned int __lane_id(void) { return __gpu_lane_id(); } + +__HIP_DEVICE__ unsigned long long __ballot(int __pred) { + return __gpu_ballot(__gpu_lane_mask(), __pred); +} +__HIP_DEVICE__ unsigned long long __ballot64(int __pred) { + return __gpu_ballot(__gpu_lane_mask(), __pred); +} +__HIP_DEVICE__ unsigned long long __activemask(void) { + return __gpu_ballot(__gpu_lane_mask(), 1); +} + +__HIP_DEVICE__ int __all(int __pred) { + return __gpu_ballot(__gpu_lane_mask(), __pred) == __gpu_lane_mask(); +} +__HIP_DEVICE__ int __any(int __pred) { + return __gpu_ballot(__gpu_lane_mask(), __pred) != 0ull; +} + +template <typename __T> +__HIP_DEVICE__ int __hip_fns_impl(__T __mask, unsigned int __base, + int __offset) { + const int __bits = (int)sizeof(__T) * 8; + __T __m = __mask; + int __off = __offset; + if (__offset == 0) { + __m &= ((__T)1 << __base); + __off = 1; + } else if (__offset < 0) { + __m = __builtin_elementwise_bitreverse(__mask); + __base = (unsigned int)(__bits - 1) - __base; + __off = -__offset; + } + __m &= (~(__T)0) << __base; + if (__builtin_popcountg(__m) < __off) + return -1; + int __total = 0; + for (int __i = __bits / 2; __i > 0; __i >>= 1) { + __T __lo = __m & (((__T)1 << __i) - 1); + int __pcnt = __builtin_popcountg(__lo); + if (__pcnt < __off) { + __m >>= __i; + __off -= __pcnt; + __total += __i; + } else { + __m = __lo; + } + } + return __offset < 0 ? (__bits - 1) - __total : __total; +} +__HIP_DEVICE__ int __fns64(unsigned long long __mask, unsigned int __base, + int __offset) { + return __hip_fns_impl(__mask, __base, __offset); +} +__HIP_DEVICE__ int __fns32(unsigned long long __mask, unsigned int __base, + int __offset) { + return __hip_fns_impl((unsigned int)__mask, __base, __offset); +} +__HIP_DEVICE__ int __fns(unsigned int __mask, unsigned int __base, + int __offset) { + return __fns32(__mask, __base, __offset); +} + +//===----------------------------------------------------------------------===// +// Synchronization and fences +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ __attribute__((convergent)) void __syncthreads(void) { + __gpu_sync_threads(); +} + +template <typename __Fn> +__HIP_DEVICE__ __attribute__((convergent)) int +__hip_block_reduce_impl(int __val, int __init, __Fn __op) { + static __shared__ int __scratch[32]; + unsigned int __lanes = __gpu_num_lanes(); + unsigned int __nthreads = __gpu_num_threads(__GPU_X_DIM) * + __gpu_num_threads(__GPU_Y_DIM) * + __gpu_num_threads(__GPU_Z_DIM); + unsigned int __nwarps = (__nthreads + __lanes - 1) / __lanes; + unsigned int __tid = + (__gpu_thread_id(__GPU_Z_DIM) * __gpu_num_threads(__GPU_Y_DIM) + + __gpu_thread_id(__GPU_Y_DIM)) * + __gpu_num_threads(__GPU_X_DIM) + + __gpu_thread_id(__GPU_X_DIM); + + if (__gpu_is_first_in_lane(__gpu_lane_mask())) + __scratch[__tid / __lanes] = __val; + __gpu_sync_threads(); + + int __acc = __init; + for (unsigned int __i = 0; __i < __nwarps; ++__i) + __acc = __op(__acc, __scratch[__i]); + __gpu_sync_threads(); + return __acc; +} + +__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_count(int __pred) { + unsigned long long __mask = __gpu_lane_mask(); + int __val = __builtin_popcountg(__gpu_ballot(__mask, __pred)); + return __hip_block_reduce_impl(__val, 0, + [](int __a, int __b) { return __a + __b; }); +} +__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_and(int __pred) { + unsigned long long __mask = __gpu_lane_mask(); + int __val = __gpu_ballot(__mask, __pred) == __mask; + return __hip_block_reduce_impl(__val, 1, + [](int __a, int __b) { return __a & __b; }); +} +__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_or(int __pred) { + unsigned long long __mask = __gpu_lane_mask(); + int __val = __gpu_ballot(__mask, __pred) != 0ull; + return __hip_block_reduce_impl(__val, 0, + [](int __a, int __b) { return __a | __b; }); +} + +__HIP_DEVICE__ void __threadfence(void) { + __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); +} +__HIP_DEVICE__ void __threadfence_block(void) { + __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_WRKGRP); +} +__HIP_DEVICE__ void __threadfence_system(void) { + __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM); +} + +//===----------------------------------------------------------------------===// +// Timers +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ long long __clock64(void) { + return (long long)__builtin_readcyclecounter(); +} +__HIP_DEVICE__ long long __clock(void) { return __clock64(); } +__HIP_DEVICE__ long long clock64(void) { return __clock64(); } +__HIP_DEVICE__ long long clock(void) { return __clock(); } +__HIP_DEVICE__ long long wall_clock64(void) { + return (long long)__builtin_readsteadycounter(); +} + +// Warp shuffle / synchronization / reduction intrinsics. +#include <__clang_hip_intrinsics.h> + +#pragma pop_macro("MAYBE_UNDEF") +#pragma pop_macro("__HIP_DEVICE__") + +#endif // __HIP__ && defined(__AMDGPU__) +#endif // __CLANG_HIP_DEVICE_FUNCTIONS_H__ diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h b/clang/lib/Headers/__clang_hip_intrinsics.h new file mode 100644 index 0000000000000..0ac117a8a3e3d --- /dev/null +++ b/clang/lib/Headers/__clang_hip_intrinsics.h @@ -0,0 +1,240 @@ +//===--- __clang_hip_intrinsics.h - Device-side HIP intrinsic wrappers ------=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#ifndef __CLANG_HIP_INTRINSICS_H__ +#define __CLANG_HIP_INTRINSICS_H__ + +#if __HIP__ && (defined(__AMDGPU__)) + +#ifndef __HIP_DEVICE__ +#error \ + "__clang_hip_intrinsics.h must be included via __clang_hip_device_functions.h" +#endif + +//===----------------------------------------------------------------------===// +// Wavefront shuffles +//===----------------------------------------------------------------------===// + +template <typename __T> +__HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, unsigned int __idx, + int __w) { + if constexpr (sizeof(__T) == sizeof(unsigned long long)) + return __builtin_bit_cast( + __T, __gpu_shuffle_idx_u64(__gpu_lane_mask(), __idx, + __builtin_bit_cast(unsigned long long, __v), + (unsigned int)__w)); + else + return __builtin_bit_cast( + __T, __gpu_shuffle_idx_u32(__gpu_lane_mask(), __idx, + __builtin_bit_cast(unsigned int, __v), + (unsigned int)__w)); +} + +template <typename __T> +__HIP_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int __src_lane, + int __width = warpSize) { + return __hip_shuffle_idx_impl( + __var, (unsigned int)(__src_lane & (__width - 1)), __width); +} +template <typename __T> +__HIP_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, unsigned int __delta, + int __width = warpSize) { + int __rel = (int)(__gpu_lane_id() & (unsigned int)(__width - 1)); + int __tgt = __rel - (int)__delta; + return __hip_shuffle_idx_impl( + __var, (unsigned int)(__tgt < 0 ? __rel : __tgt), __width); +} +template <typename __T> +__HIP_DEVICE__ __T __shfl_down(MAYBE_UNDEF __T __var, unsigned int __delta, + int __width = warpSize) { + int __rel = (int)(__gpu_lane_id() & (unsigned int)(__width - 1)); + int __tgt = __rel + (int)__delta; + return __hip_shuffle_idx_impl( + __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width); +} +template <typename __T> +__HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask, + int __width = warpSize) { + int __rel = (int)(__gpu_lane_id() & (unsigned int)(__width - 1)); + int __tgt = __rel ^ __lane_mask; + return __hip_shuffle_idx_impl( + __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width); +} + +//===----------------------------------------------------------------------===// +// Warp synchronization +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ __attribute__((convergent)) void +__syncwarp(unsigned long long __mask = -1) { + __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT); + __gpu_sync_lane(__mask); + __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT); +} + +//===----------------------------------------------------------------------===// +// Wave syncrhonization sync aliases. +//===----------------------------------------------------------------------===// + +template <typename __MaskT> +__HIP_DEVICE__ unsigned long long __ballot_sync(__MaskT __mask, int __pred) { + return __ballot(__pred) & (unsigned long long)__mask; +} +template <typename __MaskT> +__HIP_DEVICE__ int __all_sync(__MaskT __mask, int __pred) { + return __ballot_sync(__mask, __pred) == (unsigned long long)__mask; +} +template <typename __MaskT> +__HIP_DEVICE__ int __any_sync(__MaskT __mask, int __pred) { + return __ballot_sync(__mask, __pred) != 0ull; +} + +template <typename __MaskT, typename __T> +__HIP_DEVICE__ __T __shfl_sync(__MaskT __mask, MAYBE_UNDEF __T __var, + int __src_lane, int __width = warpSize) { + (void)__mask; + return __shfl(__var, __src_lane, __width); +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ __T __shfl_up_sync(__MaskT __mask, MAYBE_UNDEF __T __var, + unsigned int __delta, + int __width = warpSize) { + (void)__mask; + return __shfl_up(__var, __delta, __width); +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ __T __shfl_down_sync(__MaskT __mask, MAYBE_UNDEF __T __var, + unsigned int __delta, + int __width = warpSize) { + (void)__mask; + return __shfl_down(__var, __delta, __width); +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var, + int __lane_mask, int __width = warpSize) { + (void)__mask; + return __shfl_xor(__var, __lane_mask, __width); +} + +//===----------------------------------------------------------------------===// +// Match primitives. +//===----------------------------------------------------------------------===// + +template <typename __T> +__HIP_DEVICE__ unsigned long long __match_any(__T __value) { + if constexpr (sizeof(__T) == sizeof(unsigned long long)) + return __gpu_match_any_u64(__gpu_lane_mask(), + __builtin_bit_cast(unsigned long long, __value)); + else + return __gpu_match_any_u32(__gpu_lane_mask(), + __builtin_bit_cast(unsigned int, __value)); +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ unsigned long long __match_any_sync(__MaskT __mask, + __T __value) { + return __match_any(__value) & (unsigned long long)__mask; +} + +template <typename __T> +__HIP_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) { + unsigned long long __r; + if constexpr (sizeof(__T) == sizeof(unsigned long long)) + __r = __gpu_match_all_u64(__gpu_lane_mask(), + __builtin_bit_cast(unsigned long long, __value)); + else + __r = __gpu_match_all_u32(__gpu_lane_mask(), + __builtin_bit_cast(unsigned int, __value)); + *__pred = __r != 0; + return __r; +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value, + int *__pred) { + (void)__mask; + return __match_all(__value, __pred); +} + +//===----------------------------------------------------------------------===// +// Wave reductions. +//===----------------------------------------------------------------------===// + +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_add_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_add_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ int __reduce_add_sync(__MaskT __mask, int __val) { + return (int)__gpu_lane_add_u32((unsigned long long)__mask, + (unsigned int)__val); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_min_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_min_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ int __reduce_min_sync(__MaskT __mask, int __val) { + unsigned int __r = __gpu_lane_min_u32((unsigned long long)__mask, + (unsigned int)__val ^ 0x80000000u); + return (int)(__r ^ 0x80000000u); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_max_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_max_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ int __reduce_max_sync(__MaskT __mask, int __val) { + unsigned int __r = __gpu_lane_max_u32((unsigned long long)__mask, + (unsigned int)__val ^ 0x80000000u); + return (int)(__r ^ 0x80000000u); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_and_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_and_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_or_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_or_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_xor_u32((unsigned long long)__mask, __val); +} + +//===----------------------------------------------------------------------===// +// Funnel shifts. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ unsigned int +__funnelshift_l(unsigned int __lo, unsigned int __hi, unsigned int __shift) { + unsigned int __s = __shift & 31u; + return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 32); +} +__HIP_DEVICE__ unsigned int +__funnelshift_lc(unsigned int __lo, unsigned int __hi, unsigned int __shift) { + unsigned int __s = __shift >= 32u ? 32u : __shift; + return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 32); +} +__HIP_DEVICE__ unsigned int +__funnelshift_r(unsigned int __lo, unsigned int __hi, unsigned int __shift) { + unsigned int __s = __shift & 31u; + return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s); +} +__HIP_DEVICE__ unsigned int +__funnelshift_rc(unsigned int __lo, unsigned int __hi, unsigned int __shift) { + unsigned int __s = __shift >= 32u ? 32u : __shift; + return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s); +} + +#endif // __HIP__ && defined(__AMDGPU__) +#endif // __CLANG_HIP_INTRINSICS_H__ diff --git a/clang/test/Headers/__clang_hip_device_functions.hip b/clang/test/Headers/__clang_hip_device_functions.hip new file mode 100644 index 0000000000000..472f1c97f66b1 --- /dev/null +++ b/clang/test/Headers/__clang_hip_device_functions.hip @@ -0,0 +1,76 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// RUN: -internal-isystem %S/../../lib/Headers \ +// RUN: -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \ +// RUN: -fcuda-is-device -target-cpu gfx90a -fsyntax-only -verify %s \ +// RUN: -include __clang_hip_device_functions.h +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// RUN: -internal-isystem %S/../../lib/Headers \ +// RUN: -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \ +// RUN: -fcuda-is-device -target-cpu gfx1100 -fsyntax-only -verify %s \ +// RUN: -include __clang_hip_device_functions.h + +// expected-no-diagnostics + +__global__ void test_kernel(int *p, float *f, double *d, unsigned *u) { + unsigned i = __popc(*u) + __popcll(*u); + i += __clz(*p) + __clzll(*p); + i += __ffs(*p) + __ffs(*u) + __ffsll((long long)*p) + __ffsll((unsigned long long)*u); + i += __brev(*u) + (unsigned)__brevll(*u); + i += __mul24(*p, *p) + __umul24(*u, *u) + __mulhi(*p, *p) + __umulhi(*u, *u); + i += (unsigned)__mul64hi(*p, *p) + (unsigned)__umul64hi(*u, *u); + i += __sad(*p, *p, *u) + __usad(*u, *u, *u); + i += __hadd(*p, *p) + __rhadd(*p, *p) + __uhadd(*u, *u) + __urhadd(*u, *u); + i += __byte_perm(*u, *u, *u); + i += __funnelshift_l(*u, *u, *u) + __funnelshift_lc(*u, *u, *u) + + __funnelshift_r(*u, *u, *u) + __funnelshift_rc(*u, *u, *u); + + i += __lastbit_u32_u64(*u); + i += __bitextract_u32(*u, *u, *u) + (unsigned)__bitextract_u64(*u, *u, *u); + i += __bitinsert_u32(*u, *u, *u, *u) + (unsigned)__bitinsert_u64(*u, *u, *u, *u); + + i += __float_as_int(*f) + __float_as_uint(*f); + *f = __int_as_float(*p) + __uint_as_float(*u); + i += (unsigned)__double_as_longlong(*d); + *d = __longlong_as_double((long long)*u); + i += __double2hiint(*d) + __double2loint(*d); + *d = __hiloint2double(*p, *p); + + i += threadIdx.x + threadIdx.y + threadIdx.z; + i += blockIdx.x + blockIdx.y + blockIdx.z; + i += blockDim.x + blockDim.y + blockDim.z; + i += gridDim.x + gridDim.y + gridDim.z; + i += __lane_id(); + unsigned long long m = __ballot(*p) + __ballot64(*p) + __activemask(); + int v = __all(*p) + __any(*p); + v += __fns(m, 0, 1) + __fns32(m, 0, 1) + __fns64(m, __lane_id(), -1); + + int s = __shfl(*p, 1) + __shfl_up(*p, 1) + __shfl_down(*p, 1) + __shfl_xor(*p, 1); + s += (int)__shfl(*u, 1, 32) + (int)__shfl_down(*f, 1) + (int)__shfl_xor(*d, 1); + s += (int)__shfl((long long)*p, 0) + (int)__shfl((unsigned long long)*u, 0); + + unsigned long long mask = ~0ull; + s += __shfl_sync(mask, *p, 1) + __shfl_up_sync(mask, *p, 1) + + __shfl_down_sync(mask, *p, 1) + __shfl_xor_sync(mask, *p, 1); + s += __all_sync(mask, *p) + __any_sync(mask, *p) + (int)__ballot_sync(mask, *p); + int pred; + s += (int)__match_any(*p) + (int)__match_any_sync(mask, *p) + + (int)__match_all_sync(mask, *p, &pred); + s += __reduce_add_sync(mask, *p) + __reduce_min_sync(mask, *p) + + __reduce_max_sync(mask, *p); + s += (int)(__reduce_add_sync(mask, *u) + __reduce_min_sync(mask, *u) + + __reduce_max_sync(mask, *u) + __reduce_and_sync(mask, *u) + + __reduce_or_sync(mask, *u) + __reduce_xor_sync(mask, *u)); + + __syncthreads(); + s += __syncthreads_count(*p) + __syncthreads_and(*p) + __syncthreads_or(*p); + __syncwarp(); + __syncwarp(mask); + __threadfence(); + __threadfence_block(); + __threadfence_system(); + long long c = clock() + clock64() + __clock() + __clock64() + wall_clock64(); + + *p = (int)i + v + s + (int)c + warpSize; +} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
