https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/203980
>From 8a2190f6c9085ffe930b51e9b64c9a7cde83895c Mon Sep 17 00:00:00 2001 From: Joseph Huber <[email protected]> Date: Mon, 15 Jun 2026 11:47:19 -0500 Subject: [PATCH 1/4] [Clang][HIP] Add LLVM vendored device headers for `*-llvm` triple Summary: We already have the `*-llvm` vendor triple to imply a hermetic LLVM toolchain for HIP. This adds the device portion of the headers that can be used without a ROCm installation. These are vendored just like the CUDA ones. Despite being HIP, these are actually generic because it onyl uses clang builtins and the `gpuintrin` shims. --- clang/lib/Basic/Targets/AMDGPU.cpp | 4 + clang/lib/Driver/ToolChains/HIPAMD.cpp | 7 + clang/lib/Headers/CMakeLists.txt | 3 + clang/lib/Headers/__clang_hip_builtin_vars.h | 57 +++ .../Headers/__clang_hip_device_functions.h | 356 ++++++++++++++++++ clang/lib/Headers/__clang_hip_intrinsics.h | 240 ++++++++++++ .../Headers/__clang_hip_device_functions.hip | 76 ++++ 7 files changed, 743 insertions(+) create mode 100644 clang/lib/Headers/__clang_hip_builtin_vars.h create mode 100644 clang/lib/Headers/__clang_hip_device_functions.h create mode 100644 clang/lib/Headers/__clang_hip_intrinsics.h create mode 100644 clang/test/Headers/__clang_hip_device_functions.hip diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index bfa956fa9a4e3..906601850e353 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -252,6 +252,10 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts, else Builder.defineMacro("__R600__"); + // The 'llvm' environment selects the upstream headers. + if (getTriple().getEnvironment() == llvm::Triple::LLVM) + Builder.defineMacro("__AMDGCN_LLVM__"); + // TODO: __HAS_FMAF__, __HAS_LDEXPF__, __HAS_FP64__ are deprecated and will be // removed in the near future. if (hasFMAF()) diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 84664bcddbb94..2dbd8e9f670ce 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -331,6 +331,13 @@ void HIPAMDToolChain::AddIAMCUIncludeArgs(const ArgList &Args, void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { + if (getTriple().getEnvironment() == llvm::Triple::LLVM) { + if (!DriverArgs.hasArg(options::OPT_nohipwrapperinc) && + !DriverArgs.hasArg(options::OPT_nobuiltininc)) + CC1Args.append({"-include", "__clang_hip_device_functions.h"}); + return; + } + RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args); } diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 439f2725168ba..4a60af68c1d23 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -86,6 +86,9 @@ set(hip_files __clang_hip_math.h __clang_hip_stdlib.h __clang_hip_runtime_wrapper.h + __clang_hip_builtin_vars.h + __clang_hip_device_functions.h + __clang_hip_intrinsics.h ) set(hlsl_h diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h b/clang/lib/Headers/__clang_hip_builtin_vars.h new file mode 100644 index 0000000000000..f2e53ef93b38a --- /dev/null +++ b/clang/lib/Headers/__clang_hip_builtin_vars.h @@ -0,0 +1,57 @@ +//===---- __clang_hip_builtin_vars.h - HIP built-in variables --------------=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#ifndef __CLANG_HIP_BUILTIN_VARS_H__ +#define __CLANG_HIP_BUILTIN_VARS_H__ + +#if __HIP__ && (defined(__AMDGPU__)) + +#include <gpuintrin.h> + +// The warpSize is a runtime value rather than a compile-time constant. +inline __attribute__((device)) const struct { + __attribute__((device, always_inline, const)) operator int() const noexcept { + return __gpu_num_lanes(); + } +} warpSize{}; + +#pragma push_macro("__HIP_COORD_BUILTIN") +#define __HIP_COORD_BUILTIN(__tag, __fx, __fy, __fz) \ + struct __tag { \ + __declspec(property(get = __get_x)) unsigned int x; \ + __declspec(property(get = __get_y)) unsigned int y; \ + __declspec(property(get = __get_z)) unsigned int z; \ + __attribute__((device, always_inline)) unsigned int __get_x() const { \ + return __fx; \ + } \ + __attribute__((device, always_inline)) unsigned int __get_y() const { \ + return __fy; \ + } \ + __attribute__((device, always_inline)) unsigned int __get_z() const { \ + return __fz; \ + } \ + } + +__HIP_COORD_BUILTIN(__hip_builtin_threadIdx_t, __gpu_thread_id_x(), + __gpu_thread_id_y(), __gpu_thread_id_z()); +__HIP_COORD_BUILTIN(__hip_builtin_blockIdx_t, __gpu_block_id_x(), + __gpu_block_id_y(), __gpu_block_id_z()); +__HIP_COORD_BUILTIN(__hip_builtin_blockDim_t, __gpu_num_threads_x(), + __gpu_num_threads_y(), __gpu_num_threads_z()); +__HIP_COORD_BUILTIN(__hip_builtin_gridDim_t, __gpu_num_blocks_x(), + __gpu_num_blocks_y(), __gpu_num_blocks_z()); + +#pragma pop_macro("__HIP_COORD_BUILTIN") + +extern const __attribute__((device, weak)) __hip_builtin_threadIdx_t threadIdx; +extern const __attribute__((device, weak)) __hip_builtin_blockIdx_t blockIdx; +extern const __attribute__((device, weak)) __hip_builtin_blockDim_t blockDim; +extern const __attribute__((device, weak)) __hip_builtin_gridDim_t gridDim; + +#endif // __HIP__ && (defined(__AMDGPU__) || defined(__SPIRV__)) +#endif // __CLANG_HIP_BUILTIN_VARS_H__ diff --git a/clang/lib/Headers/__clang_hip_device_functions.h b/clang/lib/Headers/__clang_hip_device_functions.h new file mode 100644 index 0000000000000..14edf987c9207 --- /dev/null +++ b/clang/lib/Headers/__clang_hip_device_functions.h @@ -0,0 +1,356 @@ +//===---- __clang_hip_device_functions.h - HIP device functions ------------=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#ifndef __CLANG_HIP_DEVICE_FUNCTIONS_H__ +#define __CLANG_HIP_DEVICE_FUNCTIONS_H__ + +#if __HIP__ && (defined(__AMDGPU__)) + +#ifndef __device__ +#define __host__ __attribute__((host)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) +#define __managed__ __attribute__((managed)) +#endif + +#include <gpuintrin.h> + +#pragma push_macro("__HIP_DEVICE__") +#define __HIP_DEVICE__ static __inline__ __attribute__((device, always_inline)) + +#pragma push_macro("MAYBE_UNDEF") +#define MAYBE_UNDEF __attribute__((maybe_undef)) + +// warpSize and the threadIdx/blockIdx/blockDim/gridDim coordinate variables. +#include <__clang_hip_builtin_vars.h> + +//===----------------------------------------------------------------------===// +// Integer intrinsics. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ unsigned int __popc(unsigned int __x) { + return __builtin_popcountg(__x); +} +__HIP_DEVICE__ unsigned int __popcll(unsigned long long __x) { + return __builtin_popcountg(__x); +} + +__HIP_DEVICE__ int __clz(int __x) { + return __builtin_clzg((unsigned int)__x, 32); +} +__HIP_DEVICE__ int __clzll(long long __x) { + return __builtin_clzg((unsigned long long)__x, 64); +} + +__HIP_DEVICE__ int __ffs(int __x) { + return __builtin_ctzg((unsigned int)__x, -1) + 1; +} +__HIP_DEVICE__ int __ffs(unsigned int __x) { + return __builtin_ctzg(__x, -1) + 1; +} +__HIP_DEVICE__ int __ffsll(long long __x) { + return __builtin_ctzg((unsigned long long)__x, -1) + 1; +} +__HIP_DEVICE__ int __ffsll(unsigned long long __x) { + return __builtin_ctzg(__x, -1) + 1; +} + +__HIP_DEVICE__ unsigned int __brev(unsigned int __x) { + return __builtin_elementwise_bitreverse(__x); +} +__HIP_DEVICE__ unsigned long long __brevll(unsigned long long __x) { + return __builtin_elementwise_bitreverse(__x); +} + +__HIP_DEVICE__ int __mul24(int __x, int __y) { + return ((int(unsigned(__x) << 8) >> 8)) * ((int(unsigned(__y) << 8) >> 8)); +} +__HIP_DEVICE__ int __umul24(unsigned int __x, unsigned int __y) { + return int((__x & 0x00ffffffu) * (__y & 0x00ffffffu)); +} + +__HIP_DEVICE__ int __mulhi(int __x, int __y) { + return int(((long long)__x * (long long)__y) >> 32); +} +__HIP_DEVICE__ unsigned int __umulhi(unsigned int __x, unsigned int __y) { + return (unsigned int)(((unsigned long long)__x * (unsigned long long)__y) >> + 32); +} +__HIP_DEVICE__ long long __mul64hi(long long __x, long long __y) { + return (long long)(((__int128)__x * (__int128)__y) >> 64); +} +__HIP_DEVICE__ unsigned long long __umul64hi(unsigned long long __x, + unsigned long long __y) { + return ( + unsigned long long)(((unsigned __int128)__x * (unsigned __int128)__y) >> + 64); +} + +__HIP_DEVICE__ unsigned int __sad(int __x, int __y, unsigned int __z) { + return __x > __y ? __x - __y + __z : __y - __x + __z; +} +__HIP_DEVICE__ unsigned int __usad(unsigned int __x, unsigned int __y, + unsigned int __z) { + return __x > __y ? __x - __y + __z : __y - __x + __z; +} + +__HIP_DEVICE__ int __hadd(int __x, int __y) { + return int(((long long)__x + (long long)__y) >> 1); +} +__HIP_DEVICE__ int __rhadd(int __x, int __y) { + return int(((long long)__x + (long long)__y + 1) >> 1); +} +__HIP_DEVICE__ unsigned int __uhadd(unsigned int __x, unsigned int __y) { + return (unsigned int)(((unsigned long long)__x + (unsigned long long)__y) >> + 1); +} +__HIP_DEVICE__ unsigned int __urhadd(unsigned int __x, unsigned int __y) { + return ( + unsigned int)(((unsigned long long)__x + (unsigned long long)__y + 1) >> + 1); +} + +__HIP_DEVICE__ unsigned int __byte_perm(unsigned int __x, unsigned int __y, + unsigned int __s) { + unsigned long long __tmp = ((unsigned long long)__y << 32) | __x; + unsigned int __result = 0; + for (int __i = 0; __i < 4; ++__i) { + unsigned int __sel = (__s >> (__i * 4)) & 0x7u; + __result |= (unsigned int)((__tmp >> (__sel * 8)) & 0xffu) << (__i * 8); + } + return __result; +} + +//===----------------------------------------------------------------------===// +// Bitfield operations. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ unsigned int __lastbit_u32_u64(unsigned long long __x) { + return (unsigned int)__builtin_ctzg(__x, -1); +} + +__HIP_DEVICE__ unsigned int __bitextract_u32(unsigned int __src, + unsigned int __offset, + unsigned int __width) { + unsigned int __o = __offset & 31u; + unsigned int __w = __width & 31u; + return __w == 0 ? 0u : (__src << (32u - __o - __w)) >> (32u - __w); +} +__HIP_DEVICE__ unsigned long long __bitextract_u64(unsigned long long __src, + unsigned int __offset, + unsigned int __width) { + unsigned long long __o = __offset & 63u; + unsigned long long __w = __width & 63u; + return __w == 0 ? 0ull : (__src << (64ull - __o - __w)) >> (64ull - __w); +} + +__HIP_DEVICE__ unsigned int __bitinsert_u32(unsigned int __dst, + unsigned int __src, + unsigned int __offset, + unsigned int __width) { + unsigned int __o = __offset & 31u; + unsigned int __mask = (1u << (__width & 31u)) - 1u; + return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o); +} +__HIP_DEVICE__ unsigned long long __bitinsert_u64(unsigned long long __dst, + unsigned long long __src, + unsigned int __offset, + unsigned int __width) { + unsigned long long __o = __offset & 63u; + unsigned long long __mask = (1ull << (__width & 63u)) - 1ull; + return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o); +} + +//===----------------------------------------------------------------------===// +// Type punning. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ int __float_as_int(float __x) { + return __builtin_bit_cast(int, __x); +} +__HIP_DEVICE__ unsigned int __float_as_uint(float __x) { + return __builtin_bit_cast(unsigned int, __x); +} +__HIP_DEVICE__ float __int_as_float(int __x) { + return __builtin_bit_cast(float, __x); +} +__HIP_DEVICE__ float __uint_as_float(unsigned int __x) { + return __builtin_bit_cast(float, __x); +} +__HIP_DEVICE__ long long __double_as_longlong(double __x) { + return __builtin_bit_cast(long long, __x); +} +__HIP_DEVICE__ double __longlong_as_double(long long __x) { + return __builtin_bit_cast(double, __x); +} +__HIP_DEVICE__ int __double2hiint(double __x) { + return int(__builtin_bit_cast(unsigned long long, __x) >> 32); +} +__HIP_DEVICE__ int __double2loint(double __x) { + return int(__builtin_bit_cast(unsigned long long, __x)); +} +__HIP_DEVICE__ double __hiloint2double(int __hi, int __lo) { + return __builtin_bit_cast(double, + ((unsigned long long)(unsigned int)__hi << 32) | + (unsigned long long)(unsigned int)__lo); +} + +//===----------------------------------------------------------------------===// +// Wavefront vote and lane identity. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ unsigned int __lane_id(void) { return __gpu_lane_id(); } + +__HIP_DEVICE__ unsigned long long __ballot(int __pred) { + return __gpu_ballot(__gpu_lane_mask(), __pred); +} +__HIP_DEVICE__ unsigned long long __ballot64(int __pred) { + return __gpu_ballot(__gpu_lane_mask(), __pred); +} +__HIP_DEVICE__ unsigned long long __activemask(void) { + return __gpu_ballot(__gpu_lane_mask(), 1); +} + +__HIP_DEVICE__ int __all(int __pred) { + return __gpu_ballot(__gpu_lane_mask(), __pred) == __gpu_lane_mask(); +} +__HIP_DEVICE__ int __any(int __pred) { + return __gpu_ballot(__gpu_lane_mask(), __pred) != 0ull; +} + +template <typename __T> +__HIP_DEVICE__ int __hip_fns_impl(__T __mask, unsigned int __base, + int __offset) { + const int __bits = int(sizeof(__T)) * 8; + __T __m = __mask; + int __off = __offset; + if (__offset == 0) { + __m &= (__T(1) << __base); + __off = 1; + } else if (__offset < 0) { + __m = __builtin_elementwise_bitreverse(__mask); + __base = (unsigned int)(__bits - 1) - __base; + __off = -__offset; + } + __m &= (~__T(0)) << __base; + if (__builtin_popcountg(__m) < __off) + return -1; + int __total = 0; + for (int __i = __bits / 2; __i > 0; __i >>= 1) { + __T __lo = __m & ((__T(1) << __i) - 1); + int __pcnt = __builtin_popcountg(__lo); + if (__pcnt < __off) { + __m >>= __i; + __off -= __pcnt; + __total += __i; + } else { + __m = __lo; + } + } + return __offset < 0 ? (__bits - 1) - __total : __total; +} +__HIP_DEVICE__ int __fns64(unsigned long long __mask, unsigned int __base, + int __offset) { + return __hip_fns_impl(__mask, __base, __offset); +} +__HIP_DEVICE__ int __fns32(unsigned long long __mask, unsigned int __base, + int __offset) { + return __hip_fns_impl((unsigned int)__mask, __base, __offset); +} +__HIP_DEVICE__ int __fns(unsigned int __mask, unsigned int __base, + int __offset) { + return __fns32(__mask, __base, __offset); +} + +//===----------------------------------------------------------------------===// +// Synchronization and fences +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ __attribute__((convergent)) void __syncthreads(void) { + __gpu_sync_threads(); +} + +template <typename __Fn> +__HIP_DEVICE__ __attribute__((convergent)) int +__hip_block_reduce_impl(int __val, int __init, __Fn __op) { + static __shared__ int __scratch[32]; + unsigned int __lanes = __gpu_num_lanes(); + unsigned int __nthreads = __gpu_num_threads(__GPU_X_DIM) * + __gpu_num_threads(__GPU_Y_DIM) * + __gpu_num_threads(__GPU_Z_DIM); + unsigned int __nwarps = (__nthreads + __lanes - 1) / __lanes; + unsigned int __tid = + (__gpu_thread_id(__GPU_Z_DIM) * __gpu_num_threads(__GPU_Y_DIM) + + __gpu_thread_id(__GPU_Y_DIM)) * + __gpu_num_threads(__GPU_X_DIM) + + __gpu_thread_id(__GPU_X_DIM); + + if (__gpu_is_first_in_lane(__gpu_lane_mask())) + __scratch[__tid / __lanes] = __val; + __gpu_sync_threads(); + + int __acc = __init; + for (unsigned int __i = 0; __i < __nwarps; ++__i) + __acc = __op(__acc, __scratch[__i]); + __gpu_sync_threads(); + return __acc; +} + +__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_count(int __pred) { + unsigned long long __mask = __gpu_lane_mask(); + int __val = __builtin_popcountg(__gpu_ballot(__mask, __pred)); + return __hip_block_reduce_impl(__val, 0, + [](int __a, int __b) { return __a + __b; }); +} +__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_and(int __pred) { + unsigned long long __mask = __gpu_lane_mask(); + int __val = __gpu_ballot(__mask, __pred) == __mask; + return __hip_block_reduce_impl(__val, 1, + [](int __a, int __b) { return __a & __b; }); +} +__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_or(int __pred) { + unsigned long long __mask = __gpu_lane_mask(); + int __val = __gpu_ballot(__mask, __pred) != 0ull; + return __hip_block_reduce_impl(__val, 0, + [](int __a, int __b) { return __a | __b; }); +} + +__HIP_DEVICE__ void __threadfence(void) { + __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); +} +__HIP_DEVICE__ void __threadfence_block(void) { + __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_WRKGRP); +} +__HIP_DEVICE__ void __threadfence_system(void) { + __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM); +} + +//===----------------------------------------------------------------------===// +// Timers +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ long long __clock64(void) { + return (long long)__builtin_readcyclecounter(); +} +__HIP_DEVICE__ long long __clock(void) { return __clock64(); } +__HIP_DEVICE__ long long clock64(void) { return __clock64(); } +__HIP_DEVICE__ long long clock(void) { return __clock(); } +__HIP_DEVICE__ long long wall_clock64(void) { + return (long long)__builtin_readsteadycounter(); +} + +// Warp shuffle / synchronization / reduction intrinsics. +#include <__clang_hip_intrinsics.h> + +#pragma pop_macro("MAYBE_UNDEF") +#pragma pop_macro("__HIP_DEVICE__") + +#endif // __HIP__ && defined(__AMDGPU__) +#endif // __CLANG_HIP_DEVICE_FUNCTIONS_H__ diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h b/clang/lib/Headers/__clang_hip_intrinsics.h new file mode 100644 index 0000000000000..ef2bede1af070 --- /dev/null +++ b/clang/lib/Headers/__clang_hip_intrinsics.h @@ -0,0 +1,240 @@ +//===--- __clang_hip_intrinsics.h - Device-side HIP intrinsic wrappers ------=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#ifndef __CLANG_HIP_INTRINSICS_H__ +#define __CLANG_HIP_INTRINSICS_H__ + +#if __HIP__ && (defined(__AMDGPU__)) + +#ifndef __HIP_DEVICE__ +#error \ + "__clang_hip_intrinsics.h must be included via __clang_hip_device_functions.h" +#endif + +//===----------------------------------------------------------------------===// +// Wavefront shuffles +//===----------------------------------------------------------------------===// + +template <typename __T> +__HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, unsigned int __idx, + int __w) { + if constexpr (sizeof(__T) == sizeof(unsigned long long)) + return __builtin_bit_cast( + __T, __gpu_shuffle_idx_u64(__gpu_lane_mask(), __idx, + __builtin_bit_cast(unsigned long long, __v), + (unsigned int)__w)); + else + return __builtin_bit_cast( + __T, __gpu_shuffle_idx_u32(__gpu_lane_mask(), __idx, + __builtin_bit_cast(unsigned int, __v), + (unsigned int)__w)); +} + +template <typename __T> +__HIP_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int __src_lane, + int __width = warpSize) { + return __hip_shuffle_idx_impl( + __var, (unsigned int)(__src_lane & (__width - 1)), __width); +} +template <typename __T> +__HIP_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, unsigned int __delta, + int __width = warpSize) { + int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1)); + int __tgt = __rel - int(__delta); + return __hip_shuffle_idx_impl( + __var, (unsigned int)(__tgt < 0 ? __rel : __tgt), __width); +} +template <typename __T> +__HIP_DEVICE__ __T __shfl_down(MAYBE_UNDEF __T __var, unsigned int __delta, + int __width = warpSize) { + int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1)); + int __tgt = __rel + int(__delta); + return __hip_shuffle_idx_impl( + __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width); +} +template <typename __T> +__HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask, + int __width = warpSize) { + int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1)); + int __tgt = __rel ^ __lane_mask; + return __hip_shuffle_idx_impl( + __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width); +} + +//===----------------------------------------------------------------------===// +// Warp synchronization +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ __attribute__((convergent)) void +__syncwarp(unsigned long long __mask = -1) { + __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT); + __gpu_sync_lane(__mask); + __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT); +} + +//===----------------------------------------------------------------------===// +// Wave syncrhonization sync aliases. +//===----------------------------------------------------------------------===// + +template <typename __MaskT> +__HIP_DEVICE__ unsigned long long __ballot_sync(__MaskT __mask, int __pred) { + return __ballot(__pred) & (unsigned long long)__mask; +} +template <typename __MaskT> +__HIP_DEVICE__ int __all_sync(__MaskT __mask, int __pred) { + return __ballot_sync(__mask, __pred) == (unsigned long long)__mask; +} +template <typename __MaskT> +__HIP_DEVICE__ int __any_sync(__MaskT __mask, int __pred) { + return __ballot_sync(__mask, __pred) != 0ull; +} + +template <typename __MaskT, typename __T> +__HIP_DEVICE__ __T __shfl_sync(__MaskT __mask, MAYBE_UNDEF __T __var, + int __src_lane, int __width = warpSize) { + (void)__mask; + return __shfl(__var, __src_lane, __width); +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ __T __shfl_up_sync(__MaskT __mask, MAYBE_UNDEF __T __var, + unsigned int __delta, + int __width = warpSize) { + (void)__mask; + return __shfl_up(__var, __delta, __width); +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ __T __shfl_down_sync(__MaskT __mask, MAYBE_UNDEF __T __var, + unsigned int __delta, + int __width = warpSize) { + (void)__mask; + return __shfl_down(__var, __delta, __width); +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var, + int __lane_mask, int __width = warpSize) { + (void)__mask; + return __shfl_xor(__var, __lane_mask, __width); +} + +//===----------------------------------------------------------------------===// +// Match primitives. +//===----------------------------------------------------------------------===// + +template <typename __T> +__HIP_DEVICE__ unsigned long long __match_any(__T __value) { + if constexpr (sizeof(__T) == sizeof(unsigned long long)) + return __gpu_match_any_u64(__gpu_lane_mask(), + __builtin_bit_cast(unsigned long long, __value)); + else + return __gpu_match_any_u32(__gpu_lane_mask(), + __builtin_bit_cast(unsigned int, __value)); +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ unsigned long long __match_any_sync(__MaskT __mask, + __T __value) { + return __match_any(__value) & (unsigned long long)__mask; +} + +template <typename __T> +__HIP_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) { + unsigned long long __r; + if constexpr (sizeof(__T) == sizeof(unsigned long long)) + __r = __gpu_match_all_u64(__gpu_lane_mask(), + __builtin_bit_cast(unsigned long long, __value)); + else + __r = __gpu_match_all_u32(__gpu_lane_mask(), + __builtin_bit_cast(unsigned int, __value)); + *__pred = __r != 0; + return __r; +} +template <typename __MaskT, typename __T> +__HIP_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value, + int *__pred) { + (void)__mask; + return __match_all(__value, __pred); +} + +//===----------------------------------------------------------------------===// +// Wave reductions. +//===----------------------------------------------------------------------===// + +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_add_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_add_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ int __reduce_add_sync(__MaskT __mask, int __val) { + return int( + __gpu_lane_add_u32((unsigned long long)__mask, (unsigned int)__val)); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_min_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_min_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ int __reduce_min_sync(__MaskT __mask, int __val) { + unsigned int __r = __gpu_lane_min_u32((unsigned long long)__mask, + (unsigned int)__val ^ 0x80000000u); + return int(__r ^ 0x80000000u); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_max_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_max_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ int __reduce_max_sync(__MaskT __mask, int __val) { + unsigned int __r = __gpu_lane_max_u32((unsigned long long)__mask, + (unsigned int)__val ^ 0x80000000u); + return int(__r ^ 0x80000000u); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_and_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_and_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_or_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_or_u32((unsigned long long)__mask, __val); +} +template <typename __MaskT> +__HIP_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask, + unsigned int __val) { + return __gpu_lane_xor_u32((unsigned long long)__mask, __val); +} + +//===----------------------------------------------------------------------===// +// Funnel shifts. +//===----------------------------------------------------------------------===// + +__HIP_DEVICE__ unsigned int +__funnelshift_l(unsigned int __lo, unsigned int __hi, unsigned int __shift) { + unsigned int __s = __shift & 31u; + return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 32); +} +__HIP_DEVICE__ unsigned int +__funnelshift_lc(unsigned int __lo, unsigned int __hi, unsigned int __shift) { + unsigned int __s = __shift >= 32u ? 32u : __shift; + return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 32); +} +__HIP_DEVICE__ unsigned int +__funnelshift_r(unsigned int __lo, unsigned int __hi, unsigned int __shift) { + unsigned int __s = __shift & 31u; + return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s); +} +__HIP_DEVICE__ unsigned int +__funnelshift_rc(unsigned int __lo, unsigned int __hi, unsigned int __shift) { + unsigned int __s = __shift >= 32u ? 32u : __shift; + return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s); +} + +#endif // __HIP__ && defined(__AMDGPU__) +#endif // __CLANG_HIP_INTRINSICS_H__ diff --git a/clang/test/Headers/__clang_hip_device_functions.hip b/clang/test/Headers/__clang_hip_device_functions.hip new file mode 100644 index 0000000000000..472f1c97f66b1 --- /dev/null +++ b/clang/test/Headers/__clang_hip_device_functions.hip @@ -0,0 +1,76 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// RUN: -internal-isystem %S/../../lib/Headers \ +// RUN: -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \ +// RUN: -fcuda-is-device -target-cpu gfx90a -fsyntax-only -verify %s \ +// RUN: -include __clang_hip_device_functions.h +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// RUN: -internal-isystem %S/../../lib/Headers \ +// RUN: -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \ +// RUN: -fcuda-is-device -target-cpu gfx1100 -fsyntax-only -verify %s \ +// RUN: -include __clang_hip_device_functions.h + +// expected-no-diagnostics + +__global__ void test_kernel(int *p, float *f, double *d, unsigned *u) { + unsigned i = __popc(*u) + __popcll(*u); + i += __clz(*p) + __clzll(*p); + i += __ffs(*p) + __ffs(*u) + __ffsll((long long)*p) + __ffsll((unsigned long long)*u); + i += __brev(*u) + (unsigned)__brevll(*u); + i += __mul24(*p, *p) + __umul24(*u, *u) + __mulhi(*p, *p) + __umulhi(*u, *u); + i += (unsigned)__mul64hi(*p, *p) + (unsigned)__umul64hi(*u, *u); + i += __sad(*p, *p, *u) + __usad(*u, *u, *u); + i += __hadd(*p, *p) + __rhadd(*p, *p) + __uhadd(*u, *u) + __urhadd(*u, *u); + i += __byte_perm(*u, *u, *u); + i += __funnelshift_l(*u, *u, *u) + __funnelshift_lc(*u, *u, *u) + + __funnelshift_r(*u, *u, *u) + __funnelshift_rc(*u, *u, *u); + + i += __lastbit_u32_u64(*u); + i += __bitextract_u32(*u, *u, *u) + (unsigned)__bitextract_u64(*u, *u, *u); + i += __bitinsert_u32(*u, *u, *u, *u) + (unsigned)__bitinsert_u64(*u, *u, *u, *u); + + i += __float_as_int(*f) + __float_as_uint(*f); + *f = __int_as_float(*p) + __uint_as_float(*u); + i += (unsigned)__double_as_longlong(*d); + *d = __longlong_as_double((long long)*u); + i += __double2hiint(*d) + __double2loint(*d); + *d = __hiloint2double(*p, *p); + + i += threadIdx.x + threadIdx.y + threadIdx.z; + i += blockIdx.x + blockIdx.y + blockIdx.z; + i += blockDim.x + blockDim.y + blockDim.z; + i += gridDim.x + gridDim.y + gridDim.z; + i += __lane_id(); + unsigned long long m = __ballot(*p) + __ballot64(*p) + __activemask(); + int v = __all(*p) + __any(*p); + v += __fns(m, 0, 1) + __fns32(m, 0, 1) + __fns64(m, __lane_id(), -1); + + int s = __shfl(*p, 1) + __shfl_up(*p, 1) + __shfl_down(*p, 1) + __shfl_xor(*p, 1); + s += (int)__shfl(*u, 1, 32) + (int)__shfl_down(*f, 1) + (int)__shfl_xor(*d, 1); + s += (int)__shfl((long long)*p, 0) + (int)__shfl((unsigned long long)*u, 0); + + unsigned long long mask = ~0ull; + s += __shfl_sync(mask, *p, 1) + __shfl_up_sync(mask, *p, 1) + + __shfl_down_sync(mask, *p, 1) + __shfl_xor_sync(mask, *p, 1); + s += __all_sync(mask, *p) + __any_sync(mask, *p) + (int)__ballot_sync(mask, *p); + int pred; + s += (int)__match_any(*p) + (int)__match_any_sync(mask, *p) + + (int)__match_all_sync(mask, *p, &pred); + s += __reduce_add_sync(mask, *p) + __reduce_min_sync(mask, *p) + + __reduce_max_sync(mask, *p); + s += (int)(__reduce_add_sync(mask, *u) + __reduce_min_sync(mask, *u) + + __reduce_max_sync(mask, *u) + __reduce_and_sync(mask, *u) + + __reduce_or_sync(mask, *u) + __reduce_xor_sync(mask, *u)); + + __syncthreads(); + s += __syncthreads_count(*p) + __syncthreads_and(*p) + __syncthreads_or(*p); + __syncwarp(); + __syncwarp(mask); + __threadfence(); + __threadfence_block(); + __threadfence_system(); + long long c = clock() + clock64() + __clock() + __clock64() + wall_clock64(); + + *p = (int)i + v + s + (int)c + warpSize; +} >From 8931b683132293db0fc0d910eb163d9454a0e7cf Mon Sep 17 00:00:00 2001 From: Joseph Huber <[email protected]> Date: Tue, 16 Jun 2026 08:25:34 -0500 Subject: [PATCH 2/4] Move macro things, convergent, and simplify some casts --- clang/lib/Basic/Targets/AMDGPU.cpp | 4 ---- clang/lib/Headers/__clang_hip_builtin_vars.h | 2 +- .../Headers/__clang_hip_device_functions.h | 19 +++++++++---------- clang/lib/Headers/__clang_hip_intrinsics.h | 5 ++--- 4 files changed, 12 insertions(+), 18 deletions(-) diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 906601850e353..bfa956fa9a4e3 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -252,10 +252,6 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts, else Builder.defineMacro("__R600__"); - // The 'llvm' environment selects the upstream headers. - if (getTriple().getEnvironment() == llvm::Triple::LLVM) - Builder.defineMacro("__AMDGCN_LLVM__"); - // TODO: __HAS_FMAF__, __HAS_LDEXPF__, __HAS_FP64__ are deprecated and will be // removed in the near future. if (hasFMAF()) diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h b/clang/lib/Headers/__clang_hip_builtin_vars.h index f2e53ef93b38a..83b42adc587d1 100644 --- a/clang/lib/Headers/__clang_hip_builtin_vars.h +++ b/clang/lib/Headers/__clang_hip_builtin_vars.h @@ -9,7 +9,7 @@ #ifndef __CLANG_HIP_BUILTIN_VARS_H__ #define __CLANG_HIP_BUILTIN_VARS_H__ -#if __HIP__ && (defined(__AMDGPU__)) +#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) #include <gpuintrin.h> diff --git a/clang/lib/Headers/__clang_hip_device_functions.h b/clang/lib/Headers/__clang_hip_device_functions.h index 14edf987c9207..8105adcbcc501 100644 --- a/clang/lib/Headers/__clang_hip_device_functions.h +++ b/clang/lib/Headers/__clang_hip_device_functions.h @@ -9,7 +9,7 @@ #ifndef __CLANG_HIP_DEVICE_FUNCTIONS_H__ #define __CLANG_HIP_DEVICE_FUNCTIONS_H__ -#if __HIP__ && (defined(__AMDGPU__)) +#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) #ifndef __device__ #define __host__ __attribute__((host)) @@ -22,6 +22,8 @@ #include <gpuintrin.h> +#define __HIP_LLVM__ 1 + #pragma push_macro("__HIP_DEVICE__") #define __HIP_DEVICE__ static __inline__ __attribute__((device, always_inline)) @@ -273,14 +275,11 @@ __HIP_DEVICE__ int __fns(unsigned int __mask, unsigned int __base, // Synchronization and fences //===----------------------------------------------------------------------===// -__HIP_DEVICE__ __attribute__((convergent)) void __syncthreads(void) { - __gpu_sync_threads(); -} +__HIP_DEVICE__ void __syncthreads(void) { __gpu_sync_threads(); } template <typename __Fn> -__HIP_DEVICE__ __attribute__((convergent)) int -__hip_block_reduce_impl(int __val, int __init, __Fn __op) { - static __shared__ int __scratch[32]; +__HIP_DEVICE__ int __hip_block_reduce_impl(int __val, int __init, __Fn __op) { + static __attribute__((shared)) int __scratch[32]; unsigned int __lanes = __gpu_num_lanes(); unsigned int __nthreads = __gpu_num_threads(__GPU_X_DIM) * __gpu_num_threads(__GPU_Y_DIM) * @@ -303,19 +302,19 @@ __hip_block_reduce_impl(int __val, int __init, __Fn __op) { return __acc; } -__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_count(int __pred) { +__HIP_DEVICE__ int __syncthreads_count(int __pred) { unsigned long long __mask = __gpu_lane_mask(); int __val = __builtin_popcountg(__gpu_ballot(__mask, __pred)); return __hip_block_reduce_impl(__val, 0, [](int __a, int __b) { return __a + __b; }); } -__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_and(int __pred) { +__HIP_DEVICE__ int __syncthreads_and(int __pred) { unsigned long long __mask = __gpu_lane_mask(); int __val = __gpu_ballot(__mask, __pred) == __mask; return __hip_block_reduce_impl(__val, 1, [](int __a, int __b) { return __a & __b; }); } -__HIP_DEVICE__ __attribute__((convergent)) int __syncthreads_or(int __pred) { +__HIP_DEVICE__ int __syncthreads_or(int __pred) { unsigned long long __mask = __gpu_lane_mask(); int __val = __gpu_ballot(__mask, __pred) != 0ull; return __hip_block_reduce_impl(__val, 0, diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h b/clang/lib/Headers/__clang_hip_intrinsics.h index ef2bede1af070..42fb3b40fb06f 100644 --- a/clang/lib/Headers/__clang_hip_intrinsics.h +++ b/clang/lib/Headers/__clang_hip_intrinsics.h @@ -9,7 +9,7 @@ #ifndef __CLANG_HIP_INTRINSICS_H__ #define __CLANG_HIP_INTRINSICS_H__ -#if __HIP__ && (defined(__AMDGPU__)) +#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) #ifndef __HIP_DEVICE__ #error \ @@ -70,8 +70,7 @@ __HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask, // Warp synchronization //===----------------------------------------------------------------------===// -__HIP_DEVICE__ __attribute__((convergent)) void -__syncwarp(unsigned long long __mask = -1) { +__HIP_DEVICE__ void __syncwarp(unsigned long long __mask = -1) { __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT); __gpu_sync_lane(__mask); __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT); >From 55b4e03cd731cce16ecc74c77abad56b125ee363 Mon Sep 17 00:00:00 2001 From: Joseph Huber <[email protected]> Date: Tue, 16 Jun 2026 09:25:34 -0500 Subject: [PATCH 3/4] test and tweaks --- clang/lib/Driver/ToolChains/HIPAMD.cpp | 4 +++- clang/lib/Headers/__clang_hip_builtin_vars.h | 15 +++++++++++- .../Headers/__clang_hip_device_functions.h | 2 +- clang/lib/Headers/__clang_hip_intrinsics.h | 2 +- .../test/Driver/hip-device-libs-llvm-env.hip | 23 +++++++++++++++++++ 5 files changed, 42 insertions(+), 4 deletions(-) diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 2dbd8e9f670ce..ed02877cbf944 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -332,7 +332,9 @@ void HIPAMDToolChain::AddIAMCUIncludeArgs(const ArgList &Args, void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (getTriple().getEnvironment() == llvm::Triple::LLVM) { - if (!DriverArgs.hasArg(options::OPT_nohipwrapperinc) && + if (DriverArgs.hasFlag(options::OPT_offload_inc, + options::OPT_no_offload_inc, true) && + !DriverArgs.hasArg(options::OPT_nohipwrapperinc) && !DriverArgs.hasArg(options::OPT_nobuiltininc)) CC1Args.append({"-include", "__clang_hip_device_functions.h"}); return; diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h b/clang/lib/Headers/__clang_hip_builtin_vars.h index 83b42adc587d1..622201c261ae2 100644 --- a/clang/lib/Headers/__clang_hip_builtin_vars.h +++ b/clang/lib/Headers/__clang_hip_builtin_vars.h @@ -20,6 +20,15 @@ inline __attribute__((device)) const struct { } } warpSize{}; +// Make sure nobody can create instances of the coordinate types, take their +// address, copy, or assign them. +#pragma push_macro("__HIP_DISALLOW_BUILTINVAR_ACCESS") +#define __HIP_DISALLOW_BUILTINVAR_ACCESS(__tag) \ + __attribute__((device)) __tag() = delete; \ + __attribute__((device)) __tag(const __tag &) = delete; \ + __attribute__((device)) void operator=(const __tag &) const = delete; \ + __attribute__((device)) __tag *operator&() const = delete + #pragma push_macro("__HIP_COORD_BUILTIN") #define __HIP_COORD_BUILTIN(__tag, __fx, __fy, __fz) \ struct __tag { \ @@ -35,6 +44,9 @@ inline __attribute__((device)) const struct { __attribute__((device, always_inline)) unsigned int __get_z() const { \ return __fz; \ } \ + \ + private: \ + __HIP_DISALLOW_BUILTINVAR_ACCESS(__tag); \ } __HIP_COORD_BUILTIN(__hip_builtin_threadIdx_t, __gpu_thread_id_x(), @@ -47,11 +59,12 @@ __HIP_COORD_BUILTIN(__hip_builtin_gridDim_t, __gpu_num_blocks_x(), __gpu_num_blocks_y(), __gpu_num_blocks_z()); #pragma pop_macro("__HIP_COORD_BUILTIN") +#pragma pop_macro("__HIP_DISALLOW_BUILTINVAR_ACCESS") extern const __attribute__((device, weak)) __hip_builtin_threadIdx_t threadIdx; extern const __attribute__((device, weak)) __hip_builtin_blockIdx_t blockIdx; extern const __attribute__((device, weak)) __hip_builtin_blockDim_t blockDim; extern const __attribute__((device, weak)) __hip_builtin_gridDim_t gridDim; -#endif // __HIP__ && (defined(__AMDGPU__) || defined(__SPIRV__)) +#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) #endif // __CLANG_HIP_BUILTIN_VARS_H__ diff --git a/clang/lib/Headers/__clang_hip_device_functions.h b/clang/lib/Headers/__clang_hip_device_functions.h index 8105adcbcc501..4c949b0223d9f 100644 --- a/clang/lib/Headers/__clang_hip_device_functions.h +++ b/clang/lib/Headers/__clang_hip_device_functions.h @@ -351,5 +351,5 @@ __HIP_DEVICE__ long long wall_clock64(void) { #pragma pop_macro("MAYBE_UNDEF") #pragma pop_macro("__HIP_DEVICE__") -#endif // __HIP__ && defined(__AMDGPU__) +#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) #endif // __CLANG_HIP_DEVICE_FUNCTIONS_H__ diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h b/clang/lib/Headers/__clang_hip_intrinsics.h index 42fb3b40fb06f..330a80bf267ac 100644 --- a/clang/lib/Headers/__clang_hip_intrinsics.h +++ b/clang/lib/Headers/__clang_hip_intrinsics.h @@ -235,5 +235,5 @@ __funnelshift_rc(unsigned int __lo, unsigned int __hi, unsigned int __shift) { return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s); } -#endif // __HIP__ && defined(__AMDGPU__) +#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) #endif // __CLANG_HIP_INTRINSICS_H__ diff --git a/clang/test/Driver/hip-device-libs-llvm-env.hip b/clang/test/Driver/hip-device-libs-llvm-env.hip index 7f3c4c9b7af10..23354a2961beb 100644 --- a/clang/test/Driver/hip-device-libs-llvm-env.hip +++ b/clang/test/Driver/hip-device-libs-llvm-env.hip @@ -9,3 +9,26 @@ // LLVMENV-NOT: oclc // LLVMENV-NOT: ocml // LLVMENV-NOT: ockl + +// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \ +// RUN: -resource-dir=%S/Inputs/rocm_resource_dir \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck --check-prefix=INC %s + +// INC: "-cc1" "-triple" "amdgcn-amd-amdhsa-llvm" +// INC-SAME: "-include" "__clang_hip_device_functions.h" + +// RUN: %clang -### --target=x86_64-linux-gnu -nogpuinc \ +// RUN: --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \ +// RUN: -resource-dir=%S/Inputs/rocm_resource_dir \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck --check-prefix=NOINC %s + +// RUN: %clang -### --target=x86_64-linux-gnu -nohipwrapperinc \ +// RUN: --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \ +// RUN: -resource-dir=%S/Inputs/rocm_resource_dir \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck --check-prefix=NOINC %s + +// NOINC-NOT: __clang_hip_device_functions.h >From 32c48d112b53e56a59b6de25af1648a3c9293899 Mon Sep 17 00:00:00 2001 From: Joseph Huber <[email protected]> Date: Wed, 17 Jun 2026 08:50:53 -0500 Subject: [PATCH 4/4] Make generic --- clang/lib/Driver/ToolChains/HIPAMD.cpp | 2 +- clang/lib/Frontend/InitPreprocessor.cpp | 2 + clang/lib/Headers/CMakeLists.txt | 6 +- ...ltin_vars.h => __clang_gpu_builtin_vars.h} | 43 ++--- ...tions.h => __clang_gpu_device_functions.h} | 158 +++++++++--------- ..._intrinsics.h => __clang_gpu_intrinsics.h} | 85 +++++----- .../test/Driver/hip-device-libs-llvm-env.hip | 4 +- ...functions.hip => gpu-device-functions.cpp} | 27 ++- clang/test/Preprocessor/predefined-macros.c | 9 + 9 files changed, 182 insertions(+), 154 deletions(-) rename clang/lib/Headers/{__clang_hip_builtin_vars.h => __clang_gpu_builtin_vars.h} (67%) rename clang/lib/Headers/{__clang_hip_device_functions.h => __clang_gpu_device_functions.h} (68%) rename clang/lib/Headers/{__clang_hip_intrinsics.h => __clang_gpu_intrinsics.h} (77%) rename clang/test/Headers/{__clang_hip_device_functions.hip => gpu-device-functions.cpp} (74%) diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index ed02877cbf944..e180fc9ad1adf 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -336,7 +336,7 @@ void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs, options::OPT_no_offload_inc, true) && !DriverArgs.hasArg(options::OPT_nohipwrapperinc) && !DriverArgs.hasArg(options::OPT_nobuiltininc)) - CC1Args.append({"-include", "__clang_hip_device_functions.h"}); + CC1Args.append({"-include", "__clang_gpu_device_functions.h"}); return; } diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index ec009211ec6de..4456c0e140c2c 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -604,6 +604,8 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, } if (LangOpts.CUDAIsDevice) { Builder.defineMacro("__HIP_DEVICE_COMPILE__"); + if (TI.getTriple().getEnvironment() == llvm::Triple::LLVM) + Builder.defineMacro("__HIP_LLVM__"); if (!TI.hasHIPImageSupport()) { Builder.defineMacro("__HIP_NO_IMAGE_SUPPORT__", "1"); // Deprecated. diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 4a60af68c1d23..07cb92d6c51e3 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -86,9 +86,6 @@ set(hip_files __clang_hip_math.h __clang_hip_stdlib.h __clang_hip_runtime_wrapper.h - __clang_hip_builtin_vars.h - __clang_hip_device_functions.h - __clang_hip_intrinsics.h ) set(hlsl_h @@ -309,6 +306,9 @@ set(gpu_files nvptxintrin.h amdgpuintrin.h spirvintrin.h + __clang_gpu_builtin_vars.h + __clang_gpu_device_functions.h + __clang_gpu_intrinsics.h ) set(windows_only_files diff --git a/clang/lib/Headers/__clang_hip_builtin_vars.h b/clang/lib/Headers/__clang_gpu_builtin_vars.h similarity index 67% rename from clang/lib/Headers/__clang_hip_builtin_vars.h rename to clang/lib/Headers/__clang_gpu_builtin_vars.h index 622201c261ae2..43905b0ab4d86 100644 --- a/clang/lib/Headers/__clang_hip_builtin_vars.h +++ b/clang/lib/Headers/__clang_gpu_builtin_vars.h @@ -1,4 +1,4 @@ -//===---- __clang_hip_builtin_vars.h - HIP built-in variables --------------=== +//===---- __clang_gpu_builtin_vars.h - GPU built-in variables --------------=== // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,10 +6,11 @@ // //===-----------------------------------------------------------------------=== -#ifndef __CLANG_HIP_BUILTIN_VARS_H__ -#define __CLANG_HIP_BUILTIN_VARS_H__ +#ifndef __CLANG_GPU_BUILTIN_VARS_H__ +#define __CLANG_GPU_BUILTIN_VARS_H__ -#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) +#if (defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(__CUDA__) && defined(__CUDA_ARCH__)) #include <gpuintrin.h> @@ -22,15 +23,15 @@ inline __attribute__((device)) const struct { // Make sure nobody can create instances of the coordinate types, take their // address, copy, or assign them. -#pragma push_macro("__HIP_DISALLOW_BUILTINVAR_ACCESS") -#define __HIP_DISALLOW_BUILTINVAR_ACCESS(__tag) \ +#pragma push_macro("__GPU_DISALLOW_BUILTINVAR_ACCESS") +#define __GPU_DISALLOW_BUILTINVAR_ACCESS(__tag) \ __attribute__((device)) __tag() = delete; \ __attribute__((device)) __tag(const __tag &) = delete; \ __attribute__((device)) void operator=(const __tag &) const = delete; \ __attribute__((device)) __tag *operator&() const = delete -#pragma push_macro("__HIP_COORD_BUILTIN") -#define __HIP_COORD_BUILTIN(__tag, __fx, __fy, __fz) \ +#pragma push_macro("__GPU_COORD_BUILTIN") +#define __GPU_COORD_BUILTIN(__tag, __fx, __fy, __fz) \ struct __tag { \ __declspec(property(get = __get_x)) unsigned int x; \ __declspec(property(get = __get_y)) unsigned int y; \ @@ -46,25 +47,25 @@ inline __attribute__((device)) const struct { } \ \ private: \ - __HIP_DISALLOW_BUILTINVAR_ACCESS(__tag); \ + __GPU_DISALLOW_BUILTINVAR_ACCESS(__tag); \ } -__HIP_COORD_BUILTIN(__hip_builtin_threadIdx_t, __gpu_thread_id_x(), +__GPU_COORD_BUILTIN(__gpu_builtin_threadIdx_t, __gpu_thread_id_x(), __gpu_thread_id_y(), __gpu_thread_id_z()); -__HIP_COORD_BUILTIN(__hip_builtin_blockIdx_t, __gpu_block_id_x(), +__GPU_COORD_BUILTIN(__gpu_builtin_blockIdx_t, __gpu_block_id_x(), __gpu_block_id_y(), __gpu_block_id_z()); -__HIP_COORD_BUILTIN(__hip_builtin_blockDim_t, __gpu_num_threads_x(), +__GPU_COORD_BUILTIN(__gpu_builtin_blockDim_t, __gpu_num_threads_x(), __gpu_num_threads_y(), __gpu_num_threads_z()); -__HIP_COORD_BUILTIN(__hip_builtin_gridDim_t, __gpu_num_blocks_x(), +__GPU_COORD_BUILTIN(__gpu_builtin_gridDim_t, __gpu_num_blocks_x(), __gpu_num_blocks_y(), __gpu_num_blocks_z()); -#pragma pop_macro("__HIP_COORD_BUILTIN") -#pragma pop_macro("__HIP_DISALLOW_BUILTINVAR_ACCESS") +#pragma pop_macro("__GPU_COORD_BUILTIN") +#pragma pop_macro("__GPU_DISALLOW_BUILTINVAR_ACCESS") -extern const __attribute__((device, weak)) __hip_builtin_threadIdx_t threadIdx; -extern const __attribute__((device, weak)) __hip_builtin_blockIdx_t blockIdx; -extern const __attribute__((device, weak)) __hip_builtin_blockDim_t blockDim; -extern const __attribute__((device, weak)) __hip_builtin_gridDim_t gridDim; +extern const __attribute__((device, weak)) __gpu_builtin_threadIdx_t threadIdx; +extern const __attribute__((device, weak)) __gpu_builtin_blockIdx_t blockIdx; +extern const __attribute__((device, weak)) __gpu_builtin_blockDim_t blockDim; +extern const __attribute__((device, weak)) __gpu_builtin_gridDim_t gridDim; -#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) -#endif // __CLANG_HIP_BUILTIN_VARS_H__ +#endif // device compile +#endif // __CLANG_GPU_BUILTIN_VARS_H__ diff --git a/clang/lib/Headers/__clang_hip_device_functions.h b/clang/lib/Headers/__clang_gpu_device_functions.h similarity index 68% rename from clang/lib/Headers/__clang_hip_device_functions.h rename to clang/lib/Headers/__clang_gpu_device_functions.h index 4c949b0223d9f..3b068052d935b 100644 --- a/clang/lib/Headers/__clang_hip_device_functions.h +++ b/clang/lib/Headers/__clang_gpu_device_functions.h @@ -1,4 +1,4 @@ -//===---- __clang_hip_device_functions.h - HIP device functions ------------=== +//===---- __clang_gpu_device_functions.h - GPU device functions ------------=== // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,10 +6,11 @@ // //===-----------------------------------------------------------------------=== -#ifndef __CLANG_HIP_DEVICE_FUNCTIONS_H__ -#define __CLANG_HIP_DEVICE_FUNCTIONS_H__ +#ifndef __CLANG_GPU_DEVICE_FUNCTIONS_H__ +#define __CLANG_GPU_DEVICE_FUNCTIONS_H__ -#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) +#if (defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(__CUDA__) && defined(__CUDA_ARCH__)) #ifndef __device__ #define __host__ __attribute__((host)) @@ -22,104 +23,102 @@ #include <gpuintrin.h> -#define __HIP_LLVM__ 1 - -#pragma push_macro("__HIP_DEVICE__") -#define __HIP_DEVICE__ static __inline__ __attribute__((device, always_inline)) +#pragma push_macro("__GPU_DEVICE__") +#define __GPU_DEVICE__ static __inline__ __attribute__((device, always_inline)) #pragma push_macro("MAYBE_UNDEF") #define MAYBE_UNDEF __attribute__((maybe_undef)) // warpSize and the threadIdx/blockIdx/blockDim/gridDim coordinate variables. -#include <__clang_hip_builtin_vars.h> +#include <__clang_gpu_builtin_vars.h> //===----------------------------------------------------------------------===// // Integer intrinsics. //===----------------------------------------------------------------------===// -__HIP_DEVICE__ unsigned int __popc(unsigned int __x) { +__GPU_DEVICE__ unsigned int __popc(unsigned int __x) { return __builtin_popcountg(__x); } -__HIP_DEVICE__ unsigned int __popcll(unsigned long long __x) { +__GPU_DEVICE__ unsigned int __popcll(unsigned long long __x) { return __builtin_popcountg(__x); } -__HIP_DEVICE__ int __clz(int __x) { +__GPU_DEVICE__ int __clz(int __x) { return __builtin_clzg((unsigned int)__x, 32); } -__HIP_DEVICE__ int __clzll(long long __x) { +__GPU_DEVICE__ int __clzll(long long __x) { return __builtin_clzg((unsigned long long)__x, 64); } -__HIP_DEVICE__ int __ffs(int __x) { +__GPU_DEVICE__ int __ffs(int __x) { return __builtin_ctzg((unsigned int)__x, -1) + 1; } -__HIP_DEVICE__ int __ffs(unsigned int __x) { +__GPU_DEVICE__ int __ffs(unsigned int __x) { return __builtin_ctzg(__x, -1) + 1; } -__HIP_DEVICE__ int __ffsll(long long __x) { +__GPU_DEVICE__ int __ffsll(long long __x) { return __builtin_ctzg((unsigned long long)__x, -1) + 1; } -__HIP_DEVICE__ int __ffsll(unsigned long long __x) { +__GPU_DEVICE__ int __ffsll(unsigned long long __x) { return __builtin_ctzg(__x, -1) + 1; } -__HIP_DEVICE__ unsigned int __brev(unsigned int __x) { +__GPU_DEVICE__ unsigned int __brev(unsigned int __x) { return __builtin_elementwise_bitreverse(__x); } -__HIP_DEVICE__ unsigned long long __brevll(unsigned long long __x) { +__GPU_DEVICE__ unsigned long long __brevll(unsigned long long __x) { return __builtin_elementwise_bitreverse(__x); } -__HIP_DEVICE__ int __mul24(int __x, int __y) { +__GPU_DEVICE__ int __mul24(int __x, int __y) { return ((int(unsigned(__x) << 8) >> 8)) * ((int(unsigned(__y) << 8) >> 8)); } -__HIP_DEVICE__ int __umul24(unsigned int __x, unsigned int __y) { +__GPU_DEVICE__ int __umul24(unsigned int __x, unsigned int __y) { return int((__x & 0x00ffffffu) * (__y & 0x00ffffffu)); } -__HIP_DEVICE__ int __mulhi(int __x, int __y) { +__GPU_DEVICE__ int __mulhi(int __x, int __y) { return int(((long long)__x * (long long)__y) >> 32); } -__HIP_DEVICE__ unsigned int __umulhi(unsigned int __x, unsigned int __y) { +__GPU_DEVICE__ unsigned int __umulhi(unsigned int __x, unsigned int __y) { return (unsigned int)(((unsigned long long)__x * (unsigned long long)__y) >> 32); } -__HIP_DEVICE__ long long __mul64hi(long long __x, long long __y) { +__GPU_DEVICE__ long long __mul64hi(long long __x, long long __y) { return (long long)(((__int128)__x * (__int128)__y) >> 64); } -__HIP_DEVICE__ unsigned long long __umul64hi(unsigned long long __x, +__GPU_DEVICE__ unsigned long long __umul64hi(unsigned long long __x, unsigned long long __y) { return ( unsigned long long)(((unsigned __int128)__x * (unsigned __int128)__y) >> 64); } -__HIP_DEVICE__ unsigned int __sad(int __x, int __y, unsigned int __z) { +__GPU_DEVICE__ unsigned int __sad(int __x, int __y, unsigned int __z) { return __x > __y ? __x - __y + __z : __y - __x + __z; } -__HIP_DEVICE__ unsigned int __usad(unsigned int __x, unsigned int __y, +__GPU_DEVICE__ unsigned int __usad(unsigned int __x, unsigned int __y, unsigned int __z) { return __x > __y ? __x - __y + __z : __y - __x + __z; } -__HIP_DEVICE__ int __hadd(int __x, int __y) { +__GPU_DEVICE__ int __hadd(int __x, int __y) { return int(((long long)__x + (long long)__y) >> 1); } -__HIP_DEVICE__ int __rhadd(int __x, int __y) { +__GPU_DEVICE__ int __rhadd(int __x, int __y) { return int(((long long)__x + (long long)__y + 1) >> 1); } -__HIP_DEVICE__ unsigned int __uhadd(unsigned int __x, unsigned int __y) { +__GPU_DEVICE__ unsigned int __uhadd(unsigned int __x, unsigned int __y) { return (unsigned int)(((unsigned long long)__x + (unsigned long long)__y) >> 1); } -__HIP_DEVICE__ unsigned int __urhadd(unsigned int __x, unsigned int __y) { +__GPU_DEVICE__ unsigned int __urhadd(unsigned int __x, unsigned int __y) { return ( unsigned int)(((unsigned long long)__x + (unsigned long long)__y + 1) >> 1); } -__HIP_DEVICE__ unsigned int __byte_perm(unsigned int __x, unsigned int __y, +__GPU_DEVICE__ unsigned int __byte_perm(unsigned int __x, unsigned int __y, unsigned int __s) { unsigned long long __tmp = ((unsigned long long)__y << 32) | __x; unsigned int __result = 0; @@ -134,18 +133,18 @@ __HIP_DEVICE__ unsigned int __byte_perm(unsigned int __x, unsigned int __y, // Bitfield operations. //===----------------------------------------------------------------------===// -__HIP_DEVICE__ unsigned int __lastbit_u32_u64(unsigned long long __x) { +__GPU_DEVICE__ unsigned int __lastbit_u32_u64(unsigned long long __x) { return (unsigned int)__builtin_ctzg(__x, -1); } -__HIP_DEVICE__ unsigned int __bitextract_u32(unsigned int __src, +__GPU_DEVICE__ unsigned int __bitextract_u32(unsigned int __src, unsigned int __offset, unsigned int __width) { unsigned int __o = __offset & 31u; unsigned int __w = __width & 31u; return __w == 0 ? 0u : (__src << (32u - __o - __w)) >> (32u - __w); } -__HIP_DEVICE__ unsigned long long __bitextract_u64(unsigned long long __src, +__GPU_DEVICE__ unsigned long long __bitextract_u64(unsigned long long __src, unsigned int __offset, unsigned int __width) { unsigned long long __o = __offset & 63u; @@ -153,7 +152,7 @@ __HIP_DEVICE__ unsigned long long __bitextract_u64(unsigned long long __src, return __w == 0 ? 0ull : (__src << (64ull - __o - __w)) >> (64ull - __w); } -__HIP_DEVICE__ unsigned int __bitinsert_u32(unsigned int __dst, +__GPU_DEVICE__ unsigned int __bitinsert_u32(unsigned int __dst, unsigned int __src, unsigned int __offset, unsigned int __width) { @@ -161,7 +160,7 @@ __HIP_DEVICE__ unsigned int __bitinsert_u32(unsigned int __dst, unsigned int __mask = (1u << (__width & 31u)) - 1u; return (__dst & ~(__mask << __o)) | ((__src & __mask) << __o); } -__HIP_DEVICE__ unsigned long long __bitinsert_u64(unsigned long long __dst, +__GPU_DEVICE__ unsigned long long __bitinsert_u64(unsigned long long __dst, unsigned long long __src, unsigned int __offset, unsigned int __width) { @@ -174,31 +173,31 @@ __HIP_DEVICE__ unsigned long long __bitinsert_u64(unsigned long long __dst, // Type punning. //===----------------------------------------------------------------------===// -__HIP_DEVICE__ int __float_as_int(float __x) { +__GPU_DEVICE__ int __float_as_int(float __x) { return __builtin_bit_cast(int, __x); } -__HIP_DEVICE__ unsigned int __float_as_uint(float __x) { +__GPU_DEVICE__ unsigned int __float_as_uint(float __x) { return __builtin_bit_cast(unsigned int, __x); } -__HIP_DEVICE__ float __int_as_float(int __x) { +__GPU_DEVICE__ float __int_as_float(int __x) { return __builtin_bit_cast(float, __x); } -__HIP_DEVICE__ float __uint_as_float(unsigned int __x) { +__GPU_DEVICE__ float __uint_as_float(unsigned int __x) { return __builtin_bit_cast(float, __x); } -__HIP_DEVICE__ long long __double_as_longlong(double __x) { +__GPU_DEVICE__ long long __double_as_longlong(double __x) { return __builtin_bit_cast(long long, __x); } -__HIP_DEVICE__ double __longlong_as_double(long long __x) { +__GPU_DEVICE__ double __longlong_as_double(long long __x) { return __builtin_bit_cast(double, __x); } -__HIP_DEVICE__ int __double2hiint(double __x) { +__GPU_DEVICE__ int __double2hiint(double __x) { return int(__builtin_bit_cast(unsigned long long, __x) >> 32); } -__HIP_DEVICE__ int __double2loint(double __x) { +__GPU_DEVICE__ int __double2loint(double __x) { return int(__builtin_bit_cast(unsigned long long, __x)); } -__HIP_DEVICE__ double __hiloint2double(int __hi, int __lo) { +__GPU_DEVICE__ double __hiloint2double(int __hi, int __lo) { return __builtin_bit_cast(double, ((unsigned long long)(unsigned int)__hi << 32) | (unsigned long long)(unsigned int)__lo); @@ -208,27 +207,27 @@ __HIP_DEVICE__ double __hiloint2double(int __hi, int __lo) { // Wavefront vote and lane identity. //===----------------------------------------------------------------------===// -__HIP_DEVICE__ unsigned int __lane_id(void) { return __gpu_lane_id(); } +__GPU_DEVICE__ unsigned int __lane_id(void) { return __gpu_lane_id(); } -__HIP_DEVICE__ unsigned long long __ballot(int __pred) { +__GPU_DEVICE__ unsigned long long __ballot(int __pred) { return __gpu_ballot(__gpu_lane_mask(), __pred); } -__HIP_DEVICE__ unsigned long long __ballot64(int __pred) { +__GPU_DEVICE__ unsigned long long __ballot64(int __pred) { return __gpu_ballot(__gpu_lane_mask(), __pred); } -__HIP_DEVICE__ unsigned long long __activemask(void) { +__GPU_DEVICE__ unsigned long long __activemask(void) { return __gpu_ballot(__gpu_lane_mask(), 1); } -__HIP_DEVICE__ int __all(int __pred) { +__GPU_DEVICE__ int __all(int __pred) { return __gpu_ballot(__gpu_lane_mask(), __pred) == __gpu_lane_mask(); } -__HIP_DEVICE__ int __any(int __pred) { +__GPU_DEVICE__ int __any(int __pred) { return __gpu_ballot(__gpu_lane_mask(), __pred) != 0ull; } template <typename __T> -__HIP_DEVICE__ int __hip_fns_impl(__T __mask, unsigned int __base, +__GPU_DEVICE__ int __gpu_fns_impl(__T __mask, unsigned int __base, int __offset) { const int __bits = int(sizeof(__T)) * 8; __T __m = __mask; @@ -258,15 +257,15 @@ __HIP_DEVICE__ int __hip_fns_impl(__T __mask, unsigned int __base, } return __offset < 0 ? (__bits - 1) - __total : __total; } -__HIP_DEVICE__ int __fns64(unsigned long long __mask, unsigned int __base, +__GPU_DEVICE__ int __fns64(unsigned long long __mask, unsigned int __base, int __offset) { - return __hip_fns_impl(__mask, __base, __offset); + return __gpu_fns_impl(__mask, __base, __offset); } -__HIP_DEVICE__ int __fns32(unsigned long long __mask, unsigned int __base, +__GPU_DEVICE__ int __fns32(unsigned long long __mask, unsigned int __base, int __offset) { - return __hip_fns_impl((unsigned int)__mask, __base, __offset); + return __gpu_fns_impl((unsigned int)__mask, __base, __offset); } -__HIP_DEVICE__ int __fns(unsigned int __mask, unsigned int __base, +__GPU_DEVICE__ int __fns(unsigned int __mask, unsigned int __base, int __offset) { return __fns32(__mask, __base, __offset); } @@ -275,10 +274,13 @@ __HIP_DEVICE__ int __fns(unsigned int __mask, unsigned int __base, // Synchronization and fences //===----------------------------------------------------------------------===// -__HIP_DEVICE__ void __syncthreads(void) { __gpu_sync_threads(); } +// Some targets (e.g. NVPTX) expose __syncthreads as a compiler builtin already. +#if !__has_builtin(__syncthreads) +__GPU_DEVICE__ void __syncthreads(void) { __gpu_sync_threads(); } +#endif template <typename __Fn> -__HIP_DEVICE__ int __hip_block_reduce_impl(int __val, int __init, __Fn __op) { +__GPU_DEVICE__ int __gpu_block_reduce_impl(int __val, int __init, __Fn __op) { static __attribute__((shared)) int __scratch[32]; unsigned int __lanes = __gpu_num_lanes(); unsigned int __nthreads = __gpu_num_threads(__GPU_X_DIM) * @@ -302,32 +304,32 @@ __HIP_DEVICE__ int __hip_block_reduce_impl(int __val, int __init, __Fn __op) { return __acc; } -__HIP_DEVICE__ int __syncthreads_count(int __pred) { +__GPU_DEVICE__ int __syncthreads_count(int __pred) { unsigned long long __mask = __gpu_lane_mask(); int __val = __builtin_popcountg(__gpu_ballot(__mask, __pred)); - return __hip_block_reduce_impl(__val, 0, + return __gpu_block_reduce_impl(__val, 0, [](int __a, int __b) { return __a + __b; }); } -__HIP_DEVICE__ int __syncthreads_and(int __pred) { +__GPU_DEVICE__ int __syncthreads_and(int __pred) { unsigned long long __mask = __gpu_lane_mask(); int __val = __gpu_ballot(__mask, __pred) == __mask; - return __hip_block_reduce_impl(__val, 1, + return __gpu_block_reduce_impl(__val, 1, [](int __a, int __b) { return __a & __b; }); } -__HIP_DEVICE__ int __syncthreads_or(int __pred) { +__GPU_DEVICE__ int __syncthreads_or(int __pred) { unsigned long long __mask = __gpu_lane_mask(); int __val = __gpu_ballot(__mask, __pred) != 0ull; - return __hip_block_reduce_impl(__val, 0, + return __gpu_block_reduce_impl(__val, 0, [](int __a, int __b) { return __a | __b; }); } -__HIP_DEVICE__ void __threadfence(void) { +__GPU_DEVICE__ void __threadfence(void) { __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); } -__HIP_DEVICE__ void __threadfence_block(void) { +__GPU_DEVICE__ void __threadfence_block(void) { __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_WRKGRP); } -__HIP_DEVICE__ void __threadfence_system(void) { +__GPU_DEVICE__ void __threadfence_system(void) { __scoped_atomic_thread_fence(__ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM); } @@ -335,21 +337,21 @@ __HIP_DEVICE__ void __threadfence_system(void) { // Timers //===----------------------------------------------------------------------===// -__HIP_DEVICE__ long long __clock64(void) { +__GPU_DEVICE__ long long __clock64(void) { return (long long)__builtin_readcyclecounter(); } -__HIP_DEVICE__ long long __clock(void) { return __clock64(); } -__HIP_DEVICE__ long long clock64(void) { return __clock64(); } -__HIP_DEVICE__ long long clock(void) { return __clock(); } -__HIP_DEVICE__ long long wall_clock64(void) { +__GPU_DEVICE__ long long __clock(void) { return __clock64(); } +__GPU_DEVICE__ long long clock64(void) { return __clock64(); } +__GPU_DEVICE__ long long clock(void) { return __clock(); } +__GPU_DEVICE__ long long wall_clock64(void) { return (long long)__builtin_readsteadycounter(); } // Warp shuffle / synchronization / reduction intrinsics. -#include <__clang_hip_intrinsics.h> +#include <__clang_gpu_intrinsics.h> #pragma pop_macro("MAYBE_UNDEF") -#pragma pop_macro("__HIP_DEVICE__") +#pragma pop_macro("__GPU_DEVICE__") -#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) -#endif // __CLANG_HIP_DEVICE_FUNCTIONS_H__ +#endif // device compile +#endif // __CLANG_GPU_DEVICE_FUNCTIONS_H__ diff --git a/clang/lib/Headers/__clang_hip_intrinsics.h b/clang/lib/Headers/__clang_gpu_intrinsics.h similarity index 77% rename from clang/lib/Headers/__clang_hip_intrinsics.h rename to clang/lib/Headers/__clang_gpu_intrinsics.h index 330a80bf267ac..552de46d72f0c 100644 --- a/clang/lib/Headers/__clang_hip_intrinsics.h +++ b/clang/lib/Headers/__clang_gpu_intrinsics.h @@ -1,4 +1,4 @@ -//===--- __clang_hip_intrinsics.h - Device-side HIP intrinsic wrappers ------=== +//===--- __clang_gpu_intrinsics.h - Device-side GPU intrinsic wrappers ------=== // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,14 +6,15 @@ // //===-----------------------------------------------------------------------=== -#ifndef __CLANG_HIP_INTRINSICS_H__ -#define __CLANG_HIP_INTRINSICS_H__ +#ifndef __CLANG_GPU_INTRINSICS_H__ +#define __CLANG_GPU_INTRINSICS_H__ -#if __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) +#if (defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(__CUDA__) && defined(__CUDA_ARCH__)) -#ifndef __HIP_DEVICE__ +#ifndef __GPU_DEVICE__ #error \ - "__clang_hip_intrinsics.h must be included via __clang_hip_device_functions.h" + "__clang_gpu_intrinsics.h must be included via __clang_gpu_device_functions.h" #endif //===----------------------------------------------------------------------===// @@ -21,7 +22,7 @@ //===----------------------------------------------------------------------===// template <typename __T> -__HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, unsigned int __idx, +__GPU_DEVICE__ __T __gpu_shuffle_idx_impl(__T __v, unsigned int __idx, int __w) { if constexpr (sizeof(__T) == sizeof(unsigned long long)) return __builtin_bit_cast( @@ -36,33 +37,33 @@ __HIP_DEVICE__ __T __hip_shuffle_idx_impl(__T __v, unsigned int __idx, } template <typename __T> -__HIP_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int __src_lane, +__GPU_DEVICE__ __T __shfl(MAYBE_UNDEF __T __var, int __src_lane, int __width = warpSize) { - return __hip_shuffle_idx_impl( + return __gpu_shuffle_idx_impl( __var, (unsigned int)(__src_lane & (__width - 1)), __width); } template <typename __T> -__HIP_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, unsigned int __delta, +__GPU_DEVICE__ __T __shfl_up(MAYBE_UNDEF __T __var, unsigned int __delta, int __width = warpSize) { int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1)); int __tgt = __rel - int(__delta); - return __hip_shuffle_idx_impl( + return __gpu_shuffle_idx_impl( __var, (unsigned int)(__tgt < 0 ? __rel : __tgt), __width); } template <typename __T> -__HIP_DEVICE__ __T __shfl_down(MAYBE_UNDEF __T __var, unsigned int __delta, +__GPU_DEVICE__ __T __shfl_down(MAYBE_UNDEF __T __var, unsigned int __delta, int __width = warpSize) { int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1)); int __tgt = __rel + int(__delta); - return __hip_shuffle_idx_impl( + return __gpu_shuffle_idx_impl( __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width); } template <typename __T> -__HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask, +__GPU_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask, int __width = warpSize) { int __rel = int(__gpu_lane_id() & (unsigned int)(__width - 1)); int __tgt = __rel ^ __lane_mask; - return __hip_shuffle_idx_impl( + return __gpu_shuffle_idx_impl( __var, (unsigned int)(__tgt >= __width ? __rel : __tgt), __width); } @@ -70,7 +71,7 @@ __HIP_DEVICE__ __T __shfl_xor(MAYBE_UNDEF __T __var, int __lane_mask, // Warp synchronization //===----------------------------------------------------------------------===// -__HIP_DEVICE__ void __syncwarp(unsigned long long __mask = -1) { +__GPU_DEVICE__ void __syncwarp(unsigned long long __mask = -1) { __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT); __gpu_sync_lane(__mask); __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT); @@ -81,40 +82,40 @@ __HIP_DEVICE__ void __syncwarp(unsigned long long __mask = -1) { //===----------------------------------------------------------------------===// template <typename __MaskT> -__HIP_DEVICE__ unsigned long long __ballot_sync(__MaskT __mask, int __pred) { +__GPU_DEVICE__ unsigned long long __ballot_sync(__MaskT __mask, int __pred) { return __ballot(__pred) & (unsigned long long)__mask; } template <typename __MaskT> -__HIP_DEVICE__ int __all_sync(__MaskT __mask, int __pred) { +__GPU_DEVICE__ int __all_sync(__MaskT __mask, int __pred) { return __ballot_sync(__mask, __pred) == (unsigned long long)__mask; } template <typename __MaskT> -__HIP_DEVICE__ int __any_sync(__MaskT __mask, int __pred) { +__GPU_DEVICE__ int __any_sync(__MaskT __mask, int __pred) { return __ballot_sync(__mask, __pred) != 0ull; } template <typename __MaskT, typename __T> -__HIP_DEVICE__ __T __shfl_sync(__MaskT __mask, MAYBE_UNDEF __T __var, +__GPU_DEVICE__ __T __shfl_sync(__MaskT __mask, MAYBE_UNDEF __T __var, int __src_lane, int __width = warpSize) { (void)__mask; return __shfl(__var, __src_lane, __width); } template <typename __MaskT, typename __T> -__HIP_DEVICE__ __T __shfl_up_sync(__MaskT __mask, MAYBE_UNDEF __T __var, +__GPU_DEVICE__ __T __shfl_up_sync(__MaskT __mask, MAYBE_UNDEF __T __var, unsigned int __delta, int __width = warpSize) { (void)__mask; return __shfl_up(__var, __delta, __width); } template <typename __MaskT, typename __T> -__HIP_DEVICE__ __T __shfl_down_sync(__MaskT __mask, MAYBE_UNDEF __T __var, +__GPU_DEVICE__ __T __shfl_down_sync(__MaskT __mask, MAYBE_UNDEF __T __var, unsigned int __delta, int __width = warpSize) { (void)__mask; return __shfl_down(__var, __delta, __width); } template <typename __MaskT, typename __T> -__HIP_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var, +__GPU_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var, int __lane_mask, int __width = warpSize) { (void)__mask; return __shfl_xor(__var, __lane_mask, __width); @@ -125,7 +126,7 @@ __HIP_DEVICE__ __T __shfl_xor_sync(__MaskT __mask, MAYBE_UNDEF __T __var, //===----------------------------------------------------------------------===// template <typename __T> -__HIP_DEVICE__ unsigned long long __match_any(__T __value) { +__GPU_DEVICE__ unsigned long long __match_any(__T __value) { if constexpr (sizeof(__T) == sizeof(unsigned long long)) return __gpu_match_any_u64(__gpu_lane_mask(), __builtin_bit_cast(unsigned long long, __value)); @@ -134,13 +135,13 @@ __HIP_DEVICE__ unsigned long long __match_any(__T __value) { __builtin_bit_cast(unsigned int, __value)); } template <typename __MaskT, typename __T> -__HIP_DEVICE__ unsigned long long __match_any_sync(__MaskT __mask, +__GPU_DEVICE__ unsigned long long __match_any_sync(__MaskT __mask, __T __value) { return __match_any(__value) & (unsigned long long)__mask; } template <typename __T> -__HIP_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) { +__GPU_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) { unsigned long long __r; if constexpr (sizeof(__T) == sizeof(unsigned long long)) __r = __gpu_match_all_u64(__gpu_lane_mask(), @@ -152,7 +153,7 @@ __HIP_DEVICE__ unsigned long long __match_all(__T __value, int *__pred) { return __r; } template <typename __MaskT, typename __T> -__HIP_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value, +__GPU_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value, int *__pred) { (void)__mask; return __match_all(__value, __pred); @@ -163,49 +164,49 @@ __HIP_DEVICE__ unsigned long long __match_all_sync(__MaskT __mask, __T __value, //===----------------------------------------------------------------------===// template <typename __MaskT> -__HIP_DEVICE__ unsigned int __reduce_add_sync(__MaskT __mask, +__GPU_DEVICE__ unsigned int __reduce_add_sync(__MaskT __mask, unsigned int __val) { return __gpu_lane_add_u32((unsigned long long)__mask, __val); } template <typename __MaskT> -__HIP_DEVICE__ int __reduce_add_sync(__MaskT __mask, int __val) { +__GPU_DEVICE__ int __reduce_add_sync(__MaskT __mask, int __val) { return int( __gpu_lane_add_u32((unsigned long long)__mask, (unsigned int)__val)); } template <typename __MaskT> -__HIP_DEVICE__ unsigned int __reduce_min_sync(__MaskT __mask, +__GPU_DEVICE__ unsigned int __reduce_min_sync(__MaskT __mask, unsigned int __val) { return __gpu_lane_min_u32((unsigned long long)__mask, __val); } template <typename __MaskT> -__HIP_DEVICE__ int __reduce_min_sync(__MaskT __mask, int __val) { +__GPU_DEVICE__ int __reduce_min_sync(__MaskT __mask, int __val) { unsigned int __r = __gpu_lane_min_u32((unsigned long long)__mask, (unsigned int)__val ^ 0x80000000u); return int(__r ^ 0x80000000u); } template <typename __MaskT> -__HIP_DEVICE__ unsigned int __reduce_max_sync(__MaskT __mask, +__GPU_DEVICE__ unsigned int __reduce_max_sync(__MaskT __mask, unsigned int __val) { return __gpu_lane_max_u32((unsigned long long)__mask, __val); } template <typename __MaskT> -__HIP_DEVICE__ int __reduce_max_sync(__MaskT __mask, int __val) { +__GPU_DEVICE__ int __reduce_max_sync(__MaskT __mask, int __val) { unsigned int __r = __gpu_lane_max_u32((unsigned long long)__mask, (unsigned int)__val ^ 0x80000000u); return int(__r ^ 0x80000000u); } template <typename __MaskT> -__HIP_DEVICE__ unsigned int __reduce_and_sync(__MaskT __mask, +__GPU_DEVICE__ unsigned int __reduce_and_sync(__MaskT __mask, unsigned int __val) { return __gpu_lane_and_u32((unsigned long long)__mask, __val); } template <typename __MaskT> -__HIP_DEVICE__ unsigned int __reduce_or_sync(__MaskT __mask, +__GPU_DEVICE__ unsigned int __reduce_or_sync(__MaskT __mask, unsigned int __val) { return __gpu_lane_or_u32((unsigned long long)__mask, __val); } template <typename __MaskT> -__HIP_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask, +__GPU_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask, unsigned int __val) { return __gpu_lane_xor_u32((unsigned long long)__mask, __val); } @@ -214,26 +215,26 @@ __HIP_DEVICE__ unsigned int __reduce_xor_sync(__MaskT __mask, // Funnel shifts. //===----------------------------------------------------------------------===// -__HIP_DEVICE__ unsigned int +__GPU_DEVICE__ unsigned int __funnelshift_l(unsigned int __lo, unsigned int __hi, unsigned int __shift) { unsigned int __s = __shift & 31u; return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 32); } -__HIP_DEVICE__ unsigned int +__GPU_DEVICE__ unsigned int __funnelshift_lc(unsigned int __lo, unsigned int __hi, unsigned int __shift) { unsigned int __s = __shift >= 32u ? 32u : __shift; return (unsigned int)((((unsigned long long)__hi << 32 | __lo) << __s) >> 32); } -__HIP_DEVICE__ unsigned int +__GPU_DEVICE__ unsigned int __funnelshift_r(unsigned int __lo, unsigned int __hi, unsigned int __shift) { unsigned int __s = __shift & 31u; return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s); } -__HIP_DEVICE__ unsigned int +__GPU_DEVICE__ unsigned int __funnelshift_rc(unsigned int __lo, unsigned int __hi, unsigned int __shift) { unsigned int __s = __shift >= 32u ? 32u : __shift; return (unsigned int)(((unsigned long long)__hi << 32 | __lo) >> __s); } -#endif // __HIP__ && (defined(__HIP_DEVICE_COMPILE__)) -#endif // __CLANG_HIP_INTRINSICS_H__ +#endif // device compile +#endif // __CLANG_GPU_INTRINSICS_H__ diff --git a/clang/test/Driver/hip-device-libs-llvm-env.hip b/clang/test/Driver/hip-device-libs-llvm-env.hip index 23354a2961beb..2d6f7f2479fbc 100644 --- a/clang/test/Driver/hip-device-libs-llvm-env.hip +++ b/clang/test/Driver/hip-device-libs-llvm-env.hip @@ -17,7 +17,7 @@ // RUN: 2>&1 | FileCheck --check-prefix=INC %s // INC: "-cc1" "-triple" "amdgcn-amd-amdhsa-llvm" -// INC-SAME: "-include" "__clang_hip_device_functions.h" +// INC-SAME: "-include" "__clang_gpu_device_functions.h" // RUN: %clang -### --target=x86_64-linux-gnu -nogpuinc \ // RUN: --offload-targets=amdgcn-amd-amdhsa-llvm --offload-arch=gfx90a \ @@ -31,4 +31,4 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck --check-prefix=NOINC %s -// NOINC-NOT: __clang_hip_device_functions.h +// NOINC-NOT: __clang_gpu_device_functions.h diff --git a/clang/test/Headers/__clang_hip_device_functions.hip b/clang/test/Headers/gpu-device-functions.cpp similarity index 74% rename from clang/test/Headers/__clang_hip_device_functions.hip rename to clang/test/Headers/gpu-device-functions.cpp index 472f1c97f66b1..e9541936adebd 100644 --- a/clang/test/Headers/__clang_hip_device_functions.hip +++ b/clang/test/Headers/gpu-device-functions.cpp @@ -1,15 +1,28 @@ -// REQUIRES: amdgpu-registered-target +// HIP on AMDGPU. +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// RUN: -internal-isystem %S/../../lib/Headers \ +// RUN: -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \ +// RUN: -x hip -fcuda-is-device -target-cpu gfx90a -fsyntax-only -verify %s \ +// RUN: -include __clang_gpu_device_functions.h +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// RUN: -internal-isystem %S/../../lib/Headers \ +// RUN: -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \ +// RUN: -x hip -fcuda-is-device -target-cpu gfx1100 -fsyntax-only -verify %s \ +// RUN: -include __clang_gpu_device_functions.h +// HIP on SPIR-V. // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ // RUN: -internal-isystem %S/../../lib/Headers \ -// RUN: -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \ -// RUN: -fcuda-is-device -target-cpu gfx90a -fsyntax-only -verify %s \ -// RUN: -include __clang_hip_device_functions.h +// RUN: -triple spirv64-amd-amdhsa -aux-triple x86_64-unknown-unknown \ +// RUN: -x hip -fcuda-is-device -fsyntax-only -verify %s \ +// RUN: -include __clang_gpu_device_functions.h + +// CUDA on NVPTX. // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ // RUN: -internal-isystem %S/../../lib/Headers \ -// RUN: -triple amdgcn-amd-amdhsa-llvm -aux-triple x86_64-unknown-unknown \ -// RUN: -fcuda-is-device -target-cpu gfx1100 -fsyntax-only -verify %s \ -// RUN: -include __clang_hip_device_functions.h +// RUN: -triple nvptx64-nvidia-cuda -aux-triple x86_64-unknown-unknown \ +// RUN: -x cuda -fcuda-is-device -target-cpu sm_70 -fsyntax-only -verify %s \ +// RUN: -include __clang_gpu_device_functions.h // expected-no-diagnostics diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c index 2dd4b78238a7b..dc7a884ec8ed8 100644 --- a/clang/test/Preprocessor/predefined-macros.c +++ b/clang/test/Preprocessor/predefined-macros.c @@ -297,6 +297,15 @@ // CHECK-HIP-DEV: #define __HIPCC__ 1 // CHECK-HIP-DEV: #define __HIP_DEVICE_COMPILE__ 1 // CHECK-HIP-DEV: #define __HIP__ 1 +// CHECK-HIP-DEV-NOT: #define __HIP_LLVM__ 1 + +// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa-llvm \ +// RUN: -fcuda-is-device \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-HIP-LLVM +// CHECK-HIP-LLVM: #define __HIPCC__ 1 +// CHECK-HIP-LLVM: #define __HIP_DEVICE_COMPILE__ 1 +// CHECK-HIP-LLVM: #define __HIP_LLVM__ 1 +// CHECK-HIP-LLVM: #define __HIP__ 1 // RUN: %clang_cc1 %s -E -dM -o - -x cuda -triple nvptx \ // RUN: -fcuda-is-device \ _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
