https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/190021
>From e215f3792d5fe1504a46d91b26f9502a2a72a5ac Mon Sep 17 00:00:00 2001 From: Artem Belevich <[email protected]> Date: Tue, 31 Mar 2026 16:01:08 -0700 Subject: [PATCH 1/6] [CUDA] Refactor and consolidate load/store intrinsics - Defined family macros to reduce code duplication. - Added support for unsigned long types. - Implemented missing load/store families (ca, lu, wb, cg, cs). - Forwarded long/ulong types to standard types. Verified with test file. --- clang/lib/Headers/__clang_cuda_intrinsics.h | 390 ++++++++++---------- 1 file changed, 196 insertions(+), 194 deletions(-) diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h index cca97cb21ef50..9507b6ce59e82 100644 --- a/clang/lib/Headers/__clang_cuda_intrinsics.h +++ b/clang/lib/Headers/__clang_cuda_intrinsics.h @@ -523,173 +523,119 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, return __ret; \ } -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", ); -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", ); -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", ); -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", ); -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long, - "=l", ); - -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", ); -__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", ); -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", ); -__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", ); -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", ); -__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", ); -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l", ); - -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int, - "=r", ); -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short, - "=h", ); -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int, - "=r", ); -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long, - unsigned long long, "=l", ); - -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", ); -__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", ); -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", ); -__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", ); -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", ); -__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", ); -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2, - "=l", ); - -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", ); -__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", ); -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", ); -__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", ); -__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", ); - -inline __device__ long __ldcg(const long *__ptr) { - unsigned long __ret; - if (sizeof(long) == 8) { - asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr)); - } else { - asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr)); +#pragma push_macro("__INTRINSIC_LOAD_LONG") +#define __INTRINSIC_LOAD_LONG(__Mode) \ + inline __device__ long __ld##__Mode(const long *__ptr) { \ + if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) { \ + return (long)__ld##__Mode((const long long *)__ptr); \ + } else { \ + return (long)__ld##__Mode((const int *)__ptr); \ + } \ } - return (long)__ret; -} - -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int, - "=r", : "memory"); -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short, - "=h", : "memory"); -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int, - "=r", : "memory"); -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long, - unsigned long long, "=l", : "memory"); - -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int, - "=r", : "memory"); -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int, - "=r", : "memory"); -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short, - "=h", : "memory"); -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int, - "=r", : "memory"); -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long, - "=l", : "memory"); - -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2, - "=r", : "memory"); -__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4, - "=r", : "memory"); -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2, - "=h", : "memory"); -__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4, - "=h", : "memory"); -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2, - "=r", : "memory"); -__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4, - "=r", : "memory"); -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2, - "=l", : "memory"); - -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory"); -__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory"); -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2, - "=h", : "memory"); -__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4, - "=h", : "memory"); -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory"); -__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory"); -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2, - "=l", : "memory"); - -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory"); -__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory"); - -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2, - "=f", : "memory"); -__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4, - "=f", : "memory"); -__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2, - "=d", : "memory"); - -inline __device__ long __ldcv(const long *__ptr) { - unsigned long __ret; - if (sizeof(long) == 8) { - asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr)); - } else { - asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr)); + +#pragma push_macro("__INTRINSIC_LOAD_ULONG") +#define __INTRINSIC_LOAD_ULONG(__Mode) \ + inline __device__ unsigned long __ld##__Mode(const unsigned long *__ptr) { \ + if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) { \ + return (unsigned long)__ld##__Mode((const unsigned long long *)__ptr); \ + } else { \ + return (unsigned long)__ld##__Mode((const unsigned int *)__ptr); \ + } \ } - return (long)__ret; -} - -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", ); -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", ); -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", ); -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", ); -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long, - "=l", ); - -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", ); -__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", ); -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", ); -__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", ); -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", ); -__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", ); -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", ); - -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int, - "=r", ); -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short, - "=h", ); -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int, - "=r", ); -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long, - unsigned long long, "=l", ); - -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", ); -__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", ); -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", ); -__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", ); -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", ); -__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", ); -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2, - "=l", ); - -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", ); -__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", ); -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", ); -__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", ); -__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", ); + +#pragma push_macro("__INTRINSIC_LOAD_FAMILY") +#define __INTRINSIC_LOAD_FAMILY(__Mode, __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s8", char, \ + unsigned int, "=r", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s8", signed char, \ + unsigned int, "=r", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s16", short, \ + unsigned short, "=h", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s32", int, \ + unsigned int, "=r", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s64", long long, \ + unsigned long long, "=l", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s8", char2, int2, \ + "=r", __Clobber) \ + __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s8", char4, int4, \ + "=r", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s16", short2, \ + short2, "=h", __Clobber) \ + __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s16", short4, \ + short4, "=h", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s32", int2, int2, \ + "=r", __Clobber) \ + __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s32", int4, int4, \ + "=r", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s64", longlong2, \ + longlong2, "=l", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u8", unsigned char, \ + unsigned int, "=r", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u16", unsigned short, \ + unsigned short, "=h", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u32", unsigned int, \ + unsigned int, "=r", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u64", \ + unsigned long long, unsigned long long, "=l", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u8", uchar2, \ + uint2, "=r", __Clobber) \ + __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u8", uchar4, \ + uint4, "=r", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u16", ushort2, \ + ushort2, "=h", __Clobber) \ + __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u16", ushort4, \ + ushort4, "=h", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u32", uint2, \ + uint2, "=r", __Clobber) \ + __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u32", uint4, \ + uint4, "=r", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u64", ulonglong2, \ + ulonglong2, "=l", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f32", float, float, \ + "=f", __Clobber) \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f64", double, double, \ + "=d", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.f32", float2, \ + float2, "=f", __Clobber) \ + __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.f32", float4, \ + float4, "=f", __Clobber) \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.f64", double2, \ + double2, "=d", __Clobber) \ + __INTRINSIC_LOAD_LONG(__Mode) \ + __INTRINSIC_LOAD_ULONG(__Mode) + + + + +__INTRINSIC_LOAD_FAMILY(cg, ) +__INTRINSIC_LOAD_FAMILY(ca, ) + + + + + +__INTRINSIC_LOAD_FAMILY(cv, : "memory") +__INTRINSIC_LOAD_FAMILY(lu, : "memory") + + + + + + +__INTRINSIC_LOAD_FAMILY(cs, ) + + + #pragma pop_macro("__INTRINSIC_LOAD") #pragma pop_macro("__INTRINSIC_LOAD2") #pragma pop_macro("__INTRINSIC_LOAD4") +#pragma pop_macro("__INTRINSIC_LOAD_FAMILY") +#pragma pop_macro("__INTRINSIC_LOAD_LONG") +#pragma pop_macro("__INTRINSIC_LOAD_ULONG") + + -inline __device__ long __ldcs(const long *__ptr) { - unsigned long __ret; - if (sizeof(long) == 8) { - asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr)); - } else { - asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr)); - } - return (long)__ret; -} #pragma push_macro("__INTRINSIC_STORE") #define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \ @@ -726,44 +672,100 @@ inline __device__ long __ldcs(const long *__ptr) { : "memory"); \ } -__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r"); -__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r"); -__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h"); -__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r"); -__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l"); - -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r"); -__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r"); -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h"); -__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h"); -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r"); -__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r"); -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l"); - -__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r"); -__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short, - "h"); -__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r"); -__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long, - unsigned long long, "l"); - -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r"); -__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r"); -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h"); -__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h"); -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r"); -__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r"); -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l"); - -__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f"); -__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d"); -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f"); -__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f"); -__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d"); +#pragma push_macro("__INTRINSIC_STORE_LONG") +#define __INTRINSIC_STORE_LONG(__Mode) \ + inline __device__ void __st##__Mode(long *__ptr, long __value) { \ + if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) { \ + __st##__Mode((long long *)__ptr, (long long)__value); \ + } else { \ + __st##__Mode((int *)__ptr, (int)__value); \ + } \ + } + +#pragma push_macro("__INTRINSIC_STORE_ULONG") +#define __INTRINSIC_STORE_ULONG(__Mode) \ + inline __device__ void __st##__Mode(unsigned long *__ptr, unsigned long __value) { \ + if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) { \ + __st##__Mode((unsigned long long *)__ptr, (unsigned long long)__value); \ + } else { \ + __st##__Mode((unsigned int *)__ptr, (unsigned int)__value); \ + } \ + } + +#pragma push_macro("__INTRINSIC_STORE_FAMILY") +#define __INTRINSIC_STORE_FAMILY(__Mode) \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s8", char, int, "r") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s8", signed char, \ + int, "r") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s16", short, short, \ + "h") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s32", int, int, "r") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s64", long long, \ + long long, "l") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s8", char2, \ + int2, "r") \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s8", char4, \ + int4, "r") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s16", short2, \ + short2, "h") \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s16", short4, \ + short4, "h") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s32", int2, \ + int2, "r") \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s32", int4, \ + int4, "r") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s64", longlong2, \ + longlong2, "l") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u8", unsigned char, \ + int, "r") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u16", unsigned short, \ + unsigned short, "h") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u32", unsigned int, \ + unsigned int, "r") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u64", \ + unsigned long long, unsigned long long, "l") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u8", uchar2, \ + uchar2, "r") \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u8", uchar4, \ + uint4, "r") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u16", ushort2, \ + ushort2, "h") \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u16", ushort4, \ + ushort4, "h") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u32", uint2, \ + uint2, "r") \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u32", uint4, \ + uint4, "r") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u64", ulonglong2, \ + ulonglong2, "l") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f32", float, float, \ + "f") \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f64", double, double, \ + "d") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f32", float2, \ + float2, "f") \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.f32", float4, \ + float4, "f") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f64", double2, \ + double2, "d") \ + __INTRINSIC_STORE_LONG(__Mode) \ + __INTRINSIC_STORE_ULONG(__Mode) + + +__INTRINSIC_STORE_FAMILY(wt) +__INTRINSIC_STORE_FAMILY(wb) +__INTRINSIC_STORE_FAMILY(cg) +__INTRINSIC_STORE_FAMILY(cs) + + + #pragma pop_macro("__INTRINSIC_STORE") #pragma pop_macro("__INTRINSIC_STORE2") #pragma pop_macro("__INTRINSIC_STORE4") +#pragma pop_macro("__INTRINSIC_STORE_FAMILY") +#pragma pop_macro("__INTRINSIC_STORE_LONG") +#pragma pop_macro("__INTRINSIC_STORE_ULONG") #endif // defined(__cplusplus) && (__cplusplus >= 201103L) #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 >From 957d335fa20a4bf2626259759fb84e3f00167837 Mon Sep 17 00:00:00 2001 From: Artem Belevich <[email protected]> Date: Tue, 31 Mar 2026 17:08:02 -0700 Subject: [PATCH 2/6] [clang][Headers] Add PTX verification test for CUDA load/store intrinsics. --- clang/test/Headers/cuda_intrinsics.cu | 486 ++++++++++++++++++++++++++ 1 file changed, 486 insertions(+) create mode 100644 clang/test/Headers/cuda_intrinsics.cu diff --git a/clang/test/Headers/cuda_intrinsics.cu b/clang/test/Headers/cuda_intrinsics.cu new file mode 100644 index 0000000000000..68e2d931acd79 --- /dev/null +++ b/clang/test/Headers/cuda_intrinsics.cu @@ -0,0 +1,486 @@ +// RUN: %clang++ -O1 -S --cuda-device-only --offload-arch=sm_32 -nocudalib -nocudainc %s -o - | FileCheck %s + +#define __device__ __attribute__((device)) +#define warpSize 32 +#define memcpy __builtin_memcpy + +// Define missing types for standalone testing +struct char2 { char x, y; }; +struct char4 { char x, y, z, w; }; +struct short2 { short x, y; }; +struct short4 { short x, y, z, w; }; +struct int2 { int x, y; }; +struct int4 { int x, y, z, w; }; +struct longlong2 { long long x, y; }; +struct uchar2 { unsigned char x, y; }; +struct uchar4 { unsigned char x, y, z, w; }; +struct ushort2 { unsigned short x, y; }; +struct ushort4 { unsigned short x, y, z, w; }; +struct uint2 { unsigned int x, y; }; +struct uint4 { unsigned int x, y, z, w; }; +struct ulonglong2 { unsigned long long x, y; }; +struct float2 { float x, y; }; +struct float4 { float x, y, z, w; }; +struct double2 { double x, y; }; + +#include "__clang_cuda_intrinsics.h" + +__device__ void test_loads_cg(void *ptr) { + // CHECK-LABEL: .func _Z13test_loads_cgPv + // CHECK: ld.global.cg.s8 + volatile char v_0 = __ldcg(((const char *)ptr) + 0); + // CHECK: ld.global.cg.s32 + volatile int v_1 = __ldcg(((const int *)ptr) + 1); + // CHECK: ld.global.cg.s64 + volatile long v_2 = __ldcg(((const long *)ptr) + 2); + // CHECK: ld.global.cg.u64 + volatile unsigned long v_3 = __ldcg(((const unsigned long *)ptr) + 3); + // CHECK: ld.global.cg.s64 + volatile long long v_4 = __ldcg(((const long long *)ptr) + 4); + // CHECK: ld.global.cg.f32 + volatile float v_5 = __ldcg(((const float *)ptr) + 5); + // CHECK: ld.global.cg.f64 + volatile double v_6 = __ldcg(((const double *)ptr) + 6); + // CHECK: ld.global.cg.v2.s8 + volatile char2 v_7 = __ldcg(((const char2 *)ptr) + 7); + // CHECK: ld.global.cg.v4.s8 + volatile char4 v_8 = __ldcg(((const char4 *)ptr) + 8); + // CHECK: ld.global.cg.v2.s16 + volatile short2 v_9 = __ldcg(((const short2 *)ptr) + 9); + // CHECK: ld.global.cg.v4.s16 + volatile short4 v_10 = __ldcg(((const short4 *)ptr) + 10); + // CHECK: ld.global.cg.v2.s32 + volatile int2 v_11 = __ldcg(((const int2 *)ptr) + 11); + // CHECK: ld.global.cg.v4.s32 + volatile int4 v_12 = __ldcg(((const int4 *)ptr) + 12); + // CHECK: ld.global.cg.v2.s64 + volatile longlong2 v_13 = __ldcg(((const longlong2 *)ptr) + 13); + // CHECK: ld.global.cg.v2.u8 + volatile uchar2 v_14 = __ldcg(((const uchar2 *)ptr) + 14); + // CHECK: ld.global.cg.v4.u8 + volatile uchar4 v_15 = __ldcg(((const uchar4 *)ptr) + 15); + // CHECK: ld.global.cg.v2.u16 + volatile ushort2 v_16 = __ldcg(((const ushort2 *)ptr) + 16); + // CHECK: ld.global.cg.v4.u16 + volatile ushort4 v_17 = __ldcg(((const ushort4 *)ptr) + 17); + // CHECK: ld.global.cg.v2.u32 + volatile uint2 v_18 = __ldcg(((const uint2 *)ptr) + 18); + // CHECK: ld.global.cg.v4.u32 + volatile uint4 v_19 = __ldcg(((const uint4 *)ptr) + 19); + // CHECK: ld.global.cg.v2.u64 + volatile ulonglong2 v_20 = __ldcg(((const ulonglong2 *)ptr) + 20); + // CHECK: ld.global.cg.v2.f32 + volatile float2 v_21 = __ldcg(((const float2 *)ptr) + 21); + // CHECK: ld.global.cg.v4.f32 + volatile float4 v_22 = __ldcg(((const float4 *)ptr) + 22); + // CHECK: ld.global.cg.v2.f64 + volatile double2 v_23 = __ldcg(((const double2 *)ptr) + 23); +} +__device__ void test_loads_cv(void *ptr) { + // CHECK-LABEL: .func _Z13test_loads_cvPv + // CHECK: ld.global.cv.s8 + volatile char v_0 = __ldcv(((const char *)ptr) + 0); + // CHECK: ld.global.cv.s32 + volatile int v_1 = __ldcv(((const int *)ptr) + 1); + // CHECK: ld.global.cv.s64 + volatile long v_2 = __ldcv(((const long *)ptr) + 2); + // CHECK: ld.global.cv.u64 + volatile unsigned long v_3 = __ldcv(((const unsigned long *)ptr) + 3); + // CHECK: ld.global.cv.s64 + volatile long long v_4 = __ldcv(((const long long *)ptr) + 4); + // CHECK: ld.global.cv.f32 + volatile float v_5 = __ldcv(((const float *)ptr) + 5); + // CHECK: ld.global.cv.f64 + volatile double v_6 = __ldcv(((const double *)ptr) + 6); + // CHECK: ld.global.cv.v2.s8 + volatile char2 v_7 = __ldcv(((const char2 *)ptr) + 7); + // CHECK: ld.global.cv.v4.s8 + volatile char4 v_8 = __ldcv(((const char4 *)ptr) + 8); + // CHECK: ld.global.cv.v2.s16 + volatile short2 v_9 = __ldcv(((const short2 *)ptr) + 9); + // CHECK: ld.global.cv.v4.s16 + volatile short4 v_10 = __ldcv(((const short4 *)ptr) + 10); + // CHECK: ld.global.cv.v2.s32 + volatile int2 v_11 = __ldcv(((const int2 *)ptr) + 11); + // CHECK: ld.global.cv.v4.s32 + volatile int4 v_12 = __ldcv(((const int4 *)ptr) + 12); + // CHECK: ld.global.cv.v2.s64 + volatile longlong2 v_13 = __ldcv(((const longlong2 *)ptr) + 13); + // CHECK: ld.global.cv.v2.u8 + volatile uchar2 v_14 = __ldcv(((const uchar2 *)ptr) + 14); + // CHECK: ld.global.cv.v4.u8 + volatile uchar4 v_15 = __ldcv(((const uchar4 *)ptr) + 15); + // CHECK: ld.global.cv.v2.u16 + volatile ushort2 v_16 = __ldcv(((const ushort2 *)ptr) + 16); + // CHECK: ld.global.cv.v4.u16 + volatile ushort4 v_17 = __ldcv(((const ushort4 *)ptr) + 17); + // CHECK: ld.global.cv.v2.u32 + volatile uint2 v_18 = __ldcv(((const uint2 *)ptr) + 18); + // CHECK: ld.global.cv.v4.u32 + volatile uint4 v_19 = __ldcv(((const uint4 *)ptr) + 19); + // CHECK: ld.global.cv.v2.u64 + volatile ulonglong2 v_20 = __ldcv(((const ulonglong2 *)ptr) + 20); + // CHECK: ld.global.cv.v2.f32 + volatile float2 v_21 = __ldcv(((const float2 *)ptr) + 21); + // CHECK: ld.global.cv.v4.f32 + volatile float4 v_22 = __ldcv(((const float4 *)ptr) + 22); + // CHECK: ld.global.cv.v2.f64 + volatile double2 v_23 = __ldcv(((const double2 *)ptr) + 23); +} +__device__ void test_loads_cs(void *ptr) { + // CHECK-LABEL: .func _Z13test_loads_csPv + // CHECK: ld.global.cs.s8 + volatile char v_0 = __ldcs(((const char *)ptr) + 0); + // CHECK: ld.global.cs.s32 + volatile int v_1 = __ldcs(((const int *)ptr) + 1); + // CHECK: ld.global.cs.s64 + volatile long v_2 = __ldcs(((const long *)ptr) + 2); + // CHECK: ld.global.cs.u64 + volatile unsigned long v_3 = __ldcs(((const unsigned long *)ptr) + 3); + // CHECK: ld.global.cs.s64 + volatile long long v_4 = __ldcs(((const long long *)ptr) + 4); + // CHECK: ld.global.cs.f32 + volatile float v_5 = __ldcs(((const float *)ptr) + 5); + // CHECK: ld.global.cs.f64 + volatile double v_6 = __ldcs(((const double *)ptr) + 6); + // CHECK: ld.global.cs.v2.s8 + volatile char2 v_7 = __ldcs(((const char2 *)ptr) + 7); + // CHECK: ld.global.cs.v4.s8 + volatile char4 v_8 = __ldcs(((const char4 *)ptr) + 8); + // CHECK: ld.global.cs.v2.s16 + volatile short2 v_9 = __ldcs(((const short2 *)ptr) + 9); + // CHECK: ld.global.cs.v4.s16 + volatile short4 v_10 = __ldcs(((const short4 *)ptr) + 10); + // CHECK: ld.global.cs.v2.s32 + volatile int2 v_11 = __ldcs(((const int2 *)ptr) + 11); + // CHECK: ld.global.cs.v4.s32 + volatile int4 v_12 = __ldcs(((const int4 *)ptr) + 12); + // CHECK: ld.global.cs.v2.s64 + volatile longlong2 v_13 = __ldcs(((const longlong2 *)ptr) + 13); + // CHECK: ld.global.cs.v2.u8 + volatile uchar2 v_14 = __ldcs(((const uchar2 *)ptr) + 14); + // CHECK: ld.global.cs.v4.u8 + volatile uchar4 v_15 = __ldcs(((const uchar4 *)ptr) + 15); + // CHECK: ld.global.cs.v2.u16 + volatile ushort2 v_16 = __ldcs(((const ushort2 *)ptr) + 16); + // CHECK: ld.global.cs.v4.u16 + volatile ushort4 v_17 = __ldcs(((const ushort4 *)ptr) + 17); + // CHECK: ld.global.cs.v2.u32 + volatile uint2 v_18 = __ldcs(((const uint2 *)ptr) + 18); + // CHECK: ld.global.cs.v4.u32 + volatile uint4 v_19 = __ldcs(((const uint4 *)ptr) + 19); + // CHECK: ld.global.cs.v2.u64 + volatile ulonglong2 v_20 = __ldcs(((const ulonglong2 *)ptr) + 20); + // CHECK: ld.global.cs.v2.f32 + volatile float2 v_21 = __ldcs(((const float2 *)ptr) + 21); + // CHECK: ld.global.cs.v4.f32 + volatile float4 v_22 = __ldcs(((const float4 *)ptr) + 22); + // CHECK: ld.global.cs.v2.f64 + volatile double2 v_23 = __ldcs(((const double2 *)ptr) + 23); +} +__device__ void test_loads_ca(void *ptr) { + // CHECK-LABEL: .func _Z13test_loads_caPv + // CHECK: ld.global.ca.s8 + volatile char v_0 = __ldca(((const char *)ptr) + 0); + // CHECK: ld.global.ca.s32 + volatile int v_1 = __ldca(((const int *)ptr) + 1); + // CHECK: ld.global.ca.s64 + volatile long v_2 = __ldca(((const long *)ptr) + 2); + // CHECK: ld.global.ca.u64 + volatile unsigned long v_3 = __ldca(((const unsigned long *)ptr) + 3); + // CHECK: ld.global.ca.s64 + volatile long long v_4 = __ldca(((const long long *)ptr) + 4); + // CHECK: ld.global.ca.f32 + volatile float v_5 = __ldca(((const float *)ptr) + 5); + // CHECK: ld.global.ca.f64 + volatile double v_6 = __ldca(((const double *)ptr) + 6); + // CHECK: ld.global.ca.v2.s8 + volatile char2 v_7 = __ldca(((const char2 *)ptr) + 7); + // CHECK: ld.global.ca.v4.s8 + volatile char4 v_8 = __ldca(((const char4 *)ptr) + 8); + // CHECK: ld.global.ca.v2.s16 + volatile short2 v_9 = __ldca(((const short2 *)ptr) + 9); + // CHECK: ld.global.ca.v4.s16 + volatile short4 v_10 = __ldca(((const short4 *)ptr) + 10); + // CHECK: ld.global.ca.v2.s32 + volatile int2 v_11 = __ldca(((const int2 *)ptr) + 11); + // CHECK: ld.global.ca.v4.s32 + volatile int4 v_12 = __ldca(((const int4 *)ptr) + 12); + // CHECK: ld.global.ca.v2.s64 + volatile longlong2 v_13 = __ldca(((const longlong2 *)ptr) + 13); + // CHECK: ld.global.ca.v2.u8 + volatile uchar2 v_14 = __ldca(((const uchar2 *)ptr) + 14); + // CHECK: ld.global.ca.v4.u8 + volatile uchar4 v_15 = __ldca(((const uchar4 *)ptr) + 15); + // CHECK: ld.global.ca.v2.u16 + volatile ushort2 v_16 = __ldca(((const ushort2 *)ptr) + 16); + // CHECK: ld.global.ca.v4.u16 + volatile ushort4 v_17 = __ldca(((const ushort4 *)ptr) + 17); + // CHECK: ld.global.ca.v2.u32 + volatile uint2 v_18 = __ldca(((const uint2 *)ptr) + 18); + // CHECK: ld.global.ca.v4.u32 + volatile uint4 v_19 = __ldca(((const uint4 *)ptr) + 19); + // CHECK: ld.global.ca.v2.u64 + volatile ulonglong2 v_20 = __ldca(((const ulonglong2 *)ptr) + 20); + // CHECK: ld.global.ca.v2.f32 + volatile float2 v_21 = __ldca(((const float2 *)ptr) + 21); + // CHECK: ld.global.ca.v4.f32 + volatile float4 v_22 = __ldca(((const float4 *)ptr) + 22); + // CHECK: ld.global.ca.v2.f64 + volatile double2 v_23 = __ldca(((const double2 *)ptr) + 23); +} +__device__ void test_loads_lu(void *ptr) { + // CHECK-LABEL: .func _Z13test_loads_luPv + // CHECK: ld.global.lu.s8 + volatile char v_0 = __ldlu(((const char *)ptr) + 0); + // CHECK: ld.global.lu.s32 + volatile int v_1 = __ldlu(((const int *)ptr) + 1); + // CHECK: ld.global.lu.s64 + volatile long v_2 = __ldlu(((const long *)ptr) + 2); + // CHECK: ld.global.lu.u64 + volatile unsigned long v_3 = __ldlu(((const unsigned long *)ptr) + 3); + // CHECK: ld.global.lu.s64 + volatile long long v_4 = __ldlu(((const long long *)ptr) + 4); + // CHECK: ld.global.lu.f32 + volatile float v_5 = __ldlu(((const float *)ptr) + 5); + // CHECK: ld.global.lu.f64 + volatile double v_6 = __ldlu(((const double *)ptr) + 6); + // CHECK: ld.global.lu.v2.s8 + volatile char2 v_7 = __ldlu(((const char2 *)ptr) + 7); + // CHECK: ld.global.lu.v4.s8 + volatile char4 v_8 = __ldlu(((const char4 *)ptr) + 8); + // CHECK: ld.global.lu.v2.s16 + volatile short2 v_9 = __ldlu(((const short2 *)ptr) + 9); + // CHECK: ld.global.lu.v4.s16 + volatile short4 v_10 = __ldlu(((const short4 *)ptr) + 10); + // CHECK: ld.global.lu.v2.s32 + volatile int2 v_11 = __ldlu(((const int2 *)ptr) + 11); + // CHECK: ld.global.lu.v4.s32 + volatile int4 v_12 = __ldlu(((const int4 *)ptr) + 12); + // CHECK: ld.global.lu.v2.s64 + volatile longlong2 v_13 = __ldlu(((const longlong2 *)ptr) + 13); + // CHECK: ld.global.lu.v2.u8 + volatile uchar2 v_14 = __ldlu(((const uchar2 *)ptr) + 14); + // CHECK: ld.global.lu.v4.u8 + volatile uchar4 v_15 = __ldlu(((const uchar4 *)ptr) + 15); + // CHECK: ld.global.lu.v2.u16 + volatile ushort2 v_16 = __ldlu(((const ushort2 *)ptr) + 16); + // CHECK: ld.global.lu.v4.u16 + volatile ushort4 v_17 = __ldlu(((const ushort4 *)ptr) + 17); + // CHECK: ld.global.lu.v2.u32 + volatile uint2 v_18 = __ldlu(((const uint2 *)ptr) + 18); + // CHECK: ld.global.lu.v4.u32 + volatile uint4 v_19 = __ldlu(((const uint4 *)ptr) + 19); + // CHECK: ld.global.lu.v2.u64 + volatile ulonglong2 v_20 = __ldlu(((const ulonglong2 *)ptr) + 20); + // CHECK: ld.global.lu.v2.f32 + volatile float2 v_21 = __ldlu(((const float2 *)ptr) + 21); + // CHECK: ld.global.lu.v4.f32 + volatile float4 v_22 = __ldlu(((const float4 *)ptr) + 22); + // CHECK: ld.global.lu.v2.f64 + volatile double2 v_23 = __ldlu(((const double2 *)ptr) + 23); +} +__device__ void test_stores_wt(void *ptr, int val) { + // CHECK-LABEL: .func _Z14test_stores_wtPvi + // CHECK: st.global.wt.s8 + __stwt(((char *)ptr) + 0, (char)val); + // CHECK: st.global.wt.s32 + __stwt(((int *)ptr) + 1, (int)val); + // CHECK: st.global.wt.s64 + __stwt(((long *)ptr) + 2, (long)val); + // CHECK: st.global.wt.u64 + __stwt(((unsigned long *)ptr) + 3, (unsigned long)val); + // CHECK: st.global.wt.s64 + __stwt(((long long *)ptr) + 4, (long long)val); + // CHECK: st.global.wt.f32 + __stwt(((float *)ptr) + 5, (float)val); + // CHECK: st.global.wt.f64 + __stwt(((double *)ptr) + 6, (double)val); + // CHECK: st.global.wt.v2.s8 + { char2 v = {(char)val, (char)val}; __stwt(((char2 *)ptr) + 7, v); } + // CHECK: st.global.wt.v4.s8 + { char4 v = {(char)val, (char)val, (char)val, (char)val}; __stwt(((char4 *)ptr) + 8, v); } + // CHECK: st.global.wt.v2.s16 + { short2 v = {(short)val, (short)val}; __stwt(((short2 *)ptr) + 9, v); } + // CHECK: st.global.wt.v4.s16 + { short4 v = {(short)val, (short)val, (short)val, (short)val}; __stwt(((short4 *)ptr) + 10, v); } + // CHECK: st.global.wt.v2.s32 + { int2 v = {(int)val, (int)val}; __stwt(((int2 *)ptr) + 11, v); } + // CHECK: st.global.wt.v4.s32 + { int4 v = {(int)val, (int)val, (int)val, (int)val}; __stwt(((int4 *)ptr) + 12, v); } + // CHECK: st.global.wt.v2.s64 + { longlong2 v = {(long long)val, (long long)val}; __stwt(((longlong2 *)ptr) + 13, v); } + // CHECK: st.global.wt.v2.u8 + { uchar2 v = {(unsigned char)val, (unsigned char)val}; __stwt(((uchar2 *)ptr) + 14, v); } + // CHECK: st.global.wt.v4.u8 + { uchar4 v = {(unsigned char)val, (unsigned char)val, (unsigned char)val, (unsigned char)val}; __stwt(((uchar4 *)ptr) + 15, v); } + // CHECK: st.global.wt.v2.u16 + { ushort2 v = {(unsigned short)val, (unsigned short)val}; __stwt(((ushort2 *)ptr) + 16, v); } + // CHECK: st.global.wt.v4.u16 + { ushort4 v = {(unsigned short)val, (unsigned short)val, (unsigned short)val, (unsigned short)val}; __stwt(((ushort4 *)ptr) + 17, v); } + // CHECK: st.global.wt.v2.u32 + { uint2 v = {(unsigned int)val, (unsigned int)val}; __stwt(((uint2 *)ptr) + 18, v); } + // CHECK: st.global.wt.v4.u32 + { uint4 v = {(unsigned int)val, (unsigned int)val, (unsigned int)val, (unsigned int)val}; __stwt(((uint4 *)ptr) + 19, v); } + // CHECK: st.global.wt.v2.u64 + { ulonglong2 v = {(unsigned long long)val, (unsigned long long)val}; __stwt(((ulonglong2 *)ptr) + 20, v); } + // CHECK: st.global.wt.v2.f32 + { float2 v = {(float)val, (float)val}; __stwt(((float2 *)ptr) + 21, v); } + // CHECK: st.global.wt.v4.f32 + { float4 v = {(float)val, (float)val, (float)val, (float)val}; __stwt(((float4 *)ptr) + 22, v); } + // CHECK: st.global.wt.v2.f64 + { double2 v = {(double)val, (double)val}; __stwt(((double2 *)ptr) + 23, v); } +} +__device__ void test_stores_wb(void *ptr, int val) { + // CHECK-LABEL: .func _Z14test_stores_wbPvi + // CHECK: st.global.wb.s8 + __stwb(((char *)ptr) + 0, (char)val); + // CHECK: st.global.wb.s32 + __stwb(((int *)ptr) + 1, (int)val); + // CHECK: st.global.wb.s64 + __stwb(((long *)ptr) + 2, (long)val); + // CHECK: st.global.wb.u64 + __stwb(((unsigned long *)ptr) + 3, (unsigned long)val); + // CHECK: st.global.wb.s64 + __stwb(((long long *)ptr) + 4, (long long)val); + // CHECK: st.global.wb.f32 + __stwb(((float *)ptr) + 5, (float)val); + // CHECK: st.global.wb.f64 + __stwb(((double *)ptr) + 6, (double)val); + // CHECK: st.global.wb.v2.s8 + { char2 v = {(char)val, (char)val}; __stwb(((char2 *)ptr) + 7, v); } + // CHECK: st.global.wb.v4.s8 + { char4 v = {(char)val, (char)val, (char)val, (char)val}; __stwb(((char4 *)ptr) + 8, v); } + // CHECK: st.global.wb.v2.s16 + { short2 v = {(short)val, (short)val}; __stwb(((short2 *)ptr) + 9, v); } + // CHECK: st.global.wb.v4.s16 + { short4 v = {(short)val, (short)val, (short)val, (short)val}; __stwb(((short4 *)ptr) + 10, v); } + // CHECK: st.global.wb.v2.s32 + { int2 v = {(int)val, (int)val}; __stwb(((int2 *)ptr) + 11, v); } + // CHECK: st.global.wb.v4.s32 + { int4 v = {(int)val, (int)val, (int)val, (int)val}; __stwb(((int4 *)ptr) + 12, v); } + // CHECK: st.global.wb.v2.s64 + { longlong2 v = {(long long)val, (long long)val}; __stwb(((longlong2 *)ptr) + 13, v); } + // CHECK: st.global.wb.v2.u8 + { uchar2 v = {(unsigned char)val, (unsigned char)val}; __stwb(((uchar2 *)ptr) + 14, v); } + // CHECK: st.global.wb.v4.u8 + { uchar4 v = {(unsigned char)val, (unsigned char)val, (unsigned char)val, (unsigned char)val}; __stwb(((uchar4 *)ptr) + 15, v); } + // CHECK: st.global.wb.v2.u16 + { ushort2 v = {(unsigned short)val, (unsigned short)val}; __stwb(((ushort2 *)ptr) + 16, v); } + // CHECK: st.global.wb.v4.u16 + { ushort4 v = {(unsigned short)val, (unsigned short)val, (unsigned short)val, (unsigned short)val}; __stwb(((ushort4 *)ptr) + 17, v); } + // CHECK: st.global.wb.v2.u32 + { uint2 v = {(unsigned int)val, (unsigned int)val}; __stwb(((uint2 *)ptr) + 18, v); } + // CHECK: st.global.wb.v4.u32 + { uint4 v = {(unsigned int)val, (unsigned int)val, (unsigned int)val, (unsigned int)val}; __stwb(((uint4 *)ptr) + 19, v); } + // CHECK: st.global.wb.v2.u64 + { ulonglong2 v = {(unsigned long long)val, (unsigned long long)val}; __stwb(((ulonglong2 *)ptr) + 20, v); } + // CHECK: st.global.wb.v2.f32 + { float2 v = {(float)val, (float)val}; __stwb(((float2 *)ptr) + 21, v); } + // CHECK: st.global.wb.v4.f32 + { float4 v = {(float)val, (float)val, (float)val, (float)val}; __stwb(((float4 *)ptr) + 22, v); } + // CHECK: st.global.wb.v2.f64 + { double2 v = {(double)val, (double)val}; __stwb(((double2 *)ptr) + 23, v); } +} +__device__ void test_stores_cg(void *ptr, int val) { + // CHECK-LABEL: .func _Z14test_stores_cgPvi + // CHECK: st.global.cg.s8 + __stcg(((char *)ptr) + 0, (char)val); + // CHECK: st.global.cg.s32 + __stcg(((int *)ptr) + 1, (int)val); + // CHECK: st.global.cg.s64 + __stcg(((long *)ptr) + 2, (long)val); + // CHECK: st.global.cg.u64 + __stcg(((unsigned long *)ptr) + 3, (unsigned long)val); + // CHECK: st.global.cg.s64 + __stcg(((long long *)ptr) + 4, (long long)val); + // CHECK: st.global.cg.f32 + __stcg(((float *)ptr) + 5, (float)val); + // CHECK: st.global.cg.f64 + __stcg(((double *)ptr) + 6, (double)val); + // CHECK: st.global.cg.v2.s8 + { char2 v = {(char)val, (char)val}; __stcg(((char2 *)ptr) + 7, v); } + // CHECK: st.global.cg.v4.s8 + { char4 v = {(char)val, (char)val, (char)val, (char)val}; __stcg(((char4 *)ptr) + 8, v); } + // CHECK: st.global.cg.v2.s16 + { short2 v = {(short)val, (short)val}; __stcg(((short2 *)ptr) + 9, v); } + // CHECK: st.global.cg.v4.s16 + { short4 v = {(short)val, (short)val, (short)val, (short)val}; __stcg(((short4 *)ptr) + 10, v); } + // CHECK: st.global.cg.v2.s32 + { int2 v = {(int)val, (int)val}; __stcg(((int2 *)ptr) + 11, v); } + // CHECK: st.global.cg.v4.s32 + { int4 v = {(int)val, (int)val, (int)val, (int)val}; __stcg(((int4 *)ptr) + 12, v); } + // CHECK: st.global.cg.v2.s64 + { longlong2 v = {(long long)val, (long long)val}; __stcg(((longlong2 *)ptr) + 13, v); } + // CHECK: st.global.cg.v2.u8 + { uchar2 v = {(unsigned char)val, (unsigned char)val}; __stcg(((uchar2 *)ptr) + 14, v); } + // CHECK: st.global.cg.v4.u8 + { uchar4 v = {(unsigned char)val, (unsigned char)val, (unsigned char)val, (unsigned char)val}; __stcg(((uchar4 *)ptr) + 15, v); } + // CHECK: st.global.cg.v2.u16 + { ushort2 v = {(unsigned short)val, (unsigned short)val}; __stcg(((ushort2 *)ptr) + 16, v); } + // CHECK: st.global.cg.v4.u16 + { ushort4 v = {(unsigned short)val, (unsigned short)val, (unsigned short)val, (unsigned short)val}; __stcg(((ushort4 *)ptr) + 17, v); } + // CHECK: st.global.cg.v2.u32 + { uint2 v = {(unsigned int)val, (unsigned int)val}; __stcg(((uint2 *)ptr) + 18, v); } + // CHECK: st.global.cg.v4.u32 + { uint4 v = {(unsigned int)val, (unsigned int)val, (unsigned int)val, (unsigned int)val}; __stcg(((uint4 *)ptr) + 19, v); } + // CHECK: st.global.cg.v2.u64 + { ulonglong2 v = {(unsigned long long)val, (unsigned long long)val}; __stcg(((ulonglong2 *)ptr) + 20, v); } + // CHECK: st.global.cg.v2.f32 + { float2 v = {(float)val, (float)val}; __stcg(((float2 *)ptr) + 21, v); } + // CHECK: st.global.cg.v4.f32 + { float4 v = {(float)val, (float)val, (float)val, (float)val}; __stcg(((float4 *)ptr) + 22, v); } + // CHECK: st.global.cg.v2.f64 + { double2 v = {(double)val, (double)val}; __stcg(((double2 *)ptr) + 23, v); } +} +__device__ void test_stores_cs(void *ptr, int val) { + // CHECK-LABEL: .func _Z14test_stores_csPvi + // CHECK: st.global.cs.s8 + __stcs(((char *)ptr) + 0, (char)val); + // CHECK: st.global.cs.s32 + __stcs(((int *)ptr) + 1, (int)val); + // CHECK: st.global.cs.s64 + __stcs(((long *)ptr) + 2, (long)val); + // CHECK: st.global.cs.u64 + __stcs(((unsigned long *)ptr) + 3, (unsigned long)val); + // CHECK: st.global.cs.s64 + __stcs(((long long *)ptr) + 4, (long long)val); + // CHECK: st.global.cs.f32 + __stcs(((float *)ptr) + 5, (float)val); + // CHECK: st.global.cs.f64 + __stcs(((double *)ptr) + 6, (double)val); + // CHECK: st.global.cs.v2.s8 + { char2 v = {(char)val, (char)val}; __stcs(((char2 *)ptr) + 7, v); } + // CHECK: st.global.cs.v4.s8 + { char4 v = {(char)val, (char)val, (char)val, (char)val}; __stcs(((char4 *)ptr) + 8, v); } + // CHECK: st.global.cs.v2.s16 + { short2 v = {(short)val, (short)val}; __stcs(((short2 *)ptr) + 9, v); } + // CHECK: st.global.cs.v4.s16 + { short4 v = {(short)val, (short)val, (short)val, (short)val}; __stcs(((short4 *)ptr) + 10, v); } + // CHECK: st.global.cs.v2.s32 + { int2 v = {(int)val, (int)val}; __stcs(((int2 *)ptr) + 11, v); } + // CHECK: st.global.cs.v4.s32 + { int4 v = {(int)val, (int)val, (int)val, (int)val}; __stcs(((int4 *)ptr) + 12, v); } + // CHECK: st.global.cs.v2.s64 + { longlong2 v = {(long long)val, (long long)val}; __stcs(((longlong2 *)ptr) + 13, v); } + // CHECK: st.global.cs.v2.u8 + { uchar2 v = {(unsigned char)val, (unsigned char)val}; __stcs(((uchar2 *)ptr) + 14, v); } + // CHECK: st.global.cs.v4.u8 + { uchar4 v = {(unsigned char)val, (unsigned char)val, (unsigned char)val, (unsigned char)val}; __stcs(((uchar4 *)ptr) + 15, v); } + // CHECK: st.global.cs.v2.u16 + { ushort2 v = {(unsigned short)val, (unsigned short)val}; __stcs(((ushort2 *)ptr) + 16, v); } + // CHECK: st.global.cs.v4.u16 + { ushort4 v = {(unsigned short)val, (unsigned short)val, (unsigned short)val, (unsigned short)val}; __stcs(((ushort4 *)ptr) + 17, v); } + // CHECK: st.global.cs.v2.u32 + { uint2 v = {(unsigned int)val, (unsigned int)val}; __stcs(((uint2 *)ptr) + 18, v); } + // CHECK: st.global.cs.v4.u32 + { uint4 v = {(unsigned int)val, (unsigned int)val, (unsigned int)val, (unsigned int)val}; __stcs(((uint4 *)ptr) + 19, v); } + // CHECK: st.global.cs.v2.u64 + { ulonglong2 v = {(unsigned long long)val, (unsigned long long)val}; __stcs(((ulonglong2 *)ptr) + 20, v); } + // CHECK: st.global.cs.v2.f32 + { float2 v = {(float)val, (float)val}; __stcs(((float2 *)ptr) + 21, v); } + // CHECK: st.global.cs.v4.f32 + { float4 v = {(float)val, (float)val, (float)val, (float)val}; __stcs(((float4 *)ptr) + 22, v); } + // CHECK: st.global.cs.v2.f64 + { double2 v = {(double)val, (double)val}; __stcs(((double2 *)ptr) + 23, v); } +} >From 22c88c70ccdaa2e1bf20e7cd94143a005b0ec97e Mon Sep 17 00:00:00 2001 From: Artem Belevich <[email protected]> Date: Tue, 31 Mar 2026 17:15:30 -0700 Subject: [PATCH 3/6] clang-format --- clang/lib/Headers/__clang_cuda_intrinsics.h | 115 ++++++++------------ 1 file changed, 47 insertions(+), 68 deletions(-) diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h index 9507b6ce59e82..4c34f742f9a81 100644 --- a/clang/lib/Headers/__clang_cuda_intrinsics.h +++ b/clang/lib/Headers/__clang_cuda_intrinsics.h @@ -528,19 +528,19 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, inline __device__ long __ld##__Mode(const long *__ptr) { \ if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) { \ return (long)__ld##__Mode((const long long *)__ptr); \ - } else { \ - return (long)__ld##__Mode((const int *)__ptr); \ - } \ + } else { \ + return (long)__ld##__Mode((const int *)__ptr); \ + } \ } #pragma push_macro("__INTRINSIC_LOAD_ULONG") -#define __INTRINSIC_LOAD_ULONG(__Mode) \ +#define __INTRINSIC_LOAD_ULONG(__Mode) \ inline __device__ unsigned long __ld##__Mode(const unsigned long *__ptr) { \ if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) { \ return (unsigned long)__ld##__Mode((const unsigned long long *)__ptr); \ - } else { \ + } else { \ return (unsigned long)__ld##__Mode((const unsigned int *)__ptr); \ - } \ + } \ } #pragma push_macro("__INTRINSIC_LOAD_FAMILY") @@ -569,11 +569,11 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, "=r", __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s64", longlong2, \ longlong2, "=l", __Clobber) \ - __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u8", unsigned char, \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u8", unsigned char, \ unsigned int, "=r", __Clobber) \ - __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u16", unsigned short, \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u16", unsigned short, \ unsigned short, "=h", __Clobber) \ - __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u32", unsigned int, \ + __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u32", unsigned int, \ unsigned int, "=r", __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u64", \ unsigned long long, unsigned long long, "=l", __Clobber) \ @@ -589,7 +589,7 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, uint2, "=r", __Clobber) \ __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u32", uint4, \ uint4, "=r", __Clobber) \ - __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u64", ulonglong2, \ + __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u64", ulonglong2, \ ulonglong2, "=l", __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f32", float, float, \ "=f", __Clobber) \ @@ -604,29 +604,14 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, __INTRINSIC_LOAD_LONG(__Mode) \ __INTRINSIC_LOAD_ULONG(__Mode) - - - __INTRINSIC_LOAD_FAMILY(cg, ) __INTRINSIC_LOAD_FAMILY(ca, ) - - - - __INTRINSIC_LOAD_FAMILY(cv, : "memory") __INTRINSIC_LOAD_FAMILY(lu, : "memory") - - - - - __INTRINSIC_LOAD_FAMILY(cs, ) - - - #pragma pop_macro("__INTRINSIC_LOAD") #pragma pop_macro("__INTRINSIC_LOAD2") #pragma pop_macro("__INTRINSIC_LOAD4") @@ -634,9 +619,6 @@ __INTRINSIC_LOAD_FAMILY(cs, ) #pragma pop_macro("__INTRINSIC_LOAD_LONG") #pragma pop_macro("__INTRINSIC_LOAD_ULONG") - - - #pragma push_macro("__INTRINSIC_STORE") #define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \ inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \ @@ -673,23 +655,24 @@ __INTRINSIC_LOAD_FAMILY(cs, ) } #pragma push_macro("__INTRINSIC_STORE_LONG") -#define __INTRINSIC_STORE_LONG(__Mode) \ - inline __device__ void __st##__Mode(long *__ptr, long __value) { \ +#define __INTRINSIC_STORE_LONG(__Mode) \ + inline __device__ void __st##__Mode(long *__ptr, long __value) { \ if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) { \ - __st##__Mode((long long *)__ptr, (long long)__value); \ - } else { \ - __st##__Mode((int *)__ptr, (int)__value); \ - } \ + __st##__Mode((long long *)__ptr, (long long)__value); \ + } else { \ + __st##__Mode((int *)__ptr, (int)__value); \ + } \ } #pragma push_macro("__INTRINSIC_STORE_ULONG") -#define __INTRINSIC_STORE_ULONG(__Mode) \ - inline __device__ void __st##__Mode(unsigned long *__ptr, unsigned long __value) { \ +#define __INTRINSIC_STORE_ULONG(__Mode) \ + inline __device__ void __st##__Mode(unsigned long *__ptr, \ + unsigned long __value) { \ if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) { \ - __st##__Mode((unsigned long long *)__ptr, (unsigned long long)__value); \ - } else { \ - __st##__Mode((unsigned int *)__ptr, (unsigned int)__value); \ - } \ + __st##__Mode((unsigned long long *)__ptr, (unsigned long long)__value); \ + } else { \ + __st##__Mode((unsigned int *)__ptr, (unsigned int)__value); \ + } \ } #pragma push_macro("__INTRINSIC_STORE_FAMILY") @@ -700,20 +683,20 @@ __INTRINSIC_LOAD_FAMILY(cs, ) __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s16", short, short, \ "h") \ __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s32", int, int, "r") \ - __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s64", long long, \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s64", long long, \ long long, "l") \ - __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s8", char2, \ - int2, "r") \ - __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s8", char4, \ - int4, "r") \ - __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s16", short2, \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s8", char2, int2, \ + "r") \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s8", char4, int4, \ + "r") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s16", short2, \ short2, "h") \ - __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s16", short4, \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s16", short4, \ short4, "h") \ - __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s32", int2, \ - int2, "r") \ - __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s32", int4, \ - int4, "r") \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s32", int2, int2, \ + "r") \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s32", int4, int4, \ + "r") \ __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s64", longlong2, \ longlong2, "l") \ __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u8", unsigned char, \ @@ -724,42 +707,38 @@ __INTRINSIC_LOAD_FAMILY(cs, ) unsigned int, "r") \ __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u64", \ unsigned long long, unsigned long long, "l") \ - __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u8", uchar2, \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u8", uchar2, \ uchar2, "r") \ - __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u8", uchar4, \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u8", uchar4, \ uint4, "r") \ - __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u16", ushort2, \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u16", ushort2, \ ushort2, "h") \ - __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u16", ushort4, \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u16", ushort4, \ ushort4, "h") \ - __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u32", uint2, \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u32", uint2, \ uint2, "r") \ - __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u32", uint4, \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u32", uint4, \ uint4, "r") \ __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u64", ulonglong2, \ ulonglong2, "l") \ - __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f32", float, float, \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f32", float, float, \ "f") \ - __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f64", double, double, \ + __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f64", double, double, \ "d") \ - __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f32", float2, \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f32", float2, \ float2, "f") \ - __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.f32", float4, \ + __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.f32", float4, \ float4, "f") \ - __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f64", double2, \ - double2, "d") \ - __INTRINSIC_STORE_LONG(__Mode) \ + __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f64", double2, \ + double2, "d") \ + __INTRINSIC_STORE_LONG(__Mode) \ __INTRINSIC_STORE_ULONG(__Mode) - __INTRINSIC_STORE_FAMILY(wt) __INTRINSIC_STORE_FAMILY(wb) __INTRINSIC_STORE_FAMILY(cg) __INTRINSIC_STORE_FAMILY(cs) - - - #pragma pop_macro("__INTRINSIC_STORE") #pragma pop_macro("__INTRINSIC_STORE2") #pragma pop_macro("__INTRINSIC_STORE4") >From 86bd3f4f2f2666475fabf1f07785aaec7447bce4 Mon Sep 17 00:00:00 2001 From: Artem Belevich <[email protected]> Date: Tue, 31 Mar 2026 17:17:05 -0700 Subject: [PATCH 4/6] minor cleanup --- clang/lib/Headers/__clang_cuda_intrinsics.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h index 4c34f742f9a81..1ceb540fbd746 100644 --- a/clang/lib/Headers/__clang_cuda_intrinsics.h +++ b/clang/lib/Headers/__clang_cuda_intrinsics.h @@ -604,20 +604,18 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, __INTRINSIC_LOAD_LONG(__Mode) \ __INTRINSIC_LOAD_ULONG(__Mode) -__INTRINSIC_LOAD_FAMILY(cg, ) -__INTRINSIC_LOAD_FAMILY(ca, ) - +__INTRINSIC_LOAD_FAMILY(ca, /* no clobber */) +__INTRINSIC_LOAD_FAMILY(cg, /* no clobber */) +__INTRINSIC_LOAD_FAMILY(cs, /* no clobber */) __INTRINSIC_LOAD_FAMILY(cv, : "memory") __INTRINSIC_LOAD_FAMILY(lu, : "memory") -__INTRINSIC_LOAD_FAMILY(cs, ) - #pragma pop_macro("__INTRINSIC_LOAD") #pragma pop_macro("__INTRINSIC_LOAD2") #pragma pop_macro("__INTRINSIC_LOAD4") -#pragma pop_macro("__INTRINSIC_LOAD_FAMILY") #pragma pop_macro("__INTRINSIC_LOAD_LONG") #pragma pop_macro("__INTRINSIC_LOAD_ULONG") +#pragma pop_macro("__INTRINSIC_LOAD_FAMILY") #pragma push_macro("__INTRINSIC_STORE") #define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \ @@ -734,17 +732,17 @@ __INTRINSIC_LOAD_FAMILY(cs, ) __INTRINSIC_STORE_LONG(__Mode) \ __INTRINSIC_STORE_ULONG(__Mode) -__INTRINSIC_STORE_FAMILY(wt) -__INTRINSIC_STORE_FAMILY(wb) __INTRINSIC_STORE_FAMILY(cg) __INTRINSIC_STORE_FAMILY(cs) +__INTRINSIC_STORE_FAMILY(wb) +__INTRINSIC_STORE_FAMILY(wt) #pragma pop_macro("__INTRINSIC_STORE") #pragma pop_macro("__INTRINSIC_STORE2") #pragma pop_macro("__INTRINSIC_STORE4") -#pragma pop_macro("__INTRINSIC_STORE_FAMILY") #pragma pop_macro("__INTRINSIC_STORE_LONG") #pragma pop_macro("__INTRINSIC_STORE_ULONG") +#pragma pop_macro("__INTRINSIC_STORE_FAMILY") #endif // defined(__cplusplus) && (__cplusplus >= 201103L) #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 >From 421c087a144fca1d30710be61d0ba957a3499e95 Mon Sep 17 00:00:00 2001 From: Artem Belevich <[email protected]> Date: Wed, 1 Apr 2026 11:19:19 -0700 Subject: [PATCH 5/6] [Clang][CUDA] Refactor and consolidate CUDA load/store intrinsics - Parameterize volatile for load macros - Use __asm__ __volatile__ for cached loads (ca, cg, cs) - Use plain __asm__ with memory clobbers for uncached loads - Replace plain asm with __asm__ for consistency - Add support for unsigned long - Update test suite to verify loads are preserved --- clang/lib/Headers/__clang_cuda_intrinsics.h | 134 +++++++++--------- clang/test/Headers/cuda_intrinsics.cu | 144 ++++++++++---------- 2 files changed, 140 insertions(+), 138 deletions(-) diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h index 1ceb540fbd746..2412f9e51ffbf 100644 --- a/clang/lib/Headers/__clang_cuda_intrinsics.h +++ b/clang/lib/Headers/__clang_cuda_intrinsics.h @@ -449,33 +449,33 @@ inline __device__ double2 __ldg(const double2 *ptr) { inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32, unsigned shiftWidth) { unsigned result; - asm("shf.l.wrap.b32 %0, %1, %2, %3;" - : "=r"(result) - : "r"(low32), "r"(high32), "r"(shiftWidth)); + __asm__("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result) + : "r"(low32), "r"(high32), "r"(shiftWidth)); return result; } inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32, unsigned shiftWidth) { unsigned result; - asm("shf.l.clamp.b32 %0, %1, %2, %3;" - : "=r"(result) - : "r"(low32), "r"(high32), "r"(shiftWidth)); + __asm__("shf.l.clamp.b32 %0, %1, %2, %3;" + : "=r"(result) + : "r"(low32), "r"(high32), "r"(shiftWidth)); return result; } inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32, unsigned shiftWidth) { unsigned result; - asm("shf.r.wrap.b32 %0, %1, %2, %3;" - : "=r"(result) - : "r"(low32), "r"(high32), "r"(shiftWidth)); + __asm__("shf.r.wrap.b32 %0, %1, %2, %3;" + : "=r"(result) + : "r"(low32), "r"(high32), "r"(shiftWidth)); return result; } inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, unsigned shiftWidth) { unsigned ret; - asm("shf.r.clamp.b32 %0, %1, %2, %3;" - : "=r"(ret) - : "r"(low32), "r"(high32), "r"(shiftWidth)); + __asm__("shf.r.clamp.b32 %0, %1, %2, %3;" + : "=r"(ret) + : "r"(low32), "r"(high32), "r"(shiftWidth)); return ret; } @@ -483,38 +483,40 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, #pragma push_macro("__INTRINSIC_LOAD") #define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \ - __Clobber) \ + __Volatile, __Clobber) \ inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \ __TmpType __ret; \ - asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber); \ + __asm__ __Volatile(__AsmOp " %0, [%1];" \ + : __AsmType(__ret) \ + : "l"(__ptr)__Clobber); \ return (__DeclType)__ret; \ } #pragma push_macro("__INTRINSIC_LOAD2") #define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \ - __Clobber) \ + __Volatile, __Clobber) \ inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \ __DeclType __ret; \ __TmpType __tmp; \ - asm(__AsmOp " {%0,%1}, [%2];" \ - : __AsmType(__tmp.x), __AsmType(__tmp.y) \ - : "l"(__ptr)__Clobber); \ + __asm__ __Volatile(__AsmOp " {%0,%1}, [%2];" \ + : __AsmType(__tmp.x), __AsmType(__tmp.y) \ + : "l"(__ptr)__Clobber); \ using __ElementType = decltype(__ret.x); \ - __ret.x = (__ElementType)(__tmp.x); \ + __ret.x = (__ElementType)__tmp.x; \ __ret.y = (__ElementType)__tmp.y; \ return __ret; \ } #pragma push_macro("__INTRINSIC_LOAD4") #define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \ - __Clobber) \ + __Volatile, __Clobber) \ inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \ __DeclType __ret; \ __TmpType __tmp; \ - asm(__AsmOp " {%0,%1,%2,%3}, [%4];" \ - : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z), \ - __AsmType(__tmp.w) \ - : "l"(__ptr)__Clobber); \ + __asm__ __Volatile(__AsmOp " {%0,%1,%2,%3}, [%4];" \ + : __AsmType(__tmp.x), __AsmType(__tmp.y), \ + __AsmType(__tmp.z), __AsmType(__tmp.w) \ + : "l"(__ptr)__Clobber); \ using __ElementType = decltype(__ret.x); \ __ret.x = (__ElementType)__tmp.x; \ __ret.y = (__ElementType)__tmp.y; \ @@ -543,72 +545,72 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, } \ } -#pragma push_macro("__INTRINSIC_LOAD_FAMILY") -#define __INTRINSIC_LOAD_FAMILY(__Mode, __Clobber) \ +#define __INTRINSIC_LOAD_FAMILY(__Mode, __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s8", char, \ - unsigned int, "=r", __Clobber) \ + unsigned int, "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s8", signed char, \ - unsigned int, "=r", __Clobber) \ + unsigned int, "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s16", short, \ - unsigned short, "=h", __Clobber) \ + unsigned short, "=h", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s32", int, \ - unsigned int, "=r", __Clobber) \ + unsigned int, "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s64", long long, \ - unsigned long long, "=l", __Clobber) \ + unsigned long long, "=l", __Volatile, __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s8", char2, int2, \ - "=r", __Clobber) \ + "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s8", char4, int4, \ - "=r", __Clobber) \ + "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s16", short2, \ - short2, "=h", __Clobber) \ + short2, "=h", __Volatile, __Clobber) \ __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s16", short4, \ - short4, "=h", __Clobber) \ + short4, "=h", __Volatile, __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s32", int2, int2, \ - "=r", __Clobber) \ + "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s32", int4, int4, \ - "=r", __Clobber) \ + "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s64", longlong2, \ - longlong2, "=l", __Clobber) \ + longlong2, "=l", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u8", unsigned char, \ - unsigned int, "=r", __Clobber) \ + unsigned int, "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u16", unsigned short, \ - unsigned short, "=h", __Clobber) \ + unsigned short, "=h", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u32", unsigned int, \ - unsigned int, "=r", __Clobber) \ + unsigned int, "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u64", \ - unsigned long long, unsigned long long, "=l", __Clobber) \ + unsigned long long, unsigned long long, "=l", __Volatile, \ + __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u8", uchar2, \ - uint2, "=r", __Clobber) \ + uint2, "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u8", uchar4, \ - uint4, "=r", __Clobber) \ + uint4, "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u16", ushort2, \ - ushort2, "=h", __Clobber) \ + ushort2, "=h", __Volatile, __Clobber) \ __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u16", ushort4, \ - ushort4, "=h", __Clobber) \ + ushort4, "=h", __Volatile, __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u32", uint2, \ - uint2, "=r", __Clobber) \ + uint2, "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u32", uint4, \ - uint4, "=r", __Clobber) \ + uint4, "=r", __Volatile, __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u64", ulonglong2, \ - ulonglong2, "=l", __Clobber) \ + ulonglong2, "=l", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f32", float, float, \ - "=f", __Clobber) \ + "=f", __Volatile, __Clobber) \ __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f64", double, double, \ - "=d", __Clobber) \ + "=d", __Volatile, __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.f32", float2, \ - float2, "=f", __Clobber) \ + float2, "=f", __Volatile, __Clobber) \ __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.f32", float4, \ - float4, "=f", __Clobber) \ + float4, "=f", __Volatile, __Clobber) \ __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.f64", double2, \ - double2, "=d", __Clobber) \ + double2, "=d", __Volatile, __Clobber) \ __INTRINSIC_LOAD_LONG(__Mode) \ __INTRINSIC_LOAD_ULONG(__Mode) -__INTRINSIC_LOAD_FAMILY(ca, /* no clobber */) -__INTRINSIC_LOAD_FAMILY(cg, /* no clobber */) -__INTRINSIC_LOAD_FAMILY(cs, /* no clobber */) -__INTRINSIC_LOAD_FAMILY(cv, : "memory") -__INTRINSIC_LOAD_FAMILY(lu, : "memory") +__INTRINSIC_LOAD_FAMILY(ca, __volatile__, /* no clobber */) +__INTRINSIC_LOAD_FAMILY(cg, __volatile__, /* no clobber */) +__INTRINSIC_LOAD_FAMILY(cs, __volatile__, /* no clobber */) +__INTRINSIC_LOAD_FAMILY(cv, /* not volatile */, : "memory") +__INTRINSIC_LOAD_FAMILY(lu, /* not volatile */, : "memory") #pragma pop_macro("__INTRINSIC_LOAD") #pragma pop_macro("__INTRINSIC_LOAD2") @@ -621,7 +623,7 @@ __INTRINSIC_LOAD_FAMILY(lu, : "memory") #define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \ inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \ __TmpType __tmp = (__TmpType)__value; \ - asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \ + __asm__(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \ } #pragma push_macro("__INTRINSIC_STORE2") @@ -632,9 +634,9 @@ __INTRINSIC_LOAD_FAMILY(lu, : "memory") using __ElementType = decltype(__tmp.x); \ __tmp.x = (__ElementType)(__value.x); \ __tmp.y = (__ElementType)(__value.y); \ - asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \ - __AsmType(__tmp.y) \ - : "memory"); \ + __asm__(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \ + __AsmType(__tmp.y) \ + : "memory"); \ } #pragma push_macro("__INTRINSIC_STORE4") @@ -647,9 +649,9 @@ __INTRINSIC_LOAD_FAMILY(lu, : "memory") __tmp.y = (__ElementType)(__value.y); \ __tmp.z = (__ElementType)(__value.z); \ __tmp.w = (__ElementType)(__value.w); \ - asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \ - __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \ - : "memory"); \ + __asm__(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \ + __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \ + : "memory"); \ } #pragma push_macro("__INTRINSIC_STORE_LONG") diff --git a/clang/test/Headers/cuda_intrinsics.cu b/clang/test/Headers/cuda_intrinsics.cu index 68e2d931acd79..ed082f54a28dc 100644 --- a/clang/test/Headers/cuda_intrinsics.cu +++ b/clang/test/Headers/cuda_intrinsics.cu @@ -28,53 +28,53 @@ struct double2 { double x, y; }; __device__ void test_loads_cg(void *ptr) { // CHECK-LABEL: .func _Z13test_loads_cgPv // CHECK: ld.global.cg.s8 - volatile char v_0 = __ldcg(((const char *)ptr) + 0); + __ldcg(((const char *)ptr) + 0); // CHECK: ld.global.cg.s32 - volatile int v_1 = __ldcg(((const int *)ptr) + 1); + __ldcg(((const int *)ptr) + 1); // CHECK: ld.global.cg.s64 - volatile long v_2 = __ldcg(((const long *)ptr) + 2); + __ldcg(((const long *)ptr) + 2); // CHECK: ld.global.cg.u64 - volatile unsigned long v_3 = __ldcg(((const unsigned long *)ptr) + 3); + __ldcg(((const unsigned long *)ptr) + 3); // CHECK: ld.global.cg.s64 - volatile long long v_4 = __ldcg(((const long long *)ptr) + 4); + __ldcg(((const long long *)ptr) + 4); // CHECK: ld.global.cg.f32 - volatile float v_5 = __ldcg(((const float *)ptr) + 5); + __ldcg(((const float *)ptr) + 5); // CHECK: ld.global.cg.f64 - volatile double v_6 = __ldcg(((const double *)ptr) + 6); + __ldcg(((const double *)ptr) + 6); // CHECK: ld.global.cg.v2.s8 - volatile char2 v_7 = __ldcg(((const char2 *)ptr) + 7); + __ldcg(((const char2 *)ptr) + 7); // CHECK: ld.global.cg.v4.s8 - volatile char4 v_8 = __ldcg(((const char4 *)ptr) + 8); + __ldcg(((const char4 *)ptr) + 8); // CHECK: ld.global.cg.v2.s16 - volatile short2 v_9 = __ldcg(((const short2 *)ptr) + 9); + __ldcg(((const short2 *)ptr) + 9); // CHECK: ld.global.cg.v4.s16 - volatile short4 v_10 = __ldcg(((const short4 *)ptr) + 10); + __ldcg(((const short4 *)ptr) + 10); // CHECK: ld.global.cg.v2.s32 - volatile int2 v_11 = __ldcg(((const int2 *)ptr) + 11); + __ldcg(((const int2 *)ptr) + 11); // CHECK: ld.global.cg.v4.s32 - volatile int4 v_12 = __ldcg(((const int4 *)ptr) + 12); + __ldcg(((const int4 *)ptr) + 12); // CHECK: ld.global.cg.v2.s64 - volatile longlong2 v_13 = __ldcg(((const longlong2 *)ptr) + 13); + __ldcg(((const longlong2 *)ptr) + 13); // CHECK: ld.global.cg.v2.u8 - volatile uchar2 v_14 = __ldcg(((const uchar2 *)ptr) + 14); + __ldcg(((const uchar2 *)ptr) + 14); // CHECK: ld.global.cg.v4.u8 - volatile uchar4 v_15 = __ldcg(((const uchar4 *)ptr) + 15); + __ldcg(((const uchar4 *)ptr) + 15); // CHECK: ld.global.cg.v2.u16 - volatile ushort2 v_16 = __ldcg(((const ushort2 *)ptr) + 16); + __ldcg(((const ushort2 *)ptr) + 16); // CHECK: ld.global.cg.v4.u16 - volatile ushort4 v_17 = __ldcg(((const ushort4 *)ptr) + 17); + __ldcg(((const ushort4 *)ptr) + 17); // CHECK: ld.global.cg.v2.u32 - volatile uint2 v_18 = __ldcg(((const uint2 *)ptr) + 18); + __ldcg(((const uint2 *)ptr) + 18); // CHECK: ld.global.cg.v4.u32 - volatile uint4 v_19 = __ldcg(((const uint4 *)ptr) + 19); + __ldcg(((const uint4 *)ptr) + 19); // CHECK: ld.global.cg.v2.u64 - volatile ulonglong2 v_20 = __ldcg(((const ulonglong2 *)ptr) + 20); + __ldcg(((const ulonglong2 *)ptr) + 20); // CHECK: ld.global.cg.v2.f32 - volatile float2 v_21 = __ldcg(((const float2 *)ptr) + 21); + __ldcg(((const float2 *)ptr) + 21); // CHECK: ld.global.cg.v4.f32 - volatile float4 v_22 = __ldcg(((const float4 *)ptr) + 22); + __ldcg(((const float4 *)ptr) + 22); // CHECK: ld.global.cg.v2.f64 - volatile double2 v_23 = __ldcg(((const double2 *)ptr) + 23); + __ldcg(((const double2 *)ptr) + 23); } __device__ void test_loads_cv(void *ptr) { // CHECK-LABEL: .func _Z13test_loads_cvPv @@ -130,104 +130,104 @@ __device__ void test_loads_cv(void *ptr) { __device__ void test_loads_cs(void *ptr) { // CHECK-LABEL: .func _Z13test_loads_csPv // CHECK: ld.global.cs.s8 - volatile char v_0 = __ldcs(((const char *)ptr) + 0); + __ldcs(((const char *)ptr) + 0); // CHECK: ld.global.cs.s32 - volatile int v_1 = __ldcs(((const int *)ptr) + 1); + __ldcs(((const int *)ptr) + 1); // CHECK: ld.global.cs.s64 - volatile long v_2 = __ldcs(((const long *)ptr) + 2); + __ldcs(((const long *)ptr) + 2); // CHECK: ld.global.cs.u64 - volatile unsigned long v_3 = __ldcs(((const unsigned long *)ptr) + 3); + __ldcs(((const unsigned long *)ptr) + 3); // CHECK: ld.global.cs.s64 - volatile long long v_4 = __ldcs(((const long long *)ptr) + 4); + __ldcs(((const long long *)ptr) + 4); // CHECK: ld.global.cs.f32 - volatile float v_5 = __ldcs(((const float *)ptr) + 5); + __ldcs(((const float *)ptr) + 5); // CHECK: ld.global.cs.f64 - volatile double v_6 = __ldcs(((const double *)ptr) + 6); + __ldcs(((const double *)ptr) + 6); // CHECK: ld.global.cs.v2.s8 - volatile char2 v_7 = __ldcs(((const char2 *)ptr) + 7); + __ldcs(((const char2 *)ptr) + 7); // CHECK: ld.global.cs.v4.s8 - volatile char4 v_8 = __ldcs(((const char4 *)ptr) + 8); + __ldcs(((const char4 *)ptr) + 8); // CHECK: ld.global.cs.v2.s16 - volatile short2 v_9 = __ldcs(((const short2 *)ptr) + 9); + __ldcs(((const short2 *)ptr) + 9); // CHECK: ld.global.cs.v4.s16 - volatile short4 v_10 = __ldcs(((const short4 *)ptr) + 10); + __ldcs(((const short4 *)ptr) + 10); // CHECK: ld.global.cs.v2.s32 - volatile int2 v_11 = __ldcs(((const int2 *)ptr) + 11); + __ldcs(((const int2 *)ptr) + 11); // CHECK: ld.global.cs.v4.s32 - volatile int4 v_12 = __ldcs(((const int4 *)ptr) + 12); + __ldcs(((const int4 *)ptr) + 12); // CHECK: ld.global.cs.v2.s64 - volatile longlong2 v_13 = __ldcs(((const longlong2 *)ptr) + 13); + __ldcs(((const longlong2 *)ptr) + 13); // CHECK: ld.global.cs.v2.u8 - volatile uchar2 v_14 = __ldcs(((const uchar2 *)ptr) + 14); + __ldcs(((const uchar2 *)ptr) + 14); // CHECK: ld.global.cs.v4.u8 - volatile uchar4 v_15 = __ldcs(((const uchar4 *)ptr) + 15); + __ldcs(((const uchar4 *)ptr) + 15); // CHECK: ld.global.cs.v2.u16 - volatile ushort2 v_16 = __ldcs(((const ushort2 *)ptr) + 16); + __ldcs(((const ushort2 *)ptr) + 16); // CHECK: ld.global.cs.v4.u16 - volatile ushort4 v_17 = __ldcs(((const ushort4 *)ptr) + 17); + __ldcs(((const ushort4 *)ptr) + 17); // CHECK: ld.global.cs.v2.u32 - volatile uint2 v_18 = __ldcs(((const uint2 *)ptr) + 18); + __ldcs(((const uint2 *)ptr) + 18); // CHECK: ld.global.cs.v4.u32 - volatile uint4 v_19 = __ldcs(((const uint4 *)ptr) + 19); + __ldcs(((const uint4 *)ptr) + 19); // CHECK: ld.global.cs.v2.u64 - volatile ulonglong2 v_20 = __ldcs(((const ulonglong2 *)ptr) + 20); + __ldcs(((const ulonglong2 *)ptr) + 20); // CHECK: ld.global.cs.v2.f32 - volatile float2 v_21 = __ldcs(((const float2 *)ptr) + 21); + __ldcs(((const float2 *)ptr) + 21); // CHECK: ld.global.cs.v4.f32 - volatile float4 v_22 = __ldcs(((const float4 *)ptr) + 22); + __ldcs(((const float4 *)ptr) + 22); // CHECK: ld.global.cs.v2.f64 - volatile double2 v_23 = __ldcs(((const double2 *)ptr) + 23); + __ldcs(((const double2 *)ptr) + 23); } __device__ void test_loads_ca(void *ptr) { // CHECK-LABEL: .func _Z13test_loads_caPv // CHECK: ld.global.ca.s8 - volatile char v_0 = __ldca(((const char *)ptr) + 0); + __ldca(((const char *)ptr) + 0); // CHECK: ld.global.ca.s32 - volatile int v_1 = __ldca(((const int *)ptr) + 1); + __ldca(((const int *)ptr) + 1); // CHECK: ld.global.ca.s64 - volatile long v_2 = __ldca(((const long *)ptr) + 2); + __ldca(((const long *)ptr) + 2); // CHECK: ld.global.ca.u64 - volatile unsigned long v_3 = __ldca(((const unsigned long *)ptr) + 3); + __ldca(((const unsigned long *)ptr) + 3); // CHECK: ld.global.ca.s64 - volatile long long v_4 = __ldca(((const long long *)ptr) + 4); + __ldca(((const long long *)ptr) + 4); // CHECK: ld.global.ca.f32 - volatile float v_5 = __ldca(((const float *)ptr) + 5); + __ldca(((const float *)ptr) + 5); // CHECK: ld.global.ca.f64 - volatile double v_6 = __ldca(((const double *)ptr) + 6); + __ldca(((const double *)ptr) + 6); // CHECK: ld.global.ca.v2.s8 - volatile char2 v_7 = __ldca(((const char2 *)ptr) + 7); + __ldca(((const char2 *)ptr) + 7); // CHECK: ld.global.ca.v4.s8 - volatile char4 v_8 = __ldca(((const char4 *)ptr) + 8); + __ldca(((const char4 *)ptr) + 8); // CHECK: ld.global.ca.v2.s16 - volatile short2 v_9 = __ldca(((const short2 *)ptr) + 9); + __ldca(((const short2 *)ptr) + 9); // CHECK: ld.global.ca.v4.s16 - volatile short4 v_10 = __ldca(((const short4 *)ptr) + 10); + __ldca(((const short4 *)ptr) + 10); // CHECK: ld.global.ca.v2.s32 - volatile int2 v_11 = __ldca(((const int2 *)ptr) + 11); + __ldca(((const int2 *)ptr) + 11); // CHECK: ld.global.ca.v4.s32 - volatile int4 v_12 = __ldca(((const int4 *)ptr) + 12); + __ldca(((const int4 *)ptr) + 12); // CHECK: ld.global.ca.v2.s64 - volatile longlong2 v_13 = __ldca(((const longlong2 *)ptr) + 13); + __ldca(((const longlong2 *)ptr) + 13); // CHECK: ld.global.ca.v2.u8 - volatile uchar2 v_14 = __ldca(((const uchar2 *)ptr) + 14); + __ldca(((const uchar2 *)ptr) + 14); // CHECK: ld.global.ca.v4.u8 - volatile uchar4 v_15 = __ldca(((const uchar4 *)ptr) + 15); + __ldca(((const uchar4 *)ptr) + 15); // CHECK: ld.global.ca.v2.u16 - volatile ushort2 v_16 = __ldca(((const ushort2 *)ptr) + 16); + __ldca(((const ushort2 *)ptr) + 16); // CHECK: ld.global.ca.v4.u16 - volatile ushort4 v_17 = __ldca(((const ushort4 *)ptr) + 17); + __ldca(((const ushort4 *)ptr) + 17); // CHECK: ld.global.ca.v2.u32 - volatile uint2 v_18 = __ldca(((const uint2 *)ptr) + 18); + __ldca(((const uint2 *)ptr) + 18); // CHECK: ld.global.ca.v4.u32 - volatile uint4 v_19 = __ldca(((const uint4 *)ptr) + 19); + __ldca(((const uint4 *)ptr) + 19); // CHECK: ld.global.ca.v2.u64 - volatile ulonglong2 v_20 = __ldca(((const ulonglong2 *)ptr) + 20); + __ldca(((const ulonglong2 *)ptr) + 20); // CHECK: ld.global.ca.v2.f32 - volatile float2 v_21 = __ldca(((const float2 *)ptr) + 21); + __ldca(((const float2 *)ptr) + 21); // CHECK: ld.global.ca.v4.f32 - volatile float4 v_22 = __ldca(((const float4 *)ptr) + 22); + __ldca(((const float4 *)ptr) + 22); // CHECK: ld.global.ca.v2.f64 - volatile double2 v_23 = __ldca(((const double2 *)ptr) + 23); + __ldca(((const double2 *)ptr) + 23); } __device__ void test_loads_lu(void *ptr) { // CHECK-LABEL: .func _Z13test_loads_luPv >From 88302173f8502c51bf7496fcba4247999e5f5e1d Mon Sep 17 00:00:00 2001 From: Artem Belevich <[email protected]> Date: Wed, 1 Apr 2026 11:46:16 -0700 Subject: [PATCH 6/6] Add REQUIRES: nvptx-registered-target to test and move to top --- clang/test/Headers/cuda_intrinsics.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/test/Headers/cuda_intrinsics.cu b/clang/test/Headers/cuda_intrinsics.cu index ed082f54a28dc..fb36a53a14410 100644 --- a/clang/test/Headers/cuda_intrinsics.cu +++ b/clang/test/Headers/cuda_intrinsics.cu @@ -1,3 +1,4 @@ +// REQUIRES: nvptx-registered-target // RUN: %clang++ -O1 -S --cuda-device-only --offload-arch=sm_32 -nocudalib -nocudainc %s -o - | FileCheck %s #define __device__ __attribute__((device)) _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
