https://github.com/Artem-B updated 
https://github.com/llvm/llvm-project/pull/190021

>From e215f3792d5fe1504a46d91b26f9502a2a72a5ac Mon Sep 17 00:00:00 2001
From: Artem Belevich <[email protected]>
Date: Tue, 31 Mar 2026 16:01:08 -0700
Subject: [PATCH 1/6] [CUDA] Refactor and consolidate load/store intrinsics

- Defined family macros to reduce code duplication.
- Added support for unsigned long types.
- Implemented missing load/store families (ca, lu, wb, cg, cs).
- Forwarded long/ulong types to standard types.
- Verified with a test file.
---
 clang/lib/Headers/__clang_cuda_intrinsics.h | 390 ++++++++++----------
 1 file changed, 196 insertions(+), 194 deletions(-)

diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h 
b/clang/lib/Headers/__clang_cuda_intrinsics.h
index cca97cb21ef50..9507b6ce59e82 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -523,173 +523,119 @@ inline __device__ unsigned __funnelshift_rc(unsigned 
low32, unsigned high32,
     return __ret;                                                              
\
   }
 
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
-                 "=l", );
-
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l", 
);
-
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
-                 "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
-                 "=h", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
-                 "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
-                 unsigned long long, "=l", );
-
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
-                  "=l", );
-
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
-
-inline __device__ long __ldcg(const long *__ptr) {
-  unsigned long __ret;
-  if (sizeof(long) == 8) {
-    asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
-  } else {
-    asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+#pragma push_macro("__INTRINSIC_LOAD_LONG")
+#define __INTRINSIC_LOAD_LONG(__Mode)                                          
\
+  inline __device__ long __ld##__Mode(const long *__ptr) {                     
\
+    if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) {                             
\
+      return (long)__ld##__Mode((const long long *)__ptr);                     
\
+    } else {                                                                  \
+      return (long)__ld##__Mode((const int *)__ptr);                           
  \
+    }                                                                         \
   }
-  return (long)__ret;
-}
-
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
-                 "=h", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
-                 unsigned long long, "=l", : "memory");
-
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
-                 "=h", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
-                 "=l", : "memory");
-
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
-                  "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
-                  "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
-                  "=h", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
-                  "=h", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
-                  "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
-                  "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
-                  "=l", : "memory");
-
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
-                  "=h", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
-                  "=h", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
-                  "=l", : "memory");
-
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");
-
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
-                  "=f", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
-                  "=f", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
-                  "=d", : "memory");
-
-inline __device__ long __ldcv(const long *__ptr) {
-  unsigned long __ret;
-  if (sizeof(long) == 8) {
-    asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
-  } else {
-    asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+
+#pragma push_macro("__INTRINSIC_LOAD_ULONG")
+#define __INTRINSIC_LOAD_ULONG(__Mode)                                         
 \
+  inline __device__ unsigned long __ld##__Mode(const unsigned long *__ptr) {   
\
+    if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) {                             
\
+      return (unsigned long)__ld##__Mode((const unsigned long long *)__ptr);   
\
+    } else {                                                                  \
+      return (unsigned long)__ld##__Mode((const unsigned int *)__ptr);         
\
+    }                                                                         \
   }
-  return (long)__ret;
-}
-
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
-                 "=l", );
-
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );
-
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
-                 "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
-                 "=h", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
-                 "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
-                 unsigned long long, "=l", );
-
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
-                  "=l", );
-
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );
+
+#pragma push_macro("__INTRINSIC_LOAD_FAMILY")
+#define __INTRINSIC_LOAD_FAMILY(__Mode, __Clobber)                             
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s8", char,             
\
+                   unsigned int, "=r", __Clobber)                              
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s8", signed char,      
\
+                   unsigned int, "=r", __Clobber)                              
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s16", short,           
\
+                   unsigned short, "=h", __Clobber)                            
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s32", int,             
\
+                   unsigned int, "=r", __Clobber)                              
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s64", long long,       
\
+                   unsigned long long, "=l", __Clobber)                        
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s8", char2, int2,  
\
+                    "=r", __Clobber)                                           
\
+  __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s8", char4, int4,  
\
+                    "=r", __Clobber)                                           
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s16", short2,      
\
+                    short2, "=h", __Clobber)                                   
\
+  __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s16", short4,      
\
+                    short4, "=h", __Clobber)                                   
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s32", int2, int2,  
\
+                    "=r", __Clobber)                                           
\
+  __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s32", int4, int4,  
\
+                    "=r", __Clobber)                                           
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s64", longlong2,   
\
+                    longlong2, "=l", __Clobber)                                
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u8", unsigned char,   \
+                   unsigned int, "=r", __Clobber)                              
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u16", unsigned short, \
+                   unsigned short, "=h", __Clobber)                            
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u32", unsigned int,   \
+                   unsigned int, "=r", __Clobber)                              
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u64",                  
\
+                   unsigned long long, unsigned long long, "=l", __Clobber)    
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u8", uchar2,       
\
+                    uint2, "=r", __Clobber)                                    
\
+  __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u8", uchar4,       
\
+                    uint4, "=r", __Clobber)                                    
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u16", ushort2,     
\
+                    ushort2, "=h", __Clobber)                                  
\
+  __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u16", ushort4,     
\
+                    ushort4, "=h", __Clobber)                                  
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u32", uint2,       
\
+                    uint2, "=r", __Clobber)                                    
\
+  __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u32", uint4,       
\
+                    uint4, "=r", __Clobber)                                    
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u64", ulonglong2, \
+                    ulonglong2, "=l", __Clobber)                               
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f32", float, float,    
\
+                   "=f", __Clobber)                                            
\
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f64", double, double,  
\
+                   "=d", __Clobber)                                            
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.f32", float2,      
\
+                    float2, "=f", __Clobber)                                   
\
+  __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.f32", float4,      
\
+                    float4, "=f", __Clobber)                                   
\
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.f64", double2,     
\
+                    double2, "=d", __Clobber)                                  
\
+  __INTRINSIC_LOAD_LONG(__Mode)                                                
\
+  __INTRINSIC_LOAD_ULONG(__Mode)
+
+
+
+
+__INTRINSIC_LOAD_FAMILY(cg, )
+__INTRINSIC_LOAD_FAMILY(ca, )
+
+
+
+
+
+__INTRINSIC_LOAD_FAMILY(cv, : "memory")
+__INTRINSIC_LOAD_FAMILY(lu, : "memory")
+
+
+
+
+
+
+__INTRINSIC_LOAD_FAMILY(cs, )
+
+
+
 
 #pragma pop_macro("__INTRINSIC_LOAD")
 #pragma pop_macro("__INTRINSIC_LOAD2")
 #pragma pop_macro("__INTRINSIC_LOAD4")
+#pragma pop_macro("__INTRINSIC_LOAD_FAMILY")
+#pragma pop_macro("__INTRINSIC_LOAD_LONG")
+#pragma pop_macro("__INTRINSIC_LOAD_ULONG")
+
+
 
-inline __device__ long __ldcs(const long *__ptr) {
-  unsigned long __ret;
-  if (sizeof(long) == 8) {
-    asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
-  } else {
-    asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
-  }
-  return (long)__ret;
-}
 
 #pragma push_macro("__INTRINSIC_STORE")
 #define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) 
\
@@ -726,44 +672,100 @@ inline __device__ long __ldcs(const long *__ptr) {
         : "memory");                                                           
\
   }
 
-__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
-
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
-
-__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
-                  "h");
-__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
-                  unsigned long long, "l");
-
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
-
-__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
-__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
+#pragma push_macro("__INTRINSIC_STORE_LONG")
+#define __INTRINSIC_STORE_LONG(__Mode)                                         
 \
+  inline __device__ void __st##__Mode(long *__ptr, long __value) {             
 \
+    if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) {                             
\
+      __st##__Mode((long long *)__ptr, (long long)__value);                    
 \
+    } else {                                                                  \
+      __st##__Mode((int *)__ptr, (int)__value);                                
 \
+    }                                                                         \
+  }
+
+#pragma push_macro("__INTRINSIC_STORE_ULONG")
+#define __INTRINSIC_STORE_ULONG(__Mode)                                        
  \
+  inline __device__ void __st##__Mode(unsigned long *__ptr, unsigned long 
__value) { \
+    if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) {                             
\
+      __st##__Mode((unsigned long long *)__ptr, (unsigned long long)__value);  
 \
+    } else {                                                                  \
+      __st##__Mode((unsigned int *)__ptr, (unsigned int)__value);              
   \
+    }                                                                         \
+  }
+
+#pragma push_macro("__INTRINSIC_STORE_FAMILY")
+#define __INTRINSIC_STORE_FAMILY(__Mode)                                       
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s8", char, int, "r")  
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s8", signed char,     
\
+                    int, "r")                                                  
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s16", short, short,   
\
+                    "h")                                                       
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s32", int, int, "r")  
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s64", long long,      
 \
+                    long long, "l")                                            
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s8", char2,       
\
+                     int2, "r")                                                
\
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s8", char4,       
\
+                     int4, "r")                                                
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s16", short2,     
 \
+                     short2, "h")                                              
\
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s16", short4,     
 \
+                     short4, "h")                                              
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s32", int2,       
\
+                     int2, "r")                                                
\
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s32", int4,       
\
+                     int4, "r")                                                
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s64", longlong2,  
\
+                     longlong2, "l")                                           
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u8", unsigned char,   
\
+                    int, "r")                                                  
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u16", unsigned short, 
\
+                    unsigned short, "h")                                       
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u32", unsigned int,   
\
+                    unsigned int, "r")                                         
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u64",                 
\
+                    unsigned long long, unsigned long long, "l")               
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u8", uchar2,      
 \
+                     uchar2, "r")                                              
\
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u8", uchar4,      
 \
+                     uint4, "r")                                               
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u16", ushort2,    
 \
+                     ushort2, "h")                                             
\
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u16", ushort4,    
 \
+                     ushort4, "h")                                             
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u32", uint2,      
 \
+                     uint2, "r")                                               
\
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u32", uint4,      
 \
+                     uint4, "r")                                               
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u64", ulonglong2, 
\
+                     ulonglong2, "l")                                          
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f32", float, float,   
 \
+                    "f")                                                       
\
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f64", double, double, 
 \
+                    "d")                                                       
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f32", float2,     
 \
+                     float2, "f")                                              
\
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.f32", float4,     
 \
+                     float4, "f")                                              
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f64", double2,    
 \
+                     double2, "d")                                             
 \
+  __INTRINSIC_STORE_LONG(__Mode)                                               
 \
+  __INTRINSIC_STORE_ULONG(__Mode)
+
+
+__INTRINSIC_STORE_FAMILY(wt)
+__INTRINSIC_STORE_FAMILY(wb)
+__INTRINSIC_STORE_FAMILY(cg)
+__INTRINSIC_STORE_FAMILY(cs)
+
+
+
 
 #pragma pop_macro("__INTRINSIC_STORE")
 #pragma pop_macro("__INTRINSIC_STORE2")
 #pragma pop_macro("__INTRINSIC_STORE4")
+#pragma pop_macro("__INTRINSIC_STORE_FAMILY")
+#pragma pop_macro("__INTRINSIC_STORE_LONG")
+#pragma pop_macro("__INTRINSIC_STORE_ULONG")
 
 #endif // defined(__cplusplus) && (__cplusplus >= 201103L)
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

>From 957d335fa20a4bf2626259759fb84e3f00167837 Mon Sep 17 00:00:00 2001
From: Artem Belevich <[email protected]>
Date: Tue, 31 Mar 2026 17:08:02 -0700
Subject: [PATCH 2/6] [clang][Headers] Add PTX verification test for CUDA
 load/store intrinsics.

---
 clang/test/Headers/cuda_intrinsics.cu | 486 ++++++++++++++++++++++++++
 1 file changed, 486 insertions(+)
 create mode 100644 clang/test/Headers/cuda_intrinsics.cu

diff --git a/clang/test/Headers/cuda_intrinsics.cu 
b/clang/test/Headers/cuda_intrinsics.cu
new file mode 100644
index 0000000000000..68e2d931acd79
--- /dev/null
+++ b/clang/test/Headers/cuda_intrinsics.cu
@@ -0,0 +1,486 @@
+// RUN: %clang++ -O1 -S --cuda-device-only --offload-arch=sm_32 -nocudalib 
-nocudainc %s -o - | FileCheck %s
+
+#define __device__ __attribute__((device))
+#define warpSize 32
+#define memcpy __builtin_memcpy
+
+// Define missing types for standalone testing
+struct char2 { char x, y; };
+struct char4 { char x, y, z, w; };
+struct short2 { short x, y; };
+struct short4 { short x, y, z, w; };
+struct int2 { int x, y; };
+struct int4 { int x, y, z, w; };
+struct longlong2 { long long x, y; };
+struct uchar2 { unsigned char x, y; };
+struct uchar4 { unsigned char x, y, z, w; };
+struct ushort2 { unsigned short x, y; };
+struct ushort4 { unsigned short x, y, z, w; };
+struct uint2 { unsigned int x, y; };
+struct uint4 { unsigned int x, y, z, w; };
+struct ulonglong2 { unsigned long long x, y; };
+struct float2 { float x, y; };
+struct float4 { float x, y, z, w; };
+struct double2 { double x, y; };
+
+#include "__clang_cuda_intrinsics.h"
+
+__device__ void test_loads_cg(void *ptr) {
+  // CHECK-LABEL: .func _Z13test_loads_cgPv
+  // CHECK: ld.global.cg.s8
+  volatile char v_0 = __ldcg(((const char *)ptr) + 0);
+  // CHECK: ld.global.cg.s32
+  volatile int v_1 = __ldcg(((const int *)ptr) + 1);
+  // CHECK: ld.global.cg.s64
+  volatile long v_2 = __ldcg(((const long *)ptr) + 2);
+  // CHECK: ld.global.cg.u64
+  volatile unsigned long v_3 = __ldcg(((const unsigned long *)ptr) + 3);
+  // CHECK: ld.global.cg.s64
+  volatile long long v_4 = __ldcg(((const long long *)ptr) + 4);
+  // CHECK: ld.global.cg.f32
+  volatile float v_5 = __ldcg(((const float *)ptr) + 5);
+  // CHECK: ld.global.cg.f64
+  volatile double v_6 = __ldcg(((const double *)ptr) + 6);
+  // CHECK: ld.global.cg.v2.s8
+  volatile char2 v_7 = __ldcg(((const char2 *)ptr) + 7);
+  // CHECK: ld.global.cg.v4.s8
+  volatile char4 v_8 = __ldcg(((const char4 *)ptr) + 8);
+  // CHECK: ld.global.cg.v2.s16
+  volatile short2 v_9 = __ldcg(((const short2 *)ptr) + 9);
+  // CHECK: ld.global.cg.v4.s16
+  volatile short4 v_10 = __ldcg(((const short4 *)ptr) + 10);
+  // CHECK: ld.global.cg.v2.s32
+  volatile int2 v_11 = __ldcg(((const int2 *)ptr) + 11);
+  // CHECK: ld.global.cg.v4.s32
+  volatile int4 v_12 = __ldcg(((const int4 *)ptr) + 12);
+  // CHECK: ld.global.cg.v2.s64
+  volatile longlong2 v_13 = __ldcg(((const longlong2 *)ptr) + 13);
+  // CHECK: ld.global.cg.v2.u8
+  volatile uchar2 v_14 = __ldcg(((const uchar2 *)ptr) + 14);
+  // CHECK: ld.global.cg.v4.u8
+  volatile uchar4 v_15 = __ldcg(((const uchar4 *)ptr) + 15);
+  // CHECK: ld.global.cg.v2.u16
+  volatile ushort2 v_16 = __ldcg(((const ushort2 *)ptr) + 16);
+  // CHECK: ld.global.cg.v4.u16
+  volatile ushort4 v_17 = __ldcg(((const ushort4 *)ptr) + 17);
+  // CHECK: ld.global.cg.v2.u32
+  volatile uint2 v_18 = __ldcg(((const uint2 *)ptr) + 18);
+  // CHECK: ld.global.cg.v4.u32
+  volatile uint4 v_19 = __ldcg(((const uint4 *)ptr) + 19);
+  // CHECK: ld.global.cg.v2.u64
+  volatile ulonglong2 v_20 = __ldcg(((const ulonglong2 *)ptr) + 20);
+  // CHECK: ld.global.cg.v2.f32
+  volatile float2 v_21 = __ldcg(((const float2 *)ptr) + 21);
+  // CHECK: ld.global.cg.v4.f32
+  volatile float4 v_22 = __ldcg(((const float4 *)ptr) + 22);
+  // CHECK: ld.global.cg.v2.f64
+  volatile double2 v_23 = __ldcg(((const double2 *)ptr) + 23);
+}
+__device__ void test_loads_cv(void *ptr) {
+  // CHECK-LABEL: .func _Z13test_loads_cvPv
+  // CHECK: ld.global.cv.s8
+  volatile char v_0 = __ldcv(((const char *)ptr) + 0);
+  // CHECK: ld.global.cv.s32
+  volatile int v_1 = __ldcv(((const int *)ptr) + 1);
+  // CHECK: ld.global.cv.s64
+  volatile long v_2 = __ldcv(((const long *)ptr) + 2);
+  // CHECK: ld.global.cv.u64
+  volatile unsigned long v_3 = __ldcv(((const unsigned long *)ptr) + 3);
+  // CHECK: ld.global.cv.s64
+  volatile long long v_4 = __ldcv(((const long long *)ptr) + 4);
+  // CHECK: ld.global.cv.f32
+  volatile float v_5 = __ldcv(((const float *)ptr) + 5);
+  // CHECK: ld.global.cv.f64
+  volatile double v_6 = __ldcv(((const double *)ptr) + 6);
+  // CHECK: ld.global.cv.v2.s8
+  volatile char2 v_7 = __ldcv(((const char2 *)ptr) + 7);
+  // CHECK: ld.global.cv.v4.s8
+  volatile char4 v_8 = __ldcv(((const char4 *)ptr) + 8);
+  // CHECK: ld.global.cv.v2.s16
+  volatile short2 v_9 = __ldcv(((const short2 *)ptr) + 9);
+  // CHECK: ld.global.cv.v4.s16
+  volatile short4 v_10 = __ldcv(((const short4 *)ptr) + 10);
+  // CHECK: ld.global.cv.v2.s32
+  volatile int2 v_11 = __ldcv(((const int2 *)ptr) + 11);
+  // CHECK: ld.global.cv.v4.s32
+  volatile int4 v_12 = __ldcv(((const int4 *)ptr) + 12);
+  // CHECK: ld.global.cv.v2.s64
+  volatile longlong2 v_13 = __ldcv(((const longlong2 *)ptr) + 13);
+  // CHECK: ld.global.cv.v2.u8
+  volatile uchar2 v_14 = __ldcv(((const uchar2 *)ptr) + 14);
+  // CHECK: ld.global.cv.v4.u8
+  volatile uchar4 v_15 = __ldcv(((const uchar4 *)ptr) + 15);
+  // CHECK: ld.global.cv.v2.u16
+  volatile ushort2 v_16 = __ldcv(((const ushort2 *)ptr) + 16);
+  // CHECK: ld.global.cv.v4.u16
+  volatile ushort4 v_17 = __ldcv(((const ushort4 *)ptr) + 17);
+  // CHECK: ld.global.cv.v2.u32
+  volatile uint2 v_18 = __ldcv(((const uint2 *)ptr) + 18);
+  // CHECK: ld.global.cv.v4.u32
+  volatile uint4 v_19 = __ldcv(((const uint4 *)ptr) + 19);
+  // CHECK: ld.global.cv.v2.u64
+  volatile ulonglong2 v_20 = __ldcv(((const ulonglong2 *)ptr) + 20);
+  // CHECK: ld.global.cv.v2.f32
+  volatile float2 v_21 = __ldcv(((const float2 *)ptr) + 21);
+  // CHECK: ld.global.cv.v4.f32
+  volatile float4 v_22 = __ldcv(((const float4 *)ptr) + 22);
+  // CHECK: ld.global.cv.v2.f64
+  volatile double2 v_23 = __ldcv(((const double2 *)ptr) + 23);
+}
+__device__ void test_loads_cs(void *ptr) {
+  // CHECK-LABEL: .func _Z13test_loads_csPv
+  // CHECK: ld.global.cs.s8
+  volatile char v_0 = __ldcs(((const char *)ptr) + 0);
+  // CHECK: ld.global.cs.s32
+  volatile int v_1 = __ldcs(((const int *)ptr) + 1);
+  // CHECK: ld.global.cs.s64
+  volatile long v_2 = __ldcs(((const long *)ptr) + 2);
+  // CHECK: ld.global.cs.u64
+  volatile unsigned long v_3 = __ldcs(((const unsigned long *)ptr) + 3);
+  // CHECK: ld.global.cs.s64
+  volatile long long v_4 = __ldcs(((const long long *)ptr) + 4);
+  // CHECK: ld.global.cs.f32
+  volatile float v_5 = __ldcs(((const float *)ptr) + 5);
+  // CHECK: ld.global.cs.f64
+  volatile double v_6 = __ldcs(((const double *)ptr) + 6);
+  // CHECK: ld.global.cs.v2.s8
+  volatile char2 v_7 = __ldcs(((const char2 *)ptr) + 7);
+  // CHECK: ld.global.cs.v4.s8
+  volatile char4 v_8 = __ldcs(((const char4 *)ptr) + 8);
+  // CHECK: ld.global.cs.v2.s16
+  volatile short2 v_9 = __ldcs(((const short2 *)ptr) + 9);
+  // CHECK: ld.global.cs.v4.s16
+  volatile short4 v_10 = __ldcs(((const short4 *)ptr) + 10);
+  // CHECK: ld.global.cs.v2.s32
+  volatile int2 v_11 = __ldcs(((const int2 *)ptr) + 11);
+  // CHECK: ld.global.cs.v4.s32
+  volatile int4 v_12 = __ldcs(((const int4 *)ptr) + 12);
+  // CHECK: ld.global.cs.v2.s64
+  volatile longlong2 v_13 = __ldcs(((const longlong2 *)ptr) + 13);
+  // CHECK: ld.global.cs.v2.u8
+  volatile uchar2 v_14 = __ldcs(((const uchar2 *)ptr) + 14);
+  // CHECK: ld.global.cs.v4.u8
+  volatile uchar4 v_15 = __ldcs(((const uchar4 *)ptr) + 15);
+  // CHECK: ld.global.cs.v2.u16
+  volatile ushort2 v_16 = __ldcs(((const ushort2 *)ptr) + 16);
+  // CHECK: ld.global.cs.v4.u16
+  volatile ushort4 v_17 = __ldcs(((const ushort4 *)ptr) + 17);
+  // CHECK: ld.global.cs.v2.u32
+  volatile uint2 v_18 = __ldcs(((const uint2 *)ptr) + 18);
+  // CHECK: ld.global.cs.v4.u32
+  volatile uint4 v_19 = __ldcs(((const uint4 *)ptr) + 19);
+  // CHECK: ld.global.cs.v2.u64
+  volatile ulonglong2 v_20 = __ldcs(((const ulonglong2 *)ptr) + 20);
+  // CHECK: ld.global.cs.v2.f32
+  volatile float2 v_21 = __ldcs(((const float2 *)ptr) + 21);
+  // CHECK: ld.global.cs.v4.f32
+  volatile float4 v_22 = __ldcs(((const float4 *)ptr) + 22);
+  // CHECK: ld.global.cs.v2.f64
+  volatile double2 v_23 = __ldcs(((const double2 *)ptr) + 23);
+}
+__device__ void test_loads_ca(void *ptr) {
+  // CHECK-LABEL: .func _Z13test_loads_caPv
+  // CHECK: ld.global.ca.s8
+  volatile char v_0 = __ldca(((const char *)ptr) + 0);
+  // CHECK: ld.global.ca.s32
+  volatile int v_1 = __ldca(((const int *)ptr) + 1);
+  // CHECK: ld.global.ca.s64
+  volatile long v_2 = __ldca(((const long *)ptr) + 2);
+  // CHECK: ld.global.ca.u64
+  volatile unsigned long v_3 = __ldca(((const unsigned long *)ptr) + 3);
+  // CHECK: ld.global.ca.s64
+  volatile long long v_4 = __ldca(((const long long *)ptr) + 4);
+  // CHECK: ld.global.ca.f32
+  volatile float v_5 = __ldca(((const float *)ptr) + 5);
+  // CHECK: ld.global.ca.f64
+  volatile double v_6 = __ldca(((const double *)ptr) + 6);
+  // CHECK: ld.global.ca.v2.s8
+  volatile char2 v_7 = __ldca(((const char2 *)ptr) + 7);
+  // CHECK: ld.global.ca.v4.s8
+  volatile char4 v_8 = __ldca(((const char4 *)ptr) + 8);
+  // CHECK: ld.global.ca.v2.s16
+  volatile short2 v_9 = __ldca(((const short2 *)ptr) + 9);
+  // CHECK: ld.global.ca.v4.s16
+  volatile short4 v_10 = __ldca(((const short4 *)ptr) + 10);
+  // CHECK: ld.global.ca.v2.s32
+  volatile int2 v_11 = __ldca(((const int2 *)ptr) + 11);
+  // CHECK: ld.global.ca.v4.s32
+  volatile int4 v_12 = __ldca(((const int4 *)ptr) + 12);
+  // CHECK: ld.global.ca.v2.s64
+  volatile longlong2 v_13 = __ldca(((const longlong2 *)ptr) + 13);
+  // CHECK: ld.global.ca.v2.u8
+  volatile uchar2 v_14 = __ldca(((const uchar2 *)ptr) + 14);
+  // CHECK: ld.global.ca.v4.u8
+  volatile uchar4 v_15 = __ldca(((const uchar4 *)ptr) + 15);
+  // CHECK: ld.global.ca.v2.u16
+  volatile ushort2 v_16 = __ldca(((const ushort2 *)ptr) + 16);
+  // CHECK: ld.global.ca.v4.u16
+  volatile ushort4 v_17 = __ldca(((const ushort4 *)ptr) + 17);
+  // CHECK: ld.global.ca.v2.u32
+  volatile uint2 v_18 = __ldca(((const uint2 *)ptr) + 18);
+  // CHECK: ld.global.ca.v4.u32
+  volatile uint4 v_19 = __ldca(((const uint4 *)ptr) + 19);
+  // CHECK: ld.global.ca.v2.u64
+  volatile ulonglong2 v_20 = __ldca(((const ulonglong2 *)ptr) + 20);
+  // CHECK: ld.global.ca.v2.f32
+  volatile float2 v_21 = __ldca(((const float2 *)ptr) + 21);
+  // CHECK: ld.global.ca.v4.f32
+  volatile float4 v_22 = __ldca(((const float4 *)ptr) + 22);
+  // CHECK: ld.global.ca.v2.f64
+  volatile double2 v_23 = __ldca(((const double2 *)ptr) + 23);
+}
+__device__ void test_loads_lu(void *ptr) {
+  // CHECK-LABEL: .func _Z13test_loads_luPv
+  // CHECK: ld.global.lu.s8
+  volatile char v_0 = __ldlu(((const char *)ptr) + 0);
+  // CHECK: ld.global.lu.s32
+  volatile int v_1 = __ldlu(((const int *)ptr) + 1);
+  // CHECK: ld.global.lu.s64
+  volatile long v_2 = __ldlu(((const long *)ptr) + 2);
+  // CHECK: ld.global.lu.u64
+  volatile unsigned long v_3 = __ldlu(((const unsigned long *)ptr) + 3);
+  // CHECK: ld.global.lu.s64
+  volatile long long v_4 = __ldlu(((const long long *)ptr) + 4);
+  // CHECK: ld.global.lu.f32
+  volatile float v_5 = __ldlu(((const float *)ptr) + 5);
+  // CHECK: ld.global.lu.f64
+  volatile double v_6 = __ldlu(((const double *)ptr) + 6);
+  // CHECK: ld.global.lu.v2.s8
+  volatile char2 v_7 = __ldlu(((const char2 *)ptr) + 7);
+  // CHECK: ld.global.lu.v4.s8
+  volatile char4 v_8 = __ldlu(((const char4 *)ptr) + 8);
+  // CHECK: ld.global.lu.v2.s16
+  volatile short2 v_9 = __ldlu(((const short2 *)ptr) + 9);
+  // CHECK: ld.global.lu.v4.s16
+  volatile short4 v_10 = __ldlu(((const short4 *)ptr) + 10);
+  // CHECK: ld.global.lu.v2.s32
+  volatile int2 v_11 = __ldlu(((const int2 *)ptr) + 11);
+  // CHECK: ld.global.lu.v4.s32
+  volatile int4 v_12 = __ldlu(((const int4 *)ptr) + 12);
+  // CHECK: ld.global.lu.v2.s64
+  volatile longlong2 v_13 = __ldlu(((const longlong2 *)ptr) + 13);
+  // CHECK: ld.global.lu.v2.u8
+  volatile uchar2 v_14 = __ldlu(((const uchar2 *)ptr) + 14);
+  // CHECK: ld.global.lu.v4.u8
+  volatile uchar4 v_15 = __ldlu(((const uchar4 *)ptr) + 15);
+  // CHECK: ld.global.lu.v2.u16
+  volatile ushort2 v_16 = __ldlu(((const ushort2 *)ptr) + 16);
+  // CHECK: ld.global.lu.v4.u16
+  volatile ushort4 v_17 = __ldlu(((const ushort4 *)ptr) + 17);
+  // CHECK: ld.global.lu.v2.u32
+  volatile uint2 v_18 = __ldlu(((const uint2 *)ptr) + 18);
+  // CHECK: ld.global.lu.v4.u32
+  volatile uint4 v_19 = __ldlu(((const uint4 *)ptr) + 19);
+  // CHECK: ld.global.lu.v2.u64
+  volatile ulonglong2 v_20 = __ldlu(((const ulonglong2 *)ptr) + 20);
+  // CHECK: ld.global.lu.v2.f32
+  volatile float2 v_21 = __ldlu(((const float2 *)ptr) + 21);
+  // CHECK: ld.global.lu.v4.f32
+  volatile float4 v_22 = __ldlu(((const float4 *)ptr) + 22);
+  // CHECK: ld.global.lu.v2.f64
+  volatile double2 v_23 = __ldlu(((const double2 *)ptr) + 23);
+}
+__device__ void test_stores_wt(void *ptr, int val) {
+  // CHECK-LABEL: .func _Z14test_stores_wtPvi
+  // CHECK: st.global.wt.s8
+  __stwt(((char *)ptr) + 0, (char)val);
+  // CHECK: st.global.wt.s32
+  __stwt(((int *)ptr) + 1, (int)val);
+  // CHECK: st.global.wt.s64
+  __stwt(((long *)ptr) + 2, (long)val);
+  // CHECK: st.global.wt.u64
+  __stwt(((unsigned long *)ptr) + 3, (unsigned long)val);
+  // CHECK: st.global.wt.s64
+  __stwt(((long long *)ptr) + 4, (long long)val);
+  // CHECK: st.global.wt.f32
+  __stwt(((float *)ptr) + 5, (float)val);
+  // CHECK: st.global.wt.f64
+  __stwt(((double *)ptr) + 6, (double)val);
+  // CHECK: st.global.wt.v2.s8
+  { char2 v = {(char)val, (char)val}; __stwt(((char2 *)ptr) + 7, v); }
+  // CHECK: st.global.wt.v4.s8
+  { char4 v = {(char)val, (char)val, (char)val, (char)val}; __stwt(((char4 
*)ptr) + 8, v); }
+  // CHECK: st.global.wt.v2.s16
+  { short2 v = {(short)val, (short)val}; __stwt(((short2 *)ptr) + 9, v); }
+  // CHECK: st.global.wt.v4.s16
+  { short4 v = {(short)val, (short)val, (short)val, (short)val}; 
__stwt(((short4 *)ptr) + 10, v); }
+  // CHECK: st.global.wt.v2.s32
+  { int2 v = {(int)val, (int)val}; __stwt(((int2 *)ptr) + 11, v); }
+  // CHECK: st.global.wt.v4.s32
+  { int4 v = {(int)val, (int)val, (int)val, (int)val}; __stwt(((int4 *)ptr) + 
12, v); }
+  // CHECK: st.global.wt.v2.s64
+  { longlong2 v = {(long long)val, (long long)val}; __stwt(((longlong2 *)ptr) 
+ 13, v); }
+  // CHECK: st.global.wt.v2.u8
+  { uchar2 v = {(unsigned char)val, (unsigned char)val}; __stwt(((uchar2 
*)ptr) + 14, v); }
+  // CHECK: st.global.wt.v4.u8
+  { uchar4 v = {(unsigned char)val, (unsigned char)val, (unsigned char)val, 
(unsigned char)val}; __stwt(((uchar4 *)ptr) + 15, v); }
+  // CHECK: st.global.wt.v2.u16
+  { ushort2 v = {(unsigned short)val, (unsigned short)val}; __stwt(((ushort2 
*)ptr) + 16, v); }
+  // CHECK: st.global.wt.v4.u16
+  { ushort4 v = {(unsigned short)val, (unsigned short)val, (unsigned 
short)val, (unsigned short)val}; __stwt(((ushort4 *)ptr) + 17, v); }
+  // CHECK: st.global.wt.v2.u32
+  { uint2 v = {(unsigned int)val, (unsigned int)val}; __stwt(((uint2 *)ptr) + 
18, v); }
+  // CHECK: st.global.wt.v4.u32
+  { uint4 v = {(unsigned int)val, (unsigned int)val, (unsigned int)val, 
(unsigned int)val}; __stwt(((uint4 *)ptr) + 19, v); }
+  // CHECK: st.global.wt.v2.u64
+  { ulonglong2 v = {(unsigned long long)val, (unsigned long long)val}; 
__stwt(((ulonglong2 *)ptr) + 20, v); }
+  // CHECK: st.global.wt.v2.f32
+  { float2 v = {(float)val, (float)val}; __stwt(((float2 *)ptr) + 21, v); }
+  // CHECK: st.global.wt.v4.f32
+  { float4 v = {(float)val, (float)val, (float)val, (float)val}; 
__stwt(((float4 *)ptr) + 22, v); }
+  // CHECK: st.global.wt.v2.f64
+  { double2 v = {(double)val, (double)val}; __stwt(((double2 *)ptr) + 23, v); }
+}
+__device__ void test_stores_wb(void *ptr, int val) {
+  // CHECK-LABEL: .func _Z14test_stores_wbPvi
+  // CHECK: st.global.wb.s8
+  __stwb(((char *)ptr) + 0, (char)val);
+  // CHECK: st.global.wb.s32
+  __stwb(((int *)ptr) + 1, (int)val);
+  // CHECK: st.global.wb.s64
+  __stwb(((long *)ptr) + 2, (long)val);
+  // CHECK: st.global.wb.u64
+  __stwb(((unsigned long *)ptr) + 3, (unsigned long)val);
+  // CHECK: st.global.wb.s64
+  __stwb(((long long *)ptr) + 4, (long long)val);
+  // CHECK: st.global.wb.f32
+  __stwb(((float *)ptr) + 5, (float)val);
+  // CHECK: st.global.wb.f64
+  __stwb(((double *)ptr) + 6, (double)val);
+  // CHECK: st.global.wb.v2.s8
+  { char2 v = {(char)val, (char)val}; __stwb(((char2 *)ptr) + 7, v); }
+  // CHECK: st.global.wb.v4.s8
+  { char4 v = {(char)val, (char)val, (char)val, (char)val}; __stwb(((char4 
*)ptr) + 8, v); }
+  // CHECK: st.global.wb.v2.s16
+  { short2 v = {(short)val, (short)val}; __stwb(((short2 *)ptr) + 9, v); }
+  // CHECK: st.global.wb.v4.s16
+  { short4 v = {(short)val, (short)val, (short)val, (short)val}; 
__stwb(((short4 *)ptr) + 10, v); }
+  // CHECK: st.global.wb.v2.s32
+  { int2 v = {(int)val, (int)val}; __stwb(((int2 *)ptr) + 11, v); }
+  // CHECK: st.global.wb.v4.s32
+  { int4 v = {(int)val, (int)val, (int)val, (int)val}; __stwb(((int4 *)ptr) + 
12, v); }
+  // CHECK: st.global.wb.v2.s64
+  { longlong2 v = {(long long)val, (long long)val}; __stwb(((longlong2 *)ptr) 
+ 13, v); }
+  // CHECK: st.global.wb.v2.u8
+  { uchar2 v = {(unsigned char)val, (unsigned char)val}; __stwb(((uchar2 
*)ptr) + 14, v); }
+  // CHECK: st.global.wb.v4.u8
+  { uchar4 v = {(unsigned char)val, (unsigned char)val, (unsigned char)val, 
(unsigned char)val}; __stwb(((uchar4 *)ptr) + 15, v); }
+  // CHECK: st.global.wb.v2.u16
+  { ushort2 v = {(unsigned short)val, (unsigned short)val}; __stwb(((ushort2 
*)ptr) + 16, v); }
+  // CHECK: st.global.wb.v4.u16
+  { ushort4 v = {(unsigned short)val, (unsigned short)val, (unsigned 
short)val, (unsigned short)val}; __stwb(((ushort4 *)ptr) + 17, v); }
+  // CHECK: st.global.wb.v2.u32
+  { uint2 v = {(unsigned int)val, (unsigned int)val}; __stwb(((uint2 *)ptr) + 
18, v); }
+  // CHECK: st.global.wb.v4.u32
+  { uint4 v = {(unsigned int)val, (unsigned int)val, (unsigned int)val, 
(unsigned int)val}; __stwb(((uint4 *)ptr) + 19, v); }
+  // CHECK: st.global.wb.v2.u64
+  { ulonglong2 v = {(unsigned long long)val, (unsigned long long)val}; 
__stwb(((ulonglong2 *)ptr) + 20, v); }
+  // CHECK: st.global.wb.v2.f32
+  { float2 v = {(float)val, (float)val}; __stwb(((float2 *)ptr) + 21, v); }
+  // CHECK: st.global.wb.v4.f32
+  { float4 v = {(float)val, (float)val, (float)val, (float)val}; 
__stwb(((float4 *)ptr) + 22, v); }
+  // CHECK: st.global.wb.v2.f64
+  { double2 v = {(double)val, (double)val}; __stwb(((double2 *)ptr) + 23, v); }
+}
+__device__ void test_stores_cg(void *ptr, int val) {
+  // CHECK-LABEL: .func _Z14test_stores_cgPvi
+  // CHECK: st.global.cg.s8
+  __stcg(((char *)ptr) + 0, (char)val);
+  // CHECK: st.global.cg.s32
+  __stcg(((int *)ptr) + 1, (int)val);
+  // CHECK: st.global.cg.s64
+  __stcg(((long *)ptr) + 2, (long)val);
+  // CHECK: st.global.cg.u64
+  __stcg(((unsigned long *)ptr) + 3, (unsigned long)val);
+  // CHECK: st.global.cg.s64
+  __stcg(((long long *)ptr) + 4, (long long)val);
+  // CHECK: st.global.cg.f32
+  __stcg(((float *)ptr) + 5, (float)val);
+  // CHECK: st.global.cg.f64
+  __stcg(((double *)ptr) + 6, (double)val);
+  // CHECK: st.global.cg.v2.s8
+  { char2 v = {(char)val, (char)val}; __stcg(((char2 *)ptr) + 7, v); }
+  // CHECK: st.global.cg.v4.s8
+  { char4 v = {(char)val, (char)val, (char)val, (char)val}; __stcg(((char4 
*)ptr) + 8, v); }
+  // CHECK: st.global.cg.v2.s16
+  { short2 v = {(short)val, (short)val}; __stcg(((short2 *)ptr) + 9, v); }
+  // CHECK: st.global.cg.v4.s16
+  { short4 v = {(short)val, (short)val, (short)val, (short)val}; 
__stcg(((short4 *)ptr) + 10, v); }
+  // CHECK: st.global.cg.v2.s32
+  { int2 v = {(int)val, (int)val}; __stcg(((int2 *)ptr) + 11, v); }
+  // CHECK: st.global.cg.v4.s32
+  { int4 v = {(int)val, (int)val, (int)val, (int)val}; __stcg(((int4 *)ptr) + 
12, v); }
+  // CHECK: st.global.cg.v2.s64
+  { longlong2 v = {(long long)val, (long long)val}; __stcg(((longlong2 *)ptr) 
+ 13, v); }
+  // CHECK: st.global.cg.v2.u8
+  { uchar2 v = {(unsigned char)val, (unsigned char)val}; __stcg(((uchar2 
*)ptr) + 14, v); }
+  // CHECK: st.global.cg.v4.u8
+  { uchar4 v = {(unsigned char)val, (unsigned char)val, (unsigned char)val, 
(unsigned char)val}; __stcg(((uchar4 *)ptr) + 15, v); }
+  // CHECK: st.global.cg.v2.u16
+  { ushort2 v = {(unsigned short)val, (unsigned short)val}; __stcg(((ushort2 
*)ptr) + 16, v); }
+  // CHECK: st.global.cg.v4.u16
+  { ushort4 v = {(unsigned short)val, (unsigned short)val, (unsigned 
short)val, (unsigned short)val}; __stcg(((ushort4 *)ptr) + 17, v); }
+  // CHECK: st.global.cg.v2.u32
+  { uint2 v = {(unsigned int)val, (unsigned int)val}; __stcg(((uint2 *)ptr) + 
18, v); }
+  // CHECK: st.global.cg.v4.u32
+  { uint4 v = {(unsigned int)val, (unsigned int)val, (unsigned int)val, 
(unsigned int)val}; __stcg(((uint4 *)ptr) + 19, v); }
+  // CHECK: st.global.cg.v2.u64
+  { ulonglong2 v = {(unsigned long long)val, (unsigned long long)val}; 
__stcg(((ulonglong2 *)ptr) + 20, v); }
+  // CHECK: st.global.cg.v2.f32
+  { float2 v = {(float)val, (float)val}; __stcg(((float2 *)ptr) + 21, v); }
+  // CHECK: st.global.cg.v4.f32
+  { float4 v = {(float)val, (float)val, (float)val, (float)val}; 
__stcg(((float4 *)ptr) + 22, v); }
+  // CHECK: st.global.cg.v2.f64
+  { double2 v = {(double)val, (double)val}; __stcg(((double2 *)ptr) + 23, v); }
+}
+__device__ void test_stores_cs(void *ptr, int val) {
+  // CHECK-LABEL: .func _Z14test_stores_csPvi
+  // CHECK: st.global.cs.s8
+  __stcs(((char *)ptr) + 0, (char)val);
+  // CHECK: st.global.cs.s32
+  __stcs(((int *)ptr) + 1, (int)val);
+  // CHECK: st.global.cs.s64
+  __stcs(((long *)ptr) + 2, (long)val);
+  // CHECK: st.global.cs.u64
+  __stcs(((unsigned long *)ptr) + 3, (unsigned long)val);
+  // CHECK: st.global.cs.s64
+  __stcs(((long long *)ptr) + 4, (long long)val);
+  // CHECK: st.global.cs.f32
+  __stcs(((float *)ptr) + 5, (float)val);
+  // CHECK: st.global.cs.f64
+  __stcs(((double *)ptr) + 6, (double)val);
+  // CHECK: st.global.cs.v2.s8
+  { char2 v = {(char)val, (char)val}; __stcs(((char2 *)ptr) + 7, v); }
+  // CHECK: st.global.cs.v4.s8
+  { char4 v = {(char)val, (char)val, (char)val, (char)val}; __stcs(((char4 
*)ptr) + 8, v); }
+  // CHECK: st.global.cs.v2.s16
+  { short2 v = {(short)val, (short)val}; __stcs(((short2 *)ptr) + 9, v); }
+  // CHECK: st.global.cs.v4.s16
+  { short4 v = {(short)val, (short)val, (short)val, (short)val}; 
__stcs(((short4 *)ptr) + 10, v); }
+  // CHECK: st.global.cs.v2.s32
+  { int2 v = {(int)val, (int)val}; __stcs(((int2 *)ptr) + 11, v); }
+  // CHECK: st.global.cs.v4.s32
+  { int4 v = {(int)val, (int)val, (int)val, (int)val}; __stcs(((int4 *)ptr) + 
12, v); }
+  // CHECK: st.global.cs.v2.s64
+  { longlong2 v = {(long long)val, (long long)val}; __stcs(((longlong2 *)ptr) 
+ 13, v); }
+  // CHECK: st.global.cs.v2.u8
+  { uchar2 v = {(unsigned char)val, (unsigned char)val}; __stcs(((uchar2 
*)ptr) + 14, v); }
+  // CHECK: st.global.cs.v4.u8
+  { uchar4 v = {(unsigned char)val, (unsigned char)val, (unsigned char)val, 
(unsigned char)val}; __stcs(((uchar4 *)ptr) + 15, v); }
+  // CHECK: st.global.cs.v2.u16
+  { ushort2 v = {(unsigned short)val, (unsigned short)val}; __stcs(((ushort2 
*)ptr) + 16, v); }
+  // CHECK: st.global.cs.v4.u16
+  { ushort4 v = {(unsigned short)val, (unsigned short)val, (unsigned 
short)val, (unsigned short)val}; __stcs(((ushort4 *)ptr) + 17, v); }
+  // CHECK: st.global.cs.v2.u32
+  { uint2 v = {(unsigned int)val, (unsigned int)val}; __stcs(((uint2 *)ptr) + 
18, v); }
+  // CHECK: st.global.cs.v4.u32
+  { uint4 v = {(unsigned int)val, (unsigned int)val, (unsigned int)val, 
(unsigned int)val}; __stcs(((uint4 *)ptr) + 19, v); }
+  // CHECK: st.global.cs.v2.u64
+  { ulonglong2 v = {(unsigned long long)val, (unsigned long long)val}; 
__stcs(((ulonglong2 *)ptr) + 20, v); }
+  // CHECK: st.global.cs.v2.f32
+  { float2 v = {(float)val, (float)val}; __stcs(((float2 *)ptr) + 21, v); }
+  // CHECK: st.global.cs.v4.f32
+  { float4 v = {(float)val, (float)val, (float)val, (float)val}; 
__stcs(((float4 *)ptr) + 22, v); }
+  // CHECK: st.global.cs.v2.f64
+  { double2 v = {(double)val, (double)val}; __stcs(((double2 *)ptr) + 23, v); }
+}

>From 22c88c70ccdaa2e1bf20e7cd94143a005b0ec97e Mon Sep 17 00:00:00 2001
From: Artem Belevich <[email protected]>
Date: Tue, 31 Mar 2026 17:15:30 -0700
Subject: [PATCH 3/6] [CUDA] Apply clang-format to __clang_cuda_intrinsics.h

---
 clang/lib/Headers/__clang_cuda_intrinsics.h | 115 ++++++++------------
 1 file changed, 47 insertions(+), 68 deletions(-)

diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h 
b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 9507b6ce59e82..4c34f742f9a81 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -528,19 +528,19 @@ inline __device__ unsigned __funnelshift_rc(unsigned 
low32, unsigned high32,
   inline __device__ long __ld##__Mode(const long *__ptr) {                     
\
     if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) {                             
\
       return (long)__ld##__Mode((const long long *)__ptr);                     
\
-    } else {                                                                  \
-      return (long)__ld##__Mode((const int *)__ptr);                           
  \
-    }                                                                         \
+    } else {                                                                   
\
+      return (long)__ld##__Mode((const int *)__ptr);                           
\
+    }                                                                          
\
   }
 
 #pragma push_macro("__INTRINSIC_LOAD_ULONG")
-#define __INTRINSIC_LOAD_ULONG(__Mode)                                         
 \
+#define __INTRINSIC_LOAD_ULONG(__Mode)                                         
\
   inline __device__ unsigned long __ld##__Mode(const unsigned long *__ptr) {   
\
     if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) {                             
\
       return (unsigned long)__ld##__Mode((const unsigned long long *)__ptr);   
\
-    } else {                                                                  \
+    } else {                                                                   
\
       return (unsigned long)__ld##__Mode((const unsigned int *)__ptr);         
\
-    }                                                                         \
+    }                                                                          
\
   }
 
 #pragma push_macro("__INTRINSIC_LOAD_FAMILY")
@@ -569,11 +569,11 @@ inline __device__ unsigned __funnelshift_rc(unsigned 
low32, unsigned high32,
                     "=r", __Clobber)                                           
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s64", longlong2,   
\
                     longlong2, "=l", __Clobber)                                
\
-  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u8", unsigned char,   \
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u8", unsigned char,    
\
                    unsigned int, "=r", __Clobber)                              
\
-  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u16", unsigned short, \
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u16", unsigned short,  
\
                    unsigned short, "=h", __Clobber)                            
\
-  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u32", unsigned int,   \
+  __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u32", unsigned int,    
\
                    unsigned int, "=r", __Clobber)                              
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u64",                  
\
                    unsigned long long, unsigned long long, "=l", __Clobber)    
\
@@ -589,7 +589,7 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, 
unsigned high32,
                     uint2, "=r", __Clobber)                                    
\
   __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u32", uint4,       
\
                     uint4, "=r", __Clobber)                                    
\
-  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u64", ulonglong2, \
+  __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u64", ulonglong2,  
\
                     ulonglong2, "=l", __Clobber)                               
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f32", float, float,    
\
                    "=f", __Clobber)                                            
\
@@ -604,29 +604,14 @@ inline __device__ unsigned __funnelshift_rc(unsigned 
low32, unsigned high32,
   __INTRINSIC_LOAD_LONG(__Mode)                                                
\
   __INTRINSIC_LOAD_ULONG(__Mode)
 
-
-
-
 __INTRINSIC_LOAD_FAMILY(cg, )
 __INTRINSIC_LOAD_FAMILY(ca, )
 
-
-
-
-
 __INTRINSIC_LOAD_FAMILY(cv, : "memory")
 __INTRINSIC_LOAD_FAMILY(lu, : "memory")
 
-
-
-
-
-
 __INTRINSIC_LOAD_FAMILY(cs, )
 
-
-
-
 #pragma pop_macro("__INTRINSIC_LOAD")
 #pragma pop_macro("__INTRINSIC_LOAD2")
 #pragma pop_macro("__INTRINSIC_LOAD4")
@@ -634,9 +619,6 @@ __INTRINSIC_LOAD_FAMILY(cs, )
 #pragma pop_macro("__INTRINSIC_LOAD_LONG")
 #pragma pop_macro("__INTRINSIC_LOAD_ULONG")
 
-
-
-
 #pragma push_macro("__INTRINSIC_STORE")
 #define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) 
\
   inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     
\
@@ -673,23 +655,24 @@ __INTRINSIC_LOAD_FAMILY(cs, )
   }
 
 #pragma push_macro("__INTRINSIC_STORE_LONG")
-#define __INTRINSIC_STORE_LONG(__Mode)                                         
 \
-  inline __device__ void __st##__Mode(long *__ptr, long __value) {             
 \
+#define __INTRINSIC_STORE_LONG(__Mode)                                         
\
+  inline __device__ void __st##__Mode(long *__ptr, long __value) {             
\
     if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) {                             
\
-      __st##__Mode((long long *)__ptr, (long long)__value);                    
 \
-    } else {                                                                  \
-      __st##__Mode((int *)__ptr, (int)__value);                                
 \
-    }                                                                         \
+      __st##__Mode((long long *)__ptr, (long long)__value);                    
\
+    } else {                                                                   
\
+      __st##__Mode((int *)__ptr, (int)__value);                                
\
+    }                                                                          
\
   }
 
 #pragma push_macro("__INTRINSIC_STORE_ULONG")
-#define __INTRINSIC_STORE_ULONG(__Mode)                                        
  \
-  inline __device__ void __st##__Mode(unsigned long *__ptr, unsigned long 
__value) { \
+#define __INTRINSIC_STORE_ULONG(__Mode)                                        
\
+  inline __device__ void __st##__Mode(unsigned long *__ptr,                    
\
+                                      unsigned long __value) {                 
\
     if (__SIZEOF_LONG__ == __SIZEOF_LONG_LONG__) {                             
\
-      __st##__Mode((unsigned long long *)__ptr, (unsigned long long)__value);  
 \
-    } else {                                                                  \
-      __st##__Mode((unsigned int *)__ptr, (unsigned int)__value);              
   \
-    }                                                                         \
+      __st##__Mode((unsigned long long *)__ptr, (unsigned long long)__value);  
\
+    } else {                                                                   
\
+      __st##__Mode((unsigned int *)__ptr, (unsigned int)__value);              
\
+    }                                                                          
\
   }
 
 #pragma push_macro("__INTRINSIC_STORE_FAMILY")
@@ -700,20 +683,20 @@ __INTRINSIC_LOAD_FAMILY(cs, )
   __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s16", short, short,   
\
                     "h")                                                       
\
   __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s32", int, int, "r")  
\
-  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s64", long long,      
 \
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".s64", long long,      
\
                     long long, "l")                                            
\
-  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s8", char2,       
\
-                     int2, "r")                                                
\
-  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s8", char4,       
\
-                     int4, "r")                                                
\
-  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s16", short2,     
 \
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s8", char2, int2, 
\
+                     "r")                                                      
\
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s8", char4, int4, 
\
+                     "r")                                                      
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s16", short2,     
\
                      short2, "h")                                              
\
-  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s16", short4,     
 \
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s16", short4,     
\
                      short4, "h")                                              
\
-  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s32", int2,       
\
-                     int2, "r")                                                
\
-  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s32", int4,       
\
-                     int4, "r")                                                
\
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s32", int2, int2, 
\
+                     "r")                                                      
\
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.s32", int4, int4, 
\
+                     "r")                                                      
\
   __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.s64", longlong2,  
\
                      longlong2, "l")                                           
\
   __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u8", unsigned char,   
\
@@ -724,42 +707,38 @@ __INTRINSIC_LOAD_FAMILY(cs, )
                     unsigned int, "r")                                         
\
   __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".u64",                 
\
                     unsigned long long, unsigned long long, "l")               
\
-  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u8", uchar2,      
 \
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u8", uchar2,      
\
                      uchar2, "r")                                              
\
-  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u8", uchar4,      
 \
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u8", uchar4,      
\
                      uint4, "r")                                               
\
-  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u16", ushort2,    
 \
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u16", ushort2,    
\
                      ushort2, "h")                                             
\
-  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u16", ushort4,    
 \
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u16", ushort4,    
\
                      ushort4, "h")                                             
\
-  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u32", uint2,      
 \
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u32", uint2,      
\
                      uint2, "r")                                               
\
-  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u32", uint4,      
 \
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.u32", uint4,      
\
                      uint4, "r")                                               
\
   __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.u64", ulonglong2, 
\
                      ulonglong2, "l")                                          
\
-  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f32", float, float,   
 \
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f32", float, float,   
\
                     "f")                                                       
\
-  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f64", double, double, 
 \
+  __INTRINSIC_STORE(__st##__Mode, "st.global." #__Mode ".f64", double, double, 
\
                     "d")                                                       
\
-  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f32", float2,     
 \
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f32", float2,     
\
                      float2, "f")                                              
\
-  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.f32", float4,     
 \
+  __INTRINSIC_STORE4(__st##__Mode, "st.global." #__Mode ".v4.f32", float4,     
\
                      float4, "f")                                              
\
-  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f64", double2,    
 \
-                     double2, "d")                                             
 \
-  __INTRINSIC_STORE_LONG(__Mode)                                               
 \
+  __INTRINSIC_STORE2(__st##__Mode, "st.global." #__Mode ".v2.f64", double2,    
\
+                     double2, "d")                                             
\
+  __INTRINSIC_STORE_LONG(__Mode)                                               
\
   __INTRINSIC_STORE_ULONG(__Mode)
 
-
 __INTRINSIC_STORE_FAMILY(wt)
 __INTRINSIC_STORE_FAMILY(wb)
 __INTRINSIC_STORE_FAMILY(cg)
 __INTRINSIC_STORE_FAMILY(cs)
 
-
-
-
 #pragma pop_macro("__INTRINSIC_STORE")
 #pragma pop_macro("__INTRINSIC_STORE2")
 #pragma pop_macro("__INTRINSIC_STORE4")

>From 86bd3f4f2f2666475fabf1f07785aaec7447bce4 Mon Sep 17 00:00:00 2001
From: Artem Belevich <[email protected]>
Date: Tue, 31 Mar 2026 17:17:05 -0700
Subject: [PATCH 4/6] minor cleanup

---
 clang/lib/Headers/__clang_cuda_intrinsics.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h 
b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 4c34f742f9a81..1ceb540fbd746 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -604,20 +604,18 @@ inline __device__ unsigned __funnelshift_rc(unsigned 
low32, unsigned high32,
   __INTRINSIC_LOAD_LONG(__Mode)                                                
\
   __INTRINSIC_LOAD_ULONG(__Mode)
 
-__INTRINSIC_LOAD_FAMILY(cg, )
-__INTRINSIC_LOAD_FAMILY(ca, )
-
+__INTRINSIC_LOAD_FAMILY(ca, /* no clobber */)
+__INTRINSIC_LOAD_FAMILY(cg, /* no clobber */)
+__INTRINSIC_LOAD_FAMILY(cs, /* no clobber */)
 __INTRINSIC_LOAD_FAMILY(cv, : "memory")
 __INTRINSIC_LOAD_FAMILY(lu, : "memory")
 
-__INTRINSIC_LOAD_FAMILY(cs, )
-
 #pragma pop_macro("__INTRINSIC_LOAD")
 #pragma pop_macro("__INTRINSIC_LOAD2")
 #pragma pop_macro("__INTRINSIC_LOAD4")
-#pragma pop_macro("__INTRINSIC_LOAD_FAMILY")
 #pragma pop_macro("__INTRINSIC_LOAD_LONG")
 #pragma pop_macro("__INTRINSIC_LOAD_ULONG")
+#pragma pop_macro("__INTRINSIC_LOAD_FAMILY")
 
 #pragma push_macro("__INTRINSIC_STORE")
 #define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) 
\
@@ -734,17 +732,17 @@ __INTRINSIC_LOAD_FAMILY(cs, )
   __INTRINSIC_STORE_LONG(__Mode)                                               
\
   __INTRINSIC_STORE_ULONG(__Mode)
 
-__INTRINSIC_STORE_FAMILY(wt)
-__INTRINSIC_STORE_FAMILY(wb)
 __INTRINSIC_STORE_FAMILY(cg)
 __INTRINSIC_STORE_FAMILY(cs)
+__INTRINSIC_STORE_FAMILY(wb)
+__INTRINSIC_STORE_FAMILY(wt)
 
 #pragma pop_macro("__INTRINSIC_STORE")
 #pragma pop_macro("__INTRINSIC_STORE2")
 #pragma pop_macro("__INTRINSIC_STORE4")
-#pragma pop_macro("__INTRINSIC_STORE_FAMILY")
 #pragma pop_macro("__INTRINSIC_STORE_LONG")
 #pragma pop_macro("__INTRINSIC_STORE_ULONG")
+#pragma pop_macro("__INTRINSIC_STORE_FAMILY")
 
 #endif // defined(__cplusplus) && (__cplusplus >= 201103L)
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

>From 421c087a144fca1d30710be61d0ba957a3499e95 Mon Sep 17 00:00:00 2001
From: Artem Belevich <[email protected]>
Date: Wed, 1 Apr 2026 11:19:19 -0700
Subject: [PATCH 5/6] [Clang][CUDA] Refactor and consolidate CUDA load/store
 intrinsics

- Parameterize volatile for load macros
- Use __asm__ __volatile__ for cached loads (ca, cg, cs)
- Use plain __asm__ with memory clobbers for uncached loads
- Replace plain asm with __asm__ for consistency
- Add support for unsigned long
- Update test suite to verify loads are preserved
---
 clang/lib/Headers/__clang_cuda_intrinsics.h | 134 +++++++++---------
 clang/test/Headers/cuda_intrinsics.cu       | 144 ++++++++++----------
 2 files changed, 140 insertions(+), 138 deletions(-)

diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h 
b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 1ceb540fbd746..2412f9e51ffbf 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -449,33 +449,33 @@ inline __device__ double2 __ldg(const double2 *ptr) {
 inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
   unsigned result;
-  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
-      : "=r"(result)
-      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  __asm__("shf.l.wrap.b32 %0, %1, %2, %3;"
+          : "=r"(result)
+          : "r"(low32), "r"(high32), "r"(shiftWidth));
   return result;
 }
 inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                             unsigned shiftWidth) {
   unsigned result;
-  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
-      : "=r"(result)
-      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  __asm__("shf.l.clamp.b32 %0, %1, %2, %3;"
+          : "=r"(result)
+          : "r"(low32), "r"(high32), "r"(shiftWidth));
   return result;
 }
 inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
   unsigned result;
-  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
-      : "=r"(result)
-      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  __asm__("shf.r.wrap.b32 %0, %1, %2, %3;"
+          : "=r"(result)
+          : "r"(low32), "r"(high32), "r"(shiftWidth));
   return result;
 }
 inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                             unsigned shiftWidth) {
   unsigned ret;
-  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
-      : "=r"(ret)
-      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  __asm__("shf.r.clamp.b32 %0, %1, %2, %3;"
+          : "=r"(ret)
+          : "r"(low32), "r"(high32), "r"(shiftWidth));
   return ret;
 }
 
@@ -483,38 +483,40 @@ inline __device__ unsigned __funnelshift_rc(unsigned 
low32, unsigned high32,
 
 #pragma push_macro("__INTRINSIC_LOAD")
 #define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType,  
\
-                         __Clobber)                                            
\
+                         __Volatile, __Clobber)                                
\
   inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             
\
     __TmpType __ret;                                                           
\
-    asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber);        
\
+    __asm__ __Volatile(__AsmOp " %0, [%1];"                                    
\
+                       : __AsmType(__ret)                                      
\
+                       : "l"(__ptr)__Clobber);                                 
\
     return (__DeclType)__ret;                                                  
\
   }
 
 #pragma push_macro("__INTRINSIC_LOAD2")
 #define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, 
\
-                          __Clobber)                                           
\
+                          __Volatile, __Clobber)                               
\
   inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             
\
     __DeclType __ret;                                                          
\
     __TmpType __tmp;                                                           
\
-    asm(__AsmOp " {%0,%1}, [%2];"                                              
\
-        : __AsmType(__tmp.x), __AsmType(__tmp.y)                               
\
-        : "l"(__ptr)__Clobber);                                                
\
+    __asm__ __Volatile(__AsmOp " {%0,%1}, [%2];"                               
\
+                       : __AsmType(__tmp.x), __AsmType(__tmp.y)                
\
+                       : "l"(__ptr)__Clobber);                                 
\
     using __ElementType = decltype(__ret.x);                                   
\
-    __ret.x = (__ElementType)(__tmp.x);                                        
\
+    __ret.x = (__ElementType)__tmp.x;                                          
\
     __ret.y = (__ElementType)__tmp.y;                                          
\
     return __ret;                                                              
\
   }
 
 #pragma push_macro("__INTRINSIC_LOAD4")
 #define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, 
\
-                          __Clobber)                                           
\
+                          __Volatile, __Clobber)                               
\
   inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             
\
     __DeclType __ret;                                                          
\
     __TmpType __tmp;                                                           
\
-    asm(__AsmOp " {%0,%1,%2,%3}, [%4];"                                        
\
-        : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z),          
\
-          __AsmType(__tmp.w)                                                   
\
-        : "l"(__ptr)__Clobber);                                                
\
+    __asm__ __Volatile(__AsmOp " {%0,%1,%2,%3}, [%4];"                         
\
+                       : __AsmType(__tmp.x), __AsmType(__tmp.y),               
\
+                         __AsmType(__tmp.z), __AsmType(__tmp.w)                
\
+                       : "l"(__ptr)__Clobber);                                 
\
     using __ElementType = decltype(__ret.x);                                   
\
     __ret.x = (__ElementType)__tmp.x;                                          
\
     __ret.y = (__ElementType)__tmp.y;                                          
\
@@ -543,72 +545,72 @@ inline __device__ unsigned __funnelshift_rc(unsigned 
low32, unsigned high32,
     }                                                                          
\
   }
 
-#pragma push_macro("__INTRINSIC_LOAD_FAMILY")
-#define __INTRINSIC_LOAD_FAMILY(__Mode, __Clobber)                             
\
+#define __INTRINSIC_LOAD_FAMILY(__Mode, __Volatile, __Clobber)                 
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s8", char,             
\
-                   unsigned int, "=r", __Clobber)                              
\
+                   unsigned int, "=r", __Volatile, __Clobber)                  
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s8", signed char,      
\
-                   unsigned int, "=r", __Clobber)                              
\
+                   unsigned int, "=r", __Volatile, __Clobber)                  
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s16", short,           
\
-                   unsigned short, "=h", __Clobber)                            
\
+                   unsigned short, "=h", __Volatile, __Clobber)                
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s32", int,             
\
-                   unsigned int, "=r", __Clobber)                              
\
+                   unsigned int, "=r", __Volatile, __Clobber)                  
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".s64", long long,       
\
-                   unsigned long long, "=l", __Clobber)                        
\
+                   unsigned long long, "=l", __Volatile, __Clobber)            
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s8", char2, int2,  
\
-                    "=r", __Clobber)                                           
\
+                    "=r", __Volatile, __Clobber)                               
\
   __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s8", char4, int4,  
\
-                    "=r", __Clobber)                                           
\
+                    "=r", __Volatile, __Clobber)                               
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s16", short2,      
\
-                    short2, "=h", __Clobber)                                   
\
+                    short2, "=h", __Volatile, __Clobber)                       
\
   __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s16", short4,      
\
-                    short4, "=h", __Clobber)                                   
\
+                    short4, "=h", __Volatile, __Clobber)                       
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s32", int2, int2,  
\
-                    "=r", __Clobber)                                           
\
+                    "=r", __Volatile, __Clobber)                               
\
   __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.s32", int4, int4,  
\
-                    "=r", __Clobber)                                           
\
+                    "=r", __Volatile, __Clobber)                               
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.s64", longlong2,   
\
-                    longlong2, "=l", __Clobber)                                
\
+                    longlong2, "=l", __Volatile, __Clobber)                    
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u8", unsigned char,    
\
-                   unsigned int, "=r", __Clobber)                              
\
+                   unsigned int, "=r", __Volatile, __Clobber)                  
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u16", unsigned short,  
\
-                   unsigned short, "=h", __Clobber)                            
\
+                   unsigned short, "=h", __Volatile, __Clobber)                
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u32", unsigned int,    
\
-                   unsigned int, "=r", __Clobber)                              
\
+                   unsigned int, "=r", __Volatile, __Clobber)                  
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".u64",                  
\
-                   unsigned long long, unsigned long long, "=l", __Clobber)    
\
+                   unsigned long long, unsigned long long, "=l", __Volatile,   
\
+                   __Clobber)                                                  
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u8", uchar2,       
\
-                    uint2, "=r", __Clobber)                                    
\
+                    uint2, "=r", __Volatile, __Clobber)                        
\
   __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u8", uchar4,       
\
-                    uint4, "=r", __Clobber)                                    
\
+                    uint4, "=r", __Volatile, __Clobber)                        
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u16", ushort2,     
\
-                    ushort2, "=h", __Clobber)                                  
\
+                    ushort2, "=h", __Volatile, __Clobber)                      
\
   __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u16", ushort4,     
\
-                    ushort4, "=h", __Clobber)                                  
\
+                    ushort4, "=h", __Volatile, __Clobber)                      
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u32", uint2,       
\
-                    uint2, "=r", __Clobber)                                    
\
+                    uint2, "=r", __Volatile, __Clobber)                        
\
   __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.u32", uint4,       
\
-                    uint4, "=r", __Clobber)                                    
\
+                    uint4, "=r", __Volatile, __Clobber)                        
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.u64", ulonglong2,  
\
-                    ulonglong2, "=l", __Clobber)                               
\
+                    ulonglong2, "=l", __Volatile, __Clobber)                   
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f32", float, float,    
\
-                   "=f", __Clobber)                                            
\
+                   "=f", __Volatile, __Clobber)                                
\
   __INTRINSIC_LOAD(__ld##__Mode, "ld.global." #__Mode ".f64", double, double,  
\
-                   "=d", __Clobber)                                            
\
+                   "=d", __Volatile, __Clobber)                                
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.f32", float2,      
\
-                    float2, "=f", __Clobber)                                   
\
+                    float2, "=f", __Volatile, __Clobber)                       
\
   __INTRINSIC_LOAD4(__ld##__Mode, "ld.global." #__Mode ".v4.f32", float4,      
\
-                    float4, "=f", __Clobber)                                   
\
+                    float4, "=f", __Volatile, __Clobber)                       
\
   __INTRINSIC_LOAD2(__ld##__Mode, "ld.global." #__Mode ".v2.f64", double2,     
\
-                    double2, "=d", __Clobber)                                  
\
+                    double2, "=d", __Volatile, __Clobber)                      
\
   __INTRINSIC_LOAD_LONG(__Mode)                                                
\
   __INTRINSIC_LOAD_ULONG(__Mode)
 
-__INTRINSIC_LOAD_FAMILY(ca, /* no clobber */)
-__INTRINSIC_LOAD_FAMILY(cg, /* no clobber */)
-__INTRINSIC_LOAD_FAMILY(cs, /* no clobber */)
-__INTRINSIC_LOAD_FAMILY(cv, : "memory")
-__INTRINSIC_LOAD_FAMILY(lu, : "memory")
+__INTRINSIC_LOAD_FAMILY(ca, __volatile__, /* no clobber */)
+__INTRINSIC_LOAD_FAMILY(cg, __volatile__, /* no clobber */)
+__INTRINSIC_LOAD_FAMILY(cs, __volatile__, /* no clobber */)
+__INTRINSIC_LOAD_FAMILY(cv, /* not volatile */, : "memory")
+__INTRINSIC_LOAD_FAMILY(lu, /* not volatile */, : "memory")
 
 #pragma pop_macro("__INTRINSIC_LOAD")
 #pragma pop_macro("__INTRINSIC_LOAD2")
@@ -621,7 +623,7 @@ __INTRINSIC_LOAD_FAMILY(lu, : "memory")
 #define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) 
\
   inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     
\
     __TmpType __tmp = (__TmpType)__value;                                      
\
-    asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory");       
\
+    __asm__(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory");   
\
   }
 
 #pragma push_macro("__INTRINSIC_STORE2")
@@ -632,9 +634,9 @@ __INTRINSIC_LOAD_FAMILY(lu, : "memory")
     using __ElementType = decltype(__tmp.x);                                   
\
     __tmp.x = (__ElementType)(__value.x);                                      
\
     __tmp.y = (__ElementType)(__value.y);                                      
\
-    asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x),            
\
-        __AsmType(__tmp.y)                                                     
\
-        : "memory");                                                           
\
+    __asm__(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x),        
\
+            __AsmType(__tmp.y)                                                 
\
+            : "memory");                                                       
\
   }
 
 #pragma push_macro("__INTRINSIC_STORE4")
@@ -647,9 +649,9 @@ __INTRINSIC_LOAD_FAMILY(lu, : "memory")
     __tmp.y = (__ElementType)(__value.y);                                      
\
     __tmp.z = (__ElementType)(__value.z);                                      
\
     __tmp.w = (__ElementType)(__value.w);                                      
\
-    asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x),      
\
-        __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w)             
\
-        : "memory");                                                           
\
+    __asm__(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x),  
\
+            __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w)         
\
+            : "memory");                                                       
\
   }
 
 #pragma push_macro("__INTRINSIC_STORE_LONG")
diff --git a/clang/test/Headers/cuda_intrinsics.cu 
b/clang/test/Headers/cuda_intrinsics.cu
index 68e2d931acd79..ed082f54a28dc 100644
--- a/clang/test/Headers/cuda_intrinsics.cu
+++ b/clang/test/Headers/cuda_intrinsics.cu
@@ -28,53 +28,53 @@ struct double2 { double x, y; };
 __device__ void test_loads_cg(void *ptr) {
   // CHECK-LABEL: .func _Z13test_loads_cgPv
   // CHECK: ld.global.cg.s8
-  volatile char v_0 = __ldcg(((const char *)ptr) + 0);
+  __ldcg(((const char *)ptr) + 0);
   // CHECK: ld.global.cg.s32
-  volatile int v_1 = __ldcg(((const int *)ptr) + 1);
+  __ldcg(((const int *)ptr) + 1);
   // CHECK: ld.global.cg.s64
-  volatile long v_2 = __ldcg(((const long *)ptr) + 2);
+  __ldcg(((const long *)ptr) + 2);
   // CHECK: ld.global.cg.u64
-  volatile unsigned long v_3 = __ldcg(((const unsigned long *)ptr) + 3);
+  __ldcg(((const unsigned long *)ptr) + 3);
   // CHECK: ld.global.cg.s64
-  volatile long long v_4 = __ldcg(((const long long *)ptr) + 4);
+  __ldcg(((const long long *)ptr) + 4);
   // CHECK: ld.global.cg.f32
-  volatile float v_5 = __ldcg(((const float *)ptr) + 5);
+  __ldcg(((const float *)ptr) + 5);
   // CHECK: ld.global.cg.f64
-  volatile double v_6 = __ldcg(((const double *)ptr) + 6);
+  __ldcg(((const double *)ptr) + 6);
   // CHECK: ld.global.cg.v2.s8
-  volatile char2 v_7 = __ldcg(((const char2 *)ptr) + 7);
+  __ldcg(((const char2 *)ptr) + 7);
   // CHECK: ld.global.cg.v4.s8
-  volatile char4 v_8 = __ldcg(((const char4 *)ptr) + 8);
+  __ldcg(((const char4 *)ptr) + 8);
   // CHECK: ld.global.cg.v2.s16
-  volatile short2 v_9 = __ldcg(((const short2 *)ptr) + 9);
+  __ldcg(((const short2 *)ptr) + 9);
   // CHECK: ld.global.cg.v4.s16
-  volatile short4 v_10 = __ldcg(((const short4 *)ptr) + 10);
+  __ldcg(((const short4 *)ptr) + 10);
   // CHECK: ld.global.cg.v2.s32
-  volatile int2 v_11 = __ldcg(((const int2 *)ptr) + 11);
+  __ldcg(((const int2 *)ptr) + 11);
   // CHECK: ld.global.cg.v4.s32
-  volatile int4 v_12 = __ldcg(((const int4 *)ptr) + 12);
+  __ldcg(((const int4 *)ptr) + 12);
   // CHECK: ld.global.cg.v2.s64
-  volatile longlong2 v_13 = __ldcg(((const longlong2 *)ptr) + 13);
+  __ldcg(((const longlong2 *)ptr) + 13);
   // CHECK: ld.global.cg.v2.u8
-  volatile uchar2 v_14 = __ldcg(((const uchar2 *)ptr) + 14);
+  __ldcg(((const uchar2 *)ptr) + 14);
   // CHECK: ld.global.cg.v4.u8
-  volatile uchar4 v_15 = __ldcg(((const uchar4 *)ptr) + 15);
+  __ldcg(((const uchar4 *)ptr) + 15);
   // CHECK: ld.global.cg.v2.u16
-  volatile ushort2 v_16 = __ldcg(((const ushort2 *)ptr) + 16);
+  __ldcg(((const ushort2 *)ptr) + 16);
   // CHECK: ld.global.cg.v4.u16
-  volatile ushort4 v_17 = __ldcg(((const ushort4 *)ptr) + 17);
+  __ldcg(((const ushort4 *)ptr) + 17);
   // CHECK: ld.global.cg.v2.u32
-  volatile uint2 v_18 = __ldcg(((const uint2 *)ptr) + 18);
+  __ldcg(((const uint2 *)ptr) + 18);
   // CHECK: ld.global.cg.v4.u32
-  volatile uint4 v_19 = __ldcg(((const uint4 *)ptr) + 19);
+  __ldcg(((const uint4 *)ptr) + 19);
   // CHECK: ld.global.cg.v2.u64
-  volatile ulonglong2 v_20 = __ldcg(((const ulonglong2 *)ptr) + 20);
+  __ldcg(((const ulonglong2 *)ptr) + 20);
   // CHECK: ld.global.cg.v2.f32
-  volatile float2 v_21 = __ldcg(((const float2 *)ptr) + 21);
+  __ldcg(((const float2 *)ptr) + 21);
   // CHECK: ld.global.cg.v4.f32
-  volatile float4 v_22 = __ldcg(((const float4 *)ptr) + 22);
+  __ldcg(((const float4 *)ptr) + 22);
   // CHECK: ld.global.cg.v2.f64
-  volatile double2 v_23 = __ldcg(((const double2 *)ptr) + 23);
+  __ldcg(((const double2 *)ptr) + 23);
 }
 __device__ void test_loads_cv(void *ptr) {
   // CHECK-LABEL: .func _Z13test_loads_cvPv
@@ -130,104 +130,104 @@ __device__ void test_loads_cv(void *ptr) {
 __device__ void test_loads_cs(void *ptr) {
   // CHECK-LABEL: .func _Z13test_loads_csPv
   // CHECK: ld.global.cs.s8
-  volatile char v_0 = __ldcs(((const char *)ptr) + 0);
+  __ldcs(((const char *)ptr) + 0);
   // CHECK: ld.global.cs.s32
-  volatile int v_1 = __ldcs(((const int *)ptr) + 1);
+  __ldcs(((const int *)ptr) + 1);
   // CHECK: ld.global.cs.s64
-  volatile long v_2 = __ldcs(((const long *)ptr) + 2);
+  __ldcs(((const long *)ptr) + 2);
   // CHECK: ld.global.cs.u64
-  volatile unsigned long v_3 = __ldcs(((const unsigned long *)ptr) + 3);
+  __ldcs(((const unsigned long *)ptr) + 3);
   // CHECK: ld.global.cs.s64
-  volatile long long v_4 = __ldcs(((const long long *)ptr) + 4);
+  __ldcs(((const long long *)ptr) + 4);
   // CHECK: ld.global.cs.f32
-  volatile float v_5 = __ldcs(((const float *)ptr) + 5);
+  __ldcs(((const float *)ptr) + 5);
   // CHECK: ld.global.cs.f64
-  volatile double v_6 = __ldcs(((const double *)ptr) + 6);
+  __ldcs(((const double *)ptr) + 6);
   // CHECK: ld.global.cs.v2.s8
-  volatile char2 v_7 = __ldcs(((const char2 *)ptr) + 7);
+  __ldcs(((const char2 *)ptr) + 7);
   // CHECK: ld.global.cs.v4.s8
-  volatile char4 v_8 = __ldcs(((const char4 *)ptr) + 8);
+  __ldcs(((const char4 *)ptr) + 8);
   // CHECK: ld.global.cs.v2.s16
-  volatile short2 v_9 = __ldcs(((const short2 *)ptr) + 9);
+  __ldcs(((const short2 *)ptr) + 9);
   // CHECK: ld.global.cs.v4.s16
-  volatile short4 v_10 = __ldcs(((const short4 *)ptr) + 10);
+  __ldcs(((const short4 *)ptr) + 10);
   // CHECK: ld.global.cs.v2.s32
-  volatile int2 v_11 = __ldcs(((const int2 *)ptr) + 11);
+  __ldcs(((const int2 *)ptr) + 11);
   // CHECK: ld.global.cs.v4.s32
-  volatile int4 v_12 = __ldcs(((const int4 *)ptr) + 12);
+  __ldcs(((const int4 *)ptr) + 12);
   // CHECK: ld.global.cs.v2.s64
-  volatile longlong2 v_13 = __ldcs(((const longlong2 *)ptr) + 13);
+  __ldcs(((const longlong2 *)ptr) + 13);
   // CHECK: ld.global.cs.v2.u8
-  volatile uchar2 v_14 = __ldcs(((const uchar2 *)ptr) + 14);
+  __ldcs(((const uchar2 *)ptr) + 14);
   // CHECK: ld.global.cs.v4.u8
-  volatile uchar4 v_15 = __ldcs(((const uchar4 *)ptr) + 15);
+  __ldcs(((const uchar4 *)ptr) + 15);
   // CHECK: ld.global.cs.v2.u16
-  volatile ushort2 v_16 = __ldcs(((const ushort2 *)ptr) + 16);
+  __ldcs(((const ushort2 *)ptr) + 16);
   // CHECK: ld.global.cs.v4.u16
-  volatile ushort4 v_17 = __ldcs(((const ushort4 *)ptr) + 17);
+  __ldcs(((const ushort4 *)ptr) + 17);
   // CHECK: ld.global.cs.v2.u32
-  volatile uint2 v_18 = __ldcs(((const uint2 *)ptr) + 18);
+  __ldcs(((const uint2 *)ptr) + 18);
   // CHECK: ld.global.cs.v4.u32
-  volatile uint4 v_19 = __ldcs(((const uint4 *)ptr) + 19);
+  __ldcs(((const uint4 *)ptr) + 19);
   // CHECK: ld.global.cs.v2.u64
-  volatile ulonglong2 v_20 = __ldcs(((const ulonglong2 *)ptr) + 20);
+  __ldcs(((const ulonglong2 *)ptr) + 20);
   // CHECK: ld.global.cs.v2.f32
-  volatile float2 v_21 = __ldcs(((const float2 *)ptr) + 21);
+  __ldcs(((const float2 *)ptr) + 21);
   // CHECK: ld.global.cs.v4.f32
-  volatile float4 v_22 = __ldcs(((const float4 *)ptr) + 22);
+  __ldcs(((const float4 *)ptr) + 22);
   // CHECK: ld.global.cs.v2.f64
-  volatile double2 v_23 = __ldcs(((const double2 *)ptr) + 23);
+  __ldcs(((const double2 *)ptr) + 23);
 }
 __device__ void test_loads_ca(void *ptr) {
   // CHECK-LABEL: .func _Z13test_loads_caPv
   // CHECK: ld.global.ca.s8
-  volatile char v_0 = __ldca(((const char *)ptr) + 0);
+  __ldca(((const char *)ptr) + 0);
   // CHECK: ld.global.ca.s32
-  volatile int v_1 = __ldca(((const int *)ptr) + 1);
+  __ldca(((const int *)ptr) + 1);
   // CHECK: ld.global.ca.s64
-  volatile long v_2 = __ldca(((const long *)ptr) + 2);
+  __ldca(((const long *)ptr) + 2);
   // CHECK: ld.global.ca.u64
-  volatile unsigned long v_3 = __ldca(((const unsigned long *)ptr) + 3);
+  __ldca(((const unsigned long *)ptr) + 3);
   // CHECK: ld.global.ca.s64
-  volatile long long v_4 = __ldca(((const long long *)ptr) + 4);
+  __ldca(((const long long *)ptr) + 4);
   // CHECK: ld.global.ca.f32
-  volatile float v_5 = __ldca(((const float *)ptr) + 5);
+  __ldca(((const float *)ptr) + 5);
   // CHECK: ld.global.ca.f64
-  volatile double v_6 = __ldca(((const double *)ptr) + 6);
+  __ldca(((const double *)ptr) + 6);
   // CHECK: ld.global.ca.v2.s8
-  volatile char2 v_7 = __ldca(((const char2 *)ptr) + 7);
+  __ldca(((const char2 *)ptr) + 7);
   // CHECK: ld.global.ca.v4.s8
-  volatile char4 v_8 = __ldca(((const char4 *)ptr) + 8);
+  __ldca(((const char4 *)ptr) + 8);
   // CHECK: ld.global.ca.v2.s16
-  volatile short2 v_9 = __ldca(((const short2 *)ptr) + 9);
+  __ldca(((const short2 *)ptr) + 9);
   // CHECK: ld.global.ca.v4.s16
-  volatile short4 v_10 = __ldca(((const short4 *)ptr) + 10);
+  __ldca(((const short4 *)ptr) + 10);
   // CHECK: ld.global.ca.v2.s32
-  volatile int2 v_11 = __ldca(((const int2 *)ptr) + 11);
+  __ldca(((const int2 *)ptr) + 11);
   // CHECK: ld.global.ca.v4.s32
-  volatile int4 v_12 = __ldca(((const int4 *)ptr) + 12);
+  __ldca(((const int4 *)ptr) + 12);
   // CHECK: ld.global.ca.v2.s64
-  volatile longlong2 v_13 = __ldca(((const longlong2 *)ptr) + 13);
+  __ldca(((const longlong2 *)ptr) + 13);
   // CHECK: ld.global.ca.v2.u8
-  volatile uchar2 v_14 = __ldca(((const uchar2 *)ptr) + 14);
+  __ldca(((const uchar2 *)ptr) + 14);
   // CHECK: ld.global.ca.v4.u8
-  volatile uchar4 v_15 = __ldca(((const uchar4 *)ptr) + 15);
+  __ldca(((const uchar4 *)ptr) + 15);
   // CHECK: ld.global.ca.v2.u16
-  volatile ushort2 v_16 = __ldca(((const ushort2 *)ptr) + 16);
+  __ldca(((const ushort2 *)ptr) + 16);
   // CHECK: ld.global.ca.v4.u16
-  volatile ushort4 v_17 = __ldca(((const ushort4 *)ptr) + 17);
+  __ldca(((const ushort4 *)ptr) + 17);
   // CHECK: ld.global.ca.v2.u32
-  volatile uint2 v_18 = __ldca(((const uint2 *)ptr) + 18);
+  __ldca(((const uint2 *)ptr) + 18);
   // CHECK: ld.global.ca.v4.u32
-  volatile uint4 v_19 = __ldca(((const uint4 *)ptr) + 19);
+  __ldca(((const uint4 *)ptr) + 19);
   // CHECK: ld.global.ca.v2.u64
-  volatile ulonglong2 v_20 = __ldca(((const ulonglong2 *)ptr) + 20);
+  __ldca(((const ulonglong2 *)ptr) + 20);
   // CHECK: ld.global.ca.v2.f32
-  volatile float2 v_21 = __ldca(((const float2 *)ptr) + 21);
+  __ldca(((const float2 *)ptr) + 21);
   // CHECK: ld.global.ca.v4.f32
-  volatile float4 v_22 = __ldca(((const float4 *)ptr) + 22);
+  __ldca(((const float4 *)ptr) + 22);
   // CHECK: ld.global.ca.v2.f64
-  volatile double2 v_23 = __ldca(((const double2 *)ptr) + 23);
+  __ldca(((const double2 *)ptr) + 23);
 }
 __device__ void test_loads_lu(void *ptr) {
   // CHECK-LABEL: .func _Z13test_loads_luPv

>From 88302173f8502c51bf7496fcba4247999e5f5e1d Mon Sep 17 00:00:00 2001
From: Artem Belevich <[email protected]>
Date: Wed, 1 Apr 2026 11:46:16 -0700
Subject: [PATCH 6/6] Add REQUIRES: nvptx-registered-target to test and move to
 top

---
 clang/test/Headers/cuda_intrinsics.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/test/Headers/cuda_intrinsics.cu 
b/clang/test/Headers/cuda_intrinsics.cu
index ed082f54a28dc..fb36a53a14410 100644
--- a/clang/test/Headers/cuda_intrinsics.cu
+++ b/clang/test/Headers/cuda_intrinsics.cu
@@ -1,3 +1,4 @@
+// REQUIRES: nvptx-registered-target
 // RUN: %clang++ -O1 -S --cuda-device-only --offload-arch=sm_32 -nocudalib 
-nocudainc %s -o - | FileCheck %s
 
 #define __device__ __attribute__((device))

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to