https://github.com/kevinsala updated 
https://github.com/llvm/llvm-project/pull/199483

>From 56b8d64b07d5710019223928518b432739162ad2 Mon Sep 17 00:00:00 2001
From: Kevin Sala <[email protected]>
Date: Tue, 12 May 2026 21:09:21 -0700
Subject: [PATCH 1/3] [offload][OpenMP] Add strict flags for blocks and threads
 in kernel arguments

Until now, strict behavior in the number of threads and blocks has been
applied only when the kernel is in bare mode. When this mode is enabled,
the values passed in UserNumBlocks and UserThreadLimit are not adjusted
and are the values used to launch the kernel. This commit detaches the
strictness from the kernel mode.

This is going to be used by the kernel replay tool. Additionally, it
paves the path for the upcoming OpenMP dims modifier, used to configure
multidimensional teams and leagues, which will include strictness choices
for teams and threads.

The bare kernels must indicate strict behavior. Asserts are added to
check this condition.
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |  3 +-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  8 ++-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  7 ++-
 offload/include/Shared/APITypes.h             |  6 ++-
 offload/liboffload/src/OffloadImpl.cpp        |  1 +
 offload/libomptarget/KernelLanguage/API.cpp   |  1 +
 offload/libomptarget/omptarget.cpp            |  1 +
 .../common/include/PluginInterface.h          |  5 +-
 .../common/src/PluginInterface.cpp            | 49 ++++++++++---------
 9 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 60bfe3e9d43f7..2fc6128b78476 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -10984,7 +10984,8 @@ static void emitTargetCallKernelLaunch(
 
     llvm::OpenMPIRBuilder::TargetKernelArgs Args(
         NumTargetItems, RTArgs, NumIterations, NumTeams, NumThreads,
-        DynCGroupMem, HasNoWait, DynCGroupMemFallback);
+        DynCGroupMem, HasNoWait, /*StrictBlocksAndThreads=*/IsBare,
+        DynCGroupMemFallback);
 
     llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
         cantFail(OMPRuntime->getOMPBuilder().emitKernelLaunch(
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h 
b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2b790458f3c32..961b9958319a4 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2771,6 +2771,9 @@ class OpenMPIRBuilder {
     Value *DynCGroupMem = nullptr;
     /// True if the kernel has 'no wait' clause.
     bool HasNoWait = false;
+    /// True if the kernel strictly requires the number of blocks and threads
+    /// above to run.
+    bool StrictBlocksAndThreads = false;
     /// The fallback mechanism for the shared memory.
     omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
         omp::OMPDynGroupprivateFallbackType::Abort;
@@ -2780,12 +2783,13 @@ class OpenMPIRBuilder {
     TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                      Value *NumIterations, ArrayRef<Value *> NumTeams,
                      ArrayRef<Value *> NumThreads, Value *DynCGroupMem,
-                     bool HasNoWait,
+                     bool HasNoWait, bool StrictBlocksAndThreads,
                      omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback)
         : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
           NumIterations(NumIterations), NumTeams(NumTeams),
           NumThreads(NumThreads), DynCGroupMem(DynCGroupMem),
-          HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {}
+          HasNoWait(HasNoWait), StrictBlocksAndThreads(StrictBlocksAndThreads),
+          DynCGroupMemFallback(DynCGroupMemFallback) {}
   };
 
   /// Create the kernel args vector used by emitTargetKernel. This function
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp 
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ecff0c9b0aac4..1f2f3546f596e 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -647,7 +647,12 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs 
&KernelArgs,
   Value *DynCGroupMemFallbackFlag =
       Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
   DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
+
+  Value *StrictFlag = Builder.getInt64(KernelArgs.StrictBlocksAndThreads);
+  StrictFlag = Builder.CreateShl(StrictFlag, 5);
+
   Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
+  Flags = Builder.CreateOr(Flags, StrictFlag);
 
   assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
 
@@ -9783,7 +9788,7 @@ static void emitTargetCall(
 
     KArgs = OpenMPIRBuilder::TargetKernelArgs(
         NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, 
DynCGroupMem,
-        HasNoWait, DynCGroupMemFallback);
+        HasNoWait, /*StrictBlocksAndThreads=*/false, DynCGroupMemFallback);
 
     // Assume no error was returned because TaskBodyCB and
     // EmitTargetCallFallbackCB don't produce any.
diff --git a/offload/include/Shared/APITypes.h 
b/offload/include/Shared/APITypes.h
index 212fb285030fb..99c5dcd3b5154 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -105,8 +105,10 @@ struct KernelArgsTy {
     uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
     uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
     uint64_t Cooperative : 1; // Was this kernel spawned as cooperative.
-    uint64_t Unused : 59;
-  } Flags = {0, 0, 0, 0, 0};
+    uint64_t StrictBlocksAndThreads
+        : 1; // The user-requested number of blocks and threads are strict.
+    uint64_t Unused : 58;
+  } Flags = {0, 0, 0, 0, 0, 0};
   // User-requested number of blocks (for x,y,z dimension).
   uint32_t UserNumBlocks[3] = {0, 0, 0};
   // User-requested number of threads (for x,y,z dimension).
diff --git a/offload/liboffload/src/OffloadImpl.cpp 
b/offload/liboffload/src/OffloadImpl.cpp
index 66fcbbc264ab4..de13fd7c67ee2 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -1122,6 +1122,7 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, 
ol_device_handle_t Device,
   LaunchArgs.UserThreadLimit[1] = LaunchSizeArgs->GroupSize.y;
   LaunchArgs.UserThreadLimit[2] = LaunchSizeArgs->GroupSize.z;
   LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
+  LaunchArgs.Flags.StrictBlocksAndThreads = true;
 
   while (Properties && Properties->type != OL_KERNEL_LAUNCH_PROP_TYPE_NONE) {
     switch (Properties->type) {
diff --git a/offload/libomptarget/KernelLanguage/API.cpp 
b/offload/libomptarget/KernelLanguage/API.cpp
index 112b27b707e5a..50f9b695bed6a 100644
--- a/offload/libomptarget/KernelLanguage/API.cpp
+++ b/offload/libomptarget/KernelLanguage/API.cpp
@@ -68,6 +68,7 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, 
dim3 blockDim,
   Args.UserThreadLimit[2] = blockDim.z;
   Args.ArgPtrs = reinterpret_cast<void **>(args);
   Args.Flags.IsCUDA = true;
+  Args.Flags.StrictBlocksAndThreads = true;
   return __tgt_target_kernel(nullptr, 0, gridDim.x, blockDim.x, func, &Args);
 }
 }
diff --git a/offload/libomptarget/omptarget.cpp 
b/offload/libomptarget/omptarget.cpp
index c2456920ebc1b..17b215732d51b 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2481,6 +2481,7 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void 
*HostPtr,
   KernelArgs.UserThreadLimit[1] = 1;
   KernelArgs.UserThreadLimit[2] = 1;
   KernelArgs.DynCGroupMem = SharedMemorySize;
+  KernelArgs.Flags.StrictBlocksAndThreads = true;
 
   KernelExtraArgsTy KernelExtraArgs{};
   KernelExtraArgs.ReplayOutcome = ReplayOutcome;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h 
b/offload/plugins-nextgen/common/include/PluginInterface.h
index 54aac2f34b590..f99a0e817fd58 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -529,7 +529,7 @@ struct GenericKernelTy {
   /// Get the effective number of threads for the kernel based on the
   /// user-defined number of threads.
   uint32_t getEffectiveNumThreads(GenericDeviceTy &GenericDevice,
-                                  uint32_t UserThreadLimit[3]) const;
+                                  uint32_t UserThreadLimit) const;
 
   /// Get the effective number of blocks for the kernel based on the
   /// user-defined number of blocks and the loop trip count.
@@ -537,8 +537,7 @@ struct GenericKernelTy {
   /// \p IsNumThreadsFromUser is true is \p NumThreads is defined by user via
   /// thread_limit clause.
   uint32_t getEffectiveNumBlocks(GenericDeviceTy &GenericDevice,
-                                 uint32_t UserNumBlocks[3],
-                                 uint64_t LoopTripCount,
+                                 uint32_t UserNumBlocks, uint64_t 
LoopTripCount,
                                  uint32_t &EffectiveNumThreads,
                                  bool IsNumThreadsFromUser) const;
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp 
b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 4379ebd250794..d3d80cad0d86a 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -249,15 +249,27 @@ Error GenericKernelTy::launch(GenericDeviceTy 
&GenericDevice, void **ArgPtrs,
   uint32_t EffectiveNumBlocks[3] = {KernelArgs.UserNumBlocks[0],
                                     KernelArgs.UserNumBlocks[1],
                                     KernelArgs.UserNumBlocks[2]};
-  if (!isBareMode()) {
-    assert(
-        EffectiveNumThreads[1] == 1 && EffectiveNumThreads[2] == 1 &&
-        EffectiveNumBlocks[1] == 1 && EffectiveNumBlocks[2] == 1 &&
-        "Non-bare mode should only use the first thread and block dimensions");
+
+  // Multidimensional is only supported with bare mode for now.
+  assert(isBareMode() ||
+         EffectiveNumThreads[1] == 1 && EffectiveNumThreads[2] == 1 &&
+             EffectiveNumBlocks[1] == 1 && EffectiveNumBlocks[2] == 1 &&
+             "Non-bare mode should only use the first thread and block "
+             "dimensions");
+
+  assert(!KernelArgs.Flags.StrictBlocksAndThreads ||
+         EffectiveNumThreads[0] > 0 && EffectiveNumThreads[1] > 0 &&
+             EffectiveNumThreads[2] > 0 && EffectiveNumBlocks[0] > 0 &&
+             EffectiveNumBlocks[1] > 0 && EffectiveNumBlocks[2] > 0 &&
+             "Strict requires number of blocks and threads greater than zero");
+
+  // Calculate or adjust the effective number of threads and blocks if needed.
+  if (!KernelArgs.Flags.StrictBlocksAndThreads) {
     EffectiveNumThreads[0] =
-        getEffectiveNumThreads(GenericDevice, EffectiveNumThreads);
+        getEffectiveNumThreads(GenericDevice, EffectiveNumThreads[0]);
+
     EffectiveNumBlocks[0] = getEffectiveNumBlocks(
-        GenericDevice, EffectiveNumBlocks, KernelArgs.Tripcount,
+        GenericDevice, EffectiveNumBlocks[0], KernelArgs.Tripcount,
         EffectiveNumThreads[0], KernelArgs.UserThreadLimit[0] > 0);
   }
 
@@ -362,34 +374,27 @@ GenericKernelTy::prepareArgs(GenericDeviceTy 
&GenericDevice, void **ArgPtrs,
 
 uint32_t
 GenericKernelTy::getEffectiveNumThreads(GenericDeviceTy &GenericDevice,
-                                        uint32_t UserThreadLimit[3]) const {
+                                        uint32_t UserThreadLimit) const {
   assert(!isBareMode() && "bare kernel should not call this function");
 
-  assert(UserThreadLimit[1] == 1 && UserThreadLimit[2] == 1 &&
-         "Multi dimensional launch not supported yet.");
+  if (UserThreadLimit > 0 && isGenericMode())
+    UserThreadLimit += GenericDevice.getWarpSize();
 
-  if (UserThreadLimit[0] > 0 && isGenericMode())
-    UserThreadLimit[0] += GenericDevice.getWarpSize();
-
-  return std::min(MaxNumThreads, (UserThreadLimit[0] > 0)
-                                     ? UserThreadLimit[0]
-                                     : PreferredNumThreads);
+  return std::min(MaxNumThreads, (UserThreadLimit > 0) ? UserThreadLimit
+                                                       : PreferredNumThreads);
 }
 
 uint32_t GenericKernelTy::getEffectiveNumBlocks(
-    GenericDeviceTy &GenericDevice, uint32_t UserNumBlocks[3],
+    GenericDeviceTy &GenericDevice, uint32_t UserNumBlocks,
     uint64_t LoopTripCount, uint32_t &EffectiveNumThreads,
     bool IsNumThreadsFromUser) const {
   assert(!isBareMode() && "bare kernel should not call this function");
 
-  assert(UserNumBlocks[1] == 1 && UserNumBlocks[2] == 1 &&
-         "Multi dimensional launch not supported yet.");
-
-  if (UserNumBlocks[0] > 0) {
+  if (UserNumBlocks > 0) {
     // TODO: We need to honor any value and consequently allow more than the
     // block limit. For this we might need to start multiple kernels or let the
     // blocks start again until the requested number has been started.
-    return std::min(UserNumBlocks[0], GenericDevice.getBlockLimit());
+    return std::min(UserNumBlocks, GenericDevice.getBlockLimit());
   }
 
   // Return the number of blocks required to cover the loop iterations.

>From 7837240c42bb3878eac8e8d73ce517d0ca227c56 Mon Sep 17 00:00:00 2001
From: Kevin Sala <[email protected]>
Date: Mon, 25 May 2026 12:53:51 -0700
Subject: [PATCH 2/3] Update clang test

---
 clang/test/OpenMP/target_teams_codegen.cpp | 84 +++++++++++-----------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/clang/test/OpenMP/target_teams_codegen.cpp 
b/clang/test/OpenMP/target_teams_codegen.cpp
index f2c8318d5895f..8e39b8072b991 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -634,7 +634,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP127:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 8
 // CHECK1-NEXT:    store i64 0, ptr [[TMP127]], align 8
 // CHECK1-NEXT:    [[TMP128:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 0, ptr [[TMP128]], align 8
+// CHECK1-NEXT:    store i64 32, ptr [[TMP128]], align 8
 // CHECK1-NEXT:    [[TMP129:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
 // CHECK1-NEXT:    store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP129]], 
align 4
 // CHECK1-NEXT:    [[TMP130:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
@@ -693,7 +693,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP157:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 8
 // CHECK1-NEXT:    store i64 0, ptr [[TMP157]], align 8
 // CHECK1-NEXT:    [[TMP158:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 0, ptr [[TMP158]], align 8
+// CHECK1-NEXT:    store i64 32, ptr [[TMP158]], align 8
 // CHECK1-NEXT:    [[TMP159:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 10
 // CHECK1-NEXT:    store [3 x i32] [i32 1, i32 2, i32 0], ptr [[TMP159]], 
align 4
 // CHECK1-NEXT:    [[TMP160:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 11
@@ -752,7 +752,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP187:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 8
 // CHECK1-NEXT:    store i64 0, ptr [[TMP187]], align 8
 // CHECK1-NEXT:    [[TMP188:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 0, ptr [[TMP188]], align 8
+// CHECK1-NEXT:    store i64 32, ptr [[TMP188]], align 8
 // CHECK1-NEXT:    [[TMP189:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 10
 // CHECK1-NEXT:    store [3 x i32] [i32 1, i32 2, i32 3], ptr [[TMP189]], 
align 4
 // CHECK1-NEXT:    [[TMP190:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 11
@@ -1094,7 +1094,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, 
!noalias [[META39]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 
8, !noalias [[META39]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], 
align 8, !noalias [[META39]]
-// CHECK1-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]], 
!inline_history [[META40:![0-9]+]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], 
align 8, !noalias [[META39]]
 // CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], 
align 8, !noalias [[META39]]
 // CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], 
align 8, !noalias [[META39]]
@@ -1413,14 +1413,14 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[CN]], ptr [[CN_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40:![0-9]+]], !align [[META41:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41:![0-9]+]], !align [[META42:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42:![0-9]+]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43:![0-9]+]]
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR4]], align 8
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP8]], ptr [[A_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = load i64, ptr [[A_CASTED]], align 8
@@ -1453,14 +1453,14 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[VLA3]], ptr [[VLA_ADDR4]], align 8
 // CHECK1-NEXT:    store ptr [[CN]], ptr [[CN_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR4]], align 8
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP8]], 1
 // CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -1581,7 +1581,7 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], 
align 8
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], 
align 8
 // CHECK1-NEXT:    store ptr [[NN]], ptr [[NN_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[NN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[NN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    ret void
 //
 //
@@ -2054,7 +2054,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META40]], !align [[META43:![0-9]+]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META41]], !align [[META44:![0-9]+]]
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP4]], ptr [[B_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[B_CASTED]], align 8
@@ -2082,7 +2082,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META40]], !align [[META43]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META41]], !align [[META44]]
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP4]] to double
 // CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
@@ -2116,7 +2116,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[AAA]], ptr [[AAA_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP1]], ptr [[A_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[A_CASTED]], align 8
@@ -2145,7 +2145,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
 // CHECK1-NEXT:    store i64 [[AAA]], ptr [[AAA_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
 // CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -2179,7 +2179,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP1]], ptr [[A_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[A_CASTED]], align 8
@@ -2203,7 +2203,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
 // CHECK1-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
 // CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -2531,7 +2531,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP125:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 8
 // CHECK3-NEXT:    store i64 0, ptr [[TMP125]], align 8
 // CHECK3-NEXT:    [[TMP126:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 0, ptr [[TMP126]], align 8
+// CHECK3-NEXT:    store i64 32, ptr [[TMP126]], align 8
 // CHECK3-NEXT:    [[TMP127:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
 // CHECK3-NEXT:    store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP127]], 
align 4
 // CHECK3-NEXT:    [[TMP128:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
@@ -2590,7 +2590,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP155:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 8
 // CHECK3-NEXT:    store i64 0, ptr [[TMP155]], align 8
 // CHECK3-NEXT:    [[TMP156:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 0, ptr [[TMP156]], align 8
+// CHECK3-NEXT:    store i64 32, ptr [[TMP156]], align 8
 // CHECK3-NEXT:    [[TMP157:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 10
 // CHECK3-NEXT:    store [3 x i32] [i32 1, i32 2, i32 0], ptr [[TMP157]], 
align 4
 // CHECK3-NEXT:    [[TMP158:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 11
@@ -2649,7 +2649,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP185:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 8
 // CHECK3-NEXT:    store i64 0, ptr [[TMP185]], align 8
 // CHECK3-NEXT:    [[TMP186:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 0, ptr [[TMP186]], align 8
+// CHECK3-NEXT:    store i64 32, ptr [[TMP186]], align 8
 // CHECK3-NEXT:    [[TMP187:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 10
 // CHECK3-NEXT:    store [3 x i32] [i32 1, i32 2, i32 3], ptr [[TMP187]], 
align 4
 // CHECK3-NEXT:    [[TMP188:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 11
@@ -2993,7 +2993,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, 
!noalias [[META40]]
 // CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 
4, !noalias [[META40]]
 // CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], 
align 4, !noalias [[META40]]
-// CHECK3-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]], 
!inline_history [[META41:![0-9]+]]
 // CHECK3-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], 
align 4, !noalias [[META40]]
 // CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], 
align 4, !noalias [[META40]]
 // CHECK3-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], 
align 4, !noalias [[META40]]
@@ -3312,14 +3312,14 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[CN]], ptr [[CN_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41:![0-9]+]], !align [[META42:![0-9]+]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42:![0-9]+]], !align [[META43:![0-9]+]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[VLA_ADDR4]], align 4
-// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
-// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP8]], ptr [[A_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_CASTED]], align 4
@@ -3352,14 +3352,14 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[VLA3]], ptr [[VLA_ADDR4]], align 4
 // CHECK3-NEXT:    store ptr [[CN]], ptr [[CN_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[VLA_ADDR4]], align 4
-// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
-// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP8]], 1
 // CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -3480,7 +3480,7 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], 
align 4
 // CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], 
align 4
 // CHECK3-NEXT:    store ptr [[NN]], ptr [[NN_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[NN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[NN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    ret void
 //
 //
@@ -3952,7 +3952,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META41]], !align [[META43:![0-9]+]]
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META42]], !align [[META44:![0-9]+]]
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP4]], ptr [[B_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[B_CASTED]], align 4
@@ -3980,7 +3980,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META41]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META42]], !align [[META44]]
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK3-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP4]] to double
 // CHECK3-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
@@ -4014,7 +4014,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[AAA]], ptr [[AAA_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP1]], ptr [[A_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_CASTED]], align 4
@@ -4043,7 +4043,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[AAA]], ptr [[AAA_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
 // CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -4077,7 +4077,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP1]], ptr [[A_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_CASTED]], align 4
@@ -4101,7 +4101,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
 // CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4

>From 3dde875540a0c4b94396983f817ace548e91b7e4 Mon Sep 17 00:00:00 2001
From: Kevin Sala <[email protected]>
Date: Thu, 28 May 2026 17:53:40 -0700
Subject: [PATCH 3/3] Add ompx_bare and liboffload tests

---
 offload/test/offloading/ompx_bare_gridsize.c  | 213 ++++++++++++++++++
 .../OffloadAPI/device_code/CMakeLists.txt     |   2 +
 .../OffloadAPI/kernel/olLaunchKernel.cpp      |  43 ++++
 3 files changed, 258 insertions(+)
 create mode 100644 offload/test/offloading/ompx_bare_gridsize.c

diff --git a/offload/test/offloading/ompx_bare_gridsize.c 
b/offload/test/offloading/ompx_bare_gridsize.c
new file mode 100644
index 0000000000000..a3551f49117ed
--- /dev/null
+++ b/offload/test/offloading/ompx_bare_gridsize.c
@@ -0,0 +1,213 @@
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
+// RUN:   %fcheck-generic
+//
+// REQUIRES: gpu
+
+#include <ompx.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int nb_x, nb_y, nb_z;
+  int nt_x, nt_y, nt_z;
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,1,1] blocks and [32,1,1]
+  // threads in BARE mode
+  nb_x = nb_y = nb_z = nt_x = nt_y = nt_z = 0;
+#pragma omp target teams ompx_bare num_teams(64) thread_limit(32)              
\
+    map(from : nb_x, nb_y, nb_z, nt_x, nt_y, nt_z)
+  {
+    if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+        ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+        ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+      nb_x = ompx_grid_dim_x();
+      nb_y = ompx_grid_dim_y();
+      nb_z = ompx_grid_dim_z();
+      nt_x = ompx_block_dim_x();
+      nt_y = ompx_block_dim_y();
+      nt_z = ompx_block_dim_z();
+    }
+  }
+  // CHECK: nblocks: 64 1 1, nthreads: 32 1 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nb_x, nb_y, nb_z,
+          nt_x, nt_y, nt_z);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,1,1] blocks and [32,4,1]
+  // threads in BARE mode
+  nb_x = nb_y = nb_z = nt_x = nt_y = nt_z = 0;
+#pragma omp target teams ompx_bare num_teams(64) thread_limit(32, 4)           
\
+    map(from : nb_x, nb_y, nb_z, nt_x, nt_y, nt_z)
+  {
+    if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+        ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+        ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+      nb_x = ompx_grid_dim_x();
+      nb_y = ompx_grid_dim_y();
+      nb_z = ompx_grid_dim_z();
+      nt_x = ompx_block_dim_x();
+      nt_y = ompx_block_dim_y();
+      nt_z = ompx_block_dim_z();
+    }
+  }
+  // CHECK: nblocks: 64 1 1, nthreads: 32 4 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nb_x, nb_y, nb_z,
+          nt_x, nt_y, nt_z);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,1,1] blocks and [32,4,2]
+  // threads in BARE mode
+  nb_x = nb_y = nb_z = nt_x = nt_y = nt_z = 0;
+#pragma omp target teams ompx_bare num_teams(64) thread_limit(32, 4, 2)        
\
+    map(from : nb_x, nb_y, nb_z, nt_x, nt_y, nt_z)
+  {
+    if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+        ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+        ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+      nb_x = ompx_grid_dim_x();
+      nb_y = ompx_grid_dim_y();
+      nb_z = ompx_grid_dim_z();
+      nt_x = ompx_block_dim_x();
+      nt_y = ompx_block_dim_y();
+      nt_z = ompx_block_dim_z();
+    }
+  }
+  // CHECK: nblocks: 64 1 1, nthreads: 32 4 2
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nb_x, nb_y, nb_z,
+          nt_x, nt_y, nt_z);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,1] blocks and [32,1,1]
+  // threads in BARE mode
+  nb_x = nb_y = nb_z = nt_x = nt_y = nt_z = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 1) thread_limit(32)       
\
+    map(from : nb_x, nb_y, nb_z, nt_x, nt_y, nt_z)
+  {
+    if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+        ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+        ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+      nb_x = ompx_grid_dim_x();
+      nb_y = ompx_grid_dim_y();
+      nb_z = ompx_grid_dim_z();
+      nt_x = ompx_block_dim_x();
+      nt_y = ompx_block_dim_y();
+      nt_z = ompx_block_dim_z();
+    }
+  }
+  // CHECK: nblocks: 64 16 1, nthreads: 32 1 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nb_x, nb_y, nb_z,
+          nt_x, nt_y, nt_z);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,1] blocks and [32,4,1]
+  // threads in BARE mode
+  nb_x = nb_y = nb_z = nt_x = nt_y = nt_z = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 1) thread_limit(32, 4)    
\
+    map(from : nb_x, nb_y, nb_z, nt_x, nt_y, nt_z)
+  {
+    if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+        ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+        ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+      nb_x = ompx_grid_dim_x();
+      nb_y = ompx_grid_dim_y();
+      nb_z = ompx_grid_dim_z();
+      nt_x = ompx_block_dim_x();
+      nt_y = ompx_block_dim_y();
+      nt_z = ompx_block_dim_z();
+    }
+  }
+  // CHECK: nblocks: 64 16 1, nthreads: 32 4 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nb_x, nb_y, nb_z,
+          nt_x, nt_y, nt_z);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,1] blocks and [32,4,2]
+  // threads in BARE mode
+  nb_x = nb_y = nb_z = nt_x = nt_y = nt_z = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 1) thread_limit(32, 4, 2) 
\
+    map(from : nb_x, nb_y, nb_z, nt_x, nt_y, nt_z)
+  {
+    if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+        ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+        ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+      nb_x = ompx_grid_dim_x();
+      nb_y = ompx_grid_dim_y();
+      nb_z = ompx_grid_dim_z();
+      nt_x = ompx_block_dim_x();
+      nt_y = ompx_block_dim_y();
+      nt_z = ompx_block_dim_z();
+    }
+  }
+  // CHECK: nblocks: 64 16 1, nthreads: 32 4 2
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nb_x, nb_y, nb_z,
+          nt_x, nt_y, nt_z);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,8] blocks and [32,1,1]
+  // threads in BARE mode
+  nb_x = nb_y = nb_z = nt_x = nt_y = nt_z = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 8) thread_limit(32)       
\
+    map(from : nb_x, nb_y, nb_z, nt_x, nt_y, nt_z)
+  {
+    if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+        ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+        ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+      nb_x = ompx_grid_dim_x();
+      nb_y = ompx_grid_dim_y();
+      nb_z = ompx_grid_dim_z();
+      nt_x = ompx_block_dim_x();
+      nt_y = ompx_block_dim_y();
+      nt_z = ompx_block_dim_z();
+    }
+  }
+  // CHECK: nblocks: 64 16 8, nthreads: 32 1 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nb_x, nb_y, nb_z,
+          nt_x, nt_y, nt_z);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,8] blocks and [32,4,1]
+  // threads in BARE mode
+  nb_x = nb_y = nb_z = nt_x = nt_y = nt_z = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 8) thread_limit(32, 4)    
\
+    map(from : nb_x, nb_y, nb_z, nt_x, nt_y, nt_z)
+  {
+    if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+        ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+        ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+      nb_x = ompx_grid_dim_x();
+      nb_y = ompx_grid_dim_y();
+      nb_z = ompx_grid_dim_z();
+      nt_x = ompx_block_dim_x();
+      nt_y = ompx_block_dim_y();
+      nt_z = ompx_block_dim_z();
+    }
+  }
+  // CHECK: nblocks: 64 16 8, nthreads: 32 4 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nb_x, nb_y, nb_z,
+          nt_x, nt_y, nt_z);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,8] blocks and [32,4,2]
+  // threads in BARE mode
+  nb_x = nb_y = nb_z = nt_x = nt_y = nt_z = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 8) thread_limit(32, 4, 2) 
\
+    map(from : nb_x, nb_y, nb_z, nt_x, nt_y, nt_z)
+  {
+    if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+        ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+        ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+      nb_x = ompx_grid_dim_x();
+      nb_y = ompx_grid_dim_y();
+      nb_z = ompx_grid_dim_z();
+      nt_x = ompx_block_dim_x();
+      nt_y = ompx_block_dim_y();
+      nt_z = ompx_block_dim_z();
+    }
+  }
+  // CHECK: nblocks: 64 16 8, nthreads: 32 4 2
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nb_x, nb_y, nb_z,
+          nt_x, nt_y, nt_z);
+
+  return 0;
+}
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt 
b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 57f529e1a4ae6..3870436d53afc 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -12,6 +12,7 @@ add_offload_test_device_code(global.cpp global)
 add_offload_test_device_code(global_ctor.cpp global_ctor)
 add_offload_test_device_code(global_dtor.cpp global_dtor)
 add_offload_test_device_code(sequence.cpp sequence)
+add_offload_test_device_code(gridsize.cpp gridsize)
 
 add_custom_target(offload_device_binaries DEPENDS
     foo.bin
@@ -27,5 +28,6 @@ add_custom_target(offload_device_binaries DEPENDS
     global_ctor.bin
     global_dtor.bin
     sequence.bin
+    gridsize.bin
 )
 set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp 
b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index e2dfa1a9c6b64..d41dae6bb6608 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -26,6 +26,7 @@ KERNEL_TEST(LocalMemStatic, localmem_static)
 KERNEL_TEST(SingleCounterSyncEvent, single_counter)
 KERNEL_TEST(GlobalCtor, global_ctor)
 KERNEL_TEST(GlobalDtor, global_dtor)
+KERNEL_TEST(GridSize, gridsize)
 
 struct LaunchMultipleKernelTestBase : LaunchKernelTestBase {
   void SetUpKernels(const char *program, std::vector<const char *> kernels) {
@@ -387,3 +388,45 @@ TEST_P(olLaunchKernelGlobalDtorTest, Success) {
       olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, nullptr));
   ASSERT_SUCCESS(olSyncQueue(Queue));
 }
+
+TEST_P(olLaunchKernelGridSizeTest, Success) {
+  void *Mem;
+  ASSERT_SUCCESS(
+      olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 6 * sizeof(uint32_t), &Mem));
+
+  uint32_t *NumBlocks = static_cast<uint32_t *>(Mem);
+  uint32_t *NumThreads = static_cast<uint32_t *>(Mem) + 3;
+
+  struct {
+    uint32_t *NumBlocks;
+    uint32_t *NumThreads;
+  } Args{NumBlocks, NumThreads};
+
+  const uint32_t BaseBlocks[3] = {64, 16, 8};
+  const uint32_t BaseThreads[3] = {32, 4, 2};
+
+  for (uint32_t Dim = 1; Dim <= 3; ++Dim) {
+    NumBlocks[0] = NumBlocks[1] = NumBlocks[2] = 0;
+    NumThreads[0] = NumThreads[1] = NumThreads[2] = 0;
+
+    LaunchArgs.Dimensions = 3;
+    LaunchArgs.NumGroups = {BaseBlocks[0], Dim >= 2 ? BaseBlocks[1] : 1,
+                            Dim >= 3 ? BaseBlocks[2] : 1};
+    LaunchArgs.GroupSize = {BaseThreads[0], Dim >= 2 ? BaseThreads[1] : 1,
+                            Dim >= 3 ? BaseThreads[2] : 1};
+
+    ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                  &LaunchArgs, nullptr));
+
+    ASSERT_SUCCESS(olSyncQueue(Queue));
+
+    ASSERT_EQ(NumBlocks[0], LaunchArgs.NumGroups.x);
+    ASSERT_EQ(NumBlocks[1], LaunchArgs.NumGroups.y);
+    ASSERT_EQ(NumBlocks[2], LaunchArgs.NumGroups.z);
+    ASSERT_EQ(NumThreads[0], LaunchArgs.GroupSize.x);
+    ASSERT_EQ(NumThreads[1], LaunchArgs.GroupSize.y);
+    ASSERT_EQ(NumThreads[2], LaunchArgs.GroupSize.z);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to