https://github.com/kevinsala updated 
https://github.com/llvm/llvm-project/pull/199483

>From bfad6f7c306e132c79eaaa616030e4b10724bf61 Mon Sep 17 00:00:00 2001
From: Kevin Sala <[email protected]>
Date: Tue, 12 May 2026 21:09:21 -0700
Subject: [PATCH 1/3] [offload][OpenMP] Add strict flags for blocks and threads
 in kernel arguments

Until now, strict behavior in the number of threads and blocks has been
applied only when the kernel is in bare mode. When this mode is enabled,
the values passed in UserNumBlocks and UserThreadLimit are not adjusted
and are the values used to launch the kernel. This commit detaches the
strictness from the kernel mode.

This is going to be used by the kernel replay tool. Additionally, it
paves the path for the upcoming OpenMP dims modifier, used to configure
multidimensional teams and leagues, which will include strictness choices
for teams and threads.

The bare kernels must indicate strict behavior. Asserts are added to
check this condition.
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |  3 +-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  8 ++-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  7 ++-
 offload/include/Shared/APITypes.h             |  6 ++-
 offload/liboffload/src/OffloadImpl.cpp        |  1 +
 offload/libomptarget/KernelLanguage/API.cpp   |  1 +
 offload/libomptarget/omptarget.cpp            |  1 +
 .../common/include/PluginInterface.h          |  5 +-
 .../common/src/PluginInterface.cpp            | 49 ++++++++++---------
 9 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index ec059f9dfef82..87e9824e677ba 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -10982,7 +10982,8 @@ static void emitTargetCallKernelLaunch(
 
     llvm::OpenMPIRBuilder::TargetKernelArgs Args(
         NumTargetItems, RTArgs, NumIterations, NumTeams, NumThreads,
-        DynCGroupMem, HasNoWait, DynCGroupMemFallback);
+        DynCGroupMem, HasNoWait, /*StrictBlocksAndThreads=*/IsBare,
+        DynCGroupMemFallback);
 
     llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
         cantFail(OMPRuntime->getOMPBuilder().emitKernelLaunch(
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h 
b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2b790458f3c32..961b9958319a4 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2771,6 +2771,9 @@ class OpenMPIRBuilder {
     Value *DynCGroupMem = nullptr;
     /// True if the kernel has 'no wait' clause.
     bool HasNoWait = false;
+    /// True if the kernel strictly requires the number of blocks and threads
+    /// above to run.
+    bool StrictBlocksAndThreads = false;
     /// The fallback mechanism for the shared memory.
     omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
         omp::OMPDynGroupprivateFallbackType::Abort;
@@ -2780,12 +2783,13 @@ class OpenMPIRBuilder {
     TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                      Value *NumIterations, ArrayRef<Value *> NumTeams,
                      ArrayRef<Value *> NumThreads, Value *DynCGroupMem,
-                     bool HasNoWait,
+                     bool HasNoWait, bool StrictBlocksAndThreads,
                      omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback)
         : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
           NumIterations(NumIterations), NumTeams(NumTeams),
           NumThreads(NumThreads), DynCGroupMem(DynCGroupMem),
-          HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {}
+          HasNoWait(HasNoWait), StrictBlocksAndThreads(StrictBlocksAndThreads),
+          DynCGroupMemFallback(DynCGroupMemFallback) {}
   };
 
   /// Create the kernel args vector used by emitTargetKernel. This function
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp 
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 6fb25f898f393..f35c7b7c6e467 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -647,7 +647,12 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs 
&KernelArgs,
   Value *DynCGroupMemFallbackFlag =
       Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
   DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
+
+  Value *StrictFlag = Builder.getInt64(KernelArgs.StrictBlocksAndThreads);
+  StrictFlag = Builder.CreateShl(StrictFlag, 6);
+
   Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
+  Flags = Builder.CreateOr(Flags, StrictFlag);
 
   assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
 
@@ -9783,7 +9788,7 @@ static void emitTargetCall(
 
     KArgs = OpenMPIRBuilder::TargetKernelArgs(
         NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, 
DynCGroupMem,
-        HasNoWait, DynCGroupMemFallback);
+        HasNoWait, /*StrictBlocksAndThreads=*/false, DynCGroupMemFallback);
 
     // Assume no error was returned because TaskBodyCB and
     // EmitTargetCallFallbackCB don't produce any.
diff --git a/offload/include/Shared/APITypes.h 
b/offload/include/Shared/APITypes.h
index d1326b4337551..948c12a27107e 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -106,8 +106,10 @@ struct KernelArgsTy {
     uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
     uint64_t Cooperative : 1; // Was this kernel spawned as cooperative.
     uint64_t IsPtrArgs : 1;   // Arguments are laid out as an array of 
pointers.
-    uint64_t Unused : 58;
-  } Flags = {0, 0, 0, 0, 0, 0};
+    uint64_t StrictBlocksAndThreads
+        : 1; // The user-requested number of blocks and threads are strict.
+    uint64_t Unused : 57;
+  } Flags = {0, 0, 0, 0, 0, 0, 0};
   // User-requested number of blocks (for x,y,z dimension).
   uint32_t UserNumBlocks[3] = {0, 0, 0};
   // User-requested number of threads (for x,y,z dimension).
diff --git a/offload/liboffload/src/OffloadImpl.cpp 
b/offload/liboffload/src/OffloadImpl.cpp
index fd2bb703be9a4..63294886e520e 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -1124,6 +1124,7 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, 
ol_device_handle_t Device,
   LaunchArgs.UserThreadLimit[1] = LaunchSizeArgs->GroupSize.y;
   LaunchArgs.UserThreadLimit[2] = LaunchSizeArgs->GroupSize.z;
   LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
+  LaunchArgs.Flags.StrictBlocksAndThreads = true;
 
   while (Properties && Properties->type != OL_KERNEL_LAUNCH_PROP_TYPE_NONE) {
     switch (Properties->type) {
diff --git a/offload/libomptarget/KernelLanguage/API.cpp 
b/offload/libomptarget/KernelLanguage/API.cpp
index 112b27b707e5a..50f9b695bed6a 100644
--- a/offload/libomptarget/KernelLanguage/API.cpp
+++ b/offload/libomptarget/KernelLanguage/API.cpp
@@ -68,6 +68,7 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, 
dim3 blockDim,
   Args.UserThreadLimit[2] = blockDim.z;
   Args.ArgPtrs = reinterpret_cast<void **>(args);
   Args.Flags.IsCUDA = true;
+  Args.Flags.StrictBlocksAndThreads = true;
   return __tgt_target_kernel(nullptr, 0, gridDim.x, blockDim.x, func, &Args);
 }
 }
diff --git a/offload/libomptarget/omptarget.cpp 
b/offload/libomptarget/omptarget.cpp
index c2456920ebc1b..17b215732d51b 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2481,6 +2481,7 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void 
*HostPtr,
   KernelArgs.UserThreadLimit[1] = 1;
   KernelArgs.UserThreadLimit[2] = 1;
   KernelArgs.DynCGroupMem = SharedMemorySize;
+  KernelArgs.Flags.StrictBlocksAndThreads = true;
 
   KernelExtraArgsTy KernelExtraArgs{};
   KernelExtraArgs.ReplayOutcome = ReplayOutcome;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h 
b/offload/plugins-nextgen/common/include/PluginInterface.h
index 54aac2f34b590..f99a0e817fd58 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -529,7 +529,7 @@ struct GenericKernelTy {
   /// Get the effective number of threads for the kernel based on the
   /// user-defined number of threads.
   uint32_t getEffectiveNumThreads(GenericDeviceTy &GenericDevice,
-                                  uint32_t UserThreadLimit[3]) const;
+                                  uint32_t UserThreadLimit) const;
 
   /// Get the effective number of blocks for the kernel based on the
   /// user-defined number of blocks and the loop trip count.
@@ -537,8 +537,7 @@ struct GenericKernelTy {
   /// \p IsNumThreadsFromUser is true is \p NumThreads is defined by user via
   /// thread_limit clause.
   uint32_t getEffectiveNumBlocks(GenericDeviceTy &GenericDevice,
-                                 uint32_t UserNumBlocks[3],
-                                 uint64_t LoopTripCount,
+                                 uint32_t UserNumBlocks, uint64_t 
LoopTripCount,
                                  uint32_t &EffectiveNumThreads,
                                  bool IsNumThreadsFromUser) const;
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp 
b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f623a240b137e..1e05c6ae66fdf 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -249,15 +249,27 @@ Error GenericKernelTy::launch(GenericDeviceTy 
&GenericDevice, void **ArgPtrs,
   uint32_t EffectiveNumBlocks[3] = {KernelArgs.UserNumBlocks[0],
                                     KernelArgs.UserNumBlocks[1],
                                     KernelArgs.UserNumBlocks[2]};
-  if (!isBareMode()) {
-    assert(
-        EffectiveNumThreads[1] == 1 && EffectiveNumThreads[2] == 1 &&
-        EffectiveNumBlocks[1] == 1 && EffectiveNumBlocks[2] == 1 &&
-        "Non-bare mode should only use the first thread and block dimensions");
+
+  // Multidimensional is only supported with bare mode for now.
+  assert(isBareMode() ||
+         EffectiveNumThreads[1] == 1 && EffectiveNumThreads[2] == 1 &&
+             EffectiveNumBlocks[1] == 1 && EffectiveNumBlocks[2] == 1 &&
+             "Non-bare mode should only use the first thread and block "
+             "dimensions");
+
+  assert(!KernelArgs.Flags.StrictBlocksAndThreads ||
+         EffectiveNumThreads[0] > 0 && EffectiveNumThreads[1] > 0 &&
+             EffectiveNumThreads[2] > 0 && EffectiveNumBlocks[0] > 0 &&
+             EffectiveNumBlocks[1] > 0 && EffectiveNumBlocks[2] > 0 &&
+             "Strict requires number of blocks and threads greater than zero");
+
+  // Calculate or adjust the effective number of threads and blocks if needed.
+  if (!KernelArgs.Flags.StrictBlocksAndThreads) {
     EffectiveNumThreads[0] =
-        getEffectiveNumThreads(GenericDevice, EffectiveNumThreads);
+        getEffectiveNumThreads(GenericDevice, EffectiveNumThreads[0]);
+
     EffectiveNumBlocks[0] = getEffectiveNumBlocks(
-        GenericDevice, EffectiveNumBlocks, KernelArgs.Tripcount,
+        GenericDevice, EffectiveNumBlocks[0], KernelArgs.Tripcount,
         EffectiveNumThreads[0], KernelArgs.UserThreadLimit[0] > 0);
   }
 
@@ -364,34 +376,27 @@ GenericKernelTy::prepareArgs(GenericDeviceTy 
&GenericDevice, void **ArgPtrs,
 
 uint32_t
 GenericKernelTy::getEffectiveNumThreads(GenericDeviceTy &GenericDevice,
-                                        uint32_t UserThreadLimit[3]) const {
+                                        uint32_t UserThreadLimit) const {
   assert(!isBareMode() && "bare kernel should not call this function");
 
-  assert(UserThreadLimit[1] == 1 && UserThreadLimit[2] == 1 &&
-         "Multi dimensional launch not supported yet.");
+  if (UserThreadLimit > 0 && isGenericMode())
+    UserThreadLimit += GenericDevice.getWarpSize();
 
-  if (UserThreadLimit[0] > 0 && isGenericMode())
-    UserThreadLimit[0] += GenericDevice.getWarpSize();
-
-  return std::min(MaxNumThreads, (UserThreadLimit[0] > 0)
-                                     ? UserThreadLimit[0]
-                                     : PreferredNumThreads);
+  return std::min(MaxNumThreads, (UserThreadLimit > 0) ? UserThreadLimit
+                                                       : PreferredNumThreads);
 }
 
 uint32_t GenericKernelTy::getEffectiveNumBlocks(
-    GenericDeviceTy &GenericDevice, uint32_t UserNumBlocks[3],
+    GenericDeviceTy &GenericDevice, uint32_t UserNumBlocks,
     uint64_t LoopTripCount, uint32_t &EffectiveNumThreads,
     bool IsNumThreadsFromUser) const {
   assert(!isBareMode() && "bare kernel should not call this function");
 
-  assert(UserNumBlocks[1] == 1 && UserNumBlocks[2] == 1 &&
-         "Multi dimensional launch not supported yet.");
-
-  if (UserNumBlocks[0] > 0) {
+  if (UserNumBlocks > 0) {
     // TODO: We need to honor any value and consequently allow more than the
     // block limit. For this we might need to start multiple kernels or let the
     // blocks start again until the requested number has been started.
-    return std::min(UserNumBlocks[0], GenericDevice.getBlockLimit());
+    return std::min(UserNumBlocks, GenericDevice.getBlockLimit());
   }
 
   // Return the number of blocks required to cover the loop iterations.

>From 22feb0aab64e1e80f771b9ceede038d872ca7e2a Mon Sep 17 00:00:00 2001
From: Kevin Sala <[email protected]>
Date: Mon, 25 May 2026 12:53:51 -0700
Subject: [PATCH 2/3] Update clang test

---
 clang/test/OpenMP/target_teams_codegen.cpp | 84 +++++++++++-----------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/clang/test/OpenMP/target_teams_codegen.cpp 
b/clang/test/OpenMP/target_teams_codegen.cpp
index f2c8318d5895f..eb59d83d71391 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -634,7 +634,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP127:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 8
 // CHECK1-NEXT:    store i64 0, ptr [[TMP127]], align 8
 // CHECK1-NEXT:    [[TMP128:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 0, ptr [[TMP128]], align 8
+// CHECK1-NEXT:    store i64 64, ptr [[TMP128]], align 8
 // CHECK1-NEXT:    [[TMP129:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
 // CHECK1-NEXT:    store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP129]], 
align 4
 // CHECK1-NEXT:    [[TMP130:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
@@ -693,7 +693,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP157:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 8
 // CHECK1-NEXT:    store i64 0, ptr [[TMP157]], align 8
 // CHECK1-NEXT:    [[TMP158:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 0, ptr [[TMP158]], align 8
+// CHECK1-NEXT:    store i64 64, ptr [[TMP158]], align 8
 // CHECK1-NEXT:    [[TMP159:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 10
 // CHECK1-NEXT:    store [3 x i32] [i32 1, i32 2, i32 0], ptr [[TMP159]], 
align 4
 // CHECK1-NEXT:    [[TMP160:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 11
@@ -752,7 +752,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP187:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 8
 // CHECK1-NEXT:    store i64 0, ptr [[TMP187]], align 8
 // CHECK1-NEXT:    [[TMP188:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 0, ptr [[TMP188]], align 8
+// CHECK1-NEXT:    store i64 64, ptr [[TMP188]], align 8
 // CHECK1-NEXT:    [[TMP189:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 10
 // CHECK1-NEXT:    store [3 x i32] [i32 1, i32 2, i32 3], ptr [[TMP189]], 
align 4
 // CHECK1-NEXT:    [[TMP190:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 11
@@ -1094,7 +1094,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, 
!noalias [[META39]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 
8, !noalias [[META39]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], 
align 8, !noalias [[META39]]
-// CHECK1-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]], 
!inline_history [[META40:![0-9]+]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], 
align 8, !noalias [[META39]]
 // CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], 
align 8, !noalias [[META39]]
 // CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], 
align 8, !noalias [[META39]]
@@ -1413,14 +1413,14 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[CN]], ptr [[CN_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40:![0-9]+]], !align [[META41:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41:![0-9]+]], !align [[META42:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42:![0-9]+]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43:![0-9]+]]
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR4]], align 8
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP8]], ptr [[A_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = load i64, ptr [[A_CASTED]], align 8
@@ -1453,14 +1453,14 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[VLA3]], ptr [[VLA_ADDR4]], align 8
 // CHECK1-NEXT:    store ptr [[CN]], ptr [[CN_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR4]], align 8
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull 
[[META40]], !align [[META42]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull 
[[META41]], !align [[META43]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP8]], 1
 // CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -1581,7 +1581,7 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], 
align 8
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], 
align 8
 // CHECK1-NEXT:    store ptr [[NN]], ptr [[NN_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[NN_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[NN_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    ret void
 //
 //
@@ -2054,7 +2054,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META40]], !align [[META43:![0-9]+]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META41]], !align [[META44:![0-9]+]]
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP4]], ptr [[B_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[B_CASTED]], align 8
@@ -2082,7 +2082,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META40]], !align [[META43]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !nonnull 
[[META41]], !align [[META44]]
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP4]] to double
 // CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
@@ -2116,7 +2116,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[AAA]], ptr [[AAA_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP1]], ptr [[A_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[A_CASTED]], align 8
@@ -2145,7 +2145,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
 // CHECK1-NEXT:    store i64 [[AAA]], ptr [[AAA_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
 // CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -2179,7 +2179,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP1]], ptr [[A_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[A_CASTED]], align 8
@@ -2203,7 +2203,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
 // CHECK1-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META40]], !align [[META41]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nonnull 
[[META41]], !align [[META42]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
 // CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -2531,7 +2531,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP125:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 8
 // CHECK3-NEXT:    store i64 0, ptr [[TMP125]], align 8
 // CHECK3-NEXT:    [[TMP126:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 0, ptr [[TMP126]], align 8
+// CHECK3-NEXT:    store i64 64, ptr [[TMP126]], align 8
 // CHECK3-NEXT:    [[TMP127:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
 // CHECK3-NEXT:    store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP127]], 
align 4
 // CHECK3-NEXT:    [[TMP128:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
@@ -2590,7 +2590,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP155:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 8
 // CHECK3-NEXT:    store i64 0, ptr [[TMP155]], align 8
 // CHECK3-NEXT:    [[TMP156:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 0, ptr [[TMP156]], align 8
+// CHECK3-NEXT:    store i64 64, ptr [[TMP156]], align 8
 // CHECK3-NEXT:    [[TMP157:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 10
 // CHECK3-NEXT:    store [3 x i32] [i32 1, i32 2, i32 0], ptr [[TMP157]], 
align 4
 // CHECK3-NEXT:    [[TMP158:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 11
@@ -2649,7 +2649,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP185:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 8
 // CHECK3-NEXT:    store i64 0, ptr [[TMP185]], align 8
 // CHECK3-NEXT:    [[TMP186:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 0, ptr [[TMP186]], align 8
+// CHECK3-NEXT:    store i64 64, ptr [[TMP186]], align 8
 // CHECK3-NEXT:    [[TMP187:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 10
 // CHECK3-NEXT:    store [3 x i32] [i32 1, i32 2, i32 3], ptr [[TMP187]], 
align 4
 // CHECK3-NEXT:    [[TMP188:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 11
@@ -2993,7 +2993,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, 
!noalias [[META40]]
 // CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 
4, !noalias [[META40]]
 // CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], 
align 4, !noalias [[META40]]
-// CHECK3-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]], 
!inline_history [[META41:![0-9]+]]
 // CHECK3-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], 
align 4, !noalias [[META40]]
 // CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], 
align 4, !noalias [[META40]]
 // CHECK3-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], 
align 4, !noalias [[META40]]
@@ -3312,14 +3312,14 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[CN]], ptr [[CN_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41:![0-9]+]], !align [[META42:![0-9]+]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42:![0-9]+]], !align [[META43:![0-9]+]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[VLA_ADDR4]], align 4
-// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
-// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP8]], ptr [[A_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_CASTED]], align 4
@@ -3352,14 +3352,14 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[VLA3]], ptr [[VLA_ADDR4]], align 4
 // CHECK3-NEXT:    store ptr [[CN]], ptr [[CN_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[VLA_ADDR4]], align 4
-// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
-// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[CN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP8]], 1
 // CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -3480,7 +3480,7 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], 
align 4
 // CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], 
align 4
 // CHECK3-NEXT:    store ptr [[NN]], ptr [[NN_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[NN_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[NN_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    ret void
 //
 //
@@ -3952,7 +3952,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META41]], !align [[META43:![0-9]+]]
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META42]], !align [[META44:![0-9]+]]
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP4]], ptr [[B_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[B_CASTED]], align 4
@@ -3980,7 +3980,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META41]], !align [[META43]]
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !nonnull 
[[META42]], !align [[META44]]
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK3-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP4]] to double
 // CHECK3-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
@@ -4014,7 +4014,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[AAA]], ptr [[AAA_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP1]], ptr [[A_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_CASTED]], align 4
@@ -4043,7 +4043,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[AAA]], ptr [[AAA_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
 // CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
@@ -4077,7 +4077,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP1]], ptr [[A_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_CASTED]], align 4
@@ -4101,7 +4101,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META41]], !align [[META42]]
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nonnull 
[[META42]], !align [[META43]]
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
 // CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4

>From 0dc780c92ae588e96de90ed338a47904b53a354a Mon Sep 17 00:00:00 2001
From: Kevin Sala <[email protected]>
Date: Thu, 28 May 2026 17:53:40 -0700
Subject: [PATCH 3/3] Add ompx_bare and liboffload tests

---
 offload/test/offloading/ompx_bare_gridsize.c  | 153 ++++++++++++++++++
 .../OffloadAPI/device_code/CMakeLists.txt     |   2 +
 .../OffloadAPI/device_code/gridsize.cpp       |  13 ++
 .../OffloadAPI/kernel/olLaunchKernel.cpp      |  41 +++++
 4 files changed, 209 insertions(+)
 create mode 100644 offload/test/offloading/ompx_bare_gridsize.c
 create mode 100644 offload/unittests/OffloadAPI/device_code/gridsize.cpp

diff --git a/offload/test/offloading/ompx_bare_gridsize.c 
b/offload/test/offloading/ompx_bare_gridsize.c
new file mode 100644
index 0000000000000..5f295d785bf52
--- /dev/null
+++ b/offload/test/offloading/ompx_bare_gridsize.c
@@ -0,0 +1,153 @@
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
+// RUN:   %fcheck-generic
+//
+// REQUIRES: gpu
+
+#include <ompx.h>
+#include <stdio.h>
+
+void get_gridsizes(int *nblocks, int *nthreads) {
+  if (ompx_block_id_x() == 0 && ompx_thread_id_x() == 0 &&
+      ompx_block_id_y() == 0 && ompx_thread_id_y() == 0 &&
+      ompx_block_id_z() == 0 && ompx_thread_id_z() == 0) {
+    nblocks[0] = ompx_grid_dim_x();
+    nblocks[1] = ompx_grid_dim_y();
+    nblocks[2] = ompx_grid_dim_z();
+    nthreads[0] = ompx_block_dim_x();
+    nthreads[1] = ompx_block_dim_y();
+    nthreads[2] = ompx_block_dim_z();
+  }
+}
+
+int main(int argc, char *argv[]) {
+  int nblocks[3], nthreads[3];
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,1,1] blocks and [32,1,1]
+  // CHECK-SAME: threads in BARE mode
+  nblocks[0] = nblocks[1] = nblocks[2] = nthreads[0] = nthreads[1] =
+      nthreads[2] = 0;
+#pragma omp target teams ompx_bare num_teams(64) thread_limit(32)              
\
+    map(tofrom : nblocks, nthreads)
+  {
+    get_gridsizes(nblocks, nthreads);
+  }
+  // CHECK: nblocks: 64 1 1, nthreads: 32 1 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nblocks[0],
+          nblocks[1], nblocks[2], nthreads[0], nthreads[1], nthreads[2]);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,1,1] blocks and [32,4,1]
+  // CHECK-SAME: threads in BARE mode
+  nblocks[0] = nblocks[1] = nblocks[2] = nthreads[0] = nthreads[1] =
+      nthreads[2] = 0;
+#pragma omp target teams ompx_bare num_teams(64) thread_limit(32, 4)           
\
+    map(tofrom : nblocks, nthreads)
+  {
+    get_gridsizes(nblocks, nthreads);
+  }
+  // CHECK: nblocks: 64 1 1, nthreads: 32 4 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nblocks[0],
+          nblocks[1], nblocks[2], nthreads[0], nthreads[1], nthreads[2]);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,1,1] blocks and [32,4,2]
+  // CHECK-SAME: threads in BARE mode
+  nblocks[0] = nblocks[1] = nblocks[2] = nthreads[0] = nthreads[1] =
+      nthreads[2] = 0;
+#pragma omp target teams ompx_bare num_teams(64) thread_limit(32, 4, 2)        
\
+    map(tofrom : nblocks, nthreads)
+  {
+    get_gridsizes(nblocks, nthreads);
+  }
+  // CHECK: nblocks: 64 1 1, nthreads: 32 4 2
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nblocks[0],
+          nblocks[1], nblocks[2], nthreads[0], nthreads[1], nthreads[2]);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,1] blocks and [32,1,1]
+  // CHECK-SAME: threads in BARE mode
+  nblocks[0] = nblocks[1] = nblocks[2] = nthreads[0] = nthreads[1] =
+      nthreads[2] = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 1) thread_limit(32)       
\
+    map(tofrom : nblocks, nthreads)
+  {
+    get_gridsizes(nblocks, nthreads);
+  }
+  // CHECK: nblocks: 64 16 1, nthreads: 32 1 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nblocks[0],
+          nblocks[1], nblocks[2], nthreads[0], nthreads[1], nthreads[2]);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,1] blocks and [32,4,1]
+  // CHECK-SAME: threads in BARE mode
+  nblocks[0] = nblocks[1] = nblocks[2] = nthreads[0] = nthreads[1] =
+      nthreads[2] = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 1) thread_limit(32, 4)    
\
+    map(tofrom : nblocks, nthreads)
+  {
+    get_gridsizes(nblocks, nthreads);
+  }
+  // CHECK: nblocks: 64 16 1, nthreads: 32 4 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nblocks[0],
+          nblocks[1], nblocks[2], nthreads[0], nthreads[1], nthreads[2]);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,1] blocks and [32,4,2]
+  // CHECK-SAME: threads in BARE mode
+  nblocks[0] = nblocks[1] = nblocks[2] = nthreads[0] = nthreads[1] =
+      nthreads[2] = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 1) thread_limit(32, 4, 2) 
\
+    map(tofrom : nblocks, nthreads)
+  {
+    get_gridsizes(nblocks, nthreads);
+  }
+  // CHECK: nblocks: 64 16 1, nthreads: 32 4 2
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nblocks[0],
+          nblocks[1], nblocks[2], nthreads[0], nthreads[1], nthreads[2]);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,8] blocks and [32,1,1]
+  // CHECK-SAME: threads in BARE mode
+  nblocks[0] = nblocks[1] = nblocks[2] = nthreads[0] = nthreads[1] =
+      nthreads[2] = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 8) thread_limit(32)       
\
+    map(tofrom : nblocks, nthreads)
+  {
+    get_gridsizes(nblocks, nthreads);
+  }
+  // CHECK: nblocks: 64 16 8, nthreads: 32 1 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nblocks[0],
+          nblocks[1], nblocks[2], nthreads[0], nthreads[1], nthreads[2]);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,8] blocks and [32,4,1]
+  // CHECK-SAME: threads in BARE mode
+  nblocks[0] = nblocks[1] = nblocks[2] = nthreads[0] = nthreads[1] =
+      nthreads[2] = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 8) thread_limit(32, 4)    
\
+    map(tofrom : nblocks, nthreads)
+  {
+    get_gridsizes(nblocks, nthreads);
+  }
+  // CHECK: nblocks: 64 16 8, nthreads: 32 4 1
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nblocks[0],
+          nblocks[1], nblocks[2], nthreads[0], nthreads[1], nthreads[2]);
+
+  // CHECK: PluginInterface device 0 info: Launching kernel
+  // CHECK-SAME: __omp_offloading_{{.*}} with [64,16,8] blocks and [32,4,2]
+  // CHECK-SAME: threads in BARE mode
+  nblocks[0] = nblocks[1] = nblocks[2] = nthreads[0] = nthreads[1] =
+      nthreads[2] = 0;
+#pragma omp target teams ompx_bare num_teams(64, 16, 8) thread_limit(32, 4, 2) 
\
+    map(tofrom : nblocks, nthreads)
+  {
+    get_gridsizes(nblocks, nthreads);
+  }
+  // CHECK: nblocks: 64 16 8, nthreads: 32 4 2
+  fprintf(stderr, "nblocks: %d %d %d, nthreads: %d %d %d\n", nblocks[0],
+          nblocks[1], nblocks[2], nthreads[0], nthreads[1], nthreads[2]);
+
+  return 0;
+}
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt 
b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 5dec8c4cd5a9d..791a55ffc6126 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -13,6 +13,7 @@ add_offload_test_device_code(global.cpp global)
 add_offload_test_device_code(global_ctor.cpp global_ctor)
 add_offload_test_device_code(global_dtor.cpp global_dtor)
 add_offload_test_device_code(sequence.cpp sequence)
+add_offload_test_device_code(gridsize.cpp gridsize)
 
 add_custom_target(offload_device_binaries DEPENDS
     foo.bin
@@ -29,5 +30,6 @@ add_custom_target(offload_device_binaries DEPENDS
     global_ctor.bin
     global_dtor.bin
     sequence.bin
+    gridsize.bin
 )
 set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/device_code/gridsize.cpp 
b/offload/unittests/OffloadAPI/device_code/gridsize.cpp
new file mode 100644
index 0000000000000..9fe1db78919a4
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/gridsize.cpp
@@ -0,0 +1,13 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+extern "C" __gpu_kernel void gridsize(uint32_t *nblocks, uint32_t *nthreads) {
+  if (__gpu_block_id(0) == 0 && __gpu_block_id(1) == 0 &&
+      __gpu_block_id(2) == 0 && __gpu_thread_id(0) == 0 &&
+      __gpu_thread_id(1) == 0 && __gpu_thread_id(2) == 0) {
+    for (int i = 0; i < 3; ++i) {
+      nblocks[i] = __gpu_num_blocks(i);
+      nthreads[i] = __gpu_num_threads(i);
+    }
+  }
+}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp 
b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 92fe178b67f5d..946fcc91a76fd 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -27,6 +27,7 @@ KERNEL_TEST(LocalMemStatic, localmem_static)
 KERNEL_TEST(SingleCounterSyncEvent, single_counter)
 KERNEL_TEST(GlobalCtor, global_ctor)
 KERNEL_TEST(GlobalDtor, global_dtor)
+KERNEL_TEST(GridSize, gridsize)
 
 struct LaunchMultipleKernelTestBase : LaunchKernelTestBase {
   void SetUpKernels(const char *program, std::vector<const char *> kernels) {
@@ -458,3 +459,43 @@ TEST_P(olLaunchKernelGlobalDtorTest, Success) {
                                 nullptr, nullptr));
   ASSERT_SUCCESS(olSyncQueue(Queue));
 }
+
+TEST_P(olLaunchKernelGridSizeTest, Success) {
+  void *Mem;
+  ASSERT_SUCCESS(
+      olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 6 * sizeof(uint32_t), &Mem));
+
+  uint32_t *NumBlocks = static_cast<uint32_t *>(Mem);
+  uint32_t *NumThreads = static_cast<uint32_t *>(Mem) + 3;
+
+  void *ArgPtrs[] = {&NumBlocks, &NumThreads};
+  size_t ArgSizes[] = {sizeof(NumBlocks), sizeof(NumThreads)};
+
+  const uint32_t BaseBlocks[3] = {64, 16, 8};
+  const uint32_t BaseThreads[3] = {32, 4, 2};
+
+  for (uint32_t Dim = 1; Dim <= 3; ++Dim) {
+    NumBlocks[0] = NumBlocks[1] = NumBlocks[2] = 0;
+    NumThreads[0] = NumThreads[1] = NumThreads[2] = 0;
+
+    LaunchArgs.Dimensions = 3;
+    LaunchArgs.NumGroups = {BaseBlocks[0], Dim >= 2 ? BaseBlocks[1] : 1,
+                            Dim >= 3 ? BaseBlocks[2] : 1};
+    LaunchArgs.GroupSize = {BaseThreads[0], Dim >= 2 ? BaseThreads[1] : 1,
+                            Dim >= 3 ? BaseThreads[2] : 1};
+
+    ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &LaunchArgs, nullptr,
+                                  std::size(ArgPtrs), ArgPtrs, ArgSizes));
+
+    ASSERT_SUCCESS(olSyncQueue(Queue));
+
+    ASSERT_EQ(NumBlocks[0], LaunchArgs.NumGroups.x);
+    ASSERT_EQ(NumBlocks[1], LaunchArgs.NumGroups.y);
+    ASSERT_EQ(NumBlocks[2], LaunchArgs.NumGroups.z);
+    ASSERT_EQ(NumThreads[0], LaunchArgs.GroupSize.x);
+    ASSERT_EQ(NumThreads[1], LaunchArgs.GroupSize.y);
+    ASSERT_EQ(NumThreads[2], LaunchArgs.GroupSize.z);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to