https://github.com/kevinsala created https://github.com/llvm/llvm-project/pull/198719
This PR adds the option `-openmp-ir-builder-use-default-max-thread=<boolean-value>` to enable or disable the use of a default max threads in OpenMPIRBuilder when no max threads constant is provided. The option is enabled by default to maintain the same behavior as it is currently. This flag is useful to avoid limiting the number of threads that an OpenMP target region can run with when no `thread_limit` or `num_threads` (in a nested parallel region) are specified. This flag may be used when recording a kernel to allow replaying it later with a higher number of threads (e.g., reaching the maximum thread limit supported by the device). >From 76e503025baf29bd1f4e876e245f32e4efbd7b99 Mon Sep 17 00:00:00 2001 From: Kevin Sala <[email protected]> Date: Wed, 20 May 2026 00:40:11 -0700 Subject: [PATCH] [llvm][OpenMP] Add option to disable default max threads adjustment This commit adds the option -openmp-ir-builder-use-default-max-thread=<boolean-value> to enable or disable the use of a default max threads in OpenMPIRBuilder when no max threads are provided. The option is enabled by default, maintaining the same behavior as it is currently. --- clang/test/OpenMP/thread_limit_gpu.c | 29 +++++++++++++++++++---- clang/test/OpenMP/thread_limit_nvptx.c | 26 +++++++++++++------- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 10 +++++--- 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/clang/test/OpenMP/thread_limit_gpu.c b/clang/test/OpenMP/thread_limit_gpu.c index 829b0a1b02d22..c976e9e72b1ba 100644 --- a/clang/test/OpenMP/thread_limit_gpu.c +++ b/clang/test/OpenMP/thread_limit_gpu.c @@ -1,8 +1,10 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-AMDGPU %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -mllvm -openmp-ir-builder-use-default-max-threads=false -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-AMDGPU-FLAG %s // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-x86-spirv-host.bc // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-SPIRV %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -mllvm -openmp-ir-builder-use-default-max-threads=false -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-SPIRV-FLAG %s // expected-no-diagnostics #ifndef HEADER @@ -28,11 +30,11 @@ void foo(int N) { #endif -// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l12({{.*}}) #[[ATTR1:.+]] { -// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l15({{.*}}) #[[ATTR2:.+]] { -// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l18({{.*}}) #[[ATTR3:.+]] { -// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l21({{.*}}) #[[ATTR4:.+]] { -// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l24({{.*}}) #[[ATTR5:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l14({{.*}}) #[[ATTR1:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l17({{.*}}) #[[ATTR2:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l20({{.*}}) #[[ATTR3:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l23({{.*}}) #[[ATTR4:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l26({{.*}}) #[[ATTR5:.+]] { // CHECK-AMDGPU: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } // CHECK-AMDGPU: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } @@ -45,3 +47,20 @@ void foo(int N) { // CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="84" "omp_target_thread_limit"="42" {{.*}} } // CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="84" "omp_target_thread_limit"="22" {{.*}} } // CHECK-SPIRV: attributes #[[ATTR5]] = { {{.*}} "omp_target_num_teams"="84" "omp_target_thread_limit"="20" {{.*}} } + +// CHECK-AMDGPU-FLAG: attributes #[[ATTR1]] = { +// CHECK-AMDGPU-FLAG-NOT: amdgpu-flat-work-group-size +// CHECK-AMDGPU-FLAG-NOT: omp_target_thread_limit +// CHECK-AMDGPU-FLAG-SAME: } +// CHECK-AMDGPU-FLAG: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } +// CHECK-AMDGPU-FLAG: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" {{.*}} } +// CHECK-AMDGPU-FLAG: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" {{.*}} } +// CHECK-AMDGPU-FLAG: attributes #[[ATTR5]] = { {{.*}} "amdgpu-flat-work-group-size"="1,20" "amdgpu-max-num-workgroups"="86,1,1" {{.*}} } + +// CHECK-SPIRV-FLAG: attributes #[[ATTR1]] = { +// CHECK-SPIRV-FLAG-NOT: omp_target_thread_limit +// CHECK-SPIRV-FLAG-SAME: } +// CHECK-SPIRV-FLAG: attributes #[[ATTR2]] = { {{.*}} "omp_target_thread_limit"="4" {{.*}} } +// CHECK-SPIRV-FLAG: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="84" "omp_target_thread_limit"="42" {{.*}} } +// CHECK-SPIRV-FLAG: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="84" "omp_target_thread_limit"="22" {{.*}} } +// CHECK-SPIRV-FLAG: attributes #[[ATTR5]] = { {{.*}} "omp_target_num_teams"="84" "omp_target_thread_limit"="20" {{.*}} } diff --git a/clang/test/OpenMP/thread_limit_nvptx.c b/clang/test/OpenMP/thread_limit_nvptx.c index ffa6c453067d1..c4a83ca052c95 100644 --- a/clang/test/OpenMP/thread_limit_nvptx.c +++ b/clang/test/OpenMP/thread_limit_nvptx.c @@ -1,27 +1,28 @@ // Test target codegen - host bc file has to be created first. // // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefixes=CHECK,CHECK-NVPTX +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -mllvm -openmp-ir-builder-use-default-max-threads=false -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -check-prefix=CHECK,CHECK-NVPTX-FLAG %s // expected-no-diagnostics #ifndef HEADER #define HEADER void foo(int N) { -// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]] +// CHECK: define {{.*}}l12{{.*}} #[[ATTR0:[0-9]+]] #pragma omp target teams distribute parallel for simd for (int i = 0; i < N; ++i) ; -// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]] +// CHECK: define {{.*}}l16{{.*}} #[[ATTR1:[0-9]+]] #pragma omp target teams distribute parallel for simd thread_limit(4) for (int i = 0; i < N; ++i) ; -// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]] +// CHECK: define {{.*}}l21{{.*}} #[[ATTR2:[0-9]+]] #pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) for (int i = 0; i < N; ++i) ; -// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]] +// CHECK: define {{.*}}l26{{.*}} #[[ATTR3:[0-9]+]] #pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) for (int i = 0; i < N; ++i) ; @@ -29,7 +30,14 @@ void foo(int N) { #endif -// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}} -// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}} -// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}} -// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}} +// CHECK-NVPTX: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}} +// CHECK-NVPTX: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}} +// CHECK-NVPTX: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}} +// CHECK-NVPTX: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}} + +// CHECK-NVPTX-FLAG: attributes #[[ATTR0]] = { +// CHECK-NVPTX-FLAG-NOT: nvvm.maxntid +// CHECK-NVPTX-FLAG-SAME: } +// CHECK-NVPTX-FLAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}} +// CHECK-NVPTX-FLAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}} +// CHECK-NVPTX-FLAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}} diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 06026582538a2..57dc682838317 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -83,6 +83,10 @@ static cl::opt<double> UnrollThresholdFactor( "simplifications still taking place"), cl::init(1.5)); +static cl::opt<bool> UseDefaultMaxThreads( + "openmp-ir-builder-use-default-max-threads", cl::Hidden, + cl::desc("Use a default max threads if none is provided."), cl::init(true)); + #ifndef NDEBUG /// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions /// at position IP1 may change the meaning of IP2 or vice-versa. This is because @@ -8155,10 +8159,10 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit( if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0) writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front()); - // If MaxThreads not set, select the maximum between the default workgroup - // size and the MinThreads value. + // If MaxThreads is not set and needs adjustment, select the maximum between + // the default workgroup size and the MinThreads value. int32_t MaxThreadsVal = Attrs.MaxThreads.front(); - if (MaxThreadsVal < 0) { + if (MaxThreadsVal < 0 && UseDefaultMaxThreads) { if (hasGridValue(T)) { MaxThreadsVal = std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
