https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/150927
>From 602a55d32630c0d5d56db957be003929fdde82b7 Mon Sep 17 00:00:00 2001 From: Sergio Afonso <[email protected]> Date: Fri, 25 Jul 2025 13:52:11 +0100 Subject: [PATCH 1/4] [OpenMPOpt] Make parallel regions reachable from new DeviceRTL loop functions This patch updates the OpenMP optimization pass to know about the new DeviceRTL functions for loop constructs. This change marks these functions as potentially containing parallel regions, which fixes a current bug with the state machine rewrite optimization. It previously failed to identify parallel regions located inside of the callbacks passed to these new DeviceRTL functions, causing the resulting code to skip executing these parallel regions. As a result, Generic kernels produced by Flang that contain parallel regions now work properly. One known related issue not fixed by this patch is that the presence of calls to these functions will prevent the SPMD-ization of Generic kernels by OpenMPOpt. Previously, this was due to assuming there was no parallel region. This is changed by this patch, but instead we now mark it temporarily as unsupported in an SPMD context. The reason is that, without additional changes, code intended for the main thread of the team located outside of the parallel region would not be guarded properly, resulting in race conditions and generally invalid behavior. --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 22 +++ .../fortran/target-generic-loops.f90 | 130 ++++++++++++++++++ .../offloading/fortran/target-spmd-loops.f90 | 39 ++++++ 3 files changed, 191 insertions(+) create mode 100644 offload/test/offloading/fortran/target-generic-loops.f90 create mode 100644 offload/test/offloading/fortran/target-spmd-loops.f90 diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 5266103d1ac24..05e719368f3f4 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5021,6 +5021,28 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_free_shared: // Return without setting a fixpoint, to be resolved in updateImpl. return; + case OMPRTL___kmpc_distribute_static_loop_4: + case OMPRTL___kmpc_distribute_static_loop_4u: + case OMPRTL___kmpc_distribute_static_loop_8: + case OMPRTL___kmpc_distribute_static_loop_8u: + case OMPRTL___kmpc_distribute_for_static_loop_4: + case OMPRTL___kmpc_distribute_for_static_loop_4u: + case OMPRTL___kmpc_distribute_for_static_loop_8: + case OMPRTL___kmpc_distribute_for_static_loop_8u: + case OMPRTL___kmpc_for_static_loop_4: + case OMPRTL___kmpc_for_static_loop_4u: + case OMPRTL___kmpc_for_static_loop_8: + case OMPRTL___kmpc_for_static_loop_8u: + // Parallel regions might be reached by these calls, as they take a + // callback argument potentially arbitrary user-provided code. + ReachedUnknownParallelRegions.insert(&CB); + // TODO: The presence of these calls on their own does not prevent a + // kernel from being SPMD-izable. We mark it as such because we need + // further changes in order to also consider the contents of the + // callbacks passed to them. + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + SPMDCompatibilityTracker.insert(&CB); + break; default: // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, // generally. However, they do not hide parallel regions. diff --git a/offload/test/offloading/fortran/target-generic-loops.f90 b/offload/test/offloading/fortran/target-generic-loops.f90 new file mode 100644 index 0000000000000..07bcbfd2c8752 --- /dev/null +++ b/offload/test/offloading/fortran/target-generic-loops.f90 @@ -0,0 +1,130 @@ +! Offloading test for generic target regions containing different kinds of +! loop constructs inside. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + integer :: i1, i2, n1, n2, counter + + n1 = 100 + n2 = 50 + + counter = 0 + !$omp target map(tofrom:counter) + !$omp teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 2 200 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target map(tofrom:counter) + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp end target + + ! CHECK: 3 203 + print '(I2" "I0)', 3, counter + + counter = 0 + !$omp target map(tofrom: counter) + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp end target + + ! CHECK: 4 102 + print '(I2" "I0)', 4, counter + + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + end do + + ! CHECK: 5 5000 + print '(I2" "I0)', 5, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + end do + + ! CHECK: 6 5200 + print '(I2" "I0)', 6, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + end do + + ! CHECK: 7 10000 + print '(I2" "I0)', 7, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + end do + + ! CHECK: 8 10300 + print '(I2" "I0)', 8, counter +end program diff --git a/offload/test/offloading/fortran/target-spmd-loops.f90 b/offload/test/offloading/fortran/target-spmd-loops.f90 new file mode 100644 index 0000000000000..7407f0c0768cb --- /dev/null +++ b/offload/test/offloading/fortran/target-spmd-loops.f90 @@ -0,0 +1,39 @@ +! Offloading test for generic target regions containing different kinds of +! loop constructs inside. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + integer :: i1, n1, counter + + n1 = 100 + + counter = 0 + !$omp target parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + + ! CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 2 100 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target teams distribute parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + + ! CHECK: 3 100 + print '(I2" "I0)', 3, counter +end program >From e4880244d2eb70ecf59cbc7509b8998cf31ce259 Mon Sep 17 00:00:00 2001 From: Sergio Afonso <[email protected]> Date: Thu, 14 Aug 2025 14:01:48 +0100 Subject: [PATCH 2/4] Address review comments --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 05e719368f3f4..7da65ff4dbd51 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5034,7 +5034,8 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_for_static_loop_8: case OMPRTL___kmpc_for_static_loop_8u: // Parallel regions might be reached by these calls, as they take a - // callback argument potentially arbitrary user-provided code. + // callback argument potentially containing arbitrary user-provided + // code. ReachedUnknownParallelRegions.insert(&CB); // TODO: The presence of these calls on their own does not prevent a // kernel from being SPMD-izable. We mark it as such because we need >From bd599dafe07020bfd79ca1c8065aa90d5f409e35 Mon Sep 17 00:00:00 2001 From: Sergio Afonso <[email protected]> Date: Tue, 10 Feb 2026 13:17:03 +0000 Subject: [PATCH 3/4] [CodeGen] Workaround for compiler crash This patch implements a workaround for a "Getting frame offset for a dead object?" assertion triggered when compiling the target-generic-loops.f90 offloading test. A more concise reproducer for this issue: ```f90 ! flang -O0 -fopenmp -fopenmp-version=52 --offload-arch=gfx1100 test.f90 program main integer :: i, counter !$omp target teams distribute do i=1, 10 end do counter = 0 !$omp target map(tofrom: counter) !$omp parallel do reduction(+:counter) do i=1, 10 counter = counter + 1 end do !$omp end target end program ``` This is an AI-generated fix and needs a proper review by an expert in this area of the compiler, as I am not familiar enough to evaluate whether it is a valid approach or if there is a problem with the LLVM IR produced by Flang itself. However, it does address the problem and causes these tests to be successfully compiled and run as expected. --- llvm/lib/CodeGen/PrologEpilogInserter.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 68fd54cf00146..c57fcfdcfe1c0 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -1403,7 +1403,15 @@ bool PEIImpl::replaceFrameIndexDebugInstr(MachineFunction &MF, MachineInstr &MI, "Frame indices can only appear as a debug operand in a DBG_VALUE*" " machine instruction"); Register Reg; - unsigned FrameIdx = Op.getIndex(); + int FrameIdx = Op.getIndex(); + + // If the frame object has been removed (e.g., dead object elimination), + // the debug value is undefined. Replace with $noreg. + if (MF.getFrameInfo().isDeadObjectIndex(FrameIdx)) { + Op.ChangeToRegister(0, false /*isDef*/); + return true; + } + unsigned Size = MF.getFrameInfo().getObjectSize(FrameIdx); StackOffset Offset = TFI->getFrameIndexReference(MF, FrameIdx, Reg); >From 1fcb8725334f83985d1b4ffd6e28b6d6a92d49d4 Mon Sep 17 00:00:00 2001 From: Sergio Afonso <[email protected]> Date: Wed, 25 Feb 2026 14:30:20 +0000 Subject: [PATCH 4/4] Revert "[CodeGen] Workaround for compiler crash" This reverts commit 23582905a37659bebf6a15c413b209ad0bbbd6c4. --- llvm/lib/CodeGen/PrologEpilogInserter.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index c57fcfdcfe1c0..68fd54cf00146 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -1403,15 +1403,7 @@ bool PEIImpl::replaceFrameIndexDebugInstr(MachineFunction &MF, MachineInstr &MI, "Frame indices can only appear as a debug operand in a DBG_VALUE*" " machine instruction"); Register Reg; - int FrameIdx = Op.getIndex(); - - // If the frame object has been removed (e.g., dead object elimination), - // the debug value is undefined. Replace with $noreg. - if (MF.getFrameInfo().isDeadObjectIndex(FrameIdx)) { - Op.ChangeToRegister(0, false /*isDef*/); - return true; - } - + unsigned FrameIdx = Op.getIndex(); unsigned Size = MF.getFrameInfo().getObjectSize(FrameIdx); StackOffset Offset = TFI->getFrameIndexReference(MF, FrameIdx, Reg); _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
