https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/127634
>From 090ea42681a2e0bfbb73853eb75f8e31d3adf120 Mon Sep 17 00:00:00 2001 From: ergawy <kareem.erg...@amd.com> Date: Tue, 18 Feb 2025 06:17:17 -0600 Subject: [PATCH] [flang][OpenMP] Extend `do concurrent` mapping to multi-range loops Adds support for converting multi-range loops to OpenMP (on the host only for now). The changes here "prepare" a loop nest for collapsing by sinking iteration variables to the innermost `fir.do_loop` op in the nest. --- flang/docs/DoConcurrentConversionToOpenMP.md | 29 ++++ .../OpenMP/DoConcurrentConversion.cpp | 139 +++++++++++++++++- .../multiple_iteration_ranges.f90 | 72 +++++++++ 3 files changed, 239 insertions(+), 1 deletion(-) create mode 100644 flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md b/flang/docs/DoConcurrentConversionToOpenMP.md index 19611615ee9d6..ecb4428d7d3ba 100644 --- a/flang/docs/DoConcurrentConversionToOpenMP.md +++ b/flang/docs/DoConcurrentConversionToOpenMP.md @@ -173,6 +173,35 @@ omp.parallel { <!-- TODO --> +### Multi-range loops + +The pass currently supports multi-range loops as well. Given the following +example: + +```fortran + do concurrent(i=1:n, j=1:m) + a(i,j) = i * j + end do +``` + +The generated `omp.loop_nest` operation looks like: + +``` +omp.loop_nest (%arg0, %arg1) + : index = (%17, %19) to (%18, %20) + inclusive step (%c1_2, %c1_4) { + fir.store %arg0 to %private_i#1 : !fir.ref<i32> + fir.store %arg1 to %private_j#1 : !fir.ref<i32> + ... + omp.yield +} +``` + +It is worth noting that we have privatized versions for both iteration +variables: `i` and `j`. These are locally allocated inside the parallel/target +OpenMP region similar to what the single-range example in the previous section +shows. + <!-- More details about current status will be added along with relevant parts of the implementation in later upstreaming patches. 
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 7ad2229df31ce..182820047b0c3 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -30,6 +30,9 @@ namespace looputils { struct InductionVariableInfo { /// the operation allocating memory for iteration variable, mlir::Operation *iterVarMemDef; + /// the operation(s) updating the iteration variable with the current + /// iteration number. + llvm::SetVector<mlir::Operation *> indVarUpdateOps; }; using LoopNestToIndVarMap = @@ -70,6 +73,47 @@ mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) { return result.getDefiningOp(); } +/// Collects the op(s) responsible for updating a loop's iteration variable with +/// the current iteration number. For example, for the input IR: +/// ``` +/// %i = fir.alloca i32 {bindc_name = "i"} +/// %i_decl:2 = hlfir.declare %i ... +/// ... +/// fir.do_loop %i_iv = %lb to %ub step %step unordered { +/// %1 = fir.convert %i_iv : (index) -> i32 +/// fir.store %1 to %i_decl#1 : !fir.ref<i32> +/// ... +/// } +/// ``` +/// this function would return the first 2 ops in the `fir.do_loop`'s region. 
+llvm::SetVector<mlir::Operation *> +extractIndVarUpdateOps(fir::DoLoopOp doLoop) { + mlir::Value indVar = doLoop.getInductionVar(); + llvm::SetVector<mlir::Operation *> indVarUpdateOps; + + llvm::SmallVector<mlir::Value> toProcess; + toProcess.push_back(indVar); + + llvm::DenseSet<mlir::Value> done; + + while (!toProcess.empty()) { + mlir::Value val = toProcess.back(); + toProcess.pop_back(); + + if (!done.insert(val).second) + continue; + + for (mlir::Operation *user : val.getUsers()) { + indVarUpdateOps.insert(user); + + for (mlir::Value result : user->getResults()) + toProcess.push_back(result); + } + } + + return std::move(indVarUpdateOps); +} + /// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff /// there are no operations in \p outerloop's body other than: /// @@ -166,7 +210,9 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop, while (true) { loopNest.insert( {currentLoop, - InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)}}); + InductionVariableInfo{ + findLoopIterationVarMemDecl(currentLoop), + std::move(looputils::extractIndVarUpdateOps(currentLoop))}}); llvm::SmallVector<fir::DoLoopOp> unorderedLoops; @@ -193,6 +239,96 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop, return mlir::success(); } + +/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In +/// particular, this function would take this input IR: +/// ``` +/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered { +/// fir.store %i_iv to %i#1 : !fir.ref<i32> +/// %j_lb = arith.constant 1 : i32 +/// %j_ub = arith.constant 10 : i32 +/// %j_step = arith.constant 1 : index +/// +/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered { +/// fir.store %j_iv to %j#1 : !fir.ref<i32> +/// ... 
+/// } +/// } +/// ``` +/// +/// into the following form (using generic op form since the result is +/// technically an invalid `fir.do_loop` op): +/// +/// ``` +/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({ +/// ^bb0(%i_iv: index): +/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32 +/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32 +/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index +/// +/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({ +/// ^bb0(%new_i_iv: index, %new_j_iv: index): +/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> () +/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> () +/// ... +/// }) +/// ``` +/// +/// What happened to the loop nest is the following: +/// +/// * the innermost loop's entry block was updated from having one operand to +/// having `n` operands where `n` is the number of loops in the nest, +/// +/// * the outer loop(s)' ops that update the IVs were sunk inside the innermost +/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above), +/// +/// * the innermost loop's entry block's arguments were mapped in order from the +/// outermost to the innermost IV. +/// +/// With this IR change, we can directly inline the innermost loop's region into +/// the newly generated `omp.loop_nest` op. +/// +/// Note that this function has a pre-condition that \p loopNest consists of +/// perfectly nested loops; i.e. there are no in-between ops between 2 nested +/// loops except for the ops to setup the inner loop's LB, UB, and step. These +/// ops are handled/cloned by `genLoopNestClauseOps(..)`. 
+void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter, + looputils::LoopNestToIndVarMap &loopNest) { + if (loopNest.size() <= 1) + return; + + fir::DoLoopOp innermostLoop = loopNest.back().first; + mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front(); + + llvm::SmallVector<mlir::Type> argTypes; + llvm::SmallVector<mlir::Location> argLocs; + + for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) { + // Sink the IV update ops to the innermost loop. We need to do for all loops + // except for the innermost one, hence the `drop_end` usage above. + for (mlir::Operation *op : indVarInfo.indVarUpdateOps) + op->moveBefore(&innermostFirstOp); + + argTypes.push_back(doLoop.getInductionVar().getType()); + argLocs.push_back(doLoop.getInductionVar().getLoc()); + } + + mlir::Region &innermmostRegion = innermostLoop.getRegion(); + // Extend the innermost entry block with arguments to represent the outer IVs. + innermmostRegion.addArguments(argTypes, argLocs); + + unsigned idx = 1; + // In reverse, remap the IVs of the loop nest from the old values to the new + // ones. We do that in reverse since the first argument before this loop is + // the old IV for the innermost loop. Therefore, we want to replace it first + // before the old value (1st argument in the block) is remapped to be the IV + // of the outermost loop in the nest. + for (auto &[doLoop, _] : llvm::reverse(loopNest)) { + doLoop.getInductionVar().replaceAllUsesWith( + innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx)); + ++idx; + } +} } // namespace looputils class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> { @@ -219,6 +355,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> { "Some `do concurent` loops are not perfectly-nested. 
" "These will be serialized."); + looputils::sinkLoopIVArgs(rewriter, loopNest); mlir::IRMapping mapper; genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper); mlir::omp::LoopNestOperands loopNestClauseOps; diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 new file mode 100644 index 0000000000000..232420fb07a75 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 @@ -0,0 +1,72 @@ +! Tests mapping of a `do concurrent` loop with multiple iteration ranges. + +! RUN: split-file %s %t + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %t/multi_range.f90 -o - \ +! RUN: | FileCheck %s + +!--- multi_range.f90 +program main + integer, parameter :: n = 20 + integer, parameter :: m = 40 + integer, parameter :: l = 60 + integer :: a(n, m, l) + + do concurrent(i=3:n, j=5:m, k=7:l) + a(i,j,k) = i * j + k + end do +end + +! CHECK: func.func @_QQmain + +! CHECK: %[[C3:.*]] = arith.constant 3 : i32 +! CHECK: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index +! CHECK: %[[C20:.*]] = arith.constant 20 : i32 +! CHECK: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index +! CHECK: %[[STEP_I:.*]] = arith.constant 1 : index + +! CHECK: %[[C5:.*]] = arith.constant 5 : i32 +! CHECK: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index +! CHECK: %[[C40:.*]] = arith.constant 40 : i32 +! CHECK: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index +! CHECK: %[[STEP_J:.*]] = arith.constant 1 : index + +! CHECK: %[[C7:.*]] = arith.constant 7 : i32 +! CHECK: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index +! CHECK: %[[C60:.*]] = arith.constant 60 : i32 +! CHECK: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index +! CHECK: %[[STEP_K:.*]] = arith.constant 1 : index + +! CHECK: omp.parallel { + +! CHECK-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"} +! 
CHECK-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"} + +! CHECK-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"} +! CHECK-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"} + +! CHECK-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"} +! CHECK-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"} + +! CHECK: omp.wsloop { +! CHECK-NEXT: omp.loop_nest +! CHECK-SAME: (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]]) +! CHECK-SAME: : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]]) +! CHECK-SAME: to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive +! CHECK-SAME: step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) { + +! CHECK-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]] +! CHECK-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#1 + +! CHECK-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]] +! CHECK-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#1 + +! CHECK-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]] +! CHECK-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#1 + +! CHECK: omp.yield +! CHECK-NEXT: } +! CHECK-NEXT: } + +! CHECK-NEXT: omp.terminator +! CHECK-NEXT: } _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits