https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/127634
>From 4515ac1a9f1a3efe94c902b7882400d04d25d4e8 Mon Sep 17 00:00:00 2001 From: ergawy <kareem.erg...@amd.com> Date: Tue, 18 Feb 2025 06:17:17 -0600 Subject: [PATCH 1/2] [flang][OpenMP] Extend `do concurrent` mapping to multi-range loops Adds support for converting mulit-range loops to OpenMP (on the host only for now). The changes here "prepare" a loop nest for collapsing by sinking iteration variables to the innermost `fir.do_loop` op in the nest. --- flang/docs/DoConcurrentConversionToOpenMP.md | 29 ++++ .../OpenMP/DoConcurrentConversion.cpp | 139 +++++++++++++++++- .../multiple_iteration_ranges.f90 | 72 +++++++++ 3 files changed, 239 insertions(+), 1 deletion(-) create mode 100644 flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md b/flang/docs/DoConcurrentConversionToOpenMP.md index 19611615ee9d6..ecb4428d7d3ba 100644 --- a/flang/docs/DoConcurrentConversionToOpenMP.md +++ b/flang/docs/DoConcurrentConversionToOpenMP.md @@ -173,6 +173,35 @@ omp.parallel { <!-- TODO --> +### Multi-range loops + +The pass currently supports multi-range loops as well. Given the following +example: + +```fortran + do concurrent(i=1:n, j=1:m) + a(i,j) = i * j + end do +``` + +The generated `omp.loop_nest` operation look like: + +``` +omp.loop_nest (%arg0, %arg1) + : index = (%17, %19) to (%18, %20) + inclusive step (%c1_2, %c1_4) { + fir.store %arg0 to %private_i#1 : !fir.ref<i32> + fir.store %arg1 to %private_j#1 : !fir.ref<i32> + ... + omp.yield +} +``` + +It is worth noting that we have privatized versions for both iteration +variables: `i` and `j`. These are locally allocated inside the parallel/target +OpenMP region similar to what the single-range example in previous section +shows. + <!-- More details about current status will be added along with relevant parts of the implementation in later upstreaming patches. diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index bb535cf8aba03..ae3fe4db6b9f5 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -30,6 +30,9 @@ namespace looputils { struct InductionVariableInfo { /// The operation allocating memory for iteration variable. mlir::Operation *iterVarMemDef; + /// the operation(s) updating the iteration variable with the current + /// iteration number. + llvm::SetVector<mlir::Operation *> indVarUpdateOps; }; using LoopNestToIndVarMap = @@ -87,6 +90,47 @@ mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) { return result.getDefiningOp(); } +/// Collects the op(s) responsible for updating a loop's iteration variable with +/// the current iteration number. For example, for the input IR: +/// ``` +/// %i = fir.alloca i32 {bindc_name = "i"} +/// %i_decl:2 = hlfir.declare %i ... +/// ... +/// fir.do_loop %i_iv = %lb to %ub step %step unordered { +/// %1 = fir.convert %i_iv : (index) -> i32 +/// fir.store %1 to %i_decl#1 : !fir.ref<i32> +/// ... +/// } +/// ``` +/// this function would return the first 2 ops in the `fir.do_loop`'s region. +llvm::SetVector<mlir::Operation *> +extractIndVarUpdateOps(fir::DoLoopOp doLoop) { + mlir::Value indVar = doLoop.getInductionVar(); + llvm::SetVector<mlir::Operation *> indVarUpdateOps; + + llvm::SmallVector<mlir::Value> toProcess; + toProcess.push_back(indVar); + + llvm::DenseSet<mlir::Value> done; + + while (!toProcess.empty()) { + mlir::Value val = toProcess.back(); + toProcess.pop_back(); + + if (!done.insert(val).second) + continue; + + for (mlir::Operation *user : val.getUsers()) { + indVarUpdateOps.insert(user); + + for (mlir::Value result : user->getResults()) + toProcess.push_back(result); + } + } + + return std::move(indVarUpdateOps); +} + /// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff /// there are no operations in \p outerloop's body other than: /// @@ -183,7 +227,9 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop, while (true) { loopNest.insert( {currentLoop, - InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)}}); + InductionVariableInfo{ + findLoopIterationVarMemDecl(currentLoop), + std::move(looputils::extractIndVarUpdateOps(currentLoop))}}); llvm::SmallVector<fir::DoLoopOp> unorderedLoops; @@ -210,6 +256,96 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop, return mlir::success(); } + +/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In +/// particular, this function would take this input IR: +/// ``` +/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered { +/// fir.store %i_iv to %i#1 : !fir.ref<i32> +/// %j_lb = arith.constant 1 : i32 +/// %j_ub = arith.constant 10 : i32 +/// %j_step = arith.constant 1 : index +/// +/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered { +/// fir.store %j_iv to %j#1 : !fir.ref<i32> +/// ... +/// } +/// } +/// ``` +/// +/// into the following form (using generic op form since the result is +/// technically an invalid `fir.do_loop` op: +/// +/// ``` +/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({ +/// ^bb0(%i_iv: index): +/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32 +/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32 +/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index +/// +/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({ +/// ^bb0(%new_i_iv: index, %new_j_iv: index): +/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> () +/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> () +/// ... +/// }) +/// ``` +/// +/// What happened to the loop nest is the following: +/// +/// * the innermost loop's entry block was updated from having one operand to +/// having `n` operands where `n` is the number of loops in the nest, +/// +/// * the outer loop(s)' ops that update the IVs were sank inside the innermost +/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above), +/// +/// * the innermost loop's entry block's arguments were mapped in order from the +/// outermost to the innermost IV. +/// +/// With this IR change, we can directly inline the innermost loop's region into +/// the newly generated `omp.loop_nest` op. +/// +/// Note that this function has a pre-condition that \p loopNest consists of +/// perfectly nested loops; i.e. there are no in-between ops between 2 nested +/// loops except for the ops to setup the inner loop's LB, UB, and step. These +/// ops are handled/cloned by `genLoopNestClauseOps(..)`. +void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter, + looputils::LoopNestToIndVarMap &loopNest) { + if (loopNest.size() <= 1) + return; + + fir::DoLoopOp innermostLoop = loopNest.back().first; + mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front(); + + llvm::SmallVector<mlir::Type> argTypes; + llvm::SmallVector<mlir::Location> argLocs; + + for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) { + // Sink the IV update ops to the innermost loop. We need to do for all loops + // except for the innermost one, hence the `drop_end` usage above. + for (mlir::Operation *op : indVarInfo.indVarUpdateOps) + op->moveBefore(&innermostFirstOp); + + argTypes.push_back(doLoop.getInductionVar().getType()); + argLocs.push_back(doLoop.getInductionVar().getLoc()); + } + + mlir::Region &innermmostRegion = innermostLoop.getRegion(); + // Extend the innermost entry block with arguments to represent the outer IVs. + innermmostRegion.addArguments(argTypes, argLocs); + + unsigned idx = 1; + // In reverse, remap the IVs of the loop nest from the old values to the new + // ones. We do that in reverse since the first argument before this loop is + // the old IV for the innermost loop. Therefore, we want to replace it first + // before the old value (1st argument in the block) is remapped to be the IV + // of the outermost loop in the nest. + for (auto &[doLoop, _] : llvm::reverse(loopNest)) { + doLoop.getInductionVar().replaceAllUsesWith( + innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx)); + ++idx; + } +} } // namespace looputils class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> { @@ -236,6 +372,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> { "Some `do concurent` loops are not perfectly-nested. " "These will be serialized."); + looputils::sinkLoopIVArgs(rewriter, loopNest); mlir::IRMapping mapper; genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper); mlir::omp::LoopNestOperands loopNestClauseOps; diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 new file mode 100644 index 0000000000000..232420fb07a75 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 @@ -0,0 +1,72 @@ +! Tests mapping of a `do concurrent` loop with multiple iteration ranges. + +! RUN: split-file %s %t + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %t/multi_range.f90 -o - \ +! RUN: | FileCheck %s + +!--- multi_range.f90 +program main + integer, parameter :: n = 20 + integer, parameter :: m = 40 + integer, parameter :: l = 60 + integer :: a(n, m, l) + + do concurrent(i=3:n, j=5:m, k=7:l) + a(i,j,k) = i * j + k + end do +end + +! CHECK: func.func @_QQmain + +! CHECK: %[[C3:.*]] = arith.constant 3 : i32 +! CHECK: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index +! CHECK: %[[C20:.*]] = arith.constant 20 : i32 +! CHECK: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index +! CHECK: %[[STEP_I:.*]] = arith.constant 1 : index + +! CHECK: %[[C5:.*]] = arith.constant 5 : i32 +! CHECK: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index +! CHECK: %[[C40:.*]] = arith.constant 40 : i32 +! CHECK: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index +! CHECK: %[[STEP_J:.*]] = arith.constant 1 : index + +! CHECK: %[[C7:.*]] = arith.constant 7 : i32 +! CHECK: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index +! CHECK: %[[C60:.*]] = arith.constant 60 : i32 +! CHECK: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index +! CHECK: %[[STEP_K:.*]] = arith.constant 1 : index + +! CHECK: omp.parallel { + +! CHECK-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"} +! CHECK-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"} + +! CHECK-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"} +! CHECK-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"} + +! CHECK-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"} +! CHECK-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"} + +! CHECK: omp.wsloop { +! CHECK-NEXT: omp.loop_nest +! CHECK-SAME: (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]]) +! CHECK-SAME: : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]]) +! CHECK-SAME: to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive +! CHECK-SAME: step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) { + +! CHECK-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]] +! CHECK-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#1 + +! CHECK-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]] +! CHECK-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#1 + +! CHECK-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]] +! CHECK-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#1 + +! CHECK: omp.yield +! CHECK-NEXT: } +! CHECK-NEXT: } + +! CHECK-NEXT: omp.terminator +! CHECK-NEXT: } >From 7b60c5bd09fbac288934d27af3bcea4b2350187d Mon Sep 17 00:00:00 2001 From: ergawy <kareem.erg...@amd.com> Date: Mon, 17 Mar 2025 02:43:30 -0500 Subject: [PATCH 2/2] Simplify iv update op detection --- .../OpenMP/DoConcurrentConversion.cpp | 157 +++++++----------- 1 file changed, 62 insertions(+), 95 deletions(-) diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index ae3fe4db6b9f5..6160c642767c0 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -28,108 +28,80 @@ namespace looputils { /// Stores info needed about the induction/iteration variable for each `do /// concurrent` in a loop nest. struct InductionVariableInfo { + InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); } + /// The operation allocating memory for iteration variable. mlir::Operation *iterVarMemDef; /// the operation(s) updating the iteration variable with the current /// iteration number. - llvm::SetVector<mlir::Operation *> indVarUpdateOps; -}; - -using LoopNestToIndVarMap = - llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>; + llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps; -/// For the \p doLoop parameter, find the operation that declares its iteration -/// variable or allocates memory for it. -/// -/// For example, give the following loop: -/// ``` -/// ... -/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ... -/// ... -/// fir.do_loop %ind_var = %lb to %ub step %s unordered { -/// %ind_var_conv = fir.convert %ind_var : (index) -> i32 -/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32> -/// ... -/// } -/// ``` -/// -/// This function returns the `hlfir.declare` op for `%i`. -/// -/// Note: The current implementation is dependent on how flang emits loop -/// bodies; which is sufficient for the current simple test/use cases. If this -/// proves to be insufficient, this should be made more generic. -mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) { - mlir::Value result = nullptr; - - // Checks if a StoreOp is updating the memref of the loop's iteration - // variable. - auto isStoringIV = [&](fir::StoreOp storeOp) { - // Direct store into the IV memref. - if (storeOp.getValue() == doLoop.getInductionVar()) - return true; - - // Indirect store into the IV memref. - if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>( - storeOp.getValue().getDefiningOp())) { - if (convertOp.getOperand() == doLoop.getInductionVar()) +private: + /// For the \p doLoop parameter, find the following: + /// + /// 1. The operation that declares its iteration variable or allocates memory + /// for it. For example, give the following loop: + /// ``` + /// ... + /// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ... + /// ... + /// fir.do_loop %ind_var = %lb to %ub step %s unordered { + /// %ind_var_conv = fir.convert %ind_var : (index) -> i32 + /// fir.store %ind_var_conv to %i#1 : !fir.ref<i32> + /// ... + /// } + /// ``` + /// + /// This function sets the `iterVarMemDef` member to the `hlfir.declare` op + /// for `%i`. + /// + /// 2. The operation(s) that update the loop's iteration variable from its + /// induction variable. For the above example, the `indVarUpdateOps` is + /// populated with the first 2 ops in the loop's body. + /// + /// Note: The current implementation is dependent on how flang emits loop + /// bodies; which is sufficient for the current simple test/use cases. If this + /// proves to be insufficient, this should be made more generic. + void populateInfo(fir::DoLoopOp doLoop) { + mlir::Value result = nullptr; + + // Checks if a StoreOp is updating the memref of the loop's iteration + // variable. + auto isStoringIV = [&](fir::StoreOp storeOp) { + // Direct store into the IV memref. + if (storeOp.getValue() == doLoop.getInductionVar()) { + indVarUpdateOps.push_back(storeOp); return true; - } - - return false; - }; - - for (mlir::Operation &op : doLoop) { - if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op)) - if (isStoringIV(storeOp)) { - result = storeOp.getMemref(); - break; } - } - assert(result != nullptr && result.getDefiningOp() != nullptr); - return result.getDefiningOp(); -} - -/// Collects the op(s) responsible for updating a loop's iteration variable with -/// the current iteration number. For example, for the input IR: -/// ``` -/// %i = fir.alloca i32 {bindc_name = "i"} -/// %i_decl:2 = hlfir.declare %i ... -/// ... -/// fir.do_loop %i_iv = %lb to %ub step %step unordered { -/// %1 = fir.convert %i_iv : (index) -> i32 -/// fir.store %1 to %i_decl#1 : !fir.ref<i32> -/// ... -/// } -/// ``` -/// this function would return the first 2 ops in the `fir.do_loop`'s region. -llvm::SetVector<mlir::Operation *> -extractIndVarUpdateOps(fir::DoLoopOp doLoop) { - mlir::Value indVar = doLoop.getInductionVar(); - llvm::SetVector<mlir::Operation *> indVarUpdateOps; - - llvm::SmallVector<mlir::Value> toProcess; - toProcess.push_back(indVar); - - llvm::DenseSet<mlir::Value> done; - - while (!toProcess.empty()) { - mlir::Value val = toProcess.back(); - toProcess.pop_back(); - - if (!done.insert(val).second) - continue; + // Indirect store into the IV memref. + if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>( + storeOp.getValue().getDefiningOp())) { + if (convertOp.getOperand() == doLoop.getInductionVar()) { + indVarUpdateOps.push_back(convertOp); + indVarUpdateOps.push_back(storeOp); + return true; + } + } - for (mlir::Operation *user : val.getUsers()) { - indVarUpdateOps.insert(user); + return false; + }; - for (mlir::Value result : user->getResults()) - toProcess.push_back(result); + for (mlir::Operation &op : doLoop) { + if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op)) + if (isStoringIV(storeOp)) { + result = storeOp.getMemref(); + break; + } } + + assert(result != nullptr && result.getDefiningOp() != nullptr); + iterVarMemDef = result.getDefiningOp(); } +}; - return std::move(indVarUpdateOps); -} +using LoopNestToIndVarMap = + llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>; /// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff /// there are no operations in \p outerloop's body other than: @@ -225,12 +197,7 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop, assert(currentLoop.getUnordered()); while (true) { - loopNest.insert( - {currentLoop, - InductionVariableInfo{ - findLoopIterationVarMemDecl(currentLoop), - std::move(looputils::extractIndVarUpdateOps(currentLoop))}}); - + loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)}); llvm::SmallVector<fir::DoLoopOp> unorderedLoops; for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>()) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits