https://github.com/Saieiei updated https://github.com/llvm/llvm-project/pull/199967
>From ef37a6c3f7c426bf54a709cde4e682603e6c6b9c Mon Sep 17 00:00:00 2001 From: Sairudra More <[email protected]> Date: Fri, 12 Jun 2026 04:57:07 -0500 Subject: [PATCH] [flang][OpenMP] Lower target in_reduction for host fallback Enable host-fallback lowering for target in_reduction in Flang and MLIR OpenMP translation. Model target in_reduction through the matching map entry, force address-preserving implicit mapping for Flang in_reduction list items, and emit the host-side task-reduction lookup with __kmpc_task_reduction_get_th_data. Unsupported device/offload-entry and richer reduction forms remain diagnosed. Add Flang lowering, MLIR verifier/translation, and LLVM IR tests for the supported host-fallback path and the remaining unsupported cases. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 91 ++++++++--- .../Lower/OpenMP/Todo/target-inreduction.f90 | 15 -- .../OpenMP/target-inreduction-unused.f90 | 27 ++++ .../test/Lower/OpenMP/target-inreduction.f90 | 30 ++++ .../OpenMP/function-filtering-host-ops.mlir | 2 +- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 11 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 117 ++++++++++----- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 142 ++++++++++++++++-- mlir/test/Dialect/OpenMP/invalid.mlir | 60 ++++++++ .../openmp-target-in-reduction-multi.mlir | 75 +++++++++ .../LLVMIR/openmp-target-in-reduction.mlir | 52 +++++++ mlir/test/Target/LLVMIR/openmp-todo.mlir | 86 ++++++++++- 12 files changed, 617 insertions(+), 91 deletions(-) delete mode 100644 flang/test/Lower/OpenMP/Todo/target-inreduction.f90 create mode 100644 flang/test/Lower/OpenMP/target-inreduction-unused.f90 create mode 100644 flang/test/Lower/OpenMP/target-inreduction.f90 create mode 100644 mlir/test/Target/LLVMIR/openmp-target-in-reduction-multi.mlir create mode 100644 mlir/test/Target/LLVMIR/openmp-target-in-reduction.mlir diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 7cb7e379eb503..61d3b3b6329a5 100644 --- 
a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -433,18 +433,28 @@ static void bindEntryBlockArgs(lower::AbstractConverter &converter, .first); }; - // Process in clause name alphabetical order to match block arguments order. // Do not bind host_eval variables because they cannot be used inside of the // corresponding region, except for very specific cases handled separately. + // + // For `omp.task` / `omp.taskloop`, `in_reduction` list items have their own + // entry block argument and are bound here like other private-like variables. + // + // `in_reduction` list items on `omp.target` are not given their own entry + // block argument (`args.inReduction` is left empty for target), so the + // in_reduction bind below is a no-op there. Instead they are implicitly + // mapped, so in-body references resolve to the `map_entries` block argument + // bound here; the host side uses the `in_reduction` clause metadata to + // redirect that mapped value to the per-task reduction-private storage during + // translation. 
bindMapLike(args.hasDeviceAddr.objects, op.getHasDeviceAddrBlockArgs()); - bindPrivateLike(args.inReduction.objects, args.inReduction.vars, - op.getInReductionBlockArgs()); bindMapLike(args.map.objects, op.getMapBlockArgs()); bindPrivateLike(args.priv.objects, args.priv.vars, op.getPrivateBlockArgs()); bindPrivateLike(args.reduction.objects, args.reduction.vars, op.getReductionBlockArgs()); bindPrivateLike(args.taskReduction.objects, args.taskReduction.vars, op.getTaskReductionBlockArgs()); + bindPrivateLike(args.inReduction.objects, args.inReduction.vars, + op.getInReductionBlockArgs()); bindMapLike(args.useDeviceAddr.objects, op.getUseDeviceAddrBlockArgs()); bindMapLike(args.useDevicePtr.objects, op.getUseDevicePtrBlockArgs()); } @@ -1873,6 +1883,7 @@ genTargetClauses(lower::AbstractConverter &converter, mlir::omp::TargetOperands &clauseOps, DefaultMapsTy &defaultMaps, llvm::SmallVectorImpl<Object> &hasDeviceAddrObjects, + llvm::SmallVectorImpl<Object> &inReductionObjects, llvm::SmallVectorImpl<Object> &isDevicePtrObjects, llvm::SmallVectorImpl<Object> &mapObjects) { ClauseProcessor cp(converter, semaCtx, clauses); @@ -1887,13 +1898,14 @@ genTargetClauses(lower::AbstractConverter &converter, hostEvalInfo->collectValues(clauseOps.hostEvalVars); } cp.processIf(llvm::omp::Directive::OMPD_target, clauseOps); + cp.processInReduction(loc, clauseOps, inReductionObjects); cp.processIsDevicePtr(stmtCtx, clauseOps, isDevicePtrObjects); cp.processMap(loc, stmtCtx, clauseOps, llvm::omp::Directive::OMPD_unknown, &mapObjects); cp.processNowait(clauseOps); cp.processThreadLimit(stmtCtx, clauseOps); - cp.processTODO<clause::Allocate, clause::InReduction, clause::UsesAllocators>( + cp.processTODO<clause::Allocate, clause::UsesAllocators>( loc, llvm::omp::Directive::OMPD_target); // `target private(..)` is only supported in delayed privatization mode. 
@@ -2932,10 +2944,10 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::omp::TargetOperands clauseOps; DefaultMapsTy defaultMaps; llvm::SmallVector<Object> mapObjects, hasDeviceAddrObjects, - isDevicePtrObjects; + inReductionObjects, isDevicePtrObjects; genTargetClauses(converter, semaCtx, symTable, stmtCtx, eval, item->clauses, loc, clauseOps, defaultMaps, hasDeviceAddrObjects, - isDevicePtrObjects, mapObjects); + inReductionObjects, isDevicePtrObjects, mapObjects); if (!isDevicePtrObjects.empty()) { // is_device_ptr maps get duplicated so the clause and synthesized @@ -2989,7 +3001,16 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, // symbols used inside the region that do not have explicit data-environment // attribute clauses (neither data-sharing; e.g. `private`, nor `map` // clauses). - auto captureImplicitMap = [&](const semantics::Symbol &sym) { + // + // When `forceAddressPreserving` is set, the symbol is force-mapped as an + // address-preserving `capture(ByRef)` with implicit `tofrom` flags, + // bypassing the scalar default capture rules. This is used for `target + // in_reduction` list items, whose mapped pointer is passed as the `orig` + // argument of `__kmpc_task_reduction_get_th_data`; a ByCopy scalar capture + // would break the runtime lookup against the enclosing taskgroup's + // task_reduction descriptor. + auto captureImplicitMap = [&](const semantics::Symbol &sym, + bool forceAddressPreserving = false) { // Structure component symbols don't have bindings, and can only be // explicitly mapped individually. If a member is captured implicitly // we map the entirety of the derived type when we find its symbol. @@ -2998,12 +3019,13 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, // if the symbol is part of an already mapped common block, do not make a // map for it. 
- if (const Fortran::semantics::Symbol *common = - Fortran::semantics::FindCommonBlockContaining(sym.GetUltimate())) - if (llvm::any_of(mapObjects, [=](const Object &object) { - return object.sym() == common; - })) - return; + if (!forceAddressPreserving) + if (const Fortran::semantics::Symbol *common = + Fortran::semantics::FindCommonBlockContaining(sym.GetUltimate())) + if (llvm::any_of(mapObjects, [=](const Object &object) { + return object.sym() == common; + })) + return; // If we come across a symbol without a symbol address, we // return as we cannot process it, this is intended as a @@ -3018,7 +3040,8 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, // dynamic indices on the device (e.g., const_array(runtime_index)). // Also, character scalar parameters must be mapped if they have dynamic // substring access. - if (semantics::IsNamedConstant(sym) && sym.Rank() == 0 && + if (!forceAddressPreserving && semantics::IsNamedConstant(sym) && + sym.Rank() == 0 && !symbolsWithDynamicSubstring.contains(&sym.GetUltimate())) return; @@ -3047,14 +3070,32 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, if (auto refType = mlir::dyn_cast<fir::ReferenceType>(baseOp.getType())) eleType = refType.getElementType(); + // `target in_reduction` list items must keep the original variable + // address (ByRef + implicit tofrom) so the runtime lookup receives the + // variable address; all other implicit captures follow the scalar + // default mapping rules. std::pair<mlir::omp::ClauseMapFlags, mlir::omp::VariableCaptureKind> - mapFlagAndKind = getImplicitMapTypeAndKind( - firOpBuilder, converter, defaultMaps, eleType, loc, sym); + mapFlagAndKind = + forceAddressPreserving + ? 
std::pair< + mlir::omp::ClauseMapFlags, + mlir::omp:: + VariableCaptureKind>{mlir::omp::ClauseMapFlags:: + implicit | + mlir::omp::ClauseMapFlags:: + to | + mlir::omp::ClauseMapFlags:: + from, + mlir::omp:: + VariableCaptureKind::ByRef} + : getImplicitMapTypeAndKind(firOpBuilder, converter, + defaultMaps, eleType, loc, sym); mlir::FlatSymbolRefAttr mapperId; auto defaultmapBehaviour = getDefaultmapIfPresent(defaultMaps, eleType); - if (defaultmapBehaviour == - clause::Defaultmap::ImplicitBehavior::Default) { + if (!forceAddressPreserving && + defaultmapBehaviour == + clause::Defaultmap::ImplicitBehavior::Default) { const semantics::DerivedTypeSpec *typeSpec = sym.GetType() ? sym.GetType()->AsDerived() : nullptr; if (typeSpec) { @@ -3108,6 +3149,15 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, Object{const_cast<semantics::Symbol *>(&sym), std::nullopt}); } }; + // OpenMP requires `in_reduction` list items on `target` to be implicitly + // data-mapped. Force-map them as address-preserving captures before the + // generic implicit-map walk so that walk treats the symbols as already + // mapped via `isDuplicateMappedSymbol` and does not downgrade them to + // ByCopy. + for (const Object &object : inReductionObjects) + if (const semantics::Symbol *sym = object.sym()) + captureImplicitMap(*sym, /*forceAddressPreserving=*/true); + lower::pft::visitAllSymbols(eval, captureImplicitMap); auto targetOp = mlir::omp::TargetOp::create(firOpBuilder, loc, clauseOps); @@ -3120,7 +3170,10 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, args.hasDeviceAddr.objects = hasDeviceAddrObjects; args.hasDeviceAddr.vars = hasDeviceAddrBaseValues; args.hostEvalVars = clauseOps.hostEvalVars; - // TODO: Add in_reduction syms and vars. 
+ // `in_reduction` list items do not get their own entry block argument on + // `omp.target`; they are implicitly mapped (see the force-map above) and the + // target body accesses them through their `map_entries` block argument. The + // `in_reduction` operands remain on the op as host-side metadata. args.map.objects = mapObjects; args.map.vars = mapBaseValues; args.priv.objects = makeObjects(dsp.getDelayedPrivSymbols()); diff --git a/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 deleted file mode 100644 index e5a9cffac5a11..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 +++ /dev/null @@ -1,15 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s - -!=============================================================================== -! `mergeable` clause -!=============================================================================== - -! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TARGET construct -subroutine omp_target_inreduction() - integer i - i = 0 - !$omp target in_reduction(+:i) - i = i + 1 - !$omp end target -end subroutine omp_target_inreduction diff --git a/flang/test/Lower/OpenMP/target-inreduction-unused.f90 b/flang/test/Lower/OpenMP/target-inreduction-unused.f90 new file mode 100644 index 0000000000000..c002846494916 --- /dev/null +++ b/flang/test/Lower/OpenMP/target-inreduction-unused.f90 @@ -0,0 +1,27 @@ +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! Per the OpenMP spec, an in_reduction list item on a target construct is +! implicitly data-mapped. The lowering must not rely on the variable being +! referenced inside the target body to discover that map: here `i` only +! 
appears in the in_reduction clause and is never read or written inside +! the region. Verify that an omp.map.info for `i` is still emitted and +! flows into the omp.target's map_entries. + +!CHECK-LABEL: func.func @_QPomp_target_in_reduction_unused() +!CHECK: %[[IDECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFomp_target_in_reduction_unusedEi"} +!CHECK: %[[IMAP:.*]] = omp.map.info var_ptr(%[[IDECL]]#1 : !fir.ref<i32>, i32) map_clauses(implicit, tofrom) capture(ByRef) -> !fir.ref<i32> {name = "i"} +!CHECK: omp.target in_reduction(@{{[^ ]+}} %[[IDECL]]#0 : !fir.ref<i32>) +!CHECK-SAME: map_entries(%[[IMAP]] -> %{{[^ ]+}} : !fir.ref<i32>) + +subroutine omp_target_in_reduction_unused() + interface + subroutine sub() + end subroutine + end interface + integer i + i = 0 + !$omp target in_reduction(+:i) + call sub() + !$omp end target +end subroutine omp_target_in_reduction_unused diff --git a/flang/test/Lower/OpenMP/target-inreduction.f90 b/flang/test/Lower/OpenMP/target-inreduction.f90 new file mode 100644 index 0000000000000..40935dc109e94 --- /dev/null +++ b/flang/test/Lower/OpenMP/target-inreduction.f90 @@ -0,0 +1,30 @@ +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! Verify that in_reduction on a target construct is lowered to an +! omp.target with both an in_reduction clause and an implicit map_entries +! entry for the same variable. The in_reduction clause does not define an +! entry block argument: inside the target body the variable is accessed +! through its map_entries block argument. The implicit map also captures the +! original pointer into the target region so the MLIR -> LLVM IR translation +! can pass it to __kmpc_task_reduction_get_th_data. 
+ +!CHECK-LABEL: omp.declare_reduction +!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init { + +!CHECK-LABEL: func.func @_QPomp_target_in_reduction() +!CHECK: %[[IDECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFomp_target_in_reductionEi"} +!CHECK: %[[IMAP:.*]] = omp.map.info var_ptr(%[[IDECL]]#1 : !fir.ref<i32>, i32) map_clauses(implicit, tofrom) capture(ByRef) -> !fir.ref<i32> {name = "i"} +!CHECK: omp.target in_reduction(@[[RED_I32_NAME]] %[[IDECL]]#0 : !fir.ref<i32>) +!CHECK-SAME: map_entries(%[[IMAP]] -> %[[MAPARG:[^ ]+]] : !fir.ref<i32>) +!CHECK: hlfir.declare %[[MAPARG]] +!CHECK: omp.terminator +!CHECK: } + +subroutine omp_target_in_reduction() + integer i + i = 0 + !$omp target in_reduction(+:i) + i = i + 1 + !$omp end target +end subroutine omp_target_in_reduction diff --git a/flang/test/Transforms/OpenMP/function-filtering-host-ops.mlir b/flang/test/Transforms/OpenMP/function-filtering-host-ops.mlir index 2df9c5a8c0713..be3274fc297d5 100644 --- a/flang/test/Transforms/OpenMP/function-filtering-host-ops.mlir +++ b/flang/test/Transforms/OpenMP/function-filtering-host-ops.mlir @@ -436,7 +436,7 @@ module attributes {omp.is_target_device = true} { omp.target allocate(%ref : !fir.ref<i32> -> %ref : !fir.ref<i32>) depend(taskdependin -> %ref : !fir.ref<i32>) device(%int : i32) if(%bool) thread_limit(%int : i32) - in_reduction(@reduction %ref -> %arg0 : !fir.ref<i32>) + in_reduction(@reduction %ref : !fir.ref<i32>) private(@privatizer %ref -> %arg1 : !fir.ref<i32>) { omp.terminator } diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 0962b330e2f23..bea493f7186b8 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1618,6 +1618,15 @@ def TargetOp : OpenMP_Op<"target", traits = [ ]; let extraClassDeclaration = [{ + // Override BlockArgOpenMPOpInterface method because `in_reduction` list + // items on `omp.target` do not define entry block 
arguments. The reduction + // variable is accessed inside the target body through its matching + // `map_entries` block argument; the `in_reduction` operands are kept only + // as host-side metadata used to look up the per-task private storage. + unsigned numInReductionBlockArgs() { + return 0; + } + mlir::Value getMappedValueForPrivateVar(unsigned privVarIdx) { std::optional<DenseI64ArrayAttr> privateMapIdices = getPrivateMapsAttr(); @@ -1655,7 +1664,7 @@ def TargetOp : OpenMP_Op<"target", traits = [ static ::mlir::omp::TargetExecMode getKernelExecFlags(Operation *capturedOp, bool *hostEvalTripCount = nullptr); - }] # clausesExtraClassDeclaration; + }]#clausesExtraClassDeclaration; let assemblyFormat = clausesAssemblyFormat # [{ custom<TargetOpRegion>( diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 7cef23bdfef18..cf8b4952ac522 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1070,6 +1070,10 @@ struct ReductionParseArgs { DenseBoolArrayAttr &byref; ArrayAttr &syms; ReductionModifierAttr *modifier; + // When false, the clause operands are parsed without defining entry block + // arguments (i.e. without the `-> %arg` mapping). Used for `in_reduction` on + // `omp.target`, where the list items are host-side metadata only. 
+ bool regionArgs = true; ReductionParseArgs(SmallVectorImpl<OpAsmParser::UnresolvedOperand> &vars, SmallVectorImpl<Type> &types, DenseBoolArrayAttr &byref, ArrayAttr &syms, ReductionModifierAttr *mod = nullptr) @@ -1100,12 +1104,12 @@ static ParseResult parseClauseWithRegionArgs( SmallVectorImpl<OpAsmParser::Argument> &regionPrivateArgs, ArrayAttr *symbols = nullptr, DenseI64ArrayAttr *mapIndices = nullptr, DenseBoolArrayAttr *byref = nullptr, - ReductionModifierAttr *modifier = nullptr, - UnitAttr *needsBarrier = nullptr) { + ReductionModifierAttr *modifier = nullptr, UnitAttr *needsBarrier = nullptr, + bool parseRegionArgs = true) { SmallVector<SymbolRefAttr> symbolVec; SmallVector<int64_t> mapIndicesVec; SmallVector<bool> isByRefVec; - unsigned regionArgOffset = regionPrivateArgs.size(); + [[maybe_unused]] unsigned regionArgOffset = regionPrivateArgs.size(); if (parser.parseLParen()) return failure(); @@ -1132,9 +1136,12 @@ static ParseResult parseClauseWithRegionArgs( if (symbols && parser.parseAttribute(symbolVec.emplace_back())) return failure(); - if (parser.parseOperand(operands.emplace_back()) || - parser.parseArrow() || - parser.parseArgument(regionPrivateArgs.emplace_back())) + if (parser.parseOperand(operands.emplace_back())) + return failure(); + + if (parseRegionArgs && + (parser.parseArrow() || + parser.parseArgument(regionPrivateArgs.emplace_back()))) return failure(); if (mapIndices) { @@ -1175,11 +1182,13 @@ static ParseResult parseClauseWithRegionArgs( *needsBarrier = mlir::UnitAttr::get(parser.getContext()); } - auto *argsBegin = regionPrivateArgs.begin(); - MutableArrayRef argsSubrange(argsBegin + regionArgOffset, - argsBegin + regionArgOffset + types.size()); - for (auto [prv, type] : llvm::zip_equal(argsSubrange, types)) { - prv.type = type; + if (parseRegionArgs) { + auto *argsBegin = regionPrivateArgs.begin(); + MutableArrayRef argsSubrange(argsBegin + regionArgOffset, + argsBegin + regionArgOffset + types.size()); + for (auto [prv, type] : 
llvm::zip_equal(argsSubrange, types)) { + prv.type = type; + } } if (symbols) { @@ -1239,7 +1248,8 @@ static ParseResult parseBlockArgClause( if (failed(parseClauseWithRegionArgs( parser, reductionArgs->vars, reductionArgs->types, entryBlockArgs, &reductionArgs->syms, /*mapIndices=*/nullptr, &reductionArgs->byref, - reductionArgs->modifier))) + reductionArgs->modifier, /*needsBarrier=*/nullptr, + reductionArgs->regionArgs))) return failure(); } return success(); @@ -1318,6 +1328,8 @@ static ParseResult parseTargetOpRegion( args.hostEvalArgs.emplace(hostEvalVars, hostEvalTypes); args.inReductionArgs.emplace(inReductionVars, inReductionTypes, inReductionByref, inReductionSyms); + // `in_reduction` on `omp.target` does not define entry block arguments. + args.inReductionArgs->regionArgs = false; args.mapArgs.emplace(mapVars, mapTypes); args.privateArgs.emplace(privateVars, privateTypes, privateSyms, privateNeedsBarrier, &privateMaps); @@ -1438,6 +1450,10 @@ struct ReductionPrintArgs { DenseBoolArrayAttr byref; ArrayAttr syms; ReductionModifierAttr modifier; + // When false, the clause operands are printed without their entry block + // arguments (i.e. without the `-> %arg` mapping). Used for `in_reduction` on + // `omp.target`, where the list items are host-side metadata only. + bool regionArgs = true; ReductionPrintArgs(ValueRange vars, TypeRange types, DenseBoolArrayAttr byref, ArrayAttr syms, ReductionModifierAttr mod = nullptr) : vars(vars), types(types), byref(byref), syms(syms), modifier(mod) {} @@ -1460,8 +1476,9 @@ static void printClauseWithRegionArgs( ValueRange argsSubrange, ValueRange operands, TypeRange types, ArrayAttr symbols = nullptr, DenseI64ArrayAttr mapIndices = nullptr, DenseBoolArrayAttr byref = nullptr, - ReductionModifierAttr modifier = nullptr, UnitAttr needsBarrier = nullptr) { - if (argsSubrange.empty()) + ReductionModifierAttr modifier = nullptr, UnitAttr needsBarrier = nullptr, + bool printRegionArgs = true) { + if (printRegionArgs ? 
argsSubrange.empty() : operands.empty()) return; p << clauseName << "("; @@ -1484,21 +1501,37 @@ static void printClauseWithRegionArgs( byref = DenseBoolArrayAttr::get(ctx, values); } - llvm::interleaveComma(llvm::zip_equal(operands, argsSubrange, symbols, - mapIndices.asArrayRef(), - byref.asArrayRef()), - p, [&p](auto t) { - auto [op, arg, sym, map, isByRef] = t; - if (isByRef) - p << "byref "; - if (sym) - p << sym << " "; - - p << op << " -> " << arg; - - if (map != -1) - p << " [map_idx=" << map << "]"; - }); + if (printRegionArgs) { + llvm::interleaveComma(llvm::zip_equal(operands, argsSubrange, symbols, + mapIndices.asArrayRef(), + byref.asArrayRef()), + p, [&p](auto t) { + auto [op, arg, sym, map, isByRef] = t; + if (isByRef) + p << "byref "; + if (sym) + p << sym << " "; + + p << op << " -> " << arg; + + if (map != -1) + p << " [map_idx=" << map << "]"; + }); + } else { + // The clause operands carry no entry block arguments, so the `-> %arg` + // mapping is omitted (e.g. `in_reduction` on `omp.target`). 
+ llvm::interleaveComma( + llvm::zip_equal(operands, symbols, byref.asArrayRef()), p, + [&p](auto t) { + auto [op, sym, isByRef] = t; + if (isByRef) + p << "byref "; + if (sym) + p << sym << " "; + + p << op; + }); + } p << " : "; llvm::interleaveComma(types, p); p << ") "; @@ -1530,10 +1563,11 @@ printBlockArgClause(OpAsmPrinter &p, MLIRContext *ctx, StringRef clauseName, ValueRange argsSubrange, std::optional<ReductionPrintArgs> reductionArgs) { if (reductionArgs) - printClauseWithRegionArgs(p, ctx, clauseName, argsSubrange, - reductionArgs->vars, reductionArgs->types, - reductionArgs->syms, /*mapIndices=*/nullptr, - reductionArgs->byref, reductionArgs->modifier); + printClauseWithRegionArgs( + p, ctx, clauseName, argsSubrange, reductionArgs->vars, + reductionArgs->types, reductionArgs->syms, /*mapIndices=*/nullptr, + reductionArgs->byref, reductionArgs->modifier, + /*needsBarrier=*/nullptr, reductionArgs->regionArgs); } static void printBlockArgRegion(OpAsmPrinter &p, Operation *op, Region &region, @@ -1582,6 +1616,8 @@ static void printTargetOpRegion( args.hostEvalArgs.emplace(hostEvalVars, hostEvalTypes); args.inReductionArgs.emplace(inReductionVars, inReductionTypes, inReductionByref, inReductionSyms); + // `in_reduction` on `omp.target` does not define entry block arguments. + args.inReductionArgs->regionArgs = false; args.mapArgs.emplace(mapVars, mapTypes); args.privateArgs.emplace(privateVars, privateTypes, privateSyms, privateNeedsBarrier, privateMaps); @@ -2545,8 +2581,7 @@ LogicalResult TargetUpdateOp::verify() { void TargetOp::build(OpBuilder &builder, OperationState &state, const TargetOperands &clauses) { MLIRContext *ctx = builder.getContext(); - // TODO Store clauses in op: allocateVars, allocatorVars, inReductionVars, - // inReductionByref, inReductionSyms. + // TODO Store clauses in op: allocateVars, allocatorVars. 
TargetOp::build( builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{}, clauses.bare, makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, @@ -2554,9 +2589,10 @@ void TargetOp::build(OpBuilder &builder, OperationState &state, clauses.device, clauses.dynGroupprivateAccessGroup, clauses.dynGroupprivateFallback, clauses.dynGroupprivateSize, clauses.hasDeviceAddrVars, clauses.hostEvalVars, clauses.ifExpr, - /*in_reduction_vars=*/{}, /*in_reduction_byref=*/nullptr, - /*in_reduction_syms=*/nullptr, clauses.isDevicePtrVars, clauses.mapVars, - clauses.nowait, clauses.privateVars, + clauses.inReductionVars, + makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), + makeArrayAttr(ctx, clauses.inReductionSyms), clauses.isDevicePtrVars, + clauses.mapVars, clauses.nowait, clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), clauses.privateNeedsBarrier, clauses.threadLimitVars, /*private_maps=*/nullptr); @@ -2583,6 +2619,11 @@ LogicalResult TargetOp::verify() { if (failed(verifyPrivateVarList(*this))) return failure(); + if (failed(verifyReductionVarList(*this, getInReductionSyms(), + getInReductionVars(), + getInReductionByref()))) + return failure(); + return verifyPrivateVarsMapping(*this); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index f0aa2486d9e9d..ffcca220ae154 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -344,10 +344,46 @@ static LogicalResult checkImplementationStatus(Operation &op) { if (op.getHint()) op.emitWarning("hint clause discarded"); }; + // in_reduction support varies by operation: + // - omp.task does not implement in_reduction at all yet. + // - omp.taskloop.context and omp.target implement the non-byref form; the + // byref form is not implemented yet. 
+ // - omp.target additionally does not implement declare reductions that use + // a cleanup region or a two-argument (alloc) initializer. auto checkInReduction = [&todo](auto op, LogicalResult &result) { - if (!op.getInReductionVars().empty() || op.getInReductionByref() || - op.getInReductionSyms()) - result = todo("in_reduction"); + if (isa<omp::TaskOp>(op.getOperation())) { + if (!op.getInReductionVars().empty() || op.getInReductionByref() || + op.getInReductionSyms()) + result = todo("in_reduction"); + return; + } + if (auto byrefAttr = op.getInReductionByref()) + for (bool isByRef : *byrefAttr) + if (isByRef) { + result = todo("in_reduction with byref modifier"); + return; + } + if (isa<omp::TargetOp>(op.getOperation())) { + if (auto inReductionSyms = op.getInReductionSyms()) { + for (auto sym : + (*inReductionSyms).template getAsRange<SymbolRefAttr>()) { + auto decl = + SymbolTable::lookupNearestSymbolFrom<omp::DeclareReductionOp>( + op, sym); + // Symbol resolution is guaranteed by the op verifier. 
+ if (!decl) + continue; + if (decl.getInitializerRegion().front().getNumArguments() != 1) { + result = todo("in_reduction with two-argument initializer"); + return; + } + if (!decl.getCleanupRegion().empty()) { + result = todo("in_reduction with cleanup region"); + return; + } + } + } + } }; auto checkNowait = [&todo](auto op, LogicalResult &result) { if (op.getNowait()) @@ -386,14 +422,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { return; } }; - auto checkInReductionByref = [&todo](auto op, LogicalResult &result) { - if (auto byrefAttr = op.getInReductionByref()) - for (bool isByRef : *byrefAttr) - if (isByRef) { - result = todo("in_reduction with byref modifier"); - return; - } - }; auto checkNumTeams = [&todo](auto op, LogicalResult &result) { if (op.hasNumTeamsMultiDim()) result = todo("num_teams with multi-dimensional values"); @@ -453,7 +481,7 @@ static LogicalResult checkImplementationStatus(Operation &op) { }) .Case([&](omp::TaskloopContextOp op) { checkAllocate(op, result); - checkInReductionByref(op, result); + checkInReduction(op, result); checkReduction(op, result); checkReductionByref(op, result); }) @@ -490,6 +518,10 @@ static LogicalResult checkImplementationStatus(Operation &op) { .Case([&](omp::TargetOp op) { checkAllocate(op, result); checkBare(op, result); + // The byref / cleanup-region / two-argument-initializer in_reduction + // shapes on omp.target are not implemented yet (handled by + // checkInReduction). The device-side / offload-entry cases are + // diagnosed inline in convertOmpTarget. checkInReduction(op, result); checkThreadLimit(op, result); }) @@ -8232,6 +8264,49 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, bool isOffloadEntry = isTargetDevice || !ompBuilder->Config.TargetTriples.empty(); + // Validate and resolve in_reduction clauses on omp.target. 
We currently + // only support the non-offload host-fallback path: the per-task private + // pointer is obtained by calling __kmpc_task_reduction_get_th_data inside + // the to-be-outlined target task body. Threading that pointer through the + // device kernel argument list is left as follow-up work. + SmallVector<llvm::Value *> inRedOrigPtrs; + SmallVector<unsigned> inRedMapArgIdx; + if (!targetOp.getInReductionVars().empty()) { + if (isTargetDevice || isOffloadEntry) + return opInst.emitError( + "not yet implemented: in_reduction clause on omp.target with " + "offload / target-device compilation"); + // The byref / cleanup-region / two-argument-initializer in_reduction + // shapes are rejected earlier by checkImplementationStatus, and symbol + // resolution is guaranteed by verifyReductionVarList. + // + // The target body has no dedicated in_reduction block argument: each + // in_reduction variable is accessed through its map_entries block argument, + // which the host redirects to the per-task reduction-private storage below. + // So each in_reduction variable must also be captured by the target via a + // map_entries entry referring to the same outer SSA value. That value is + // also used as the `orig` argument of the runtime lookup, so without a + // matching map entry the outlined kernel would reference a value defined in + // the host function and fail IR verification. Record, for each in_reduction + // variable, the position of its matching map entry so the corresponding map + // block argument can be redirected once we are inside the body. 
+ llvm::SmallDenseMap<Value, unsigned> mapVarPtrToArgIdx; + for (auto [idx, mapV] : llvm::enumerate(targetOp.getMapVars())) + if (auto mapInfo = mapV.getDefiningOp<omp::MapInfoOp>()) + mapVarPtrToArgIdx.try_emplace(mapInfo.getVarPtr(), idx); + inRedOrigPtrs.reserve(targetOp.getInReductionVars().size()); + inRedMapArgIdx.reserve(targetOp.getInReductionVars().size()); + for (Value v : targetOp.getInReductionVars()) { + auto it = mapVarPtrToArgIdx.find(v); + if (it == mapVarPtrToArgIdx.end()) + return targetOp.emitError() + << "not yet implemented: in_reduction variable on omp.target " + "must also be captured by a matching map_entries entry"; + inRedMapArgIdx.push_back(it->second); + inRedOrigPtrs.push_back(moduleTranslation.lookupValue(v)); + } + } + // For some private variables, the MapsForPrivatizedVariablesPass // creates MapInfoOp instances. Go through the private variables and // the mapped variables so that during codegeneration we are able @@ -8307,8 +8382,15 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, attr.isStringAttribute()) llvmOutlinedFn->addFnAttr(attr); - for (auto [arg, mapOp] : llvm::zip_equal(mapBlockArgs, mapVars)) { - auto mapInfoOp = cast<omp::MapInfoOp>(mapOp.getDefiningOp()); + for (auto [idx, arg] : llvm::enumerate(mapBlockArgs)) { + // in_reduction list items on omp.target are accessed through their + // map_entries block argument, which is redirected below to the per-task + // reduction-private storage returned by the runtime. Skip the default + // host-value mapping for those block arguments so the write-once + // mapValue mapping is free to be set to the private pointer. 
+ if (llvm::is_contained(inRedMapArgIdx, idx)) + continue; + auto mapInfoOp = cast<omp::MapInfoOp>(mapVars[idx].getDefiningOp()); llvm::Value *mapOpValue = moduleTranslation.lookupValue(mapInfoOp.getVarPtr()); moduleTranslation.mapValue(arg, mapOpValue); @@ -8344,6 +8426,38 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, targetOp.getPrivateNeedsBarrier(), &mappedPrivateVars))) return llvm::make_error<PreviouslyReportedError>(); + // The target body accesses each in_reduction variable through its + // map_entries block argument. Redirect that block argument to the per-task + // private storage returned by __kmpc_task_reduction_get_th_data so the body + // accumulates into the reduction-private copy rather than the mapped + // original. The lookup must run inside the target task body so the gtid + // corresponds to the executing thread. The descriptor argument is NULL: the + // runtime walks enclosing taskgroups to locate the matching task_reduction + // registration for `origPtr`. Mirrors the in_reduction handling on + // omp.taskloop.context. 
+ if (!inRedOrigPtrs.empty()) { + llvm::OpenMPIRBuilder &ompB = *ompBuilder; + llvm::Module *m = moduleTranslation.getLLVMModule(); + llvm::LLVMContext &llvmCtx = m->getContext(); + uint32_t srcLocSize; + llvm::Constant *srcLocStr = ompB.getOrCreateDefaultSrcLocStr(srcLocSize); + llvm::Value *bodyIdent = ompB.getOrCreateIdent(srcLocStr, srcLocSize); + llvm::Function *gtidFn = ompB.getOrCreateRuntimeFunctionPtr( + llvm::omp::OMPRTL___kmpc_global_thread_num); + llvm::Value *bodyGtid = + builder.CreateCall(gtidFn, {bodyIdent}, "omp_global_thread_num"); + llvm::FunctionCallee getThData = ompB.getOrCreateRuntimeFunction( + *m, llvm::omp::OMPRTL___kmpc_task_reduction_get_th_data); + llvm::Type *ptrTy = llvm::PointerType::getUnqual(llvmCtx); + llvm::Value *nullDesc = llvm::ConstantPointerNull::get(ptrTy); + for (auto [mapArgIdx, origPtr] : + llvm::zip_equal(inRedMapArgIdx, inRedOrigPtrs)) { + llvm::Value *priv = builder.CreateCall( + getThData, {bodyGtid, nullDesc, origPtr}, "omp.inred.priv"); + moduleTranslation.mapValue(mapBlockArgs[mapArgIdx], priv); + } + } + LLVM::ModuleTranslation::SaveStack<OpenMPAllocStackFrame> frame( moduleTranslation, allocaIP, deallocBlocks); llvm::Expected<llvm::BasicBlock *> exitBlock = convertOmpOpRegions( diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 06ad3d60ea635..daefcdccd5c2e 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -3129,6 +3129,66 @@ func.func @omp_target_depend(%data_var: memref<i32>) { // ----- +func.func @omp_target_in_reduction_unresolved(%ptr: !llvm.ptr) { + // expected-error @below {{op expected symbol reference @add_f32 to point to a reduction declaration}} + omp.target in_reduction(@add_f32 %ptr : !llvm.ptr) { + omp.terminator + } + return +} + +// ----- + +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + 
%1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +func.func @omp_target_in_reduction_duplicate(%ptr: !llvm.ptr) { + // expected-error @below {{op accumulator variable used more than once}} + omp.target in_reduction(@add_f32 %ptr, @add_f32 %ptr : !llvm.ptr, !llvm.ptr) { + omp.terminator + } + return +} + +// ----- + +omp.declare_reduction @add_i32 : i32 +init { +^bb0(%arg: i32): + %0 = arith.constant 0 : i32 + omp.yield (%0 : i32) +} +combiner { +^bb1(%arg0: i32, %arg1: i32): + %1 = arith.addi %arg0, %arg1 : i32 + omp.yield (%1 : i32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr -> i32 + llvm.atomicrmw add %arg2, %2 monotonic : !llvm.ptr, i32 + omp.yield +} + +func.func @omp_target_in_reduction_type_mismatch(%mem: memref<1xf32>) { + // expected-error @below {{op expected accumulator ('memref<1xf32>') to be the same type as reduction declaration ('!llvm.ptr')}} + omp.target in_reduction(@add_i32 %mem : memref<1xf32>) { + omp.terminator + } + return +} + +// ----- + func.func @omp_distribute_schedule(%chunk_size : i32, %lb : i32, %ub : i32, %step : i32) -> () { // expected-error @below {{op chunk size set without dist_schedule_static being present}} "omp.distribute"(%chunk_size) <{operandSegmentSizes = array<i32: 0, 0, 1, 0>}> ({ diff --git a/mlir/test/Target/LLVMIR/openmp-target-in-reduction-multi.mlir b/mlir/test/Target/LLVMIR/openmp-target-in-reduction-multi.mlir new file mode 100644 index 0000000000000..8083f3c299ce0 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-in-reduction-multi.mlir @@ -0,0 +1,75 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Multiple in_reduction items on omp.target. Each item is captured into the +// target region through its own map_entries entry and accessed inside the +// body via the corresponding map_entries block argument. 
For the host +// fallback path every item performs an independent +// __kmpc_task_reduction_get_th_data lookup using its own captured original +// pointer, and the returned per-task private pointer is bound to that item's +// map block argument. This test pins down the pairing so it cannot pass if the +// two items were swapped or collapsed onto a single pointer. + +omp.declare_reduction @add_i32 : i32 +init { +^bb0(%arg0: i32): + %c0 = llvm.mlir.constant(0 : i32) : i32 + omp.yield(%c0 : i32) +} +combiner { +^bb0(%arg0: i32, %arg1: i32): + %s = llvm.add %arg0, %arg1 : i32 + omp.yield(%s : i32) +} + +llvm.func @target_inreduction_multi(%x : !llvm.ptr, %y : !llvm.ptr) { + %mx = omp.map.info var_ptr(%x : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr + %my = omp.map.info var_ptr(%y : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr + omp.target in_reduction(@add_i32 %x, @add_i32 %y : !llvm.ptr, !llvm.ptr) + map_entries(%mx -> %mxarg, %my -> %myarg : !llvm.ptr, !llvm.ptr) { + // First item (x): load, += 1, store back. + %vx = llvm.load %mxarg : !llvm.ptr -> i32 + %c1 = llvm.mlir.constant(1 : i32) : i32 + %sx = llvm.add %vx, %c1 : i32 + llvm.store %sx, %mxarg : i32, !llvm.ptr + // Second item (y): load, += 2, store back. + %vy = llvm.load %myarg : !llvm.ptr -> i32 + %c2 = llvm.mlir.constant(2 : i32) : i32 + %sy = llvm.add %vy, %c2 : i32 + llvm.store %sy, %myarg : i32, !llvm.ptr + omp.terminator + } + llvm.return +} + +// The host stub forwards both captured pointers into the outlined target +// kernel (the trailing argument is the unused descriptor slot). +// CHECK-LABEL: define void @target_inreduction_multi( +// CHECK: call void @__omp_offloading_{{.*}}_target_inreduction_multi_{{.*}}(ptr %{{.+}}, ptr %{{.+}}, ptr null) + +// The two captured original pointers arrive as distinct kernel arguments. 
+// CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_target_inreduction_multi_ +// CHECK-SAME: (ptr %[[CAPTX:.+]], ptr %[[CAPTY:.+]], ptr %{{.+}}) + +// A single gtid is shared by both lookups; each item then performs its own +// __kmpc_task_reduction_get_th_data call against its own captured pointer. +// CHECK: %[[GTID:.+]] = call i32 @__kmpc_global_thread_num( +// CHECK: %[[PRIVX:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %[[GTID]], ptr null, ptr %[[CAPTX]]) +// CHECK: %[[PRIVY:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %[[GTID]], ptr null, ptr %[[CAPTY]]) + +// The first item's private storage is the base of the +1 load/store; the +// CHECK-NOT below ensures the second item's pointer is not touched until the +// first item's accumulation has completed (i.e. the items are not swapped or +// merged onto a single private pointer). +// CHECK: %[[LX:.+]] = load i32, ptr %[[PRIVX]] +// CHECK: %[[SX:.+]] = add i32 %[[LX]], 1 +// CHECK-NOT: %[[PRIVY]] +// CHECK: store i32 %[[SX]], ptr %[[PRIVX]] + +// The second item's private storage is the base of the +2 load/store. +// CHECK: %[[LY:.+]] = load i32, ptr %[[PRIVY]] +// CHECK: %[[SY:.+]] = add i32 %[[LY]], 2 +// CHECK: store i32 %[[SY]], ptr %[[PRIVY]] + +// Exactly two reduction lookups are emitted; no third call sneaks in. The +// `call` form is used so this does not match the runtime declaration. +// CHECK-NOT: call ptr @__kmpc_task_reduction_get_th_data diff --git a/mlir/test/Target/LLVMIR/openmp-target-in-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-target-in-reduction.mlir new file mode 100644 index 0000000000000..e4094d24bf592 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-in-reduction.mlir @@ -0,0 +1,52 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// in_reduction on omp.target: the in_reduction variable is also captured +// into the target region as a map entry (the Flang front-end emits this +// implicit map). 
The in_reduction clause does not define an entry block +// argument; inside the target body the variable is accessed through its +// map_entries block argument. The captured pointer is passed to +// __kmpc_task_reduction_get_th_data with a NULL descriptor; the runtime +// walks enclosing taskgroups to locate the matching task_reduction +// registration. The returned per-task private pointer is bound to the +// map_entries block argument so subsequent loads/stores inside the region +// use the private copy. + +omp.declare_reduction @add_i32 : i32 +init { +^bb0(%arg0: i32): + %c0 = llvm.mlir.constant(0 : i32) : i32 + omp.yield(%c0 : i32) +} +combiner { +^bb0(%arg0: i32, %arg1: i32): + %s = llvm.add %arg0, %arg1 : i32 + omp.yield(%s : i32) +} + +llvm.func @target_inreduction(%x : !llvm.ptr) { + %m = omp.map.info var_ptr(%x : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr + omp.target in_reduction(@add_i32 %x : !llvm.ptr) map_entries(%m -> %marg : !llvm.ptr) { + %v = llvm.load %marg : !llvm.ptr -> i32 + %c1 = llvm.mlir.constant(1 : i32) : i32 + %s = llvm.add %v, %c1 : i32 + llvm.store %s, %marg : i32, !llvm.ptr + omp.terminator + } + llvm.return +} + +// The host stub forwards the captured pointer into the outlined target +// kernel. +// CHECK-LABEL: define void @target_inreduction( +// CHECK: call void @__omp_offloading_{{.*}}_target_inreduction_{{.*}}(ptr %{{.+}}, ptr null) + +// In the outlined target body the in_reduction private pointer is +// obtained from the runtime using the captured original pointer; that +// pointer is then the base of the load and store inside the region. 
+// CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_target_inreduction_ +// CHECK-SAME: (ptr %[[CAPT:.+]], ptr %{{.+}}) +// CHECK: %[[GTID:.+]] = call i32 @__kmpc_global_thread_num( +// CHECK: %[[PRIV:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %[[GTID]], ptr null, ptr %[[CAPT]]) +// CHECK: %[[LOADED:.+]] = load i32, ptr %[[PRIV]] +// CHECK: %[[SUM:.+]] = add i32 %[[LOADED]], 1 +// CHECK: store i32 %[[SUM]], ptr %[[PRIV]] diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index 5c22f7f081bb5..cf96bcfee2b74 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -190,10 +190,90 @@ atomic { llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 omp.yield } -llvm.func @target_in_reduction(%x : !llvm.ptr) { - // expected-error@below {{not yet implemented: Unhandled clause in_reduction in omp.target operation}} +llvm.func @target_in_reduction_byref(%x : !llvm.ptr) { + // expected-error@below {{not yet implemented: Unhandled clause in_reduction with byref modifier in omp.target operation}} // expected-error@below {{LLVM Translation failed for operation: omp.target}} - omp.target in_reduction(@add_f32 %x -> %prv : !llvm.ptr) { + omp.target in_reduction(byref @add_f32 %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.declare_reduction @add_cleanup_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +cleanup { +^bb2(%arg2: f32): + omp.yield +} +llvm.func @target_in_reduction_cleanup(%x : !llvm.ptr) { + // expected-error@below {{not yet implemented: Unhandled clause in_reduction with cleanup region in omp.target operation}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target in_reduction(@add_cleanup_f32 %x : !llvm.ptr) { + omp.terminator 
+ } + llvm.return +} + +// ----- + +omp.declare_reduction @add_two_arg_init_i32 : !llvm.ptr alloc { +^bb0(%arg: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} init { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.store %0, %arg1 : i32, !llvm.ptr + omp.yield(%arg1 : !llvm.ptr) +} combiner { +^bb1(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> i32 + %1 = llvm.load %arg1 : !llvm.ptr -> i32 + %2 = llvm.add %0, %1 : i32 + llvm.store %2, %arg0 : i32, !llvm.ptr + omp.yield(%arg0 : !llvm.ptr) +} +llvm.func @target_in_reduction_two_arg_init(%x : !llvm.ptr) { + // expected-error@below {{not yet implemented: Unhandled clause in_reduction with two-argument initializer in omp.target operation}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target in_reduction(@add_two_arg_init_i32 %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.declare_reduction @add_no_map_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +llvm.func @target_in_reduction_no_map(%x : !llvm.ptr) { + // The in_reduction variable %x has no matching map_entries entry. The + // outlined target kernel would otherwise reference %x across function + // boundaries; the translation must reject this up front. 
+ // expected-error@below {{not yet implemented: in_reduction variable on omp.target must also be captured by a matching map_entries entry}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target in_reduction(@add_no_map_f32 %x : !llvm.ptr) { omp.terminator } llvm.return _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
