Author: Johannes Doerfert Date: 2021-07-10T18:44:25-05:00 New Revision: 514c033db1e0c237eccd56b9fc11fe05a6baff39
URL: https://github.com/llvm/llvm-project/commit/514c033db1e0c237eccd56b9fc11fe05a6baff39 DIFF: https://github.com/llvm/llvm-project/commit/514c033db1e0c237eccd56b9fc11fe05a6baff39.diff LOG: [OpenMP] Detect SPMD compatible kernels and execute them as such In the spirit of TRegions [0], this patch analyzes a kernel and tracks if it can be executed in SPMD-mode. If so, we flip the arguments of the __kmpc_target_init and deinit call to enable the mode. We also update the `<kernel>_exec_mode` flag to indicate to the runtime we changed the mode to SPMD. The code analysis is done interprocedurally by extending the AAKernelInfo abstract attribute to track SPMD compatibility as well. [0] https://link.springer.com/chapter/10.1007/978-3-030-28596-8_11 Differential Revision: https://reviews.llvm.org/D102307 Added: llvm/test/Transforms/OpenMP/spmdization.ll llvm/test/Transforms/OpenMP/spmdization_remarks.ll Modified: clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c clang/test/OpenMP/remarks_parallel_in_target_state_machine.c llvm/include/llvm/Frontend/OpenMP/OMPConstants.h llvm/lib/IR/Assumptions.cpp llvm/lib/Transforms/IPO/OpenMPOpt.cpp llvm/test/Transforms/OpenMP/custom_state_machines.ll llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll Removed: ################################################################################ diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c index d50311f2d8692..20142d944f362 100644 --- a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c +++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c @@ -4,6 +4,8 @@ // host-no-diagnostics +void baz(void) __attribute__((assume("omp_no_openmp"))); + void bar1(void) { #pragma omp parallel // #0 // all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} @@ -27,6 +29,7 @@ void foo1(void) { // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}} // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: __omp_offloading}} { + baz(); // all-remark {{Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function '_Z3bazv'.}} #pragma omp parallel // #3 // all-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // all-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}} @@ -47,6 +50,7 @@ void foo2(void) { // all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__5_wrapper, kernel ID: __omp_offloading}} // all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__4_wrapper, kernel ID: __omp_offloading}} { + baz(); // all-remark {{Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function '_Z3bazv'.}} #pragma omp parallel // #6 // all-remark@#6 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // all-remark@#6 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__4_wrapper, kernel ID: __omp_offloading}} @@ -70,6 +74,7 @@ void foo3(void) { // all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading}} // all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__8_wrapper, kernel ID: __omp_offloading}} { + baz(); // all-remark {{Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function '_Z3bazv'.}} #pragma omp parallel // #9 // all-remark@#9 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} // all-remark@#9 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading}} diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c index cc656b78d8e83..b461000dade4d 100644 --- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c +++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c @@ -4,6 +4,8 @@ // host-no-diagnostics +void baz(void) __attribute__((assume("omp_no_openmp"))); + void bar(void) { #pragma omp parallel // #1 \ // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \ @@ -18,6 +20,7 @@ void foo(void) { // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}} \ // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: __omp_offloading}} { + baz(); // expected-remark {{Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function '_Z3bazv'.}} #pragma omp parallel // #3 \ // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \ // expected-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}} diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index a05aa231eb516..d174cc8992ddb 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -111,7 +111,10 @@ inline std::string getAllAssumeClauseOptions() { /// Todo: Update kmp.h to include this file, and remove the enums in kmp.h /// To complete this, more enum values will need to be moved here. enum class OMPScheduleType { + StaticChunked = 33, Static = 34, // static unspecialized + DistributeChunked = 91, + Distribute = 92, DynamicChunked = 35, GuidedChunked = 36, // guided unspecialized Runtime = 37, diff --git a/llvm/lib/IR/Assumptions.cpp b/llvm/lib/IR/Assumptions.cpp index 1bd8b7f51e676..6498114cd60d5 100644 --- a/llvm/lib/IR/Assumptions.cpp +++ b/llvm/lib/IR/Assumptions.cpp @@ -33,4 +33,5 @@ StringSet<> llvm::KnownAssumptionStrings({ "omp_no_openmp", // OpenMP 5.1 "omp_no_openmp_routines", // OpenMP 5.1 "omp_no_parallelism", // OpenMP 5.1 + "ompx_spmd_amenable", // OpenMPOpt extension }); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index ceb2d8295c23a..b30c33c43f652 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -27,6 +27,7 @@ #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/Assumptions.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/InitializePasses.h" @@ -73,6 +74,9 @@ STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified, "Number of OpenMP runtime function uses identified"); STATISTIC(NumOpenMPTargetRegionKernels, "Number of OpenMP target region entry points (=kernels) identified"); +STATISTIC(NumOpenMPTargetRegionKernelsSPMD, + "Number of OpenMP target region entry points (=kernels) executed in " + "SPMD-mode instead of generic-mode"); STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine, "Number of OpenMP target region entry points (=kernels) executed in " "generic-mode without a state machines"); @@ -481,6 +485,10 @@ struct KernelInfoState : AbstractState { /// State to track what parallel region we might reach. BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions; + /// State to track if we are in SPMD-mode, assumed or know, and why we decided + /// we cannot be. + BooleanStateWithPtrSetVector<Instruction> SPMDCompatibilityTracker; + /// The __kmpc_target_init call in this kernel, if any. If we find more than /// one we abort as the kernel is malformed. CallBase *KernelInitCB = nullptr; @@ -507,6 +515,7 @@ struct KernelInfoState : AbstractState { /// See AbstractState::indicatePessimisticFixpoint(...) ChangeStatus indicatePessimisticFixpoint() override { IsAtFixpoint = true; + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); ReachedUnknownParallelRegions.indicatePessimisticFixpoint(); return ChangeStatus::CHANGED; } @@ -522,6 +531,8 @@ struct KernelInfoState : AbstractState { const KernelInfoState &getAssumed() const { return *this; } bool operator==(const KernelInfoState &RHS) const { + if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker) + return false; if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions) return false; if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions) @@ -552,6 +563,7 @@ struct KernelInfoState : AbstractState { indicatePessimisticFixpoint(); KernelDeinitCB = KIS.KernelDeinitCB; } + SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker; ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions; ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions; return *this; @@ -2669,8 +2681,10 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { const std::string getAsStr() const override { if (!isValidState()) return "<invalid>"; - return - + return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD" + : "generic") + + std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]" + : "") + std::string(" #PRs: ") + std::to_string(ReachedKnownParallelRegions.size()) + ", #Unknown PRs: " + @@ -2745,8 +2759,9 @@ struct AAKernelInfoFunction : AAKernelInfo { assert((KernelInitCB && KernelDeinitCB) && "Kernel without __kmpc_target_init or __kmpc_target_deinit!"); - // For kernels we need to register a simplification callback so that the Attributor - // knows the constant arguments to ___kmpc_target_init and + // For kernels we might need to initialize/finalize the IsSPMD state and + // we need to register a simplification callback so that the Attributor + // knows the constant arguments to __kmpc_target_init and // __kmpc_target_deinit might actually change. Attributor::SimplifictionCallbackTy StateMachineSimplifyCB = @@ -2767,10 +2782,45 @@ struct AAKernelInfoFunction : AAKernelInfo { return FalseVal; }; + Attributor::SimplifictionCallbackTy IsSPMDModeSimplifyCB = + [&](const IRPosition &IRP, const AbstractAttribute *AA, + bool &UsedAssumedInformation) -> Optional<Value *> { + // IRP represents the "SPMDCompatibilityTracker" argument of an + // __kmpc_target_init or + // __kmpc_target_deinit call. We will answer this one with the internal + // state. + if (!isValidState()) + return nullptr; + if (!SPMDCompatibilityTracker.isAtFixpoint()) { + if (AA) + A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); + UsedAssumedInformation = true; + } else { + UsedAssumedInformation = false; + } + auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(), + SPMDCompatibilityTracker.isAssumed()); + return Val; + }; + + constexpr const int InitIsSPMDArgNo = 1; + constexpr const int DeinitIsSPMDArgNo = 1; constexpr const int InitUseStateMachineArgNo = 2; A.registerSimplificationCallback( IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo), StateMachineSimplifyCB); + A.registerSimplificationCallback( + IRPosition::callsite_argument(*KernelInitCB, InitIsSPMDArgNo), + IsSPMDModeSimplifyCB); + A.registerSimplificationCallback( + IRPosition::callsite_argument(*KernelDeinitCB, DeinitIsSPMDArgNo), + IsSPMDModeSimplifyCB); + + // Check if we know we are in SPMD-mode already. + ConstantInt *IsSPMDArg = + dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); + if (IsSPMDArg && !IsSPMDArg->isZero()) + SPMDCompatibilityTracker.indicateOptimisticFixpoint(); } /// Modify the IR based on the KernelInfoState as the fixpoint iteration is @@ -2781,11 +2831,81 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!KernelInitCB || !KernelDeinitCB) return ChangeStatus::UNCHANGED; - buildCustomStateMachine(A); + // Known SPMD-mode kernels need no manifest changes. + if (SPMDCompatibilityTracker.isKnown()) + return ChangeStatus::UNCHANGED; + + // If we can we change the execution mode to SPMD-mode otherwise we build a + // custom state machine. + if (!changeToSPMDMode(A)) + buildCustomStateMachine(A); return ChangeStatus::CHANGED; } + bool changeToSPMDMode(Attributor &A) { + if (!SPMDCompatibilityTracker.isAssumed()) { + for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) { + if (!NonCompatibleI) + continue; + auto Remark = [&](OptimizationRemarkAnalysis ORA) { + ORA << "Kernel will be executed in generic-mode due to this " + "potential side-effect"; + if (auto *CI = dyn_cast<CallBase>(NonCompatibleI)) { + if (Function *F = CI->getCalledFunction()) + ORA << ", consider to add " + "`__attribute__((assume(\"ompx_spmd_amenable\")))`" + " to the called function '" + << F->getName() << "'"; + } + return ORA << "."; + }; + A.emitRemark<OptimizationRemarkAnalysis>( + NonCompatibleI, "OpenMPKernelNonSPMDMode", Remark); + + LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: " + << *NonCompatibleI << "\n"); + } + + return false; + } + + // Adjust the global exec mode flag that tells the runtime what mode this + // kernel is executed in. + Function *Kernel = getAnchorScope(); + GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( + (Kernel->getName() + "_exec_mode").str()); + assert(ExecMode && "Kernel without exec mode?"); + assert(ExecMode->getInitializer() && + ExecMode->getInitializer()->isOneValue() && + "Initially non-SPMD kernel has SPMD exec mode!"); + ExecMode->setInitializer( + ConstantInt::get(ExecMode->getInitializer()->getType(), 0)); + + // Next rewrite the init and deinit calls to indicate we use SPMD-mode now. + const int InitIsSPMDArgNo = 1; + const int DeinitIsSPMDArgNo = 1; + const int InitUseStateMachineArgNo = 2; + + auto &Ctx = getAnchorValue().getContext(); + A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo), + *ConstantInt::getBool(Ctx, 1)); + A.changeUseAfterManifest( + KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), + *ConstantInt::getBool(Ctx, 0)); + A.changeUseAfterManifest( + KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo), + *ConstantInt::getBool(Ctx, 1)); + ++NumOpenMPTargetRegionKernelsSPMD; + + auto Remark = [&](OptimizationRemark OR) { + return OR << "Generic-mode kernel is changed to SPMD-mode."; + }; + A.emitRemark<OptimizationRemark>(KernelInitCB, "OpenMPKernelSPMDMode", + Remark); + return true; + }; + ChangeStatus buildCustomStateMachine(Attributor &A) { assert(ReachedKnownParallelRegions.isValidState() && "Custom state machine with invalid parallel region states?"); @@ -2809,7 +2929,7 @@ struct AAKernelInfoFunction : AAKernelInfo { !IsSPMD->isZero()) return ChangeStatus::UNCHANGED; - // First, indicate we use a custom state machine now. + // If not SPMD mode, indicate we use a custom state machine now. auto &Ctx = getAnchorValue().getContext(); auto *FalseVal = ConstantInt::getBool(Ctx, 0); A.changeUseAfterManifest( @@ -3064,6 +3184,28 @@ struct AAKernelInfoFunction : AAKernelInfo { ChangeStatus updateImpl(Attributor &A) override { KernelInfoState StateBefore = getState(); + // Callback to check a read/write instruction. + auto CheckRWInst = [&](Instruction &I) { + // We handle calls later. + if (isa<CallBase>(I)) + return true; + // We only care about write effects. + if (!I.mayWriteToMemory()) + return true; + if (auto *SI = dyn_cast<StoreInst>(&I)) { + SmallVector<const Value *> Objects; + getUnderlyingObjects(SI->getPointerOperand(), Objects); + if (llvm::all_of(Objects, + [](const Value *Obj) { return isa<AllocaInst>(Obj); })) + return true; + } + // For now we give up on everything but stores. + SPMDCompatibilityTracker.insert(&I); + return true; + }; + if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this)) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + // Callback to check a call instruction. auto CheckCallInst = [&](Instruction &I) { auto &CB = cast<CallBase>(I); @@ -3101,6 +3243,10 @@ struct AAKernelInfoCallSite : AAKernelInfo { return Fn && hasAssumption(*Fn, AssumptionStr); }; + // Check for SPMD-mode assumptions. + if (HasAssumption(Callee, "ompx_spmd_amenable")) + SPMDCompatibilityTracker.indicateOptimisticFixpoint(); + // First weed out calls we do not care about, that is readonly/readnone // calls, intrinsics, and "no_openmp" calls. Neither of these can reach a // parallel region or anything else we are looking for. @@ -3125,6 +3271,11 @@ struct AAKernelInfoCallSite : AAKernelInfo { HasAssumption(Callee, "omp_no_parallelism"))) ReachedUnknownParallelRegions.insert(&CB); + // If SPMDCompatibilityTracker is not fixed, we need to give up on the + // idea we can run something unknown in SPMD-mode. + if (!SPMDCompatibilityTracker.isAtFixpoint()) + SPMDCompatibilityTracker.insert(&CB); + // We have updated the state for this unknown call properly, there won't // be any change so we indicate a fixpoint. indicateOptimisticFixpoint(); @@ -3137,6 +3288,37 @@ struct AAKernelInfoCallSite : AAKernelInfo { const unsigned int WrapperFunctionArgNo = 6; RuntimeFunction RF = It->getSecond(); switch (RF) { + // All the functions we know are compatible with SPMD mode. + case OMPRTL___kmpc_is_spmd_exec_mode: + case OMPRTL___kmpc_for_static_fini: + case OMPRTL___kmpc_global_thread_num: + case OMPRTL___kmpc_single: + case OMPRTL___kmpc_end_single: + case OMPRTL___kmpc_master: + case OMPRTL___kmpc_end_master: + case OMPRTL___kmpc_barrier: + break; + case OMPRTL___kmpc_for_static_init_4: + case OMPRTL___kmpc_for_static_init_4u: + case OMPRTL___kmpc_for_static_init_8: + case OMPRTL___kmpc_for_static_init_8u: { + // Check the schedule and allow static schedule in SPMD mode. + unsigned ScheduleArgOpNo = 2; + auto *ScheduleTypeCI = + dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo)); + unsigned ScheduleTypeVal = + ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0; + switch (OMPScheduleType(ScheduleTypeVal)) { + case OMPScheduleType::Static: + case OMPScheduleType::StaticChunked: + case OMPScheduleType::Distribute: + case OMPScheduleType::DistributeChunked: + break; + default: + SPMDCompatibilityTracker.insert(&CB); + break; + }; + } break; case OMPRTL___kmpc_target_init: KernelInitCB = &CB; break; @@ -3156,9 +3338,13 @@ struct AAKernelInfoCallSite : AAKernelInfo { break; case OMPRTL___kmpc_omp_task: // We do not look into tasks right now, just give up. + SPMDCompatibilityTracker.insert(&CB); ReachedUnknownParallelRegions.insert(&CB); break; default: + // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, + // generally. + SPMDCompatibilityTracker.insert(&CB); break; } // All other OpenMP runtime calls will not reach parallel regions so they diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll index b352255a1b62f..f2e5e59ad5dcb 100644 --- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -1526,51 +1526,17 @@ attributes #10 = { convergent nounwind readonly willreturn } ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_pure_l72 ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true) -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__13_wrapper -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check3: -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef true, i1 noundef false, i1 noundef true) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]] ; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; CHECK-NEXT: call void @__omp_outlined__12(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll index d7383f853519c..167fb0da0af39 100644 --- a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll @@ -22,13 +22,15 @@ target triple = "nvptx64" ;; unknown(); ;; } ;; } -;; +;; +;; void no_openmp(void) __attribute__((assume("omp_no_openmp"))); ;; void test_no_fallback(void) { ;; #pragma omp target teams ;; { ;; known(); ;; known(); ;; known(); +;; no_openmp(); // make it non-spmd ;; } ;; } @@ -50,6 +52,7 @@ target triple = "nvptx64" @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1 @12 = private unnamed_addr constant [73 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;known;4;1;;\00", align 1 @13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([73 x i8], [73 x i8]* @12, i32 0, i32 0) }, align 8 +@G = external global i32 @llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode, i8* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode], section "llvm.metadata" ; Function Attrs: convergent norecurse nounwind @@ -124,6 +127,8 @@ user_code.entry: ; preds = %entry %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3 call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %6, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !43 call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45 + call void @no_openmp() + call void @no_parallelism() call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i1 false, i1 true) #3, !dbg !46 br label %common.ret } @@ -154,6 +159,9 @@ declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 ; Function Attrs: argmemonly nofree nosync nounwind willreturn declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 +declare void @no_openmp() #7 +declare void @no_parallelism() #8 + attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } @@ -161,6 +169,8 @@ attributes #3 = { nounwind } attributes #4 = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } attributes #5 = { argmemonly nofree nosync nounwind willreturn } attributes #6 = { convergent nounwind } +attributes #7 = { "llvm.assume"="omp_no_openmp" } +attributes #8 = { "llvm.assume"="omp_no_parallelism" } !llvm.dbg.cu = !{!0} !omp_offload.info = !{!3, !4} diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll new file mode 100644 index 0000000000000..6ecda643acdbc --- /dev/null +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -0,0 +1,214 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s + +;; void unknown(void); +;; void spmd_amenable(void) __attribute__((assume("ompx_spmd_amenable"))) +;; +;; void sequential_loop() { +;; #pragma omp target teams +;; { +;; for (int i = 0; i < 100; ++i) { +;; #pragma omp parallel +;; { +;; unknown(); +;; } +;; } +; spmd_amenable(); +;; } +;; } + +target triple = "nvptx64" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata" + +; The second argument of __kmpc_target_init and deinit is is set to true to indicate that we can run in SPMD mode. +; We also adjusted the global __omp_offloading_2c_38c77_sequential_loop_l4_exec_mode to have a zero initializer (which indicates SPMD mode to the runtime). +;. +; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; CHECK: @[[__OMP_OFFLOADING_2C_38C77_SEQUENTIAL_LOOP_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0 +; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata" +;. +define weak void @__omp_offloading_2c_38c77_sequential_loop_l4() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_38c77_sequential_loop_l4 +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: call void @spmd_amenable() +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = load i32, i32* %.global_tid., align 4 + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %1, i64 0) + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond, !llvm.loop !6 + +for.end: ; preds = %for.cond + call void @spmd_amenable() + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @unknown() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: ret void +; +entry: + call void @unknown() #3 + ret void +} + +; Function Attrs: convergent +declare void @unknown() #1 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +; CHECK-NEXT: ret void +; +entry: + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) + +; Function Attrs: nounwind +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #2 + +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) + +declare void @spmd_amenable() #4 + +attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #2 = { nounwind } +attributes #3 = { convergent } +attributes #4 = { "llvm.assume"="ompx_spmd_amenable" } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3, !4, !8, !9} +!llvm.ident = !{!5} + +!0 = !{i32 0, i32 44, i32 232567, !"sequential_loop", i32 4, i32 0} +!1 = !{void ()* @__omp_offloading_2c_38c77_sequential_loop_l4, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 2} +!5 = !{!"clang version 13.0.0"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.mustprogress"} +!8 = !{i32 7, !"openmp", i32 50} +!9 = !{i32 7, !"openmp-device", i32 50} +;. +; CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +; CHECK: attributes #[[ATTR2]] = { nounwind } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" } +; CHECK: attributes #[[ATTR4]] = { convergent } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 44, i32 232567, !"sequential_loop", i32 4, i32 0} +; CHECK: [[META1:![0-9]+]] = !{void ()* @__omp_offloading_2c_38c77_sequential_loop_l4, !"kernel", i32 1} +; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"PIC Level", i32 2} +; CHECK: [[META4:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META7:![0-9]+]] = !{!"clang version 13.0.0"} +; CHECK: [[LOOP8]] = distinct !{!8, !9} +; CHECK: [[META9:![0-9]+]] = !{!"llvm.loop.mustprogress"} +;. diff --git a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll new file mode 100644 index 0000000000000..188b84d5c1187 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll @@ -0,0 +1,233 @@ +; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s +target triple = "nvptx64" + +; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function 'unknown'. +; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function 'unknown'. +; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback [1 known parallel regions, 2 unkown parallel regions] (bad). +; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp")))` (or "omp_no_parallelism"). +; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp")))` (or "omp_no_parallelism"). +; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:20:1: Generic-mode kernel is changed to SPMD-mode. + +;; void unknown(void); +;; void known(void) { +;; #pragma omp parallel +;; { +;; unknown(); +;; } +;; } +;; +;; void test_fallback(void) { +;; #pragma omp target teams +;; { +;; unknown(); +;; known(); +;; unknown(); +;; } +;; } +;; +;; void no_openmp(void) __attribute__((assume("omp_no_openmp"))); +;; void test_no_fallback(void) { +;; #pragma omp target teams +;; { +;; known(); +;; known(); +;; known(); +;; spmd_amenable(); +;; } +;; } + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [103 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([103 x i8], [103 x i8]* @0, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant [72 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;test_fallback;11;1;;\00", align 1 +@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([72 x i8], [72 x i8]* @2, i32 0, i32 0) }, align 8 +@4 = private unnamed_addr constant [104 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;25;;\00", align 1 +@5 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([104 x i8], [104 x i8]* @4, i32 0, i32 0) }, align 8 +@__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode = weak constant i8 1 +@6 = private unnamed_addr constant [106 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;1;;\00", align 1 +@7 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([106 x i8], [106 x i8]* @6, i32 0, i32 0) }, align 8 +@8 = private unnamed_addr constant [75 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;test_no_fallback;20;1;;\00", align 1 +@9 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([75 x i8], [75 x i8]* @8, i32 0, i32 0) }, align 8 +@10 = private unnamed_addr constant [107 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;25;;\00", align 1 +@11 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([107 x i8], [107 x i8]* @10, i32 0, i32 0) }, align 8 +@__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1 +@12 = private unnamed_addr constant [63 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;known;4;1;;\00", align 1 +@13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([63 x i8], [63 x i8]* @12, i32 0, i32 0) }, align 8 +@G = external global i32 +@llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode, i8* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode], section "llvm.metadata" + +; Function Attrs: convergent norecurse nounwind +define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11() local_unnamed_addr #0 !dbg !15 { +entry: + %captured_vars_addrs.i.i = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) #3, !dbg !18 + %exec_user_code = icmp eq i32 %0, -1, !dbg !18 + br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !18 + +common.ret: ; preds = %entry, %user_code.entry + ret void, !dbg !19 + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @3) #3 + call void @unknown() #6, !dbg !20 + %2 = bitcast [0 x i8*]* %captured_vars_addrs.i.i to i8* + call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3 + %4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i.i, i64 0, i64 0, !dbg !23 + call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !23 + call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !26 + call void @unknown() #6, !dbg !27 + call void @__kmpc_target_deinit(%struct.ident_t* nonnull @5, i1 false, i1 true) #3, !dbg !28 + br label %common.ret +} + +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) local_unnamed_addr + +; Function Attrs: convergent +declare void @unknown() local_unnamed_addr #1 + +; Function Attrs: nounwind +define hidden void @known() local_unnamed_addr #2 !dbg !29 { +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @13) + %1 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs, i64 0, i64 0, !dbg !30 + call void @__kmpc_parallel_51(%struct.ident_t* nonnull @13, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** nonnull %1, i64 0) #3, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: nounwind +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #3 + +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) local_unnamed_addr + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20() local_unnamed_addr #4 !dbg !32 { +entry: + %captured_vars_addrs.i2.i = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @7, i1 false, i1 true, i1 true) #3, !dbg !33 + %exec_user_code = icmp eq i32 %0, -1, !dbg !33 + br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !33 + +common.ret: ; preds = %entry, %user_code.entry + ret void, !dbg !34 + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @9) #3 + %2 = bitcast [0 x i8*]* %captured_vars_addrs.i2.i to i8* + call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3 + %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3 + %4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i2.i, i64 0, i64 0, !dbg !35 + call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !35 + call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !39 + call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3 + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3 + call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %5, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !40 + call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !42 + call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3 + %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3 + call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %6, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !43 + call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45 + call void @spmd_amenable() + call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i1 false, i1 true) #3, !dbg !46 + br label %common.ret +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias nocapture nofree readnone %.global_tid., i32* noalias nocapture nofree readnone %.bound_tid.) #0 !dbg !47 { +entry: + call void @unknown() #6, !dbg !48 + ret void, !dbg !49 +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #0 !dbg !50 { +entry: + %global_args = alloca i8**, align 8 + call void @__kmpc_get_shared_variables(i8*** nonnull %global_args) #3, !dbg !51 + call void @unknown() #6, !dbg !52 + ret void, !dbg !51 +} + +declare void @__kmpc_get_shared_variables(i8***) local_unnamed_addr + +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) local_unnamed_addr + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 + +declare void @spmd_amenable() #7 + +attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #3 = { nounwind } +attributes #4 = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #5 = { argmemonly nofree nosync nounwind willreturn } +attributes #6 = { convergent nounwind } +attributes #7 = { "llvm.assume"="ompx_spmd_amenable" } + +!llvm.dbg.cu = !{!0} +!omp_offload.info = !{!3, !4} +!nvvm.annotations = !{!5, !6} +!llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13} +!llvm.ident = !{!14} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "spmdization_remarks.c", directory: "/data/src/llvm-project") +!2 = !{} +!3 = !{i32 0, i32 42, i32 14159165, !"test_no_fallback", i32 20, i32 1} +!4 = !{i32 0, i32 42, i32 14159165, !"test_fallback", i32 11, i32 0} +!5 = !{void ()* @__omp_offloading_2a_d80d3d_test_fallback_l11, !"kernel", i32 1} +!6 = !{void ()* @__omp_offloading_2a_d80d3d_test_no_fallback_l20, !"kernel", i32 1} +!7 = !{i32 7, !"Dwarf Version", i32 2} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{i32 7, !"openmp", i32 50} +!11 = !{i32 7, !"openmp-device", i32 50} +!12 = !{i32 7, !"PIC Level", i32 2} +!13 = !{i32 7, !"frame-pointer", i32 2} +!14 = !{!"clang version 13.0.0"} +!15 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_fallback_l11", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!16 = !DIFile(filename: "llvm/test/Transforms/OpenMP/spmdization_remarks.c", directory: "/data/src/llvm-project") +!17 = !DISubroutineType(types: !2) +!18 = !DILocation(line: 11, column: 1, scope: !15) +!19 = !DILocation(line: 0, scope: !15) +!20 = !DILocation(line: 13, column: 5, scope: !21, inlinedAt: !22) +!21 = distinct !DISubprogram(name: "__omp_outlined__", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!22 = distinct !DILocation(line: 11, column: 1, scope: !15) +!23 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !25) +!24 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!25 = distinct !DILocation(line: 14, column: 5, scope: !21, inlinedAt: !22) +!26 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !25) +!27 = !DILocation(line: 15, column: 5, scope: !21, inlinedAt: !22) +!28 = !DILocation(line: 11, column: 25, scope: !15) +!29 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!30 = !DILocation(line: 4, column: 1, scope: !29) +!31 = !DILocation(line: 8, column: 1, scope: !29) +!32 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_no_fallback_l20", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!33 = !DILocation(line: 20, column: 1, scope: !32) +!34 = !DILocation(line: 0, scope: !32) +!35 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !36) +!36 = distinct !DILocation(line: 22, column: 5, scope: !37, inlinedAt: !38) +!37 = distinct !DISubprogram(name: "__omp_outlined__1", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!38 = distinct !DILocation(line: 20, column: 1, scope: !32) +!39 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !36) +!40 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !41) +!41 = distinct !DILocation(line: 23, column: 5, scope: !37, inlinedAt: !38) +!42 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !41) +!43 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !44) +!44 = distinct !DILocation(line: 24, column: 5, scope: !37, inlinedAt: !38) +!45 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !44) +!46 = !DILocation(line: 20, column: 25, scope: !32) +!47 = distinct !DISubprogram(name: "__omp_outlined__2", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!48 = !DILocation(line: 6, column: 5, scope: !47) +!49 = !DILocation(line: 7, column: 3, scope: !47) +!50 = distinct !DISubprogram(linkageName: "__omp_outlined__2_wrapper", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagArtificial, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!51 = !DILocation(line: 4, column: 1, scope: !50) +!52 = !DILocation(line: 6, column: 5, scope: !47, inlinedAt: !53) +!53 = distinct !DILocation(line: 4, column: 1, scope: !50) _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits