================
@@ -4692,23 +4691,74 @@ OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createReductionsGPU(
Builder.restoreIP(CodeGenIP);
- Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
- RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
-
- Value *Args3[] = {SrcLocInfo,
- KernelTeamsReductionPtr,
- Builder.getInt32(ReductionBufNum),
- ReductionDataSize,
- RL,
- *SarFunc,
- WcFunc,
- *LtGCFunc,
- *LtGRFunc,
- *GtLCFunc,
- *GtLRFunc};
+ // The runtime's cross-team final aggregate uses the storage pointed at by
+ // its reduce-list argument as per-thread scratch. When the surrounding
+ // kernel is already in SPMD execution mode, clang emitted each reduction
+ // private as a per-thread `alloca addrspace(5)`, so the original red_list
+ // (RL) is already per-thread and nothing else is needed.
+ //
+ // When the kernel is in Non-SPMD execution mode at codegen time, clang's
+ // Generic-mode globalization put the reduction private into team-shared
+ // LDS. OpenMPOpt may later upgrade the kernel to Generic-SPMD, at which
+ // point all threads of the last team enter the cross-team final aggregate
+ // — and they would race on the shared LDS slot if we passed RL through.
+ // To avoid that race, emit a per-thread scratch buffer and a per-thread
+ // red_list, copy the team-local value in, and hand the per-thread red_list
+ // to the runtime instead. `PerThreadScratchFieldPtrs` is then non-empty,
+ // which signals the writer-thread combine loop below to read the final
+ // value from the per-thread scratch (which the runtime updated) rather
+ // than from RI.PrivateVariable (which still holds the team-local value).
+ Value *RuntimeRedList = RL;
+ if (!IsSPMD) {
+ CodeGenIP = Builder.saveIP();
+ Builder.restoreIP(AllocaIP);
+ Value *PerThreadScratchAlloca = Builder.CreateAlloca(
+ ReductionsBufferTy, /*ArraySize=*/nullptr, ".omp.reduction.scratch");
+ Value *PerThreadScratch = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ PerThreadScratchAlloca, PtrTy,
+ PerThreadScratchAlloca->getName() + ".ascast");
+ ArrayType *PerThreadRedListTy =
+ ArrayType::get(PtrTy, ReductionInfos.size());
+ Value *PerThreadRedListAlloca =
+ Builder.CreateAlloca(PerThreadRedListTy, /*ArraySize=*/nullptr,
+ ".omp.reduction.per_thread_red_list");
+ Value *PerThreadRedList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ PerThreadRedListAlloca, PtrTy,
+ PerThreadRedListAlloca->getName() + ".ascast");
+ Builder.restoreIP(CodeGenIP);
+
+ PerThreadScratchFieldPtrs.assign(ReductionInfos.size(), nullptr);
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ Type *FieldTy = ReductionTypeArgs[En.index()];
+ Value *FieldPtr = Builder.CreateConstInBoundsGEP2_32(
+ ReductionsBufferTy, PerThreadScratch, 0, En.index());
+ Value *Slot =
+ Builder.CreateInBoundsGEP(PerThreadRedListTy, PerThreadRedList,
+ {ConstantInt::get(IndexTy, 0),
+ ConstantInt::get(IndexTy, En.index())});
+ Builder.CreateStore(FieldPtr, Slot);
----------------
ro-i wrote:
fixed
https://github.com/llvm/llvm-project/pull/195102
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits