================
@@ -4692,23 +4691,74 @@ OpenMPIRBuilder::InsertPointOrErrorTy 
OpenMPIRBuilder::createReductionsGPU(
 
     Builder.restoreIP(CodeGenIP);
 
-    Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
-        RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
-
-    Value *Args3[] = {SrcLocInfo,
-                      KernelTeamsReductionPtr,
-                      Builder.getInt32(ReductionBufNum),
-                      ReductionDataSize,
-                      RL,
-                      *SarFunc,
-                      WcFunc,
-                      *LtGCFunc,
-                      *LtGRFunc,
-                      *GtLCFunc,
-                      *GtLRFunc};
+    // The runtime's cross-team final aggregate uses the storage pointed at by
+    // its reduce-list argument as per-thread scratch.  When the surrounding
+    // kernel is already in SPMD execution mode, clang emitted each reduction
+    // private as a per-thread `alloca addrspace(5)`, so the original red_list
+    // (RL) is already per-thread and nothing else is needed.
+    //
+    // When the kernel is in Non-SPMD execution mode at codegen time, clang's
+    // Generic-mode globalization put the reduction private into team-shared
+    // LDS.  OpenMPOpt may later upgrade the kernel to Generic-SPMD, at which
+    // point all threads of the last team enter the cross-team final aggregate
+    // — and they would race on the shared LDS slot if we passed RL through.
+    // Emit a per-thread scratch buffer + a per-thread red_list, copy the
+    // team-local value in, and hand the per-thread red_list to the runtime
+    // instead.  `PerThreadScratchFieldPtrs` is then non-empty, which signals
+    // the writer-thread combine loop below to source the final value from
+    // the per-thread scratch (which the runtime updated) rather than from
+    // RI.PrivateVariable (which still holds the team-local value).
+    Value *RuntimeRedList = RL;
+    if (!IsSPMD) {
+      CodeGenIP = Builder.saveIP();
+      Builder.restoreIP(AllocaIP);
+      Value *PerThreadScratchAlloca = Builder.CreateAlloca(
+          ReductionsBufferTy, /*ArraySize=*/nullptr, ".omp.reduction.scratch");
+      Value *PerThreadScratch = Builder.CreatePointerBitCastOrAddrSpaceCast(
+          PerThreadScratchAlloca, PtrTy,
+          PerThreadScratchAlloca->getName() + ".ascast");
+      ArrayType *PerThreadRedListTy =
+          ArrayType::get(PtrTy, ReductionInfos.size());
+      Value *PerThreadRedListAlloca =
+          Builder.CreateAlloca(PerThreadRedListTy, /*ArraySize=*/nullptr,
+                               ".omp.reduction.per_thread_red_list");
+      Value *PerThreadRedList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+          PerThreadRedListAlloca, PtrTy,
+          PerThreadRedListAlloca->getName() + ".ascast");
+      Builder.restoreIP(CodeGenIP);
+
+      PerThreadScratchFieldPtrs.assign(ReductionInfos.size(), nullptr);
+      for (auto En : enumerate(ReductionInfos)) {
+        const ReductionInfo &RI = En.value();
+        Type *FieldTy = ReductionTypeArgs[En.index()];
+        Value *FieldPtr = Builder.CreateConstInBoundsGEP2_32(
+            ReductionsBufferTy, PerThreadScratch, 0, En.index());
+        Value *Slot =
+            Builder.CreateInBoundsGEP(PerThreadRedListTy, PerThreadRedList,
+                                      {ConstantInt::get(IndexTy, 0),
+                                       ConstantInt::get(IndexTy, En.index())});
+        Builder.CreateStore(FieldPtr, Slot);
----------------
ro-i wrote:

fixed

https://github.com/llvm/llvm-project/pull/195102
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to