efwright updated this revision to Diff 543540.
efwright added a comment.

Dropping off a simple test case. If this looks about what you would expect for 
the tests I have a couple more involved ones that I can repurpose and add in. 
For more complex tests we have a couple of the benchmark codes from ICPP that 
were working.

Some cleanup of the code gen is coming, will be on travel tomorrow so might 
take a day or two.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D154568/new/

https://reviews.llvm.org/D154568

Files:
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/Parse/ParseOpenMP.cpp
  llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
  llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
  openmp/libomptarget/DeviceRTL/src/Workshare.cpp
  openmp/libomptarget/test/offloading/simd.c

Index: openmp/libomptarget/test/offloading/simd.c
===================================================================
--- /dev/null
+++ openmp/libomptarget/test/offloading/simd.c
@@ -0,0 +1,27 @@
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+
+  int *A = (int*) malloc(32*sizeof(int));
+
+  #pragma omp target teams map(tofrom:A[0:32]) num_teams(1)
+  {
+    #pragma omp parallel num_threads(32)
+    {
+      #pragma omp simd
+      for(int i = 0; i < 32; i++)
+        A[i] = 1;
+    }
+  }
+
+  for(int i = 0; i < 32; i++)
+    assert(A[i] == 1);
+
+  free(A);
+
+  printf("PASS\n");
+}
+// CHECK: PASS
+
Index: openmp/libomptarget/DeviceRTL/src/Workshare.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -658,6 +658,19 @@
 void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
   FunctionTracingRAII();
 }
+
+void __kmpc_simd_51(
+  IdentTy *ident, void *WorkFn, uint64_t TripCount,
+  void **Args, uint32_t nargs
+) {
+  FunctionTracingRAII();
+
+  ASSERT(WorkFn); 
+  for(uint64_t omp_iv = 0; omp_iv < TripCount; omp_iv++) {
+    ((void (*)(uint64_t, void**))WorkFn)(omp_iv, Args);
+  }
+  
+}
 }
 
 #pragma omp end declare target
Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===================================================================
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -454,6 +454,7 @@
   for (OutlineInfo &OI : OutlineInfos) {
     // Skip functions that have not finalized yet; may happen with nested
     // function generation.
+    
     if (Fn && OI.getFunction() != Fn) {
       DeferredOutlines.push_back(OI);
       continue;
@@ -462,7 +463,6 @@
     ParallelRegionBlockSet.clear();
     Blocks.clear();
     OI.collectBlocks(ParallelRegionBlockSet, Blocks);
-
     Function *OuterFn = OI.getFunction();
     CodeExtractorAnalysisCache CEAC(*OuterFn);
     CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
@@ -481,11 +481,10 @@
     assert(Extractor.isEligible() &&
            "Expected OpenMP outlining to be possible!");
 
-    for (auto *V : OI.ExcludeArgsFromAggregate)
+    for (auto *V : OI.ExcludeArgsFromAggregate) {
       Extractor.excludeArgFromAggregate(V);
-
+    }
     Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
-
     LLVM_DEBUG(dbgs() << "After      outlining: " << *OuterFn << "\n");
     LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
     assert(OutlinedFn->getReturnType()->isVoidTy() &&
@@ -1230,6 +1229,308 @@
   return AfterIP;
 }
 
+
+IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
+  const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
+  LoopBodyCallbackTy BodyGenCB,
+  TripCountCallbackTy DistanceCB,
+  PrivatizeCallbackTy PrivCB,
+  FinalizeCallbackTy FiniCB,
+  bool SPMDMode
+)
+{
+  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
+
+  if (!updateToLocation(Loc))
+    return Loc.IP;
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+  Value *ThreadID = getOrCreateThreadID(Ident);
+
+  BasicBlock *InsertBB = Builder.GetInsertBlock();
+  Function *OuterFn = InsertBB->getParent();
+
+  LLVM_DEBUG(dbgs() << "At the start of createSimdLoop: " << *OuterFn << "\n");
+
+  // Save the outer alloca block because the insertion iterator may get
+  // invalidated and we still need this later.
+  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
+
+  // Vector to remember instructions we used only during the modeling but which
+  // we want to delete at the end.
+  SmallVector<Instruction *, 16> ToBeDeleted;
+
+  // Create an artificial insertion point that will also ensure the blocks we
+  // are about to split are not degenerated.
+  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
+
+  Instruction *ThenTI = UI, *ElseTI = nullptr;
+
+  BasicBlock *ThenBB = ThenTI->getParent();
+  BasicBlock *LRegDistanceBB = ThenBB->splitBasicBlock(ThenTI, "omp.loop.distance");
+  BasicBlock *PRegEntryBB = LRegDistanceBB->splitBasicBlock(ThenTI, "omp.loop.entry");
+  BasicBlock *PRegBodyBB =
+      PRegEntryBB->splitBasicBlock(ThenTI, "omp.loop.region");
+  BasicBlock *PRegPreFiniBB =
+      PRegBodyBB->splitBasicBlock(ThenTI, "omp.loop.pre_finalize");
+  BasicBlock *PRegExitBB =
+      PRegPreFiniBB->splitBasicBlock(ThenTI, "omp.loop.exit");
+
+
+  auto FiniCBWrapper = [&](InsertPointTy IP) {
+    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
+    // target to the region exit blocks
+    if (IP.getBlock()->end() == IP.getPoint()) {
+      IRBuilder<>::InsertPointGuard IPG(Builder);
+      Builder.restoreIP(IP);
+      Instruction *I = Builder.CreateBr(PRegExitBB);
+      IP = InsertPointTy(I->getParent(), I->getIterator());
+    }
+    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
+           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
+           "Unexpected insertion point for finalization call!");
+    return FiniCB(IP);
+  };
+
+  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, false});
+
+  // Compute the loop trip count
+  // Insert after the outer alloca to ensure all variables need
+  // in its calculation are ready
+  InsertPointTy DistanceIP(LRegDistanceBB, LRegDistanceBB->begin());
+  Value *DistVal;
+  bool IsTripCountSigned;
+  assert(DistanceCB && "expected loop trip count callback function!");
+  DistanceCB(DistanceIP, DistVal, IsTripCountSigned);
+  assert(DistVal && "trip count call back should return integer trip count");
+  Type *DistValType = DistVal->getType();
+  assert(DistValType->isIntegerTy() && "trip count should be integer type");
+
+  // Possibly need to cast DistVal to Int64
+  // FIXME if the integer is signed it needs to be converted to
+  // unsigned. If the tripcount is <0 it's fine to just convert
+  // it to 0.
+  if(!DistValType->isIntegerTy(64)) {
+    Builder.SetInsertPoint(LRegDistanceBB->getTerminator());
+    DistVal = Builder.CreateIntCast(
+     DistVal, Int64, /*IsTripCountSigned*/ false, DistVal->getName()+".casted");
+  }
+
+  LLVM_DEBUG(dbgs() << "After DistanceCB: " << *LRegDistanceBB << "\n");
+  LLVM_DEBUG(dbgs() << "Trip count variable: " << *DistVal << "\n");
+
+  // Create the virtual iteration variable that will be pulled into
+  // the outlined function.
+  Builder.restoreIP(OuterAllocaIP);
+  AllocaInst *OMPIVAlloca = Builder.CreateAlloca(Int64, nullptr, "omp.iv.tmp");
+  LoadInst *OMPIVLoad = Builder.CreateLoad(Int64, OMPIVAlloca, "omp.iv");
+
+  // Generate the privatization allocas in the block that will become the entry
+  // of the outlined function.
+  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
+  InsertPointTy InnerAllocaIP = Builder.saveIP();
+
+  // Use omp.iv in the outlined region. Cast it if needed.
+  Instruction *OMPIV;
+  if(!DistValType->isIntegerTy(64)) {
+    // Cast omp.iv to the same type as the trip count.
+    // If the cast is needed, keep it in the outlined region
+    OMPIV = dyn_cast<Instruction>(
+      Builder.CreateTrunc(OMPIVLoad, DistValType, "omp.iv.casted"));
+  } else {
+    // If cast is unneeded, we still need to generate a fake use of
+    // omp.iv so the outlined function picks it up as the first arg
+    Instruction *OMPIVUse = dyn_cast<Instruction>(
+     Builder.CreateAdd(OMPIVLoad, Builder.getInt64(0), "omp.iv.tobedeleted"));
+    OMPIV = OMPIVLoad;
+    ToBeDeleted.push_back(OMPIVUse);
+  }
+
+  // Order matters
+  ToBeDeleted.push_back(OMPIVLoad);
+  ToBeDeleted.push_back(OMPIVAlloca);
+
+  LLVM_DEBUG(llvm::dbgs() << "omp.iv variable generated: " << *OuterFn << "\n");
+
+  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
+  assert(BodyGenCB && "Expected body generation callback!");
+  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
+
+  BodyGenCB(InnerAllocaIP, CodeGenIP, OMPIV);
+
+  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
+
+  FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_simd_51);
+
+  OutlineInfo OI;
+
+  // Adjust the finalization stack, verify the adjustment, and call the
+  // finalize function a last time to finalize values between the pre-fini
+  // block and the exit block if we left the parallel "the normal way".
+  auto FiniInfo = FinalizationStack.pop_back_val();
+  (void)FiniInfo;
+  assert(FiniInfo.DK == OMPD_parallel &&
+         "Unexpected finalization stack state!");
+
+  Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
+
+  InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
+  FiniCB(PreFiniIP);
+
+  OI.OuterAllocaBB = OuterAllocaBlock;
+  OI.EntryBB = PRegEntryBB;
+  OI.ExitBB = PRegExitBB;
+
+  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
+  SmallVector<BasicBlock *, 32> Blocks;
+  OI.collectBlocks(ParallelRegionBlockSet, Blocks);
+
+  // Ensure a single exit node for the outlined region by creating one.
+  // We might have multiple incoming edges to the exit now due to finalizations,
+  // e.g., cancel calls that cause the control flow to leave the region.
+  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
+  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
+  PRegOutlinedExitBB->setName("omp.loop.outlined.exit");
+  Blocks.push_back(PRegOutlinedExitBB);
+
+  CodeExtractorAnalysisCache CEAC(*OuterFn);
+
+  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
+                          /* AggregateArgs */ true,
+                          /* BlockFrequencyInfo */ nullptr,
+                          /* BranchProbabilityInfo */ nullptr,
+                          /* AssumptionCache */ nullptr,
+                          /* AllowVarArgs */ false,
+                          /* AllowAlloca */ true,
+                          /* AllocationBlock */ OuterAllocaBlock,
+                          /* Suffix */ ".omp_simd");
+
+  BasicBlock *CommonExit = nullptr;
+  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
+  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
+
+  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
+
+  auto PrivHelper = [&](Value &V) {
+    // Exclude omp.iv from aggregate
+    if (&V == OMPIVLoad) {
+      OI.ExcludeArgsFromAggregate.push_back(&V);
+      return;
+    }
+
+    // Get all uses of value that are inside of the outlined region
+    SetVector<Use *> Uses;
+    for (Use &U : V.uses())
+      if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
+        if (ParallelRegionBlockSet.count(UserI->getParent()))
+          Uses.insert(&U);
+
+    Value *Inner = &V;
+
+    // If the value isn't a pointer type, store it in a pointer
+    // Unpack it inside the outlined region
+    if (!V.getType()->isPointerTy()) {
+      IRBuilder<>::InsertPointGuard Guard(Builder);
+      LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
+
+      Builder.restoreIP(OuterAllocaIP);
+      Value *Ptr = Builder.CreateAlloca(
+        V.getType(), nullptr, V.getName() + ".reloaded");
+
+      // Store to stack at end of the block that currently branches to the entry
+      // block of the to-be-outlined region.
+      Builder.SetInsertPoint(
+        InsertBB, InsertBB->getTerminator()->getIterator());
+      Builder.CreateStore(&V, Ptr);
+
+      // Load back next to allocations in the to-be-outlined region.
+      Builder.restoreIP(InnerAllocaIP);
+      Inner = Builder.CreateLoad(V.getType(), Ptr);
+    }
+
+    Value *ReplacementValue = nullptr;
+    Builder.restoreIP(
+      PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
+    assert(ReplacementValue &&
+      "Expected copy/create callback to set replacement value!");
+    if (ReplacementValue == &V)
+      return;
+
+    for (Use *UPtr : Uses)
+      UPtr->set(ReplacementValue);
+
+  };
+
+  InnerAllocaIP = IRBuilder<>::InsertPoint(
+      OMPIV->getParent(), OMPIV->getNextNode()->getIterator());
+
+  // Reset the outer alloca insertion point to the entry of the relevant block
+  // in case it was invalidated.
+  OuterAllocaIP = IRBuilder<>::InsertPoint(
+    OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
+
+  for (Value *Input : Inputs) {
+    PrivHelper(*Input);
+  }
+
+  assert(Outputs.empty() &&
+    "OpenMP outlining should not produce live-out values!");
+
+  LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
+  for (auto *BB : Blocks) {
+    LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
+  }
+
+  int NumInputs = Inputs.size()-1; // One argument is always omp.iv
+  OI.PostOutlineCB = [=](Function &OutlinedFn) {
+    OutlinedFn.addFnAttr(Attribute::NoUnwind);
+    OutlinedFn.addFnAttr(Attribute::NoRecurse);
+
+    assert(OutlinedFn.arg_size() == 2 &&
+           "Expected omp.iv & structArg as arguments");
+
+    CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+    BasicBlock *CallBlock = CI->getParent();
+    CallBlock->setName("omp_loop");
+    Builder.SetInsertPoint(CI);
+
+    Value * StructArg = CI->getArgOperand(1); // 0 should be omp.iv
+
+    Value *SimdArgs[] = {
+        Ident,
+        Builder.CreateBitCast(&OutlinedFn, LoopTaskPtr),
+        DistVal,
+        Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr),
+        Builder.getInt32(NumInputs)};
+
+    SmallVector<Value *, 16> RealArgs;
+    RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs));
+
+    CallInst *Simd51Call = Builder.CreateCall(RTLFn, RealArgs);
+
+    LLVM_DEBUG(dbgs() << "With runtime call placed: " << *Builder.GetInsertBlock()->getParent() << "\n");
+
+    InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end());
+
+    CI->eraseFromParent();
+
+    for (Instruction *I : ToBeDeleted)
+      I->eraseFromParent();
+
+  };
+
+  addOutlineInfo(std::move(OI));
+
+  InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
+  UI->eraseFromParent();
+
+  return AfterIP;
+
+}
+
 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
   // Build call void __kmpc_flush(ident_t *loc)
   uint32_t SrcLocStrSize;
Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -119,6 +119,8 @@
 __OMP_FUNCTION_TYPE(InterWarpCopy, false, Void, VoidPtr, Int32)
 __OMP_FUNCTION_TYPE(GlobalList, false, Void, VoidPtr, Int32, VoidPtr)
 
+__OMP_FUNCTION_TYPE(LoopTask, false, Void, Int64, VoidPtrPtr)
+
 #undef __OMP_FUNCTION_TYPE
 #undef OMP_FUNCTION_TYPE
 
@@ -484,6 +486,8 @@
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
 __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 
+__OMP_RTL(__kmpc_simd_51, false, Void, IdentPtr, LoopTaskPtr, Int64, VoidPtrPtr, Int32)
+
 __OMP_RTL(__last, false, Void, )
 
 #undef __OMP_RTL
Index: llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -506,6 +506,16 @@
   using BodyGenCallbackTy =
       function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
 
+  using LoopBodyCallbackTy =
+      function_ref<void(
+        InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *IterationNum
+      )>;
+
+  using TripCountCallbackTy =
+      function_ref<
+        void(InsertPointTy CodeGenIP, Value *&TripCount, bool &Signed)
+      >;
+
   // This is created primarily for sections construct as llvm::function_ref
   // (BodyGenCallbackTy) is not storable (as described in the comments of
   // function_ref class - function_ref contains non-ownable reference
@@ -605,6 +615,15 @@
                  Value *NumThreads, omp::ProcBindKind ProcBind,
                  bool IsCancellable);
 
+  IRBuilder<>::InsertPoint
+  createSimdLoop(const LocationDescription &Loc, InsertPointTy AllocaIP,
+                 LoopBodyCallbackTy BodyGenCB,
+                 TripCountCallbackTy DistanceCB,
+                 PrivatizeCallbackTy PrivCB,
+                 FinalizeCallbackTy FiniCB,
+                 bool SPMDMode);
+
+
   /// Generator for the control flow structure of an OpenMP canonical loop.
   ///
   /// This generator operates on the logical iteration space of the loop, i.e.
Index: clang/lib/Parse/ParseOpenMP.cpp
===================================================================
--- clang/lib/Parse/ParseOpenMP.cpp
+++ clang/lib/Parse/ParseOpenMP.cpp
@@ -2953,7 +2953,7 @@
         Sema::CompoundScopeRAII Scope(Actions);
         AssociatedStmt = ParseStatement();
 
-        if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
+        if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind)  &&
             getLangOpts().OpenMPIRBuilder)
           AssociatedStmt = Actions.ActOnOpenMPLoopnest(AssociatedStmt.get());
       }
Index: clang/lib/CodeGen/CodeGenFunction.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.cpp
+++ clang/lib/CodeGen/CodeGenFunction.cpp
@@ -94,7 +94,8 @@
   // seems to be a reasonable spot. We do it here, as opposed to the deletion
   // time of the CodeGenModule, because we have to ensure the IR has not yet
   // been "emitted" to the outside, thus, modifications are still sensible.
-  if (CGM.getLangOpts().OpenMPIRBuilder && CurFn)
+  //if (CGM.getLangOpts().OpenMPIRBuilder && CurFn)
+  if (CurFn)
     CGM.getOpenMPRuntime().getOMPBuilder().finalize(CurFn);
 }
 
Index: clang/lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -2664,59 +2664,102 @@
 }
 
 void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
-  bool UseOMPIRBuilder =
-      CGM.getLangOpts().OpenMPIRBuilder && isSupportedByOpenMPIRBuilder(S);
+  bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder;
+  //bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIsDevice;
   if (UseOMPIRBuilder) {
-    auto &&CodeGenIRBuilder = [this, &S, UseOMPIRBuilder](CodeGenFunction &CGF,
-                                                          PrePostActionTy &) {
-      // Use the OpenMPIRBuilder if enabled.
-      if (UseOMPIRBuilder) {
-        llvm::MapVector<llvm::Value *, llvm::Value *> AlignedVars =
-            GetAlignedMapping(S, CGF);
-        // Emit the associated statement and get its loop representation.
-        const Stmt *Inner = S.getRawStmt();
-        llvm::CanonicalLoopInfo *CLI =
-            EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
-
-        llvm::OpenMPIRBuilder &OMPBuilder =
-            CGM.getOpenMPRuntime().getOMPBuilder();
-        // Add SIMD specific metadata
-        llvm::ConstantInt *Simdlen = nullptr;
-        if (const auto *C = S.getSingleClause<OMPSimdlenClause>()) {
-          RValue Len =
-              this->EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(),
-                                /*ignoreResult=*/true);
-          auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
-          Simdlen = Val;
-        }
-        llvm::ConstantInt *Safelen = nullptr;
-        if (const auto *C = S.getSingleClause<OMPSafelenClause>()) {
-          RValue Len =
-              this->EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(),
-                                /*ignoreResult=*/true);
-          auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
-          Safelen = Val;
-        }
-        llvm::omp::OrderKind Order = llvm::omp::OrderKind::OMP_ORDER_unknown;
-        if (const auto *C = S.getSingleClause<OMPOrderClause>()) {
-          if (C->getKind() == OpenMPOrderClauseKind ::OMPC_ORDER_concurrent) {
-            Order = llvm::omp::OrderKind::OMP_ORDER_concurrent;
+    auto *CS = dyn_cast<CapturedStmt>(S.getAssociatedStmt());
+    auto *CL = dyn_cast<OMPCanonicalLoop>(CS->getCapturedStmt());
+    CGCapturedStmtInfo CGSI(*CS, CR_OpenMP);
+
+    CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI);
+    llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
+      AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
+
+    const auto *For = dyn_cast<ForStmt>(CL->getLoopStmt());
+    if(const Stmt *InitStmt = For->getInit())
+      EmitStmt(InitStmt);
+    const DeclRefExpr *LoopVarRef = CL->getLoopVarRef();
+    LValue LCVal = EmitLValue(LoopVarRef);
+    Address LoopVarAddress = LCVal.getAddress(*this);
+    llvm::AllocaInst *LoopVar = dyn_cast<llvm::AllocaInst>(LoopVarAddress.getPointer());
+
+    llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder();
+
+    using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+
+    // FIXME check if trip count is signed
+    auto DistanceCB = [this, CL, LoopVar](InsertPointTy CodeGenIP, llvm::Value *&TripCount, bool &Signed) -> void {
+      Builder.restoreIP(CodeGenIP);
+
+      const CapturedStmt *DistanceFunc = CL->getDistanceFunc();
+      EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc);
+
+      QualType LogicalTy = DistanceFunc->getCapturedDecl()
+                           ->getParam(0)
+                           ->getType()
+                           .getNonReferenceType();
+      Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+      emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()});
+      TripCount = Builder.CreateLoad(CountAddr, ".count");
+
+      return;
+    };
+
+    auto FiniCB = [this](InsertPointTy IP) {
+      OMPBuilderCBHelpers::FinalizeOMPRegion(*this, IP);
+    };
+
+    auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                     llvm::Value &, llvm::Value &Val, llvm::Value *&ReplVal) {
+      ReplVal = &Val;
+      return CodeGenIP;
+    };
+
+    const Stmt *loopBody = S.getBody();
+    auto BodyGenCB = [loopBody, this, CL, LoopVar, &S]
+                     (InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                      llvm::Value *Virtual) {
+      llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
+
+      OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody(
+          *this,
+          loopBody,
+          AllocaIP,
+          CodeGenIP,
+          "simd");
+
+      Builder.restoreIP(AllocaIP);
+      llvm::AllocaInst *NewLoopVar =
+            Builder.CreateAlloca(LoopVar->getAllocatedType(), LoopVar->getAddressSpace(),
+                                 LoopVar->getArraySize(), LoopVar->getName()+".loopvar");
+
+      for(llvm::User *U : LoopVar->users()) {
+        if(auto I = dyn_cast<llvm::Instruction>(U)) {
+          if(I->getParent() == CodeGenIPBB) {
+            U->replaceUsesOfWith(LoopVar, NewLoopVar);
           }
         }
-        // Add simd metadata to the collapsed loop. Do not generate
-        // another loop for if clause. Support for if clause is done earlier.
-        OMPBuilder.applySimd(CLI, AlignedVars,
-                             /*IfCond*/ nullptr, Order, Simdlen, Safelen);
-        return;
       }
+
+      const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc();
+      EmittedClosureTy LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc);
+      Builder.SetInsertPoint(CodeGenIPBB, CodeGenIPBB->begin());
+      emitCapturedStmtCall(*this, LoopVarClosure,
+                           {NewLoopVar, Virtual});
+
     };
-    {
-      auto LPCRegion =
-          CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
-      OMPLexicalScope Scope(*this, S, OMPD_unknown);
-      CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd,
-                                                  CodeGenIRBuilder);
-    }
+
+    Builder.restoreIP(
+      OMPBuilder.createSimdLoop(
+        Builder,
+        AllocaIP,
+        BodyGenCB,
+        DistanceCB,
+        PrivCB,
+        FiniCB,
+        /*SPMD*/ true
+    ));
+
     return;
   }
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
  • [PATCH] D154568: [C... Eric Wright via Phabricator via cfe-commits

Reply via email to