carlo.bertolli updated this revision to Diff 48200.
carlo.bertolli added a comment.

Updated diff reflecting review comments.
Note that emitting the call to the kmpc_for_static_init runtime function from a
file-static helper (instead of a method of CGOpenMPRuntime) required making
public some members of CGOpenMPRuntime that are used when building the call.
This is unfortunate; comments on how to avoid it are welcome.
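One possible direction is sketched below. This is a toy illustration, not the
actual CGOpenMPRuntime code: the Runtime class, emitInitCall, and the printed
output are invented for the example; only the schedule constants 34 and 92
(OMP_sch_static / OMP_dist_sch_static) come from the patch. The idea is to make
the shared helper a private static member function rather than a file-static
function: it still takes the runtime reference explicitly, but it keeps access
to private members, so no field has to become public.

  #include <iostream>

  class Runtime {
    int DefaultLoc = 0; // stands in for the private location/thread-id state

    // Shared helper as a private static member: unlike a file-static
    // function, it can read RT's private state without widening access.
    static void emitInitCall(Runtime &RT, int Schedule) {
      std::cout << "init(loc=" << RT.DefaultLoc << ", sched=" << Schedule
                << ")\n";
    }

  public:
    void emitForStaticInit() { emitInitCall(*this, /*OMP_sch_static=*/34); }
    void emitDistributeStaticInit() {
      emitInitCall(*this, /*OMP_dist_sch_static=*/92);
    }
  };

  int main() {
    Runtime RT;
    RT.emitForStaticInit();        // prints: init(loc=0, sched=34)
    RT.emitDistributeStaticInit(); // prints: init(loc=0, sched=92)
  }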


Repository:
  rL LLVM

http://reviews.llvm.org/D17170

Files:
  include/clang/AST/StmtOpenMP.h
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.h
  lib/Serialization/ASTReaderStmt.cpp
  lib/Serialization/ASTWriterStmt.cpp
  test/OpenMP/distribute_codegen.cpp

Index: test/OpenMP/distribute_codegen.cpp
===================================================================
--- /dev/null
+++ test/OpenMP/distribute_codegen.cpp
@@ -0,0 +1,241 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64  --check-prefix HCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32  --check-prefix HCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK
+
+// Test target codegen - host bc file has to be created first. (There are no significant differences from the host version of the target region.)
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: %ident_t = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CHECK-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void without_schedule_clause(float *a, float *b, float *c, float *d) {
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute
+  for (int i = 33; i < 32000000; i += 7) {
+    a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+// CHECK: define {{.*}}void @.omp_outlined.(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], float** dereferenceable({{[0-9]+}}) [[APTR:%.+]], float** dereferenceable({{[0-9]+}}) [[BPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[CPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[DPTR:%.+]])
+// CHECK:  [[TID_ADDR:%.+]] = alloca i32*
+// CHECK:  [[IV:%.+iv]] = alloca i32
+// CHECK:  [[LB:%.+lb]] = alloca i32
+// CHECK:  [[UB:%.+ub]] = alloca i32
+// CHECK:  [[ST:%.+stride]] = alloca i32
+// CHECK:  [[LAST:%.+last]] = alloca i32
+// CHECK-DAG:  store i32* [[GBL_TIDP]], i32** [[TID_ADDR]]
+// CHECK-DAG:  store i32 0, i32* [[LB]]
+// CHECK-DAG:  store i32 4571423, i32* [[UB]]
+// CHECK-DAG:  store i32 1, i32* [[ST]]
+// CHECK-DAG:  store i32 0, i32* [[LAST]]
+// CHECK-DAG:  [[GBL_TID:%.+]] = load i32*, i32** [[TID_ADDR]]
+// CHECK-DAG:  [[GBL_TIDV:%.+]] = load i32, i32* [[GBL_TID]]
+// CHECK:  call void @__kmpc_for_static_init_{{.+}}(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]], i32 92, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+// CHECK-DAG:  [[UBV0:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  [[USWITCH:%.+]] = icmp sgt i32 [[UBV0]], 4571423
+// CHECK:  br i1 [[USWITCH]], label %[[BBCT:.+]], label %[[BBCF:.+]]
+// CHECK-DAG:  [[BBCT]]:
+// CHECK-DAG:  br label %[[BBCE:.+]]
+// CHECK-DAG:  [[BBCF]]:
+// CHECK-DAG:  [[UBV1:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  br label %[[BBCE]]
+// CHECK:  [[BBCE]]:
+// CHECK:  [[SELUB:%.+]] = phi i32 [ 4571423, %[[BBCT]] ], [ [[UBV1]], %[[BBCF]] ]
+// CHECK:  store i32 [[SELUB]], i32* [[UB]]
+// CHECK:  [[LBV0:%.+]] = load i32, i32* [[LB]]
+// CHECK:  store i32 [[LBV0]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR:.+]]
+// CHECK:  [[BBINNFOR]]:
+// CHECK:  [[IVVAL0:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[UBV2:%.+]] = load i32, i32* [[UB]]
+// CHECK:  [[IVLEUB:%.+]] = icmp sle i32 [[IVVAL0]], [[UBV2]]
+// CHECK:  br i1 [[IVLEUB]], label %[[BBINNBODY:.+]], label %[[BBINNEND:.+]]
+// CHECK:  [[BBINNBODY]]:
+// CHECK:  {{.+}} = load i32, i32* [[IV]]
+// ... loop body ...
+// CHECK:  br label %[[BBBODYCONT:.+]]
+// CHECK:  [[BBBODYCONT]]:
+// CHECK:  br label %[[BBINNINC:.+]]
+// CHECK:  [[BBINNINC]]:
+// CHECK:  [[IVVAL1:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[IVINC:%.+]] = add nsw i32 [[IVVAL1]], 1
+// CHECK:  store i32 [[IVINC]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR]]
+// CHECK:  [[BBINNEND]]:
+// CHECK:  br label %[[LPEXIT:.+]]
+// CHECK:  [[LPEXIT]]:
+// CHECK:  call void @__kmpc_for_static_fini(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]])
+// CHECK:  ret void
+
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_not_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_not_chunked(float *a, float *b, float *c, float *d) {
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute dist_schedule(static)
+  for (int i = 32000000; i > 33; i += -7) {
+        a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+// CHECK: define {{.*}}void @.omp_outlined.{{.*}}(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], float** dereferenceable({{[0-9]+}}) [[APTR:%.+]], float** dereferenceable({{[0-9]+}}) [[BPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[CPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[DPTR:%.+]])
+// CHECK:  [[TID_ADDR:%.+]] = alloca i32*
+// CHECK:  [[IV:%.+iv]] = alloca i32
+// CHECK:  [[LB:%.+lb]] = alloca i32
+// CHECK:  [[UB:%.+ub]] = alloca i32
+// CHECK:  [[ST:%.+stride]] = alloca i32
+// CHECK:  [[LAST:%.+last]] = alloca i32
+// CHECK-DAG:  store i32* [[GBL_TIDP]], i32** [[TID_ADDR]]
+// CHECK-DAG:  store i32 0, i32* [[LB]]
+// CHECK-DAG:  store i32 4571423, i32* [[UB]]
+// CHECK-DAG:  store i32 1, i32* [[ST]]
+// CHECK-DAG:  store i32 0, i32* [[LAST]]
+// CHECK-DAG:  [[GBL_TID:%.+]] = load i32*, i32** [[TID_ADDR]]
+// CHECK-DAG:  [[GBL_TIDV:%.+]] = load i32, i32* [[GBL_TID]]
+// CHECK:  call void @__kmpc_for_static_init_{{.+}}(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]], i32 92, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+// CHECK-DAG:  [[UBV0:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  [[USWITCH:%.+]] = icmp sgt i32 [[UBV0]], 4571423
+// CHECK:  br i1 [[USWITCH]], label %[[BBCT:.+]], label %[[BBCF:.+]]
+// CHECK-DAG:  [[BBCT]]:
+// CHECK-DAG:  br label %[[BBCE:.+]]
+// CHECK-DAG:  [[BBCF]]:
+// CHECK-DAG:  [[UBV1:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  br label %[[BBCE]]
+// CHECK:  [[BBCE]]:
+// CHECK:  [[SELUB:%.+]] = phi i32 [ 4571423, %[[BBCT]] ], [ [[UBV1]], %[[BBCF]] ]
+// CHECK:  store i32 [[SELUB]], i32* [[UB]]
+// CHECK:  [[LBV0:%.+]] = load i32, i32* [[LB]]
+// CHECK:  store i32 [[LBV0]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR:.+]]
+// CHECK:  [[BBINNFOR]]:
+// CHECK:  [[IVVAL0:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[UBV2:%.+]] = load i32, i32* [[UB]]
+// CHECK:  [[IVLEUB:%.+]] = icmp sle i32 [[IVVAL0]], [[UBV2]]
+// CHECK:  br i1 [[IVLEUB]], label %[[BBINNBODY:.+]], label %[[BBINNEND:.+]]
+// CHECK:  [[BBINNBODY]]:
+// CHECK:  {{.+}} = load i32, i32* [[IV]]
+// ... loop body ...
+// CHECK:  br label %[[BBBODYCONT:.+]]
+// CHECK:  [[BBBODYCONT]]:
+// CHECK:  br label %[[BBINNINC:.+]]
+// CHECK:  [[BBINNINC]]:
+// CHECK:  [[IVVAL1:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[IVINC:%.+]] = add nsw i32 [[IVVAL1]], 1
+// CHECK:  store i32 [[IVINC]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR]]
+// CHECK:  [[BBINNEND]]:
+// CHECK:  br label %[[LPEXIT:.+]]
+// CHECK:  [[LPEXIT]]:
+// CHECK:  call void @__kmpc_for_static_fini(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]])
+// CHECK:  ret void
+
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_chunked(float *a, float *b, float *c, float *d) {
+  #pragma omp target
+  #pragma omp teams
+#pragma omp distribute dist_schedule(static, 5)
+  for (unsigned i = 131071; i <= 2147483647; i += 127) {
+    a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+// CHECK: define {{.*}}void @.omp_outlined.{{.*}}(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], float** dereferenceable({{[0-9]+}}) [[APTR:%.+]], float** dereferenceable({{[0-9]+}}) [[BPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[CPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[DPTR:%.+]])
+// CHECK:  [[TID_ADDR:%.+]] = alloca i32*
+// CHECK:  [[IV:%.+iv]] = alloca i32
+// CHECK:  [[LB:%.+lb]] = alloca i32
+// CHECK:  [[UB:%.+ub]] = alloca i32
+// CHECK:  [[ST:%.+stride]] = alloca i32
+// CHECK:  [[LAST:%.+last]] = alloca i32
+// CHECK-DAG:  store i32* [[GBL_TIDP]], i32** [[TID_ADDR]]
+// CHECK-DAG:  store i32 0, i32* [[LB]]
+// CHECK-DAG:  store i32 16908288, i32* [[UB]]
+// CHECK-DAG:  store i32 1, i32* [[ST]]
+// CHECK-DAG:  store i32 0, i32* [[LAST]]
+// CHECK-DAG:  [[GBL_TID:%.+]] = load i32*, i32** [[TID_ADDR]]
+// CHECK-DAG:  [[GBL_TIDV:%.+]] = load i32, i32* [[GBL_TID]]
+// CHECK:  call void @__kmpc_for_static_init_{{.+}}(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]], i32 91, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 5)
+// CHECK-DAG:  [[UBV0:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  [[USWITCH:%.+]] = icmp ugt i32 [[UBV0]], 16908288
+// CHECK:  br i1 [[USWITCH]], label %[[BBCT:.+]], label %[[BBCF:.+]]
+// CHECK-DAG:  [[BBCT]]:
+// CHECK-DAG:  br label %[[BBCE:.+]]
+// CHECK-DAG:  [[BBCF]]:
+// CHECK-DAG:  [[UBV1:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  br label %[[BBCE]]
+// CHECK:  [[BBCE]]:
+// CHECK:  [[SELUB:%.+]] = phi i32 [ 16908288, %[[BBCT]] ], [ [[UBV1]], %[[BBCF]] ]
+// CHECK:  store i32 [[SELUB]], i32* [[UB]]
+// CHECK:  [[LBV0:%.+]] = load i32, i32* [[LB]]
+// CHECK:  store i32 [[LBV0]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR:.+]]
+// CHECK:  [[BBINNFOR]]:
+// CHECK:  [[IVVAL0:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[UBV2:%.+]] = load i32, i32* [[UB]]
+// CHECK:  [[IVLEUB:%.+]] = icmp ule i32 [[IVVAL0]], [[UBV2]]
+// CHECK:  br i1 [[IVLEUB]], label %[[BBINNBODY:.+]], label %[[BBINNEND:.+]]
+// CHECK:  [[BBINNBODY]]:
+// CHECK:  {{.+}} = load i32, i32* [[IV]]
+// ... loop body ...
+// CHECK:  br label %[[BBBODYCONT:.+]]
+// CHECK:  [[BBBODYCONT]]:
+// CHECK:  br label %[[BBINNINC:.+]]
+// CHECK:  [[BBINNINC]]:
+// CHECK:  [[IVVAL1:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[IVINC:%.+]] = add i32 [[IVVAL1]], 1
+// CHECK:  store i32 [[IVINC]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR]]
+// CHECK:  [[BBINNEND]]:
+// CHECK:  br label %[[LPEXIT:.+]]
+// CHECK:  [[LPEXIT]]:
+// CHECK:  call void @__kmpc_for_static_fini(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]])
+// CHECK:  ret void
+
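+// Check that a loop with a non-constant precondition (here the lower bound
+// 'a') guards the runtime calls with an omp.precond check.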
+// CHECK-LABEL: test_precond
+void test_precond() {
+  char a = 0;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute
+  for(char i = a; i < 10; ++i);
+}
+
+// 'a' is passed as a parameter to the outlined functions.
+// CHECK:  define {{.*}}void @.omp_outlined.{{.*}}(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], i8* dereferenceable({{[0-9]+}}) [[APARM:%.+]])
+// CHECK:  store i8* [[APARM]], i8** [[APTRADDR:%.+]]
+// ... many loads of the captured 'a' pointer ...
+// CHECK:  [[A2:%.+]] = load i8*, i8** [[APTRADDR]]
+// CHECK:  [[AVAL0:%.+]] = load i8, i8* [[A2]]
+// CHECK:  [[AVAL1:%.+]] = load i8, i8* [[A2]]
+// CHECK:  [[AVAL2:%.+]] = load i8, i8* [[A2]]
+// CHECK:  [[ACONV:%.+]] = sext i8 [[AVAL2]] to i32
+// CHECK:  [[ACMP:%.+]] = icmp slt i32 [[ACONV]], 10
+// CHECK:  br i1 [[ACMP]], label %[[PRECOND_THEN:.+]], label %[[PRECOND_END:.+]]
+// CHECK:  [[PRECOND_THEN]]
+// CHECK:  call void @__kmpc_for_static_init_4
+// CHECK:  call void @__kmpc_for_static_fini
+// CHECK:  [[PRECOND_END]]
+
+// no templates for now, as these require special handling in target regions and/or declare target
+
+#endif
Index: lib/Serialization/ASTWriterStmt.cpp
===================================================================
--- lib/Serialization/ASTWriterStmt.cpp
+++ lib/Serialization/ASTWriterStmt.cpp
@@ -2080,7 +2080,8 @@
   Writer.AddStmt(D->getCond());
   Writer.AddStmt(D->getInit());
   Writer.AddStmt(D->getInc());
-  if (isOpenMPWorksharingDirective(D->getDirectiveKind())) {
+  if (isOpenMPWorksharingDirective(D->getDirectiveKind()) ||
+      isOpenMPDistributeDirective(D->getDirectiveKind())) {
     Writer.AddStmt(D->getIsLastIterVariable());
     Writer.AddStmt(D->getLowerBoundVariable());
     Writer.AddStmt(D->getUpperBoundVariable());
Index: lib/Serialization/ASTReaderStmt.cpp
===================================================================
--- lib/Serialization/ASTReaderStmt.cpp
+++ lib/Serialization/ASTReaderStmt.cpp
@@ -2293,7 +2293,8 @@
   D->setCond(Reader.ReadSubExpr());
   D->setInit(Reader.ReadSubExpr());
   D->setInc(Reader.ReadSubExpr());
-  if (isOpenMPWorksharingDirective(D->getDirectiveKind())) {
+  if (isOpenMPWorksharingDirective(D->getDirectiveKind()) ||
+      isOpenMPDistributeDirective(D->getDirectiveKind())) {
     D->setIsLastIterVariable(Reader.ReadSubExpr());
     D->setLowerBoundVariable(Reader.ReadSubExpr());
     D->setUpperBoundVariable(Reader.ReadSubExpr());
Index: lib/CodeGen/CodeGenFunction.h
===================================================================
--- lib/CodeGen/CodeGenFunction.h
+++ lib/CodeGen/CodeGenFunction.h
@@ -2359,6 +2359,7 @@
   void EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S);
   void EmitOMPTaskLoopSimdDirective(const OMPTaskLoopSimdDirective &S);
   void EmitOMPDistributeDirective(const OMPDistributeDirective &S);
+  void EmitOMPDistributeLoop(const OMPDistributeDirective &S);
 
   /// \brief Emit inner loop of the worksharing/simd construct.
   ///
@@ -2388,11 +2389,22 @@
   /// \return true, if this construct has any lastprivate clause, false -
   /// otherwise.
   bool EmitOMPWorksharingLoop(const OMPLoopDirective &S);
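+  /// \brief Emit the outer loop skeleton shared by the worksharing and
+  /// distribute codegen paths. The caller is expected to have emitted the
+  /// appropriate runtime initialization call already.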
+  void EmitOMPOuterLoop(bool IsMonotonic, bool DynamicOrOrdered,
+                        const OMPLoopDirective &S, OMPPrivateScope &LoopScope,
+                        bool Ordered, Address LB, Address UB, Address ST,
+                        Address IL, llvm::Value *Chunk);
   void EmitOMPForOuterLoop(OpenMPScheduleClauseKind ScheduleKind,
                            bool IsMonotonic, const OMPLoopDirective &S,
                            OMPPrivateScope &LoopScope, bool Ordered, Address LB,
                            Address UB, Address ST, Address IL,
                            llvm::Value *Chunk);
+  void EmitOMPDistributeOuterLoop(
+      OpenMPDistScheduleClauseKind ScheduleKind,
+      const OMPDistributeDirective &S, OMPPrivateScope &LoopScope,
+      Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk);
   /// \brief Emit code for sections directive.
   OpenMPDirectiveKind EmitSections(const OMPExecutableDirective &S);
 
Index: lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- lib/CodeGen/CGStmtOpenMP.cpp
+++ lib/CodeGen/CGStmtOpenMP.cpp
@@ -1351,82 +1351,15 @@
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen);
 }
 
-void CodeGenFunction::EmitOMPForOuterLoop(
-    OpenMPScheduleClauseKind ScheduleKind, bool IsMonotonic,
+void CodeGenFunction::EmitOMPOuterLoop(bool IsMonotonic, bool DynamicOrOrdered,
     const OMPLoopDirective &S, OMPPrivateScope &LoopScope, bool Ordered,
     Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk) {
   auto &RT = CGM.getOpenMPRuntime();
 
-  // Dynamic scheduling of the outer loop (dynamic, guided, auto, runtime).
-  const bool DynamicOrOrdered = Ordered || RT.isDynamic(ScheduleKind);
-
-  assert((Ordered ||
-          !RT.isStaticNonchunked(ScheduleKind, /*Chunked=*/Chunk != nullptr)) &&
-         "static non-chunked schedule does not need outer loop");
-
-  // Emit outer loop.
-  //
-  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
-  // When schedule(dynamic,chunk_size) is specified, the iterations are
-  // distributed to threads in the team in chunks as the threads request them.
-  // Each thread executes a chunk of iterations, then requests another chunk,
-  // until no chunks remain to be distributed. Each chunk contains chunk_size
-  // iterations, except for the last chunk to be distributed, which may have
-  // fewer iterations. When no chunk_size is specified, it defaults to 1.
-  //
-  // When schedule(guided,chunk_size) is specified, the iterations are assigned
-  // to threads in the team in chunks as the executing threads request them.
-  // Each thread executes a chunk of iterations, then requests another chunk,
-  // until no chunks remain to be assigned. For a chunk_size of 1, the size of
-  // each chunk is proportional to the number of unassigned iterations divided
-  // by the number of threads in the team, decreasing to 1. For a chunk_size
-  // with value k (greater than 1), the size of each chunk is determined in the
-  // same way, with the restriction that the chunks do not contain fewer than k
-  // iterations (except for the last chunk to be assigned, which may have fewer
-  // than k iterations).
-  //
-  // When schedule(auto) is specified, the decision regarding scheduling is
-  // delegated to the compiler and/or runtime system. The programmer gives the
-  // implementation the freedom to choose any possible mapping of iterations to
-  // threads in the team.
-  //
-  // When schedule(runtime) is specified, the decision regarding scheduling is
-  // deferred until run time, and the schedule and chunk size are taken from the
-  // run-sched-var ICV. If the ICV is set to auto, the schedule is
-  // implementation defined
-  //
-  // while(__kmpc_dispatch_next(&LB, &UB)) {
-  //   idx = LB;
-  //   while (idx <= UB) { BODY; ++idx;
-  //   __kmpc_dispatch_fini_(4|8)[u](); // For ordered loops only.
-  //   } // inner loop
-  // }
-  //
-  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
-  // When schedule(static, chunk_size) is specified, iterations are divided into
-  // chunks of size chunk_size, and the chunks are assigned to the threads in
-  // the team in a round-robin fashion in the order of the thread number.
-  //
-  // while(UB = min(UB, GlobalUB), idx = LB, idx < UB) {
-  //   while (idx <= UB) { BODY; ++idx; } // inner loop
-  //   LB = LB + ST;
-  //   UB = UB + ST;
-  // }
-  //
-
   const Expr *IVExpr = S.getIterationVariable();
   const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
   const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
 
-  if (DynamicOrOrdered) {
-    llvm::Value *UBVal = EmitScalarExpr(S.getLastIteration());
-    RT.emitForDispatchInit(*this, S.getLocStart(), ScheduleKind,
-                           IVSize, IVSigned, Ordered, UBVal, Chunk);
-  } else {
-    RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind,
-                         IVSize, IVSigned, Ordered, IL, LB, UB, ST, Chunk);
-  }
-
   auto LoopExit = getJumpDestInCurrentScope("omp.dispatch.end");
 
   // Start the loop with a block that tests the condition.
@@ -1506,6 +1439,110 @@
   // Tell the runtime we are done.
   if (!DynamicOrOrdered)
     RT.emitForStaticFinish(*this, S.getLocEnd());
+}
+
+void CodeGenFunction::EmitOMPForOuterLoop(
+    OpenMPScheduleClauseKind ScheduleKind, bool IsMonotonic,
+    const OMPLoopDirective &S, OMPPrivateScope &LoopScope, bool Ordered,
+    Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk) {
+  auto &RT = CGM.getOpenMPRuntime();
+
+  // Dynamic scheduling of the outer loop (dynamic, guided, auto, runtime).
+  const bool DynamicOrOrdered = Ordered || RT.isDynamic(ScheduleKind);
+
+  assert((Ordered ||
+          !RT.isStaticNonchunked(ScheduleKind, /*Chunked=*/Chunk != nullptr)) &&
+         "static non-chunked schedule does not need outer loop");
+
+  // Emit outer loop.
+  //
+  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
+  // When schedule(dynamic,chunk_size) is specified, the iterations are
+  // distributed to threads in the team in chunks as the threads request them.
+  // Each thread executes a chunk of iterations, then requests another chunk,
+  // until no chunks remain to be distributed. Each chunk contains chunk_size
+  // iterations, except for the last chunk to be distributed, which may have
+  // fewer iterations. When no chunk_size is specified, it defaults to 1.
+  //
+  // When schedule(guided,chunk_size) is specified, the iterations are assigned
+  // to threads in the team in chunks as the executing threads request them.
+  // Each thread executes a chunk of iterations, then requests another chunk,
+  // until no chunks remain to be assigned. For a chunk_size of 1, the size of
+  // each chunk is proportional to the number of unassigned iterations divided
+  // by the number of threads in the team, decreasing to 1. For a chunk_size
+  // with value k (greater than 1), the size of each chunk is determined in the
+  // same way, with the restriction that the chunks do not contain fewer than k
+  // iterations (except for the last chunk to be assigned, which may have fewer
+  // than k iterations).
+  //
+  // When schedule(auto) is specified, the decision regarding scheduling is
+  // delegated to the compiler and/or runtime system. The programmer gives the
+  // implementation the freedom to choose any possible mapping of iterations to
+  // threads in the team.
+  //
+  // When schedule(runtime) is specified, the decision regarding scheduling is
+  // deferred until run time, and the schedule and chunk size are taken from the
+  // run-sched-var ICV. If the ICV is set to auto, the schedule is
+  // implementation defined
+  //
+  // while(__kmpc_dispatch_next(&LB, &UB)) {
+  //   idx = LB;
+  //   while (idx <= UB) { BODY; ++idx;
+  //   __kmpc_dispatch_fini_(4|8)[u](); // For ordered loops only.
+  //   } // inner loop
+  // }
+  //
+  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
+  // When schedule(static, chunk_size) is specified, iterations are divided into
+  // chunks of size chunk_size, and the chunks are assigned to the threads in
+  // the team in a round-robin fashion in the order of the thread number.
+  //
+  // while(UB = min(UB, GlobalUB), idx = LB, idx < UB) {
+  //   while (idx <= UB) { BODY; ++idx; } // inner loop
+  //   LB = LB + ST;
+  //   UB = UB + ST;
+  // }
+  //
+
+  const Expr *IVExpr = S.getIterationVariable();
+  const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
+  const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
+
+  if (DynamicOrOrdered) {
+    llvm::Value *UBVal = EmitScalarExpr(S.getLastIteration());
+    RT.emitForDispatchInit(*this, S.getLocStart(), ScheduleKind,
+                           IVSize, IVSigned, Ordered, UBVal, Chunk);
+  } else {
+    RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind, IVSize, IVSigned,
+                         Ordered, IL, LB, UB, ST, Chunk);
+  }
+
+  EmitOMPOuterLoop(IsMonotonic, DynamicOrOrdered, S, LoopScope, Ordered, LB,
+                   UB, ST, IL, Chunk);
+}
+
+void CodeGenFunction::EmitOMPDistributeOuterLoop(
+    OpenMPDistScheduleClauseKind ScheduleKind,
+    const OMPDistributeDirective &S, OMPPrivateScope &LoopScope,
+    Address LB, Address UB, Address ST, Address IL, llvm::Value *Chunk) {
+
+  auto &RT = CGM.getOpenMPRuntime();
+
+  // Emit outer loop.
+  // Same behavior as EmitOMPForOuterLoop, except that the schedule cannot
+  // be dynamic.
+  //
+
+  const Expr *IVExpr = S.getIterationVariable();
+  const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
+  const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
+
+  RT.emitDistributeStaticInit(*this, S.getLocStart(), ScheduleKind, IVSize,
+                              IVSigned, /*Ordered=*/false, IL, LB, UB, ST,
+                              Chunk);
+
+  EmitOMPOuterLoop(/*IsMonotonic=*/false, /*DynamicOrOrdered=*/false, S,
+                   LoopScope, /*Ordered=*/false, LB, UB, ST, IL, Chunk);
 }
 
 /// \brief Emit a helper variable and return corresponding lvalue.
@@ -2146,9 +2183,161 @@
   }(), S.getLocStart());
 }
 
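+/// Compute the chunk value and schedule kind specified by the dist_schedule
+/// clause of \a S, if any. If \a OuterRegion is true and a helper variable is
+/// available, the chunk expression is emitted into that variable for reuse by
+/// the inner region; otherwise the chunk value itself is emitted and returned.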
+static std::pair<llvm::Value * /*Chunk*/, OpenMPDistScheduleClauseKind>
+emitDistScheduleClause(CodeGenFunction &CGF, const OMPDistributeDirective &S,
+                       bool OuterRegion) {
+  // Detect the distribute schedule kind and chunk.
+  auto ScheduleKind = OMPC_DIST_SCHEDULE_unknown;
+  llvm::Value *Chunk = nullptr;
+  if (const auto *C = S.getSingleClause<OMPDistScheduleClause>()) {
+    ScheduleKind = C->getDistScheduleKind();
+    if (const auto *Ch = C->getChunkSize()) {
+      if (auto *ImpRef = cast_or_null<DeclRefExpr>(C->getHelperChunkSize())) {
+        if (OuterRegion) {
+          const VarDecl *ImpVar = cast<VarDecl>(ImpRef->getDecl());
+          CGF.EmitVarDecl(*ImpVar);
+          CGF.EmitStoreThroughLValue(
+              CGF.EmitAnyExpr(Ch),
+              CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(ImpVar),
+                                 ImpVar->getType()));
+        } else {
+          Ch = ImpRef;
+        }
+      }
+      if (!C->getHelperChunkSize() || !OuterRegion) {
+        Chunk = CGF.EmitScalarExpr(Ch);
+        Chunk = CGF.EmitScalarConversion(Chunk, Ch->getType(),
+                                         S.getIterationVariable()->getType(),
+                                         S.getLocStart());
+      }
+    }
+  }
+  return std::make_pair(Chunk, ScheduleKind);
+}
+
+void CodeGenFunction::EmitOMPDistributeLoop(const OMPDistributeDirective &S) {
+  // Emit the loop iteration variable.
+  auto IVExpr = cast<DeclRefExpr>(S.getIterationVariable());
+  auto IVDecl = cast<VarDecl>(IVExpr->getDecl());
+  EmitVarDecl(*IVDecl);
+
+  // Emit the iterations count variable.
+  // If it is not a variable, Sema decided to calculate iterations count on each
+  // iteration (e.g., it is foldable into a constant).
+  if (auto LIExpr = dyn_cast<DeclRefExpr>(S.getLastIteration())) {
+    EmitVarDecl(*cast<VarDecl>(LIExpr->getDecl()));
+    // Emit calculation of the iterations count.
+    EmitIgnoredExpr(S.getCalcLastIteration());
+  }
+
+  auto &RT = CGM.getOpenMPRuntime();
+
+  // Check pre-condition.
+  {
+    // Skip the entire loop if we don't meet the precondition.
+    // If the condition constant folds and can be elided, avoid emitting the
+    // whole loop.
+    bool CondConstant;
+    llvm::BasicBlock *ContBlock = nullptr;
+    if (ConstantFoldsToSimpleInteger(S.getPreCond(), CondConstant)) {
+      if (!CondConstant)
+        return;
+    } else {
+      auto *ThenBlock = createBasicBlock("omp.precond.then");
+      ContBlock = createBasicBlock("omp.precond.end");
+      emitPreCond(*this, S, S.getPreCond(), ThenBlock, ContBlock,
+                  getProfileCount(&S));
+      EmitBlock(ThenBlock);
+      incrementProfileCounter(&S);
+    }
+
+    // Emit 'then' code.
+    {
+      // Emit helper vars inits.
+      LValue LB =
+          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getLowerBoundVariable()));
+      LValue UB =
+          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getUpperBoundVariable()));
+      LValue ST =
+          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getStrideVariable()));
+      LValue IL =
+          EmitOMPHelperVar(*this, cast<DeclRefExpr>(S.getIsLastIterVariable()));
+
+      OMPPrivateScope LoopScope(*this);
+      emitPrivateLoopCounters(*this, LoopScope, S.counters(),
+                              S.private_counters());
+      (void)LoopScope.Privatize();
+
+      // Detect the loop schedule kind and chunk.
+      llvm::Value *Chunk;
+      OpenMPDistScheduleClauseKind ScheduleKind;
+      auto ScheduleInfo =
+          emitDistScheduleClause(*this, S, /*OuterRegion=*/false);
+      Chunk = ScheduleInfo.first;
+      ScheduleKind = ScheduleInfo.second;
+      const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
+      const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
+
+      // OpenMP [2.10.8, distribute Construct, Description]
+      // If dist_schedule is specified, kind must be static. If specified,
+      // iterations are divided into chunks of size chunk_size, chunks are
+      // assigned to the teams of the league in a round-robin fashion in the
+      // order of the team number. When no chunk_size is specified, the
+      // iteration space is divided into chunks that are approximately equal
+      // in size, and at most one chunk is distributed to each team of the
+      // league. The size of the chunks is unspecified in this case.
+      if (RT.isStaticNonchunked(ScheduleKind,
+                                /*Chunked=*/Chunk != nullptr)) {
+        RT.emitDistributeStaticInit(*this, S.getLocStart(), ScheduleKind,
+                                    IVSize, IVSigned, /*Ordered=*/false,
+                                    IL.getAddress(), LB.getAddress(),
+                                    UB.getAddress(), ST.getAddress());
+        auto LoopExit =
+            getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit"));
+        // UB = min(UB, GlobalUB);
+        EmitIgnoredExpr(S.getEnsureUpperBound());
+        // IV = LB;
+        EmitIgnoredExpr(S.getInit());
+        // while (idx <= UB) { BODY; ++idx; }
+        EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(),
+                         S.getInc(),
+                         [&S, LoopExit](CodeGenFunction &CGF) {
+                           CGF.EmitOMPLoopBody(S, LoopExit);
+                           CGF.EmitStopPoint(&S);
+                         },
+                         [](CodeGenFunction &) {});
+        EmitBlock(LoopExit.getBlock());
+        // Tell the runtime we are done.
+        RT.emitForStaticFinish(*this, S.getLocStart());
+      } else {
+        // Emit the outer loop, which requests its work chunk [LB..UB] from
+        // runtime and runs the inner loop to process it.
+        EmitOMPDistributeOuterLoop(ScheduleKind, S, LoopScope, LB.getAddress(),
+                                   UB.getAddress(), ST.getAddress(),
+                                   IL.getAddress(), Chunk);
+      }
+    }
+
+    // We're now done with the loop, so jump to the continuation block.
+    if (ContBlock) {
+      EmitBranch(ContBlock);
+      EmitBlock(ContBlock, true);
+    }
+  }
+}
+
 void CodeGenFunction::EmitOMPDistributeDirective(
     const OMPDistributeDirective &S) {
-  llvm_unreachable("CodeGen for 'omp distribute' is not supported yet.");
+  LexicalScope Scope(*this, S.getSourceRange());
+  auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+    CGF.EmitOMPDistributeLoop(S);
+  };
+  CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_distribute, CodeGen,
+                                              /*HasCancel=*/false);
 }
 
 static llvm::Function *emitOutlinedOrderedFunction(CodeGenModule &CGM,
Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -182,6 +182,9 @@
     OMPRTL__tgt_unregister_lib,
   };
 
+  CodeGenModule &CGM;
+
+public:
   /// \brief Values for bit flags used in the ident_t to describe the fields.
   /// All enumeric elements are named and described in accordance with the code
   /// from http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
@@ -203,16 +206,7 @@
     /// \brief Implicit barrier in 'single' directive.
     OMP_IDENT_BARRIER_IMPL_SINGLE = 0x140
   };
-  CodeGenModule &CGM;
-  /// \brief Default const ident_t object used for initialization of all other
-  /// ident_t objects.
-  llvm::Constant *DefaultOpenMPPSource;
-  /// \brief Map of flags and corresponding default locations.
-  typedef llvm::DenseMap<unsigned, llvm::Value *> OpenMPDefaultLocMapTy;
-  OpenMPDefaultLocMapTy OpenMPDefaultLocMap;
-  Address getOrCreateDefaultLocation(OpenMPLocationFlags Flags);
 
-public:
   /// \brief Describes ident structure that describes a source location.
   /// All descriptions are taken from
   /// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
@@ -254,6 +248,14 @@
     IdentField_PSource
   };
 private:
+  /// \brief Default const ident_t object used for initialization of all other
+  /// ident_t objects.
+  llvm::Constant *DefaultOpenMPPSource;
+  /// \brief Map of flags and corresponding default locations.
+  typedef llvm::DenseMap<unsigned, llvm::Value *> OpenMPDefaultLocMapTy;
+  OpenMPDefaultLocMapTy OpenMPDefaultLocMap;
+  Address getOrCreateDefaultLocation(OpenMPLocationFlags Flags);
+
   llvm::StructType *IdentTy;
   /// \brief Map for SourceLocation and OpenMP runtime library debug locations.
   typedef llvm::DenseMap<unsigned, llvm::Value *> OpenMPDebugLocMapTy;
@@ -482,12 +484,6 @@
   /// \brief Build type kmp_routine_entry_t (if not built yet).
   void emitKmpRoutineEntryT(QualType KmpInt32Ty);
 
-  /// \brief Emits object of ident_t type with info for source location.
-  /// \param Flags Flags for OpenMP location.
-  ///
-  llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
-                                  OpenMPLocationFlags Flags = OMP_IDENT_KMPC);
-
   /// \brief Returns pointer to ident_t type.
   llvm::Type *getIdentTyPointerTy();
 
@@ -499,10 +495,6 @@
   /// \return Specified function.
   llvm::Constant *createRuntimeFunction(OpenMPRTLFunction Function);
 
-  /// \brief Returns __kmpc_for_static_init_* runtime function for the specified
-  /// size \a IVSize and sign \a IVSigned.
-  llvm::Constant *createForStaticInitFunction(unsigned IVSize, bool IVSigned);
-
   /// \brief Returns __kmpc_dispatch_init_* runtime function for the specified
   /// size \a IVSize and sign \a IVSigned.
   llvm::Constant *createDispatchInitFunction(unsigned IVSize, bool IVSigned);
@@ -526,10 +518,6 @@
   /// stored.
   virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc);
 
-  /// \brief Gets thread id value for the current thread.
-  ///
-  llvm::Value *getThreadID(CodeGenFunction &CGF, SourceLocation Loc);
-
   /// \brief Gets (if variable with the given name already exist) or creates
   /// internal global variable with the specified Name. The created variable has
   /// linkage CommonLinkage by default and is initialized by null value.
@@ -564,6 +552,20 @@
   virtual ~CGOpenMPRuntime() {}
   virtual void clear();
 
+  /// \brief Emits object of ident_t type with info for source location.
+  /// \param Flags Flags for OpenMP location.
+  ///
+  llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
+                                  OpenMPLocationFlags Flags = OMP_IDENT_KMPC);
+
+  /// \brief Gets thread id value for the current thread.
+  ///
+  llvm::Value *getThreadID(CodeGenFunction &CGF, SourceLocation Loc);
+
+  /// \brief Returns __kmpc_for_static_init_* runtime function for the specified
+  /// size \a IVSize and sign \a IVSigned.
+  llvm::Constant *createForStaticInitFunction(unsigned IVSize, bool IVSigned);
+
   /// \brief Emits outlined function for the specified OpenMP parallel directive
   /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
   /// kmp_int32 BoundID, struct context_vars*).
@@ -674,6 +676,14 @@
   virtual bool isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
                                   bool Chunked) const;
 
+  /// \brief Check if the specified \a ScheduleKind is static non-chunked.
+  /// This kind of distribute directive is emitted without outer loop.
+  /// \param ScheduleKind Schedule kind specified in the 'dist_schedule' clause.
+  /// \param Chunked True if chunk is specified in the clause.
+  ///
+  virtual bool isStaticNonchunked(OpenMPDistScheduleClauseKind ScheduleKind,
+                                  bool Chunked) const;
+
   /// \brief Check if the specified \a ScheduleKind is dynamic.
   /// This kind of worksharing directive is emitted without outer loop.
   /// \param ScheduleKind Schedule Kind specified in the 'schedule' clause.
@@ -686,12 +696,6 @@
                                    bool Ordered, llvm::Value *UB,
                                    llvm::Value *Chunk = nullptr);
 
-  /// \brief Call the appropriate runtime routine to initialize it before start
-  /// of loop.
-  ///
-  /// Depending on the loop schedule, it is nesessary to call some runtime
-  /// routine before start of the OpenMP loop to get the loop upper / lower
-  /// bounds \a LB and \a UB and stride \a ST.
   ///
   /// \param CGF Reference to current CodeGenFunction.
   /// \param Loc Clang source location.
@@ -717,6 +727,34 @@
                                  Address UB, Address ST,
                                  llvm::Value *Chunk = nullptr);
 
+  /// \brief Call the appropriate runtime routine to initialize it before start
+  /// of the distribute loop.
+  ///
+  /// \param CGF Reference to current CodeGenFunction.
+  /// \param Loc Clang source location.
+  /// \param SchedKind Schedule kind, specified by the 'dist_schedule' clause.
+  /// \param IVSize Size of the iteration variable in bits.
+  /// \param IVSigned Sign of the iteration variable.
+  /// \param Ordered true if loop is ordered, false otherwise.
+  /// \param IL Address of the output variable in which the flag of the
+  /// last iteration is returned.
+  /// \param LB Address of the output variable in which the lower iteration
+  /// number is returned.
+  /// \param UB Address of the output variable in which the upper iteration
+  /// number is returned.
+  /// \param ST Address of the output variable in which the stride value is
+  /// returned; necessary to generate the static_chunked scheduled loop.
+  /// \param Chunk Value of the chunk for the static_chunked scheduled loop.
+  /// For the default (nullptr) value, the chunk 1 will be used.
+  ///
+  virtual void emitDistributeStaticInit(CodeGenFunction &CGF,
+                                        SourceLocation Loc,
+                                        OpenMPDistScheduleClauseKind SchedKind,
+                                        unsigned IVSize, bool IVSigned,
+                                        bool Ordered, Address IL, Address LB,
+                                        Address UB, Address ST,
+                                        llvm::Value *Chunk = nullptr);
+
   /// \brief Call the appropriate runtime routine to notify that we finished
   /// iteration of the ordered loop with the dynamic scheduling.
   ///
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1756,26 +1756,29 @@
 }
 
 /// \brief Schedule types for 'omp for' loops (these enumerators are taken from
 /// the enum sched_type in kmp.h).
 enum OpenMPSchedType {
   /// \brief Lower bound for default (unordered) versions.
   OMP_sch_lower = 32,
   OMP_sch_static_chunked = 33,
   OMP_sch_static = 34,
   OMP_sch_dynamic_chunked = 35,
   OMP_sch_guided_chunked = 36,
   OMP_sch_runtime = 37,
   OMP_sch_auto = 38,
   /// \brief Lower bound for 'ordered' versions.
   OMP_ord_lower = 64,
   OMP_ord_static_chunked = 65,
   OMP_ord_static = 66,
   OMP_ord_dynamic_chunked = 67,
   OMP_ord_guided_chunked = 68,
   OMP_ord_runtime = 69,
   OMP_ord_auto = 70,
+  /// \brief dist_schedule types.
+  OMP_dist_sch_static_chunked = 91,
+  OMP_dist_sch_static = 92,
   OMP_sch_default = OMP_sch_static,
 };
 
 /// \brief Map the OpenMP loop schedule to the runtime enumeration.
 static OpenMPSchedType getRuntimeSchedule(OpenMPScheduleClauseKind ScheduleKind,
@@ -1799,12 +1802,25 @@
   llvm_unreachable("Unexpected runtime schedule");
 }
 
+/// \brief Map the OpenMP distribute schedule to the runtime enumeration.
+static OpenMPSchedType
+getRuntimeSchedule(OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) {
+  // Only 'static' is allowed for dist_schedule.
+  return Chunked ? OMP_dist_sch_static_chunked : OMP_dist_sch_static;
+}
+
 bool CGOpenMPRuntime::isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
                                          bool Chunked) const {
   auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked, /*Ordered=*/false);
   return Schedule == OMP_sch_static;
 }
 
+bool CGOpenMPRuntime::isStaticNonchunked(
+    OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) const {
+  auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked);
+  return Schedule == OMP_dist_sch_static;
+}
+
 bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const {
   auto Schedule =
       getRuntimeSchedule(ScheduleKind, /*Chunked=*/false, /*Ordered=*/false);
@@ -1845,48 +1861,82 @@
   CGF.EmitRuntimeCall(createDispatchInitFunction(IVSize, IVSigned), Args);
 }
 
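+/// \brief Emit a call to the __kmpc_for_static_init_* runtime function. This
+/// helper is shared by the 'schedule' ('omp for') and 'dist_schedule'
+/// ('omp distribute') codegen paths; \a IsForSchedule selects which clause
+/// kind is mapped onto the runtime schedule enumeration.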
+static void emitForStaticInitCall(CGOpenMPRuntime &RT, CodeGenFunction &CGF,
+                                  SourceLocation Loc,
+                                  OpenMPScheduleClauseKind ForSchedKind,
+                                  OpenMPDistScheduleClauseKind DistSchedKind,
+                                  bool IsForSchedule, unsigned IVSize,
+                                  bool IVSigned, bool Ordered, Address IL,
+                                  Address LB, Address UB, Address ST,
+                                  llvm::Value *Chunk) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  OpenMPSchedType Schedule =
+      IsForSchedule
+          ? getRuntimeSchedule(ForSchedKind, Chunk != nullptr, Ordered)
+          : getRuntimeSchedule(DistSchedKind, Chunk != nullptr);
+  assert(!Ordered);
+  assert(Schedule == OMP_sch_static || Schedule == OMP_sch_static_chunked ||
+         Schedule == OMP_ord_static || Schedule == OMP_ord_static_chunked ||
+         Schedule == OMP_dist_sch_static ||
+         Schedule == OMP_dist_sch_static_chunked);
+
+  // Call __kmpc_for_static_init(
+  //          ident_t *loc, kmp_int32 tid, kmp_int32 schedtype,
+  //          kmp_int32 *p_lastiter, kmp_int[32|64] *p_lower,
+  //          kmp_int[32|64] *p_upper, kmp_int[32|64] *p_stride,
+  //          kmp_int[32|64] incr, kmp_int[32|64] chunk);
+  if (Chunk == nullptr) {
+    assert((Schedule == OMP_sch_static || Schedule == OMP_ord_static ||
+            Schedule == OMP_dist_sch_static) &&
+           "expected static non-chunked schedule");
+    // If the Chunk was not specified in the clause - use default value 1.
+    Chunk = CGF.Builder.getIntN(IVSize, 1);
+  } else {
+    assert((Schedule == OMP_sch_static_chunked ||
+            Schedule == OMP_ord_static_chunked ||
+            Schedule == OMP_dist_sch_static_chunked) &&
+           "expected static chunked schedule");
+  }
+  llvm::Value *Args[] = {
+    RT.emitUpdateLocation(CGF, Loc, RT.OMP_IDENT_KMPC),
+    RT.getThreadID(CGF, Loc),
+    CGF.Builder.getInt32(Schedule), // Schedule type
+    IL.getPointer(),                // &isLastIter
+    LB.getPointer(),                // &LB
+    UB.getPointer(),                // &UB
+    ST.getPointer(),                // &Stride
+    CGF.Builder.getIntN(IVSize, 1), // Incr
+    Chunk                           // Chunk
+  };
+  CGF.EmitRuntimeCall(RT.createForStaticInitFunction(IVSize, IVSigned), Args);
+}
+
 void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
                                         SourceLocation Loc,
-                                        OpenMPScheduleClauseKind ScheduleKind,
+                                        OpenMPScheduleClauseKind SchedKind,
                                         unsigned IVSize, bool IVSigned,
                                         bool Ordered, Address IL, Address LB,
                                         Address UB, Address ST,
                                         llvm::Value *Chunk) {
-  if (!CGF.HaveInsertPoint())
-    return;
-  OpenMPSchedType Schedule =
-    getRuntimeSchedule(ScheduleKind, Chunk != nullptr, Ordered);
-  assert(!Ordered);
-  assert(Schedule == OMP_sch_static || Schedule == OMP_sch_static_chunked ||
-         Schedule == OMP_ord_static || Schedule == OMP_ord_static_chunked);
-
-  // Call __kmpc_for_static_init(
-  //          ident_t *loc, kmp_int32 tid, kmp_int32 schedtype,
-  //          kmp_int32 *p_lastiter, kmp_int[32|64] *p_lower,
-  //          kmp_int[32|64] *p_upper, kmp_int[32|64] *p_stride,
-  //          kmp_int[32|64] incr, kmp_int[32|64] chunk);
-  if (Chunk == nullptr) {
-    assert((Schedule == OMP_sch_static || Schedule == OMP_ord_static) &&
-           "expected static non-chunked schedule");
-    // If the Chunk was not specified in the clause - use default value 1.
-      Chunk = CGF.Builder.getIntN(IVSize, 1);
-  } else {
-    assert((Schedule == OMP_sch_static_chunked ||
-            Schedule == OMP_ord_static_chunked) &&
-           "expected static chunked schedule");
-  }
-  llvm::Value *Args[] = {
-    emitUpdateLocation(CGF, Loc, OMP_IDENT_KMPC),
-    getThreadID(CGF, Loc),
-    CGF.Builder.getInt32(Schedule), // Schedule type
-    IL.getPointer(),                // &isLastIter
-    LB.getPointer(),                // &LB
-    UB.getPointer(),                // &UB
-    ST.getPointer(),                // &Stride
-    CGF.Builder.getIntN(IVSize, 1), // Incr
-    Chunk                           // Chunk
-  };
-  CGF.EmitRuntimeCall(createForStaticInitFunction(IVSize, IVSigned), Args);
+  emitForStaticInitCall(*this, CGF, Loc, SchedKind, OMPC_DIST_SCHEDULE_unknown,
+                        /*IsForSchedule=*/true, IVSize, IVSigned, Ordered, IL,
+                        LB, UB, ST, Chunk);
+}
+
+void CGOpenMPRuntime::emitDistributeStaticInit(
+    CodeGenFunction &CGF, SourceLocation Loc,
+    OpenMPDistScheduleClauseKind SchedKind, unsigned IVSize, bool IVSigned,
+    bool Ordered, Address IL, Address LB, Address UB, Address ST,
+    llvm::Value *Chunk) {
+  emitForStaticInitCall(*this, CGF, Loc, OMPC_SCHEDULE_unknown, SchedKind,
+                        /*IsForSchedule=*/false, IVSize, IVSigned, Ordered, IL,
+                        LB, UB, ST, Chunk);
 }
 
 void CGOpenMPRuntime::emitForStaticFinish(CodeGenFunction &CGF,
Index: include/clang/AST/StmtOpenMP.h
===================================================================
--- include/clang/AST/StmtOpenMP.h
+++ include/clang/AST/StmtOpenMP.h
@@ -595,49 +595,56 @@
   }
   Expr *getIsLastIterVariable() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), IsLastIterVariableOffset)));
   }
   Expr *getLowerBoundVariable() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), LowerBoundVariableOffset)));
   }
   Expr *getUpperBoundVariable() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), UpperBoundVariableOffset)));
   }
   Expr *getStrideVariable() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), StrideVariableOffset)));
   }
   Expr *getEnsureUpperBound() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), EnsureUpperBoundOffset)));
   }
   Expr *getNextLowerBound() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), NextLowerBoundOffset)));
   }
   Expr *getNextUpperBound() const {
     assert((isOpenMPWorksharingDirective(getDirectiveKind()) ||
-            isOpenMPTaskLoopDirective(getDirectiveKind())) &&
+            isOpenMPTaskLoopDirective(getDirectiveKind()) ||
+            isOpenMPDistributeDirective(getDirectiveKind())) &&
            "expected worksharing loop directive");
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), NextUpperBoundOffset)));