Hi doug.gregor, rjmccall, hfinkel, rsmith, ABataev,

Please review codegen implementation of the ‘omp for’ loop with dynamic 
schedule kinds.

http://reviews.llvm.org/D7138

Files:
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGStmtOpenMP.cpp
  test/OpenMP/for_codegen.cpp

EMAIL PREFERENCES
  http://reviews.llvm.org/settings/panel/emailpreferences/
Index: test/OpenMP/for_codegen.cpp
===================================================================
--- test/OpenMP/for_codegen.cpp
+++ test/OpenMP/for_codegen.cpp
@@ -146,5 +146,176 @@
 // CHECK: ret void
 }
 
+// CHECK-LABEL: define {{.*void}} @{{.*}}dynamic1{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void dynamic1(float *a, float *b, float *c, float *d) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(dynamic)
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 35, i64 0, i64 16908287, i64 1, i64 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+
+// CHECK-NEXT: [[LB:%.+]] = load i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned long long i = 131071; i < 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i64* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}guided7{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void guided7(float *a, float *b, float *c, float *d) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(guided, 7)
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 36, i64 0, i64 16908287, i64 1, i64 7)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+
+// CHECK-NEXT: [[LB:%.+]] = load i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned long long i = 131071; i < 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i64* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}test_auto{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void test_auto(float *a, float *b, float *c, float *d) {
+  unsigned int x = 0;
+  unsigned int y = 0;
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for schedule(auto) collapse(2)
+// CHECK: call void @__kmpc_dispatch_init_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 38, i64 0, i64 [[LAST_ITER:%[^,]+]], i64 1, i64 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+
+// CHECK-NEXT: [[LB:%.+]] = load i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+// FIXME: When the iteration count of some nested loop is not a known constant,
+// we should pre-calculate it, like we do for the total number of iterations!
+  for (char i = static_cast<char>(y); i <= '9'; ++i)
+    for (x = 11; x > 0; --x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}runtime{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void runtime(float *a, float *b, float *c, float *d) {
+  int x = 0;
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
+  #pragma omp for collapse(2) schedule(runtime)
+// CHECK: call void @__kmpc_dispatch_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 37, i32 0, i32 199, i32 1, i32 1)
+//
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+
+// CHECK-NEXT: [[LB:%.+]] = load i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i32* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned char i = '0' ; i <= '9'; ++i)
+    for (x = -10; x < 10; ++x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
 #endif // HEADER
 
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -423,6 +423,134 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_for_static_fini");
     break;
   }
+  case OMPRTL__kmpc_dispatch_init_4: {
+    auto ITy = CGM.Int32Ty;
+    llvm::Type *TypeParams[] = {
+        getIdentTyPointerTy(), // loc
+        CGM.Int32Ty,           // tid
+        CGM.Int32Ty,           // schedtype
+        ITy,                   // lower
+        ITy,                   // upper
+        ITy,                   // stride
+        ITy                    // chunk
+    };
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_dispatch_init_4");
+    break;
+  }
+  case OMPRTL__kmpc_dispatch_init_4u: {
+    auto ITy = CGM.Int32Ty;
+    llvm::Type *TypeParams[] = {
+        getIdentTyPointerTy(), // loc
+        CGM.Int32Ty,           // tid
+        CGM.Int32Ty,           // schedtype
+        ITy,                   // lower
+        ITy,                   // upper
+        ITy,                   // stride
+        ITy                    // chunk
+    };
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_dispatch_init_4u");
+    break;
+  }
+  case OMPRTL__kmpc_dispatch_init_8: {
+    auto ITy = CGM.Int64Ty;
+    llvm::Type *TypeParams[] = {
+        getIdentTyPointerTy(), // loc
+        CGM.Int32Ty,           // tid
+        CGM.Int32Ty,           // schedtype
+        ITy,                   // lower
+        ITy,                   // upper
+        ITy,                   // stride
+        ITy                    // chunk
+    };
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_dispatch_init_8");
+    break;
+  }
+  case OMPRTL__kmpc_dispatch_init_8u: {
+    auto ITy = CGM.Int64Ty;
+    llvm::Type *TypeParams[] = {
+        getIdentTyPointerTy(), // loc
+        CGM.Int32Ty,           // tid
+        CGM.Int32Ty,           // schedtype
+        ITy,                   // lower
+        ITy,                   // upper
+        ITy,                   // stride
+        ITy                    // chunk
+    };
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_dispatch_init_8u");
+    break;
+  }
+  case OMPRTL__kmpc_dispatch_next_4: {
+    auto ITy = CGM.Int32Ty;
+    auto PtrTy = llvm::PointerType::getUnqual(ITy);
+    llvm::Type *TypeParams[] = {
+        getIdentTyPointerTy(),                     // loc
+        CGM.Int32Ty,                               // tid
+        llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
+        PtrTy,                                     // p_lower
+        PtrTy,                                     // p_upper
+        PtrTy                                      // p_stride
+    };
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_dispatch_next_4");
+    break;
+  }
+  case OMPRTL__kmpc_dispatch_next_4u: {
+    auto ITy = CGM.Int32Ty;
+    auto PtrTy = llvm::PointerType::getUnqual(ITy);
+    llvm::Type *TypeParams[] = {
+        getIdentTyPointerTy(),                     // loc
+        CGM.Int32Ty,                               // tid
+        llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
+        PtrTy,                                     // p_lower
+        PtrTy,                                     // p_upper
+        PtrTy                                      // p_stride
+    };
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_dispatch_next_4u");
+    break;
+  }
+  case OMPRTL__kmpc_dispatch_next_8: {
+    auto ITy = CGM.Int64Ty;
+    auto PtrTy = llvm::PointerType::getUnqual(ITy);
+    llvm::Type *TypeParams[] = {
+        getIdentTyPointerTy(),                     // loc
+        CGM.Int32Ty,                               // tid
+        llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
+        PtrTy,                                     // p_lower
+        PtrTy,                                     // p_upper
+        PtrTy                                      // p_stride
+    };
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_dispatch_next_8");
+    break;
+  }
+  case OMPRTL__kmpc_dispatch_next_8u: {
+    auto ITy = CGM.Int64Ty;
+    auto PtrTy = llvm::PointerType::getUnqual(ITy);
+    llvm::Type *TypeParams[] = {
+        getIdentTyPointerTy(),                     // loc
+        CGM.Int32Ty,                               // tid
+        llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
+        PtrTy,                                     // p_lower
+        PtrTy,                                     // p_upper
+        PtrTy                                      // p_stride
+    };
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_dispatch_next_8u");
+    break;
+  }
   case OMPRTL__kmpc_push_num_threads: {
     // Build void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
     // kmp_int32 num_threads)
@@ -861,36 +989,67 @@
                                      llvm::Value *UB, llvm::Value *ST,
                                      llvm::Value *Chunk) {
   OpenMPSchedType Schedule = getRuntimeSchedule(ScheduleKind, Chunk != nullptr);
-  // Call __kmpc_for_static_init(
-  //          ident_t *loc, kmp_int32 tid, kmp_int32 schedtype,
-  //          kmp_int32 *p_lastiter, kmp_int[32|64] *p_lower,
-  //          kmp_int[32|64] *p_upper, kmp_int[32|64] *p_stride,
-  //          kmp_int[32|64] incr, kmp_int[32|64] chunk);
-  // TODO: Implement dynamic schedule.
-
-  // If the Chunk was not specified in the clause - use default value 1.
-  if (Chunk == nullptr)
-    Chunk = CGF.Builder.getIntN(IVSize, /*C*/ 1);
-
-  llvm::Value *Args[] = {
-      EmitOpenMPUpdateLocation(CGF, Loc, OMP_IDENT_KMPC),
-      GetOpenMPThreadID(CGF, Loc),
-      CGF.Builder.getInt32(Schedule), // Schedule type
-      IL,                             // &isLastIter
-      LB,                             // &LB
-      UB,                             // &UB
-      ST,                             // &Stride
-      CGF.Builder.getIntN(IVSize, 1), // Incr
-      Chunk                           // Chunk
-  };
-  assert((IVSize == 32 || IVSize == 64) &&
-         "Index size is not compatible with the omp runtime");
-  auto F = IVSize == 32 ? (IVSigned ? OMPRTL__kmpc_for_static_init_4
-                                    : OMPRTL__kmpc_for_static_init_4u)
-                        : (IVSigned ? OMPRTL__kmpc_for_static_init_8
-                                    : OMPRTL__kmpc_for_static_init_8u);
-  auto RTLFn = CreateRuntimeFunction(F);
-  CGF.EmitRuntimeCall(RTLFn, Args);
+  if (Schedule != OMP_sch_static && Schedule != OMP_sch_static_chunked) {
+    // Call __kmpc_dispatch_init(
+    //          ident_t *loc, kmp_int32 tid, kmp_int32 schedule,
+    //          kmp_int[32|64] lower, kmp_int[32|64] upper,
+    //          kmp_int[32|64] stride, kmp_int[32|64] chunk);
+
+    // If the Chunk was not specified in the clause - use default value 1.
+    if (Chunk == nullptr)
+      Chunk = CGF.Builder.getIntN(IVSize, 1);
+
+    llvm::Value *Args[] = {
+        EmitOpenMPUpdateLocation(CGF, Loc, OMP_IDENT_KMPC),
+        GetOpenMPThreadID(CGF, Loc),    // Tid
+        CGF.Builder.getInt32(Schedule), // Schedule type
+        CGF.Builder.getIntN(IVSize, 0), // Lower
+        UB,                             // Upper
+        CGF.Builder.getIntN(IVSize, 1), // Stride
+        Chunk                           // Chunk
+    };
+    assert((IVSize == 32 || IVSize == 64) &&
+           "IV size is not compatible with the omp runtime");
+    auto F = IVSize == 32 ? (IVSigned ? OMPRTL__kmpc_dispatch_init_4
+                                      : OMPRTL__kmpc_dispatch_init_4u)
+                          : (IVSigned ? OMPRTL__kmpc_dispatch_init_8
+                                      : OMPRTL__kmpc_dispatch_init_8u);
+    auto RTLFn = CreateRuntimeFunction(F);
+    CGF.EmitRuntimeCall(RTLFn, Args);
+  } else {
+    // Call __kmpc_for_static_init(
+    //          ident_t *loc, kmp_int32 tid, kmp_int32 schedtype,
+    //          kmp_int32 *p_lastiter, kmp_int[32|64] *p_lower,
+    //          kmp_int[32|64] *p_upper, kmp_int[32|64] *p_stride,
+    //          kmp_int[32|64] incr, kmp_int[32|64] chunk);
+    if (Chunk == nullptr) {
+      assert(Schedule == OMP_sch_static &&
+             "expected static non-chunked schedule");
+      // If the Chunk was not specified in the clause - use default value 1.
+      Chunk = CGF.Builder.getIntN(IVSize, 1);
+    } else
+      assert(Schedule == OMP_sch_static_chunked &&
+             "expected static chunked schedule");
+    llvm::Value *Args[] = {
+        EmitOpenMPUpdateLocation(CGF, Loc, OMP_IDENT_KMPC),
+        GetOpenMPThreadID(CGF, Loc),
+        CGF.Builder.getInt32(Schedule), // Schedule type
+        IL,                             // &isLastIter
+        LB,                             // &LB
+        UB,                             // &UB
+        ST,                             // &Stride
+        CGF.Builder.getIntN(IVSize, 1), // Incr
+        Chunk                           // Chunk
+    };
+    assert((IVSize == 32 || IVSize == 64) &&
+           "IV size is not compatible with the omp runtime");
+    auto F = IVSize == 32 ? (IVSigned ? OMPRTL__kmpc_for_static_init_4
+                                      : OMPRTL__kmpc_for_static_init_4u)
+                          : (IVSigned ? OMPRTL__kmpc_for_static_init_8
+                                      : OMPRTL__kmpc_for_static_init_8u);
+    auto RTLFn = CreateRuntimeFunction(F);
+    CGF.EmitRuntimeCall(RTLFn, Args);
+  }
 }
 
 void CGOpenMPRuntime::EmitOMPForFinish(CodeGenFunction &CGF, SourceLocation Loc,
@@ -905,6 +1064,31 @@
   CGF.EmitRuntimeCall(RTLFn, Args);
 }
 
+llvm::CallInst *CGOpenMPRuntime::EmitOMPForNext(
+    CodeGenFunction &CGF, SourceLocation Loc, unsigned IVSize, bool IVSigned,
+    llvm::Value *IL, llvm::Value *LB, llvm::Value *UB, llvm::Value *ST) {
+  // Call __kmpc_dispatch_next(
+  //          ident_t *loc, kmp_int32 tid, kmp_int32 *p_lastiter,
+  //          kmp_int[32|64] *p_lower, kmp_int[32|64] *p_upper,
+  //          kmp_int[32|64] *p_stride);
+  llvm::Value *Args[] = {
+      EmitOpenMPUpdateLocation(CGF, Loc, OMP_IDENT_KMPC),
+      GetOpenMPThreadID(CGF, Loc), // Tid
+      IL,                          // &isLastIter
+      LB,                          // &Lower
+      UB,                          // &Upper
+      ST                           // &Stride
+  };
+  assert((IVSize == 32 || IVSize == 64) &&
+         "IV size is not compatible with the omp runtime");
+  auto F = IVSize == 32 ? (IVSigned ? OMPRTL__kmpc_dispatch_next_4
+                                    : OMPRTL__kmpc_dispatch_next_4u)
+                        : (IVSigned ? OMPRTL__kmpc_dispatch_next_8
+                                    : OMPRTL__kmpc_dispatch_next_8u);
+  auto RTLFn = CreateRuntimeFunction(F);
+  return CGF.EmitRuntimeCall(RTLFn, Args, "has_work_to_do");
+}
+
 void CGOpenMPRuntime::EmitOMPNumThreadsClause(CodeGenFunction &CGF,
                                               llvm::Value *NumThreads,
                                               SourceLocation Loc) {
Index: lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- lib/CodeGen/CGStmtOpenMP.cpp
+++ lib/CodeGen/CGStmtOpenMP.cpp
@@ -507,16 +507,50 @@
                                           llvm::Value *ST, llvm::Value *IL,
                                           llvm::Value *Chunk) {
   auto &RT = CGM.getOpenMPRuntime();
+
+  // Dynamic scheduling of the outer loop (dynamic, guided, auto, runtime).
+  const bool Dynamic = RT.isDynamic(ScheduleKind);
+
   assert(!RT.isStaticNonchunked(ScheduleKind, /* Chunked */ Chunk != nullptr) &&
          "static non-chunked schedule does not need outer loop");
-  if (RT.isDynamic(ScheduleKind)) {
-    ErrorUnsupported(&S, "OpenMP loop with dynamic schedule");
-    return;
-  }
 
   // Emit outer loop.
   //
   // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
+  // When schedule(dynamic,chunk_size) is specified, the iterations are
+  // distributed to threads in the team in chunks as the threads request them.
+  // Each thread executes a chunk of iterations, then requests another chunk,
+  // until no chunks remain to be distributed. Each chunk contains chunk_size
+  // iterations, except for the last chunk to be distributed, which may have
+  // fewer iterations. When no chunk_size is specified, it defaults to 1.
+  //
+  // When schedule(guided,chunk_size) is specified, the iterations are assigned
+  // to threads in the team in chunks as the executing threads request them.
+  // Each thread executes a chunk of iterations, then requests another chunk,
+  // until no chunks remain to be assigned. For a chunk_size of 1, the size of
+  // each chunk is proportional to the number of unassigned iterations divided
+  // by the number of threads in the team, decreasing to 1. For a chunk_size
+  // with value k (greater than 1), the size of each chunk is determined in the
+  // same way, with the restriction that the chunks do not contain fewer than k
+  // iterations (except for the last chunk to be assigned, which may have fewer
+  // than k iterations).
+  //
+  // When schedule(auto) is specified, the decision regarding scheduling is
+  // delegated to the compiler and/or runtime system. The programmer gives the
+  // implementation the freedom to choose any possible mapping of iterations to
+  // threads in the team.
+  //
+  // When schedule(runtime) is specified, the decision regarding scheduling is
+  // deferred until run time, and the schedule and chunk size are taken from the
+  // run-sched-var ICV. If the ICV is set to auto, the schedule is
+  // implementation defined
+  //
+  // while(__kmpc_dispatch_next(&LB, &UB)) {
+  //   idx = LB;
+  //   while (idx <= UB) { BODY; ++idx; } // inner loop
+  // }
+  //
+  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
   // When schedule(static, chunk_size) is specified, iterations are divided into
   // chunks of size chunk_size, and the chunks are assigned to the threads in
   // the team in a round-robin fashion in the order of the thread number.
@@ -527,26 +561,37 @@
   //   UB = UB + ST;
   // }
   //
+
   const Expr *IVExpr = S.getIterationVariable();
   const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
   const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
 
-  RT.EmitOMPForInit(*this, S.getLocStart(), ScheduleKind, IVSize, IVSigned, IL,
-                    LB, UB, ST, Chunk);
+  RT.EmitOMPForInit(
+      *this, S.getLocStart(), ScheduleKind, IVSize, IVSigned, IL, LB,
+      (Dynamic ? EmitAnyExpr(S.getLastIteration()).getScalarVal() : UB), ST,
+      Chunk);
+
   auto LoopExit = getJumpDestInCurrentScope("omp.dispatch.end");
 
   // Start the loop with a block that tests the condition.
   auto CondBlock = createBasicBlock("omp.dispatch.cond");
   EmitBlock(CondBlock);
   LoopStack.push(CondBlock);
-
   llvm::Value *BoolCondVal = nullptr;
-  // UB = min(UB, GlobalUB)
-  EmitIgnoredExpr(S.getEnsureUpperBound());
-  // IV = LB
-  EmitIgnoredExpr(S.getInit());
-  // IV < UB
-  BoolCondVal = EvaluateExprAsBool(S.getCond(false));
+  if (!Dynamic) {
+    // UB = min(UB, GlobalUB)
+    EmitIgnoredExpr(S.getEnsureUpperBound());
+    // IV = LB
+    EmitIgnoredExpr(S.getInit());
+    // IV < UB
+    BoolCondVal = EvaluateExprAsBool(S.getCond(false));
+  } else {
+    auto Call = RT.EmitOMPForNext(*this, S.getLocStart(), IVSize, IVSigned, IL,
+                                  LB, UB, ST);
+    BoolCondVal = EmitScalarConversion(
+        Call, getContext().getIntTypeForBitwidth(32, /* Signed */ true),
+        getContext().BoolTy);
+  }
 
   // If there are any cleanups between here and the loop-exit scope,
   // create a block to stage a loop exit along.
@@ -562,25 +607,32 @@
   }
   EmitBlock(LoopBody);
 
+  if (Dynamic)
+    EmitIgnoredExpr(S.getInit());
+
   // Create a block for the increment.
   auto Continue = getJumpDestInCurrentScope("omp.dispatch.inc");
   BreakContinueStack.push_back(BreakContinue(LoopExit, Continue));
 
   EmitOMPInnerLoop(S, LoopScope);
 
   EmitBlock(Continue.getBlock());
   BreakContinueStack.pop_back();
-  // Emit "LB = LB + Stride", "UB = UB + Stride".
-  EmitIgnoredExpr(S.getNextLowerBound());
-  EmitIgnoredExpr(S.getNextUpperBound());
+  if (!Dynamic) {
+    // Emit "LB = LB + Stride", "UB = UB + Stride".
+    EmitIgnoredExpr(S.getNextLowerBound());
+    EmitIgnoredExpr(S.getNextUpperBound());
+  }
 
   EmitBranch(CondBlock);
   LoopStack.pop();
   // Emit the fall-through block.
   EmitBlock(LoopExit.getBlock());
 
   // Tell the runtime we are done.
-  RT.EmitOMPForFinish(*this, S.getLocStart(), ScheduleKind);
+  // FIXME: Also call fini for ordered loops with dynamic scheduling.
+  if (!Dynamic)
+    RT.EmitOMPForFinish(*this, S.getLocStart(), ScheduleKind);
 }
 
 /// \brief Emit a helper variable and return corresponding lvalue.
Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -24,6 +24,7 @@
 namespace llvm {
 class ArrayType;
 class Constant;
+class CallInst;
 class Function;
 class FunctionType;
 class GlobalVariable;
@@ -73,6 +74,15 @@
     OMPRTL__kmpc_for_static_init_8,
     OMPRTL__kmpc_for_static_init_8u,
     OMPRTL__kmpc_for_static_fini,
+    // Calls for dynamic scheduling 'omp for' loops.
+    OMPRTL__kmpc_dispatch_init_4,
+    OMPRTL__kmpc_dispatch_init_4u,
+    OMPRTL__kmpc_dispatch_init_8,
+    OMPRTL__kmpc_dispatch_init_8u,
+    OMPRTL__kmpc_dispatch_next_4,
+    OMPRTL__kmpc_dispatch_next_4u,
+    OMPRTL__kmpc_dispatch_next_8,
+    OMPRTL__kmpc_dispatch_next_8u,
     // Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
     // global_tid);
     OMPRTL__kmpc_serialized_parallel,
@@ -365,6 +375,28 @@
   virtual void EmitOMPForFinish(CodeGenFunction &CGF, SourceLocation Loc,
                                 OpenMPScheduleClauseKind ScheduleKind);
 
+  /// Call __kmpc_dispatch_next(
+  ///          ident_t *loc, kmp_int32 tid, kmp_int32 *p_lastiter,
+  ///          kmp_int[32|64] *p_lower, kmp_int[32|64] *p_upper,
+  ///          kmp_int[32|64] *p_stride);
+  /// \param CGF Reference to current CodeGenFunction.
+  /// \param Loc Clang source location.
+  /// \param IVSize Size of the iteration variable in bits.
+  /// \param IVSigned Sign of the interation variable.
+  /// \param IL Address of the output variable in which the flag of the
+  /// last iteration is returned.
+  /// \param LB Address of the output variable in which the lower iteration
+  /// number is returned.
+  /// \param UB Address of the output variable in which the upper iteration
+  /// number is returned.
+  /// \param ST Address of the output variable in which the stride value is
+  /// returned.
+  virtual llvm::CallInst *EmitOMPForNext(CodeGenFunction &CGF,
+                                         SourceLocation Loc, unsigned IVSize,
+                                         bool IVSigned, llvm::Value *IL,
+                                         llvm::Value *LB, llvm::Value *UB,
+                                         llvm::Value *ST);
+
   /// \brief Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
   /// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
   /// clause.
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

Reply via email to