[PATCH] [OPENMP] Initial codegen for 'parallel for' directive.

Alexey Bataev Thu, 26 Mar 2015 05:37:22 -0700

Hi rjmccall, hfinkel, fraggamuffin, ejstotzer,

Allows generation of combined 'parallel for' directive that represents 
'parallel' region with internal implicit 'for' worksharing region.
A new combined region RAII is added to the runtime support module, intended to 
be used for all combined directives, like 'parallel sections', 'parallel for 
simd' etc.
This RAII region allows to split codegen for combined directives into several 
independent steps: at first it tries to generate the code for the outher 
implicit OpenMP region, then for internal. For example, for 'parallel for' 
directive on first iteration it generates an outlined function for parallel 
region, and then generates internal loop using information from the outer 
region. This region stores a kind of implicit directive for which it currently 
emits code and then advances to the next implicit directive. This allows to 
reuse the codegen emission method for combined directive for all implicitly 
included directives just like this:
```
if (<CurrentImplicitRegion> == OMPD_parallel)
  <Generate code for 'parallel' region>
else if (<CurrentImplicitRegion> == OMPD_for>
  <Generate code for 'for' region>
...
```


http://reviews.llvm.org/D8631

Files:
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.h
  test/OpenMP/parallel_codegen.cpp
  test/OpenMP/parallel_for_codegen.cpp
  test/OpenMP/sections_codegen.cpp

EMAIL PREFERENCES
  http://reviews.llvm.org/settings/panel/emailpreferences/

Index: test/OpenMP/parallel_codegen.cpp
===================================================================
--- test/OpenMP/parallel_codegen.cpp
+++ test/OpenMP/parallel_codegen.cpp
@@ -39,7 +39,7 @@
 // CHECK:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon, %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
 // CHECK-NEXT:  store i32* {{%[a-z0-9.]+}}, i32** [[ARGC_REF]]
 // CHECK-NEXT:  [[BITCAST:%.+]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8*
-// CHECK-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* @.omp_outlined. to void (i32*, i32*, ...)*), i8* [[BITCAST]])
+// CHECK-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]])
 // CHECK-NEXT:  [[ARGV:%.+]] = load i8**, i8*** {{%[a-z0-9.]+}}
 // CHECK-NEXT:  [[RET:%.+]] = call {{[a-z]*[ ]?i32}} [[TMAIN:@.+tmain.+]](i8** [[ARGV]])
 // CHECK-NEXT:  ret i32 [[RET]]
@@ -55,34 +55,36 @@
 // CHECK-DEBUG-NEXT:  [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
 // CHECK-DEBUG-NEXT:  store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC1]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
 // CHECK-DEBUG-NEXT:  [[BITCAST:%.+]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8*
-// CHECK-DEBUG-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* @.omp_outlined. to void (i32*, i32*, ...)*), i8* [[BITCAST]])
+// CHECK-DEBUG-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]])
 // CHECK-DEBUG-NEXT:  [[ARGV:%.+]] = load i8**, i8*** {{%[a-z0-9.]+}}
 // CHECK-DEBUG-NEXT:  [[RET:%.+]] = call i32 [[TMAIN:@.+tmain.+]](i8** [[ARGV]])
 // CHECK-DEBUG-NEXT:  ret i32 [[RET]]
 // CHECK-DEBUG-NEXT:  }
 
-// CHECK-LABEL: define internal void @.omp_outlined.(i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context)
+// CHECK:       define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context)
 // CHECK:       #[[FN_ATTRS:[0-9]+]]
 // CHECK:       [[CONTEXT_ADDR:%.+]] = alloca %struct.anon*
 // CHECK:       store %struct.anon* %__context, %struct.anon** [[CONTEXT_ADDR]]
 // CHECK:       [[CONTEXT_PTR:%.+]] = load %struct.anon*, %struct.anon** [[CONTEXT_ADDR]]
 // CHECK-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon, %struct.anon* [[CONTEXT_PTR]], i32 0, i32 0
 // CHECK-NEXT:  [[ARGC_REF:%.+]] = load i32*, i32** [[ARGC_PTR_REF]]
 // CHECK-NEXT:  [[ARGC:%.+]] = load i32, i32* [[ARGC_REF]]
 // CHECK-NEXT:  invoke void [[FOO:@.+foo.+]](i32{{[ ]?[a-z]*}} [[ARGC]])
+// CHECK:       call {{.+}} @__kmpc_cancel_barrier(
 // CHECK:       ret void
 // CHECK:       call void @{{.+terminate.*|abort}}(
 // CHECK-NEXT:  unreachable
 // CHECK-NEXT:  }
-// CHECK-DEBUG-LABEL: define internal void @.omp_outlined.(i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context)
+// CHECK-DEBUG:       define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon* %__context)
 // CHECK-DEBUG:       #[[FN_ATTRS:[0-9]+]]
 // CHECK-DEBUG:       [[CONTEXT_ADDR:%.+]] = alloca %struct.anon*
 // CHECK-DEBUG:       store %struct.anon* %__context, %struct.anon** [[CONTEXT_ADDR]]
 // CHECK-DEBUG:       [[CONTEXT_PTR:%.+]] = load %struct.anon*, %struct.anon** [[CONTEXT_ADDR]]
 // CHECK-DEBUG-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon, %struct.anon* [[CONTEXT_PTR]], i32 0, i32 0
 // CHECK-DEBUG-NEXT:  [[ARGC_REF:%.+]] = load i32*, i32** [[ARGC_PTR_REF]]
 // CHECK-DEBUG-NEXT:  [[ARGC:%.+]] = load i32, i32* [[ARGC_REF]]
 // CHECK-DEBUG-NEXT:  invoke void [[FOO:@.+foo.+]](i32 [[ARGC]])
+// CHECK-DEBUG:       call {{.+}} @__kmpc_cancel_barrier(
 // CHECK-DEBUG:       ret void
 // CHECK-DEBUG:       call void @{{.+terminate.*|abort}}(
 // CHECK-DEBUG-NEXT:  unreachable
@@ -98,7 +100,7 @@
 // CHECK:       [[ARGC_REF:%.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* [[AGG_CAPTURED]], i32 0, i32 0
 // CHECK-NEXT:  store i8*** {{%[a-z0-9.]+}}, i8**** [[ARGC_REF]]
 // CHECK-NEXT:  [[BITCAST:%.+]] = bitcast %struct.anon.0* [[AGG_CAPTURED]] to i8*
-// CHECK-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* @.omp_outlined.1 to void (i32*, i32*, ...)*), i8* [[BITCAST]])
+// CHECK-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[DEF_LOC_2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]])
 // CHECK-NEXT:  ret i32 0
 // CHECK-NEXT:  }
 // CHECK-DEBUG:       define linkonce_odr i32 [[TMAIN]](i8** %argc)
@@ -112,30 +114,32 @@
 // CHECK-DEBUG-NEXT:  [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
 // CHECK-DEBUG-NEXT:  store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC2]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
 // CHECK-DEBUG-NEXT:  [[BITCAST:%.+]] = bitcast %struct.anon.0* [[AGG_CAPTURED]] to i8*
-// CHECK-DEBUG-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* @.omp_outlined.1 to void (i32*, i32*, ...)*), i8* [[BITCAST]])
+// CHECK-DEBUG-NEXT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call(%ident_t* [[LOC_2_ADDR]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %struct.anon.0*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8* [[BITCAST]])
 // CHECK-DEBUG-NEXT:  ret i32 0
 // CHECK-DEBUG-NEXT:  }
 
-// CHECK-LABEL: define internal void @.omp_outlined.1(i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context)
+// CHECK:       define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context)
 // CHECK:       [[CONTEXT_ADDR:%.+]] = alloca %struct.anon.0*
 // CHECK:       store %struct.anon.0* %__context, %struct.anon.0** [[CONTEXT_ADDR]]
 // CHECK:       [[CONTEXT_PTR:%.+]] = load %struct.anon.0*, %struct.anon.0** [[CONTEXT_ADDR]]
 // CHECK-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* [[CONTEXT_PTR]], i32 0, i32 0
 // CHECK-NEXT:  [[ARGC_REF:%.+]] = load i8***, i8**** [[ARGC_PTR_REF]]
 // CHECK-NEXT:  [[ARGC:%.+]] = load i8**, i8*** [[ARGC_REF]]
 // CHECK-NEXT:  invoke void [[FOO1:@.+foo.+]](i8** [[ARGC]])
+// CHECK:       call {{.+}} @__kmpc_cancel_barrier(
 // CHECK:       ret void
 // CHECK:       call void @{{.+terminate.*|abort}}(
 // CHECK-NEXT:  unreachable
 // CHECK-NEXT:  }
-// CHECK-DEBUG-LABEL: define internal void @.omp_outlined.1(i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context)
+// CHECK-DEBUG:       define internal void [[OMP_OUTLINED]](i32* %.global_tid., i32* %.bound_tid., %struct.anon.0* %__context)
 // CHECK-DEBUG:       [[CONTEXT_ADDR:%.+]] = alloca %struct.anon.0*
 // CHECK-DEBUG:       store %struct.anon.0* %__context, %struct.anon.0** [[CONTEXT_ADDR]]
 // CHECK-DEBUG:       [[CONTEXT_PTR:%.+]] = load %struct.anon.0*, %struct.anon.0** [[CONTEXT_ADDR]]
 // CHECK-DEBUG-NEXT:  [[ARGC_PTR_REF:%.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* [[CONTEXT_PTR]], i32 0, i32 0
 // CHECK-DEBUG-NEXT:  [[ARGC_REF:%.+]] = load i8***, i8**** [[ARGC_PTR_REF]]
 // CHECK-DEBUG-NEXT:  [[ARGC:%.+]] = load i8**, i8*** [[ARGC_REF]]
 // CHECK-DEBUG-NEXT:  invoke void [[FOO1:@.+foo.+]](i8** [[ARGC]])
+// CHECK-DEBUG:       call {{.+}} @__kmpc_cancel_barrier(
 // CHECK-DEBUG:       ret void
 // CHECK-DEBUG:       call void @{{.+terminate.*|abort}}(
 // CHECK-DEBUG-NEXT:  unreachable
Index: test/OpenMP/sections_codegen.cpp
===================================================================
--- test/OpenMP/sections_codegen.cpp
+++ test/OpenMP/sections_codegen.cpp
@@ -88,6 +88,7 @@
 // CHECK-NEXT:  br label %[[END]]
 // CHECK:       [[END]]
 // CHECK-NEXT:  call i32 @__kmpc_cancel_barrier(
+// CHECK-NEXT:  call i32 @__kmpc_cancel_barrier(
 // CHECK-NEXT:  ret
 // CHECK:       [[TERM_LPAD]]
 // CHECK:       call void @__clang_call_terminate(i8*
Index: test/OpenMP/parallel_for_codegen.cpp
===================================================================
--- test/OpenMP/parallel_for_codegen.cpp
+++ test/OpenMP/parallel_for_codegen.cpp
@@ -0,0 +1,398 @@
+// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp=libiomp5 -fexceptions -fcxx-exceptions -gline-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
+//
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void without_schedule_clause(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
+// UB = min(UB, GlobalUB)
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423
+// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
+// CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
+// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// Loop header
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (int i = 33; i < 32000000; i += 7) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add nsw i32 33, [[CALC_I_1]]
+// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_not_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_not_chunked(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for schedule(static)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
+// UB = min(UB, GlobalUB)
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423
+// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
+// CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
+// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// Loop header
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (int i = 32000000; i > 33; i += -7) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7
+// CHECK-NEXT: [[CALC_I_2:%.+]] = sub nsw i32 32000000, [[CALC_I_1]]
+// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_chunked(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for schedule(static, 5)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
+// UB = min(UB, GlobalUB)
+// CHECK: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288
+// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
+// CHECK: [[UBRESULT:%.+]] = phi i32 [ 16908288, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
+// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
+// CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+
+// Outer loop header
+// CHECK: [[O_IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[O_UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ule i32 [[O_IV]], [[O_UB]]
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned i = 131071; i <= 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i32 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// Update the counters, adding stride
+// CHECK:  [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]]
+// CHECK-NEXT: [[ADD_LB:%.+]] = add i32 [[LB]], [[ST]]
+// CHECK-NEXT: store i32 [[ADD_LB]], i32* [[OMP_LB]]
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]]
+// CHECK-NEXT: [[ADD_UB:%.+]] = add i32 [[UB]], [[ST]]
+// CHECK-NEXT: store i32 [[ADD_UB]], i32* [[OMP_UB]]
+
+// CHECK: [[O_LOOP1_END]]
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}dynamic1{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void dynamic1(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for schedule(dynamic)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 35, i64 0, i64 16908287, i64 1, i64 1)
+//
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned long long i = 131071; i < 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}guided7{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void guided7(float *a, float *b, float *c, float *d) {
+  #pragma omp parallel for schedule(guided, 7)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 36, i64 0, i64 16908287, i64 1, i64 7)
+//
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp ule i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned long long i = 131071; i < 2147483647; i += 127) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: calculate i from IV:
+// CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
+// CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
+// CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}test_auto{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void test_auto(float *a, float *b, float *c, float *d) {
+  unsigned int x = 0;
+  unsigned int y = 0;
+  #pragma omp parallel for schedule(auto) collapse(2)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_dispatch_init_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 38, i64 0, i64 [[LAST_ITER:%[^,]+]], i64 1, i64 1)
+//
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
+// CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i64 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+// FIXME: When the iteration count of some nested loop is not a known constant,
+// we should pre-calculate it, like we do for the total number of iterations!
+  for (char i = static_cast<char>(y); i <= '9'; ++i)
+    for (x = 11; x > 0; --x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i64 [[IV1_2]], 1
+// CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}runtime{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void runtime(float *a, float *b, float *c, float *d) {
+  int x = 0;
+  #pragma omp parallel for collapse(2) schedule(runtime)
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...)* @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, %{{.+}}*)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*), i8* %{{.+}})
+// CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* [[GTID_PARAM_ADDR:%.+]], i32* %{{.+}}, %{{.+}}* %{{.+}})
+// CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call void @__kmpc_dispatch_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 37, i32 0, i32 199, i32 1, i32 1)
+//
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]])
+// CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
+// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
+
+// Loop header
+// CHECK: [[O_LOOP1_BODY]]
+// CHECK: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
+// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
+// CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
+
+// CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
+// CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
+  for (unsigned char i = '0' ; i <= '9'; ++i)
+    for (x = -10; x < 10; ++x) {
+// CHECK: [[LOOP1_BODY]]
+// Start of body: indices are calculated from IV:
+// CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
+// CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
+// ... loop body ...
+// End of body: store into a[i]:
+// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
+    a[i] = b[i] * c[i] * d[i];
+// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
+// CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
+// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
+// CHECK-NEXT: br label %{{.+}}
+  }
+// CHECK: [[LOOP1_END]]
+// CHECK: [[O_LOOP1_END]]
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_REF_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
+// CHECK: ret void
+}
+
+// TERM_DEBUG-LABEL: foo
+int foo() {return 0;};
+
+// TERM_DEBUG-LABEL: parallel_for
+void parallel_for(float *a) {
+#pragma omp parallel for schedule(static, 5)
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_for_static_init_4u({{.+}}), !dbg [[DBG_LOC_START:![0-9]+]]
+  // TERM_DEBUG:     invoke i32 {{.*}}foo{{.*}}()
+  // TERM_DEBUG:     unwind label %[[TERM_LPAD:.+]],
+  // TERM_DEBUG-NOT: __kmpc_global_thread_num
+  // TERM_DEBUG:     call void @__kmpc_for_static_fini({{.+}}), !dbg [[DBG_LOC_END:![0-9]+]]
+  // TERM_DEBUG:     call {{.+}} @__kmpc_cancel_barrier({{.+}}), !dbg [[DBG_LOC_CANCEL:![0-9]+]]
+  // TERM_DEBUG:     [[TERM_LPAD]]
+  // TERM_DEBUG:     call void @__clang_call_terminate
+  // TERM_DEBUG:     unreachable
+  for (unsigned i = 131071; i <= 2147483647; i += 127)
+    a[i] += foo();
+}
+// Check source line corresponds to "#pragma omp parallel for schedule(static, 5)" above:
+// TERM_DEBUG-DAG: [[DBG_LOC_START]] = !MDLocation(line: [[@LINE-15]],
+// TERM_DEBUG-DAG: [[DBG_LOC_END]] = !MDLocation(line: [[@LINE-16]],
+// TERM_DEBUG-DAG: [[DBG_LOC_CANCEL]] = !MDLocation(line: [[@LINE-17]],
+
+#endif // HEADER
+
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -31,11 +31,29 @@
 /// \brief Base class for handling code generation inside OpenMP regions.
 class CGOpenMPRegionInfo : public CodeGenFunction::CGCapturedStmtInfo {
 public:
-  CGOpenMPRegionInfo(const OMPExecutableDirective &D, const CapturedStmt &CS)
-      : CGCapturedStmtInfo(CS, CR_OpenMP), Directive(D) {}
+  /// \brief Kinds of OpenMP regions used in codegen.
+  enum CGOpenMPRegionKind {
+    /// \brief Region with outlined function for standalone 'parallel'
+    /// directive.
+    ParallelOutlinedRegion,
+    /// \brief Region with outlined function for standalone 'task' directive.
+    TaskOutlinedRegion,
+    /// \brief Region for constructs that do not require function outlining,
+    /// like 'for', 'sections', 'atomic' etc. directives.
+    InlinedRegion,
+    /// \brief Region for combined directives, like 'parallel for' or 'parallel
+    /// sections'.
+    CombinedRegion,
+  };
 
-  CGOpenMPRegionInfo(const OMPExecutableDirective &D)
-      : CGCapturedStmtInfo(CR_OpenMP), Directive(D) {}
+  CGOpenMPRegionInfo(const OMPExecutableDirective &D, const CapturedStmt &CS,
+                     const CGOpenMPRegionKind RegionKind)
+      : CGCapturedStmtInfo(CS, CR_OpenMP), Directive(D),
+        RegionKind(RegionKind) {}
+
+  CGOpenMPRegionInfo(const OMPExecutableDirective &D,
+                     const CGOpenMPRegionKind RegionKind)
+      : CGCapturedStmtInfo(CR_OpenMP), Directive(D), RegionKind(RegionKind) {}
 
   /// \brief Get a variable or parameter for storing global thread id
   /// inside OpenMP construct.
@@ -48,30 +66,44 @@
     /// \brief Emit the captured statement body.
   virtual void EmitBody(CodeGenFunction &CGF, const Stmt *S) override;
 
+  CGOpenMPRegionKind getRegionKind() const { return RegionKind; }
+
   static bool classof(const CGCapturedStmtInfo *Info) {
     return Info->getKind() == CR_OpenMP;
   }
 protected:
   /// \brief OpenMP executable directive associated with the region.
   const OMPExecutableDirective &Directive;
+  CGOpenMPRegionKind RegionKind;
 };
 
 /// \brief API for captured statement code generation in OpenMP constructs.
 class CGOpenMPOutlinedRegionInfo : public CGOpenMPRegionInfo {
 public:
   CGOpenMPOutlinedRegionInfo(const OMPExecutableDirective &D,
                              const CapturedStmt &CS, const VarDecl *ThreadIDVar)
-      : CGOpenMPRegionInfo(D, CS), ThreadIDVar(ThreadIDVar) {
+      : CGOpenMPRegionInfo(D, CS, ParallelOutlinedRegion),
+        ThreadIDVar(ThreadIDVar) {
     assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
   }
   /// \brief Get a variable or parameter for storing global thread id
   /// inside OpenMP construct.
   virtual const VarDecl *getThreadIDVariable() const override {
     return ThreadIDVar;
   }
+
+  /// \brief Emit the captured statement body.
+  virtual void EmitBody(CodeGenFunction &CGF, const Stmt *S) override;
+
   /// \brief Get the name of the capture helper.
   StringRef getHelperName() const override { return ".omp_outlined."; }
 
+  static bool classof(const CGCapturedStmtInfo *Info) {
+    return CGOpenMPRegionInfo::classof(Info) &&
+           cast<CGOpenMPRegionInfo>(Info)->getRegionKind() ==
+               ParallelOutlinedRegion;
+  }
+
 private:
   /// \brief A variable or parameter storing global thread id for OpenMP
   /// constructs.
@@ -85,7 +117,7 @@
                                  const CapturedStmt &CS,
                                  const VarDecl *ThreadIDVar,
                                  const VarDecl *PartIDVar)
-      : CGOpenMPRegionInfo(D, CS), ThreadIDVar(ThreadIDVar),
+      : CGOpenMPRegionInfo(D, CS, TaskOutlinedRegion), ThreadIDVar(ThreadIDVar),
         PartIDVar(PartIDVar) {
     assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
   }
@@ -104,6 +136,12 @@
   /// \brief Get the name of the capture helper.
   StringRef getHelperName() const override { return ".omp_outlined."; }
 
+  static bool classof(const CGCapturedStmtInfo *Info) {
+    return CGOpenMPRegionInfo::classof(Info) &&
+           cast<CGOpenMPRegionInfo>(Info)->getRegionKind() ==
+               TaskOutlinedRegion;
+  }
+
 private:
   /// \brief A variable or parameter storing global thread id for OpenMP
   /// constructs.
@@ -119,14 +157,21 @@
 public:
   CGOpenMPInlinedRegionInfo(const OMPExecutableDirective &D,
                             CodeGenFunction::CGCapturedStmtInfo *OldCSI)
-      : CGOpenMPRegionInfo(D), OldCSI(OldCSI),
+      : CGOpenMPRegionInfo(D, InlinedRegion), OldCSI(OldCSI),
         OuterRegionInfo(dyn_cast_or_null<CGOpenMPRegionInfo>(OldCSI)) {}
   // \brief Retrieve the value of the context parameter.
   virtual llvm::Value *getContextValue() const override {
     if (OuterRegionInfo)
       return OuterRegionInfo->getContextValue();
     llvm_unreachable("No context value for inlined OpenMP region");
   }
+  virtual void setContextValue(llvm::Value *V) override {
+    if (OuterRegionInfo) {
+      OuterRegionInfo->setContextValue(V);
+      return;
+    }
+    llvm_unreachable("No context value for inlined OpenMP region");
+  }
   /// \brief Lookup the captured field decl for a variable.
   virtual const FieldDecl *lookup(const VarDecl *VD) const override {
     if (OuterRegionInfo)
@@ -154,11 +199,70 @@
 
   CodeGenFunction::CGCapturedStmtInfo *getOldCSI() const { return OldCSI; }
 
+  static bool classof(const CGCapturedStmtInfo *Info) {
+    return CGOpenMPRegionInfo::classof(Info) &&
+           cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == InlinedRegion;
+  }
+
 private:
   /// \brief CodeGen info about outer OpenMP region.
   CodeGenFunction::CGCapturedStmtInfo *OldCSI;
   CGOpenMPRegionInfo *OuterRegionInfo;
 };
+
+/// \brief API for code generation of combined OpenMP constructs.
+class CGOpenMPCombinedRegionInfo : public CGOpenMPInlinedRegionInfo {
+public:
+  CGOpenMPCombinedRegionInfo(const OMPExecutableDirective &D,
+                             CodeGenFunction::CGCapturedStmtInfo *OldCSI,
+                             ArrayRef<OpenMPDirectiveKind> CurrentCodeGenKinds)
+      : CGOpenMPInlinedRegionInfo(D, OldCSI), CurrentKindPos(0),
+        CurrentKind(CurrentCodeGenKinds.empty() ? OMPD_unknown
+                                                : CurrentCodeGenKinds[0]) {
+    RegionKind = CombinedRegion;
+  }
+  /// \brief Make current combined region to emit code for the next OpenMP
+  /// region specified in \a CurrentCodeGenKinds list.
+  void nextCodeGenKind(ArrayRef<OpenMPDirectiveKind> CurrentCodeGenKinds);
+  /// \brief Make current combined region to emit code for \a At OpenMP region
+  /// that must be listed in \a CurrentCodeGenKinds list.
+  void
+  nextCodeGenKindAtSpecified(ArrayRef<OpenMPDirectiveKind> CurrentCodeGenKinds,
+                             const OpenMPDirectiveKind At);
+  /// \brief Get current codegen kind for the combined region.
+  OpenMPDirectiveKind getCurrentCodeGenKind() const { return CurrentKind; }
+  /// \brief Check if the current combined region is used for codegen of the \a
+  /// D directive.
+  bool isRegionForDirective(const OMPExecutableDirective &D) const {
+    return &D == &Directive;
+  }
+  /// \brief Emit the captured statement body for combined directives.
+  virtual void EmitBody(CodeGenFunction &CGF, const Stmt *) override {
+    // If current codegen kind points to some specific kind - use code emission
+    // for the directive again to emit the code for the remaining parts of the
+    // combined construct.
+    assert (CurrentKind != OMPD_unknown);
+    CGF.EmitStmt(&Directive);
+  }
+
+  /// \brief Get the name of the capture helper.
+  virtual StringRef getHelperName() const override {
+    if (auto *OuterRegionInfo = getOldCSI())
+      return OuterRegionInfo->getHelperName();
+    llvm_unreachable("No helper name for combined construct");
+  }
+
+  static bool classof(const CGCapturedStmtInfo *Info) {
+    return CGOpenMPRegionInfo::classof(Info) &&
+           cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == CombinedRegion;
+  }
+
+private:
+  /// \brief Position of current codegen kind.
+  unsigned CurrentKindPos;
+  /// \brief Actual codegen kind.
+  OpenMPDirectiveKind CurrentKind;
+};
 } // namespace
 
 LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) {
@@ -183,6 +287,13 @@
   CGCapturedStmtInfo::EmitBody(CGF, S);
 }
 
+void CGOpenMPOutlinedRegionInfo::EmitBody(CodeGenFunction &CGF, const Stmt *S) {
+  CGOpenMPRegionInfo::EmitBody(CGF, S);
+  // Emit implicit barrier at the end of parallel region.
+  CGF.CGM.getOpenMPRuntime().emitBarrierCall(CGF, Directive.getLocStart(),
+                                             /*IsExplicit=*/false);
+}
+
 LValue CGOpenMPTaskOutlinedRegionInfo::getThreadIDVariableLValue(
     CodeGenFunction &CGF) {
   return CGF.MakeNaturalAlignAddrLValue(
@@ -198,6 +309,27 @@
   CGCapturedStmtInfo::EmitBody(CGF, S);
 }
 
+void CGOpenMPCombinedRegionInfo::nextCodeGenKind(
+    ArrayRef<OpenMPDirectiveKind> CurrentCodeGenKinds) {
+  ++CurrentKindPos;
+  CurrentKind = CurrentKindPos < CurrentCodeGenKinds.size()
+                    ? CurrentCodeGenKinds[CurrentKindPos]
+                    : OMPD_unknown;
+}
+
+void CGOpenMPCombinedRegionInfo::nextCodeGenKindAtSpecified(
+    ArrayRef<OpenMPDirectiveKind> CurrentCodeGenKinds,
+    const OpenMPDirectiveKind At) {
+  for (auto E = CurrentCodeGenKinds.size(); CurrentKindPos < E;
+       ++CurrentKindPos) {
+    if (CurrentCodeGenKinds[CurrentKindPos] == At) {
+      CurrentKind = CurrentCodeGenKinds[CurrentKindPos];
+      return;
+    }
+  }
+  llvm_unreachable("No specified codegen kind is found.");
+}
+
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
     : CGM(CGM), DefaultOpenMPPSource(nullptr), KmpRoutineEntryPtrTy(nullptr) {
   IdentTy = llvm::StructType::create(
@@ -215,15 +347,23 @@
   InternalVars.clear();
 }
 
-llvm::Value *
-CGOpenMPRuntime::emitOutlinedFunction(const OMPExecutableDirective &D,
-                                      const VarDecl *ThreadIDVar) {
+llvm::Value *CGOpenMPRuntime::emitParallelOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    ArrayRef<OpenMPDirectiveKind> CombinedCodeGenKinds) {
   assert(ThreadIDVar->getType()->isPointerType() &&
          "thread id variable must be of type kmp_int32 *");
   const CapturedStmt *CS = cast<CapturedStmt>(D.getAssociatedStmt());
   CodeGenFunction CGF(CGM, true);
   CGOpenMPOutlinedRegionInfo CGInfo(D, *CS, ThreadIDVar);
   CGF.CapturedStmtInfo = &CGInfo;
+  if (!CombinedCodeGenKinds.empty()) {
+    // We're emitting combined region with implicit 'parallel' region.
+    CombinedOpenMPRegionRAII Region(CGF, D, CombinedCodeGenKinds);
+    // Skip everything before OMPD_parallel - it was emitted already.
+    cast<CGOpenMPCombinedRegionInfo>(CGF.CapturedStmtInfo)
+        ->nextCodeGenKindAtSpecified(CombinedCodeGenKinds, OMPD_parallel);
+    return CGF.GenerateCapturedStmtFunction(*CS);
+  }
   return CGF.GenerateCapturedStmtFunction(*CS);
 }
 
@@ -1524,3 +1664,40 @@
   CGF.CapturedStmtInfo = OldCSI;
 }
 
+CombinedOpenMPRegionRAII::CombinedOpenMPRegionRAII(
+    CodeGenFunction &CGF, const OMPExecutableDirective &D,
+    ArrayRef<OpenMPDirectiveKind> CombinedCodeGenKinds)
+    : CGF(CGF), IsLastIteration(false) {
+  assert(CombinedCodeGenKinds.size() > 1);
+  if (auto *CRI =
+          dyn_cast_or_null<CGOpenMPCombinedRegionInfo>(CGF.CapturedStmtInfo)) {
+    if (CRI->isRegionForDirective(D)) {
+      // We're already emitting code for the specified combined construct -
+      // advance to the next codegen kind.
+      CRI->nextCodeGenKind(CombinedCodeGenKinds);
+      IsLastIteration =
+          CRI->getCurrentCodeGenKind() == CombinedCodeGenKinds.back();
+      return;
+    }
+  }
+  // Start emission for the combined construct.
+  CGF.CapturedStmtInfo = new CGOpenMPCombinedRegionInfo(D, CGF.CapturedStmtInfo,
+                                                        CombinedCodeGenKinds);
+}
+
+OpenMPDirectiveKind CombinedOpenMPRegionRAII::getCurrentCodeGenKind() const {
+  return cast<CGOpenMPCombinedRegionInfo>(CGF.CapturedStmtInfo)
+      ->getCurrentCodeGenKind();
+}
+
+CombinedOpenMPRegionRAII::~CombinedOpenMPRegionRAII() {
+  // Restore original CapturedStmtInfo only if we're done with code emission for
+  // the combined directive.
+  if (IsLastIteration) {
+    auto *OldCSI =
+        cast<CGOpenMPCombinedRegionInfo>(CGF.CapturedStmtInfo)->getOldCSI();
+    delete CGF.CapturedStmtInfo;
+    CGF.CapturedStmtInfo = OldCSI;
+  }
+}
+
Index: lib/CodeGen/CodeGenFunction.h
===================================================================
--- lib/CodeGen/CodeGenFunction.h
+++ lib/CodeGen/CodeGenFunction.h
@@ -192,7 +192,7 @@
 
     CapturedRegionKind getKind() const { return Kind; }
 
-    void setContextValue(llvm::Value *V) { ThisValue = V; }
+    virtual void setContextValue(llvm::Value *V) { ThisValue = V; }
     // \brief Retrieve the value of the context parameter.
     virtual llvm::Value *getContextValue() const { return ThisValue; }
 
Index: lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- lib/CodeGen/CGStmtOpenMP.cpp
+++ lib/CodeGen/CGStmtOpenMP.cpp
@@ -225,8 +225,8 @@
 }
 
 /// \brief Emits code for OpenMP parallel directive in the parallel region.
-static void EmitOMPParallelCall(CodeGenFunction &CGF,
-                                const OMPParallelDirective &S,
+static void emitOMPParallelCall(CodeGenFunction &CGF,
+                                const OMPExecutableDirective &S,
                                 llvm::Value *OutlinedFn,
                                 llvm::Value *CapturedStruct) {
   if (auto C = S.getSingleClause(/*K*/ OMPC_num_threads)) {
@@ -241,22 +241,29 @@
                                               CapturedStruct);
 }
 
-void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) {
+static void emitCommonOMPParallelDirective(
+    CodeGenFunction &CGF, const OMPExecutableDirective &S,
+    ArrayRef<OpenMPDirectiveKind> CombinedCodeGenKinds) {
   auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
-  auto CapturedStruct = GenerateCapturedStmtArgument(*CS);
-  auto OutlinedFn = CGM.getOpenMPRuntime().emitOutlinedFunction(
-      S, *CS->getCapturedDecl()->param_begin());
+  auto CapturedStruct = CGF.GenerateCapturedStmtArgument(*CS);
+  auto OutlinedFn = CGF.CGM.getOpenMPRuntime().emitParallelOutlinedFunction(
+      S, *CS->getCapturedDecl()->param_begin(), CombinedCodeGenKinds);
   if (auto C = S.getSingleClause(/*K*/ OMPC_if)) {
     auto Cond = cast<OMPIfClause>(C)->getCondition();
-    EmitOMPIfClause(*this, Cond, [&](bool ThenBlock) {
+    EmitOMPIfClause(CGF, Cond, [&](bool ThenBlock) {
       if (ThenBlock)
-        EmitOMPParallelCall(*this, S, OutlinedFn, CapturedStruct);
+        emitOMPParallelCall(CGF, S, OutlinedFn, CapturedStruct);
       else
-        CGM.getOpenMPRuntime().emitSerialCall(*this, S.getLocStart(),
-                                              OutlinedFn, CapturedStruct);
+        CGF.CGM.getOpenMPRuntime().emitSerialCall(CGF, S.getLocStart(),
+                                                  OutlinedFn, CapturedStruct);
     });
   } else
-    EmitOMPParallelCall(*this, S, OutlinedFn, CapturedStruct);
+    emitOMPParallelCall(CGF, S, OutlinedFn, CapturedStruct);
+}
+
+void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) {
+  // Emit parallel region as a standalone region.
+  emitCommonOMPParallelDirective(*this, S, /*CombinedCodeGenKinds=*/llvm::None);
 }
 
 void CodeGenFunction::EmitOMPLoopBody(const OMPLoopDirective &S,
@@ -949,9 +956,25 @@
       }, S.getLocStart());
 }
 
-void
-CodeGenFunction::EmitOMPParallelForDirective(const OMPParallelForDirective &) {
-  llvm_unreachable("CodeGen for 'omp parallel for' is not supported yet.");
+void CodeGenFunction::EmitOMPParallelForDirective(
+    const OMPParallelForDirective &S) {
+  // Emit directive as a combined directive that consists of two implicit
+  // directives: 'parallel' with 'for' directive.
+  OpenMPDirectiveKind CodeGenKinds[] = {OMPD_parallel, OMPD_for};
+  CombinedOpenMPRegionRAII Region(*this, S, CodeGenKinds);
+
+  OpenMPDirectiveKind CurrentCodeGenKind = Region.getCurrentCodeGenKind();
+  // At first emit an outlined function for 'parallel' region.
+  if (CurrentCodeGenKind == OMPD_parallel) {
+    emitCommonOMPParallelDirective(*this, S, CodeGenKinds);
+    return;
+  }
+  // Emit implicit worksharing loop construct.
+  assert(CurrentCodeGenKind == OMPD_for);
+  EmitOMPWorksharingLoop(S);
+  // Emit implicit barrier at the end of parallel region.
+  CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getLocStart(),
+                                         /*IsExplicit=*/false);
 }
 
 void CodeGenFunction::EmitOMPParallelForSimdDirective(
Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -284,14 +284,20 @@
   virtual ~CGOpenMPRuntime() {}
   virtual void clear();
 
-  /// \brief Emits outlined function for the specified OpenMP directive \a D.
-  /// This outlined function has type void(*)(kmp_int32 *ThreadID, kmp_int32
-  /// BoundID, struct context_vars*).
+  /// \brief Emits outlined function for the specified OpenMP parallel directive
+  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
   /// \param D OpenMP directive.
   /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
-  ///
-  virtual llvm::Value *emitOutlinedFunction(const OMPExecutableDirective &D,
-                                            const VarDecl *ThreadIDVar);
+  /// \param CombinedCodeGenKinds Lists types of codegen kinds that must be
+  /// used for combined directives. For example, if it includes OMPD_parallel
+  /// and OMPD_for kinds, then the outlined function for 'parallel' region must
+  /// be generated that includes the code for 'for' region, generated for the
+  /// specified directive \a D.
+  ///
+  virtual llvm::Value *emitParallelOutlinedFunction(
+      const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+      ArrayRef<OpenMPDirectiveKind> CombinedCodeGenKinds);
 
   /// \brief Emits outlined function for the OpenMP task directive \a D. This
   /// outlined function has type void(*)(kmp_int32 ThreadID, kmp_int32
@@ -516,6 +522,29 @@
                           const OMPExecutableDirective &D);
   ~InlinedOpenMPRegionRAII();
 };
+
+/// \brief RAII for emitting code of combined OpenMP constructs like '#pragma
+/// omp parallel for' or '#pragma omp parallel sections' that implicitly include
+/// several OpenMP regions. For example, 'parallel for' directive implicitly
+/// includes 'parallel' region with inner 'for' region.
+class CombinedOpenMPRegionRAII {
+  CodeGenFunction &CGF;
+  bool IsLastIteration;
+
+public:
+  /// \brief Constructs region for combined constructs.
+  /// \param CombinedCodeGenKinds Lists types of codegen kinds that must be
+  /// used for combined directives. For example, if it includes OMPD_parallel
+  /// and OMPD_for kinds, then the outlined function for 'parallel' region must
+  /// be generated that includes the code for 'for' region, generated for the
+  /// specified directive \a D.
+  CombinedOpenMPRegionRAII(CodeGenFunction &CGF,
+                           const OMPExecutableDirective &D,
+                           ArrayRef<OpenMPDirectiveKind> CombinedCodeGenKinds);
+  /// \brief Return current codegen kind for combined construct.
+  OpenMPDirectiveKind getCurrentCodeGenKind() const;
+  ~CombinedOpenMPRegionRAII();
+};
 } // namespace CodeGen
 } // namespace clang

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

[PATCH] [OPENMP] Initial codegen for 'parallel for' directive.

Reply via email to