gcc/
        * internal-fn.c (expand_GOMP_SIMT_LANE): New.
        (expand_GOMP_SIMT_VF): New.
        (expand_GOMP_SIMT_LAST_LANE): New.
        (expand_GOMP_SIMT_ORDERED_PRED): New.
        (expand_GOMP_SIMT_VOTE_ANY): New.
        (expand_GOMP_SIMT_XCHG_BFLY): New.
        (expand_GOMP_SIMT_XCHG_IDX): New.
        * internal-fn.def (GOMP_SIMT_LANE): New.
        (GOMP_SIMT_VF): New.
        (GOMP_SIMT_LAST_LANE): New.
        (GOMP_SIMT_ORDERED_PRED): New.
        (GOMP_SIMT_VOTE_ANY): New.
        (GOMP_SIMT_XCHG_BFLY): New.
        (GOMP_SIMT_XCHG_IDX): New.
        * omp-low.c (omp_maybe_offloaded_ctx): New, outlined from...
        (create_omp_child_function): ...here.  Set "omp target entrypoint"
        or "omp declare target" attribute based on is_gimple_omp_offloaded.
        (omp_max_simt_vf): New.  Use it...
        (omp_max_vf): ...here.
        (lower_rec_input_clauses): Add reduction lowering for SIMT execution.
        (lower_lastprivate_clauses): Likewise, for "lastprivate" lowering.
        (lower_omp_ordered): Likewise, for "ordered" lowering.
        (expand_omp_simd): Add SIMT transforms.
        (pass_data_lower_omp): Add PROP_gimple_lomp_dev.
        (execute_omp_device_lower): New.
        (pass_data_omp_device_lower): New.
        (pass_omp_device_lower): New pass.
        (make_pass_omp_device_lower): New.
        * passes.def (pass_omp_device_lower): Position new pass.
        * tree-pass.h (PROP_gimple_lomp_dev): Define.
        (make_pass_omp_device_lower): Declare.

diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index cbee97e..fd1cd8b 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -157,6 +157,132 @@ expand_ANNOTATE (internal_fn, gcall *)
   gcc_unreachable ();
 }
 
+/* Lane index on SIMT targets: thread index in the warp on NVPTX.  On targets
+   without SIMT execution this should be expanded in omp_device_lower pass.  */
+
+static void
+expand_GOMP_SIMT_LANE (internal_fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  if (!lhs)
+    return;
+
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  gcc_assert (targetm.have_omp_simt_lane ());
+  emit_insn (targetm.gen_omp_simt_lane (target));
+}
+
+/* This should get expanded in omp_device_lower pass.  */
+
+static void
+expand_GOMP_SIMT_VF (internal_fn, gcall *)
+{
+  gcc_unreachable ();
+}
+
+/* Lane index of the first SIMT lane that supplies a non-zero argument.
+   This is a SIMT counterpart to GOMP_SIMD_LAST_LANE, used to represent the
+   lane that executed the last iteration for handling OpenMP lastprivate.  */
+
+static void
+expand_GOMP_SIMT_LAST_LANE (internal_fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  if (!lhs)
+    return;
+
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx cond = expand_normal (gimple_call_arg (stmt, 0));
+  machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+  struct expand_operand ops[2];
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], cond, mode);
+  gcc_assert (targetm.have_omp_simt_last_lane ());
+  expand_insn (targetm.code_for_omp_simt_last_lane, 2, ops);
+}
+
+/* Non-transparent predicate used in SIMT lowering of OpenMP "ordered".  */
+
+static void
+expand_GOMP_SIMT_ORDERED_PRED (internal_fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  if (!lhs)
+    return;
+
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx ctr = expand_normal (gimple_call_arg (stmt, 0));
+  machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+  struct expand_operand ops[2];
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], ctr, mode);
+  gcc_assert (targetm.have_omp_simt_ordered ());
+  expand_insn (targetm.code_for_omp_simt_ordered, 2, ops);
+}
+
+/* "Or" boolean reduction across SIMT lanes: return non-zero in all lanes if
+   any lane supplies a non-zero argument.  */
+
+static void
+expand_GOMP_SIMT_VOTE_ANY (internal_fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  if (!lhs)
+    return;
+
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx cond = expand_normal (gimple_call_arg (stmt, 0));
+  machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+  struct expand_operand ops[2];
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], cond, mode);
+  gcc_assert (targetm.have_omp_simt_vote_any ());
+  expand_insn (targetm.code_for_omp_simt_vote_any, 2, ops);
+}
+
+/* Exchange between SIMT lanes with a "butterfly" pattern: source lane index
+   is destination lane index XOR given offset.  */
+
+static void
+expand_GOMP_SIMT_XCHG_BFLY (internal_fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  if (!lhs)
+    return;
+
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx src = expand_normal (gimple_call_arg (stmt, 0));
+  rtx idx = expand_normal (gimple_call_arg (stmt, 1));
+  machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+  struct expand_operand ops[3];
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], src, mode);
+  create_input_operand (&ops[2], idx, SImode);
+  gcc_assert (targetm.have_omp_simt_xchg_bfly ());
+  expand_insn (targetm.code_for_omp_simt_xchg_bfly, 3, ops);
+}
+
+/* Exchange between SIMT lanes according to given source lane index.  */
+
+static void
+expand_GOMP_SIMT_XCHG_IDX (internal_fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  if (!lhs)
+    return;
+
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx src = expand_normal (gimple_call_arg (stmt, 0));
+  rtx idx = expand_normal (gimple_call_arg (stmt, 1));
+  machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+  struct expand_operand ops[3];
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], src, mode);
+  create_input_operand (&ops[2], idx, SImode);
+  gcc_assert (targetm.have_omp_simt_xchg_idx ());
+  expand_insn (targetm.code_for_omp_simt_xchg_idx, 3, ops);
+}
+
 /* This should get expanded in adjust_simduid_builtins.  */
 
 static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 0869b2f..77ce63a 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -141,6 +141,13 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST, ffs, unary)
 DEF_INTERNAL_INT_FN (PARITY, ECF_CONST, parity, unary)
 DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST, popcount, unary)
 
+DEF_INTERNAL_FN (GOMP_SIMT_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_VF, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_LAST_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, 
NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_ORDERED_PRED, ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_VOTE_ANY, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_XCHG_BFLY, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, 
NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_XCHG_IDX, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index e5b9e4c..da5476b 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -2427,6 +2427,20 @@ cilk_for_check_loop_diff_type (tree type)
     }
 }
 
+/* Return true if CTX may belong to offloaded code: either if current function
+   is offloaded, or any enclosing context corresponds to a target region.  */
+
+static bool
+omp_maybe_offloaded_ctx (omp_context *ctx)
+{
+  if (cgraph_node::get (current_function_decl)->offloadable)
+    return true;
+  for (; ctx; ctx = ctx->outer)
+    if (is_gimple_omp_offloaded (ctx->stmt))
+      return true;
+  return false;
+}
+
 /* Build a decl for the omp child function.  It'll not contain a body
    yet, just the bare decl.  */
 
@@ -2475,28 +2489,24 @@ create_omp_child_function (omp_context *ctx, bool 
task_copy)
   DECL_CONTEXT (decl) = NULL_TREE;
   DECL_INITIAL (decl) = make_node (BLOCK);
   BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl;
-  if (cgraph_node::get (current_function_decl)->offloadable)
-    cgraph_node::get_create (decl)->offloadable = 1;
-  else
+  if (omp_maybe_offloaded_ctx (ctx))
     {
-      omp_context *octx;
-      for (octx = ctx; octx; octx = octx->outer)
-       if (is_gimple_omp_offloaded (octx->stmt))
-         {
-           cgraph_node::get_create (decl)->offloadable = 1;
-           if (ENABLE_OFFLOADING)
-             g->have_offload = true;
-
-           break;
-         }
+      cgraph_node::get_create (decl)->offloadable = 1;
+      if (ENABLE_OFFLOADING)
+       g->have_offload = true;
     }
 
   if (cgraph_node::get_create (decl)->offloadable
       && !lookup_attribute ("omp declare target",
                            DECL_ATTRIBUTES (current_function_decl)))
-    DECL_ATTRIBUTES (decl)
-      = tree_cons (get_identifier ("omp target entrypoint"),
-                   NULL_TREE, DECL_ATTRIBUTES (decl));
+    {
+      const char *target_attr = (is_gimple_omp_offloaded (ctx->stmt)
+                                ? "omp target entrypoint"
+                                : "omp declare target");
+      DECL_ATTRIBUTES (decl)
+       = tree_cons (get_identifier (target_attr),
+                    NULL_TREE, DECL_ATTRIBUTES (decl));
+    }
 
   t = build_decl (DECL_SOURCE_LOCATION (decl),
                  RESULT_DECL, NULL_TREE, void_type_node);
@@ -4264,6 +4274,25 @@ omp_clause_aligned_alignment (tree clause)
   return build_int_cst (integer_type_node, al);
 }
 
+
+/* Return maximum SIMT width if offloading may target SIMT hardware.  */
+
+static int
+omp_max_simt_vf (void)
+{
+  if (!optimize)
+    return 0;
+  if (ENABLE_OFFLOADING)
+    for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c; )
+      {
+       if (!strncmp (c, "nvptx", strlen ("nvptx")))
+         return 32;
+       else if ((c = strchr (c, ',')))
+         c++;
+      }
+  return 0;
+}
+
 /* Return maximum possible vectorization factor for the target.  */
 
 static int
@@ -4277,16 +4306,18 @@ omp_max_vf (void)
               || global_options_set.x_flag_tree_vectorize)))
     return 1;
 
+  int vf = 1;
   int vs = targetm.vectorize.autovectorize_vector_sizes ();
   if (vs)
+    vf = 1 << floor_log2 (vs);
+  else
     {
-      vs = 1 << floor_log2 (vs);
-      return vs;
+      machine_mode vqimode = targetm.vectorize.preferred_simd_mode (QImode);
+      if (GET_MODE_CLASS (vqimode) == MODE_VECTOR_INT)
+       vf = GET_MODE_NUNITS (vqimode);
     }
-  machine_mode vqimode = targetm.vectorize.preferred_simd_mode (QImode);
-  if (GET_MODE_CLASS (vqimode) == MODE_VECTOR_INT)
-    return GET_MODE_NUNITS (vqimode);
-  return 1;
+  int svf = omp_max_simt_vf ();
+  return MAX (vf, svf);
 }
 
 /* Helper function of lower_rec_input_clauses, used for #pragma omp simd
@@ -4374,10 +4405,13 @@ lower_rec_input_clauses (tree clauses, gimple_seq 
*ilist, gimple_seq *dlist,
   int pass;
   bool is_simd = (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR
                  && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD);
+  bool maybe_simt
+    = is_simd && omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1;
   int max_vf = 0;
   tree lane = NULL_TREE, idx = NULL_TREE;
+  tree simt_lane = NULL_TREE;
   tree ivar = NULL_TREE, lvar = NULL_TREE;
-  gimple_seq llist[2] = { NULL, NULL };
+  gimple_seq llist[3] = { };
 
   copyin_seq = NULL;
 
@@ -5251,6 +5285,16 @@ lower_rec_input_clauses (tree clauses, gimple_seq 
*ilist, gimple_seq *dlist,
 
                      gimplify_assign (unshare_expr (ivar), x, &llist[0]);
 
+                     if (maybe_simt)
+                       {
+                         if (!simt_lane)
+                           simt_lane = create_tmp_var (unsigned_type_node);
+                         x = build_call_expr_internal_loc
+                           (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_BFLY,
+                            TREE_TYPE (ivar), 2, ivar, simt_lane);
+                         x = build2 (code, TREE_TYPE (ivar), ivar, x);
+                         gimplify_assign (ivar, x, &llist[2]);
+                       }
                      x = build2 (code, TREE_TYPE (ref), ref, ivar);
                      ref = build_outer_var_ref (var, ctx);
                      gimplify_assign (ref, x, &llist[1]);
@@ -5303,6 +5347,39 @@ lower_rec_input_clauses (tree clauses, gimple_seq 
*ilist, gimple_seq *dlist,
       g = gimple_build_assign (lane, INTEGER_CST,
                               build_int_cst (unsigned_type_node, 0));
       gimple_seq_add_stmt (ilist, g);
+      /* Emit reductions across SIMT lanes in log_2(simt_vf) steps.  */
+      if (llist[2])
+       {
+         tree simt_vf = create_tmp_var (unsigned_type_node);
+         g = gimple_build_call_internal (IFN_GOMP_SIMT_VF, 0);
+         gimple_call_set_lhs (g, simt_vf);
+         gimple_seq_add_stmt (dlist, g);
+
+         tree t = build_int_cst (unsigned_type_node, 1);
+         g = gimple_build_assign (simt_lane, INTEGER_CST, t);
+         gimple_seq_add_stmt (dlist, g);
+
+         t = build_int_cst (unsigned_type_node, 0);
+         g = gimple_build_assign (idx, INTEGER_CST, t);
+         gimple_seq_add_stmt (dlist, g);
+
+         tree body = create_artificial_label (UNKNOWN_LOCATION);
+         tree header = create_artificial_label (UNKNOWN_LOCATION);
+         tree end = create_artificial_label (UNKNOWN_LOCATION);
+         gimple_seq_add_stmt (dlist, gimple_build_goto (header));
+         gimple_seq_add_stmt (dlist, gimple_build_label (body));
+
+         gimple_seq_add_seq (dlist, llist[2]);
+
+         g = gimple_build_assign (simt_lane, LSHIFT_EXPR, simt_lane, 
integer_one_node);
+         gimple_seq_add_stmt (dlist, g);
+
+         gimple_seq_add_stmt (dlist, gimple_build_label (header));
+         g = gimple_build_cond (LT_EXPR, simt_lane, simt_vf, body, end);
+         gimple_seq_add_stmt (dlist, g);
+
+         gimple_seq_add_stmt (dlist, gimple_build_label (end));
+       }
       for (int i = 0; i < 2; i++)
        if (llist[i])
          {
@@ -5389,7 +5466,7 @@ lower_lastprivate_clauses (tree clauses, tree predicate, 
gimple_seq *stmt_list,
 {
   tree x, c, label = NULL, orig_clauses = clauses;
   bool par_clauses = false;
-  tree simduid = NULL, lastlane = NULL;
+  tree simduid = NULL, lastlane = NULL, simtcond = NULL, simtlast = NULL;
 
   /* Early exit if there are no lastprivate or linear clauses.  */
   for (; clauses ; clauses = OMP_CLAUSE_CHAIN (clauses))
@@ -5416,6 +5493,16 @@ lower_lastprivate_clauses (tree clauses, tree predicate, 
gimple_seq *stmt_list,
       par_clauses = true;
     }
 
+  bool maybe_simt = false;
+  if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR
+      && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD)
+    {
+      maybe_simt = omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1;
+      simduid = find_omp_clause (orig_clauses, OMP_CLAUSE__SIMDUID_);
+      if (simduid)
+       simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
+    }
+
   if (predicate)
     {
       gcond *stmt;
@@ -5427,20 +5514,27 @@ lower_lastprivate_clauses (tree clauses, tree 
predicate, gimple_seq *stmt_list,
       arm2 = TREE_OPERAND (predicate, 1);
       gimplify_expr (&arm1, stmt_list, NULL, is_gimple_val, fb_rvalue);
       gimplify_expr (&arm2, stmt_list, NULL, is_gimple_val, fb_rvalue);
-      stmt = gimple_build_cond (TREE_CODE (predicate), arm1, arm2,
-                               label_true, label);
+      if (maybe_simt)
+       {
+         c = build2 (TREE_CODE (predicate), boolean_type_node, arm1, arm2);
+         c = fold_convert (integer_type_node, c);
+         simtcond = create_tmp_var (integer_type_node);
+         gimplify_assign (simtcond, c, stmt_list);
+         gcall *g = gimple_build_call_internal (IFN_GOMP_SIMT_VOTE_ANY,
+                                                1, simtcond);
+         c = create_tmp_var (integer_type_node);
+         gimple_call_set_lhs (g, c);
+         gimple_seq_add_stmt (stmt_list, g);
+         stmt = gimple_build_cond (NE_EXPR, c, integer_zero_node,
+                                   label_true, label);
+       }
+      else
+       stmt = gimple_build_cond (TREE_CODE (predicate), arm1, arm2,
+                                 label_true, label);
       gimple_seq_add_stmt (stmt_list, stmt);
       gimple_seq_add_stmt (stmt_list, gimple_build_label (label_true));
     }
 
-  if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR
-      && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD)
-    {
-      simduid = find_omp_clause (orig_clauses, OMP_CLAUSE__SIMDUID_);
-      if (simduid)
-       simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
-    }
-
   for (c = clauses; c ;)
     {
       tree var, new_var;
@@ -5491,6 +5585,24 @@ lower_lastprivate_clauses (tree clauses, tree predicate, 
gimple_seq *stmt_list,
                  new_var = build4 (ARRAY_REF, TREE_TYPE (val),
                                    TREE_OPERAND (val, 0), lastlane,
                                    NULL_TREE, NULL_TREE);
+                 if (maybe_simt)
+                   {
+                     gcall *g;
+                     if (simtlast == NULL)
+                       {
+                         simtlast = create_tmp_var (unsigned_type_node);
+                         g = gimple_build_call_internal
+                           (IFN_GOMP_SIMT_LAST_LANE, 1, simtcond);
+                         gimple_call_set_lhs (g, simtlast);
+                         gimple_seq_add_stmt (stmt_list, g);
+                       }
+                     x = build_call_expr_internal_loc
+                       (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_IDX,
+                        TREE_TYPE (new_var), 2, new_var, simtlast);
+                     new_var = unshare_expr (new_var);
+                     gimplify_assign (new_var, x, stmt_list);
+                     new_var = unshare_expr (new_var);
+                   }
                }
            }
 
@@ -10498,12 +10610,23 @@ expand_omp_simd (struct omp_region *region, struct 
omp_for_data *fd)
   edge e, ne;
   tree *counts = NULL;
   int i;
+  int safelen_int = INT_MAX;
   tree safelen = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt),
                                  OMP_CLAUSE_SAFELEN);
   tree simduid = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt),
                                  OMP_CLAUSE__SIMDUID_);
   tree n1, n2;
 
+  if (safelen)
+    {
+      safelen = OMP_CLAUSE_SAFELEN_EXPR (safelen);
+      if (TREE_CODE (safelen) != INTEGER_CST)
+       safelen_int = 0;
+      else if (tree_fits_uhwi_p (safelen) && tree_to_uhwi (safelen) < INT_MAX)
+       safelen_int = tree_to_uhwi (safelen);
+      if (safelen_int == 1)
+       safelen_int = 0;
+    }
   type = TREE_TYPE (fd->loop.v);
   entry_bb = region->entry;
   cont_bb = region->cont;
@@ -10557,20 +10680,53 @@ expand_omp_simd (struct omp_region *region, struct 
omp_for_data *fd)
                                OMP_CLAUSE__LOOPTEMP_);
       gcc_assert (innerc);
       n2 = OMP_CLAUSE_DECL (innerc);
-      expand_omp_build_assign (&gsi, fd->loop.v,
-                              fold_convert (type, n1));
+    }
+  tree step = fd->loop.step;
+
+  bool offloaded = cgraph_node::get (current_function_decl)->offloadable;
+  for (struct omp_region *rgn = region; !offloaded && rgn; rgn = rgn->outer)
+    offloaded = rgn->type == GIMPLE_OMP_TARGET;
+  bool is_simt = offloaded && omp_max_simt_vf () > 1 && safelen_int > 1;
+  tree simt_lane = NULL_TREE, simt_maxlane = NULL_TREE;
+  if (is_simt)
+    {
+      cfun->curr_properties &= ~PROP_gimple_lomp_dev;
+      simt_lane = create_tmp_var (unsigned_type_node);
+      gimple *g = gimple_build_call_internal (IFN_GOMP_SIMT_LANE, 0);
+      gimple_call_set_lhs (g, simt_lane);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      tree offset = fold_build2 (MULT_EXPR, TREE_TYPE (step), step,
+                                fold_convert (TREE_TYPE (step), simt_lane));
+      n1 = fold_convert (type, n1);
+      if (POINTER_TYPE_P (type))
+       n1 = fold_build_pointer_plus (n1, offset);
+      else
+       n1 = fold_build2 (PLUS_EXPR, type, n1, fold_convert (type, offset));
+
+      /* Collapsed loops not handled for SIMT yet: limit to one lane only.  */
       if (fd->collapse > 1)
+       simt_maxlane = build_one_cst (unsigned_type_node);
+      else if (safelen_int < omp_max_simt_vf ())
+       simt_maxlane = build_int_cst (unsigned_type_node, safelen_int);
+      tree vf
+       = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_GOMP_SIMT_VF,
+                                       unsigned_type_node, 0);
+      if (simt_maxlane)
+       vf = fold_build2 (MIN_EXPR, unsigned_type_node, vf, simt_maxlane);
+      vf = fold_convert (TREE_TYPE (step), vf);
+      step = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, vf);
+    }
+
+  expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1));
+  if (fd->collapse > 1)
+    {
+      if (gimple_omp_for_combined_into_p (fd->for_stmt))
        {
          gsi_prev (&gsi);
          expand_omp_for_init_vars (fd, &gsi, counts, NULL, n1);
          gsi_next (&gsi);
        }
-    }
-  else
-    {
-      expand_omp_build_assign (&gsi, fd->loop.v,
-                              fold_convert (type, fd->loop.n1));
-      if (fd->collapse > 1)
+      else
        for (i = 0; i < fd->collapse; i++)
          {
            tree itype = TREE_TYPE (fd->loops[i].v);
@@ -10579,7 +10735,7 @@ expand_omp_simd (struct omp_region *region, struct 
omp_for_data *fd)
            t = fold_convert (TREE_TYPE (fd->loops[i].v), fd->loops[i].n1);
            expand_omp_build_assign (&gsi, fd->loops[i].v, t);
          }
-      }
+    }
 
   /* Remove the GIMPLE_OMP_FOR statement.  */
   gsi_remove (&gsi, true);
@@ -10592,9 +10748,9 @@ expand_omp_simd (struct omp_region *region, struct 
omp_for_data *fd)
       gcc_assert (gimple_code (stmt) == GIMPLE_OMP_CONTINUE);
 
       if (POINTER_TYPE_P (type))
-       t = fold_build_pointer_plus (fd->loop.v, fd->loop.step);
+       t = fold_build_pointer_plus (fd->loop.v, step);
       else
-       t = fold_build2 (PLUS_EXPR, type, fd->loop.v, fd->loop.step);
+       t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step);
       expand_omp_build_assign (&gsi, fd->loop.v, t);
 
       if (fd->collapse > 1)
@@ -10668,6 +10824,18 @@ expand_omp_simd (struct omp_region *region, struct 
omp_for_data *fd)
       gimple_regimplify_operands (cond_stmt, &gsi);
     }
 
+  /* Add 'V -= STEP * (SIMT_VF - 1)' after the loop.  */
+  if (is_simt)
+    {
+      gsi = gsi_start_bb (l2_bb);
+      step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), fd->loop.step, step);
+      if (POINTER_TYPE_P (type))
+       t = fold_build_pointer_plus (fd->loop.v, step);
+      else
+       t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step);
+      expand_omp_build_assign (&gsi, fd->loop.v, t);
+    }
+
   /* Remove GIMPLE_OMP_RETURN.  */
   gsi = gsi_last_bb (exit_bb);
   gsi_remove (&gsi, true);
@@ -10697,30 +10865,29 @@ expand_omp_simd (struct omp_region *region, struct 
omp_for_data *fd)
   ne->probability = REG_BR_PROB_BASE / 8;
 
   set_immediate_dominator (CDI_DOMINATORS, l1_bb, entry_bb);
-  set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb);
   set_immediate_dominator (CDI_DOMINATORS, l0_bb, l1_bb);
 
+  if (simt_maxlane)
+    {
+      cond_stmt = gimple_build_cond (LT_EXPR, simt_lane, simt_maxlane,
+                                    NULL_TREE, NULL_TREE);
+      gsi = gsi_last_bb (entry_bb);
+      gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
+      make_edge (entry_bb, l2_bb, EDGE_FALSE_VALUE);
+      FALLTHRU_EDGE (entry_bb)->flags = EDGE_TRUE_VALUE;
+      FALLTHRU_EDGE (entry_bb)->probability = REG_BR_PROB_BASE * 7 / 8;
+      BRANCH_EDGE (entry_bb)->probability = REG_BR_PROB_BASE / 8;
+      l2_dom_bb = entry_bb;
+    }
+  set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb);
+
   if (!broken_loop)
     {
       struct loop *loop = alloc_loop ();
       loop->header = l1_bb;
       loop->latch = cont_bb;
       add_loop (loop, l1_bb->loop_father);
-      if (safelen == NULL_TREE)
-       loop->safelen = INT_MAX;
-      else
-       {
-         safelen = OMP_CLAUSE_SAFELEN_EXPR (safelen);
-         if (TREE_CODE (safelen) != INTEGER_CST)
-           loop->safelen = 0;
-         else if (!tree_fits_uhwi_p (safelen)
-                  || tree_to_uhwi (safelen) > INT_MAX)
-           loop->safelen = INT_MAX;
-         else
-           loop->safelen = tree_to_uhwi (safelen);
-         if (loop->safelen == 1)
-           loop->safelen = 0;
-       }
+      loop->safelen = safelen_int;
       if (simduid)
        {
          loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
@@ -13885,7 +14052,6 @@ expand_omp (struct omp_region *region)
     }
 }
 
-
 /* Helper for build_omp_regions.  Scan the dominator tree starting at
    block BB.  PARENT is the region that contains BB.  If SINGLE_TREE is
    true, the function ends once a single tree is built (otherwise, whole
@@ -14768,12 +14934,14 @@ static void
 lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 {
   tree block;
-  gimple *stmt = gsi_stmt (*gsi_p);
+  gimple *stmt = gsi_stmt (*gsi_p), *g;
   gomp_ordered *ord_stmt = as_a <gomp_ordered *> (stmt);
   gcall *x;
   gbind *bind;
   bool simd = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt),
                               OMP_CLAUSE_SIMD);
+  bool maybe_simt
+    = simd && omp_maybe_offloaded_ctx (ctx) && omp_max_simt_vf () > 1;
   bool threads = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt),
                                  OMP_CLAUSE_THREADS);
 
@@ -14807,11 +14975,56 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, 
omp_context *ctx)
                           0);
   gimple_bind_add_stmt (bind, x);
 
+  tree counter = NULL_TREE, test = NULL_TREE, body = NULL_TREE;
+  if (maybe_simt)
+    {
+      counter = create_tmp_var (integer_type_node);
+      g = gimple_build_call_internal (IFN_GOMP_SIMT_LANE, 0);
+      gimple_call_set_lhs (g, counter);
+      gimple_bind_add_stmt (bind, g);
+
+      body = create_artificial_label (UNKNOWN_LOCATION);
+      test = create_artificial_label (UNKNOWN_LOCATION);
+      gimple_bind_add_stmt (bind, gimple_build_label (body));
+
+      tree simt_pred = create_tmp_var (integer_type_node);
+      g = gimple_build_call_internal (IFN_GOMP_SIMT_ORDERED_PRED, 1, counter);
+      gimple_call_set_lhs (g, simt_pred);
+      gimple_bind_add_stmt (bind, g);
+
+      tree t = create_artificial_label (UNKNOWN_LOCATION);
+      g = gimple_build_cond (EQ_EXPR, simt_pred, integer_zero_node, t, test);
+      gimple_bind_add_stmt (bind, g);
+
+      gimple_bind_add_stmt (bind, gimple_build_label (t));
+    }
   lower_omp (gimple_omp_body_ptr (stmt), ctx);
   gimple_omp_set_body (stmt, maybe_catch_exception (gimple_omp_body (stmt)));
   gimple_bind_add_seq (bind, gimple_omp_body (stmt));
   gimple_omp_set_body (stmt, NULL);
 
+  if (maybe_simt)
+    {
+      gimple_bind_add_stmt (bind, gimple_build_label (test));
+      g = gimple_build_assign (counter, MINUS_EXPR, counter, integer_one_node);
+      gimple_bind_add_stmt (bind, g);
+
+      tree c = build2 (GE_EXPR, boolean_type_node, counter, integer_zero_node);
+      tree nonneg = create_tmp_var (integer_type_node);
+      gimple_seq tseq = NULL;
+      gimplify_assign (nonneg, fold_convert (integer_type_node, c), &tseq);
+      gimple_bind_add_seq (bind, tseq);
+
+      g = gimple_build_call_internal (IFN_GOMP_SIMT_VOTE_ANY, 1, nonneg);
+      gimple_call_set_lhs (g, nonneg);
+      gimple_bind_add_stmt (bind, g);
+
+      tree end = create_artificial_label (UNKNOWN_LOCATION);
+      g = gimple_build_cond (NE_EXPR, nonneg, integer_zero_node, body, end);
+      gimple_bind_add_stmt (bind, g);
+
+      gimple_bind_add_stmt (bind, gimple_build_label (end));
+    }
   if (simd)
     x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_END, 1,
                                    build_int_cst (NULL_TREE, threads));
@@ -17932,7 +18145,7 @@ const pass_data pass_data_lower_omp =
   OPTGROUP_NONE, /* optinfo_flags */
   TV_NONE, /* tv_id */
   PROP_gimple_any, /* properties_required */
-  PROP_gimple_lomp, /* properties_provided */
+  PROP_gimple_lomp | PROP_gimple_lomp_dev, /* properties_provided */
   0, /* properties_destroyed */
   0, /* todo_flags_start */
   0, /* todo_flags_finish */
@@ -19864,6 +20077,109 @@ make_pass_oacc_device_lower (gcc::context *ctxt)
 {
   return new pass_oacc_device_lower (ctxt);
 }
+
+
+/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
+   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
+   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
+   internal functions on non-SIMT targets, and likewise some SIMD internal
+   functions on SIMT targets.  */
+
+static unsigned int
+execute_omp_device_lower ()
+{
+  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+  FOR_EACH_BB_FN (bb, cfun)
+    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+       gimple *stmt = gsi_stmt (gsi);
+       if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
+         continue;
+       tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
+       tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
+       switch (gimple_call_internal_fn (stmt))
+         {
+         case IFN_GOMP_SIMT_LANE:
+         case IFN_GOMP_SIMT_LAST_LANE:
+           rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
+           break;
+         case IFN_GOMP_SIMT_VF:
+           rhs = build_int_cst (type, vf);
+           break;
+         case IFN_GOMP_SIMT_ORDERED_PRED:
+           rhs = vf == 1 ? integer_zero_node : NULL_TREE;
+           if (rhs || !lhs)
+             unlink_stmt_vdef (stmt);
+           break;
+         case IFN_GOMP_SIMT_VOTE_ANY:
+         case IFN_GOMP_SIMT_XCHG_BFLY:
+         case IFN_GOMP_SIMT_XCHG_IDX:
+           rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
+           break;
+         case IFN_GOMP_SIMD_LANE:
+         case IFN_GOMP_SIMD_LAST_LANE:
+           rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
+           break;
+         case IFN_GOMP_SIMD_VF:
+           rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
+           break;
+         default:
+           continue;
+         }
+       if (lhs && !rhs)
+         continue;
+       stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
+       gsi_replace (&gsi, stmt, false);
+      }
+  if (vf != 1)
+    cfun->has_force_vectorize_loops = false;
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_omp_device_lower =
+{
+  GIMPLE_PASS, /* type */
+  "ompdevlow", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  PROP_cfg, /* properties_required */
+  PROP_gimple_lomp_dev, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_omp_device_lower : public gimple_opt_pass
+{
+public:
+  pass_omp_device_lower (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *fun)
+    {
+      /* FIXME: inlining does not propagate the lomp_dev property.  */
+      return 1 || !(fun->curr_properties & PROP_gimple_lomp_dev);
+    }
+  virtual unsigned int execute (function *)
+    {
+      return execute_omp_device_lower ();
+    }
+
+}; // class pass_expand_omp_ssa
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_omp_device_lower (gcc::context *ctxt)
+{
+  return new pass_omp_device_lower (ctxt);
+}
 
 /* "omp declare target link" handling pass.  */
 
diff --git a/gcc/passes.def b/gcc/passes.def
index 85a5af0..2a470a7 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -183,6 +183,7 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_fixup_cfg);
   NEXT_PASS (pass_lower_eh_dispatch);
   NEXT_PASS (pass_oacc_device_lower);
+  NEXT_PASS (pass_omp_device_lower);
   NEXT_PASS (pass_omp_target_link);
   NEXT_PASS (pass_all_optimizations);
   PUSH_INSERT_PASSES_WITHIN (pass_all_optimizations)
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index da9ba13..8befebe 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -222,6 +222,7 @@ protected:
                                                   of math functions; the
                                                   current choices have
                                                   been optimized.  */
+#define PROP_gimple_lomp_dev   (1 << 16)       /* done omp_device_lower */
 
 #define PROP_trees \
   (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
@@ -417,6 +418,7 @@ extern gimple_opt_pass *make_pass_expand_omp (gcc::context 
*ctxt);
 extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_strlen (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_fold_builtins (gcc::context *ctxt);

Reply via email to