Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes

2018-12-13 Thread Julian Brown
On Tue, 4 Dec 2018 16:55:04 +0100
Tom de Vries  wrote:

> On 04-12-18 13:29, Jakub Jelinek wrote:
> > On Fri, Jun 29, 2018 at 11:19:53AM -0700, Cesar Philippidis wrote:  
> >> The attached patch includes the nvptx and GCC ME reductions
> >> enhancements.
> >>
> >> Is this patch OK for trunk? It bootstrapped / regression tested
> >> cleanly for x86_64 with nvptx offloading.  
> > This is all OpenACC specific code not really shareable with OpenMP,
> > if Thomas (for middle-end) and Tom (for NVPTX backend) are ok with
> > it, it is ok for trunk.
> >   
> 
> Formatting needs to be fixed:
> ...
> There should be exactly one space between function name and
> parenthesis. 160:+  unsigned old_shift = DIM_SIZE(VECTOR);
> ...
> 
> Also, the updated patch does not address my comment about
> probabilities here
> ( https://gcc.gnu.org/ml/gcc-patches/2018-10/msg00325.html ): ...
> > +  /* Create the loop.  */
> > +  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;  
> 
> Edges need probabilities, as in nvptx_lockless_update,
> nvptx_lockfull_update and nvptx_goacc_reduction_init.
> ...

Something like the attached?

Tested alongside other revised patches in the series:

https://gcc.gnu.org/ml/gcc-patches/2018-12/msg00930.html
https://gcc.gnu.org/ml/gcc-patches/2018-12/msg00931.html

(except the lines adding edge probabilities, which I've
smoke-tested but haven't yet gone through a full test cycle).

Thanks,

Julian

ChangeLog

gcc/
* config/nvptx/nvptx.c (nvptx_propagate_unified): New.
(nvptx_split_blocks): Call it for cond_uni insn.
(nvptx_expand_cond_uni): New.
(enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
(nvptx_init_builtins): Initialize it.
(nvptx_expand_builtin):
(nvptx_generate_vector_shuffle): Change integral SHIFT operand to
tree BITS operand.
(nvptx_vector_reduction): New.
(nvptx_adjust_reduction_type): New.
(nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
(nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
(nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
Use it to adjust the type of ref_to_res.
(nvptx_goacc_reduction_teardown):
* config/nvptx/nvptx.md (cond_uni): New pattern.

commit 401876d422c4fa7f02c1b899e81568eea6ad7531
Author: Julian Brown 
Date:   Tue Dec 11 13:35:52 2018 -0800

Various OpenACC reduction enhancements - ME and nvptx changes

	gcc/
	* config/nvptx/nvptx.c (nvptx_propagate_unified): New.
	(nvptx_split_blocks): Call it for cond_uni insn.
	(nvptx_expand_cond_uni): New.
	(enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
	(nvptx_init_builtins): Initialize it.
	(nvptx_expand_builtin):
	(nvptx_generate_vector_shuffle): Change integral SHIFT operand to
	tree BITS operand.
	(nvptx_vector_reduction): New.
	(nvptx_adjust_reduction_type): New.
	(nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
	(nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
	Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_teardown):
	* config/nvptx/nvptx.md (cond_uni): New pattern.

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 9903a27..0023dad 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt ()
 }
 }
 
+/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
+   mark that as unified.  We expect to be in a single block.  */
+
+static void
+nvptx_propagate_unified (rtx_insn *unified)
+{
+  rtx_insn *probe = unified;
+  rtx cond_reg = SET_DEST (PATTERN (unified));
+  rtx pat = NULL_RTX;
+
+  /* Find the comparison.  (We could skip this and simply scan to the
+ block's terminating branch, if we didn't care for self
+ checking.)  */
+  for (;;)
+{
+  probe = next_real_insn (probe);
+  if (!probe)
+	break;
+  pat = PATTERN (probe);
+
+  if (GET_CODE (pat) == SET
+	  && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
+	  && XEXP (SET_SRC (pat), 0) == cond_reg)
+	break;
+  gcc_assert (NONJUMP_INSN_P (probe));
+}
+  gcc_assert (pat);
+  rtx pred_reg = SET_DEST (pat);
+
+  /* Find the branch.  */
+  do
+probe = NEXT_INSN (probe);
+  while (!JUMP_P (probe));
+
+  pat = PATTERN (probe);
+  rtx itec = XEXP (SET_SRC (pat), 0);
+  gcc_assert (XEXP (itec, 0) == pred_reg);
+
+  /* Mark the branch's condition as unified.  */
+  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
+			   UNSPEC_BR_UNIFIED);
+  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
+
+  gcc_assert (ok);
+}
+
 /* Loop structure of the function.  The entire function is described as
a NULL loop.  */
 
@@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
 	continue;
 	  

Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes

2018-12-04 Thread Tom de Vries
On 04-12-18 13:29, Jakub Jelinek wrote:
> On Fri, Jun 29, 2018 at 11:19:53AM -0700, Cesar Philippidis wrote:
>> The attached patch includes the nvptx and GCC ME reductions enhancements.
>>
>> Is this patch OK for trunk? It bootstrapped / regression tested cleanly
>> for x86_64 with nvptx offloading.
> This is all OpenACC specific code not really shareable with OpenMP, if
> Thomas (for middle-end) and Tom (for NVPTX backend) are ok with it, it is ok
> for trunk.
> 

Formatting needs to be fixed:
...
There should be exactly one space between function name and parenthesis.
160:+  unsigned old_shift = DIM_SIZE(VECTOR);
...

Also, the updated patch does not address my comment about probabilities
here ( https://gcc.gnu.org/ml/gcc-patches/2018-10/msg00325.html ):
...
> +  /* Create the loop.  */
> +  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;

Edges need probabilities, as in nvptx_lockless_update,
nvptx_lockfull_update and nvptx_goacc_reduction_init.
...

Thanks,
- Tom


Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes

2018-12-04 Thread Jakub Jelinek
On Fri, Jun 29, 2018 at 11:19:53AM -0700, Cesar Philippidis wrote:
> The attached patch includes the nvptx and GCC ME reductions enhancements.
> 
> Is this patch OK for trunk? It bootstrapped / regression tested cleanly
> for x86_64 with nvptx offloading.

This is all OpenACC specific code not really shareable with OpenMP, if
Thomas (for middle-end) and Tom (for NVPTX backend) are ok with it, it is ok
for trunk.

> 2018-06-29  Cesar Philippidis  
>   Nathan Sidwell  
> 
>   gcc/
>   * config/nvptx/nvptx.c (nvptx_propagate_unified): New.
>   (nvptx_split_blocks): Call it for cond_uni insn.
>   (nvptx_expand_cond_uni): New.
>   (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
>   (nvptx_init_builtins): Initialize it.
>   (nvptx_expand_builtin):
>   (nvptx_generate_vector_shuffle): Change integral SHIFT operand to
>   tree BITS operand.
>   (nvptx_vector_reduction): New.
>   (nvptx_adjust_reduction_type): New.
>   (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
>   (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
>   (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
>   Use it to adjust the type of ref_to_res.
>   (nvptx_goacc_reduction_teardown):
>   * config/nvptx/nvptx.md (cond_uni): New pattern.
>   * omp-general.h (enum oacc_loop_flags): Add OLF_REDUCTION enum.
>   * omp-low.c (lower_oacc_reductions): Handle reduction decls mapped
>   with GOMP_MAP_FIRSTPRIVATE_POINTER.
>   (lower_oacc_head_mark): Use OLF_REDUCTION to mark OpenACC reductions.
>   * omp-offload.c (oacc_loop_auto_partitions): Don't assign gang
>   level parallelism to orphan reductions.
>   (default_goacc_reduction): Retype ref_to_res as necessary.

Jakub


Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes

2018-10-30 Thread Cesar Philippidis
On 10/5/18 07:07, Tom de Vries wrote:
> On 6/29/18 8:19 PM, Cesar Philippidis wrote:
>> The attached patch includes the nvptx and GCC ME reductions enhancements.
>>
>> Is this patch OK for trunk? It bootstrapped / regression tested cleanly
>> for x86_64 with nvptx offloading.
>>
> 
> These need fixing:
> ...
> === ERROR type #5: trailing whitespace (4 error(s)) ===
> gcc/config/nvptx/nvptx.c:5139:0:██
> gcc/config/nvptx/nvptx.c:5660:8:  do█
> gcc/config/nvptx/nvptx.c:5702:0:██
> gcc/config/nvptx/nvptx.c:5726:0:██
> ...

Sorry. The attached patch fixes that.

> Otherwise, nvptx part LGTM.
Tomorrow's my last day at Mentor, so either Thomas or Julian will need
to commit it once the other patches get approved.

Thanks,
Cesar
	gcc/
	* config/nvptx/nvptx.c (nvptx_propagate_unified): New.
	(nvptx_split_blocks): Call it for cond_uni insn.
	(nvptx_expand_cond_uni): New.
	(enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
	(nvptx_init_builtins): Initialize it.
	(nvptx_expand_builtin):
	(nvptx_generate_vector_shuffle): Change integral SHIFT operand to
	tree BITS operand.
	(nvptx_vector_reduction): New.
	(nvptx_adjust_reduction_type): New.
	(nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
	(nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
	Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_teardown):
	* config/nvptx/nvptx.md (cond_uni): New pattern.

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 9903a273863..acb490a9a90 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt ()
 }
 }
 
+/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
+   mark that as unified.  We expect to be in a single block.  */
+
+static void
+nvptx_propagate_unified (rtx_insn *unified)
+{
+  rtx_insn *probe = unified;
+  rtx cond_reg = SET_DEST (PATTERN (unified));
+  rtx pat = NULL_RTX;
+
+  /* Find the comparison.  (We could skip this and simply scan to the
+ block's terminating branch, if we didn't care for self
+ checking.)  */
+  for (;;)
+{
+  probe = next_real_insn (probe);
+  if (!probe)
+	break;
+  pat = PATTERN (probe);
+
+  if (GET_CODE (pat) == SET
+	  && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
+	  && XEXP (SET_SRC (pat), 0) == cond_reg)
+	break;
+  gcc_assert (NONJUMP_INSN_P (probe));
+}
+  gcc_assert (pat);
+  rtx pred_reg = SET_DEST (pat);
+
+  /* Find the branch.  */
+  do
+probe = NEXT_INSN (probe);
+  while (!JUMP_P (probe));
+
+  pat = PATTERN (probe);
+  rtx itec = XEXP (SET_SRC (pat), 0);
+  gcc_assert (XEXP (itec, 0) == pred_reg);
+
+  /* Mark the branch's condition as unified.  */
+  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
+			   UNSPEC_BR_UNIFIED);
+  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
+
+  gcc_assert (ok);
+}
+
 /* Loop structure of the function.  The entire function is described as
a NULL loop.  */
 
@@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
 	continue;
 	  switch (recog_memoized (insn))
 	{
+	case CODE_FOR_cond_uni:
+	  nvptx_propagate_unified (insn);
+	  /* FALLTHROUGH */
 	default:
 	  seen_insn = true;
 	  continue;
@@ -5083,6 +5132,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target,
   return target;
 }
 
+/* Expander for the compare unified builtin.  */
+
+static rtx
+nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
+{
+  if (ignore)
+return target;
+
+  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
+			 NULL_RTX, mode, EXPAND_NORMAL);
+
+  emit_insn (gen_cond_uni (target, src));
+
+  return target;
+}
 
 /* Codes for all the NVPTX builtins.  */
 enum nvptx_builtins
@@ -5092,6 +5156,7 @@ enum nvptx_builtins
   NVPTX_BUILTIN_WORKER_ADDR,
   NVPTX_BUILTIN_CMP_SWAP,
   NVPTX_BUILTIN_CMP_SWAPLL,
+  NVPTX_BUILTIN_COND_UNI,
   NVPTX_BUILTIN_MAX
 };
 
@@ -5129,6 +5194,7 @@ nvptx_init_builtins (void)
(PTRVOID, ST, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
+  DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE));
 
 #undef DEF
 #undef ST
@@ -5161,6 +5227,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 case NVPTX_BUILTIN_CMP_SWAPLL:
   return nvptx_expand_cmp_swap (exp, target, mode, ignore);
 
+case NVPTX_BUILTIN_COND_UNI:
+  return nvptx_expand_cond_uni (exp, target, mode, ignore);
+
 default: gcc_unreachable ();
 }
 }
@@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset)
 
 static void
 nvptx_generate_vector_shuffle (location_t loc,
-			   tree dest_var, tree var, unsigned shift,
+			   tree dest_var, tree var, tree bits,
 			   gimple_seq *seq)
 

Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes

2018-10-05 Thread Tom de Vries
On 6/29/18 8:19 PM, Cesar Philippidis wrote:
> The attached patch includes the nvptx and GCC ME reductions enhancements.
> 
> Is this patch OK for trunk? It bootstrapped / regression tested cleanly
> for x86_64 with nvptx offloading.
> 

These need fixing:
...
=== ERROR type #5: trailing whitespace (4 error(s)) ===
gcc/config/nvptx/nvptx.c:5139:0:██
gcc/config/nvptx/nvptx.c:5660:8:  do█
gcc/config/nvptx/nvptx.c:5702:0:██
gcc/config/nvptx/nvptx.c:5726:0:██
...


>   gcc/
>   * config/nvptx/nvptx.c (nvptx_propagate_unified): New.
>   (nvptx_split_blocks): Call it for cond_uni insn.
>   (nvptx_expand_cond_uni): New.
>   (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
>   (nvptx_init_builtins): Initialize it.
>   (nvptx_expand_builtin):
>   (nvptx_generate_vector_shuffle): Change integral SHIFT operand to
>   tree BITS operand.
>   (nvptx_vector_reduction): New.
>   (nvptx_adjust_reduction_type): New.
>   (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
>   (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
>   (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
>   Use it to adjust the type of ref_to_res.
>   (nvptx_goacc_reduction_teardown):
>   * config/nvptx/nvptx.md (cond_uni): New pattern.

> diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
> index 5608bee8a8d..33ec3db1153 100644
> --- a/gcc/config/nvptx/nvptx.c
> +++ b/gcc/config/nvptx/nvptx.c
> @@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt ()
>  }
>  }
>  
> +/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
> +   mark that as unified.  We expect to be in a single block.  */
> +
> +static void
> +nvptx_propagate_unified (rtx_insn *unified)
> +{
> +  rtx_insn *probe = unified;
> +  rtx cond_reg = SET_DEST (PATTERN (unified));
> +  rtx pat = NULL_RTX;
> +
> +  /* Find the comparison.  (We could skip this and simply scan to the
> + block's terminating branch, if we didn't care for self
> + checking.)  */
> +  for (;;)
> +{
> +  probe = next_real_insn (probe);
> +  if (!probe)
> + break;
> +  pat = PATTERN (probe);
> +
> +  if (GET_CODE (pat) == SET
> +   && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
> +   && XEXP (SET_SRC (pat), 0) == cond_reg)
> + break;
> +  gcc_assert (NONJUMP_INSN_P (probe));
> +}
> +  gcc_assert (pat);
> +  rtx pred_reg = SET_DEST (pat);
> +
> +  /* Find the branch.  */
> +  do
> +probe = NEXT_INSN (probe);
> +  while (!JUMP_P (probe));
> +
> +  pat = PATTERN (probe);
> +  rtx itec = XEXP (SET_SRC (pat), 0);
> +  gcc_assert (XEXP (itec, 0) == pred_reg);
> +
> +  /* Mark the branch's condition as unified.  */
> +  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
> +UNSPEC_BR_UNIFIED);
> +  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
> +
> +  gcc_assert (ok);
> +}
> +
>  /* Loop structure of the function.  The entire function is described as
> a NULL loop.  */
>  
> @@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
>   continue;
> switch (recog_memoized (insn))
>   {
> + case CODE_FOR_cond_uni:
> +   nvptx_propagate_unified (insn);
> +   /* FALLTHROUGH */
>   default:
> seen_insn = true;
> continue;
> @@ -5080,6 +5129,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target,
>return target;
>  }
>  
> +/* Expander for the compare unified builtin.  */
> +
> +static rtx
> +nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
> +{
> +  if (ignore)
> +return target;
> +  
> +  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
> +  NULL_RTX, mode, EXPAND_NORMAL);
> +
> +  emit_insn (gen_cond_uni (target, src));
> +
> +  return target;
> +}
>  
>  /* Codes for all the NVPTX builtins.  */
>  enum nvptx_builtins
> @@ -5089,6 +5153,7 @@ enum nvptx_builtins
>NVPTX_BUILTIN_WORKER_ADDR,
>NVPTX_BUILTIN_CMP_SWAP,
>NVPTX_BUILTIN_CMP_SWAPLL,
> +  NVPTX_BUILTIN_COND_UNI,
>NVPTX_BUILTIN_MAX
>  };
>  
> @@ -5126,6 +5191,7 @@ nvptx_init_builtins (void)
> (PTRVOID, ST, UINT, UINT, NULL_TREE));
>DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
>DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, 
> NULL_TREE));
> +  DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, 
> NULL_TREE));
>  
>  #undef DEF
>  #undef ST
> @@ -5158,6 +5224,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx 
> ARG_UNUSED (subtarget),
>  case NVPTX_BUILTIN_CMP_SWAPLL:
>return nvptx_expand_cmp_swap (exp, target, mode, ignore);
>  
> +case NVPTX_BUILTIN_COND_UNI:
> +  return nvptx_expand_cond_uni (exp, target, mode, ignore);
> +
>  default: gcc_unreachable ();
>  }
>  }
> @@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset)
>  
> 

Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes

2018-06-29 Thread Cesar Philippidis
The attached patch includes the nvptx and GCC ME reductions enhancements.

Is this patch OK for trunk? It bootstrapped / regression tested cleanly
for x86_64 with nvptx offloading.

Thanks,
Cesar
2018-06-29  Cesar Philippidis  
	Nathan Sidwell  

	gcc/
	* config/nvptx/nvptx.c (nvptx_propagate_unified): New.
	(nvptx_split_blocks): Call it for cond_uni insn.
	(nvptx_expand_cond_uni): New.
	(enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
	(nvptx_init_builtins): Initialize it.
	(nvptx_expand_builtin):
	(nvptx_generate_vector_shuffle): Change integral SHIFT operand to
	tree BITS operand.
	(nvptx_vector_reduction): New.
	(nvptx_adjust_reduction_type): New.
	(nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
	(nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
	Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_teardown):
	* config/nvptx/nvptx.md (cond_uni): New pattern.
	* omp-general.h (enum oacc_loop_flags): Add OLF_REDUCTION enum.
	* omp-low.c (lower_oacc_reductions): Handle reduction decls mapped
	with GOMP_MAP_FIRSTPRIVATE_POINTER.
	(lower_oacc_head_mark): Use OLF_REDUCTION to mark OpenACC reductions.
	* omp-offload.c (oacc_loop_auto_partitions): Don't assign gang
	level parallelism to orphan reductions.
	(default_goacc_reduction): Retype ref_to_res as necessary.

---
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 5608bee8a8d..33ec3db1153 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt ()
 }
 }
 
+/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
+   mark that as unified.  We expect to be in a single block.  */
+
+static void
+nvptx_propagate_unified (rtx_insn *unified)
+{
+  rtx_insn *probe = unified;
+  rtx cond_reg = SET_DEST (PATTERN (unified));
+  rtx pat = NULL_RTX;
+
+  /* Find the comparison.  (We could skip this and simply scan to the
+ block's terminating branch, if we didn't care for self
+ checking.)  */
+  for (;;)
+{
+  probe = next_real_insn (probe);
+  if (!probe)
+	break;
+  pat = PATTERN (probe);
+
+  if (GET_CODE (pat) == SET
+	  && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
+	  && XEXP (SET_SRC (pat), 0) == cond_reg)
+	break;
+  gcc_assert (NONJUMP_INSN_P (probe));
+}
+  gcc_assert (pat);
+  rtx pred_reg = SET_DEST (pat);
+
+  /* Find the branch.  */
+  do
+probe = NEXT_INSN (probe);
+  while (!JUMP_P (probe));
+
+  pat = PATTERN (probe);
+  rtx itec = XEXP (SET_SRC (pat), 0);
+  gcc_assert (XEXP (itec, 0) == pred_reg);
+
+  /* Mark the branch's condition as unified.  */
+  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
+			   UNSPEC_BR_UNIFIED);
+  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
+
+  gcc_assert (ok);
+}
+
 /* Loop structure of the function.  The entire function is described as
a NULL loop.  */
 
@@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
 	continue;
 	  switch (recog_memoized (insn))
 	{
+	case CODE_FOR_cond_uni:
+	  nvptx_propagate_unified (insn);
+	  /* FALLTHROUGH */
 	default:
 	  seen_insn = true;
 	  continue;
@@ -5080,6 +5129,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target,
   return target;
 }
 
+/* Expander for the compare unified builtin.  */
+
+static rtx
+nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
+{
+  if (ignore)
+return target;
+  
+  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
+			 NULL_RTX, mode, EXPAND_NORMAL);
+
+  emit_insn (gen_cond_uni (target, src));
+
+  return target;
+}
 
 /* Codes for all the NVPTX builtins.  */
 enum nvptx_builtins
@@ -5089,6 +5153,7 @@ enum nvptx_builtins
   NVPTX_BUILTIN_WORKER_ADDR,
   NVPTX_BUILTIN_CMP_SWAP,
   NVPTX_BUILTIN_CMP_SWAPLL,
+  NVPTX_BUILTIN_COND_UNI,
   NVPTX_BUILTIN_MAX
 };
 
@@ -5126,6 +5191,7 @@ nvptx_init_builtins (void)
(PTRVOID, ST, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
+  DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE));
 
 #undef DEF
 #undef ST
@@ -5158,6 +5224,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 case NVPTX_BUILTIN_CMP_SWAPLL:
   return nvptx_expand_cmp_swap (exp, target, mode, ignore);
 
+case NVPTX_BUILTIN_COND_UNI:
+  return nvptx_expand_cond_uni (exp, target, mode, ignore);
+
 default: gcc_unreachable ();
 }
 }
@@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset)
 
 static void
 nvptx_generate_vector_shuffle (location_t loc,
-			   tree dest_var, tree var, unsigned shift,
+			   tree dest_var, tree var, tree bits,
 			   gimple_seq *seq)
 {
   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
@@ -5307,7 +5376,6 @@