Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes

2018-10-30 Thread Cesar Philippidis
On 10/5/18 07:07, Tom de Vries wrote:
> On 6/29/18 8:19 PM, Cesar Philippidis wrote:
>> The attached patch includes the nvptx and GCC ME reductions enhancements.
>>
>> Is this patch OK for trunk? It bootstrapped / regression tested cleanly
>> for x86_64 with nvptx offloading.
>>
> 
> These need fixing:
> ...
> === ERROR type #5: trailing whitespace (4 error(s)) ===
> gcc/config/nvptx/nvptx.c:5139:0:██
> gcc/config/nvptx/nvptx.c:5660:8:  do█
> gcc/config/nvptx/nvptx.c:5702:0:██
> gcc/config/nvptx/nvptx.c:5726:0:██
> ...

Sorry. The attached patch fixes that.

> Otherwise, nvptx part LGTM.
Tomorrow's my last day at Mentor, so either Thomas or Julian will need
to commit it once the other patches get approved.

Thanks,
Cesar
	gcc/
	* config/nvptx/nvptx.c (nvptx_propagate_unified): New.
	(nvptx_split_blocks): Call it for cond_uni insn.
	(nvptx_expand_cond_uni): New.
	(enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
	(nvptx_init_builtins): Initialize it.
	(nvptx_expand_builtin): Handle NVPTX_BUILTIN_COND_UNI.
	(nvptx_generate_vector_shuffle): Change integral SHIFT operand to
	tree BITS operand.
	(nvptx_vector_reduction): New.
	(nvptx_adjust_reduction_type): New.
	(nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
	(nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
	Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_teardown):
	* config/nvptx/nvptx.md (cond_uni): New pattern.

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 9903a273863..acb490a9a90 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt ()
 }
 }
 
+/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
+   mark that as unified.  We expect to be in a single block.  */
+
+static void
+nvptx_propagate_unified (rtx_insn *unified)
+{
+  rtx_insn *probe = unified;
+  rtx cond_reg = SET_DEST (PATTERN (unified));
+  rtx pat = NULL_RTX;
+
+  /* Find the comparison.  (We could skip this and simply scan to the
+ block's terminating branch, if we didn't care for self
+ checking.)  */
+  for (;;)
+{
+  probe = next_real_insn (probe);
+  if (!probe)
+	break;
+  pat = PATTERN (probe);
+
+  if (GET_CODE (pat) == SET
+	  && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
+	  && XEXP (SET_SRC (pat), 0) == cond_reg)
+	break;
+  gcc_assert (NONJUMP_INSN_P (probe));
+}
+  gcc_assert (pat);
+  rtx pred_reg = SET_DEST (pat);
+
+  /* Find the branch.  */
+  do
+probe = NEXT_INSN (probe);
+  while (!JUMP_P (probe));
+
+  pat = PATTERN (probe);
+  rtx itec = XEXP (SET_SRC (pat), 0);
+  gcc_assert (XEXP (itec, 0) == pred_reg);
+
+  /* Mark the branch's condition as unified.  */
+  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
+			   UNSPEC_BR_UNIFIED);
+  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
+
+  gcc_assert (ok);
+}
+
 /* Loop structure of the function.  The entire function is described as
a NULL loop.  */
 
@@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
 	continue;
 	  switch (recog_memoized (insn))
 	{
+	case CODE_FOR_cond_uni:
+	  nvptx_propagate_unified (insn);
+	  /* FALLTHROUGH */
 	default:
 	  seen_insn = true;
 	  continue;
@@ -5083,6 +5132,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target,
   return target;
 }
 
+/* Expander for the compare unified builtin.  */
+
+static rtx
+nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
+{
+  if (ignore)
+return target;
+
+  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
+			 NULL_RTX, mode, EXPAND_NORMAL);
+
+  emit_insn (gen_cond_uni (target, src));
+
+  return target;
+}
 
 /* Codes for all the NVPTX builtins.  */
 enum nvptx_builtins
@@ -5092,6 +5156,7 @@ enum nvptx_builtins
   NVPTX_BUILTIN_WORKER_ADDR,
   NVPTX_BUILTIN_CMP_SWAP,
   NVPTX_BUILTIN_CMP_SWAPLL,
+  NVPTX_BUILTIN_COND_UNI,
   NVPTX_BUILTIN_MAX
 };
 
@@ -5129,6 +5194,7 @@ nvptx_init_builtins (void)
(PTRVOID, ST, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
+  DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE));
 
 #undef DEF
 #undef ST
@@ -5161,6 +5227,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 case NVPTX_BUILTIN_CMP_SWAPLL:
   return nvptx_expand_cmp_swap (exp, target, mode, ignore);
 
+case NVPTX_BUILTIN_COND_UNI:
+  return nvptx_expand_cond_uni (exp, target, mode, ignore);
+
 default: gcc_unreachable ();
 }
 }
@@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset)
 
 static void
 nvptx_generate_vector_shuffle (location_t loc,
-			   tree 

[PATCH 4/4] [og8] Attach / Detach compiler tests

2018-10-30 Thread Cesar Philippidis
This patch introduces a couple of compiler tests for the OpenACC
attach and detach clauses.

I've committed it to openacc-gcc-8-branch.

Cesar
2018-10-30  Cesar Philippidis  

	gcc/testsuite/
	* c-c++-common/goacc/mdc-1.c: New test.
	* c-c++-common/goacc/mdc-2.c: New test.
	* g++.dg/goacc/mdc.C: New test.
---
 gcc/testsuite/c-c++-common/goacc/mdc-1.c | 54 +++
 gcc/testsuite/c-c++-common/goacc/mdc-2.c | 62 +
 gcc/testsuite/g++.dg/goacc/mdc.C | 68 
 3 files changed, 184 insertions(+)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/mdc-1.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/mdc-2.c
 create mode 100644 gcc/testsuite/g++.dg/goacc/mdc.C

diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-1.c b/gcc/testsuite/c-c++-common/goacc/mdc-1.c
new file mode 100644
index 000..c20b94ddbdc
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/mdc-1.c
@@ -0,0 +1,54 @@
+/* Test OpenACC's support for manual deep copy, including the attach
+   and detach clauses.  */
+
+/* { dg-additional-options "-fdump-tree-omplower" } */
+
+void
+t1 ()
+{
+  struct foo {
+int *a, *b, c, d, *e;
+  } s;
+
+  int *a, *z;
+
+#pragma acc enter data copyin(s)
+  {
+#pragma acc data copy(s.a[0:10]) copy(z[0:10])
+{
+  s.e = z;
+#pragma acc parallel loop attach(s.e)
+  for (int i = 0; i < 10; i++)
+s.a[i] = s.e[i];
+
+
+  a = s.e;
+#pragma acc enter data attach(a)
+#pragma acc exit data detach(a)
+}
+
+#pragma acc enter data copyin(a)
+#pragma acc acc enter data attach(s.e)
+#pragma acc exit data detach(s.e)
+
+#pragma acc data attach(s.e)
+{
+}
+#pragma acc exit data delete(a)
+
+#pragma acc exit data detach(a) finalize
+#pragma acc exit data detach(s.a) finalize
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:s .len: 32.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.tofrom:.z .len: 40.. map.struct:s .len: 1.. map.alloc:s.a .len: 8.. map.tofrom:._1 .len: 40.. map.always_pointer:s.a .pointer assign, bias: 0.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_parallel map.struct:s .len: 1.. map.attach:s.e .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.attach:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:s.e .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.struct:s .len: 1.. map.attach:s.e .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.release:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:s.a .len: 8.." 1 "omplower" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-2.c b/gcc/testsuite/c-c++-common/goacc/mdc-2.c
new file mode 100644
index 000..ebfb99d4caf
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/mdc-2.c
@@ -0,0 +1,62 @@
+/* Test OpenACC's support for manual deep copy, including the attach
+   and detach clauses.  */
+
+void
+t1 ()
+{
+  struct foo {
+int *a, *b, c, d, *e;
+  } s;
+
+  int *a, *z, scalar, **y;
+
+#pragma acc enter data copyin(s) detach(z) /* { dg-error ".detach. is not valid for" } */
+  {
+#pragma acc data copy(s.a[0:10]) copy(z[0:10])
+{
+  s.e = z;
+#pragma acc parallel loop attach(s.e) detach(s.b) /* { dg-error ".detach. is not valid for" } */
+  for (int i = 0; i < 10; i++)
+s.a[i] = s.e[i];
+
+  a = s.e;
+#pragma acc enter data attach(a) detach(s.c) /* { dg-error ".detach. is not valid for" } */
+#pragma acc exit data detach(a)
+}
+
+#pragma acc enter data attach(z[:5]) /* { dg-error "array section in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(z[:5]) /* { dg-error "array section in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(z[1:]) /* { dg-error "array section in .attach. clause" } */
+/* { dg-error "

[PATCH 3/4] [og8] Attach / Detach C++ FE changes

2018-10-30 Thread Cesar Philippidis
As noted here <https://gcc.gnu.org/ml/gcc-patches/2018-10/msg01643.html>
this patch adds support for attach and detach in the C++ front end.
Unlike trunk, OG8 has some preliminary support for the 'this' pointer.
Consequently, finish_omp_clauses had to take care of a couple more cases
in order to get libgomp.oacc-c++/this.C to work.

I've committed this patch to openacc-gcc-8-branch.

Cesar
2018-10-30  Cesar Philippidis  

	gcc/cp/
	* parser.c (cp_parser_omp_clause_name): Scan for attach and detach.
	(cp_parser_oacc_data_clause): Handle PRAGMA_OACC_CLAUSE_{ATTACH,
	DETACH}.
	(cp_parser_oacc_all_clauses): Likewise.
	(OACC_DATA_CLAUSE_MASK): Add support for attach and detach.
	(OACC_ENTER_DATA_CLAUSE_MASK): Likewise.
	(cp_parser_oacc_declare): Likewise.
	(OACC_KERNELS_CLAUSE_MASK): Likewise.
	(OACC_PARALLEL_CLAUSE_MASK): Likewise.
	* semantics.c (handle_omp_array_sections_1): Reject subarrays for
	attach and detach.
	(cp_oacc_check_attachments): New function.
	(finish_omp_clauses): Use it. Also, allow structure fields and
	class members to appear in OpenACC data clauses.
---
 gcc/cp/parser.c| 28 +-
 gcc/cp/semantics.c | 71 +-
 2 files changed, 91 insertions(+), 8 deletions(-)

diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 9a8ec70bb17..8161d6301df 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -31266,6 +31266,8 @@ cp_parser_omp_clause_name (cp_parser *parser, bool consume_token = true)
 	result = PRAGMA_OMP_CLAUSE_ALIGNED;
 	  else if (!strcmp ("async", p))
 	result = PRAGMA_OACC_CLAUSE_ASYNC;
+	  else if (!strcmp ("attach", p))
+	result = PRAGMA_OACC_CLAUSE_ATTACH;
 	  break;
 	case 'b':
 	  if (!strcmp ("bind", p))
@@ -31290,6 +31292,8 @@ cp_parser_omp_clause_name (cp_parser *parser, bool consume_token = true)
 	result = PRAGMA_OMP_CLAUSE_DEFAULTMAP;
 	  else if (!strcmp ("depend", p))
 	result = PRAGMA_OMP_CLAUSE_DEPEND;
+	  else if (!strcmp ("detach", p))
+	result = PRAGMA_OACC_CLAUSE_DETACH;
 	  else if (!strcmp ("device", p))
 	result = PRAGMA_OMP_CLAUSE_DEVICE;
 	  else if (!strcmp ("deviceptr", p))
@@ -31679,11 +31683,13 @@ cp_parser_omp_var_list (cp_parser *parser, enum omp_clause_code kind, tree list)
 }
 
 /* OpenACC 2.5:
+   attach ( variable-list )
copy ( variable-list )
copyin ( variable-list )
copyout ( variable-list )
create ( variable-list )
delete ( variable-list )
+   detach ( variable-list )
present ( variable-list ) */
 
 static tree
@@ -31693,6 +31699,9 @@ cp_parser_oacc_data_clause (cp_parser *parser, pragma_omp_clause c_kind,
   enum gomp_map_kind kind;
   switch (c_kind)
 {
+case PRAGMA_OACC_CLAUSE_ATTACH:
+  kind = GOMP_MAP_ATTACH;
+  break;
 case PRAGMA_OACC_CLAUSE_COPY:
   kind = GOMP_MAP_TOFROM;
   break;
@@ -31708,6 +31717,9 @@ cp_parser_oacc_data_clause (cp_parser *parser, pragma_omp_clause c_kind,
 case PRAGMA_OACC_CLAUSE_DELETE:
   kind = GOMP_MAP_RELEASE;
   break;
+case PRAGMA_OACC_CLAUSE_DETACH:
+  kind = GOMP_MAP_DETACH;
+  break;
 case PRAGMA_OACC_CLAUSE_DEVICE:
   kind = GOMP_MAP_FORCE_TO;
   break;
@@ -33851,6 +33863,10 @@ cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask,
 		 clauses, here);
 	  c_name = "auto";
 	  break;
+	case PRAGMA_OACC_CLAUSE_ATTACH:
+	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "attach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_BIND:
 	  clauses = cp_parser_oacc_clause_bind (parser, clauses);
 	  c_name = "bind";
@@ -33883,6 +33899,10 @@ cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask,
 	  clauses = cp_parser_omp_clause_default (parser, clauses, here, true);
 	  c_name = "default";
 	  break;
+	case PRAGMA_OACC_CLAUSE_DETACH:
+	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "detach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_DEVICE:
 	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "device";
@@ -36904,10 +36924,12 @@ cp_parser_oacc_cache (cp_parser *parser, cp_token *pragma_tok)
  structured-block  */
 
 #define OACC_DATA_CLAUSE_MASK		\
-	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DETACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT) )
@@ -37107,6 +37129,7 @@ cp_parser_oacc_dec

[PATCH 2/4] [og8] Attach / Detach C FE changes

2018-10-30 Thread Cesar Philippidis
As noted here
<https://gcc.gnu.org/ml/gcc-patches/2018-10/msg01642.html>, this patch
adds support for attach and detach in the C front end. The only major
difference between this and the trunk patch is that OG8 supports the acc
routine bind clause, so the trunk patch didn't apply cleanly. Other than
that, these patches are identical.

I've committed this patch to openacc-gcc-8-branch.

Cesar
2018-10-30  Cesar Philippidis  

	gcc/c/
	* c-parser.c (c_parser_omp_clause_name): Scan for attach and detach.
	(c_parser_oacc_data_clause): Handle PRAGMA_OACC_CLAUSE_{ATTACH,
	DETACH}.
	(c_parser_oacc_all_clauses): Likewise.
	(OACC_DATA_CLAUSE_MASK): Add support for attach and detach.
	(OACC_ENTER_DATA_CLAUSE_MASK): Likewise.
	(OACC_KERNELS_CLAUSE_MASK): Likewise.
	(OACC_PARALLEL_CLAUSE_MASK): Likewise.
	* c-typeck.c (handle_omp_array_sections_1): Reject subarrays for
	attach and detach.
	(c_oacc_check_attachments): New function.
	(c_finish_omp_clauses): Use it.
---
 gcc/c/c-parser.c | 27 +++-
 gcc/c/c-typeck.c | 55 +---
 2 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 578c0660c54..ffc5fe9b0d3 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -11226,6 +11226,8 @@ c_parser_omp_clause_name (c_parser *parser, bool consume_token = true)
 	result = PRAGMA_OMP_CLAUSE_ALIGNED;
 	  else if (!strcmp ("async", p))
 	result = PRAGMA_OACC_CLAUSE_ASYNC;
+	  else if (!strcmp ("attach", p))
+	result = PRAGMA_OACC_CLAUSE_ATTACH;
 	  break;
 	case 'b':
 	  if (!strcmp ("bind", p))
@@ -11252,6 +11254,8 @@ c_parser_omp_clause_name (c_parser *parser, bool consume_token = true)
 	result = PRAGMA_OACC_CLAUSE_DELETE;
 	  else if (!strcmp ("depend", p))
 	result = PRAGMA_OMP_CLAUSE_DEPEND;
+	  else if (!strcmp ("detach", p))
+	result = PRAGMA_OACC_CLAUSE_DETACH;
 	  else if (!strcmp ("device", p))
 	result = PRAGMA_OMP_CLAUSE_DEVICE;
 	  else if (!strcmp ("deviceptr", p))
@@ -11675,11 +11679,13 @@ c_parser_omp_var_list_parens (c_parser *parser, enum omp_clause_code kind,
 }
 
 /* OpenACC 2.5:
+   attach (variable-list )
copy ( variable-list )
copyin ( variable-list )
copyout ( variable-list )
create ( variable-list )
delete ( variable-list )
+   detach ( variable-list )
present ( variable-list ) */
 
 static tree
@@ -11689,6 +11695,9 @@ c_parser_oacc_data_clause (c_parser *parser, pragma_omp_clause c_kind,
   enum gomp_map_kind kind;
   switch (c_kind)
 {
+case PRAGMA_OACC_CLAUSE_ATTACH:
+  kind = GOMP_MAP_ATTACH;
+  break;
 case PRAGMA_OACC_CLAUSE_COPY:
   kind = GOMP_MAP_TOFROM;
   break;
@@ -11704,6 +11713,9 @@ c_parser_oacc_data_clause (c_parser *parser, pragma_omp_clause c_kind,
 case PRAGMA_OACC_CLAUSE_DELETE:
   kind = GOMP_MAP_RELEASE;
   break;
+case PRAGMA_OACC_CLAUSE_DETACH:
+  kind = GOMP_MAP_DETACH;
+  break;
 case PRAGMA_OACC_CLAUSE_DEVICE:
   kind = GOMP_MAP_FORCE_TO;
   break;
@@ -14083,6 +14095,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 		 clauses);
 	  c_name = "auto";
 	  break;
+	case PRAGMA_OACC_CLAUSE_ATTACH:
+	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "attach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_BIND:
 	  clauses = c_parser_oacc_clause_bind (parser, clauses);
 	  c_name = "bind";
@@ -14115,6 +14131,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  clauses = c_parser_omp_clause_default (parser, clauses, true);
 	  c_name = "default";
 	  break;
+	case PRAGMA_OACC_CLAUSE_DETACH:
+	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "detach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_DEVICE:
 	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "device";
@@ -14589,7 +14609,8 @@ c_parser_oacc_cache (location_t loc, c_parser *parser)
 */
 
 #define OACC_DATA_CLAUSE_MASK		\
-	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
@@ -14773,6 +14794,7 @@ c_parser_oacc_declare (c_parser *parser)
 #define OACC_ENTER_DATA_CLAUSE_MASK	\
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE

[PATCH 1/4] [og8] Attach / Detach generic infrastructure

2018-10-30 Thread Cesar Philippidis
As mentioned here
<https://gcc.gnu.org/ml/gcc-patches/2018-10/msg01641.html>, this patch
series adds support for the new attach / detach clauses introduced in
OpenACC 2.6 to the C and C++ front ends.

There is one notable difference between this patch and the one I posted
for trunk. This patch tweaks GOMP_MAP_DEEP_COPY because OG8 has a lot of
other map types for acc declare and dynamic arrays. I suspect that
change would be required for trunk too, eventually.

I've committed this patch to openacc-gcc-8-branch.

Cesar
2018-10-30  Cesar Philippidis  

	gcc/
	* gimplify.c (gimplify_adjust_omp_clauses): Filter out
	GOMP_MAP_STRUCT for acc exit data.
	(gimplify_omp_target_update): Promote GOMP_MAP_DETACH
	to GOMP_MAP_FORCE_DETACH when the finalize clause is present.
	* omp-low.c (lower_omp_target): Add support for GOMP_MAP_{ATTACH,
	DETACH, FORCE_DETACH}.
	* tree-pretty-print.c (dump_omp_clause): Likewise.

	gcc/c-family/
	* c-pragma.h (enum pragma_omp_clause): Define
	PRAGMA_OACC_CLAUSE_{ATTACH,DETACH}.

	include/
	* gomp-constants.h (GOMP_MAP_DEEP_COPY): Define.
	(enum gomp_map_kind): Add GOMP_MAP_{ATTACH, DETACH, FORCE_DETACH}.
---
 gcc/c-family/c-pragma.h  |  2 ++
 gcc/gimplify.c   | 12 +---
 gcc/omp-low.c|  3 +++
 gcc/tree-pretty-print.c  |  9 +
 include/gomp-constants.h |  9 +
 5 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h
index 8b392486615..bce915187c1 100644
--- a/gcc/c-family/c-pragma.h
+++ b/gcc/c-family/c-pragma.h
@@ -131,12 +131,14 @@ enum pragma_omp_clause {
 
   /* Clauses for OpenACC.  */
   PRAGMA_OACC_CLAUSE_ASYNC,
+  PRAGMA_OACC_CLAUSE_ATTACH,
   PRAGMA_OACC_CLAUSE_AUTO,
   PRAGMA_OACC_CLAUSE_BIND,
   PRAGMA_OACC_CLAUSE_COPY,
   PRAGMA_OACC_CLAUSE_COPYOUT,
   PRAGMA_OACC_CLAUSE_CREATE,
   PRAGMA_OACC_CLAUSE_DELETE,
+  PRAGMA_OACC_CLAUSE_DETACH,
   PRAGMA_OACC_CLAUSE_DEVICEPTR,
   PRAGMA_OACC_CLAUSE_DEVICE_RESIDENT,
   PRAGMA_OACC_CLAUSE_DEVICE_TYPE,
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index fda0d69caf7..9be0b70fc7f 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -9468,7 +9468,8 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p,
 		}
 	}
 	  else if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_STRUCT
-		   && code == OMP_TARGET_EXIT_DATA)
+		   && (code == OMP_TARGET_EXIT_DATA
+		   || code == OACC_EXIT_DATA))
 	remove = true;
 	  else if (DECL_SIZE (decl)
 		   && TREE_CODE (DECL_SIZE (decl)) != INTEGER_CST
@@ -11156,8 +11157,9 @@ gimplify_omp_target_update (tree *expr_p, gimple_seq *pre_p)
 	   && omp_find_clause (OMP_STANDALONE_CLAUSES (expr),
 			   OMP_CLAUSE_FINALIZE))
 {
-  /* Use GOMP_MAP_DELETE/GOMP_MAP_FORCE_FROM to denote that "finalize"
-	 semantics apply to all mappings of this OpenACC directive.  */
+  /* Use GOMP_MAP_DELETE, GOMP_MAP_FORCE_DETACH, and
+	 GOMP_MAP_FORCE_FROM to denote that "finalize" semantics apply
+	 to all mappings of this OpenACC directive.  */
   bool finalize_marked = false;
   for (tree c = OMP_STANDALONE_CLAUSES (expr); c; c = OMP_CLAUSE_CHAIN (c))
 	if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP)
@@ -11171,6 +11173,10 @@ gimplify_omp_target_update (tree *expr_p, gimple_seq *pre_p)
 	  OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_DELETE);
 	  finalize_marked = true;
 	  break;
+	case GOMP_MAP_DETACH:
+	  OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_DETACH);
+	  finalize_marked = true;
+	  break;
 	default:
 	  /* Check consistency: libgomp relies on the very first data
 		 mapping clause being marked, so make sure we did that before
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index a219b825488..e559211f413 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -8185,6 +8185,9 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 	  case GOMP_MAP_DYNAMIC_ARRAY_FORCE_ALLOC:
 	  case GOMP_MAP_DYNAMIC_ARRAY_FORCE_PRESENT:
 	  case GOMP_MAP_LINK:
+	  case GOMP_MAP_ATTACH:
+	  case GOMP_MAP_DETACH:
+	  case GOMP_MAP_FORCE_DETACH:
 	gcc_assert (is_gimple_omp_oacc (stmt));
 	break;
 	  default:
diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c
index 05a163d8956..ecbb51646b0 100644
--- a/gcc/tree-pretty-print.c
+++ b/gcc/tree-pretty-print.c
@@ -778,6 +778,15 @@ dump_omp_clause (pretty_printer *pp, tree clause, int spc, dump_flags_t flags)
 	case GOMP_MAP_DECLARE_DEALLOCATE:
 	  pp_string (pp, "declare_deallocate");
 	  break;
+	case GOMP_MAP_ATTACH:
+	  pp_string (pp, "attach");
+	  break;
+	case GOMP_MAP_DETACH:
+	  pp_string (pp, "detach");
+	  break;
+	case GOMP_MAP_FORCE_DETACH:
+	  pp_string (pp, "force_detach");
+	  break;
 	default:
 	  gcc_unreachable ();
 	}
diff --git a/include/gomp-constants.h b/include/gomp-constants.h
index 9ef51c04994..c6cd48805e0 100644
--- a/include/gomp-constants.h
+++ b/include/gomp-const

Re: [nvptx] vector length patch series

2018-10-29 Thread Cesar Philippidis
On 10/5/18 23:22, Tom de Vries wrote:
> On 9/18/18 10:04 PM, Cesar Philippidis wrote:
>> 591973d3c3a [nvptx] use user-defined vectors when possible
> 
> If I drop this patch, I get the same test results. Can you find a
> testcase for which this patch has an effect?

I just revisited the vector length patch series, and that patch in
particular is bogus and can be safely dropped.

From what I can remember, the intent behind that patch is to allow the
user to override the default vector length using GOMP_OPENACC_DIM. E.g.,

  #pragma acc parallel loop vector
  for (...)

Here, the nvptx BE defaults to vector length = 32. But I had originally
wanted to allow something like

  GOMP_OPENACC_DIM="1:1:128" ./a.out

to use a vector length of 128 in that parallel region. After looking at the
rest of the patch series, that's not possible because the nvptx BE
hard-codes the vector length to 128 at compile time. This was done
because large, multi-warp vector reductions are slow (O(n) vs O(ln n)).

Is this patch series OK without that patch? And if so, because that
patch series depends on other patches, can the following patches be
committed independently?

91e5c13b462 [nvptx] Generalize state propagation and synchronization
cb4b27a93e0 [nvptx] Use MAX, MIN, ROUND_UP macros
0af782ae93c [nvptx] Use TARGET_SET_CURRENT_FUNCTION
87cfb384dbe [nvptx] Add axis_dim
d1783939d98 [nvptx] Add thread count parm to bar.sync
47e80fa77a5 [nvptx] only use one bar.sync barriers in OpenACC offloaded
code
dafc9957ee7 [nvptx] Fix whitespace in nvptx_single and nvptx_neuter_pars
a4857b94879 [nvptx] make nvptx state propagation function names more
generic
b4b85f6e0b5 [nvptx] consolidate offloaded function attributes into
struct offload_attrs
bcdb1e8afac [nvptx] Rename worker_bcast variables oacc_bcast.
34958a0904d [nvptx] update openacc dim macros

These patches just refactor code in the nvptx BE.

Thanks,
Cesar


nvptx-vl.tar.gz
Description: application/gzip


[PATCH 4/4] [OpenACC] Attach / Detach compiler tests

2018-10-25 Thread Cesar Philippidis
This patch introduces a couple of compiler tests for the OpenACC
attach and detach clauses.

Is this OK for trunk after the other patches get approved?

Thanks,
Cesar
2018-XX-YY  Cesar Philippidis  

	gcc/testsuite/
	* c-c++-common/goacc/mdc-1.c: New test.
	* c-c++-common/goacc/mdc-2.c: New test.
	* g++.dg/goacc/mdc.C: New test.


diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-1.c b/gcc/testsuite/c-c++-common/goacc/mdc-1.c
new file mode 100644
index 000..c20b94ddbdc
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/mdc-1.c
@@ -0,0 +1,54 @@
+/* Test OpenACC's support for manual deep copy, including the attach
+   and detach clauses.  */
+
+/* { dg-additional-options "-fdump-tree-omplower" } */
+
+void
+t1 ()
+{
+  struct foo {
+int *a, *b, c, d, *e;
+  } s;
+
+  int *a, *z;
+
+#pragma acc enter data copyin(s)
+  {
+#pragma acc data copy(s.a[0:10]) copy(z[0:10])
+{
+  s.e = z;
+#pragma acc parallel loop attach(s.e)
+  for (int i = 0; i < 10; i++)
+s.a[i] = s.e[i];
+
+
+  a = s.e;
+#pragma acc enter data attach(a)
+#pragma acc exit data detach(a)
+}
+
+#pragma acc enter data copyin(a)
+#pragma acc acc enter data attach(s.e)
+#pragma acc exit data detach(s.e)
+
+#pragma acc data attach(s.e)
+{
+}
+#pragma acc exit data delete(a)
+
+#pragma acc exit data detach(a) finalize
+#pragma acc exit data detach(s.a) finalize
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:s .len: 32.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.tofrom:.z .len: 40.. map.struct:s .len: 1.. map.alloc:s.a .len: 8.. map.tofrom:._1 .len: 40.. map.always_pointer:s.a .pointer assign, bias: 0.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_parallel map.struct:s .len: 1.. map.attach:s.e .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.attach:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:s.e .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.struct:s .len: 1.. map.attach:s.e .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.release:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:s.a .len: 8.." 1 "omplower" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-2.c b/gcc/testsuite/c-c++-common/goacc/mdc-2.c
new file mode 100644
index 000..ebfb99d4caf
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/mdc-2.c
@@ -0,0 +1,62 @@
+/* Test OpenACC's support for manual deep copy, including the attach
+   and detach clauses.  */
+
+void
+t1 ()
+{
+  struct foo {
+int *a, *b, c, d, *e;
+  } s;
+
+  int *a, *z, scalar, **y;
+
+#pragma acc enter data copyin(s) detach(z) /* { dg-error ".detach. is not valid for" } */
+  {
+#pragma acc data copy(s.a[0:10]) copy(z[0:10])
+{
+  s.e = z;
+#pragma acc parallel loop attach(s.e) detach(s.b) /* { dg-error ".detach. is not valid for" } */
+  for (int i = 0; i < 10; i++)
+s.a[i] = s.e[i];
+
+  a = s.e;
+#pragma acc enter data attach(a) detach(s.c) /* { dg-error ".detach. is not valid for" } */
+#pragma acc exit data detach(a)
+}
+
+#pragma acc enter data attach(z[:5]) /* { dg-error "array section in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(z[:5]) /* { dg-error "array section in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(z[1:]) /* { dg-error "array section in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(z[1:]) /* { dg-error "array section in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(z[:]) /* { dg-error "array section in .attach. clause" } */
+/* { dg-error "h

[PATCH 3/4] [OpenACC] Attach / Detach C++ FE changes

2018-10-25 Thread Cesar Philippidis
This patch adds support for attach and detach in the C++ front end. All
of the comments for the C FE patch apply here. Arguably, there's not a
significant difference between cp_oacc_check_attachments and its C
counterpart. However, I decided to keep them separate in case the
standard gets updated in the future to support more complicated C++
functionality.

Is this patch OK for trunk? I bootstrapped and regression tested it
for x86_64 Linux with nvptx offloading.

Thanks,
Cesar
2018-XX-YY  Cesar Philippidis  

	gcc/cp/
	* parser.c (cp_parser_omp_clause_name): Scan for attach and detach.
	(cp_parser_oacc_data_clause): Handle PRAGMA_OACC_CLAUSE_{ATTACH,
	DETACH}.
	(cp_parser_oacc_all_clauses): Likewise.
	(OACC_DATA_CLAUSE_MASK): Add support for attach and detach.
	(OACC_ENTER_DATA_CLAUSE_MASK): Likewise.
	(cp_parser_oacc_declare): Likewise.
	(OACC_KERNELS_CLAUSE_MASK): Likewise.
	(OACC_PARALLEL_CLAUSE_MASK): Likewise.
	* semantics.c (handle_omp_array_sections_1): Reject subarrays for
	attach and detach.
	(cp_oacc_check_attachments): New function.
	(finish_omp_clauses): Use it. Also, allow structure fields and
	class members to appear in OpenACC data clauses.


diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 2533871fb28..a1b6244483b 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -31381,6 +31381,8 @@ cp_parser_omp_clause_name (cp_parser *parser)
 	result = PRAGMA_OMP_CLAUSE_ALIGNED;
 	  else if (!strcmp ("async", p))
 	result = PRAGMA_OACC_CLAUSE_ASYNC;
+	  else if (!strcmp ("attach", p))
+	result = PRAGMA_OACC_CLAUSE_ATTACH;
 	  break;
 	case 'c':
 	  if (!strcmp ("collapse", p))
@@ -31401,6 +31403,8 @@ cp_parser_omp_clause_name (cp_parser *parser)
 	result = PRAGMA_OMP_CLAUSE_DEFAULTMAP;
 	  else if (!strcmp ("depend", p))
 	result = PRAGMA_OMP_CLAUSE_DEPEND;
+	  else if (!strcmp ("detach", p))
+	result = PRAGMA_OACC_CLAUSE_DETACH;
 	  else if (!strcmp ("device", p))
 	result = PRAGMA_OMP_CLAUSE_DEVICE;
 	  else if (!strcmp ("deviceptr", p))
@@ -31767,11 +31771,13 @@ cp_parser_omp_var_list (cp_parser *parser, enum omp_clause_code kind, tree list)
 }
 
 /* OpenACC 2.0:
+   attach ( variable-list )
copy ( variable-list )
copyin ( variable-list )
copyout ( variable-list )
create ( variable-list )
delete ( variable-list )
+   detach ( variable-list )
present ( variable-list ) */
 
 static tree
@@ -31781,6 +31787,9 @@ cp_parser_oacc_data_clause (cp_parser *parser, pragma_omp_clause c_kind,
   enum gomp_map_kind kind;
   switch (c_kind)
 {
+case PRAGMA_OACC_CLAUSE_ATTACH:
+  kind = GOMP_MAP_ATTACH;
+  break;
 case PRAGMA_OACC_CLAUSE_COPY:
   kind = GOMP_MAP_TOFROM;
   break;
@@ -31796,6 +31805,9 @@ cp_parser_oacc_data_clause (cp_parser *parser, pragma_omp_clause c_kind,
 case PRAGMA_OACC_CLAUSE_DELETE:
   kind = GOMP_MAP_RELEASE;
   break;
+case PRAGMA_OACC_CLAUSE_DETACH:
+  kind = GOMP_MAP_DETACH;
+  break;
 case PRAGMA_OACC_CLAUSE_DEVICE:
   kind = GOMP_MAP_FORCE_TO;
   break;
@@ -33776,6 +33788,10 @@ cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask,
 		 clauses, here);
 	  c_name = "auto";
 	  break;
+	case PRAGMA_OACC_CLAUSE_ATTACH:
+	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "attach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_COLLAPSE:
 	  clauses = cp_parser_omp_clause_collapse (parser, clauses, here);
 	  c_name = "collapse";
@@ -33804,6 +33820,10 @@ cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask,
 	  clauses = cp_parser_omp_clause_default (parser, clauses, here, true);
 	  c_name = "default";
 	  break;
+	case PRAGMA_OACC_CLAUSE_DETACH:
+	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "detach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_DEVICE:
 	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "device";
@@ -36809,10 +36829,12 @@ cp_parser_oacc_cache (cp_parser *parser, cp_token *pragma_tok)
  structured-block  */
 
 #define OACC_DATA_CLAUSE_MASK		\
-	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DETACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT) )
@@ -37012,6 +37034,7 @@ cp_parser_oacc_declare (cp_parser *parser, cp_token *pragma_tok)
 
 #define OACC_ENTER_DATA_CLAUSE_MASK	\
 	( (OMP_CLAUSE_MASK_1 &l

[PATCH 2/4] [OpenACC] Attach / Detach C FE changes

2018-10-25 Thread Cesar Philippidis
This patch adds support for attach and detach in the C front end. Both
attach and detach are a little different from the other data clauses
because they require variables that are pointers. Consequently, this
patch teaches handle_omp_array_sections_1 to bail out if it detects a
subarray argument for attach or detach. Likewise, c_finish_omp_clauses
calls c_oacc_check_attachments to ensure that the variable is a
pointer.

Is this patch OK for trunk? I bootstrapped and regression tested it
for x86_64 Linux with nvptx offloading.

Thanks,
Cesar
2018-XX-YY  Cesar Philippidis  

	gcc/c/
	* c-parser.c (c_parser_omp_clause_name): Scan for attach and detach.
	(c_parser_oacc_data_clause): Handle PRAGMA_OACC_CLAUSE_{ATTACH,
	DETACH}.
	(c_parser_oacc_all_clauses): Likewise.
	(OACC_DATA_CLAUSE_MASK): Add support for attach and detach.
	(OACC_ENTER_DATA_CLAUSE_MASK): Likewise.
	(OACC_KERNELS_CLAUSE_MASK): Likewise.
	(OACC_PARALLEL_CLAUSE_MASK): Likewise.
	* c-typeck.c (handle_omp_array_sections_1): Reject subarrays for
	attach and detach.
	(c_oacc_check_attachments): New function.
	(c_finish_omp_clauses): Use it.


diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index ee66ce89b5d..749a7f946ce 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -11396,6 +11396,8 @@ c_parser_omp_clause_name (c_parser *parser)
 	result = PRAGMA_OMP_CLAUSE_ALIGNED;
 	  else if (!strcmp ("async", p))
 	result = PRAGMA_OACC_CLAUSE_ASYNC;
+	  else if (!strcmp ("attach", p))
+	result = PRAGMA_OACC_CLAUSE_ATTACH;
 	  break;
 	case 'c':
 	  if (!strcmp ("collapse", p))
@@ -11418,6 +11420,8 @@ c_parser_omp_clause_name (c_parser *parser)
 	result = PRAGMA_OACC_CLAUSE_DELETE;
 	  else if (!strcmp ("depend", p))
 	result = PRAGMA_OMP_CLAUSE_DEPEND;
+	  else if (!strcmp ("detach", p))
+	result = PRAGMA_OACC_CLAUSE_DETACH;
 	  else if (!strcmp ("device", p))
 	result = PRAGMA_OMP_CLAUSE_DEVICE;
 	  else if (!strcmp ("deviceptr", p))
@@ -11818,11 +11822,13 @@ c_parser_omp_var_list_parens (c_parser *parser, enum omp_clause_code kind,
 }
 
 /* OpenACC 2.0:
+   attach (variable-list )
copy ( variable-list )
copyin ( variable-list )
copyout ( variable-list )
create ( variable-list )
delete ( variable-list )
+   detach ( variable-list )
present ( variable-list ) */
 
 static tree
@@ -11832,6 +11838,9 @@ c_parser_oacc_data_clause (c_parser *parser, pragma_omp_clause c_kind,
   enum gomp_map_kind kind;
   switch (c_kind)
 {
+case PRAGMA_OACC_CLAUSE_ATTACH:
+  kind = GOMP_MAP_ATTACH;
+  break;
 case PRAGMA_OACC_CLAUSE_COPY:
   kind = GOMP_MAP_TOFROM;
   break;
@@ -11847,6 +11856,9 @@ c_parser_oacc_data_clause (c_parser *parser, pragma_omp_clause c_kind,
 case PRAGMA_OACC_CLAUSE_DELETE:
   kind = GOMP_MAP_RELEASE;
   break;
+case PRAGMA_OACC_CLAUSE_DETACH:
+  kind = GOMP_MAP_DETACH;
+  break;
 case PRAGMA_OACC_CLAUSE_DEVICE:
   kind = GOMP_MAP_FORCE_TO;
   break;
@@ -14072,6 +14084,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 		clauses);
 	  c_name = "auto";
 	  break;
+	case PRAGMA_OACC_CLAUSE_ATTACH:
+	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "attach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_COLLAPSE:
 	  clauses = c_parser_omp_clause_collapse (parser, clauses);
 	  c_name = "collapse";
@@ -14100,6 +14116,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  clauses = c_parser_omp_clause_default (parser, clauses, true);
 	  c_name = "default";
 	  break;
+	case PRAGMA_OACC_CLAUSE_DETACH:
+	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "detach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_DEVICE:
 	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "device";
@@ -14558,7 +14578,8 @@ c_parser_oacc_cache (location_t loc, c_parser *parser)
 */
 
 #define OACC_DATA_CLAUSE_MASK		\
-	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
@@ -14741,6 +14762,7 @@ c_parser_oacc_declare (c_parser *parser)
 #define OACC_ENTER_DATA_CLAUSE_MASK	\
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
@@ -14750,6 +14772,7 @@ c_parser_oacc_declare (c_parse

[PATCH 1/4] [OpenACC] Attach / Detach generic infrastructure

2018-10-25 Thread Cesar Philippidis
This patch series adds support for the new attach / detach clauses
introduced in OpenACC 2.6 to the C and C++ front ends. Julian is
working on patches for the Fortran front end along with the runtime.

As their names somewhat imply, attach and detach are new data clauses
that are used to support manual deep copy in OpenACC. Specifically,
OpenACC 2.6 allows users to specify individual structure fields inside
data clauses, whereas before that would only work inside the update
directive. The attach and detach clauses allow users to update the
pointers in structure fields with their on-device counterparts.

As an example, consider the following code:

  struct { int *a, b; } s;
  int *z = ...

  #pragma acc enter data copyin(a[:N], s)
  ...
  s.a = z;
  #pragma acc enter data attach(s.a)
  ...
  #pragma acc exit data detach(s.a)
  #pragma acc exit data copyout(s)

Because the attach clause updates field s.a with the device address,
"acc exit data detach" must be used to restore the host pointer
contents before that value is copied back to the host.

This patch in particular adds the generic infrastructure for the attach
and detach clauses. All of the front ends lower the attach and detach
clauses as GOMP_MAP_ATTACH and GOMP_MAP_DETACH data mappings,
respectively. However, if a detachment is finalized, e.g.

  #pragma acc exit data finalize detach(ptr)

the gimplifier will promote it to GOMP_MAP_FORCE_DETACH. Also, this
patch teaches the gimplifier how to ignore GOMP_MAP_STRUCT for the
target update constructs.

Is this patch OK for trunk? I bootstrapped and regression tested it
for x86_64 Linux with nvptx offloading.

Thanks,
Cesar
2018-XX-YY  Cesar Philippidis  

	gcc/
	* gimplify.c (gimplify_adjust_omp_clauses): Filter out
	GOMP_MAP_STRUCT for acc exit data.
	(gimplify_omp_target_update): Promote GOMP_MAP_DETACH
	to GOMP_MAP_FORCE_DETACH when the finalize clause is present.
	* omp-low.c (lower_omp_target): Add support for GOMP_MAP_{ATTACH,
	DETACH, FORCE_DETACH}.
	* tree-pretty-print.c (dump_omp_clause): Likewise.

	gcc/c-family/
	* c-pragma.h (enum pragma_omp_clause): Define
	PRAGMA_OACC_CLAUSE_{ATTACH,DETACH}.

	include/
	* gomp-constants.h (GOMP_MAP_DEEP_COPY): Define.
	(enum gomp_map_kind): Add GOMP_MAP_{ATTACH, DETACH, FORCE_DETACH}.


diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h
index b322547b11a..ab4c03b21f1 100644
--- a/gcc/c-family/c-pragma.h
+++ b/gcc/c-family/c-pragma.h
@@ -131,11 +131,13 @@ enum pragma_omp_clause {
 
   /* Clauses for OpenACC.  */
   PRAGMA_OACC_CLAUSE_ASYNC,
+  PRAGMA_OACC_CLAUSE_ATTACH,
   PRAGMA_OACC_CLAUSE_AUTO,
   PRAGMA_OACC_CLAUSE_COPY,
   PRAGMA_OACC_CLAUSE_COPYOUT,
   PRAGMA_OACC_CLAUSE_CREATE,
   PRAGMA_OACC_CLAUSE_DELETE,
+  PRAGMA_OACC_CLAUSE_DETACH,
   PRAGMA_OACC_CLAUSE_DEVICEPTR,
   PRAGMA_OACC_CLAUSE_DEVICE_RESIDENT,
   PRAGMA_OACC_CLAUSE_FINALIZE,
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 509fc2f3f5b..ead412e3f6f 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -9145,7 +9145,8 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p,
 		}
 	}
 	  else if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_STRUCT
-		   && code == OMP_TARGET_EXIT_DATA)
+		   && (code == OMP_TARGET_EXIT_DATA
+		   || code == OACC_EXIT_DATA))
 	remove = true;
 	  else if (DECL_SIZE (decl)
 		   && TREE_CODE (DECL_SIZE (decl)) != INTEGER_CST
@@ -11001,8 +11002,9 @@ gimplify_omp_target_update (tree *expr_p, gimple_seq *pre_p)
 	   && omp_find_clause (OMP_STANDALONE_CLAUSES (expr),
 			   OMP_CLAUSE_FINALIZE))
 {
-  /* Use GOMP_MAP_DELETE/GOMP_MAP_FORCE_FROM to denote that "finalize"
-	 semantics apply to all mappings of this OpenACC directive.  */
+  /* Use GOMP_MAP_DELETE, GOMP_MAP_FORCE_DETACH, and
+	 GOMP_MAP_FORCE_FROM to denote that "finalize" semantics apply
+	 to all mappings of this OpenACC directive.  */
   bool finalize_marked = false;
   for (tree c = OMP_STANDALONE_CLAUSES (expr); c; c = OMP_CLAUSE_CHAIN (c))
 	if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP)
@@ -11016,6 +11018,10 @@ gimplify_omp_target_update (tree *expr_p, gimple_seq *pre_p)
 	  OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_DELETE);
 	  finalize_marked = true;
 	  break;
+	case GOMP_MAP_DETACH:
+	  OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_DETACH);
+	  finalize_marked = true;
+	  break;
 	default:
 	  /* Check consistency: libgomp relies on the very first data
 		 mapping clause being marked, so make sure we did that before
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index bbcbc121bae..f5ee117887f 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -7581,6 +7581,9 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 	  case GOMP_MAP_FORCE_DEVICEPTR:
 	  case GOMP_MAP_DEVICE_RESIDENT:
 	  case GOMP_MAP_LINK:
+	  case GOMP_MAP_ATTACH:
+	  case GOMP_MAP_DETACH:
+	  case GOMP_MAP_FORCE_DETACH:
 	gcc_assert (is_gimple_omp_oacc (stmt));
 	break;
 	  default:
diff --git a/gcc/t

[OpenACC] initial manual deep copy in c

2018-10-02 Thread Cesar Philippidis
I've pushed the attached patch to my github trunk-acc-mdc branch, which
enables OpenMP 4.5 deep copy semantics in OpenACC data clauses in C. Now
GCC accepts data clauses of the form

  #pragma acc data copy(v.a[:n], v.b)

I think there are a couple of limitations in OpenMP that are going to
force me to introduce a new GOMP_MAP_ACC_STRUCT map kind. Basically,
GOMP_MAP_STRUCT reserves the minimum amount of device storage for the
members actually used in a struct. OpenACC allows the users to
dynamically attach and detach struct members, so GOMP_MAP_ACC_STRUCT
would need to reserve enough memory for the entire struct. This is also
necessary for cases like this

  struct {
int *a, b, *c;
  } v;

  #pragma acc data copy(v.b)
  {
#pragma acc parallel copy(v.a[:n], v.c[:n])
  }

If the acc data directive is replaced with omp target data, and the acc
parallel replaced with omp target something, then the runtime would
crash because struct v has been partially mapped already.

Going forward, OpenACC 2.6 requires the runtime to maintain an
attachment counter to keep track of whether struct fields have been
mapped. So that's another justification for the GOMP_MAP_ACC_STRUCT type.

This is all an early work in progress. I'm still experimenting with some
other functionality. If you checkout that branch, beware it may be rebased.

Cesar
[OpenACC] Initial Manual Deep Copy

2018-10-02  Cesar Philippidis  

	gcc/c/
	* c-typeck.c (handle_omp_array_sections_1): Enable structs in acc
	data clauses.
	(c_finish_omp_clauses): Likewise.

	libgomp/
	* libgomp.h: Declare gomp_map_val.
	* oacc-parallel.c (GOACC_parallel_keyed): Use it to set devaddrs.
	* target.c (gomp_map_val): Remove static inline.
	* testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c: New test.


diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index 9d09b8d65fd..0428f48952a 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -12605,7 +12605,6 @@ handle_omp_array_sections_1 (tree c, tree t, vec ,
 	  return error_mark_node;
 	}
   if (TREE_CODE (t) == COMPONENT_REF
-	  && ort == C_ORT_OMP
 	  && (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
 	  || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_TO
 	  || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FROM))
@@ -13799,7 +13798,6 @@ c_finish_omp_clauses (tree clauses, enum c_omp_region_type ort)
 	  break;
 	}
 	  if (TREE_CODE (t) == COMPONENT_REF
-	  && (ort & C_ORT_OMP)
 	  && OMP_CLAUSE_CODE (c) != OMP_CLAUSE__CACHE_)
 	{
 	  if (DECL_BIT_FIELD (TREE_OPERAND (t, 1)))
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 3a8cc2bd7d6..553d1bb81ba 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -996,6 +996,7 @@ extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
 extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
 extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
    unsigned short *);
+extern uintptr_t gomp_map_val (struct target_mem_desc *, void **, size_t);
 
 extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
 	  size_t, void **, void **,
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index b80ace58590..fd5bbfbdf7d 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -231,8 +231,7 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
 
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
-devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
-			+ tgt->list[i].key->tgt_offset);
+devaddrs[i] = (void *) gomp_map_val (tgt, hostaddrs, i);
 
   acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
 			  async, dims, tgt);
diff --git a/libgomp/target.c b/libgomp/target.c
index dda041cdbef..a87ba7cad0e 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -457,7 +457,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
 	  (void *) cur_node.host_end);
 }
 
-static inline uintptr_t
+uintptr_t
 gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i)
 {
   if (tgt->list[i].key != NULL)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c
new file mode 100644
index 000..d489cc645cd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+struct dc
+{
+  int a;
+  int *b;
+};
+
+int
+main ()
+{
+  int n = 100, i;
+  struct dc v = { .a = 3, .b = (int *) malloc (sizeof (int) * n) };
+
+#pragma omp target teams distribute parallel for map(tofrom:v.a, v.b[:n])
+#pragma acc parallel loop copy(v.a, v.b[:n])
+  for (i = 0; i < n; i++)
+v.b[i] = v.a;
+
+  for (i = 0; i < 10; i++)
+printf ("%d: %d\n", i, v.b[i]);
+
+  return 0;
+}


[patch,openacc] Add warning for unused acc routine parallelism

2018-10-02 Thread Cesar Philippidis
This patch teaches omp-general to be a little more verbose when it comes
to reporting the missing usage of gang, worker, and vector clauses
on acc routines. As before, the Fortran FE does this directly so that it
can handle modules. Therefore, this primarily handles the C and C++ cases
(although certain Fortran routines fall through to this).

Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux
with nvptx offloading. This patch only touches the OpenACC code path.

Thanks,
Cesar
[OpenACC] Add warning for unused acc routine parallelism

(was [OpenACC] Don't error on implicitly private induction variables in gfortran)

2018-XX-YY  Cesar Philippidis  

	gcc/
	* omp-general.c (oacc_verify_routine_clauses): New warning.

	gcc/testsuite/
	* c-c++-common/goacc-gomp/nesting-fail-1.c: Update test.
	* c-c++-common/goacc/Wparentheses-1.c: Likewise.
	* c-c++-common/goacc/builtin-goacc-parlevel-id-size-2.c: Likewise.
	* c-c++-common/goacc/builtin-goacc-parlevel-id-size.c: Likewise.
	* c-c++-common/goacc/nesting-fail-1.c: Likewise.
	* c-c++-common/goacc/routine-1.c: Likewise.
	* c-c++-common/goacc/routine-level-of-parallelism-1.c: Likewise.
	* c-c++-common/goacc/routine-level-of-parallelism-2.c: Likewise.
	* c-c++-common/goacc/routine-nohost-1.c: Likewise.
	* c-c++-common/goacc/routine-nohost-2.c: Likewise.
	* g++.dg/goacc/routine-1.C: Likewise.
	* g++.dg/goacc/routine-2.C: Likewise.
	* gfortran.dg/goacc/pr72741-2.f: Likewise.
	* gfortran.dg/goacc/routine-9.f90: Likewise.
	* gfortran.dg/goacc/routine-without-clauses.f90: New test.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/declare-2.c: Update test.
	* testsuite/libgomp.oacc-c-c++-common/declare-3.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/declare-4.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/host_data-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/mode-transitions.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/parallel-loop-2.h:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-3.c: Likewise.

(cherry picked from gomp-4_0-branch r244980)
---
 gcc/omp-general.c |  6 ++-
 .../c-c++-common/goacc-gomp/nesting-fail-1.c  |  4 +-
 .../c-c++-common/goacc/Wparentheses-1.c   |  4 +-
 .../goacc/builtin-goacc-parlevel-id-size-2.c  |  2 +
 .../goacc/builtin-goacc-parlevel-id-size.c|  2 +
 .../c-c++-common/goacc/nesting-fail-1.c   |  2 +-
 gcc/testsuite/c-c++-common/goacc/routine-1.c  |  4 ++
 .../goacc/routine-level-of-parallelism-1.c|  8 ++--
 .../goacc/routine-level-of-parallelism-2.c| 34 
 .../c-c++-common/goacc/routine-nohost-1.c | 20 +-
 .../c-c++-common/goacc/routine-nohost-2.c | 40 +--
 gcc/testsuite/g++.dg/goacc/routine-1.C|  6 +--
 gcc/testsuite/g++.dg/goacc/routine-2.C| 10 ++---
 gcc/testsuite/gfortran.dg/goacc/pr72741-2.f   |  4 +-
 gcc/testsuite/gfortran.dg/goacc/routine-9.f90 | 22 +-
 .../goacc/routine-without-clauses.f90 | 34 
 .../libgomp.oacc-c-c++-common/declare-2.c |  4 +-
 .../libgomp.oacc-c-c++-common/declare-3.c |  2 +-
 .../libgomp.oacc-c-c++-common/declare-4.c |  2 +-
 .../libgomp.oacc-c-c++-common/host_data-1.c   |  2 +-
 .../loop-dim-default.c|  2 +-
 .../mode-transitions.c|  2 +-
 .../parallel-loop-2.h |  2 +-
 .../libgomp.oacc-c-c++-common/routine-1.c |  2 +-
 .../libgomp.oacc-c-c++-common/routine-3.c |  2 +-
 25 files changed, 132 insertions(+), 90 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-without-clauses.f90

diff --git a/gcc/omp-general.c b/gcc/omp-general.c
index 5c91ce73a50..d290766329f 100644
--- a/gcc/omp-general.c
+++ b/gcc/omp-general.c
@@ -613,8 +613,10 @@ oacc_verify_routine_clauses (tree fndecl, tree *clauses, location_t loc,
   }
   if (c_level == NULL_TREE)
 {
-  /* OpenACC 2.5 makes this an error; for the current OpenACC 2.0a
-	 implementation add an implicit "seq" clause.  */
+  /* OpenACC 2.5 expects the user to supply one parallelism clause.  */
+  warning_at (loc, 0, "expecting one of %<gang%>, %<worker%>, %<vector%> "
+		  "or %<seq%> clauses");
+  inform (loc, "assigning %<seq%> parallelism to this routine");
   c_level = build_omp_clause (loc, OMP_CLAUSE_SEQ);
   OMP_CLAUSE_CHAIN (c_level) = *clauses;
   *clauses = c_level;
diff --git a/gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c b/gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c
index 1a3324200e2..57eaa0296d6 100644
--- a/gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c
+++ b/gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c
@@ -362,7 +362,7 @@ f_acc_data (void)
   }
 }
 
-#pragma acc routine
+#pragma acc routine seq
 void
 f_acc_loop (void)

[patch,openacc] Check for sufficient parallelism when calling acc routines in Fortran

2018-10-02 Thread Cesar Philippidis
This patch updates the Fortran FE OpenACC routine parser to enforce the
new OpenACC 2.5 routine directive semantics. In addition to emitting a
warning when the user doesn't specify a gang, worker or vector clause,
it also clarifies some error messages and introduces a new error when
the user tries to use an acc routine with insufficient parallelism,
e.g., calling a gang routine from a vector loop.

Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
Linux with nvptx offloading.

Thanks,
Cesar
[OpenACC] Check for sufficient parallelism when calling acc routines in fortran

2018-XX-YY  Cesar Philippidis  

	gcc/fortran/
	* gfortran.h (gfc_resolve_oacc_routine_call): Declare.
	(gfc_resolve_oacc_routines): Declare.
	* openmp.c (gfc_match_oacc_routine): Make error reporting more
	precise.  Defer rejection of non-function and subroutine symbols
	until gfc_resolve_oacc_routines.
	(struct fortran_omp_context): Add a dims member.
	(gfc_resolve_oacc_blocks): Update ctx->dims.
	(gfc_resolve_oacc_routine_call): New function.
	(gfc_resolve_oacc_routines): New function.
	* resolve.c (resolve_function): Call gfc_resolve_oacc_routine_call.
	(resolve_call): Likewise.
	(resolve_codes): Call gfc_resolve_oacc_routines.

	gcc/testsuite/
	* gfortran.dg/goacc/routine-10.f90: New test.
	* gfortran.dg/goacc/routine-9.f90: New test.
	* gfortran.dg/goacc/routine-nested-parallelism.f: New test.
	* gfortran.dg/goacc/routine-nested-parallelism.f90: New test.

(cherry picked from gomp-4_0-branch r239784)
(cherry picked from gomp-4_0-branch r247353)

---
 gcc/fortran/gfortran.h|   2 +
 gcc/fortran/openmp.c  | 108 +-
 gcc/fortran/resolve.c |  11 +
 .../gfortran.dg/goacc/routine-10.f90  |   6 +
 gcc/testsuite/gfortran.dg/goacc/routine-9.f90 |  96 +
 .../goacc/routine-nested-parallelism.f| 340 ++
 .../goacc/routine-nested-parallelism.f90  | 340 ++
 7 files changed, 887 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-10.f90
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-9.f90
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-nested-parallelism.f
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-nested-parallelism.f90

diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 781dc2a7d17..87f98bbd110 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -3166,6 +3166,8 @@ void gfc_resolve_oacc_directive (gfc_code *, gfc_namespace *);
 void gfc_resolve_oacc_declare (gfc_namespace *);
 void gfc_resolve_oacc_parallel_loop_blocks (gfc_code *, gfc_namespace *);
 void gfc_resolve_oacc_blocks (gfc_code *, gfc_namespace *);
+void gfc_resolve_oacc_routine_call (gfc_symbol *, locus *); 
+void gfc_resolve_oacc_routines (gfc_namespace *);
 
 /* expr.c */
 void gfc_free_actual_arglist (gfc_actual_arglist *);
diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index 58cbe0ae90c..5850538c1f0 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -2319,7 +2319,13 @@ gfc_match_oacc_routine (void)
 	{
 	  if ((isym = gfc_find_function (buffer)) == NULL
 	  && (isym = gfc_find_subroutine (buffer)) == NULL)
-	st = gfc_find_symtree (gfc_current_ns->sym_root, buffer);
+	{
+	  st = gfc_find_symtree (gfc_current_ns->sym_root, buffer);
+	  if (st == NULL && gfc_current_ns->proc_name->attr.contained
+		  && gfc_current_ns->parent)
+		st = gfc_find_symtree (gfc_current_ns->parent->sym_root,
+   buffer);
+	}
 	  if (st)
 	{
 	  sym = st->n.sym;
@@ -2327,18 +2333,12 @@ gfc_match_oacc_routine (void)
 		  && strcmp (sym->name, gfc_current_ns->proc_name->name) == 0)
 	sym = NULL;
 	}
-
-	  if ((isym == NULL && st == NULL)
-	  || (sym
-		  && !sym->attr.external
-		  && !sym->attr.function
-		  && !sym->attr.subroutine))
+	  else if (isym == NULL)
 	{
-	  gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C, "
-			 "invalid function name %s",
-			 (sym) ? sym->name : buffer);
-	  gfc_current_locus = old_loc;
-	  return MATCH_ERROR;
+	  gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %L, "
+			 "invalid function name %qs", &old_loc, buffer);\
+	  goto cleanup;
+
 	}
 
 	  /* Set sym to NULL if it matches the current procedure's
@@ -2371,20 +2371,27 @@ gfc_match_oacc_routine (void)
   dims = gfc_oacc_routine_dims (c);
   if (dims == OACC_FUNCTION_NONE)
 {
-  gfc_error ("Multiple loop axes specified in !$ACC ROUTINE at %C");
+  gfc_error ("Multiple loop axes specified in !$ACC ROUTINE at %L",
+		 &old_loc);
 
   /* Don't abort early, because it's important to let the user
 	 know of any potential duplicate routine directives

[patch,openacc] Check clauses with intrinsic function specified in !$ACC ROUTINE ( NAME )

2018-10-02 Thread Cesar Philippidis
This patch allows Fortran intrinsic functions to be declared as acc
routines. For instance, abort can now be called from within acc
offloaded regions.

Given that intrinsic functions like sin and cos are important for
offloaded functions, I wonder if there is a better way to accomplish
this. Maybe certain intrinsic functions should default to
having an implied acc routine directive. But I suppose that's something
for another patch.

Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux
with nvptx offloading.

Thanks,
Cesar
[PR fortran/72741] Check clauses with intrinsic function specified in !$ACC ROUTINE ( NAME )

2018-XX-YY  Cesar Philippidis  

	gcc/fortran/
	* openmp.c (gfc_match_oacc_routine): Check clauses of intrinsic
	functions.

	gcc/testsuite/
	* gfortran.dg/goacc/fixed-1.f: Update test.
	* gfortran.dg/goacc/pr72741-2.f: New test.
	* gfortran.dg/goacc/pr72741-intrinsic-1.f: New test.
	* gfortran.dg/goacc/pr72741-intrinsic-2.f: New test.
	* gfortran.dg/goacc/pr72741.f90: Update test.

	libgomp/
	* testsuite/libgomp.oacc-fortran/abort-1.f90: Update test.
	* testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f: Update test.

(cherry picked from gomp-4_0-branch r239422)
(cherry picked from gomp-4_0-branch r239515, and r247954)
---
 gcc/fortran/openmp.c  | 41 +++
 gcc/testsuite/gfortran.dg/goacc/fixed-1.f |  2 +
 gcc/testsuite/gfortran.dg/goacc/pr72741-2.f   | 39 ++
 .../gfortran.dg/goacc/pr72741-intrinsic-1.f   | 16 
 .../gfortran.dg/goacc/pr72741-intrinsic-2.f   | 22 ++
 gcc/testsuite/gfortran.dg/goacc/pr72741.f90   | 20 +++--
 .../libgomp.oacc-fortran/abort-1.f90  |  1 +
 .../libgomp.oacc-fortran/acc_on_device-1-2.f  |  1 +
 8 files changed, 130 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/pr72741-2.f
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-1.f
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-2.f

diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index 60ecaf54523..58cbe0ae90c 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -2288,8 +2288,9 @@ match
 gfc_match_oacc_routine (void)
 {
   locus old_loc;
-  gfc_symbol *sym = NULL;
   match m;
+  gfc_intrinsic_sym *isym = NULL;
+  gfc_symbol *sym = NULL;
   gfc_omp_clauses *c = NULL;
   gfc_oacc_routine_name *n = NULL;
   oacc_function dims;
@@ -2311,12 +2312,14 @@ gfc_match_oacc_routine (void)
   if (m == MATCH_YES)
 {
   char buffer[GFC_MAX_SYMBOL_LEN + 1];
-  gfc_symtree *st;
+  gfc_symtree *st = NULL;
 
   m = gfc_match_name (buffer);
   if (m == MATCH_YES)
 	{
-	  st = gfc_find_symtree (gfc_current_ns->sym_root, buffer);
+	  if ((isym = gfc_find_function (buffer)) == NULL
+	  && (isym = gfc_find_subroutine (buffer)) == NULL)
+	st = gfc_find_symtree (gfc_current_ns->sym_root, buffer);
 	  if (st)
 	{
 	  sym = st->n.sym;
@@ -2325,7 +2328,7 @@ gfc_match_oacc_routine (void)
 	sym = NULL;
 	}
 
-	  if (st == NULL
+	  if ((isym == NULL && st == NULL)
 	  || (sym
 		  && !sym->attr.external
 		  && !sym->attr.function
@@ -2337,6 +2340,13 @@ gfc_match_oacc_routine (void)
 	  gfc_current_locus = old_loc;
 	  return MATCH_ERROR;
 	}
+
+	  /* Set sym to NULL if it matches the current procedure's
+	 name.  This will simplify the check for duplicate ACC
+	 ROUTINE attributes.  */
+	  if (gfc_current_ns->proc_name
+	  && !strcmp (buffer, gfc_current_ns->proc_name->name))
+	sym = NULL;
 	}
   else
 {
@@ -2357,15 +2367,30 @@ gfc_match_oacc_routine (void)
 	  != MATCH_YES))
 return MATCH_ERROR;
 
+  /* Scan for invalid routine geometry.  */
   dims = gfc_oacc_routine_dims (c);
   if (dims == OACC_FUNCTION_NONE)
 {
-  gfc_error ("Multiple loop axes specified for routine %C");
-  gfc_current_locus = old_loc;
-  return MATCH_ERROR;
+  gfc_error ("Multiple loop axes specified in !$ACC ROUTINE at %C");
+
+  /* Don't abort early, because it's important to let the user
+	 know of any potential duplicate routine directives.  */
+  seen_error = true;
 }
 
-  if (sym != NULL)
+  if (isym != NULL)
+{
+  if (c && (c->gang || c->worker || c->vector))
+	{
+	  gfc_error ("Intrinsic symbol specified in !$ACC ROUTINE ( NAME ) "
+		 "at %C, with incompatible clauses specifying the level "
+		 "of parallelism");
+	  goto cleanup;
+	}
+  /* The intrinsic symbol has been marked with a SEQ, or with no clause at
+	 all, which is OK.  */
+}
+  else if (sym != NULL)
 {
   bool needs_entry = true;
 
diff --git a/gcc/testsuite/gfortran.dg/goacc/fixed-1.f b/gcc/testsuite/gfortran.dg/goacc/fixed-1.f
index 974f2702260..3a900c5b4e6 100644
--- a/gcc/testsuite/gfortran.dg/goacc/fixed

[patch,openacc] Repeated use of the OpenACC routine directive

2018-10-02 Thread Cesar Philippidis
This is another patch that teaches the C and C++ front ends to emit more errors
involving acc routine clauses. In retrospect, I should have merged it
together with the patch I posted here
<https://gcc.gnu.org/ml/gcc-patches/2018-10/msg00089.html>, however at
the time I thought it would make the patch too large.

Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
Linux with nvptx offloading. This patch is also self-contained to the
OpenACC code path.

Thanks,
Cesar
[OpenACC] Repeated use of the OpenACC routine directive

2018-XX-YY  Thomas Schwinge  
	    Cesar Philippidis  

	gcc/
	* omp-general.h (oacc_verify_routine_clauses): Declare.
	* omp-general.c (oacc_verify_routine_clauses): Change formal
	parameters.  Add checking if already marked as an accelerator
	routine.  Adjust all users.

	gcc/c/
	* c-parser.c (c_finish_oacc_routine): Rework checking if already
	marked as an accelerator routine.

	gcc/cp/
	* parser.c (cp_finalize_oacc_routine): Rework checking if already
	marked as an accelerator routine.

	gcc/testsuite/
	* c-c++-common/goacc/routine-1.c: Update tests.
	* c-c++-common/goacc/routine-5.c: Likewise.
	* c-c++-common/goacc/routine-level-of-parallelism-1.c: Likewise.
	* c-c++-common/goacc/routine-level-of-parallelism-2.c: New test.
	* c-c++-common/goacc/routine-nohost-1.c: Update tests.
	* c-c++-common/goacc/routine-nohost-2.c: New test.


(cherry picked from gomp-4_0-branch r239521)

remove bind clause support
---
 gcc/c/c-parser.c  |  46 ++--
 gcc/cp/parser.c   |  50 ++--
 gcc/omp-general.c | 105 +++-
 gcc/omp-general.h |   3 +-
 gcc/testsuite/c-c++-common/goacc/routine-1.c  |  10 +-
 gcc/testsuite/c-c++-common/goacc/routine-5.c  |   4 +-
 .../goacc/routine-level-of-parallelism-1.c| 233 --
 .../goacc/routine-level-of-parallelism-2.c|  73 ++
 .../c-c++-common/goacc/routine-nohost-1.c |  20 ++
 .../c-c++-common/goacc/routine-nohost-2.c |  97 
 10 files changed, 566 insertions(+), 75 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/routine-level-of-parallelism-2.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/routine-nohost-2.c

diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 187a2dec999..3d5cbe76acf 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -15090,35 +15090,39 @@ c_finish_oacc_routine (struct oacc_routine_data *data, tree fndecl,
   return;
 }
 
-  oacc_verify_routine_clauses (&data->clauses, data->loc);
-
-  if (oacc_get_fn_attrib (fndecl))
+  int compatible
+= oacc_verify_routine_clauses (fndecl, &data->clauses, data->loc,
+   "#pragma acc routine");
+  if (compatible < 0)
 {
-  error_at (data->loc,
-		"%<#pragma acc routine%> already applied to %qD", fndecl);
   data->error_seen = true;
   return;
 }
-
-  if (TREE_USED (fndecl) || (!is_defn && DECL_SAVED_TREE (fndecl)))
+  if (compatible > 0)
 {
-  error_at (data->loc,
-		TREE_USED (fndecl)
-		? G_("%<#pragma acc routine%> must be applied before use")
-		: G_("%<#pragma acc routine%> must be applied before "
-		 "definition"));
-  data->error_seen = true;
-  return;
 }
+  else
+{
+  if (TREE_USED (fndecl) || (!is_defn && DECL_SAVED_TREE (fndecl)))
+	{
+	  error_at (data->loc,
+		TREE_USED (fndecl)
+		? G_("%<#pragma acc routine%> must be applied before use")
+		: G_("%<#pragma acc routine%> must be applied before"
+			 " definition"));
+	  data->error_seen = true;
+	  return;
+	}
 
-  /* Process the routine's dimension clauses.  */
-  tree dims = oacc_build_routine_dims (data->clauses);
-  oacc_replace_fn_attrib (fndecl, dims);
+  /* Set the routine's level of parallelism.  */
+  tree dims = oacc_build_routine_dims (data->clauses);
+  oacc_replace_fn_attrib (fndecl, dims);
 
-  /* Add an "omp declare target" attribute.  */
-  DECL_ATTRIBUTES (fndecl)
-= tree_cons (get_identifier ("omp declare target"),
-		 data->clauses, DECL_ATTRIBUTES (fndecl));
+  /* Add an "omp declare target" attribute.  */
+  DECL_ATTRIBUTES (fndecl)
+	= tree_cons (get_identifier ("omp declare target"),
+		 data->clauses, DECL_ATTRIBUTES (fndecl));
+}
 
   /* Remember that we've used this "#pragma acc routine".  */
   data->fndecl_seen = true;
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index d56105ca177..0d314d63cfd 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -38260,36 +38260,42 @@ cp_finalize_oacc_routine (cp_parser *parser, tree fndecl, bool is_defn)
 	  return;
 	}
 
-  oacc_verify_routine_clauses (&parser->oacc_routine->clauses,
-   parser->oacc_routine->loc);
-
-  if (oacc_get_fn_attr

[patch,openacc] Add support for OpenACC routine nohost clause

2018-10-02 Thread Cesar Philippidis
Attached is a patch that introduces support for the acc routine nohost
clause. Basically, if an acc routine function is marked as nohost, then
the compiler does not generate code for the host. It's kind of strange
to test for. Basically, we had to use acc_on_device at -O2 so that the
host references to the dead function get optimized away.

I believe that the nohost clause was added for acc routines to allow
offloaded acc code to call vendor libraries, such as cuBLAS, which are
only available for specific accelerators. I haven't seen it used much in
practice though.

Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux
with nvptx offloading.

Thanks
Cesar
[OpenACC] Add support for OpenACC routine nohost clause

(was OpenACC bind, nohost changes)

2018-XX-YY  Thomas Schwinge  
	Cesar Philippidis  

	gcc/
	* tree-core.h (omp_clause_code): Add OMP_CLAUSE_NOHOST.
	* tree.c (omp_clause_num_ops, omp_clause_code_name, walk_tree_1):
	Update for these.
	* tree-pretty-print.c (dump_omp_clause): Handle	OMP_CLAUSE_NOHOST.
	* gimplify.c (gimplify_scan_omp_clauses)
	(gimplify_adjust_omp_clauses): Handle OMP_CLAUSE_NOHOST.
	* tree-nested.c (convert_nonlocal_omp_clauses)
	(convert_local_omp_clauses): Likewise.
	* omp-low.c (scan_sharing_clauses): Likewise.
	* omp-offload.c (maybe_discard_oacc_function): New function.
	(execute_oacc_device_lower) [!ACCEL_COMPILER]: Handle OpenACC
	nohost clauses.

	gcc/c-family/
	* c-attribs.c (c_common_attribute_table): Set min_len to -1 for
	"omp declare target".
	* c-pragma.h (pragma_omp_clause): Add PRAGMA_OACC_CLAUSE_NOHOST.

	gcc/c/
	* c-parser.c (c_parser_omp_clause_name): Handle "nohost".
	(c_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_NOHOST.
	(c_parser_oacc_routine, c_finish_oacc_routine): Update.
	* c-typeck.c (c_finish_omp_clauses): Handle OMP_CLAUSE_NOHOST.

	gcc/cp/
	* parser.c (cp_parser_omp_clause_name): Handle "nohost".
	(cp_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_NOHOST.
	(cp_parser_oacc_routine, cp_finalize_oacc_routine): Update.
	* pt.c (tsubst_omp_clauses): Handle OMP_CLAUSE_NOHOST.
	* semantics.c (finish_omp_clauses): Handle OMP_CLAUSE_NOHOST.

	gcc/fortran/
	* gfortran.h (gfc_omp_clauses): Add nohost members.
	* openmp.c (omp_mask2): Add OMP_CLAUSE_NOHOST.
	(gfc_match_omp_clauses): Handle OMP_CLAUSE_NOHOST.
	(gfc_match_oacc_routine): Set oacc_function_nohost when appropriate.
	* gfortran.h (symbol_attribute): Add oacc_function_nohost member.
	* trans-openmp.c (gfc_add_omp_offload_attributes): Use it to decide
	whether to generate an OMP_CLAUSE_NOHOST clause.
	(gfc_trans_omp_clauses_1): Unreachable code to generate an
	OMP_CLAUSE_NOHOST clause.

	gcc/testsuite/
	* c-c++-common/goacc/classify-routine.c: Adjust test.
	* c-c++-common/goacc/routine-1.c: Likewise.
	* c-c++-common/goacc/routine-2.c: Likewise.
	* c-c++-common/goacc/routine-nohost-1.c: New test.
	* g++.dg/goacc/routine-2.C: Adjust test.
	* gfortran.dg/goacc/pr72741.f90: New test.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/routine-3.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/routine-nohost-1.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/routine-bind-nohost-1.c:
	Update test.
	* testsuite/libgomp.oacc-fortran/routine-8.f90: Likewise.

(cherry picked from gomp-4_0-branch r223007, r226192, r226259, r228915,
r228916, and r231423)
(cherry picked from gomp-4_0-branch r231973 and r231979)
(cherry picked from gomp-4_0-branch r238847)
---
 gcc/c-family/c-attribs.c  |  2 +-
 gcc/c-family/c-pragma.h   |  1 +
 gcc/c/c-parser.c  | 12 +-
 gcc/c/c-typeck.c  |  1 +
 gcc/cp/parser.c   | 13 +--
 gcc/cp/pt.c   |  1 +
 gcc/cp/semantics.c|  1 +
 gcc/fortran/gfortran.h|  3 +-
 gcc/fortran/openmp.c  | 29 +++---
 gcc/fortran/trans-openmp.c| 15 +++-
 gcc/gimplify.c|  2 +
 gcc/lto/lto.c |  1 +
 gcc/omp-low.c |  2 +
 gcc/omp-offload.c | 38 ---
 .../c-c++-common/goacc/classify-routine.c |  4 +-
 gcc/testsuite/c-c++-common/goacc/routine-1.c  |  8 
 gcc/testsuite/c-c++-common/goacc/routine-2.c  |  8 ++--
 .../c-c++-common/goacc/routine-nohost-1.c | 28 ++
 gcc/testsuite/g++.dg/goacc/routine-2.C|  9 +
 gcc/testsuite/gfortran.dg/goacc/pr72741.f90   | 30 +++
 gcc/tree-core.h   |  3 ++
 gcc/tree-nested.c |  4 ++
 gcc/tree-pretty-print.c   |  3 ++
 gcc/tree.c|  3 ++
 .../libgomp.oacc-c-c++-common/routine-3.c | 33 
 .../routine-nohost-1.c| 18 +

[patch,openacc] Use oacc_verify_routine_clauses for C/C++

2018-10-02 Thread Cesar Philippidis
This patch introduces a new oacc_verify_routine_clauses function that
reports errors if the user abuses the gang, worker and vector clauses
for acc routine directives in C/C++. Fortran is a little different,
because the FE has its own IR. So, while it would be possible to defer
checking for gang, worker, vector parallelism until a tree node is
created for a function, we'd still have problems of verifying the
parallelism for functions and subroutines defined and declared inside
modules. The C and C++ FEs are similar enough that they can share a
common function.

Is this OK for trunk? I bootstrapped and regression tested it for x86_64
Linux with nvptx offloading. This only touches the OpenACC code path.

Cesar
[OpenACC] Use oacc_verify_routine_clauses for C/C++

2018-XX-YY  Thomas Schwinge  
	Cesar Philippidis  

	gcc/
	* omp-general.c (oacc_build_routine_dims): Move some of its
	processing into...
	(oacc_verify_routine_clauses): ... this new function.
	* omp-general.h (oacc_verify_routine_clauses): New prototype.

	gcc/c/
	* c-parser.c (c_parser_oacc_routine): Normalize order of clauses.
	(c_finish_oacc_routine): Call oacc_verify_routine_clauses.

	gcc/cp/
	* parser.c (cp_parser_oacc_routine)
	(cp_parser_late_parsing_oacc_routine): Normalize order of clauses.
	(cp_finalize_oacc_routine): Call oacc_verify_routine_clauses.

	gcc/testsuite/
	* c-c++-common/goacc/routine-level-of-parallelism-1.c: New test.

(cherry picked from gomp-4_0-branch r239520)
---
 gcc/c/c-parser.c  |   8 +
 gcc/cp/parser.c   |   9 +
 gcc/omp-general.c |  69 -
 gcc/omp-general.h |   1 +
 .../goacc/routine-level-of-parallelism-1.c| 265 ++
 5 files changed, 342 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/routine-level-of-parallelism-1.c

diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 3ca8fe71cc4..3517cb783d9 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -14999,6 +14999,9 @@ c_parser_oacc_routine (c_parser *parser, enum pragma_context context)
   data.clauses
 	= c_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK,
  "#pragma acc routine");
+  /* The clauses are in reverse order; fix that to make later diagnostic
+	 emission easier.  */
+  data.clauses = nreverse (data.clauses);
 
   if (TREE_CODE (decl) != FUNCTION_DECL)
 	{
@@ -15013,6 +15016,9 @@ c_parser_oacc_routine (c_parser *parser, enum pragma_context context)
   data.clauses
 	= c_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK,
  "#pragma acc routine");
+  /* The clauses are in reverse order; fix that to make later diagnostic
+	 emission easier.  */
+  data.clauses = nreverse (data.clauses);
 
   /* Emit a helpful diagnostic if there's another pragma following this
 	 one.  Also don't allow a static assertion declaration, as in the
@@ -15076,6 +15082,8 @@ c_finish_oacc_routine (struct oacc_routine_data *data, tree fndecl,
   return;
 }
 
+  oacc_verify_routine_clauses (&data->clauses, data->loc);
+
   if (oacc_get_fn_attrib (fndecl))
 {
   error_at (data->loc,
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 241226d8c21..fa7ee7798ae 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -38117,6 +38117,9 @@ cp_parser_oacc_routine (cp_parser *parser, cp_token *pragma_tok,
 	= cp_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK,
   "#pragma acc routine",
   cp_lexer_peek_token (parser->lexer));
+  /* The clauses are in reverse order; fix that to make later diagnostic
+	 emission easier.  */
+  data.clauses = nreverse (data.clauses);
 
   if (decl && is_overloaded_fn (decl)
 	  && (TREE_CODE (decl) != FUNCTION_DECL
@@ -38213,6 +38216,9 @@ cp_parser_late_parsing_oacc_routine (cp_parser *parser, tree attrs)
   parser->oacc_routine->clauses
 = cp_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK,
   "#pragma acc routine", pragma_tok);
+  /* The clauses are in reverse order; fix that to make later diagnostic
+ emission easier.  */
+  parser->oacc_routine->clauses = nreverse (parser->oacc_routine->clauses);
   cp_parser_pop_lexer (parser);
   /* Later, cp_finalize_oacc_routine will process the clauses, and then set
  fndecl_seen.  */
@@ -38247,6 +38253,9 @@ cp_finalize_oacc_routine (cp_parser *parser, tree fndecl, bool is_defn)
 	  return;
 	}
 
+  oacc_verify_routine_clauses (&parser->oacc_routine->clauses,
+   parser->oacc_routine->loc);
+
   if (oacc_get_fn_attrib (fndecl))
 	{
 	  error_at (parser->oacc_routine->loc,
diff --git a/gcc/omp-general.c b/gcc/omp-general.c
index cac6de2..3ea2224957d 100644
--- a/gcc/omp-general.c
+++ b/gcc/omp-general.c
@@ -559,9 +559,64 @@ oacc_set_fn_attrib (tree fn, tree clauses, vec *args)
 }
 

Re: [patch,openacc] C, C++ OpenACC wait diagnostic change

2018-09-26 Thread Cesar Philippidis
On 09/26/2018 12:50 PM, Joseph Myers wrote:
> On Wed, 26 Sep 2018, Cesar Philippidis wrote:
> 
>> Attached is an old patch which updated the C and C++ FEs to use %<)%>
>> for the right ')' symbol. It's mostly a cosmetic change. All of the
>> changes are self-contained to the OpenACC code path.
> 
> Why is the "before ')'" included in the call to c_parser_error at all?  
> c_parser_error calls c_parse_error which adds its own " before " and token 
> description or expansion, so I'd expect the current error to result in a 
> message ending in something of the form "before X before Y".

On closer inspection

 #pragma acc parallel copyin (a[0:N]) copy (b[0:N]) wait (1 /* {
dg-error "expected '\\\)' before end of line" } */
-/* { dg-error "expected integer expression before '\\\)'" "" {
target c++ } .-1 } */
+/* { dg-error "expected integer expression list before" "" { target
c++ } .-1 } */

so this is only applicable to c++. But in C++ I see duplicate errors
like this

wait.c:29:29: error: expected ‘)’ before end of line
 #pragma acc parallel wait (1
   ~ ^
 )
wait.c:29:29: error: expected integer expression list before ‘)’ before
end of line

I suppose for C++ that's an improvement over

wait.c:29:29: error: expected integer expression before ')' before end
of line

Julian, I need to start working on deep copy in OpenACC. Can you take
over this patch? The error handling code in the C FE needs to be removed
because it's dead.

Thanks,
Cesar


Re: [patch,wip] warn on noncontiguous pointers

2018-09-26 Thread Cesar Philippidis
On 09/26/2018 01:49 PM, Thomas Koenig wrote:
> Hi Cesar,
> 
>> As of GCC 8, gfortran now errors when a pointer with a contiguous
>> attribute is set to point to a target without a contiguous attribute. I
>> think this is overly strict, and should probably be demoted to a
>> pedantic warning as I've done in the attached patch.
> 
> We had a lengthy discussion on that one. Still, we can dig into the
> standard for that one.
> 
> J3/10-007 says in 7.2.2.3  Data pointer assignment
> 
> # 7 If the pointer object has the CONTIGUOUS attribute, the pointer
> # target shall be contiguous.
> 
> # 9 If bounds-remapping-list is specified, the pointer target shall
> # be simply contiguous (6.5.4) or of rank one
> 
> program test
>    implicit none
>    real,pointer :: fptr1(:)
>    real,pointer,contiguous :: fptr3(:,:,:)
> 
>    allocate(fptr1(12))
>    call random_number(fptr1)
> 
>    !Test pointer reshape II
> 
>    fptr3(1:2,1:2,1:2) => fptr1(4:)
> 
> end program
> 
> So, by paragraph 9, this would be OK. Let's see what paragraph 7
> means when it says "contiguous". 5.3.7 says
> 
> An object is contiguous if it is
> 
> # (1) an object with the CONTIGUOUS attribute,
> # (2) a nonpointer whole array that is not assumed-shape,
> # (3) an assumed-shape array that is argument associated with an
>  array that is contiguous,
> # (4) an array allocated by an ALLOCATE statement,
> # (5) a pointer associated with a contiguous target, or
> # (6) a nonzero-sized array section (6.5.3) provided that
> #   (a) its base object is contiguous,
> #   (b) it does not have a vector subscript,
> #   (c) the elements of the section, in array element order, are a
> #   subset of the base object elements that are consecutive in
> #   array element order,
> #   (d) if the array is of type character and a substring-range appears,
> #   the substring-range specifies all of the characters of the
> #   parent string (6.4.1),
> #   (e) only its final part-ref has nonzero rank, and
> #   (f) it is not the real or imaginary part (6.4.4) of an array of type
> #   complex.
> 
> An object is not contiguous if it is an array subobject, and
> 
> [conditions not relevant elided]
> 
> # It is processor dependent whether any other object is contiguous.
> 
> If we go down the list, we see that fptr1(4:) is not contiguous; it
> is not an array (it is a pointer), so (4) also does not apply.
> 
> So, we are in the realm of processor dependent behavior, so we can
> chose what to do.
> 
> The last time we discussed this, we agreed on a hard error.  One
> important argument is that a mistakenly applied contiguous
> attribute will lead to wrong code, and that it is quite easy
> to check this, as we do now.
> 
> So, I think we should leave the behavior as it is now, and

Thank you for the explanation. That all seems reasonable.

>> Maybe the ScaTeLib code needs to be updated.
> 
> sounds like a good idea to me.

ACK.

Thanks,
Cesar



[patch,wip] warn on noncontiguous pointers

2018-09-26 Thread Cesar Philippidis
As of GCC 8, gfortran now errors when a pointer with a contiguous
attribute is set to point to a target without a contiguous attribute. I
think this is overly strict, and should probably be demoted to a
pedantic warning as I've done in the attached patch.

I ran into this issue while I was tuning GCC for lsdalton. Specifically,
CMake generates (not exactly because I reduced it) the following test
case for ScaTeLib to determine if that library can be enabled:

program test
   implicit none
   real,pointer :: fptr1(:)
   real,pointer,contiguous :: fptr3(:,:,:)

   allocate(fptr1(12))
   call random_number(fptr1)

   !Test pointer reshape II

   fptr3(1:2,1:2,1:2) => fptr1(4:)
end program

Note how fptr1 doesn't have a contiguous attribute. Does anyone have
thoughts on this? Maybe the ScaTeLib code needs to be updated.

Thanks,
Cesar
Disable "Assignment to contiguous pointer from non-contiguous target" error

2018-XX-YY  Cesar Philippidis  

	gcc/fortran/
	* expr.c (gfc_check_pointer_assign): Demote "Assignment to
	contiguous pointer from non-contiguous target" to a warning.
---

diff --git a/gcc/fortran/expr.c b/gcc/fortran/expr.c
index 3315bb840af..74caa4f2d59 100644
--- a/gcc/fortran/expr.c
+++ b/gcc/fortran/expr.c
@@ -3957,13 +3957,13 @@ gfc_check_pointer_assign (gfc_expr *lvalue, gfc_expr *rvalue)
 	  }
 }
 
-  /* Error for assignments of contiguous pointers to targets which is not
+  /* Warn for assignments of contiguous pointers to targets which is not
  contiguous.  Be lenient in the definition of what counts as
  contiguous.  */
 
   if (lhs_attr.contiguous && !gfc_is_simply_contiguous (rvalue, false, true))
-gfc_error ("Assignment to contiguous pointer from non-contiguous "
-	   "target at %L", &rvalue->where);
+gfc_warning (OPT_Wpedantic, "Assignment to contiguous pointer from "
+		 "non-contiguous target at %L", &rvalue->where);
 
   /* Warn if it is the LHS pointer may lives longer than the RHS target.  */
   if (warn_target_lifetime
-- 
2.17.1



[patch,openacc] Use correct location information for OpenACC shape and simple clauses in C/C++

2018-09-26 Thread Cesar Philippidis
Thomas, this is your old gomp4 patch that updates the error locations
for gang, worker and vector clauses. Those functions are parsed in
{c,cp}_parser_oacc_shape_clause. I'm not sure how much of an impact this
patch has given that it does not require any test suite changes.
However, we do have a couple of tests in og8 that haven't been merged to
trunk, so perhaps this functionality will be exercised in a later patch
series. Unfortunately, there are a lot of inter-dependencies between all of
the pending og8->trunk patches.

Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
Linux with nvptx offloading.

Cesar
[OpenACC] Use correct location information for OpenACC shape and simple
 clauses in C/C++

2018-XX-YY  Thomas Schwinge  
	    Cesar Philippidis  

	gcc/c/
	* c-parser.c (c_parser_oacc_shape_clause)
	(c_parser_oacc_simple_clause): Add loc formal parameter.  Adjust
	all users.
	gcc/cp/
	* parser.c (cp_parser_oacc_shape_clause): Add loc formal
	parameter.  Adjust all users.

(cherry picked from gomp-4_0-branch r239519)

update fallout for acc finalize / if_present
---
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index b8fc000b50d..d1e9876065a 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -12602,12 +12602,12 @@ c_parser_oacc_single_int_clause (c_parser *parser, omp_clause_code code,
 */
 
 static tree
-c_parser_oacc_shape_clause (c_parser *parser, omp_clause_code kind,
+c_parser_oacc_shape_clause (c_parser *parser, location_t loc,
+			omp_clause_code kind,
 			const char *str, tree list)
 {
   const char *id = "num";
   tree ops[2] = { NULL_TREE, NULL_TREE }, c;
-  location_t loc = c_parser_peek_token (parser)->location;
 
   if (kind == OMP_CLAUSE_VECTOR)
 id = "length";
@@ -12739,12 +12739,12 @@ c_parser_oacc_shape_clause (c_parser *parser, omp_clause_code kind,
seq */
 
 static tree
-c_parser_oacc_simple_clause (c_parser *parser, enum omp_clause_code code,
-			 tree list)
+c_parser_oacc_simple_clause (c_parser * /* parser */, location_t loc,
+			 enum omp_clause_code code, tree list)
 {
   check_no_duplicate_clause (list, code, omp_clause_code_name[code]);
 
-  tree c = build_omp_clause (c_parser_peek_token (parser)->location, code);
+  tree c = build_omp_clause (loc, code);
   OMP_CLAUSE_CHAIN (c) = list;
 
   return c;
@@ -14046,8 +14046,8 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  c_name = "async";
 	  break;
 	case PRAGMA_OACC_CLAUSE_AUTO:
-	  clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_AUTO,
-		clauses);
+	  clauses = c_parser_oacc_simple_clause (parser, here, OMP_CLAUSE_AUTO,
+		 clauses);
 	  c_name = "auto";
 	  break;
 	case PRAGMA_OACC_CLAUSE_COLLAPSE:
@@ -14091,8 +14091,8 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  c_name = "device_resident";
 	  break;
 	case PRAGMA_OACC_CLAUSE_FINALIZE:
-	  clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_FINALIZE,
-		 clauses);
+	  clauses = c_parser_oacc_simple_clause (parser, here,
+		 OMP_CLAUSE_FINALIZE, clauses);
 	  c_name = "finalize";
 	  break;
 	case PRAGMA_OACC_CLAUSE_FIRSTPRIVATE:
@@ -14101,7 +14101,7 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  break;
 	case PRAGMA_OACC_CLAUSE_GANG:
 	  c_name = "gang";
-	  clauses = c_parser_oacc_shape_clause (parser, OMP_CLAUSE_GANG,
+	  clauses = c_parser_oacc_shape_clause (parser, here, OMP_CLAUSE_GANG,
 		c_name, clauses);
 	  break;
 	case PRAGMA_OACC_CLAUSE_HOST:
@@ -14113,13 +14113,15 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  c_name = "if";
 	  break;
 	case PRAGMA_OACC_CLAUSE_IF_PRESENT:
-	  clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_IF_PRESENT,
+	  clauses = c_parser_oacc_simple_clause (parser, here,
+		 OMP_CLAUSE_IF_PRESENT,
 		 clauses);
 	  c_name = "if_present";
 	  break;
 	case PRAGMA_OACC_CLAUSE_INDEPENDENT:
-	  clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_INDEPENDENT,
-		clauses);
+	  clauses = c_parser_oacc_simple_clause (parser, here,
+		 OMP_CLAUSE_INDEPENDENT,
+		 clauses);
 	  c_name = "independent";
 	  break;
 	case PRAGMA_OACC_CLAUSE_LINK:
@@ -14151,7 +14153,7 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  c_name = "reduction";
 	  break;
 	case PRAGMA_OACC_CLAUSE_SEQ:
-	  clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_SEQ,
+	  clauses = c_parser_oacc_simple_clause (parser, here, OMP_CLAUSE_SEQ,
 		clauses);
 	  c_name = "seq";
 	  break;
@@ -14165,7 +14167,7 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  break;
 	case PRAGMA_OACC_CLAUSE_VECTOR:
 	  c_name = "vector";
-	  clauses = c_parser_oacc_shape_clause (parser, OMP_CLAUSE_VECTOR,
+	  clauses = c_parser_oacc_shape_clause (parser, here, OMP_CLAUSE_VECTOR,
 

[patch] nvptx libgcc atomic routines

2018-09-26 Thread Cesar Philippidis
This patch adds nvptx support for the atomic FETCH_AND_OP functions. I
recall that this used to be important for OpenACC reductions back in the
GCC 5.0 days before Nathan split reductions into four phases. Nowadays,
atomic reductions use a spin lock that's implemented directly by the
nvptx BE. Therefore, I'm not sure if the nvptx port still needs support
for atomic fetch_and_*.

Tom and Thomas, do either of you have any thoughts on this? Should I
commit it to trunk? I bootstrapped and regtested it for x86_64 Linux
with nvptx offloading.

Thanks,
Cesar
nvptx libgcc atomic routines

2018-XX-YY  Cesar Philippidis  

	libgcc/
	* config/nvptx/atomic.c: New file.
	* config/nvptx/t-nvptx (LIB2ADD): Include it.

(cherry picked from gomp-4_0-branch r223177)
---
 libgcc/config/nvptx/atomic.c | 279 +++
 libgcc/config/nvptx/t-nvptx  |   3 +-
 2 files changed, 281 insertions(+), 1 deletion(-)
 create mode 100644 libgcc/config/nvptx/atomic.c

diff --git a/libgcc/config/nvptx/atomic.c b/libgcc/config/nvptx/atomic.c
new file mode 100644
index 000..ab6cf23ef9d
--- /dev/null
+++ b/libgcc/config/nvptx/atomic.c
@@ -0,0 +1,279 @@
+/* Atomic operations for PTX.
+   Copyright (C) 2015-2018 Free Software Foundation, Inc.
+   Contributed by Mentor Graphics.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Kernel helper for compare-and-exchange.  */
+static int
+nvidia_cas (int oldval, int newval, int *ptr)
+{
+  int ret;
+
+  asm volatile ("atom.cas.b32 %0, [%1], %2, %3;" : "=r"(ret) : "r"(ptr),
+		"r"(oldval), "r"(newval));
+
+  return ret;
+}
+
+#define __kernel_cmpxchg (nvidia_cas)
+
+/* Kernel helper for memory barrier.  */
+static void
+__threadfence_block (void)
+{
+  asm volatile ("membar.cta;");
+}
+
+#define __kernel_dmb (__threadfence_block)
+
+#define HIDDEN
+
+/* Warning: this assumes that all nvptx targets are little endian.  */
+
+#define INVERT_MASK_1 0
+#define INVERT_MASK_2 0
+
+#define MASK_1 0xffu
+#define MASK_2 0xu
+
+#define FETCH_AND_OP_WORD(OP, PFX_OP, INF_OP)\
+  int HIDDEN\
+  __sync_fetch_and_##OP##_4 (int *ptr, int val)\
+  {	\
+int failure, tmp;			\
+	\
+do {\
+  tmp = *ptr;			\
+  failure = __kernel_cmpxchg (tmp, PFX_OP (tmp INF_OP val), ptr);	\
+} while (failure != 0);		\
+	\
+return tmp;\
+  }
+
+FETCH_AND_OP_WORD (add,   , +)
+FETCH_AND_OP_WORD (sub,   , -)
+FETCH_AND_OP_WORD (or,, |)
+FETCH_AND_OP_WORD (and,   , &)
+FETCH_AND_OP_WORD (xor,   , ^)
+FETCH_AND_OP_WORD (nand, ~, &)
+
+#define NAME_oldval(OP, WIDTH) __sync_fetch_and_##OP##_##WIDTH
+#define NAME_newval(OP, WIDTH) __sync_##OP##_and_fetch_##WIDTH
+
+/* Implement both __sync_<op>_and_fetch and __sync_fetch_and_<op> for
+   subword-sized quantities.  */
+
+#define SUBWORD_SYNC_OP(OP, PFX_OP, INF_OP, TYPE, WIDTH, RETURN)	\
+  TYPE HIDDEN\
+  NAME##_##RETURN (OP, WIDTH) (TYPE *ptr, TYPE val)			\
+  {	\
+int *wordptr = (int *) ((unsigned long) ptr & ~3);			\
+unsigned int mask, shift, oldval, newval;\
+int failure;			\
+	\
+shift = (((unsigned long) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH;	\
+mask = MASK_##WIDTH << shift;	\
+	\
+do {\
+  oldval = *wordptr;		\
+  newval = ((PFX_OP (((oldval & mask) >> shift)			\
+			 INF_OP (unsigned int) val)) << shift) & mask;	\
+  newval |= oldval & ~mask;		\
+  failure = __kernel_cmpxchg (oldval, newval, wordptr);		\
+} while (failure != 0);		\
+	\
+return (RETURN & mask) >> shift;	\
+  }
+
+SUBWORD_SYNC_OP (add,   , +, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (sub,   , -, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (or,, |, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (and,   , &, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (xor,   , ^, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (nand, ~, &, unsigned short, 2, oldval)
+
+SUBWORD_SYNC_OP (add,   , +, 

[patch,openacc] Don't gimplify in ssa mode if seen_error in oacc_xform_loop

2018-09-26 Thread Cesar Philippidis
Again, this is another old gomp4 patch without a corresponding test
case. I'm not familiar enough with the parloops kernels implementation
to know if this patch is important. However, I'm somewhat inclined to
drop patches from OG8 that don't impact correctness in the test suite.

What do you want to do with this patch, Thomas? I bootstrapped and
regtested it for x86_64 Linux with nvptx offloading.

Cesar
[OpenACC] Don't gimplify in ssa mode if seen_error in oacc_xform_loop

2018-XX-YY  Tom de Vries  
	Cesar Philippidis  

	gcc/
	PR tree-optimization/68977
	* omp-offload.c (oacc_xform_loop): Handle seen_error () == true.

(cherry picked from gomp-4_0-branch r232343 and r232344)
---

diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 3582dda3d1a..dae284fe890 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -335,7 +335,12 @@ oacc_xform_loop (gcall *call)
-> chunks=ceil (range/(chunksize*threads*step))
  striding=false,chunking=false
-> chunk_size=ceil(range/(threads*step)),chunks=1  */
-  push_gimplify_context (true);
+
+  /* If seen_error (), we may introduce an uninitialized var due to
+ gimplification bailing out.  If we gimplify in ssa mode, that will cause an
+ ICE.  If we gimplify in non-ssa mode, then ssa updating will turn it into a
+ default definition, and we avoid the ICE.  */
+  push_gimplify_context (!seen_error ());
 
   switch (code)
 {
-- 
2.17.1



[patch,openacc] use existing local variable in cp_parser_oacc_enter_exit_data

2018-09-26 Thread Cesar Philippidis
This is an old gomp4 patch that updates the location of the clause for
acc enter/exit data. Apparently, it didn't impact any test cases. Is
this OK for trunk or should we drop it from OG8?

I bootstrapped and regtested it for x86_64 Linux with nvptx offloading.

Thanks,
Cesar
[OpenACC] Use existing local variable in cp_parser_oacc_enter_exit_data

2018-XX-YY  James Norris 
	Cesar Philippidis  

	gcc/cp/
	* parser.c (cp_parser_oacc_enter_exit_data): Use existing local
	variable.

(cherry picked from gomp-4_0-branch r223007)
---

diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 89f239e0f20..c6ebc494e59 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -37064,7 +37064,7 @@ cp_parser_oacc_enter_exit_data (cp_parser *parser, cp_token *pragma_tok,
   stmt = enter ? make_node (OACC_ENTER_DATA) : make_node (OACC_EXIT_DATA);
   TREE_TYPE (stmt) = void_type_node;
   OMP_STANDALONE_CLAUSES (stmt) = clauses;
-  SET_EXPR_LOCATION (stmt, pragma_tok->location);
+  SET_EXPR_LOCATION (stmt, loc);
   add_stmt (stmt);
   return stmt;
 }
-- 
2.17.1



[patch,openacc] C, C++ OpenACC wait diagnostic change

2018-09-26 Thread Cesar Philippidis
Attached is an old patch which updated the C and C++ FEs to use %<)%>
for the right ')' symbol. It's mostly a cosmetic change. All of the
changes are self-contained to the OpenACC code path.

Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux
with nvptx offloading.

Thanks,
Cesar
[OpenACC] C, C++ OpenACC wait diagnostic change

2018-XX-YY  James Norris  
	    Cesar Philippidis  

	gcc/c/
	* c-parser.c (c_parser_oacc_wait_list): Change error message.
	gcc/cp/
* parser.c (cp_parser_oacc_wait_list): Change error message.
	gcc/testsuite/
* c-c++-common/goacc/asyncwait-1: Update messages.

(cherry picked from gomp-4_0-branch r223007, e4ea0a3)

diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 1766a256633..b8fc000b50d 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -11597,7 +11597,8 @@ c_parser_oacc_wait_list (c_parser *parser, location_t clause_loc, tree list)
 
   if (args->length () == 0)
 {
-  c_parser_error (parser, "expected integer expression before ')'");
+  c_parser_error (parser,
+		  "expected integer expression list before %<)%>");
   release_tree_vector (args);
   return list;
 }
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index c6ebc494e59..e80c1fba670 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -32094,7 +32094,8 @@ cp_parser_oacc_wait_list (cp_parser *parser, location_t clause_loc, tree list)
 
   if (args == NULL || args->length () == 0)
 {
-  cp_parser_error (parser, "expected integer expression before ')'");
+  cp_parser_error (parser,
+		   "expected integer expression list before %<)%>");
   if (args != NULL)
 	release_tree_vector (args);
   return list;
diff --git a/gcc/testsuite/c-c++-common/goacc/asyncwait-1.c b/gcc/testsuite/c-c++-common/goacc/asyncwait-1.c
index e1840af5d70..2fc89486ee5 100644
--- a/gcc/testsuite/c-c++-common/goacc/asyncwait-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/asyncwait-1.c
@@ -116,7 +116,7 @@ f (int N, float *a, float *b)
 }
 
 #pragma acc parallel copyin (a[0:N]) copy (b[0:N]) wait (1 /* { dg-error "expected '\\\)' before end of line" } */
-/* { dg-error "expected integer expression before '\\\)'" "" { target c++ } .-1 } */
+/* { dg-error "expected integer expression list before" "" { target c++ } .-1 } */
 {
 for (ii = 0; ii < N; ii++)
 b[ii] = a[ii];
@@ -171,7 +171,7 @@ f (int N, float *a, float *b)
 #pragma acc wait (1,2,,) /* { dg-error "expected (primary-|)expression before" } */
 
 #pragma acc wait (1 /* { dg-error "expected '\\\)' before end of line" } */
-/* { dg-error "expected integer expression before '\\\)'" "" { target c++ } .-1 } */
+/* { dg-error "expected integer expression list before" "" { target c++ } .-1 } */
 
 #pragma acc wait (1,*) /* { dg-error "expected (primary-|)expression before" } */
 
-- 
2.17.1



Re: [PATCH][OpenACC] Update deviceptr handling during gimplification

2018-09-26 Thread Cesar Philippidis
On 09/25/2018 05:55 PM, Julian Brown wrote:
> On Tue, 7 Aug 2018 15:09:38 -0700
> Cesar Philippidis  wrote:
> 
>> I had previously posted this patch as part of a monster deviceptr
>> patch here
>> <https://gcc.gnu.org/ml/gcc-patches/2018-06/msg01911.html>. This
>> patch breaks out the generic gimplifier changes. Essentially, with
>> this patch, the gimplifier will now transfer deviceptr data clauses
>> using GOMP_MAP_FORCE_DEVICEPTR.
>>
>> Is this patch OK for trunk? It bootstrapped / regression tested
>> cleanly for x86_64 with nvptx offloading.
> 
> This patch also appears to fix the attached test case, which had been
> associated with a different deviceptr-related patch on the og8 branch
> (the other parts of which are upstream already). Perhaps you'd like to
> incorporate this test into your patch? It was by James Norris
> originally, IIUC.

Ok, I'll do that. Thanks for updating those tests.

Cesar


[patch,openacc] update fortran nested parallelism error messages

2018-09-24 Thread Cesar Philippidis
Bernhard noticed a typo in one of the OpenACC parallelism error
messages. The error should have reported that gang loops cannot be
nested inside vector loops, not worker loops.

I'll commit the attached patch to trunk as obvious. I bootstrapped and
regtested it against x86_64 Linux with nvptx offloading.

Cesar
[OpenACC] update fortran nested parallelism error messages

2018-09-24  Bernhard Reutner-Fischer  
	Cesar Philippidis  

	gcc/fortran/
	* openmp.c (resolve_oacc_loop_blocks): Fix typo in error message.

	gcc/testsuite/
	* gfortran.dg/goacc/nested-parallelism.f90: New test.

---
 gcc/fortran/openmp.c  |  2 +-
 .../gfortran.dg/goacc/nested-parallelism.f90  | 51 +++
 2 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/nested-parallelism.f90

diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index ac1923ea06b..a046863445d 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -5913,7 +5913,7 @@ resolve_oacc_loop_blocks (gfc_code *code)
 			 >loc);
 		if (c->code->ext.omp_clauses->vector)
 		  gfc_error ("Loop parallelized across gangs is not allowed "
-			 "inside loop parallelized across workers at %L",
+			 "inside loop parallelized across vectors at %L",
 			 >loc);
 	  }
 	if (code->ext.omp_clauses->worker)
diff --git a/gcc/testsuite/gfortran.dg/goacc/nested-parallelism.f90 b/gcc/testsuite/gfortran.dg/goacc/nested-parallelism.f90
new file mode 100644
index 000..6ebef6a4547
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/nested-parallelism.f90
@@ -0,0 +1,51 @@
+! Verify the invalid gang, worker, vector parallelism error messages.
+
+program np
+  integer, parameter :: n = 100
+  integer :: i, j, k
+
+  !$acc parallel loop gang
+  do i = 1, n
+ !$acc loop gang ! { dg-error "gangs is not allowed inside another loop parallelized across gangs" }
+ do j = 1, n
+ end do
+
+ !$acc loop worker
+ do j = 1, n
+ end do
+
+ !$acc loop vector
+ do j = 1, n
+ end do 
+  end do
+  
+  !$acc parallel loop worker
+  do i = 1, n
+ !$acc loop gang ! { dg-error "gangs is not allowed inside loop parallelized across workers" }
+ do j = 1, n
+ end do
+
+ !$acc loop worker ! { dg-error "workers is not allowed inside another loop parallelized across workers" }
+ do j = 1, n
+ end do
+
+ !$acc loop vector
+ do j = 1, n
+ end do 
+  end do
+
+  !$acc parallel loop vector
+  do i = 1, n
+ !$acc loop gang ! { dg-error "gangs is not allowed inside loop parallelized across vectors" }
+ do j = 1, n
+ end do
+
+ !$acc loop worker ! { dg-error "workers is not allowed inside another loop parallelized across vectors" }
+ do j = 1, n
+ end do
+
+ !$acc loop vector ! { dg-error "vectors is not allowed inside another loop parallelized across vectors" }
+ do j = 1, n
+ end do 
+  end do
+end program np
-- 
2.17.1



Re: [openacc] Teach gfortran to lower OpenACC routine dims

2018-09-24 Thread Cesar Philippidis
On 09/20/2018 09:10 AM, Bernhard Reutner-Fischer wrote:
> On Thu, 20 Sep 2018 07:41:08 -0700
> Cesar Philippidis  wrote:
> 
>> On 09/19/2018 03:27 PM, Bernhard Reutner-Fischer wrote:
>>> On Wed, 5 Sep 2018 12:52:03 -0700
>>> Cesar Philippidis  wrote:
> 
>>>> diff --git a/gcc/fortran/trans-decl.c b/gcc/fortran/trans-decl.c
>>>> index eea6b81ebfa..eed868f475b 100644
>>>> --- a/gcc/fortran/trans-decl.c
>>>> +++ b/gcc/fortran/trans-decl.c
>>>> @@ -46,6 +46,7 @@ along with GCC; see the file COPYING3.  If not
>>>> see #include "trans-stmt.h"
>>>>  #include "gomp-constants.h"
>>>>  #include "gimplify.h"
>>>> +#include "omp-general.h"  
>>>
>>> hmz. so the gomp-constants.h include would be redundant, but do we
>>> really need omp-general.h?  
>>
>> Good point. omp-general.h is required for oacc_build_routine_dims.
>>
>>> Doesn't this suggest to move this oacc dims lowering to
>>> trans-openmp.c instead, please?  
>>
>> So something like adding a new gfc_add_omp_offload_attributes to
>> trans-openmp.c and call it from add_attributes_to_decl?
> 
> yes.
> 
>> On a related note, I noticed that I forgot to incorporate this change
>> in gfortran.h:
>>
>> @@ -902,7 +912,7 @@ typedef struct
>>unsigned oacc_declare_link:1;
>>
>>/* This is an OpenACC acclerator function at level N - 1  */
>> -  unsigned oacc_function:3;
>> +  ENUM_BITFIELD (oacc_function) oacc_function:3;
>>
>> It's probably not huge, but I noticed that some other enum bitfields
>> are declared that way.
> 
> yea, some compilers had trouble with enum bitfields (where plain int
> bitfields like here worked fine, IIRC) but i'm not sure if it's
> considered legacy these days. Fine with me to be safe.

I updated the patch by incorporating all of those changes. Is it OK for
trunk?

Thanks,
Cesar
[openacc] Make GFC default to -1 for OpenACC routine dims

2018-09-24  Cesar Philippidis  

	* gfortran.h (enum oacc_function): New enum.
	(gfc_oacc_routine_name): Add locus loc field.
	(symbol_attribute): Update type of oacc_function field.	
	* openmp.c (gfc_oacc_routine_dims): Return oacc_function.
	(gfc_match_oacc_routine): Update routine clause syntax checking.
	Populate oacc_function attribute with dims.
	* trans-decl.c (add_attributes_to_decl): Use oacc_build_routine_dims
	to construct routine dims.
	* trans.h (gfc_add_omp_offload_attributes): Declare.
	* trans-decl.c (add_attributes_to_decl): Use it to set OMP and ACC
	offload function attributes.
	* trans-openmp.c (gfc_add_omp_offload_attributes): New function.

	gcc/testsuite/
	* gfortran.dg/goacc/classify-routine.f95: Adjust test.
	* gfortran.dg/goacc/pr71704.f90: Likewise.
	* gfortran.dg/goacc/routine-6.f90: Likewise.
	* gfortran.dg/goacc/routine-8.f90: Likewise.
	* gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise.

	libgomp/
	* testsuite/libgomp.oacc-fortran/routine-1.f90: Adjust test.
	* testsuite/libgomp.oacc-fortran/routine-2.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-3.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-4.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-5.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-7.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-9.f90: Likewise.
	* libgomp.oacc-fortran/host_data-2.f90: Likewise.
	* libgomp.oacc-fortran/host_data-3.f: Likewise.
	* libgomp.oacc-fortran/host_data-4.f90: Likewise.


diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 04b0024a992..3efd59c95f7 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -316,6 +316,16 @@ enum save_state
 { SAVE_NONE = 0, SAVE_EXPLICIT, SAVE_IMPLICIT
 };
 
+/* Flags to keep track of ACC routine states.  */
+enum oacc_function
+{ OACC_FUNCTION_NONE = 0,
+  OACC_FUNCTION_GANG,
+  OACC_FUNCTION_WORKER,
+  OACC_FUNCTION_VECTOR,
+  OACC_FUNCTION_SEQ,
+  OACC_FUNCTION_AUTO
+};
+
 /* Strings for all symbol attributes.  We use these for dumping the
parse tree, in error messages, and also when reading and writing
modules.  In symbol.c.  */
@@ -902,7 +912,7 @@ typedef struct
   unsigned oacc_declare_link:1;
 
   /* This is an OpenACC acclerator function at level N - 1  */
-  unsigned oacc_function:3;
+  ENUM_BITFIELD (oacc_function) oacc_function:3;
 
   /* Attributes set by compiler extensions (!GCC$ ATTRIBUTES).  */
   unsigned ext_attr:EXT_ATTR_NUM;
@@ -1726,6 +1736,7 @@ typedef struct gfc_oacc_routine_name
   struct gfc_symbol *sym;
   struct gfc_omp_clauses *clauses;
   struct gfc_oacc_routine_name *next;
+  locus loc;
 }
 gfc_oacc_routine_name;
 
diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index 94a7f7eaa50..ac1923ea06b 100644
--- a/gcc/for

Re: [patch,openacc] Generate sequential loop for OpenACC loop directive inside kernels

2018-09-20 Thread Cesar Philippidis
On 09/20/2018 10:14 AM, Cesar Philippidis wrote:
> As Chung-Lin noted here
> <https://gcc.gnu.org/ml/gcc-patches/2015-06/msg01079.html>:
> 
>   This patch adjusts omp-low.c:expand_omp_for_generic() to expand to a
>   "sequential" loop form (without the OMP runtime calls), used for loop
>   directives inside OpenACC kernels constructs. Tom mentions that this
>   allows the kernels parallelization to work when '#pragma acc loop'
>   makes the front-ends create OMP_FOR, which the loop analysis phases
>   don't understand.
> 
> I bootstrapped and regtested it on x86_64 Linux with nvptx offloading.
> Is this patch OK for trunk?

I forgot to mention how that patch depends on the
omp_target_base_pointers_restrict_p functionality from omp lowering that
I removed back in June when I added support for the OpenACC 2.5 data
clause semantics. It turned out that I was too aggressive when I was
removing unused code. That's because, at least initially, there were no
test cases that exercised that functionality in trunk until Chung-Lin's
kernels patch goes in.

Anyway, this patch is specifically required to get
kernels-acc-loop-reduction.c working.

Is this OK for trunk? I bootstrapped and regression tested it on x86_64
Linux with nvptx offloading.

Thanks,
Cesar
[OpenACC] Reintroduce omp_target_base_pointers_restrict_p

It turns out that the existing acc kernels infrastructure based on
parloops will benefit if the variables used in OpenACC data clauses
maintain the restrict pointer qualifier. This code is present in GCC 8,
but I removed it back in June when I committed a patch to update the
behavior of the data clauses to match the semantics in OpenACC 2.5.

Is this patch OK for trunk? A forthcoming acc kernels patch depends on
it.

2018-XX-YY  Cesar Philippidis  

	* omp-low.c (install_var_field): New base_pointer_restrict
	argument.
	(scan_sharing_clauses): Update call to install_var_field.
	(omp_target_base_pointers_restrict_p): New function.
	(scan_omp_target): Update call to install_var_field.
---
 gcc/omp-low.c | 89 +++
 1 file changed, 83 insertions(+), 6 deletions(-)

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 24685fd012c..a59c15ae5fd 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -642,7 +642,8 @@ build_sender_ref (tree var, omp_context *ctx)
BASE_POINTERS_RESTRICT, declare the field with restrict.  */
 
 static void
-install_var_field (tree var, bool by_ref, int mask, omp_context *ctx)
+install_var_field (tree var, bool by_ref, int mask, omp_context *ctx,
+		   bool base_pointers_restrict = false)
 {
   tree field, type, sfield = NULL_TREE;
   splay_tree_key key = (splay_tree_key) var;
@@ -673,7 +674,11 @@ install_var_field (tree var, bool by_ref, int mask, omp_context *ctx)
   type = build_pointer_type (build_pointer_type (type));
 }
   else if (by_ref)
-type = build_pointer_type (type);
+{
+  type = build_pointer_type (type);
+  if (base_pointers_restrict)
+	type = build_qualified_type (type, TYPE_QUAL_RESTRICT);
+}
   else if ((mask & 3) == 1 && omp_is_reference (var))
 type = TREE_TYPE (type);
 
@@ -987,10 +992,12 @@ fixup_child_record_type (omp_context *ctx)
 }
 
 /* Instantiate decls as necessary in CTX to satisfy the data sharing
-   specified by CLAUSES.  */
+   specified by CLAUSES.  If BASE_POINTERS_RESTRICT, install var field with
+   restrict.  */
 
 static void
-scan_sharing_clauses (tree clauses, omp_context *ctx)
+scan_sharing_clauses (tree clauses, omp_context *ctx,
+		  bool base_pointers_restrict = false)
 {
   tree c, decl;
   bool scan_array_reductions = false;
@@ -1252,7 +1259,8 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
 		  && TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE)
 		install_var_field (decl, true, 7, ctx);
 		  else
-		install_var_field (decl, true, 3, ctx);
+		install_var_field (decl, true, 3, ctx,
+   base_pointers_restrict);
 		  if (is_gimple_omp_offloaded (ctx->stmt)
 		  && !OMP_CLAUSE_MAP_IN_REDUCTION (c))
 		install_var_local (decl, ctx);
@@ -2265,6 +2273,68 @@ scan_omp_single (gomp_single *stmt, omp_context *outer_ctx)
 layout_type (ctx->record_type);
 }
 
+/* Return true if the CLAUSES of an omp target guarantee that the base pointers
+   used in the corresponding offloaded function are restrict.  */
+
+static bool
+omp_target_base_pointers_restrict_p (tree clauses)
+{
+  /* The analysis relies on the GOMP_MAP_FORCE_* mapping kinds, which are only
+ used by OpenACC.  */
+  if (flag_openacc == 0)
+return false;
+
+  /* I.  Basic example:
+
+   void foo (void)
+   {
+	 unsigned int a[2], b[2];
+
+	 #pragma acc kernels \
+	   copyout (a) \
+	   copyout (b)
+	 {
+	   a[0] = 0;
+	   b[0] = 1;
+	 }
+   }
+
+ After gimplification, we have:
+
+   #pragma omp target oacc_kernels \
+	 map(force_from:a [len: 8]) \
+	 map(force_from:b

Re: [patch,openacc] handle missing OMP_LIST_ clauses in fortran's parse tree debugger

2018-09-20 Thread Cesar Philippidis
On 09/20/2018 11:22 AM, Paul Richard Thomas wrote:
> Hi Cesar,
> 
> It looks OK to me.
> 
> Thanks for the patch.
> 
> Paul

Thanks! Committed in r264446.

Cesar

> On 20 September 2018 at 18:21, Cesar Philippidis  
> wrote:
>> This patch updates Fortran's parse tree printer to print the names of
>> new OpenACC data clauses. I'm not sure if this functionality is widely used
>> or not, but from a standpoint of correctness, this patch would probably
>> be nice to have.
>>
>> Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
>> Linux with nvptx offloading.
>>
>> Thanks,
>> Cesar
> 
> 
> 



[patch,openacc] Update _OPENACC value and documentation for OpenACC 2.5

2018-09-20 Thread Cesar Philippidis
This patch formally introduces OpenACC 2.5 functionality in various GCC
documentation sources, along with the updated _OPENACC value in the
various offloading header files.

As of right now, GCC trunk already supports the updated OpenACC 2.5 data
clause semantics. Julian, Chung-Lin and I have been working on pushing
our remaining og8 patches to trunk (which we're down to under 30 now
from 170+). But a number of those changes involve performance tuning,
rather than new OpenACC functionality.

Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
Linux with nvptx offloading.

Thanks,
Cesar
[OpenACC] Update _OPENACC value and documentation for OpenACC 2.5

2018-XX-YY  Thomas Schwinge 
	Cesar Philippidis  

	gcc/c-family/
	* c-cppbuiltin.c (c_cpp_builtins): Update "_OPENACC" to "201510".
	gcc/fortran/
	* cpp.c (cpp_define_builtins): Update "_OPENACC" to "201510".
	* gfortran.texi: Update for OpenACC 2.5.
	* Intrinsic.texi: Likewise.
	* invoke.texi: Likewise.
	gcc/testsuite/
	* c-c++-common/cpp/openacc-define-3.c: Update.
	* gfortran.dg/openacc-define-3.f90: Likewise.
	gcc/
	* doc/invoke.texi: Update for OpenACC 2.5.
	libgomp/
	* libgomp.texi: Update for OpenACC 2.5.
	* openacc.f90 (openacc_version): Update to "201510".
	* openacc_lib.h (openacc_version): Likewise.
	* testsuite/libgomp.oacc-fortran/openacc_version-1.f: Update.
	* testsuite/libgomp.oacc-fortran/openacc_version-2.f90: Update.

(cherry picked from gomp-4_0-branch r248057, ccbbcb70569)
---
 gcc/c-family/c-cppbuiltin.c   |  2 +-
 gcc/doc/invoke.texi   |  4 +++-
 gcc/fortran/cpp.c |  2 +-
 gcc/fortran/gfortran.texi | 16 +-
 gcc/fortran/intrinsic.texi|  6 +++---
 gcc/fortran/invoke.texi   |  4 +---
 .../c-c++-common/cpp/openacc-define-3.c   |  2 +-
 .../gfortran.dg/openacc-define-3.f90  |  2 +-
 libgomp/libgomp.texi  | 21 ++-
 libgomp/openacc.f90   |  2 +-
 libgomp/openacc_lib.h |  2 +-
 .../libgomp.oacc-fortran/openacc_version-1.f  |  2 +-
 .../openacc_version-2.f90 |  2 +-
 13 files changed, 31 insertions(+), 36 deletions(-)

diff --git a/gcc/c-family/c-cppbuiltin.c b/gcc/c-family/c-cppbuiltin.c
index 96a6b4dfd2b..f2a273b6ac7 100644
--- a/gcc/c-family/c-cppbuiltin.c
+++ b/gcc/c-family/c-cppbuiltin.c
@@ -1391,7 +1391,7 @@ c_cpp_builtins (cpp_reader *pfile)
 cpp_define (pfile, "__SSP__=1");
 
   if (flag_openacc)
-cpp_define (pfile, "_OPENACC=201306");
+cpp_define (pfile, "_OPENACC=201510");
 
   if (flag_openmp)
 cpp_define (pfile, "_OPENMP=201511");
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 94304c314cf..34d7ff71512 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -2161,10 +2161,12 @@ freestanding and hosted environments.
 Enable handling of OpenACC directives @code{#pragma acc} in C/C++ and
 @code{!$acc} in Fortran.  When @option{-fopenacc} is specified, the
 compiler generates accelerated code according to the OpenACC Application
-Programming Interface v2.0 @w{@uref{https://www.openacc.org}}.  This option
+Programming Interface v2.5 @w{@uref{https://www.openacc.org}}.  This option
 implies @option{-pthread}, and thus is only supported on targets that
 have support for @option{-pthread}.
 
+See @uref{https://gcc.gnu.org/wiki/OpenACC} for more information.
+
 @item -fopenacc-dim=@var{geom}
 @opindex fopenacc-dim
 @cindex OpenACC accelerator programming
diff --git a/gcc/fortran/cpp.c b/gcc/fortran/cpp.c
index 0b3de42e832..14871129ff6 100644
--- a/gcc/fortran/cpp.c
+++ b/gcc/fortran/cpp.c
@@ -165,7 +165,7 @@ cpp_define_builtins (cpp_reader *pfile)
   cpp_define (pfile, "_LANGUAGE_FORTRAN=1");
 
   if (flag_openacc)
-cpp_define (pfile, "_OPENACC=201306");
+cpp_define (pfile, "_OPENACC=201510");
 
   if (flag_openmp)
 cpp_define (pfile, "_OPENMP=201511");
diff --git a/gcc/fortran/gfortran.texi b/gcc/fortran/gfortran.texi
index 30934046a49..59a69457fe0 100644
--- a/gcc/fortran/gfortran.texi
+++ b/gcc/fortran/gfortran.texi
@@ -476,9 +476,7 @@ used on real-world programs.  In particular, the supported extensions
 include OpenMP, Cray-style pointers, some old vendor extensions, and several
 Fortran 2003 and Fortran 2008 features, including TR 15581.  However, it is
 still under development and has a few remaining rough edges.
-There also is initial support for OpenACC.
-Note that this is an experimental feature, incomplete, and subject to
-change in future versions of GCC.  See
+There also is support for OpenACC.  See
 @uref{https://gcc.gnu.org/wiki/OpenACC} for more information.
 
 At present, the GNU Fortran compiler passes the
@@ -538,10 +536,8 @@ status} and @ref{Fortran 2018 status

[patch,openacc] Set safelen to INT_MAX for oacc independent pragma

2018-09-20 Thread Cesar Philippidis
This is another old gomp4 OpenACC patch which impacts targets that use
simd vectorization, such as the host and AMD GCN, rather than nvptx.
Basically, as the subject states, it sets safelen to INT_MAX for
independent acc loops, which I believe is already being done for OpenMP
in certain situations.

The original discussion for this patch can be found here
<https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01872.html>.

Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
Linux with nvptx offloading.

Thanks,
Cesar
[OpenACC] Set safelen to INT_MAX for oacc independent pragma

2018-XX-YY  Tom de Vries  
	    Cesar Philippidis  

	gcc/
	* omp-expand.c (expand_omp_for): Set loop->safelen to INT_MAX if
	marked_independent.

(cherry picked from gomp-4_0-branch r226079)
---
 gcc/omp-expand.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index 427f329d35f..ee147f10826 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -5718,6 +5718,7 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
 	{
 	  struct loop *loop = region->cont->loop_father;
 	  loop->marked_independent = true;
+	  loop->safelen = INT_MAX;
 	}
 }
   else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
-- 
2.17.1



[patch,openacc] Propagate independent clause for OpenACC kernels pass

2018-09-20 Thread Cesar Philippidis
This is another old patch that teaches the omp expansion pass how to
propagate the acc loop independent clause to the later stages throughout
compilation. Unfortunately, it didn't include any test cases. I'm not
sure how effective this will be with the existing kernel parloops pass.
But as I noted in my Cauldron talk, we would like to convert acc kernels
regions to acc parallel regions, and this patch could help in that regard.

Chung-Lin, do you have any more state on this patch?

Anyway, I bootstrapped and regtested it for x86_64 Linux with nvptx
offloading and it didn't introduce any regressions. We do have a couple
of other standalone kernels patches in og8, but those depend on other
patches.

Thanks,
Cesar
[OpenACC] Propagate independent clause for OpenACC kernels pass

2018-XX-YY  Chung-Lin Tang 
	Cesar Philippidis  

	gcc/
	* cfgloop.h (struct loop): Add 'bool marked_independent' field.
	* omp-expand.c (struct omp_region): Add 'int kind' and
	'bool independent' fields.
	(expand_omp_for): Set 'marked_independent' field for loop
	corresponding to region.
	(find_omp_for_region_data): New function.
	(build_omp_regions_1): Set kind field.  Call
	find_omp_for_region_data for GIMPLE_OMP_FOR statements.

(cherry picked from gomp-4_0-branch r225759)
---
 gcc/cfgloop.h|  4 
 gcc/omp-expand.c | 46 --
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 80a31c416ca..7928681b514 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -221,6 +221,10 @@ struct GTY ((chain_next ("%h.next"))) loop {
   /* True if the loop is part of an oacc kernels region.  */
   unsigned in_oacc_kernels_region : 1;
 
+  /* True if loop is tagged as having independent iterations by user,
+ e.g. the OpenACC independent clause.  */
+  bool marked_independent;
+
   /* The number of times to unroll the loop.  0 means no information given,
  just do what we always do.  A value of 1 means do not unroll the loop.
  A value of USHRT_MAX means unroll with no specific unrolling factor.
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index 9b03f62e065..427f329d35f 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -107,6 +107,12 @@ struct omp_region
 
   /* True if this is nested inside an OpenACC kernels construct.  */
   bool inside_kernels_p;
+
+  /* Records a generic kind field.  */
+  int kind;
+
+  /* For an OpenACC loop directive, true if has the 'independent' clause.  */
+  bool independent;
 };
 
 static struct omp_region *root_omp_region;
@@ -5705,8 +5711,15 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
 loops_state_set (LOOPS_NEED_FIXUP);
 
   if (region->inside_kernels_p)
-expand_omp_for_generic (region, , BUILT_IN_NONE, BUILT_IN_NONE,
-			inner_stmt);
+{
+  expand_omp_for_generic (region, , BUILT_IN_NONE, BUILT_IN_NONE,
+			  inner_stmt);
+  if (region->independent && region->cont->loop_father)
+	{
+	  struct loop *loop = region->cont->loop_father;
+	  loop->marked_independent = true;
+	}
+}
   else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
 expand_omp_simd (region, );
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
@@ -7887,6 +7900,31 @@ expand_omp (struct omp_region *region)
 }
 }
 
+/* Fill in additional data for a region REGION associated with an
+   OMP_FOR STMT.  */
+
+static void
+find_omp_for_region_data (struct omp_region *region, gomp_for *stmt)
+{
+  region->kind = gimple_omp_for_kind (stmt);
+
+  if (region->kind == GF_OMP_FOR_KIND_OACC_LOOP)
+{
+  struct omp_region *target_region = region->outer;
+  while (target_region
+	 && target_region->type != GIMPLE_OMP_TARGET)
+	target_region = target_region->outer;
+  if (!target_region)
+	return;
+
+  tree clauses = gimple_omp_for_clauses (stmt);
+
+  if (target_region->kind == GF_OMP_TARGET_KIND_OACC_KERNELS
+	  && omp_find_clause (clauses, OMP_CLAUSE_INDEPENDENT))
+	region->independent = true;
+}
+}
+
 /* Helper for build_omp_regions.  Scan the dominator tree starting at
block BB.  PARENT is the region that contains BB.  If SINGLE_TREE is
true, the function ends once a single tree is built (otherwise, whole
@@ -7953,6 +7991,8 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
 		case GF_OMP_TARGET_KIND_OACC_KERNELS:
 		case GF_OMP_TARGET_KIND_OACC_DATA:
 		case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+		  if (is_gimple_omp_oacc (stmt))
+		region->kind = gimple_omp_target_kind (stmt);
 		  break;
 		case GF_OMP_TARGET_KIND_UPDATE:
 		case GF_OMP_TARGET_KIND_ENTER_DATA:
@@ -7974,6 +8014,8 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
 	/* #pragma omp ordered depend is also just a stand-alone
 	   directive.  */
 	region = NULL;
+	  else if (code == GIMPLE_OMP_FOR)
+	find_omp_for_region_data (re

[patch,openacc] Fix PR71959: lto dump of callee counts

2018-09-20 Thread Cesar Philippidis
This is another old gomp4 patch that demotes an ICE in PR71959 to a
linker warning. One problem here is that it is not clear if OpenACC
allows individual member functions in C++ classes to be marked as acc
routines. There's another issue accessing member data inside offloaded
regions. We'll add some support for member data in OpenACC 2.6, but some of
the OpenACC C++ semantics are still unclear.

Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux
with nvptx offloading.

Thanks,
Cesar
[PR71959] lto dump of callee counts

2018-XX-YY  Nathan Sidwell  
	Cesar Philippidis  

	gcc/
	* ipa-inline-analysis.c (inline_write_summary): Only dump callee
	counts when dumping the function's body.

	libgomp/
	* testsuite/libgomp.oacc-c++/pr71959.C: New.
	* testsuite/libgomp.oacc-c++/pr71959-a.C: New.

(cherry picked from gomp-4_0-branch r239788)
---
 gcc/ipa-fnsummary.c   | 18 ---
 .../testsuite/libgomp.oacc-c++/pr71959-a.C| 31 +++
 libgomp/testsuite/libgomp.oacc-c++/pr71959.C  | 31 +++
 3 files changed, 75 insertions(+), 5 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.oacc-c++/pr71959-a.C
 create mode 100644 libgomp/testsuite/libgomp.oacc-c++/pr71959.C

diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
index 62095c6cf6f..e796b085e14 100644
--- a/gcc/ipa-fnsummary.c
+++ b/gcc/ipa-fnsummary.c
@@ -3409,8 +3409,10 @@ ipa_fn_summary_write (void)
 	  int i;
 	  size_time_entry *e;
 	  struct condition *c;
+	  int index = lto_symtab_encoder_encode (encoder, cnode);
+	  bool body = encoder->nodes[index].body;
 
-	  streamer_write_uhwi (ob, lto_symtab_encoder_encode (encoder, cnode));
+	  streamer_write_uhwi (ob, index);
 	  streamer_write_hwi (ob, info->estimated_self_stack_size);
 	  streamer_write_hwi (ob, info->self_size);
 	  info->time.stream_out (ob);
@@ -3453,10 +3455,16 @@ ipa_fn_summary_write (void)
 	info->array_index->stream_out (ob);
 	  else
 	streamer_write_uhwi (ob, 0);
-	  for (edge = cnode->callees; edge; edge = edge->next_callee)
-	write_ipa_call_summary (ob, edge);
-	  for (edge = cnode->indirect_calls; edge; edge = edge->next_callee)
-	write_ipa_call_summary (ob, edge);
+	  if (body)
+	{
+	  /* Only write callee counts when we're emitting the
+		 body, as the reader only knows about the callees when
+		 the body's emitted.  */
+	  for (edge = cnode->callees; edge; edge = edge->next_callee)
+		write_ipa_call_summary (ob, edge);
+	  for (edge = cnode->indirect_calls; edge; edge = edge->next_callee)
+		write_ipa_call_summary (ob, edge);
+	}
 	}
 }
   streamer_write_char_stream (ob->main_stream, 0);
diff --git a/libgomp/testsuite/libgomp.oacc-c++/pr71959-a.C b/libgomp/testsuite/libgomp.oacc-c++/pr71959-a.C
new file mode 100644
index 000..9486512d0e7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/pr71959-a.C
@@ -0,0 +1,31 @@
+// { dg-do compile }
+
+struct Iter 
+{
+  int *cursor;
+
+  void ctor (int *cursor_) asm("_ZN4IterC1EPi");
+  int *point () const asm("_ZNK4Iter5pointEv");
+};
+
+#pragma acc routine
+void  Iter::ctor (int *cursor_)
+{
+  cursor = cursor_;
+}
+
+#pragma acc routine
+int *Iter::point () const
+{
+  return cursor;
+}
+
+void apply (int (*fn)(), Iter out) asm ("_ZN5Apply5applyEPFivE4Iter");
+
+#pragma acc routine
+void apply (int (*fn)(), struct Iter out)
+{ *out.point() = fn (); }
+
+extern "C" void __gxx_personality_v0 ()
+{
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c++/pr71959.C b/libgomp/testsuite/libgomp.oacc-c++/pr71959.C
new file mode 100644
index 000..169bf4aad17
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/pr71959.C
@@ -0,0 +1,31 @@
+// { dg-additional-sources "pr71959-a.C" }
+
+// pr lto/71959 ICEd LTO due to mismatch between writing & reading behaviour
+
+struct Iter
+{
+  int *cursor;
+  
+  Iter(int *cursor_) : cursor(cursor_) {}
+
+  int *point() const { return cursor; }
+};
+
+#pragma acc routine seq
+int one () { return 1; }
+
+struct Apply
+{
+  static void apply (int (*fn)(), Iter out)
+  { *out.point() = fn (); }
+};
+
+int main ()
+{
+  int x;
+  
+#pragma acc parallel copyout(x)
+  Apply::apply (one, Iter ());
+
+  return x != 1;
+}
-- 
2.17.1



[patch,openacc] Fix hang when running oacc exec with CUDA 9.0 nvprof

2018-09-20 Thread Cesar Philippidis
While tuning the performance of nvptx OpenACC offloading earlier this
year, Tom fixed a bug in og7 that prevented Nvidia's nvprof profiling
tool from working with CUDA 9. Tom posted more details on the patch here
<https://gcc.gnu.org/ml/gcc-patches/2018-02/msg01269.html>, which is
still relevant here.

Note that this issue was triggered by the new OpenACC profiling API in
og7, which has not landed in trunk yet. However, it's probably a good
idea to get this patch committed independently from that huge profiling
patch series.

Is this OK for trunk? I bootstrapped and regtested this for x86_64 Linux
with nvptx offloading.

Thanks,
Cesar
[OpenACC] Fix hang when running oacc exec with CUDA 9.0 nvprof

2018-XX-YY  Tom de Vries  
	    Cesar Philippidis  

	libgomp/
	* oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread):
	New variable.
	(acc_init_1): Set acc_init_thread to pthread_self ().  Set
	acc_init_state to initializing at the start, and to initialized at the
	end.
	(self_initializing_p): New function.
	(acc_get_device_type): Return acc_device_none if called by thread that
	is currently executing acc_init_1.

(cherry picked from openacc-gcc-7-branch commit
81904b675f6298a9c26c71391909ce362990a11f, bfc999c)
---
 libgomp/oacc-init.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 8db24b17d29..8842e7218cb 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -40,6 +40,11 @@
 
 static gomp_mutex_t acc_device_lock;
 
+static gomp_mutex_t acc_init_state_lock;
+static enum { uninitialized, initializing, initialized } acc_init_state
+  = uninitialized;
+static pthread_t acc_init_thread;
+
 /* A cached version of the dispatcher for the global "current" accelerator type,
e.g. used as the default when creating new host threads.  This is the
device-type equivalent of goacc_device_num (which specifies which device to
@@ -215,6 +220,11 @@ acc_init_1 (acc_device_t d)
   struct gomp_device_descr *base_dev, *acc_dev;
   int ndevs;
 
+  gomp_mutex_lock (&acc_init_state_lock);
+  acc_init_state = initializing;
+  acc_init_thread = pthread_self ();
+  gomp_mutex_unlock (&acc_init_state_lock);
+
   base_dev = resolve_device (d, true);
 
   ndevs = base_dev->get_num_devices_func ();
@@ -234,6 +244,10 @@ acc_init_1 (acc_device_t d)
   gomp_init_device (acc_dev);
   gomp_mutex_unlock (&acc_dev->lock);
 
+  gomp_mutex_lock (&acc_init_state_lock);
+  acc_init_state = initialized;
+  gomp_mutex_unlock (&acc_init_state_lock);
+
   return base_dev;
 }
 
@@ -528,6 +542,17 @@ acc_set_device_type (acc_device_t d)
 
 ialias (acc_set_device_type)
 
+static bool
+self_initializing_p (void)
+{
+  bool res;
+  gomp_mutex_lock (&acc_init_state_lock);
+  res = (acc_init_state == initializing
+	 && pthread_equal (acc_init_thread, pthread_self ()));
+  gomp_mutex_unlock (&acc_init_state_lock);
+  return res;
+}
+
 acc_device_t
 acc_get_device_type (void)
 {
@@ -537,6 +562,15 @@ acc_get_device_type (void)
 
   if (thr && thr->base_dev)
 res = acc_device_type (thr->base_dev->type);
+  else if (self_initializing_p ())
+/* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during the
+   acc_ev_device_init_start event callback, which is dispatched during
+   acc_init_1.  Trying to lock acc_device_lock during such a call (as we do
+   in the else clause below), will result in deadlock, since the lock has
+   already been taken by the acc_init_1 caller.  We work around this problem
+   by using the acc_get_device_type property "If the device type has not yet
+   been selected, the value acc_device_none may be returned".  */
+;
   else
 {
   gomp_init_targets_once ();
-- 
2.17.1



[patch,openacc] handle missing OMP_LIST_ clauses in fortran's parse tree debugger

2018-09-20 Thread Cesar Philippidis
This patch updates Fortran's parse tree printer to print the names of
new OpenACC data clauses. I'm not sure if this functionality is widely used
or not, but from a standpoint of correctness, this patch would probably
be nice to have.

Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
Linux with nvptx offloading.

Thanks,
Cesar
[OpenACC] handle missing OMP_LIST_ clauses in fortran's parse tree debugger

2018-XX-YY  Cesar Philippidis  

	gcc/fortran/
	* dump-parse-tree.c (show_omp_clauses): Add missing omp list_types
	and reorder the switch cases to match the enum in gfortran.h.

(cherry picked from gomp-4_0-branch r228355, 159518d)
---
 gcc/fortran/dump-parse-tree.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/gcc/fortran/dump-parse-tree.c b/gcc/fortran/dump-parse-tree.c
index 2a28fa30986..f1be5a67a26 100644
--- a/gcc/fortran/dump-parse-tree.c
+++ b/gcc/fortran/dump-parse-tree.c
@@ -1384,21 +1384,26 @@ show_omp_clauses (gfc_omp_clauses *omp_clauses)
 	const char *type = NULL;
 	switch (list_type)
 	  {
-	  case OMP_LIST_USE_DEVICE: type = "USE_DEVICE"; break;
-	  case OMP_LIST_DEVICE_RESIDENT: type = "USE_DEVICE"; break;
-	  case OMP_LIST_CACHE: type = ""; break;
 	  case OMP_LIST_PRIVATE: type = "PRIVATE"; break;
 	  case OMP_LIST_FIRSTPRIVATE: type = "FIRSTPRIVATE"; break;
 	  case OMP_LIST_LASTPRIVATE: type = "LASTPRIVATE"; break;
+	  case OMP_LIST_COPYPRIVATE: type = "COPYPRIVATE"; break;
 	  case OMP_LIST_SHARED: type = "SHARED"; break;
 	  case OMP_LIST_COPYIN: type = "COPYIN"; break;
 	  case OMP_LIST_UNIFORM: type = "UNIFORM"; break;
 	  case OMP_LIST_ALIGNED: type = "ALIGNED"; break;
 	  case OMP_LIST_LINEAR: type = "LINEAR"; break;
+	  case OMP_LIST_DEPEND: type = "DEPEND"; break;
+	  case OMP_LIST_MAP: type = "MAP"; break;
+	  case OMP_LIST_TO: type = "TO"; break;
+	  case OMP_LIST_FROM: type = "FROM"; break;
 	  case OMP_LIST_REDUCTION: type = "REDUCTION"; break;
+	  case OMP_LIST_DEVICE_RESIDENT: type = "DEVICE_RESIDENT"; break;
+	  case OMP_LIST_LINK: type = "LINK"; break;
+	  case OMP_LIST_USE_DEVICE: type = "USE_DEVICE"; break;
+	  case OMP_LIST_CACHE: type = "CACHE"; break;
 	  case OMP_LIST_IS_DEVICE_PTR: type = "IS_DEVICE_PTR"; break;
 	  case OMP_LIST_USE_DEVICE_PTR: type = "USE_DEVICE_PTR"; break;
-	  case OMP_LIST_DEPEND: type = "DEPEND"; break;
 	  default:
 	gcc_unreachable ();
 	  }
-- 
2.17.1



[patch,openacc] Generate sequential loop for OpenACC loop directive inside kernels

2018-09-20 Thread Cesar Philippidis
As Chung-Lin noted here
<https://gcc.gnu.org/ml/gcc-patches/2015-06/msg01079.html>:

  This patch adjusts omp-low.c:expand_omp_for_generic() to expand to a
  "sequential" loop form (without the OMP runtime calls), used for loop
  directives inside OpenACC kernels constructs. Tom mentions that this
  allows the kernels parallelization to work when '#pragma acc loop'
  makes the front-ends create OMP_FOR, which the loop analysis phases
  don't understand.

I bootstrapped and regtested it on x86_64 Linux with nvptx offloading.
Is this patch OK for trunk?

Thanks,
Cesar
[OpenACC] Generate sequential loop for OpenACC loop directive inside kernels

2018-XX-YY  Chung-Lin Tang 
	Cesar Philippidis  

	gcc/
	* omp-expand.c (struct omp_region): Add inside_kernels_p field.
	(expand_omp_for_generic): Adjust to generate a 'sequential' loop
	when GOMP builtin arguments are BUILT_IN_NONE.
	(expand_omp_for): Use expand_omp_for_generic to generate a
	non-parallelized loop for OMP_FORs inside OpenACC kernels regions.
	(expand_omp): Mark inside_kernels_p field true for regions
	nested inside OpenACC kernels constructs.
	gcc/testsuite/
	* c-c++-common/goacc/kernels-loop-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-acc-loop-reduction.c: New test.
	* c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test.

(cherry picked from gomp-4_0-branch r224505, r224837, r228232, r228233,
r231461, and r247958)
---
 gcc/omp-expand.c  | 136 --
 .../goacc/kernels-acc-loop-reduction.c|  23 +++
 .../goacc/kernels-acc-loop-smaller-equal.c|  23 +++
 .../goacc/kernels-loop-2-acc-loop.c   |  18 +++
 .../goacc/kernels-loop-3-acc-loop.c   |  15 ++
 .../goacc/kernels-loop-acc-loop.c |  15 ++
 .../goacc/kernels-loop-n-acc-loop.c   |  15 ++
 7 files changed, 204 insertions(+), 41 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c

diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index d2a77c067c6..9b03f62e065 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -104,6 +104,9 @@ struct omp_region
   /* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has
  a depend clause.  */
   gomp_ordered *ord_stmt;
+
+  /* True if this is nested inside an OpenACC kernels construct.  */
+  bool inside_kernels_p;
 };
 
 static struct omp_region *root_omp_region;
@@ -2509,6 +2512,7 @@ expand_omp_for_generic (struct omp_region *region,
   gassign *assign_stmt;
   bool in_combined_parallel = is_combined_parallel (region);
   bool broken_loop = region->cont == NULL;
+  bool seq_loop = (start_fn == BUILT_IN_NONE || next_fn == BUILT_IN_NONE);
   edge e, ne;
   tree *counts = NULL;
   int i;
@@ -2606,8 +2610,12 @@ expand_omp_for_generic (struct omp_region *region,
   type = TREE_TYPE (fd->loop.v);
   istart0 = create_tmp_var (fd->iter_type, ".istart0");
   iend0 = create_tmp_var (fd->iter_type, ".iend0");
-  TREE_ADDRESSABLE (istart0) = 1;
-  TREE_ADDRESSABLE (iend0) = 1;
+
+  if (!seq_loop)
+{
+  TREE_ADDRESSABLE (istart0) = 1;
+  TREE_ADDRESSABLE (iend0) = 1;
+}
 
   /* See if we need to bias by LLONG_MIN.  */
   if (fd->iter_type == long_long_unsigned_type_node
@@ -2637,7 +2645,25 @@ expand_omp_for_generic (struct omp_region *region,
   gsi_prev ();
 
   tree arr = NULL_TREE;
-  if (in_combined_parallel)
+  if (seq_loop)
+{
+  tree n1 = fold_convert (fd->iter_type, fd->loop.n1);
+  tree n2 = fold_convert (fd->iter_type, fd->loop.n2);
+
+  n1 = force_gimple_operand_gsi_1 (, n1, is_gimple_reg, NULL_TREE, true,
+   GSI_SAME_STMT);
+  n2 = force_gimple_operand_gsi_1 (, n2, is_gimple_reg, NULL_TREE, true,
+   GSI_SAME_STMT);
+
+  assign_stmt = gimple_build_assign (istart0, n1);
+  gsi_insert_before (, assign_stmt, GSI_SAME_STMT);
+
+  assign_stmt = gimple_build_assign (iend0, n2);
+  gsi_insert_before (, assign_stmt, GSI_SAME_STMT);
+
+  t = fold_build2 (NE_EXPR, boolean_type_node, istart0, iend0);
+}
+  else if (in_combined_parallel)
 {
   gcc_assert (fd->ordered == 0);
   /* In a combined parallel loop, emit a call to
@@ -3059,39 +3085,45 @@ expand_omp_for_generic (struct omp_region *region,
 	collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb);
 
   /* Emit code to get the next

[patch,openacc] Fix infinite recursion in OMP clause pretty-printing, default label

2018-09-20 Thread Cesar Philippidis
Apparently, Tom ran into an ICE when we were adding support for new
clauses back in the gomp-4_0-branch days.  This patch shouldn't be
necessary because all of the clauses are fully implemented now, but
it may prevent similar bugs from occurring in the future at least
during development.

Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
Linux with nvptx offloading.

Thanks,
Cesar
Fix infinite recursion in OMP clause pretty-printing, default label

Apparently, Tom ran into an ICE when we were adding support for new
clauses back in the gomp-4_0-branch days.  This patch shouldn't be
necessary because all of the clauses are fully implemented now, but
it may prevent similar bugs from occurring in the future at least
during development.

2018-XX-YY  Tom de Vries  
Cesar Philippidis  

	gcc/
	* tree-pretty-print.c (dump_omp_clause): Fix infinite recursion in
	default label.

(cherry picked from gomp-4_0-branch r228915, 2e4d930)
---
 gcc/tree-pretty-print.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c
index 2c089b11751..031afbb49e4 100644
--- a/gcc/tree-pretty-print.c
+++ b/gcc/tree-pretty-print.c
@@ -1063,8 +1063,7 @@ dump_omp_clause (pretty_printer *pp, tree clause, int spc, dump_flags_t flags)
   break;
 
 default:
-  /* Should never happen.  */
-  dump_generic_node (pp, clause, spc, flags, false);
+  pp_string (pp, "unknown");
   break;
 }
 }
-- 
2.17.1



[patch,openacc] Fix acc_shutdown issue

2018-09-20 Thread Cesar Philippidis
Attached is an old gomp4 patch that allegedly fixes a shutdown runtime
issue involving OpenACC accelerators. Unfortunately, the original patch
didn't include a test case, nor did it generate any regressions in the
libgomp testsuite when I reverted it in og8.

With that said, I like how this patch eliminates the redundant use of
gomp_mutex_lock to unmap variables (because gomp_unmap_vars already
acquires a lock). However, the trade-off is that it does increase
tgt->list_count to num_funcs + num_vars.

Does anyone have any strong opinion on this patch and is it OK for
trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx
offloading and I didn't encounter any regressions.

Thanks,
Cesar
[OpenACC] Fix acc_shutdown issue

2018-XX-YY  James Norris 
	    Cesar Philippidis  

	libgomp/
	* oacc-init.c (acc_shutdown_1): Replace use of gomp_free_memmap with
	gomp_unmap_vars.
	* target.c (gomp_load_image_to_device): Fix initialization.
	(gomp_free_memmap): Remove.

(cherry picked from gomp-4_0-branch r226045)
---
 libgomp/libgomp.h   |  1 -
 libgomp/oacc-init.c |  9 ++---
 libgomp/target.c| 27 +--
 3 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 3a8cc2bd7d6..5c11e97616d 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1003,7 +1003,6 @@ extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
 	  enum gomp_map_vars_kind);
 extern void gomp_unmap_vars (struct target_mem_desc *, bool);
 extern void gomp_init_device (struct gomp_device_descr *);
-extern void gomp_free_memmap (struct splay_tree_s *);
 extern void gomp_unload_device (struct gomp_device_descr *);
 extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);
 
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 8842e7218cb..957bb9f31f9 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -303,9 +303,12 @@ acc_shutdown_1 (acc_device_t d)
 
   if (walk->dev)
 	{
-	  gomp_mutex_lock (>dev->lock);
-	  gomp_free_memmap (>dev->mem_map);
-	  gomp_mutex_unlock (>dev->lock);
+	  while (walk->dev->mem_map.root)
+	{
+	  struct target_mem_desc *tgt = walk->dev->mem_map.root->key.tgt;
+
+	  gomp_unmap_vars (tgt, false);
+	}
 
 	  walk->dev = NULL;
 	  walk->base_dev = NULL;
diff --git a/libgomp/target.c b/libgomp/target.c
index dda041cdbef..9ddc8d6c038 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -1184,14 +1184,17 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
 }
 
   /* Insert host-target address mapping into splay tree.  */
-  struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt));
+  struct target_mem_desc *tgt =
+	  gomp_malloc (sizeof (*tgt)
+		   + sizeof (tgt->list[0])
+		   * (num_funcs + num_vars) * sizeof (*tgt->array));
   tgt->array = gomp_malloc ((num_funcs + num_vars) * sizeof (*tgt->array));
   tgt->refcount = REFCOUNT_INFINITY;
   tgt->tgt_start = 0;
   tgt->tgt_end = 0;
   tgt->to_free = NULL;
   tgt->prev = NULL;
-  tgt->list_count = 0;
+  tgt->list_count = num_funcs + num_vars;
   tgt->device_descr = devicep;
   splay_tree_node array = tgt->array;
 
@@ -1204,6 +1207,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
   k->tgt_offset = target_table[i].start;
   k->refcount = REFCOUNT_INFINITY;
   k->link_key = NULL;
+  tgt->list[i].key = k;
+  tgt->refcount++;
   array->left = NULL;
   array->right = NULL;
   splay_tree_insert (>mem_map, array);
@@ -1236,6 +1241,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
   k->tgt_offset = target_var->start;
   k->refcount = target_size & link_bit ? REFCOUNT_LINK : REFCOUNT_INFINITY;
   k->link_key = NULL;
+  tgt->list[i].key = k;
+  tgt->refcount++;
   array->left = NULL;
   array->right = NULL;
   splay_tree_insert (>mem_map, array);
@@ -1454,22 +1461,6 @@ gomp_unload_device (struct gomp_device_descr *devicep)
 }
 }
 
-/* Free address mapping tables.  MM must be locked on entry, and remains locked
-   on return.  */
-
-attribute_hidden void
-gomp_free_memmap (struct splay_tree_s *mem_map)
-{
-  while (mem_map->root)
-{
-  struct target_mem_desc *tgt = mem_map->root->key.tgt;
-
-  splay_tree_remove (mem_map, _map->root->key);
-  free (tgt->array);
-  free (tgt);
-}
-}
-
 /* Host fallback for GOMP_target{,_ext} routines.  */
 
 static void
-- 
2.17.1



[patch,openacc] Don't mark OpenACC auto loops as independent inside acc parallel regions

2018-09-20 Thread Cesar Philippidis
OpenACC has a concept of loop independence, in which independent loops
may be executed in parallel across gangs, workers and vectors. Inside
acc parallel regions, if a loop isn't explicitly marked seq or auto, it
is predetermined to be independent.

This patch corrects a bug where acc loops marked as auto were being
mistakenly promoted to independent. That's bad because it can generate
bogus results if a dependency exists.

Note that this patch depends on the following patches for
-fnote-info-omp-optimized which is used in a test case.

  * Add user-friendly OpenACC diagnostics regarding detected
parallelism.
https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01652.html

  * Correct the reported line number in fortran combined OpenACC
directives
https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01554.html

  * Correct the reported line number in c++ combined OpenACC directives
https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01552.html

Is this OK for trunk? I bootstrapped and regtested on x86_64 Linux with
nvptx offloading.

Thanks,
Cesar
[OpenACC] Don't mark OpenACC auto loops as independent inside acc parallel regions

2018-XX-YY  Cesar Philippidis  

	gcc/
	* omp-low.c (lower_oacc_head_mark): Don't mark OpenACC auto
	loops as independent inside acc parallel regions.

	gcc/testsuite/
	* c-c++-common/goacc/loop-auto-1.c: Adjust test case to conform to
	the new behavior of the auto clause in OpenACC 2.5.
	* c-c++-common/goacc/loop-auto-2.c: Likewise.
	* gcc.dg/goacc/loop-processing-1.c: Likewise.
	* c-c++-common/goacc/loop-auto-3.c: New test.
	* gfortran.dg/goacc/loop-auto-1.f90: New test.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust test case
	to conform to the new behavior of the auto clause in OpenACC 2.5.

(cherry picked from gomp-4_0-branch r247569, 6d30b542f29)

---
 gcc/omp-low.c |  5 +-
 .../c-c++-common/goacc/loop-auto-1.c  | 50 +--
 .../c-c++-common/goacc/loop-auto-2.c  |  4 +-
 .../c-c++-common/goacc/loop-auto-3.c  | 78 
 .../gcc.dg/goacc/loop-processing-1.c  |  2 +-
 .../gfortran.dg/goacc/loop-auto-1.f90 | 88 +++
 .../libgomp.oacc-c-c++-common/loop-auto-1.c   | 20 ++---
 7 files changed, 207 insertions(+), 40 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/loop-auto-3.c
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/loop-auto-1.f90

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index fdabf67249b..24685fd012c 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -5647,9 +5647,10 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
   tag |= OLF_GANG_STATIC;
 }
 
-  /* In a parallel region, loops are implicitly INDEPENDENT.  */
+  /* In a parallel region, loops without auto and seq clauses are
+ implicitly INDEPENDENT.  */
   omp_context *tgt = enclosing_target_ctx (ctx);
-  if (!tgt || is_oacc_parallel (tgt))
+  if ((!tgt || is_oacc_parallel (tgt)) && !(tag & (OLF_SEQ | OLF_AUTO)))
 tag |= OLF_INDEPENDENT;
 
   if (tag & OLF_TILE)
diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
index 124befc4002..dcad07f11c8 100644
--- a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
@@ -10,7 +10,7 @@ void Foo ()
 #pragma acc loop seq
 	for (int jx = 0; jx < 10; jx++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++) {}
   }
 
@@ -20,7 +20,7 @@ void Foo ()
 #pragma acc loop auto
 	for (int jx = 0; jx < 10; jx++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++)
 	  {
 #pragma acc loop vector
@@ -51,7 +51,7 @@ void Foo ()
 #pragma acc loop vector
 	for (int jx = 0; jx < 10; jx++)
 	  {
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int kx = 0; kx < 10; kx++) {}
 	  }
 
@@ -64,27 +64,27 @@ void Foo ()
 
   }
 
-#pragma acc loop auto
+#pragma acc loop auto independent
 for (int ix = 0; ix < 10; ix++)
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
 	for (int jx = 0; jx < 10; jx++)
 	  {
-#pragma acc loop auto
+#pragma acc loop auto independent
 	for (int kx = 0; kx < 10; kx++) {}
 	  }
   }
 
-#pragma acc loop auto
+#pragma acc loop auto independent
 for (int ix = 0; ix < 10; ix++)
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
 	for (int jx = 0; jx < 10; jx++)
 	  {
-#pragma acc loop auto /* { dg-warning "insufficient partitioning&q

[patch,openacc] Better distinguish OpenACC and OpenMP sections in libgomp.texi

2018-09-20 Thread Cesar Philippidis
This patch updates the libgomp documentation to more clearly identify
OpenMP-specific sections. Specifically, the sections "Runtime Library
Routine" and "Environment Variables" are now prefixed by OpenMP, because
those sections are not applicable to OpenACC.

Is this OK for trunk? I verified that libgomp.pdf looks ok.

Thanks,
Cesar
[OpenACC] Update _OPENACC value and documentation for OpenACC 2.5

2018-XX-YY  Thomas Schwinge 
	    Cesar Philippidis  

	gcc/c-family/
	* c-cppbuiltin.c (c_cpp_builtins): Update "_OPENACC" to "201510".
	gcc/fortran/
	* cpp.c (cpp_define_builtins): Update "_OPENACC" to "201510".
	* gfortran.texi: Update for OpenACC 2.5.
	* Intrinsic.texi: Likewise.
	* invoke.texi: Likewise.
	gcc/testsuite/
	* c-c++-common/cpp/openacc-define-3.c: Update.
	* gfortran.dg/openacc-define-3.f90: Likewise.
	gcc/
	* doc/invoke.texi: Update for OpenACC 2.5.
	libgomp/
	* libgomp.texi: Update for OpenACC 2.5.
	* openacc.f90 (openacc_version): Update to "201510".
	* openacc_lib.h (openacc_version): Likewise.
	* testsuite/libgomp.oacc-fortran/openacc_version-1.f: Update.
	* testsuite/libgomp.oacc-fortran/openacc_version-2.f90: Update.

(cherry picked from gomp-4_0-branch r248057, ccbbcb70569)
---
 gcc/c-family/c-cppbuiltin.c   |  2 +-
 gcc/doc/invoke.texi   |  4 +++-
 gcc/fortran/cpp.c |  2 +-
 gcc/fortran/gfortran.texi | 16 +-
 gcc/fortran/intrinsic.texi|  6 +++---
 gcc/fortran/invoke.texi   |  4 +---
 .../c-c++-common/cpp/openacc-define-3.c   |  2 +-
 .../gfortran.dg/openacc-define-3.f90  |  2 +-
 libgomp/libgomp.texi  | 21 ++-
 libgomp/openacc.f90   |  2 +-
 libgomp/openacc_lib.h |  2 +-
 .../libgomp.oacc-fortran/openacc_version-1.f  |  2 +-
 .../openacc_version-2.f90 |  2 +-
 13 files changed, 31 insertions(+), 36 deletions(-)

diff --git a/gcc/c-family/c-cppbuiltin.c b/gcc/c-family/c-cppbuiltin.c
index 96a6b4dfd2b..f2a273b6ac7 100644
--- a/gcc/c-family/c-cppbuiltin.c
+++ b/gcc/c-family/c-cppbuiltin.c
@@ -1391,7 +1391,7 @@ c_cpp_builtins (cpp_reader *pfile)
 cpp_define (pfile, "__SSP__=1");
 
   if (flag_openacc)
-cpp_define (pfile, "_OPENACC=201306");
+cpp_define (pfile, "_OPENACC=201510");
 
   if (flag_openmp)
 cpp_define (pfile, "_OPENMP=201511");
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 94304c314cf..34d7ff71512 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -2161,10 +2161,12 @@ freestanding and hosted environments.
 Enable handling of OpenACC directives @code{#pragma acc} in C/C++ and
 @code{!$acc} in Fortran.  When @option{-fopenacc} is specified, the
 compiler generates accelerated code according to the OpenACC Application
-Programming Interface v2.0 @w{@uref{https://www.openacc.org}}.  This option
+Programming Interface v2.5 @w{@uref{https://www.openacc.org}}.  This option
 implies @option{-pthread}, and thus is only supported on targets that
 have support for @option{-pthread}.
 
+See @uref{https://gcc.gnu.org/wiki/OpenACC} for more information.
+
 @item -fopenacc-dim=@var{geom}
 @opindex fopenacc-dim
 @cindex OpenACC accelerator programming
diff --git a/gcc/fortran/cpp.c b/gcc/fortran/cpp.c
index 0b3de42e832..14871129ff6 100644
--- a/gcc/fortran/cpp.c
+++ b/gcc/fortran/cpp.c
@@ -165,7 +165,7 @@ cpp_define_builtins (cpp_reader *pfile)
   cpp_define (pfile, "_LANGUAGE_FORTRAN=1");
 
   if (flag_openacc)
-cpp_define (pfile, "_OPENACC=201306");
+cpp_define (pfile, "_OPENACC=201510");
 
   if (flag_openmp)
 cpp_define (pfile, "_OPENMP=201511");
diff --git a/gcc/fortran/gfortran.texi b/gcc/fortran/gfortran.texi
index 30934046a49..59a69457fe0 100644
--- a/gcc/fortran/gfortran.texi
+++ b/gcc/fortran/gfortran.texi
@@ -476,9 +476,7 @@ used on real-world programs.  In particular, the supported extensions
 include OpenMP, Cray-style pointers, some old vendor extensions, and several
 Fortran 2003 and Fortran 2008 features, including TR 15581.  However, it is
 still under development and has a few remaining rough edges.
-There also is initial support for OpenACC.
-Note that this is an experimental feature, incomplete, and subject to
-change in future versions of GCC.  See
+There also is support for OpenACC.  See
 @uref{https://gcc.gnu.org/wiki/OpenACC} for more information.
 
 At present, the GNU Fortran compiler passes the
@@ -538,10 +536,8 @@ status} and @ref{Fortran 2018 status} sections of the documentation.
 Additionally, the GNU Fortran compilers supports the OpenMP specification
 (version 4.0 and most of the features of the 4.5 version,
 @url{http://openmp.org/@/wp/@/openmp-specifications/}).
-There also is initial support for t

Re: [openacc] Teach gfortran to lower OpenACC routine dims

2018-09-20 Thread Cesar Philippidis
On 09/19/2018 03:27 PM, Bernhard Reutner-Fischer wrote:
> On Wed, 5 Sep 2018 12:52:03 -0700
> Cesar Philippidis  wrote:
> 
>> At present, gfortran does not encode the gang, worker or vector
>> parallelism clauses when it creates acc routines dim attribute for
>> subroutines and functions. While support for acc routine is lacking in
>> other areas in gfortran (including modules), this patch is important
>> because it encodes the parallelism attributes using the same function
>> as the C and C++ FEs. This will become important with the forthcoming
>> nvptx vector length extensions, because large vectors are not
>> supported in acc routines yet.
>>
>> Is this OK for trunk? I regtested and bootstrapped for x86_64 with
>> nvptx offloading.
> 
>> diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
>> index 94a7f7eaa50..d48c9351e25 100644
>> --- a/gcc/fortran/openmp.c
>> +++ b/gcc/fortran/openmp.c
>> @@ -2234,34 +2234,45 @@ gfc_match_oacc_cache (void)
>>return MATCH_YES;
>>  }
>>  
>> -/* Determine the loop level for a routine.   */
>> +/* Determine the loop level for a routine.  Returns
>> OACC_FUNCTION_NONE
>> +   if any error is detected.  */
>>  
>> -static int
>> +static oacc_function
>>  gfc_oacc_routine_dims (gfc_omp_clauses *clauses)
>>  {
>>int level = -1;
>> +  oacc_function ret = OACC_FUNCTION_AUTO;
>>  
>>if (clauses)
>>  {
>>unsigned mask = 0;
>>  
>>if (clauses->gang)
>> -level = GOMP_DIM_GANG, mask |= GOMP_DIM_MASK (level);
>> +{
>> +  level = GOMP_DIM_GANG, mask |= GOMP_DIM_MASK (level);
>> +  ret = OACC_FUNCTION_GANG;
>> +}
>>if (clauses->worker)
>> -level = GOMP_DIM_WORKER, mask |= GOMP_DIM_MASK (level);
>> +{
>> +  level = GOMP_DIM_WORKER, mask |= GOMP_DIM_MASK (level);
>> +  ret = OACC_FUNCTION_WORKER;
>> +}
>>if (clauses->vector)
>> -level = GOMP_DIM_VECTOR, mask |= GOMP_DIM_MASK (level);
>> +{
>> +  level = GOMP_DIM_VECTOR, mask |= GOMP_DIM_MASK (level);
>> +  ret = OACC_FUNCTION_VECTOR;
>> +}
>>if (clauses->seq)
>> -level = GOMP_DIM_MAX, mask |= GOMP_DIM_MASK (level);
>> +{
>> +  level = GOMP_DIM_MAX, mask |= GOMP_DIM_MASK (level);
>> +  ret = OACC_FUNCTION_SEQ;
>> +}
>>  
>>if (mask != (mask & -mask))
>> -gfc_error ("Multiple loop axes specified for routine");
>> +ret = OACC_FUNCTION_NONE;
>>  }
>>  
>> -  if (level < 0)
>> -level = GOMP_DIM_MAX;
>> -
>> -  return level;
>> +  return ret;
>>  }
>>  
>>  match
>> @@ -2272,6 +2283,8 @@ gfc_match_oacc_routine (void)
>>match m;
>>gfc_omp_clauses *c = NULL;
>>gfc_oacc_routine_name *n = NULL;
>> +  oacc_function dims = OACC_FUNCTION_NONE;
> 
> Unneeded initialisation of dims.

ACK.

>> +  bool seen_error = false;
>>  
>>old_loc = gfc_current_locus;
>>  
>> @@ -2318,17 +2331,15 @@ gfc_match_oacc_routine (void)
>>  }
>>else
>>  {
>> -  gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C");
>> -  gfc_current_locus = old_loc;
>> -  return MATCH_ERROR;
>> +  gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %L",
>> _loc);
>> +  goto cleanup;
>>  }
>>  
>>if (gfc_match_char (')') != MATCH_YES)
>>  {
>> -  gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C,
>> expecting"
>> - " ')' after NAME");
>> -  gfc_current_locus = old_loc;
>> -  return MATCH_ERROR;
>> +  gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %L,
>> expecting"
>> + " ')' after NAME", _loc);
>> +  goto cleanup;
>>  }
>>  }
>>  
>> @@ -2337,26 +2348,83 @@ gfc_match_oacc_routine (void)
>>!= MATCH_YES))
>>  return MATCH_ERROR;
>>  
>> +  /* Scan for invalid routine geometry.  */
>> +  dims = gfc_oacc_routine_dims (c);
>> +  if (dims == OACC_FUNCTION_NONE)
>> +{
>> +  gfc_error ("Multiple loop axes specified in !$ACC ROUTINE at
>> %L",
>> + _loc);
>> +
>> +  /* Don't abort early, because it's important to let the user
>> + know of any potential duplicate routine directives.  */
>> +  seen_error =

[nvptx] vector length patch series

2018-09-18 Thread Cesar Philippidis
Hi Tom,

Here is a link to our nvptx vector length patches on github:

  https://github.com/cesarjp/gcc/tree/trunk-og8-vl-private

Specifically, the code lives in the trunk-og8-vl-private branch. There
are a couple of outstanding dependency patches:

  * Teach gfortran to lower OpenACC routine dims
https://gcc.gnu.org/ml/gcc-patches/2018-09/msg00368.html
b186c651f37 [openacc] Make GFC default to -1 for OpenACC routine dims

  * Add target hook TARGET_GOACC_ADJUST_PARALLELISM
https://gcc.gnu.org/ml/gcc-patches/2018-09/msg00369.html
49b2039013e [openacc] Add target hook TARGET_GOACC_ADJUST_PARALLELISM

  * Enable firstprivate OpenACC reductions
https://gcc.gnu.org/ml/gcc-patches/2018-09/msg00370.html
1f70cdb7cf0 (HEAD -> trunk-og8-vl-private,
github/trunk-og8-vl-private) [OpenACC] Enable firstprivate OpenACC
reductions

  * Adjust offsets for present data clauses
https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01213.html
8bcda2f1a2b [libgomp, OpenACC] Adjust offsets for present data clauses

Of the patches in trunk-og8-vl-private, the following are just general
refactors and cleanups which do not change any functionality:

7eb378e9b0c [nvptx] Generalize state propagation and synchronization
10aa1f74d5a [nvptx] Use MAX, MIN, ROUND_UP macros
9dfe611f3d8 [nvptx] Use TARGET_SET_CURRENT_FUNCTION
4fbe0e812bd [nvptx] Add axis_dim
fbe43dac79f [nvptx] Add thread count parm to bar.sync
57d3f8c88ff [nvptx] only use one bar.sync barriers in OpenACC offloaded code
f14d0e882eb [nvptx] Fix whitespace in nvptx_single and nvptx_neuter_pars
82d81fffb0f [nvptx] make nvptx state propagation function names more generic
95703737e09 [nvptx] consolidate offloaded function attributes into
struct offload_attrs
8c9e897c36d [nvptx] Rename worker_bcast variables oacc_bcast.
45147e7e3f3 [nvptx] update openacc dim macros
caa641ecfb4 [nvptx] Update insufficient launch message to accommodate
large vectors

The following patches actually implement the new vector length
functionality. Note that trunk doesn't support missing arguments between
colons in -fopenacc-dim like -fopenacc-dim=::64, so I had to remove or
adjust a couple of your test cases from og8.

591973d3c3a [nvptx] use user-defined vectors when possible
fb9cefa5b17 [nvptx] Handle large vector reductions
5154d363d07 [nvptx] Force vl32 if calling vector-partitionable routines
f62e3afcf6a [nvptx, openacc] Don't emit barriers for empty loops
4cc408658fb [PR85246] [nvptx] Fix propagation of branch cond in
vw-neutered code
d97ed5fc580 [nvptx] Simplifly logic in nvptx_single
62f0c5df3dd [nvptx] Enable worker partitioning with warp-sized vector_length
f2cf96b0df3 [nvptx] Handle large vectors in libgomp
eba014c260c [nvptx] Enable large vectors
f31d8b98ca1 [nvptx] Add vector_length 128 testcases

Let me know if you encounter any problems with that github branch.

This branch has recently been rebased against trunk. Further, I
bootstrapped and regtested it on x86_64 Linux target with nvptx
offloading.

Thanks,
Cesar


Re: [PATCH,nvptx] Remove use of CUDA unified memory in libgomp

2018-09-18 Thread Cesar Philippidis
On 08/01/2018 04:12 AM, Tom de Vries wrote:
> On 07/31/2018 05:27 PM, Cesar Philippidis wrote:

>>/* Copy the (device) pointers to arguments to the device (dp and hp might 
>> in
>>   fact have the same value on a unified-memory system).  */
> 
> This comment needs to be updated, right?
> 
>> -  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
>> +  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
>>  mapnum * sizeof (void *));
>>GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
>>   " gangs=%u, workers=%u, vectors=%u\n",
>> -- 2.7.4
>>
> 
> Otherwise OK.

Thanks. I've committed the attached patch to trunk.

Cesar
[nvptx] Remove use of CUDA unified memory in libgomp

2018-09-18  Cesar Philippidis  

	libgomp/
	* plugin/plugin-nvptx.c (struct cuda_map): New.
	(struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
	h_tail with (cuda_map *) map.
	(cuda_map_create): New function.
	(cuda_map_destroy): New function.
	(map_init): Update to use a linked list of cuda_map objects.
	(map_fini): Likewise.
	(map_pop): Likewise.
	(map_push): Likewise.  Return CUdeviceptr instead of void.
	(init_streams_for_device): Remove stale references to ptx_stream
	members.
	(select_stream_for_async): Likewise.
	(nvptx_exec): Update call to map_init.

(cherry picked from gomp-4_0-branch r242614)
---
 libgomp/plugin/plugin-nvptx.c | 170 ++
 1 file changed, 91 insertions(+), 79 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index bae1b05..6492e5f 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -192,20 +192,20 @@ cuda_error (CUresult r)
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
+struct cuda_map
+{
+  CUdeviceptr d;
+  size_t size;
+  bool active;
+  struct cuda_map *next;
+};
+
 struct ptx_stream
 {
   CUstream stream;
   pthread_t host_thread;
   bool multithreaded;
-
-  CUdeviceptr d;
-  void *h;
-  void *h_begin;
-  void *h_end;
-  void *h_next;
-  void *h_prev;
-  void *h_tail;
-
+  struct cuda_map *map;
   struct ptx_stream *next;
 };
 
@@ -217,101 +217,114 @@ struct nvptx_thread
   struct ptx_device *ptx_dev;
 };
 
+static struct cuda_map *
+cuda_map_create (size_t size)
+{
+  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
+
+  assert (map);
+
+  map->next = NULL;
+  map->size = size;
+  map->active = false;
+
+  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
+  assert (map->d);
+
+  return map;
+}
+
+static void
+cuda_map_destroy (struct cuda_map *map)
+{
+  CUDA_CALL_ASSERT (cuMemFree, map->d);
+  free (map);
+}
+
+/* The following map_* routines manage the CUDA device memory that
+   contains the data mapping arguments for cuLaunchKernel.  Each
+   asynchronous PTX stream may have multiple pending kernel
+   invocations, which are launched in a FIFO order.  As such, the map
+   routines maintains a queue of cuLaunchKernel arguments.
+
+   Calls to map_push and map_pop must be guarded by ptx_event_lock.
+   Likewise, calls to map_init and map_fini are guarded by
+   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
+   GOMP_OFFLOAD_fini_device, respectively.  */
+
 static bool
 map_init (struct ptx_stream *s)
 {
   int size = getpagesize ();
 
   assert (s);
-  assert (!s->d);
-  assert (!s->h);
-
-  CUDA_CALL (cuMemAllocHost, >h, size);
-  CUDA_CALL (cuMemHostGetDevicePointer, >d, s->h, 0);
 
-  assert (s->h);
+  s->map = cuda_map_create (size);
 
-  s->h_begin = s->h;
-  s->h_end = s->h_begin + size;
-  s->h_next = s->h_prev = s->h_tail = s->h_begin;
-
-  assert (s->h_next);
-  assert (s->h_end);
   return true;
 }
 
 static bool
 map_fini (struct ptx_stream *s)
 {
-  CUDA_CALL (cuMemFreeHost, s->h);
+  assert (s->map->next == NULL);
+  assert (!s->map->active);
+
+  cuda_map_destroy (s->map);
+
   return true;
 }
 
 static void
 map_pop (struct ptx_stream *s)
 {
-  assert (s != NULL);
-  assert (s->h_next);
-  assert (s->h_prev);
-  assert (s->h_tail);
-
-  s->h_tail = s->h_next;
-
-  if (s->h_tail >= s->h_end)
-s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+  struct cuda_map *next;
 
-  if (s->h_next == s->h_tail)
-s->h_prev = s->h_next;
+  assert (s != NULL);
 
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
+  if (s->map->next == NULL)
+{
+  s->map->active = false;
+  return;
+}
 
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  next = s->map->next;
+  cuda_map_destroy (s->map);
+  s->map = nex

[patch,nvptx] Add atomic_fetch* support for SImode arguments.

2018-09-17 Thread Cesar Philippidis
I've committed this patch, which extends the nvptx atomic_fetch_
pattern to accept SImode arguments regardless of the -misa argument
supplied. Tom had pre-approved this patch a while ago. As the test case
demonstrates, it only works with 32-bit pointers.

While adding the new test case, I noticed that I named atomic-fetch-2.c
incorrectly; there should be an underscore between atomic and fetch.
This patch also fixes that.

I tested this patch using both a standalone nvptx compiler and x86_64
Linux with nvptx offloading.

Cesar
[nvptx] Add atomic_fetch* support for SImode arguments.

2018-09-17  Cesar Philippidis  
	Bernd Schmidt 

	gcc/
	* config/nvptx/nvptx.md (atomic_fetch_): Enable with
	SImode args.

	gcc/testsuite/
	* gcc.target/nvptx/atomic-fetch-2.c: Rename to ...
	* gcc.target/nvptx/atomic_fetch-2.c: ... this.
	* gcc.target/nvptx/atomic_fetch-3.c: New test.

---
 gcc/config/nvptx/nvptx.md |  2 +-
 .../{atomic-fetch-2.c => atomic_fetch-2.c}|  0
 .../gcc.target/nvptx/atomic_fetch-3.c | 24 +++
 3 files changed, 25 insertions(+), 1 deletion(-)
 rename gcc/testsuite/gcc.target/nvptx/{atomic-fetch-2.c => atomic_fetch-2.c} (100%)
 create mode 100644 gcc/testsuite/gcc.target/nvptx/atomic_fetch-3.c

diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index dd6032d021b..ca00b1d8073 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -1449,7 +1449,7 @@
 	  UNSPECV_LOCK))
(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
 	(match_dup 1))]
-  "TARGET_SM35"
+  "mode == SImode || TARGET_SM35"
   "%.\\tatom%A1.b%T0.\\t%0, %1, %2;"
   [(set_attr "atomic" "true")])
 
diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-2.c
similarity index 100%
rename from gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c
rename to gcc/testsuite/gcc.target/nvptx/atomic_fetch-2.c
diff --git a/gcc/testsuite/gcc.target/nvptx/atomic_fetch-3.c b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-3.c
new file mode 100644
index 000..36a83ebba9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-3.c
@@ -0,0 +1,24 @@
+/* Test the nvptx atomic instructions for __atomic_fetch_OP for
+   SImode arguments.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -m32" } */
+
+int
+main()
+{
+  unsigned long a = ~0;
+  unsigned b = 0xa;
+
+  __atomic_fetch_add (&a, b, 0);
+  __atomic_fetch_and (&a, b, 0);
+  __atomic_fetch_or (&a, b, 0);
+  __atomic_fetch_xor (&a, b, 0);
+  
+  return a;
+}
+
+/* { dg-final { scan-assembler "atom.add.u32" } } */
+/* { dg-final { scan-assembler "atom.b32.and" } } */
+/* { dg-final { scan-assembler "atom.b32.or" } } */
+/* { dg-final { scan-assembler "atom.b32.xor" } } */
-- 
2.17.1



Re: [PATCH, OpenACC] C++ reference mapping (PR middle-end/86336)

2018-09-10 Thread Cesar Philippidis
On 09/10/2018 10:37 AM, Jason Merrill wrote:
> On Mon, Sep 10, 2018 at 4:05 AM, Julian Brown  wrote:
>> This patch (by Cesar) changes the way C++ references are mapped in
>> OpenACC regions, fixing an ICE in the non-scalar-data.C testcase.
>>
>> Post-patch, references are mapped like this (from the omplower dump):
>>
>> map(force_present:*x [len: 4]) map(firstprivate ref:x [pointer assign, bias: 
>> 0])
>>
>> Tested with offloading to NVPTX and bootstrapped. OK for trunk?
>>
>> Thanks,
>>
>> Julian
>>
>> ChangeLog
>>
>> 2018-09-09  Cesar Philippidis  
>> Julian Brown  
>>
>> PR middle-end/86336
>>
>> (gimplify_adjust_omp_clauses_1): Update handling of mapping of C++
>> references.
> 
> How is reference handling specified differently between OpenMP and
> OpenACC?  It seems strange for them to differ.

Both OpenACC and OpenMP privatize mapped array pointers on the
accelerator for subarrays in the same way. However, for pointers without
subarrays, OpenMP treats them as zero-length arrays, whereas OpenACC
treats them as ordinary scalars so that the pointer target will not get
remapped on the accelerator (which is odd because there's a deviceptr
clause for that). Scalars in C++ are special, because references must
be treated like an array of length one, for lack of a better terminology.

> In any case, you shouldn't need to check lang_GNU_CXX since we're
> already calling the langhook.

Julian, can you look into this? I'm traveling tomorrow.

Cesar


Re: [PATCH, OpenACC 2.5, libgomp] Add *_async versions of runtime library API functions

2018-09-10 Thread Cesar Philippidis
On 09/10/2018 08:04 AM, Chung-Lin Tang wrote:

>  GOACC_2.0 {
> Index: libgomp/oacc-mem.c
> ===
> --- libgomp/oacc-mem.c(revision 264192)
> +++ libgomp/oacc-mem.c(working copy)
> @@ -153,8 +153,9 @@ acc_free (void *d)
>  gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
>  }
>  
> -void
> -acc_memcpy_to_device (void *d, void *h, size_t s)
> +static void
> +memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
> +   const char *libfnname)

This showed up oddly in the diff, but memcpy_tofrom_device is a new
internal function that's not part of the public API. It's nice that you
were able to merge the to/from functions together. I think this is safe
in terms of backwards compatibility.

>  {
>/* No need to call lazy open here, as the device pointer must have
>   been obtained from a routine that did that.  */
> @@ -164,31 +165,49 @@ acc_free (void *d)
>  
>if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
>  {
> -  memmove (d, h, s);
> +  if (from)
> + memmove (h, d, s);
> +  else
> + memmove (d, h, s);
>return;
>  }
>  
> -  if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s))
> -gomp_fatal ("error in %s", __FUNCTION__);
> +  if (async > acc_async_sync)
> +thr->dev->openacc.async_set_async_func (async);
> +
> +  bool ret = (from
> +   ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s)
> +   : thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
> +
> +  if (async > acc_async_sync)
> +thr->dev->openacc.async_set_async_func (acc_async_sync);
> +
> +  if (!ret)
> +gomp_fatal ("error in %s", libfnname);
>  }
>  
>  void
> -acc_memcpy_from_device (void *h, void *d, size_t s)
> +acc_memcpy_to_device (void *d, void *h, size_t s)
>  {
> -  /* No need to call lazy open here, as the device pointer must have
> - been obtained from a routine that did that.  */
> -  struct goacc_thread *thr = goacc_thread ();
> +  memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__);
> +}
>  
> -  assert (thr && thr->dev);
> +void
> +acc_memcpy_to_device_async (void *d, void *h, size_t s, int async)
> +{
> +  memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__);
> +}
>  
> -  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
> -{
> -  memmove (h, d, s);
> -  return;
> -}
> +void
> +acc_memcpy_from_device (void *h, void *d, size_t s)
> +{
> +  memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__);
> +}
>  
> -  if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s))
> -gomp_fatal ("error in %s", __FUNCTION__);
> +void
> +acc_memcpy_from_device_async (void *h, void *d, size_t s, int async)
> +{
> +  memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__);
>  }
>  
>  /* Return the device pointer that corresponds to host data H.  Or NULL
> @@ -428,7 +447,7 @@ acc_unmap_data (void *h)
>  #define FLAG_COPY (1 << 2)
>  
>  static void *
> -present_create_copy (unsigned f, void *h, size_t s)
> +present_create_copy (unsigned f, void *h, size_t s, int async)

Likewise, this is another internal function, so it shouldn't break anything.

>  {
>void *d;
>splay_tree_key n;
> @@ -490,11 +509,17 @@ static void *
>  
>gomp_mutex_unlock (_dev->lock);
>  
> +  if (async > acc_async_sync)
> + acc_dev->openacc.async_set_async_func (async);
> +
>tgt = gomp_map_vars (acc_dev, mapnum, , NULL, , , 
> true,
>  GOMP_MAP_VARS_OPENACC);
>/* Initialize dynamic refcount.  */
>tgt->list[0].key->dynamic_refcount = 1;
>  
> +  if (async > acc_async_sync)
> + acc_dev->openacc.async_set_async_func (acc_async_sync);
> +
>gomp_mutex_lock (_dev->lock);
>  
>d = tgt->to_free;
> @@ -510,19 +535,32 @@ static void *
>  void *
>  acc_create (void *h, size_t s)
>  {
> -  return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s);
> +  return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, 
> acc_async_sync);
>  }
>  
> +void
> +acc_create_async (void *h, size_t s, int async)
> +{
> +  present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async);
> +}
> +
>  void *
>  acc_copyin (void *h, size_t s)
>  {
> -  return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s);
> +  return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s,
> +   acc_async_sync);
>  }
>  
> +void
> +acc_copyin_async (void *h, size_t s, int async)
> +{
> +  present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async);
> +}
> +
>  void *
>  acc_present_or_create (void *h, size_t s)
>  {
> -  return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s);
> +  return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, 
> acc_async_sync);
>  }
>  
>  /* acc_pcreate is acc_present_or_create by a different name.  */
> @@ -539,7 

Re: [patch,nvptx] Basic -misa support for nvptx

2018-09-05 Thread Cesar Philippidis
On 09/05/2018 07:30 AM, Tom de Vries wrote:
> On 09/05/2018 12:19 AM, Cesar Philippidis wrote:
>> On 09/02/2018 07:57 AM, Cesar Philippidis wrote:
>>> On 09/01/2018 12:04 PM, Tom de Vries wrote:
>>>> On 08/31/2018 04:14 PM, Cesar Philippidis wrote:
>>>
>>>>> Is this patch OK for trunk?
>>>>>
>>>>
>>>> Well, how did you test this (
>>>> https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and
>>>> testing. State the host and target combinations you used to do proper
>>>> testing as described above, and the results of your testing.") ?
>>>
>>> I tested the standalone nvptx compiler. I'll retest with libgomp with
>>> -misa=sm_35. Bootstrapping won't help much here, unfortunately.
>>>>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c
>>>>> @@ -0,0 +1,24 @@
>>>>> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35
>>>>> +   targets.  */
>>>>> +
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-options "-O2 -misa=sm_35" } */
>>>>> +
>>>>> +int
>>>>> +main()
>>>>> +{
>>>>> +  unsigned long long a = ~0;
>>>>> +  unsigned b = 0xa;
>>>>> +
>>>>> +  __atomic_fetch_add (, b, 0);
>>>>> +  __atomic_fetch_and (, b, 0);
>>>>> +  __atomic_fetch_or (, b, 0);
>>>>> +  __atomic_fetch_xor (, b, 0);
>>>>> +  
>>>>> +  return a;
>>>>> +}
>>>>> +
>>>>> +/* { dg-final { scan-assembler "atom.add.u64" } } */
>>>>> +/* { dg-final { scan-assembler "atom.b64.and" } } */
>>>>> +/* { dg-final { scan-assembler "atom.b64.or" } } */
>>>>> +/* { dg-final { scan-assembler "atom.b64.xor" } } */
>>>>> -- 2.17.1
>>>>>
>>>>
>>>> Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not
>>>> testcase does not use this difference, so that needs to be fixed, or for
>>>> bonus points, changed into a scan-assembler testcase).
>>>>
>>>> The documentation uses "op.type", we should fix the compiler to emit
>>>> that consistently. Separate patch that fixes that pre-approved.
>>>
>>> ACK. I think there are a lot of other cases like that in the BE.
>>>
>>>> This is ok (with, as I mentioned above, the SI part split off into a
>>>> separate patch), on the condition that you test libgomp with
>>>> -foffload=-misa=sm_35.
>>
>> Adding -foffload=misa=sm_35 didn't work because the host gcc doesn't
>> support the -misa flag.
> 
> That doesn't make sense to me. For me this works without any problems.
> Have you tried a clean build?

I was incorrectly setting ALWAYS_CFLAGS to use -foffload=-misa=sm_35.
That didn't work on the host. But lappend'ing tagopt did work.

>> When I forced the nvptx BE to set TARGET_SM35 to
>> always be true, I ran into problems with SM_30 code linking against
>> SM_35 code.
> 
> I also cannot reproduce this, works for me.

I found the problem. I wasn't using a clean build. Besides, with the
tagopt change in libgomp, I didn't need to force the -misa=sm_35 flag
everywhere.

>> Therefore, I don't think this patch is ready for trunk yet.
>>> By the way, is -misa really necessary for atomic_fetch_?
>> Looking at the PTX documentation I see
>> <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#changes-in-ptx-isa-version-3-1>:
>>
>> PTX ISA version 3.1 introduces the following new features:
>>
>> * Support for sm_35 target architecture.
>> * Extends atomic and reduction instructions to perform 64-bit {and, or,
>> xor} operations, and 64-bit integer {min, max} operations.
>>
>> Is there a table for which list which GPUs are compatible with which
>> instructions?
> 
> Yes, every instruction has a table in the ptx manual, and there's a "PTX
> ISA Notes" entry.
> 
> For the atom instruction in ptx isa 3.1 manual, we have "PTX ISA Notes":
> ...
> atom.global requires sm_11 or higher.
> atom.shared requires sm_12 or higher.
> 64-bit atom.global.{add,cas,exch} require sm_12 or higher.
> 64-bit atom.shared.{add,cas,exch} require sm_20 or higher.
> 64-bit atom.{and,or.xor,min,max} require sm_35 or higher.
> atom.add.f32 requires sm_20 or higher.
> Use of generic addressing requires sm_20 or higher.
> ...

Thanks!

I'll com

[OpenACC] Enable firstprivate OpenACC reductions

2018-09-05 Thread Cesar Philippidis
This patch teaches the gimplifier how to pass certain OpenACC reduction
variables as firstprivate, and not with an implicit copy directive. This
matches the default behavior for the implicit data mappings of scalar
variables inside OpenACC parallel regions. It should be noted that the
gimplifier will still implicitly map reduction variables on loops
immediately enclosed inside a parallel region, like so

  #pragma acc parallel
  #pragma acc loop reduction(+:sum)

as copy. This change only impacts reduction variables inside nested acc
loops like

  #pragma acc parallel
  #pragma acc loop
  for (...)
  {
#pragma acc loop reduction(+:s2)

Here s2 will be transferred into the accelerator as firstprivate instead
of copy.

Is this OK for trunk? I regtested and bootstrapped for x86_64 with nvptx
offloading.

Cesar
[OpenACC] Enable firstprivate OpenACC reductions

2018-XX-YY  Cesar Philippidis  
	Chung-Lin Tang  

	gcc/
	* gimplify.c (omp_add_variable): Enable firstprivate reduction
	variables.

	gcc/testsuite/
	* c-c++-common/goacc/reduction-8.c: New test.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c: New
	test.


diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index dbd0f0ebd0c..4d954e20788 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -6823,20 +6823,27 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
   else
 splay_tree_insert (ctx->variables, (splay_tree_key)decl, flags);
 
-  /* For reductions clauses in OpenACC loop directives, by default create a
- copy clause on the enclosing parallel construct for carrying back the
- results.  */
+  /* For OpenACC loop directives, when a reduction clause is placed on
+ the outermost acc loop within an acc parallel or kernels
+ construct, it must have an implied copy data mapping. E.g.
+
+   #pragma acc parallel
+	 {
+	   #pragma acc loop reduction (+:sum)
+
+ a copy clause for sum should be added on the enclosing parallel
+ construct for carrying back the results.  */
   if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION))
 {
   struct gimplify_omp_ctx *outer_ctx = ctx->outer_context;
-  while (outer_ctx)
+  if (outer_ctx)
 	{
 	  n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl);
 	  if (n != NULL)
 	{
 	  /* Ignore local variables and explicitly declared clauses.  */
 	  if (n->value & (GOVD_LOCAL | GOVD_EXPLICIT))
-		break;
+		;
 	  else if (outer_ctx->region_type == ORT_ACC_KERNELS)
 		{
 		  /* According to the OpenACC spec, such a reduction variable
@@ -6856,9 +6863,7 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
 	{
 	  splay_tree_insert (outer_ctx->variables, (splay_tree_key)decl,
  GOVD_MAP | GOVD_SEEN);
-	  break;
 	}
-	  outer_ctx = outer_ctx->outer_context;
 	}
 }
 }
diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-8.c b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
new file mode 100644
index 000..8a0283f4ac3
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
@@ -0,0 +1,94 @@
+/* { dg-additional-options "-fdump-tree-gimple" } */
+
+#define n 1000
+
+int
+main(void)
+{
+  int i, j;
+  int result, array[n];
+
+#pragma acc parallel loop reduction (+:result)
+  for (i = 0; i < n; i++)
+result ++;
+
+#pragma acc parallel
+#pragma acc loop reduction (+:result)
+  for (i = 0; i < n; i++)
+result ++;
+
+#pragma acc parallel
+#pragma acc loop
+  for (i = 0; i < n; i++)
+{
+  result = i;
+
+#pragma acc loop reduction(+:result)
+  for (j = 0; j < n; j++)
+	result ++;
+
+  array[i] = result;
+}
+
+#pragma acc parallel
+#pragma acc loop
+  for (i = 0; i < n; i++)
+{
+  result = i;
+
+#pragma acc loop worker vector reduction(+:result)
+  for (j = 0; j < n; j++)
+	result ++;
+
+  array[i] = result;
+}
+
+#pragma acc parallel
+#pragma acc loop // { dg-warning "insufficient partitioning" }
+  for (i = 0; i < n; i++)
+{
+  result = i;
+
+#pragma acc loop gang reduction(+:result)
+  for (j = 0; j < n; j++)
+	result ++;
+
+  array[i] = result;
+}
+
+#pragma acc parallel copy(result)
+#pragma acc loop // { dg-warning "insufficient partitioning" }
+  for (i = 0; i < n; i++)
+{
+  result = i;
+
+#pragma acc loop gang reduction(+:result)
+  for (j = 0; j < n; j++)
+	result ++;
+
+  array[i] = result;
+}
+  
+#pragma acc kernels
+#pragma acc loop
+  for (i = 0; i < n; i++)
+{
+  result = i;
+
+#pragma acc loop reduction(+:result)
+  for (j = 0; j < n; j++)
+	result ++;
+
+  array[i] = result;
+}
+
+  return 0;
+}
+
+/* Check that default copy maps are generated for loop reductions.  */
+/* { dg-final { scan-tree-dump-times "reductio

[patch][OpenACC] Add target hook TARGET_GOACC_ADJUST_PARALLELISM

2018-09-05 Thread Cesar Philippidis
At present, GCC fixes the vector length on all targets. However, that is
an artificial restriction. This patch introduces a new
TARGET_GOACC_ADJUST_PARALLELISM hook that enables the runtime to correct
the default number of acc workers and vectors. Extra care needs to be
taken to ensure that large vectors fit inside workers. The target hook
itself doesn't do anything for the host, but the nvptx BE will make use
of it.

Is this patch OK for trunk? I regtested and bootstrapped for x86_64 with
nvptx offloading.

Thanks,
Cesar
[openacc] Add target hook TARGET_GOACC_ADJUST_PARALLELISM

	gcc/
	* doc/tm.texi.in: Add placeholder for TARGET_GOACC_ADJUST_PARALLELISM.
	* doc/tm.texi: Regenerate.
	* omp-offload.c (oacc_loop_fixed_partitions): Use the adjust_parallelism
	hook to modify this_mask.
	(oacc_loop_auto_partitions): Use the adjust_parallelism hook to modify
	this_mask and loop->mask.
	(default_goacc_adjust_parallelism): New function.
	* target.def (adjust_parallelism): New hook.
	* targhooks.h (default_goacc_adjust_parallelism): Declare.


diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index a40f45ade07..365a7bbec90 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6029,6 +6029,12 @@ This hook should return the maximum size of a particular dimension,
 or zero if unbounded.
 @end deftypefn
 
+@deftypefn {Target Hook} unsigned TARGET_GOACC_ADJUST_PARALLELISM (unsigned @var{this_mask}, unsigned @var{outer_mask})
+This hook allows the accelerator compiler to remove any unused
+parallelism exposed in the current loop @var{THIS_MASK}, and the
+enclosing loop @var{OUTER_MASK}.  It returns an adjusted mask.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_GOACC_FORK_JOIN (gcall *@var{call}, const int *@var{dims}, bool @var{is_fork})
 This hook can be used to convert IFN_GOACC_FORK and IFN_GOACC_JOIN
 function calls to target-specific gimple, or indicate whether they
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 39a214e9b2c..9edd2e7ecaf 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4145,6 +4145,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_GOACC_DIM_LIMIT
 
+@hook TARGET_GOACC_ADJUST_PARALLELISM
+
 @hook TARGET_GOACC_FORK_JOIN
 
 @hook TARGET_GOACC_REDUCTION
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 0abf0283c9e..1659febd2b1 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -1218,6 +1218,13 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
 	}
 }
 
+  /* Ideally, we should be coalescing parallelism here if the
+ hardware supports it.  E.g. Instead of partitioning a loop
+ across worker and vector axes, sometimes the hardware can
+ execute those loops together without resorting to placing
+ extra thread barriers.  */
+  this_mask = targetm.goacc.adjust_parallelism (this_mask, outer_mask);
+
   mask_all |= this_mask;
 
   if (loop->flags & OLF_TILE)
@@ -1302,6 +1309,7 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
 	  this_mask ^= loop->e_mask;
 	}
 
+  this_mask = targetm.goacc.adjust_parallelism (this_mask, outer_mask);
   loop->mask |= this_mask;
 }
 
@@ -1350,6 +1358,8 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
 	}
 
   loop->mask |= this_mask;
+  loop->mask = targetm.goacc.adjust_parallelism (loop->mask, outer_mask);
+
   if (!loop->mask && noisy)
 	warning_at (loop->loc, 0,
 		tiling
@@ -1684,6 +1694,15 @@ default_goacc_dim_limit (int ARG_UNUSED (axis))
 #endif
 }
 
+/* Default adjustment of loop parallelism is not required.  */
+
+unsigned
+default_goacc_adjust_parallelism (unsigned this_mask,
+  unsigned ARG_UNUSED (outer_mask))
+{
+  return this_mask;
+}
+
 namespace {
 
 const pass_data pass_data_oacc_device_lower =
diff --git a/gcc/target.def b/gcc/target.def
index c570f3825a5..401d681fc42 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1678,6 +1678,14 @@ or zero if unbounded.",
 int, (int axis),
 default_goacc_dim_limit)
 
+DEFHOOK
+(adjust_parallelism,
+"This hook allows the accelerator compiler to remove any unused\n\
+parallelism exposed in the current loop @var{THIS_MASK}, and the\n\
+enclosing loop @var{OUTER_MASK}.  It returns an adjusted mask.",
+unsigned, (unsigned this_mask, unsigned outer_mask),
+default_goacc_adjust_parallelism)
+
 DEFHOOK
 (fork_join,
 "This hook can be used to convert IFN_GOACC_FORK and IFN_GOACC_JOIN\n\
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index f92ca5ca997..38e024b13de 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -125,6 +125,7 @@ extern bool default_goacc_validate_dims (tree, int [], int);
 extern int default_goacc_dim_limit (int);
 extern bool default_goacc_fork_join (gcall *, const int [], bool);
 extern void default_goacc_reduction (gcall *);
+extern unsigned default_goacc_adjust_parallelism (unsigned, unsigned);
 
 /* These are here, and not in hooks.[ch], because not all users of
hooks.h include tm.h, and thus we don't 

[openacc] Teach gfortran to lower OpenACC routine dims

2018-09-05 Thread Cesar Philippidis
At present, gfortran does not encode the gang, worker or vector
parallelism clauses when it creates the acc routine dims attribute for
subroutines and functions. While support for acc routine is lacking in
other areas in gfortran (including modules), this patch is important
because it encodes the parallelism attributes using the same function as
the C and C++ FEs. This will become important with the forthcoming nvptx
vector length extensions, because large vectors are not supported in acc
routines yet.

Is this OK for trunk? I regtested and bootstrapped for x86_64 with nvptx
offloading.

Thanks,
Cesar
[openacc] Teach gfortran to lower OpenACC routine dims

	gcc/fortran/
	* gfortran.h (oacc_function): New enum.
	(gfc_oacc_routine_name): Add locus loc field.
	* openmp.c (gfc_oacc_routine_dims): Return oacc_function.
	(gfc_match_oacc_routine): Update routine clause syntax checking.
	Populate oacc_function attribute with dims.
	* trans-decl.c (add_attributes_to_decl): Use oacc_build_routine_dims
	to construct routine dims.

	gcc/testsuite/
	* gfortran.dg/goacc/classify-routine.f95: Adjust test.
	* gfortran.dg/goacc/pr71704.f90: Likewise.
	* gfortran.dg/goacc/routine-6.f90: Likewise.
	* gfortran.dg/goacc/routine-8.f90: Likewise.
	* gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise.

	libgomp/
	* testsuite/libgomp.oacc-fortran/routine-1.f90: Adjust test.
	* testsuite/libgomp.oacc-fortran/routine-2.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-3.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-4.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-5.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-7.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/routine-9.f90: Likewise.
	* libgomp.oacc-fortran/host_data-2.f90: Likewise.
	* libgomp.oacc-fortran/host_data-3.f: Likewise.
	* libgomp.oacc-fortran/host_data-4.f90: Likewise.


diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 04b0024a992..3675f2e8d52 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -316,6 +316,16 @@ enum save_state
 { SAVE_NONE = 0, SAVE_EXPLICIT, SAVE_IMPLICIT
 };
 
+/* Flags to keep track of ACC routine states.  */
+enum oacc_function
+{ OACC_FUNCTION_NONE = 0,
+  OACC_FUNCTION_GANG,
+  OACC_FUNCTION_WORKER,
+  OACC_FUNCTION_VECTOR,
+  OACC_FUNCTION_SEQ,
+  OACC_FUNCTION_AUTO
+};
+
 /* Strings for all symbol attributes.  We use these for dumping the
parse tree, in error messages, and also when reading and writing
modules.  In symbol.c.  */
@@ -1726,6 +1736,7 @@ typedef struct gfc_oacc_routine_name
   struct gfc_symbol *sym;
   struct gfc_omp_clauses *clauses;
   struct gfc_oacc_routine_name *next;
+  locus loc;
 }
 gfc_oacc_routine_name;
 
diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index 94a7f7eaa50..d48c9351e25 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -2234,34 +2234,45 @@ gfc_match_oacc_cache (void)
   return MATCH_YES;
 }
 
-/* Determine the loop level for a routine.   */
+/* Determine the loop level for a routine.  Returns OACC_FUNCTION_NONE
+   if any error is detected.  */
 
-static int
+static oacc_function
 gfc_oacc_routine_dims (gfc_omp_clauses *clauses)
 {
   int level = -1;
+  oacc_function ret = OACC_FUNCTION_AUTO;
 
   if (clauses)
 {
   unsigned mask = 0;
 
   if (clauses->gang)
-	level = GOMP_DIM_GANG, mask |= GOMP_DIM_MASK (level);
+	{
+	  level = GOMP_DIM_GANG, mask |= GOMP_DIM_MASK (level);
+	  ret = OACC_FUNCTION_GANG;
+	}
   if (clauses->worker)
-	level = GOMP_DIM_WORKER, mask |= GOMP_DIM_MASK (level);
+	{
+	  level = GOMP_DIM_WORKER, mask |= GOMP_DIM_MASK (level);
+	  ret = OACC_FUNCTION_WORKER;
+	}
   if (clauses->vector)
-	level = GOMP_DIM_VECTOR, mask |= GOMP_DIM_MASK (level);
+	{
+	  level = GOMP_DIM_VECTOR, mask |= GOMP_DIM_MASK (level);
+	  ret = OACC_FUNCTION_VECTOR;
+	}
   if (clauses->seq)
-	level = GOMP_DIM_MAX, mask |= GOMP_DIM_MASK (level);
+	{
+	  level = GOMP_DIM_MAX, mask |= GOMP_DIM_MASK (level);
+	  ret = OACC_FUNCTION_SEQ;
+	}
 
   if (mask != (mask & -mask))
-	gfc_error ("Multiple loop axes specified for routine");
+	ret = OACC_FUNCTION_NONE;
 }
 
-  if (level < 0)
-level = GOMP_DIM_MAX;
-
-  return level;
+  return ret;
 }
 
 match
@@ -2272,6 +2283,8 @@ gfc_match_oacc_routine (void)
   match m;
   gfc_omp_clauses *c = NULL;
   gfc_oacc_routine_name *n = NULL;
+  oacc_function dims = OACC_FUNCTION_NONE;
+  bool seen_error = false;
 
   old_loc = gfc_current_locus;
 
@@ -2318,17 +2331,15 @@ gfc_match_oacc_routine (void)
 	}
   else
 {
-	  gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C");
-	  gfc_current_locus = old_loc;
-	  return MATCH_ERROR;
+	  gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %L", &old_loc);
+	  goto cleanup;
 	}
 
   if (gfc_match_char (')') != MATCH_YES)
 	{
-	  gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C, expecting"
-		 " ')' after NAME");
-	  gfc_current_locus = old_loc;
-	  return 

Re: [patch,nvptx] Basic -misa support for nvptx

2018-09-04 Thread Cesar Philippidis
On 09/02/2018 07:57 AM, Cesar Philippidis wrote:
> On 09/01/2018 12:04 PM, Tom de Vries wrote:
>> On 08/31/2018 04:14 PM, Cesar Philippidis wrote:
> 
>>> Is this patch OK for trunk?
>>>
>>
>> Well, how did you test this (
>> https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and
>> testing. State the host and target combinations you used to do proper
>> testing as described above, and the results of your testing.") ?
> 
> I tested the standalone nvptx compiler. I'll retest with libgomp with
> -misa=sm_35. Bootstrapping won't help much here, unfortunately.
>>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c
>>> @@ -0,0 +1,24 @@
>>> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35
>>> +   targets.  */
>>> +
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -misa=sm_35" } */
>>> +
>>> +int
>>> +main()
>>> +{
>>> +  unsigned long long a = ~0;
>>> +  unsigned b = 0xa;
>>> +
>>> +  __atomic_fetch_add (&a, b, 0);
>>> +  __atomic_fetch_and (&a, b, 0);
>>> +  __atomic_fetch_or (&a, b, 0);
>>> +  __atomic_fetch_xor (&a, b, 0);
>>> +  
>>> +  return a;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "atom.add.u64" } } */
>>> +/* { dg-final { scan-assembler "atom.b64.and" } } */
>>> +/* { dg-final { scan-assembler "atom.b64.or" } } */
>>> +/* { dg-final { scan-assembler "atom.b64.xor" } } */
>>> -- 2.17.1
>>>
>>
>> Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not
>> testcase does not use this difference, so that needs to be fixed, or for
>> bonus points, changed into a scan-assembler testcase).
>>
>> The documentation uses "op.type", we should fix the compiler to emit
>> that consistently. Separate patch that fixes that pre-approved.
> 
> ACK. I think there are a lot of other cases like that in the BE.
> 
>> This is ok (with, as I mentioned above, the SI part split off into a
>> separate patch), on the condition that you test libgomp with
>> -foffload=-misa=sm_35.

Adding -foffload=-misa=sm_35 didn't work because the host gcc doesn't
support the -misa flag. When I forced the nvptx BE to set TARGET_SM35 to
always be true, I ran into problems with SM_30 code linking against
SM_35 code. Therefore, I don't think this patch is ready for trunk yet.

By the way, is -misa really necessary for atomic_fetch_<logic><mode>?
Looking at the PTX documentation I see
<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#changes-in-ptx-isa-version-3-1>:

PTX ISA version 3.1 introduces the following new features:

* Support for sm_35 target architecture.
* Extends atomic and reduction instructions to perform 64-bit {and, or,
xor} operations, and 64-bit integer {min, max} operations.

Is there a table that lists which GPUs are compatible with which
instructions?

Thanks,
Cesar


Re: [patch,nvptx] Basic -misa support for nvptx

2018-09-02 Thread Cesar Philippidis
On 09/01/2018 12:04 PM, Tom de Vries wrote:
> On 08/31/2018 04:14 PM, Cesar Philippidis wrote:

>> Is this patch OK for trunk?
>>
> 
> Well, how did you test this (
> https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and
> testing. State the host and target combinations you used to do proper
> testing as described above, and the results of your testing.") ?

I tested the standalone nvptx compiler. I'll retest with libgomp with
-misa=sm_35. Bootstrapping won't help much here, unfortunately.
>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c
>> @@ -0,0 +1,24 @@
>> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35
>> +   targets.  */
>> +
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -misa=sm_35" } */
>> +
>> +int
>> +main()
>> +{
>> +  unsigned long long a = ~0;
>> +  unsigned b = 0xa;
>> +
>> +  __atomic_fetch_add (&a, b, 0);
>> +  __atomic_fetch_and (&a, b, 0);
>> +  __atomic_fetch_or (&a, b, 0);
>> +  __atomic_fetch_xor (&a, b, 0);
>> +  
>> +  return a;
>> +}
>> +
>> +/* { dg-final { scan-assembler "atom.add.u64" } } */
>> +/* { dg-final { scan-assembler "atom.b64.and" } } */
>> +/* { dg-final { scan-assembler "atom.b64.or" } } */
>> +/* { dg-final { scan-assembler "atom.b64.xor" } } */
>> -- 2.17.1
>>
> 
> Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not
> testcase does not use this difference, so that needs to be fixed, or for
> bonus points, changed into a scan-assembler testcase).
> 
> The documentation uses "op.type", we should fix the compiler to emit
> that consistently. Separate patch that fixes that pre-approved.

ACK. I think there are a lot of other cases like that in the BE.

> This is ok (with, as I mentioned above, the SI part split off into a
> separate patch), on the condition that you test libgomp with
> -foffload=-misa=sm_35.

Thanks,
Cesar


[patch,nvptx] Basic -misa support for nvptx

2018-08-31 Thread Cesar Philippidis
Attached is an nvptx patch that adds support for a new, albeit rarely
used, compiler option -misa. At present, there are only two valid ISA
arguments, SM_30 and SM_35. Without that flag, GCC will default to
SM_30. The major advantage of using the SM_35 ISA is to enable the use of
PTX atom instructions for __atomic_fetch_{add,and,or,xor} for DI
integers. Without -misa, GCC would use an atomic CAS loop for them. As
an aside, this patch also enables PTX atom instructions for those
aforementioned functions for SI integers.

Is this patch OK for trunk?

Thanks,
Cesar
Basic -misa support for nvptx

2018-XX-YY  Cesar Philippidis  
	Bernd Schmidt  

	gcc/
	* config/nvptx/nvptx-opts.h: New file.
	* config/nvptx/nvptx.c (nvptx_file_start): Print the correct .target.
	* config/nvptx/nvptx.h: Include "nvptx-opts.h".
	(ASM_SPEC): Define.
	(TARGET_SM35): New macro.
	* config/nvptx/nvptx.md (atomic_fetch_<logic><mode>): Enable with the
	correct predicate.
	* config/nvptx/nvptx.opt (ptx_isa, sm_30, sm_35): New enum and its
	values.
	(misa=): New option.
	* doc/invoke.texi (Nvidia PTX Options): Document -misa.

	gcc/testsuite/
	* gcc.target/nvptx/atomic_fetch-1.c: New test.
	* gcc.target/nvptx/atomic_fetch-2.c: New test.


diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h
new file mode 100644
index 000..55d9599917e
--- /dev/null
+++ b/gcc/config/nvptx/nvptx-opts.h
@@ -0,0 +1,30 @@
+/* Definitions for the NVPTX port needed for option handling.
+   Copyright (C) 2015-2018 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NVPTX_OPTS_H
+#define NVPTX_OPTS_H
+
+enum ptx_isa
+{
+  PTX_ISA_SM30,
+  PTX_ISA_SM35
+};
+
+#endif
+
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index c0b0a2ec3ab..9903a273863 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4931,7 +4931,10 @@ nvptx_file_start (void)
 {
   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
   fputs ("\t.version\t3.1\n", asm_out_file);
-  fputs ("\t.target\tsm_30\n", asm_out_file);
+  if (TARGET_SM35)
+fputs ("\t.target\tsm_35\n", asm_out_file);
+  else
+fputs ("\t.target\tsm_30\n", asm_out_file);
   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
   fputs ("// END PREAMBLE\n", asm_out_file);
 }
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index dfa1e9aa859..a2fe8b68b22 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -21,10 +21,16 @@
 #ifndef GCC_NVPTX_H
 #define GCC_NVPTX_H
 
+#ifndef NVPTX_OPTS_H
+#include "config/nvptx/nvptx-opts.h"
+#endif
+
 /* Run-time Target.  */
 
 #define STARTFILE_SPEC "%{mmainkernel:crt0.o}"
 
+#define ASM_SPEC "%{misa=*:-m %*}"
+
 #define TARGET_CPU_CPP_BUILTINS()		\
   do		\
 {		\
@@ -87,6 +93,8 @@
 #define Pmode (TARGET_ABI64 ? DImode : SImode)
 #define STACK_SIZE_MODE Pmode
 
+#define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35)
+
 /* Registers.  Since ptx is a virtual target, we just define a few
hard registers for special purposes and leave pseudos unallocated.
We have to have some available hard registers, to keep gcc setup
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 2988f5dfa91..ca00b1d8073 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -1440,7 +1440,6 @@
 (define_code_iterator any_logic [and ior xor])
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
 
-;; Currently disabled until we add better subtarget support - requires sm_32.
 (define_insn "atomic_fetch_<logic><mode>"
   [(set (match_operand:SDIM 1 "memory_operand" "+m")
 	(unspec_volatile:SDIM
@@ -1450,7 +1449,7 @@
 	  UNSPECV_LOCK))
(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
 	(match_dup 1))]
-  "0"
+  "<MODE>mode == SImode || TARGET_SM35"
   "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;"
   [(set_attr "atomic" "true")])
 
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index 04277d1d98e..8194c0324d6 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -48,3 +48,17 @@ Generate code that can keep local state uniform across all l

Re: [PATCH, OpenACC] (2/2) Fix implicit mapping for array slices on lexically-enclosing data constructs (PR70828)

2018-08-28 Thread Cesar Philippidis
On 08/28/2018 02:32 PM, Julian Brown wrote:
> On Tue, 28 Aug 2018 12:23:22 -0700
> Cesar Philippidis  wrote:

>> This is specific to OpenACC, and needs to be guarded as such.
> 
> Are you sure that condition can be true for OpenMP? I'd assumed not...

My bad, you're correct. OMP doesn't use those GOMP_MAP_FORCE map types
anymore.

Cesar



Re: [PATCH, OpenACC] (2/2) Fix implicit mapping for array slices on lexically-enclosing data constructs (PR70828)

2018-08-28 Thread Cesar Philippidis
On 08/28/2018 12:19 PM, Julian Brown wrote:

> diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
> index f038f4c..86be407 100644
> --- a/gcc/fortran/trans-openmp.c
> +++ b/gcc/fortran/trans-openmp.c
> @@ -1045,9 +1045,13 @@ gfc_omp_finish_clause (tree c, gimple_seq *pre_p)
>  
>tree decl = OMP_CLAUSE_DECL (c);
>  
> -  /* Assumed-size arrays can't be mapped implicitly, they have to be
> - mapped explicitly using array sections.  */
> -  if (TREE_CODE (decl) == PARM_DECL
> +  /* Assumed-size arrays can't be mapped implicitly, they have to be mapped
> + explicitly using array sections.  An exception is if the array is
> + mapped explicitly in an enclosing data construct for OpenACC, in which
> + case we see GOMP_MAP_FORCE_PRESENT here and do not need to raise an
> + error.  */
> +  if (OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_FORCE_PRESENT
> +  && TREE_CODE (decl) == PARM_DECL
>&& GFC_ARRAY_TYPE_P (TREE_TYPE (decl))
>&& GFC_TYPE_ARRAY_AKIND (TREE_TYPE (decl)) == GFC_ARRAY_UNKNOWN
>&& GFC_TYPE_ARRAY_UBOUND (TREE_TYPE (decl),

This is specific to OpenACC, and needs to be guarded as such.

Cesar


Re: [PATCH, OpenACC] Add support for gang local storage allocation in shared memory

2018-08-13 Thread Cesar Philippidis
On 08/13/2018 11:42 AM, Cesar Philippidis wrote:
> On 08/13/2018 09:21 AM, Julian Brown wrote:
> 
>> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c 
>> b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c
>> new file mode 100644
>> index 000..2fa708a
>> --- /dev/null
>> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c
>> @@ -0,0 +1,106 @@
>> +/* { dg-xfail-run-if "gangprivate failure" { openacc_nvidia_accel_selected 
>> } { "-O0" } { "" } } */
> 
> As a quick comment, I like the approach that you've taken with this
> patch, but the og8 patch only applies the gangprivate attribute in the
> c/c++ FE. I'd have to review the notes, but I seem to recall that
> excluding that clause in fortran was deliberate. Chung-Lin, do you
> recall the rationale behind that?

I found this in an old email:

  The older version of fortran that OpenACC supports doesn't have a
  concept of lexically scoped blocks like c/c++, so this isn't relevant
  except for explicit gang private variables.

So in other words, this is safe for fortran. It probably could use a
fortran test, because that functionality wasn't explicitly exercised in
og7/og8.

Cesar


Re: [PATCH, OpenACC] Add support for gang local storage allocation in shared memory

2018-08-13 Thread Cesar Philippidis
On 08/13/2018 09:21 AM, Julian Brown wrote:

> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c 
> b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c
> new file mode 100644
> index 000..2fa708a
> --- /dev/null
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c
> @@ -0,0 +1,106 @@
> +/* { dg-xfail-run-if "gangprivate failure" { openacc_nvidia_accel_selected } 
> { "-O0" } { "" } } */

As a quick comment, I like the approach that you've taken with this
patch, but the og8 patch only applies the gangprivate attribute in the
c/c++ FE. I'd have to review the notes, but I seem to recall that
excluding that clause in fortran was deliberate. Chung-Lin, do you
recall the rationale behind that?

With that aside, is the above xfail still necessary? It seems to xpass
for me on nvptx. However, I see this regression on the host:

FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/loop-gwv-2.c
-DACC_DEVICE_TYPE_host=1 -DACC_MEM_SHARED=1  -O2  execution test

There could be other regressions, but I only tested the new tests
introduced by the patch so far.

Cesar


Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

2018-08-13 Thread Cesar Philippidis
On 08/13/2018 08:08 AM, Tom de Vries wrote:
> On 08/13/2018 04:54 PM, Cesar Philippidis wrote:
>> Going
>> forward, how would you like to proceed with the nvptx BE vector length
>> changes.
> 
> Do you have a branch available on github containing the patch series
> you've submitted?

Yes, https://github.com/cesarjp/gcc/tree/trunk-og8-vl-private

Beware that I'm constantly rebasing that branch to keep my patches up to
date. All of the commit subject lines prefixed with [nvptx] touch the
nvptx BE. The [OpenACC] patches involve either platform-independent
code or libgomp.

Cesar


Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

2018-08-13 Thread Cesar Philippidis
On 08/13/2018 05:04 AM, Tom de Vries wrote:
> On 08/10/2018 08:39 PM, Cesar Philippidis wrote:
>> is that I modified the default value for vectors as follows
>>
>> +int vectors = default_dim_p[GOMP_DIM_VECTOR]
>> +  ? 0 : dims[GOMP_DIM_VECTOR];
>>
>> Technically, trunk only supports warp-sized vectors, but the fallback
>> code is already checking for the presence of vectors as so
>>
>> +if (default_dim_p[GOMP_DIM_VECTOR])
>> +  dims[GOMP_DIM_VECTOR]
>> += MIN (dims[GOMP_DIM_VECTOR],
>> +   (targ_fn->max_threads_per_block / warp_size
>> +* warp_size));
>>
> 
> That code handles the case that the default vector size is bigger than
> the function being launched allows, independent from whether that
> default is calculated by the runtime, or set by GOMP_OPENACC_DIM.
> 
> The GOMP_OPENACC_DIM part is forward compatible, given that currently
> the compiler doesn't allow the runtime to choose the vector length, and
> AFAICT that will remain the same after application of the submitted set
> of vector_length patches.
> 
>> therefore, I had the cuOccupancyMaxPotentialBlockSize code path behave
>> the same.
> 
> They don't behave the same. What you add here is ignoring
> GOMP_OPENACC_DIM[GOMP_DIM_VECTOR], not handling it. That requires a comment.

I meant, same in the sense that it inspects for a pre-defined value of
vector length; not the application of vector length. I should have been
more clear.

> Furthermore, by assigning dims[GOMP_DIM_VECTOR] at the start you break
> the pattern of the code, which:
> - first applies GOMP_OPENACC_DIM
> - then further fills in defaults as required
> - then applies defaults
> I've rewritten this bit to fit the pattern. This result is not pretty,
> but it'll do for now.  Changing the pattern may make things better
> structured, but this is something we can do in a follow up patch, and
> want to do for all dimensions at once, not just for vector, otherwise
> the code will become too convoluted.
> 
> Btw, I've also noticed that we don't handle a too high
> GOMP_OPENACC_DIM[GOMP_DIM_WORKER], I've added a TODO comment for this.

That's why I set vectors to dims[GOMP_DIM_VECTOR] when set. However, I
do agree that this is a task for a follow up patch.

> Committed as attached.

Thank you Tom!

Looking at my patch queue, there's only one more non-vector length
related patch in there - Remove use of CUDA unified memory in libgomp
<https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01970.html>. Going
forward, how would you like to proceed with the nvptx BE vector length
changes?

Cesar


Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

2018-08-10 Thread Cesar Philippidis
On 08/08/2018 08:19 AM, Tom de Vries wrote:
> On Wed, Aug 08, 2018 at 07:09:16AM -0700, Cesar Philippidis wrote:
>> On 08/07/2018 06:52 AM, Cesar Philippidis wrote:

Thanks for the review. This version should address all of the following
remarks. However, one thing to note ...

>> [nvptx] Use CUDA driver API to select default runtime launch geometry
>>
>> 2018-08-YY  Cesar Philippidis  
>>
>>  libgomp/
>>  plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
>>  (cuDriverGetVersion): Declare.
>>  (cuOccupancyMaxPotentialBlockSizeWithFlags): Declare.
>>  plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for
>>  cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize.
>>  (ptx_device): Add driver_version member.
>>  (nvptx_open_device): Initialize it.
>>  (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
>>  default num_gangs and num_workers when the driver supports it.
>> ---
>>  libgomp/plugin/cuda-lib.def   |  2 ++
>>  libgomp/plugin/cuda/cuda.h|  4 
>>  libgomp/plugin/plugin-nvptx.c | 40 +++-
>>  3 files changed, 45 insertions(+), 1 deletion(-)
>>
>> diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
>> index be8e3b3..f2433e1 100644
>> --- a/libgomp/plugin/cuda-lib.def
>> +++ b/libgomp/plugin/cuda-lib.def
>> @@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
>>  CUDA_ONE_CALL (cuCtxDestroy)
>>  CUDA_ONE_CALL (cuCtxGetCurrent)
>>  CUDA_ONE_CALL (cuCtxGetDevice)
>> +CUDA_ONE_CALL (cuDriverGetVersion)
> 
> Don't use cuDriverGetVersion.
> 
>>  CUDA_ONE_CALL (cuCtxPopCurrent)
>>  CUDA_ONE_CALL (cuCtxPushCurrent)
>>  CUDA_ONE_CALL (cuCtxSynchronize)
>> @@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
>>  CUDA_ONE_CALL (cuModuleLoad)
>>  CUDA_ONE_CALL (cuModuleLoadData)
>>  CUDA_ONE_CALL (cuModuleUnload)
>> +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)
> 
> Use CUDA_ONE_CALL_MAYBE_NULL.
> 
>>  CUDA_ONE_CALL (cuStreamCreate)
>>  CUDA_ONE_CALL (cuStreamDestroy)
>>  CUDA_ONE_CALL (cuStreamQuery)
>> diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
>> index 4799825..3a790e6 100644
>> --- a/libgomp/plugin/cuda/cuda.h
>> +++ b/libgomp/plugin/cuda/cuda.h
>> @@ -44,6 +44,7 @@ typedef void *CUevent;
>>  typedef void *CUfunction;
>>  typedef void *CUlinkState;
>>  typedef void *CUmodule;
>> +typedef size_t (*CUoccupancyB2DSize)(int);
>>  typedef void *CUstream;
>>  
>>  typedef enum {
>> @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
>>  CUresult cuDeviceGet (CUdevice *, int);
>>  CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
>>  CUresult cuDeviceGetCount (int *);
>> +CUresult cuDriverGetVersion(int *);
>>  CUresult cuEventCreate (CUevent *, unsigned);
>>  #define cuEventDestroy cuEventDestroy_v2
>>  CUresult cuEventDestroy (CUevent);
>> @@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, 
>> CUmodule, const char *);
>>  CUresult cuModuleLoad (CUmodule *, const char *);
>>  CUresult cuModuleLoadData (CUmodule *, const void *);
>>  CUresult cuModuleUnload (CUmodule);
>> +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
>> +  CUoccupancyB2DSize, size_t, int);
>>  CUresult cuStreamCreate (CUstream *, unsigned);
>>  #define cuStreamDestroy cuStreamDestroy_v2
>>  CUresult cuStreamDestroy (CUstream);
>> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
>> index 825470a..b0ccf0b 100644
>> --- a/libgomp/plugin/plugin-nvptx.c
>> +++ b/libgomp/plugin/plugin-nvptx.c
>> @@ -376,6 +376,7 @@ struct ptx_device
>>int max_threads_per_block;
>>int max_threads_per_multiprocessor;
>>int default_dims[GOMP_DIM_MAX];
>> +  int driver_version;
>>  
>>struct ptx_image_data *images;  /* Images loaded on device.  */
>>pthread_mutex_t image_lock; /* Lock for above list.  */
>> @@ -687,6 +688,7 @@ nvptx_open_device (int n)
>>ptx_dev->ord = n;
>>ptx_dev->dev = dev;
>>ptx_dev->ctx_shared = false;
>> +  ptx_dev->driver_version = 0;
>>  
>>r = CUDA_CALL_NOCHECK (cuCtxGetDevice, _dev);
>>if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
>> @@ -780,6 +782,9 @@ nvptx_open_device (int n)
>>for (int i = 0; i != GOMP_DIM_MAX; i++)
>>  ptx_dev->default_dims[i] = 0;
>>  
>> +  CUDA_CALL_ERET (NULL, cuDriverGetVersion,

Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

2018-08-08 Thread Cesar Philippidis
On 08/07/2018 06:52 AM, Cesar Philippidis wrote:

> I attached an updated version of the CUDA driver patch, although I
> haven't rebased it against your changes yet. It still needs to be tested
> against CUDA 5.5 using the systems/Nvidia's cuda.h. But I wanted to give
> you an update.
> 
> Does this patch look OK, at least after testing completes? I removed the
> tests for CUDA_ONE_CALL_MAYBE_NULL, because the newer CUDA API isn't
> supported in the older drivers.

I've finally finished testing this patch. Aside from a couple of
regressions with CUDA 5.5 in libgomp.oacc-c-c++-common/lib-75.c,
lib-76.c and lib-79.c, the results came back clean.

This patch has been tested the following ways using a K40 GPU:

  * Using GCC's cuda.h with CUDA 9.2 drivers.
  * Using cuda.h from CUDA 5.5 and Nvidia drivers 331.133 (supports CUDA
6.0) and the driver from CUDA 8.0.
  * Using cuda.h from CUDA 8.0.

As mentioned before, because GCC's cuda.h defines CUDA_VERSION as 8000,
there was a conflict with using it against CUDA 5.5, because of the
missing cuLinkAddData_v2 symbol.

Note how the usage of cuOccupancyMaxPotentialBlockSize is guarded by
checking the value of CUDA_VERSION. I don't really like this, but
it's a necessary evil of maintaining backwards compatibility.

Is this patch OK for trunk?

Thanks,
Cesar
[nvptx] Use CUDA driver API to select default runtime launch geometry

2018-08-YY  Cesar Philippidis  

	libgomp/
	plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
	(cuDriverGetVersion): Declare.
	(cuOccupancyMaxPotentialBlockSize): Declare.
	plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for
	cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize.
	(ptx_device): Add driver_version member.
	(nvptx_open_device): Initialize it.
	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
	default num_gangs and num_workers when the driver supports it.
---
 libgomp/plugin/cuda-lib.def   |  2 ++
 libgomp/plugin/cuda/cuda.h|  4 
 libgomp/plugin/plugin-nvptx.c | 40 +++-
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index be8e3b3..f2433e1 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
 CUDA_ONE_CALL (cuCtxDestroy)
 CUDA_ONE_CALL (cuCtxGetCurrent)
 CUDA_ONE_CALL (cuCtxGetDevice)
+CUDA_ONE_CALL (cuDriverGetVersion)
 CUDA_ONE_CALL (cuCtxPopCurrent)
 CUDA_ONE_CALL (cuCtxPushCurrent)
 CUDA_ONE_CALL (cuCtxSynchronize)
@@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
 CUDA_ONE_CALL (cuModuleLoad)
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825..3a790e6 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
 CUresult cuDeviceGet (CUdevice *, int);
 CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
 CUresult cuDeviceGetCount (int *);
+CUresult cuDriverGetVersion(int *);
 CUresult cuEventCreate (CUevent *, unsigned);
 #define cuEventDestroy cuEventDestroy_v2
 CUresult cuEventDestroy (CUevent);
@@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+	  CUoccupancyB2DSize, size_t, int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 825470a..b0ccf0b 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -376,6 +376,7 @@ struct ptx_device
   int max_threads_per_block;
   int max_threads_per_multiprocessor;
   int default_dims[GOMP_DIM_MAX];
+  int driver_version;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock; /* Lock for above list.  */
@@ -687,6 +688,7 @@ nvptx_open_device (int n)
   ptx_dev->ord = n;
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
+  ptx_dev->driver_version = 0;
 
   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
@@ -780,6 +782,9 @@ nvptx_open_device (int n)
   for (int i = 0; i != GOMP_DIM_MAX; i++)
 ptx_dev->default_dims[i] = 0

[PATCH][OpenACC] Update deviceptr handling during gimplification

2018-08-07 Thread Cesar Philippidis
I had previously posted this patch as part of a monster deviceptr patch
here <https://gcc.gnu.org/ml/gcc-patches/2018-06/msg01911.html>. This
patch breaks out the generic gimplifier changes. Essentially, with this
patch, the gimplifier will now transfer deviceptr data clauses using
GOMP_MAP_FORCE_DEVICEPTR.

Is this patch OK for trunk? It bootstrapped / regression tested cleanly
for x86_64 with nvptx offloading.

Thanks,
Cesar
>From b5cf37b795ce78c78f3f434ac6999f7094bd86aa Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Mon, 7 May 2018 08:23:48 -0700
Subject: [PATCH] [OpenACC] Update deviceptr handling

2018-XX-YY  Cesar Philippidis  

	gcc/fortran/
	* trans-openmp.c (gfc_omp_finish_clause): Don't create pointer data
	mappings for deviceptr clauses.
	(gfc_trans_omp_clauses): Likewise.
	gcc/
	* gimplify.c (enum gimplify_omp_var_data): Add GOVD_DEVICEPTR.
	(omp_notice_variable): Add GOVD_DEVICEPTR attribute when appropriate.
	(gimplify_scan_omp_clauses): Likewise.
	(gimplify_adjust_omp_clauses_1): Set GOMP_MAP_FORCE_DEVICEPTR for
	implicit deviceptr mappings.
	gcc/testsuite/
	* c-c++-common/goacc/deviceptr-4.c: Update expected data mapping.

(cherry picked from openacc-gcc-7-branch commit
d3de16b461545aac1925f0d7c2851c8c49a07d06 and commit
f0514fe1899666bb5b8ee52601f5d4263d4c4646)
---
 gcc/fortran/trans-openmp.c |  9 +
 gcc/gimplify.c | 12 +++-
 gcc/testsuite/c-c++-common/goacc/deviceptr-4.c |  2 +-
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index f038f4c..ca31c88 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -1060,6 +1060,8 @@ gfc_omp_finish_clause (tree c, gimple_seq *pre_p)
 }
 
   tree c2 = NULL_TREE, c3 = NULL_TREE, c4 = NULL_TREE;
+  if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FORCE_DEVICEPTR)
+return;
   if (POINTER_TYPE_P (TREE_TYPE (decl)))
 {
   if (!gfc_omp_privatize_by_reference (decl)
@@ -2111,6 +2113,12 @@ gfc_trans_omp_clauses (stmtblock_t *block, gfc_omp_clauses *clauses,
 	  if (n->expr == NULL || n->expr->ref->u.ar.type == AR_FULL)
 		{
 		  if (POINTER_TYPE_P (TREE_TYPE (decl))
+		  && n->u.map_op == OMP_MAP_FORCE_DEVICEPTR)
+		{
+		  OMP_CLAUSE_DECL (node) = decl;
+		  goto finalize_map_clause;
+		}
+		  else if (POINTER_TYPE_P (TREE_TYPE (decl))
 		  && (gfc_omp_privatize_by_reference (decl)
 			  || GFC_DECL_GET_SCALAR_POINTER (decl)
 			  || GFC_DECL_GET_SCALAR_ALLOCATABLE (decl)
@@ -2282,6 +2290,7 @@ gfc_trans_omp_clauses (stmtblock_t *block, gfc_omp_clauses *clauses,
 		  ptr2 = fold_convert (sizetype, ptr2);
 		  OMP_CLAUSE_SIZE (node3)
 		= fold_build2 (MINUS_EXPR, sizetype, ptr, ptr2);
+		finalize_map_clause:;
 		}
 	  switch (n->u.map_op)
 		{
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 4a109ae..bcf862f 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -105,6 +105,9 @@ enum gimplify_omp_var_data
   /* Flag for GOVD_MAP: must be present already.  */
   GOVD_MAP_FORCE_PRESENT = 524288,
 
+  /* Flag for OpenACC deviceptrs.  */
+  GOVD_DEVICEPTR = (1<<21),
+
   GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE
 			   | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR
 			   | GOVD_LOCAL)
@@ -7232,6 +7235,7 @@ omp_notice_variable (struct gimplify_omp_ctx *ctx, tree decl, bool in_code)
 		error ("variable %qE declared in enclosing "
 			   "% region", DECL_NAME (decl));
 		  nflags |= GOVD_MAP;
+		  nflags |= (n2->value & GOVD_DEVICEPTR);
 		  if (octx->region_type == ORT_ACC_DATA
 			  && (n2->value & GOVD_MAP_0LEN_ARRAY))
 			nflags |= GOVD_MAP_0LEN_ARRAY;
@@ -8213,6 +8217,8 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p,
 	  if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_TO
 	  || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_TOFROM)
 	flags |= GOVD_MAP_ALWAYS_TO;
+	  else if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FORCE_DEVICEPTR)
+	flags |= GOVD_DEVICEPTR;
 	  goto do_add;
 
 	case OMP_CLAUSE_DEPEND:
@@ -8828,7 +8834,8 @@ gimplify_adjust_omp_clauses_1 (splay_tree_node n, void *data)
   /* Not all combinations of these GOVD_MAP flags are actually valid.  */
   switch (flags & (GOVD_MAP_TO_ONLY
 		   | GOVD_MAP_FORCE
-		   | GOVD_MAP_FORCE_PRESENT))
+		   | GOVD_MAP_FORCE_PRESENT
+		   | GOVD_DEVICEPTR))
 	{
 	case 0:
 	  kind = GOMP_MAP_TOFROM;
@@ -8845,6 +8852,9 @@ gimplify_adjust_omp_clauses_1 (splay_tree_node n, void *data)
 	case GOVD_MAP_FORCE_PRESENT:
 	  kind = GOMP_MAP_FORCE_PRESENT;
 	  break;
+	case GOVD_DEVICEPTR:
+	  kind = GOMP_MAP_FORCE_DEVICEPTR;
+	  break;
 	default:
 	  gcc_unreachable ();
 	}
diff --git a/gcc/testsuite/c-c++-common/goacc/deviceptr-4.c b/gcc/testsuite/c-c++-common/goacc/deviceptr-4.c
index db1b916..79a5162 100644
--- a/g

[PATCH][OpenACC] Don't error on implicitly private induction variables in gfortran

2018-08-07 Thread Cesar Philippidis
At present, the fortran FE reports an error if the user adds an explicit
private clause to an induction variable used by an acc loop. This patch
teaches the fortran acc block resolver how to cope with "duplicate"
private clauses, so that it doesn't error anymore.

Is this patch OK for trunk? I bootstrapped and regression tested it for
x86_64 with nvptx offloading.

Thanks,
Cesar
>From 576b2a7d5574400f067ec309929b38b324d8c6f6 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Fri, 27 Jan 2017 14:58:16 +
Subject: [PATCH] [OpenACC] Don't error on implicitly private induction
 variables in gfortran

2018-XX-YY  Cesar Philippidis  

	gcc/fortran/
	* openmp.c (gfc_resolve_oacc_blocks): Populate list of private
	variables.

	gcc/testsuite/
	* gfortran.dg/goacc/implicitly-private.f90: New test.

---
 gcc/fortran/openmp.c   |  5 +
 gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90 | 12 
 2 files changed, 17 insertions(+)
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90

diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index b346b51..798c5fa 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -5951,6 +5951,7 @@ void
 gfc_resolve_oacc_blocks (gfc_code *code, gfc_namespace *ns)
 {
   fortran_omp_context ctx;
+  gfc_omp_namelist *n;
 
   resolve_oacc_loop_blocks (code);
 
@@ -5961,6 +5962,10 @@ gfc_resolve_oacc_blocks (gfc_code *code, gfc_namespace *ns)
   ctx.is_openmp = false;
   omp_current_ctx = &ctx;
 
+  if (code->ext.omp_clauses)
+for (n = code->ext.omp_clauses->lists[OMP_LIST_PRIVATE]; n; n = n->next)
+  ctx.private_iterators->add (n->sym);
+
   gfc_resolve_blocks (code->block, ns);
 
   omp_current_ctx = ctx.previous;
diff --git a/gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90 b/gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90
new file mode 100644
index 000..a687d8a
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90
@@ -0,0 +1,12 @@
+! Ensure that implicitly private variables do not clash with those
+! that are explicitly private.
+
+program main
+  implicit none
+
+  integer i
+
+  !$acc parallel loop private(i)
+  do i = 1, 100
+  end do
+end program main
-- 
2.7.4



[PATCH][OpenACC] Add support for firstprivate Fortran allocatable scalars

2018-08-07 Thread Cesar Philippidis
This patch updates the way that lower_omp_target uses firstprivate
pointers in OpenACC offloaded regions. On host side, when preparing
firstprivate data mapping for pointer type objects, not to be confused
with GOMP_MAP_FIRSTPRIVATE_POINTER, the compiler passes the
address of the value being pointed to and not the address of the pointer
itself to the runtime. Correspondingly, on the device side, the compiler
generates code to dereference the remapped pointer once to copy the
data to a local buffer.

While this behavior looks like it would break things, it will not affect
C or C++ data mappings, because those languages transfer pointers via
GOMP_MAP_FIRSTPRIVATE_POINTER. In addition, this will not cause
problems with array types, because the default remapping rules for
OpenACC are to transfer them in via copy. Besides, it really doesn't
make sense to allow arrays to be transferred in via firstprivate
because that would use up a lot of memory on the accelerator.

Is this OK for trunk? I bootstrapped and regtested it for x86_64 with
nvptx offloading.

Thanks,
Cesar
>From b8fb83b36d0f96b12af9a1f5596f31b3c6b72ef0 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Mon, 6 Aug 2018 09:19:28 -0700
Subject: [PATCH] [OpenACC] Add support for firstprivate Fortran allocatable
 scalars

This patch updates the way that lower_omp_target uses firstprivate
pointers in OpenACC offloaded regions. On host side, when preparing
pointer type firstprivate data mapping, not to be confused with
GOMP_MAP_FIRSTPRIVATE_POINTER, the compiler passes the address
of the value being pointed to, not the address of the pointer
itself. Correspondingly, on the device side, the compiler generates code to
dereference the remapped pointer once and copy the data to a local
buffer.

While this behavior looks like it would break things, it will not affect C
or C++ data mappings, because those languages transfer pointers via
GOMP_MAP_FIRSTPRIVATE_POINTER. In addition, this will not cause
problems with array types, because the default remapping rules for
OpenACC is to transfer them in via copy. Besides it really doesn't
make sense to allow arrays to be transferred in via firstprivate
because that would use up a lot of memory on the accelerator.

2018-XX-YY  Cesar Philippidis  

	gcc/
	omp-low.c (lower_omp_target): Update OpenACC handling of
	pointer variables with GOMP_MAP_FIRSTPRIVATE mappings.

	libgomp/
	testsuite/libgomp.oacc-fortran/allocatable-scalar.f90: New
	test.
---
 gcc/omp-low.c  | 18 
 .../libgomp.oacc-fortran/allocatable-scalar.f90| 33 ++
 2 files changed, 46 insertions(+), 5 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 843c66f..47603c4 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -7643,15 +7643,21 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 	if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FIRSTPRIVATE)
 	  {
 		gcc_assert (is_gimple_omp_oacc (ctx->stmt));
-		if (omp_is_reference (new_var)
-		&& TREE_CODE (TREE_TYPE (new_var)) != POINTER_TYPE)
+		if (omp_is_reference (new_var))
 		  {
 		/* Create a local object to hold the instance
 		   value.  */
-		tree type = TREE_TYPE (TREE_TYPE (new_var));
+		tree type = TREE_TYPE (new_var);
+		/* Pointer types are mapped onto the device via a
+		   single level of indirection.  */
+		if (TREE_CODE (type) != POINTER_TYPE)
+		  type = TREE_TYPE (type);
 		const char *id = IDENTIFIER_POINTER (DECL_NAME (new_var));
 		tree inst = create_tmp_var (type, id);
-		gimplify_assign (inst, fold_indirect_ref (x), );
+		if (TREE_CODE (TREE_TYPE (new_var)) == POINTER_TYPE)
+		  gimplify_assign (inst, fold_indirect_ref (x), );
+		else
+		  gimplify_assign (inst, fold_indirect_ref (x), );
 		x = build_fold_addr_expr (inst);
 		  }
 		gimplify_assign (new_var, x, );
@@ -7879,7 +7885,9 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 		else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FIRSTPRIVATE)
 		  {
 		gcc_assert (is_gimple_omp_oacc (ctx->stmt));
-		if (!omp_is_reference (var))
+		/* Handle Fortran allocatable scalars.  */
+		if (!omp_is_reference (var)
+			&& TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE)
 		  {
 			if (is_gimple_reg (var)
 			&& OMP_CLAUSE_FIRSTPRIVATE_IMPLICIT (c))
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90 b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90
new file mode 100644
index 000..be86d14
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90
@@ -0,0 +1,33 @@
+! Test non-declared allocatable scalars in OpenACC data clauses.
+
+! { dg-do run }
+
+program main
+  implicit none
+  integer, parameter :: n = 100
+  integer, allocatable :: a, c
+  integer :: i, b(n)
+
+  al

[PATCH][OpenACC] update gfortran's tile clause error handling

2018-08-07 Thread Cesar Philippidis
This patch updates how the OpenACC tile clause is handled in the Fortran
FE to match its behavior in C/C++. Specifically, the tile clause now
errors on negative integer arguments, instead of emitting a warning.

Is this OK for trunk?

Thanks,
Cesar
>From af39a6d65cfb46397fa62c88521189002fb3d705 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Mon, 3 Oct 2016 13:58:59 +
Subject: [PATCH] [OpenACC] update gfortran's tile clause error handling

2018-XX-YY  Cesar Philippidis  

	gcc/fortran/
	* openmp.c (resolve_positive_int_expr): Promote the warning to an
	error.

	gcc/testsuite/
	* gfortran.dg/goacc/loop-2.f95: Change expected tile clause
	warnings to errors.
	* gfortran.dg/goacc/loop-5.f95: Likewise.
	* gfortran.dg/goacc/sie.f95: Likewise.
	* gfortran.dg/goacc/tile-1.f90: New test.
	* gfortran.dg/goacc/tile-2.f90: New test.

---
 gcc/fortran/openmp.c   |  4 ++--
 gcc/testsuite/gfortran.dg/goacc/loop-2.f95 |  8 +++
 gcc/testsuite/gfortran.dg/goacc/loop-5.f95 | 12 --
 gcc/testsuite/gfortran.dg/goacc/sie.f95| 36 +++---
 gcc/testsuite/gfortran.dg/goacc/tile-1.f90 | 16 ++---
 gcc/testsuite/gfortran.dg/gomp/pr77516.f90 |  2 +-
 6 files changed, 33 insertions(+), 45 deletions(-)

diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index 5c0ae45..b346b51 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -3719,8 +3719,8 @@ resolve_positive_int_expr (gfc_expr *expr, const char *clause)
   if (expr->expr_type == EXPR_CONSTANT
   && expr->ts.type == BT_INTEGER
   && mpz_sgn (expr->value.integer) <= 0)
-gfc_warning (0, "INTEGER expression of %s clause at %L must be positive",
-		 clause, &expr->where);
+gfc_error ("INTEGER expression of %s clause at %L must be positive",
+	   clause, &expr->where);
 }
 
 static void
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-2.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-2.f95
index 0c902b2..d4c6273 100644
--- a/gcc/testsuite/gfortran.dg/goacc/loop-2.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-2.f95
@@ -143,7 +143,7 @@ program test
   DO j = 1,10
   ENDDO
 ENDDO
-!$acc loop tile(-1) ! { dg-warning "must be positive" }
+!$acc loop tile(-1) ! { dg-error "must be positive" }
 do i = 1,10
 enddo
 !$acc loop tile(i) ! { dg-error "constant expression" }
@@ -307,7 +307,7 @@ program test
   DO j = 1,10
   ENDDO
 ENDDO
-!$acc loop tile(-1) ! { dg-warning "must be positive" }
+!$acc loop tile(-1) ! { dg-error "must be positive" }
 do i = 1,10
 enddo
 !$acc loop tile(i) ! { dg-error "constant expression" }
@@ -460,7 +460,7 @@ program test
 DO j = 1,10
 ENDDO
   ENDDO
-  !$acc kernels loop tile(-1) ! { dg-warning "must be positive" }
+  !$acc kernels loop tile(-1) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
   !$acc kernels loop tile(i) ! { dg-error "constant expression" }
@@ -612,7 +612,7 @@ program test
 DO j = 1,10
 ENDDO
   ENDDO
-  !$acc parallel loop tile(-1) ! { dg-warning "must be positive" }
+  !$acc parallel loop tile(-1) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
   !$acc parallel loop tile(i) ! { dg-error "constant expression" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-5.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-5.f95
index d059cf7..fe137d5 100644
--- a/gcc/testsuite/gfortran.dg/goacc/loop-5.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-5.f95
@@ -93,9 +93,6 @@ program test
   DO j = 1,10
   ENDDO
 ENDDO
-!$acc loop tile(-1) ! { dg-warning "must be positive" }
-do i = 1,10
-enddo
 !$acc loop vector tile(*)
 DO i = 1,10
 ENDDO
@@ -129,9 +126,6 @@ program test
   DO j = 1,10
   ENDDO
 ENDDO
-!$acc loop tile(-1) ! { dg-warning "must be positive" }
-do i = 1,10
-enddo
 !$acc loop vector tile(*)
 DO i = 1,10
 ENDDO
@@ -242,9 +236,6 @@ program test
 DO j = 1,10
 ENDDO
   ENDDO
-  !$acc kernels loop tile(-1) ! { dg-warning "must be positive" }
-  do i = 1,10
-  enddo
   !$acc kernels loop vector tile(*)
   DO i = 1,10
   ENDDO
@@ -333,9 +324,6 @@ program test
 DO j = 1,10
 ENDDO
   ENDDO
-  !$acc parallel loop tile(-1) ! { dg-warning "must be positive" }
-  do i = 1,10
-  enddo
   !$acc parallel loop vector tile(*)
   DO i = 1,10
   ENDDO
diff --git a/gcc/testsuite/gfortran.dg/goacc/sie.f95 b/gcc/testsuite/gfortran.dg/goacc/sie.f95
index abfe28b..3abf2c8 100644
--- a/gcc/testsuite/gfortran.dg/goacc/sie.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/sie.f95
@@ -78,10 +78,10 @@ program test
   !$acc parallel num_gangs(i+1)
   !$acc end parallel
 
-  !$acc parallel num_gangs(-1) ! { dg-warning "must be positive" }
+  !$acc parallel num_gangs(-1) ! { dg-error &

[PATCH][OpenACC] cleanup trans-stmt.h

2018-08-07 Thread Cesar Philippidis
This patch removes a stale reference to trans-openacc.c in
gcc/fortran/trans-stmt.h. I'll apply it to trunk as obvious shortly.

Cesar
>From a08fe168c3f3ca4d446915ad26027786cda58394 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Tue, 14 Mar 2017 22:33:00 +
Subject: [PATCH] [OpenACC] cleanup trans-stmt.h

2018-08-07  Cesar Philippidis  

	gcc/fortran/
	* trans-stmt.h: Remove stale reference to trans-openacc.c.

---
 gcc/fortran/trans-stmt.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gcc/fortran/trans-stmt.h b/gcc/fortran/trans-stmt.h
index c798c80..848c7d9 100644
--- a/gcc/fortran/trans-stmt.h
+++ b/gcc/fortran/trans-stmt.h
@@ -70,8 +70,6 @@ tree gfc_trans_deallocate_array (tree);
 /* trans-openmp.c */
 tree gfc_trans_omp_directive (gfc_code *);
 void gfc_trans_omp_declare_simd (gfc_namespace *);
-
-/* trans-openacc.c */
 tree gfc_trans_oacc_directive (gfc_code *);
 tree gfc_trans_oacc_declare (gfc_namespace *);
 
-- 
2.7.4



Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

2018-08-07 Thread Cesar Philippidis
On 08/06/2018 11:08 PM, Tom de Vries wrote:
> On 08/01/2018 12:18 PM, Tom de Vries wrote:
> 
>> I think we need to add and handle:
>> ...
>>   CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
>> ...
>>
> 
> I realized that the patch I posted introducing CUDA_ONE_CALL_MAYBE_NULL
> was incomplete, and needed to use the weak attribute in case of linking
> against a concrete libcuda.so.
> 
> So, I've now committed a patch implementing just CUDA_ONE_CALL_MAYBE_NULL:
> "[libgomp, nvptx] Handle CUDA_ONE_CALL_MAYBE_NULL" @
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00447.html . You can use
> "CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize)" to test for
> existence of the function in the cuda driver API.

Sorry for taking so long getting this patch updated. It's a slow build
and test cycle getting older versions of cuda to play nicely. So far,
I've managed to get CUDA 5.5 partially working with Nvidia driver
331.113 (which supports CUDA 6.0) in the sense that I spotted an error
with the patch; I realized that the cuda.h that ships with libgomp
emulates version CUDA 8.0. That led to problems using cuLinkAddData,
because that function gets remapped to cuLinkAddData_v2 in CUDA 6.5 and
newer.

That leads me to a question, do we really want to support older versions
of CUDA without using the system's CUDA header files?

>> The patch doesn't build in a setup with
>> --enable-offload-targets=nvptx-none and without cuda, that enables usage
>> of plugin/cuda/cuda.h:
>> ...
>> /data/offload-nvptx/src/libgomp/plugin/plugin-nvptx.c:98:16: error:
>> ‘cuOccupancyMaxPotentialBlockSize’ undeclared here (not in a function);
>> did you mean ‘cuOccupancyMaxPotentialBlockSizeWithFlags’?
>>  CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \
>> ...
>>
> 
> I've committed a patch "[libgomp, nvptx, --without-cuda-driver] Don't
> use system cuda driver" @
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00348.html .
> 
> Using --without-cuda-driver should make it easy to build using the
> dlopen interface without having to de-install the system libcuda.so.

I attached an updated version of the CUDA driver patch, although I
haven't rebased it against your changes yet. It still needs to be tested
against CUDA 5.5 using the systems/Nvidia's cuda.h. But I wanted to give
you an update.

Does this patch look OK, at least after testing completes? I removed the
tests for CUDA_ONE_CALL_MAYBE_NULL, because the newer CUDA API isn't
supported in the older drivers.

Cesar

>From 7fc093da173543b43e1d83dd5fb9e00e2b92eb09 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Thu, 26 Jul 2018 11:47:35 -0700
Subject: [PATCH] [nvptx] Use CUDA driver API to select default runtime launch
 geometry

	libgomp/
	plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
	(cuDriverGetVersion): Declare.
	(cuOccupancyMaxPotentialBlockSizeWithFlags): Declare.
	plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for
	cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize.
	(ptx_device): Add driver_version member.
	(nvptx_open_device): Initialize it.
	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
	default num_gangs and num_workers when the driver supports it.
---
 libgomp/plugin/cuda-lib.def   |  2 ++
 libgomp/plugin/cuda/cuda.h|  4 
 libgomp/plugin/plugin-nvptx.c | 41 +--
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index be8e3b3ec4d..f2433e1f0a9 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
 CUDA_ONE_CALL (cuCtxDestroy)
 CUDA_ONE_CALL (cuCtxGetCurrent)
 CUDA_ONE_CALL (cuCtxGetDevice)
+CUDA_ONE_CALL (cuDriverGetVersion)
 CUDA_ONE_CALL (cuCtxPopCurrent)
 CUDA_ONE_CALL (cuCtxPushCurrent)
 CUDA_ONE_CALL (cuCtxSynchronize)
@@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
 CUDA_ONE_CALL (cuModuleLoad)
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..3a790e688e0 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
 CUresult cuDeviceGet (CUdevice *, int);
 CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
 CUresult cuDeviceGetCount (int *);
+CUresult cuDriverGetVersion(int *);
 CUresult cuEventCreate (CUevent *, unsigned);
 #def

Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

2018-08-03 Thread Cesar Philippidis
On 08/03/2018 08:22 AM, Tom de Vries wrote:
> On 08/01/2018 09:11 PM, Cesar Philippidis wrote:
>> On 08/01/2018 07:12 AM, Tom de Vries wrote:
>>
>>>>>> +  gangs = grids * (blocks / warp_size);
>>>>>
>>>>> So, we launch with gangs == grids * workers ? Is that intentional?
>>>>
>>>> Yes. At least that's what I've been using in og8. Setting num_gangs =
>>>> grids alone caused significant slow downs.
>>>>
>>>
>>> Well, what you're saying here is: increasing num_gangs increases
>>> performance.
>>>
>>> You don't explain why you multiply with workers specifically.
>>
>> I set it that way because I think the occupancy calculator is
>> determining the occupancy of a single multiprocessor unit, rather than
>> the entire GPU. Looking at the og8 code again, I had
>>
>>num_gangs = 2 * threads_per_sm / warp_size * dev_size
>>
>> which corresponds to
>>
>>2 * grids * blocks / warp_size
>>
> 
> I've done an experiment using the sample simpleOccupancy. The kernel is
> small, so the blocks returned is the maximum: max_threads_per_block (1024).
> 
> The grids returned is 10, which I tentatively interpret as num_dev *
> (max_threads_per_multi_processor / blocks). [ Where num_dev == 5, and
> max_threads_per_multi_processor == 2048. ]
> 
> Substituting that into the og8 code, and equating
> max_threads_per_multi_processor with threads_per_sm, I indeed get
> 
> num_gangs = 2 * grids * blocks / warp_size.
> 
> So with this extra information I see how you got there.
> 
> But I still see no rationale why blocks is used here, and I wonder
> whether something like num_gangs = grids * 64 would give similar results.

My original intent was to keep the load proportional to the block size.
So, in the case where a block size is limited by shared-memory or the
register file capacity, the runtime wouldn't excessively over assign
gangs to the multiprocessor units if their state is going to be swapped
out even more than necessary.

With that said, I could be wrong here. It would be nice if Nvidia
provided us with more insights into their hardware.

> Anyway, given that this is what is used on og8, I'm ok with using that,
> so let's go with:
> ...
> gangs = 2 * grids * (blocks / warp_size);
> ...
> [ so, including the factor two you explicitly left out from the original
> patch. Unless you see a pressing reason not to include it. ]
> 
> Can you repost after retesting? [ note: the updated patch I posted
> earlier doesn't apply on trunk anymore due to the cuda-lib.def change. ]

Thanks for looking into this. I got bogged down tracking a problem with
allocatable scalars in fortran. I'll repost this patch after I've
tested it with an older version of CUDA (probably CUDA 5.5 using the
Nvidia driver 331.113 on a K40).

Cesar


Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

2018-08-01 Thread Cesar Philippidis
On 08/01/2018 07:12 AM, Tom de Vries wrote:

 +gangs = grids * (blocks / warp_size);
>>>
>>> So, we launch with gangs == grids * workers ? Is that intentional?
>>
>> Yes. At least that's what I've been using in og8. Setting num_gangs =
>> grids alone caused significant slow downs.
>>
> 
> Well, what you're saying here is: increasing num_gangs increases
> performance.
> 
> You don't explain why you multiply with workers specifically.

I set it that way because I think the occupancy calculator is
determining the occupancy of a single multiprocessor unit, rather than
the entire GPU. Looking at the og8 code again, I had

   num_gangs = 2 * threads_per_sm / warp_size * dev_size

which corresponds to

   2 * grids * blocks / warp_size

Because blocks is generally smaller than threads_per_block, the driver
occupancy calculator ends up launching fewer gangs.

I don't have a firm position with this default behavior. Perhaps we
should just set

  gang = grids

That's probably an improvement over what's there now.

Cesar


Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

2018-08-01 Thread Cesar Philippidis
On 08/01/2018 03:18 AM, Tom de Vries wrote:
> On 07/31/2018 04:58 PM, Cesar Philippidis wrote:
>> The attached patch teaches libgomp how to use the CUDA thread occupancy
>> calculator built into the CUDA driver. Despite both being based off the
>> CUDA thread occupancy spreadsheet distributed with CUDA, the built in
>> occupancy calculator differs from the occupancy calculator in og8 in two
>> key ways. First, og8 launches twice the number of gangs as the driver
>> thread occupancy calculator. This was my attempt at preventing threads
>> from idling, and it operates on a principle similar to running 'make
>> -jN', where N is twice the number of CPU threads.
> 
> You're saying the two methods are different, and that the difference
> between the two methods is a factor two, which is a heuristic you added
> yourself on top of one of the methods, which implies that in fact the
> two methods are identical. Is my understanding correct here?

With the exception being that og8 multiplies num_gangs by a factor of
two, those two algorithms are identical, at least with respect to gangs.

>> Second, whereas og8
>> always attempts to maximize the CUDA block size, the driver may select a
>> smaller block, which effectively decreases num_workers.
>>
> 
> So, do I understand it correctly that using the function
> cuOccupancyMaxPotentialBlockSize gives us "minimum block size that can
> achieve the maximum occupancy" or some such and og8 gives us "maximum
> block size"?

Correct.

>> In terms of performance, there really isn't that much of a difference
>> between the CUDA driver's occupancy calculator and og8's. However, on
>> the tests that are impacted, they are generally within a factor of two
>> from one another, with some tests running faster with the driver
>> occupancy calculator and others with og8's.
>>
> 
> Ack. Well, until we understand that in more detail, going with the
> driver's occupancy calculator seems the right thing to do.
> 
>> Unfortunately, support for the CUDA driver API isn't universal; it's
>> only available in CUDA version 6.5 (or 6050) and newer. In this patch,
>> I'm exploiting the fact that init_cuda_lib only checks for errors on the
>> last library function initialized.
> 
> That sounds incorrect to me. In init_cuda_lib I see:
> ...
> # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
> # define CUDA_ONE_CALL_1(call) \
>   cuda_lib.call = dlsym (h, #call); \
>   if (cuda_lib.call == NULL)\
> return false;
>   CUDA_CALLS
> ...
> so in fact every library function is checked. Have you tested this with
> pre 6-5 cuda?

I misread that. You're correct. So far, I've only tested this out with
CUDA 9.

> I think we need to add and handle:
> ...
>   CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
> ...
> 
>> Therefore it guards the usage of
>>
>>   cuOccupancyMaxPotentialBlockSizeWithFlags
>>
>> by checking driver_version.
> 
> If we allow the cuOccupancyMaxPotentialBlockSize field to be NULL, we
> can test for NULL, which seems a simpler solution than testing the version.
> 
>> If the driver occupancy calculator isn't
>> available, it falls back to the existing defaults. Maybe the og8 thread
>> occupancy would make a better default for older versions of CUDA, but
>> that's a patch for another day.
>>
> 
> Agreed.
> 
>> Is this patch OK for trunk?
> 
> The patch doesn't build in a setup with
> --enable-offload-targets=nvptx-none and without cuda, that enables usage
> of plugin/cuda/cuda.h:
> ...
> /data/offload-nvptx/src/libgomp/plugin/plugin-nvptx.c:98:16: error:
> ‘cuOccupancyMaxPotentialBlockSize’ undeclared here (not in a function);
> did you mean ‘cuOccupancyMaxPotentialBlockSizeWithFlags’?
>  CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \
> ...
> 
>> @@ -1220,11 +1227,39 @@ nvptx_exec (void (*fn), size_t mapnum, void 
>> **hostaddrs, void **devaddrs,
>>  
>>{
>>  bool default_dim_p[GOMP_DIM_MAX];
>> +int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
>> +int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
>> +int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
>> +
>> +/* The CUDA driver occupancy calculator is only available on
>> +   CUDA version 6.5 (6050) and newer.  */
>> +if (nvthd->ptx_dev->driver_version > 6050)
>> +  {
>> +int grids, blocks;
>> +CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, ,
>> +  , function, NULL, 0,
>> +  

Re: [PATCH,nvptx] Remove use of 'struct map' from plugin (nvptx)

2018-08-01 Thread Cesar Philippidis
On 08/01/2018 04:01 AM, Tom de Vries wrote:
> On 07/31/2018 05:12 PM, Cesar Philippidis wrote:
>> This is an old patch which removes the struct map from the nvptx plugin.
>> I believe at one point this was supposed to be used to manage async data
>> mappings, but in practice that never worked out.
> 
> I don't quite understand what rationale you're trying to present here.
> 
> Is this dead code?

It's dead code.

Cesar


[og8] More goacc_parlevel enhancements

2018-07-31 Thread Cesar Philippidis
I've committed this patch which contains all of the remaining
goacc_parlevel bug fixes present in trunk to og8.

The goal of the goacc parlevel changes is to replace the use of inline ptx
code with builtin functions so that certain OpenACC execution tests
that exercise the execution model can be target independent. For the
most part, these patches applied cleanly to og8, however, as I noted in
PR86757, there were a couple of og8-specific regressions involving tests
that started to fail when built -O0. I believe that problem is caused by
the ganglocal memory changes.

Chung-Lin, we'll need to fix PR86757 before we push the gangprivate
changes upstream.

Julian, I'm not sure if the GCN port supports gangprivate memory. If it
does, you might be hit by this failure at -O0. But those tests have
already been xfailed, so you should be OK.

Cesar
[og8] More goacc_parlevel enhancements

2018-07-31  Cesar Philippidis  

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust test.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-g-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise.

	Backport from mainline:
	2018-05-02  Tom de Vries  

	PR libgomp/85411
	libgomp/
	* plugin/plugin-nvptx.c (nvptx_exec): Move parsing of
	GOMP_OPENACC_DIM ...
	* env.c (parse_gomp_openacc_dim): ... here.  New function.
	(initialize_env): Call parse_gomp_openacc_dim.
	(goacc_default_dims): Define.
	* libgomp.h (goacc_default_dims): Declare.
	* oacc-plugin.c (GOMP_PLUGIN_acc_default_dim): New function.
	* oacc-plugin.h (GOMP_PLUGIN_acc_default_dim): Declare.
	* libgomp.map: New version "GOMP_PLUGIN_1.2". Add
	GOMP_PLUGIN_acc_default_dim.
	* testsuite/libgomp.oacc-c-c++-common/loop-default-runtime.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/loop-default.h: New test.

	2018-05-04  Tom de Vries  
	PR libgomp/85639
	gcc/
	* builtins.c (expand_builtin_goacc_parlevel_id_size): Handle null target
	if ignore == 0.

	2018-05-07  Tom de Vries  
	PR testsuite/85677
	libgomp/
	* testsuite/lib/libgomp.exp (libgomp_init): Move inclusion of top-level
	include directory in ALWAYS_CFLAGS out of $blddir != "" condition.

[openacc] Move GOMP_OPENACC_DIM parsing out of nvptx plugin

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@259852
138bc75d-0d04-0410-961f-82ee72b054a4

[expand] Handle null target in expand_builtin_goacc_parlevel_id_size

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@259927
138bc75d-0d04-0410-961f-82ee72b054a4

[openacc, testsuite] Allow installed testing of libgomp to find gomp-constants.h

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@259992
138bc75d-0d04-0410-961f-82ee72b054a4

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 300e13c..0097d5b 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -6682,6 +6682,9 @@ expand_builtin_goacc_parlevel_id_size (tree exp, rtx target, int ignore)
   if (ignore)
 return target;
 
+  if (target == NULL_RTX)
+target = gen_reg_rtx (TYPE_MODE (TREE_TYPE (exp)));
+
   if (!targetm.have_oacc_dim_size ())
 {
   emit_move_insn (target, fallback_retval);
diff --git a/libgomp/env.c b/libgomp/env.c
index c99ba85..fab35b7 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -90,6 +90,7 @@ int gomp_debug_var;
 unsigned int gomp_num_teams_var;
 char *goacc_device_type;
 int goacc_device_num;
+int goacc_default_dims[GOMP_DIM_MAX];
 
 #ifndef LIBGOMP_OFFLOADED_ONLY
 
@@ -1066,6 +1067,36 @@ parse_acc_device_type (void)
 }
 
 static void
+parse_gomp_openacc_dim (void)
+{
+  /* The syntax is the same as for the -fopenacc-dim compilation option.  */
+  const char *var_name = "GOMP_OPENACC_DIM";
+  const char *env_var = getenv (var_name);
+  if (!env_var)
+return;
+
+  const char *pos = env_var;
+  int i;
+  for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+{
+  if (i && *pos++ != ':')
+	break;
+
+  if (*pos == ':')
+	continue;
+
+  const char *eptr;
+  errno = 0;
+  long val = strtol (pos, (char **)&eptr, 10);
+  if (errno || val < 0 || (unsigned)val != val)
+	break;
+
+  goacc_default_dims[i] = (int)val;
+  pos = eptr;
+}
+}
+
+static void
 handle_omp_display_env (unsigned long stacksize, int wait_policy)
 {
   const char *env;
@@ -1336,6 +1367,7 @@ initialize_env (void)
 goacc_device_num = 0;
 
   parse_acc_device_type ();
+  parse_gomp_openacc_dim ();
 
   goacc_runtime_initialize ();
 
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index a9aca74..607f4c2 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -44,6 +44,7 @@
 #include "config.h"
 #include "gst

[og8] Add __builtin_goacc_parlevel_{id,size}

2018-07-31 Thread Cesar Philippidis
I've committed this patch to og8 which backports the first of Tom's
goacc_parlevel patches from mainline. I'll post a followup patch
which contains various bug fixes. I believe that this patch was
originally introduced in PR82428, or at least it resolves that PR.

Cesar
[og8] Add __builtin_goacc_parlevel_{id,size}

2018-07-31  Cesar Philippidis  

	Backport from mainline:
	2018-05-02  Tom de Vries  

	PR libgomp/82428
	gcc/
	* builtins.def (DEF_GOACC_BUILTIN_ONLY): Define.
	* omp-builtins.def (BUILT_IN_GOACC_PARLEVEL_ID)
	(BUILT_IN_GOACC_PARLEVEL_SIZE): New builtin.
	* builtins.c (expand_builtin_goacc_parlevel_id_size): New function.
	(expand_builtin): Call expand_builtin_goacc_parlevel_id_size.
	* doc/extend.texi (Other Builtins): Add __builtin_goacc_parlevel_id and
	__builtin_goacc_parlevel_size.

	gcc/fortran/
	* f95-lang.c (DEF_GOACC_BUILTIN_ONLY): Define.

	gcc/testsuite/
	* c-c++-common/goacc/builtin-goacc-parlevel-id-size-2.c: New test.
	* c-c++-common/goacc/builtin-goacc-parlevel-id-size.c: New test.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/gang-static-2.c: Use
	__builtin_goacc_parlevel_{id,size}.
	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-g-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-v-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/tile-1.c: Same.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@259850
138bc75d-0d04-0410-961f-82ee72b054a4

diff --git a/gcc/builtins.c b/gcc/builtins.c
index a71555e..300e13c 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -71,6 +71,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-fold.h"
 #include "intl.h"
 #include "file-prefix-map.h" /* remap_macro_filename()  */
+#include "gomp-constants.h"
+#include "omp-general.h"
 
 struct target_builtins default_target_builtins;
 #if SWITCHABLE_TARGET
@@ -6628,6 +6630,71 @@ expand_stack_save (void)
   return ret;
 }
 
+/* Emit code to get the openacc gang, worker or vector id or size.  */
+
+static rtx
+expand_builtin_goacc_parlevel_id_size (tree exp, rtx target, int ignore)
+{
+  const char *name;
+  rtx fallback_retval;
+  rtx_insn *(*gen_fn) (rtx, rtx);
+  switch (DECL_FUNCTION_CODE (get_callee_fndecl (exp)))
+{
+case BUILT_IN_GOACC_PARLEVEL_ID:
+  name = "__builtin_goacc_parlevel_id";
+  fallback_retval = const0_rtx;
+  gen_fn = targetm.gen_oacc_dim_pos;
+  break;
+case BUILT_IN_GOACC_PARLEVEL_SIZE:
+  name = "__builtin_goacc_parlevel_size";
+  fallback_retval = const1_rtx;
+  gen_fn = targetm.gen_oacc_dim_size;
+  break;
+default:
+  gcc_unreachable ();
+}
+
+  if (oacc_get_fn_attrib (current_function_decl) == NULL_TREE)
+{
+  error ("%qs only supported in OpenACC code", name);
+  return const0_rtx;
+}
+
+  tree arg = CALL_EXPR_ARG (exp, 0);
+  if (TREE_CODE (arg) != INTEGER_CST)
+{
+  error ("non-constant argument 0 to %qs", name);
+  return const0_rtx;
+}
+
+  int dim = TREE_INT_CST_LOW (arg);
+  switch (dim)
+{
+case GOMP_DIM_GANG:
+case GOMP_DIM_WORKER:
+case GOMP_DIM_VECTOR:
+  break;
+default:
+  error ("illegal argument 0 to %qs", name);
+  return const0_rtx;
+}
+
+  if (ignore)
+return target;
+
+  if (!targetm.have_oacc_dim_size ())
+{
+  emit_move_insn (target, fallback_retval);
+  return target;
+}
+
+  rtx reg = MEM_P (target) ? gen_reg_rtx (GET_MODE (target)) : target;
+  emit_insn (gen_fn (reg, GEN_INT (dim)));
+  if (reg != target)
+emit_move_insn (target, reg);
+
+  return target;
+}
 
 /* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
@@ -7

[PATCH,nvptx] Truncate config/nvptx/oacc-parallel.c

2018-07-31 Thread Cesar Philippidis
Way back in the GCC 5 days when support for OpenACC was in its infancy,
we used to rely on having various GOACC_ thread functions in the runtime
to implement the execution model, or the lack thereof (that version of GCC
only supported vector level parallelism). However, beginning with GCC 6,
those external functions were replaced with internal functions that get
expanded by the nvptx BE directly.

This patch removes those stale libgomp functions from the nvptx libgomp
target. Is this OK for trunk, or does libgomp still need to maintain
backwards compatibility with GCC 5?

This patch has been bootstrapped and regtested for x86_64 with nvptx
offloading.

Thanks,
Cesar
[PATCH] [libgomp] Truncate config/nvptx/oacc-parallel.c

2018-XX-YY  Cesar Philippidis  
	Thomas Schwinge 

	libgomp/
	* config/nvptx/oacc-parallel.c: Truncate.

(cherry picked from gomp-4_0-branch r228836)
---
 libgomp/config/nvptx/oacc-parallel.c | 358 ---
 1 file changed, 358 deletions(-)

diff --git a/libgomp/config/nvptx/oacc-parallel.c b/libgomp/config/nvptx/oacc-parallel.c
index 5dc53da..e69de29 100644
--- a/libgomp/config/nvptx/oacc-parallel.c
+++ b/libgomp/config/nvptx/oacc-parallel.c
@@ -1,358 +0,0 @@
-/* OpenACC constructs
-
-   Copyright (C) 2014-2018 Free Software Foundation, Inc.
-
-   Contributed by Mentor Embedded.
-
-   This file is part of the GNU Offloading and Multi Processing Library
-   (libgomp).
-
-   Libgomp is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3, or (at your option)
-   any later version.
-
-   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include "libgomp_g.h"
-
-__asm__ (".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n"
-	 "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_num_threads\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n"
-	 "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_thread_num\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n"
-	 "// BEGIN GLOBAL FUNCTION DECL: abort\n"
-	 ".extern .func abort;\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n"
-	 "{\n"
-	 ".reg .u32 %ar1;\n"
-	 ".reg .u32 %retval;\n"
-	 ".reg .u64 %hr10;\n"
-	 ".reg .u32 %r22;\n"
-	 ".reg .u32 %r23;\n"
-	 ".reg .u32 %r24;\n"
-	 ".reg .u32 %r25;\n"
-	 ".reg .u32 %r26;\n"
-	 ".reg .u32 %r27;\n"
-	 ".reg .u32 %r28;\n"
-	 ".reg .u32 %r29;\n"
-	 ".reg .pred %r30;\n"
-	 ".reg .u32 %r31;\n"
-	 ".reg .pred %r32;\n"
-	 ".reg .u32 %r33;\n"
-	 ".reg .pred %r34;\n"
-	 ".local .align 8 .b8 %frame[4];\n"
-	 "ld.param.u32 %ar1,[%in_ar1];\n"
-	 "mov.u32 %r27,%ar1;\n"
-	 "st.local.u32 [%frame],%r27;\n"
-	 "ld.local.u32 %r28,[%frame];\n"
-	 "mov.u32 %r29,1;\n"
-	 "setp.eq.u32 %r30,%r28,%r29;\n"
-	 "@%r30 bra $L4;\n"
-	 "mov.u32 %r31,2;\n"
-	 "setp.eq.u32 %r32,%r28,%r31;\n"
-	 "@%r32 bra $L5;\n"
-	 "mov.u32 %r33,0;\n"
-	 "setp.eq.u32 %r34,%r28,%r33;\n"
-	 "@!%r34 bra $L8;\n"
-	 "mov.u32 %r23,%tid.x;\n"
-	 "mov.u32 %r22,%r23;\n"
-	 "bra $L7;\n"
-	 "$L4:\n"
-	 "mov.u32 %r24,%tid.y;\n"
-	 "mov.u32 %r22,%r24;\n"
-	 "bra $L7;\n"
-	 "$L5:\n"
-	 "mov.u32 %r25,%tid.z;\n"
-	 "mov.u32 %r22,%r25;\n"
-	 "bra $L7;\n"
-	 "$L8:\n"
-	 "{\n"
-	 "{\n"
-	 "call abort;\n"
-	 "}\n"
-	 "}\n"
-	 "$L7:\n"
-	 "mov.u32 %r26,%r22;\n"
-	 "mov.u32 %retval,%r26;\n"
-	 "st.param.u32 [%ou

[PATCH,nvptx] Remove use of CUDA unified memory in libgomp

2018-07-31 Thread Cesar Philippidis
At present, libgomp is using CUDA unified memory only as a buffer pass
to the struct containing the pointers to the data mappings to the
offloaded functions. I'm not sure why unified memory is needed here if
it is still being managed explicitly by the driver.

This patch removes the use of CUDA unified memory from the driver. I
don't recall observing any reduction in performance. Besides,
eventually, we'd like to eliminate the struct containing all pointers to
the offloaded data mappings and pass those pointers as individual
function arguments to cuLaunchKernel directly.

Is this patch OK for trunk? I bootstrapped and regression tested it for
x86_64 with nvptx offloading.

Thanks,
Cesar
[PATCH] [nvptx] Remove use of CUDA unified memory in libgomp

2018-XX-YY  Cesar Philippidis  

	libgomp/
	* plugin/plugin-nvptx.c (struct cuda_map): New.
	(struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
	h_tail with (cuda_map *) map.
	(cuda_map_create): New function.
	(cuda_map_destroy): New function.
	(map_init): Update to use a linked list of cuda_map objects.
	(map_fini): Likewise.
	(map_pop): Likewise.
	(map_push): Likewise.  Return CUdeviceptr instead of void.
	(init_streams_for_device): Remove stale references to ptx_stream
	members.
	(select_stream_for_async): Likewise.
	(nvptx_exec): Update call to map_init.

(cherry picked from gomp-4_0-branch r242614)
---
 libgomp/plugin/plugin-nvptx.c | 167 +++---
 1 file changed, 90 insertions(+), 77 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 1237ea10..d79ddf1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -200,20 +200,20 @@ cuda_error (CUresult r)
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
+struct cuda_map
+{
+  CUdeviceptr d;
+  size_t size;
+  bool active;
+  struct cuda_map *next;
+};
+
 struct ptx_stream
 {
   CUstream stream;
   pthread_t host_thread;
   bool multithreaded;
-
-  CUdeviceptr d;
-  void *h;
-  void *h_begin;
-  void *h_end;
-  void *h_next;
-  void *h_prev;
-  void *h_tail;
-
+  struct cuda_map *map;
   struct ptx_stream *next;
 };
 
@@ -225,101 +225,114 @@ struct nvptx_thread
   struct ptx_device *ptx_dev;
 };
 
+static struct cuda_map *
+cuda_map_create (size_t size)
+{
+  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
+
+  assert (map);
+
+  map->next = NULL;
+  map->size = size;
+  map->active = false;
+
+  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
+  assert (map->d);
+
+  return map;
+}
+
+static void
+cuda_map_destroy (struct cuda_map *map)
+{
+  CUDA_CALL_ASSERT (cuMemFree, map->d);
+  free (map);
+}
+
+/* The following map_* routines manage the CUDA device memory that
+   contains the data mapping arguments for cuLaunchKernel.  Each
+   asynchronous PTX stream may have multiple pending kernel
+   invocations, which are launched in a FIFO order.  As such, the map
+   routines maintains a queue of cuLaunchKernel arguments.
+
+   Calls to map_push and map_pop must be guarded by ptx_event_lock.
+   Likewise, calls to map_init and map_fini are guarded by
+   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
+   GOMP_OFFLOAD_fini_device, respectively.  */
+
 static bool
 map_init (struct ptx_stream *s)
 {
   int size = getpagesize ();
 
   assert (s);
-  assert (!s->d);
-  assert (!s->h);
-
-  CUDA_CALL (cuMemAllocHost, &s->h, size);
-  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
 
-  assert (s->h);
+  s->map = cuda_map_create (size);
 
-  s->h_begin = s->h;
-  s->h_end = s->h_begin + size;
-  s->h_next = s->h_prev = s->h_tail = s->h_begin;
-
-  assert (s->h_next);
-  assert (s->h_end);
   return true;
 }
 
 static bool
 map_fini (struct ptx_stream *s)
 {
-  CUDA_CALL (cuMemFreeHost, s->h);
+  assert (s->map->next == NULL);
+  assert (!s->map->active);
+
+  cuda_map_destroy (s->map);
+
   return true;
 }
 
 static void
 map_pop (struct ptx_stream *s)
 {
-  assert (s != NULL);
-  assert (s->h_next);
-  assert (s->h_prev);
-  assert (s->h_tail);
-
-  s->h_tail = s->h_next;
-
-  if (s->h_tail >= s->h_end)
-s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+  struct cuda_map *next;
 
-  if (s->h_next == s->h_tail)
-s->h_prev = s->h_next;
+  assert (s != NULL);
 
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
+  if (s->map->next == NULL)
+{
+  s->map->active = false;
+  return;
+}
 
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  next = s->map->next;
+  cuda_map_destroy (s->map);
+  s->map = next;
 }
 
-static void
-map_push (struct ptx_stream *s, size_t size,

[PATCH,nvptx] Remove use of 'struct map' from plugin (nvptx)

2018-07-31 Thread Cesar Philippidis
This is an old patch which removes the struct map from the nvptx plugin.
I believe at one point this was supposed to be used to manage async data
mappings, but in practice that never worked out.

Is this OK for trunk? I bootstrapped and regtested on x86_64 with nvptx
offloading.

Thanks,
Cesar
[PATCH] Remove use of 'struct map' from plugin (nvptx)

2018-XX-YY  Cesar Philippidis  
	James Norris 	

	libgomp/
	* plugin/plugin-nvptx.c (struct map): Removed.
	(map_init, map_pop): Remove use of struct map. (map_push):
	Likewise and change argument list.
	* testsuite/libgomp.oacc-c-c++-common/mapping-1.c: New

(cherry picked from gomp-4_0-branch r231616)
---
 libgomp/plugin/plugin-nvptx.c  | 33 +++-
 .../libgomp.oacc-c-c++-common/mapping-1.c  | 63 ++
 2 files changed, 69 insertions(+), 27 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/mapping-1.c

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a92f054..1237ea10 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -225,13 +225,6 @@ struct nvptx_thread
   struct ptx_device *ptx_dev;
 };
 
-struct map
-{
-  int async;
-  size_t  size;
-  charmappings[0];
-};
-
 static bool
 map_init (struct ptx_stream *s)
 {
@@ -265,16 +258,12 @@ map_fini (struct ptx_stream *s)
 static void
 map_pop (struct ptx_stream *s)
 {
-  struct map *m;
-
   assert (s != NULL);
   assert (s->h_next);
   assert (s->h_prev);
   assert (s->h_tail);
 
-  m = s->h_tail;
-
-  s->h_tail += m->size;
+  s->h_tail = s->h_next;
 
   if (s->h_tail >= s->h_end)
 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
@@ -292,37 +281,27 @@ map_pop (struct ptx_stream *s)
 }
 
 static void
-map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
+map_push (struct ptx_stream *s, size_t size, void **h, void **d)
 {
   int left;
   int offset;
-  struct map *m;
 
   assert (s != NULL);
 
   left = s->h_end - s->h_next;
-  size += sizeof (struct map);
 
   assert (s->h_prev);
   assert (s->h_next);
 
   if (size >= left)
 {
-  m = s->h_prev;
-  m->size += left;
-  s->h_next = s->h_begin;
-
-  if (s->h_next + size > s->h_end)
-	GOMP_PLUGIN_fatal ("unable to push map");
+  assert (s->h_next == s->h_prev);
+  s->h_next = s->h_prev = s->h_tail = s->h_begin;
 }
 
   assert (s->h_next);
 
-  m = s->h_next;
-  m->async = async;
-  m->size = size;
-
-  offset = (void *)&m->mappings[0] - s->h;
+  offset = s->h_next - s->h;
 
   *d = (void *)(s->d + offset);
   *h = (void *)(s->h + offset);
@@ -1291,7 +1270,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
  the host and the device. HP is a host pointer to the new chunk, and DP is
  the corresponding device pointer.  */
-  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
+  map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
 
   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/mapping-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/mapping-1.c
new file mode 100644
index 000..593e7d4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/mapping-1.c
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+
+#include 
+#include 
+#include 
+
+/* Exercise the kernel launch argument mapping.  */
+
+int
+main (int argc, char **argv)
+{
+  int a[256], b[256], c[256], d[256], e[256], f[256];
+  int i;
+  int n;
+
+  /* 48 is the size of the mappings for the first parallel construct.  */
+  n = sysconf (_SC_PAGESIZE) / 48 - 1;
+
+  i = 0;
+
+  for (i = 0; i < n; i++)
+{
+  #pragma acc parallel copy (a, b, c, d)
+	{
+	  int j;
+
+	  for (j = 0; j < 256; j++)
+	{
+	  a[j] = j;
+	  b[j] = j;
+	  c[j] = j;
+	  d[j] = j;
+	}
+	}
+}
+
+#pragma acc parallel copy (a, b, c, d, e, f)
+  {
+int j;
+
+for (j = 0; j < 256; j++)
+  {
+	a[j] = j;
+	b[j] = j;
+	c[j] = j;
+	d[j] = j;
+	e[j] = j;
+	f[j] = j;
+  }
+  }
+
+  for (i = 0; i < 256; i++)
+   {
+ if (a[i] != i) abort();
+ if (b[i] != i) abort();
+ if (c[i] != i) abort();
+ if (d[i] != i) abort();
+ if (e[i] != i) abort();
+ if (f[i] != i) abort();
+   }
+
+  exit (0);
+}
-- 
2.7.4



[PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

2018-07-31 Thread Cesar Philippidis
The attached patch teaches libgomp how to use the CUDA thread occupancy
calculator built into the CUDA driver. Despite both being based off the
CUDA thread occupancy spreadsheet distributed with CUDA, the built in
occupancy calculator differs from the occupancy calculator in og8 in two
key ways. First, og8 launches twice the number of gangs as the driver
thread occupancy calculator. This was my attempt at preventing threads
from idling, and it operates on a principle similar to running 'make
-jN', where N is twice the number of CPU threads. Second, whereas og8
always attempts to maximize the CUDA block size, the driver may select a
smaller block, which effectively decreases num_workers.

In terms of performance, there really isn't that much of a difference
between the CUDA driver's occupancy calculator and og8's. However, on
the tests that are impacted, they are generally within a factor of two
from one another, with some tests running faster with the driver
occupancy calculator and others with og8's.

Unfortunately, support for the CUDA driver API isn't universal; it's
only available in CUDA version 6.5 (or 6050) and newer. In this patch,
I'm exploiting the fact that init_cuda_lib only checks for errors on the
last library function initialized. Therefore it guards the usage of

  cuOccupancyMaxPotentialBlockSizeWithFlags

by checking driver_version. If the driver occupancy calculator isn't
available, it falls back to the existing defaults. Maybe the og8 thread
occupancy would make a better default for older versions of CUDA, but
that's a patch for another day.

Is this patch OK for trunk? I bootstrapped and regression tested it
using x86_64 with nvptx offloading.

Thanks,
Cesar
[nvptx] Use CUDA driver API to select default runtime launch geometry

2018-XX-YY  Cesar Philippidis  
	libgomp/
	plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
	(cuDriverGetVersion): Declare.
	(cuOccupancyMaxPotentialBlockSizeWithFlags): Declare.
	plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for
	cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize.
	(ptx_device): Add driver_version member.
	(nvptx_open_device): Initialize it.
	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
	default num_gangs and num_workers when the driver supports it.
---
 libgomp/plugin/cuda/cuda.h|  5 +
 libgomp/plugin/plugin-nvptx.c | 37 -
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825..1fc694d 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
 CUresult cuDeviceGet (CUdevice *, int);
 CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
 CUresult cuDeviceGetCount (int *);
+CUresult cuDriverGetVersion (int *);
 CUresult cuEventCreate (CUevent *, unsigned);
 #define cuEventDestroy cuEventDestroy_v2
 CUresult cuEventDestroy (CUevent);
@@ -170,6 +172,9 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSizeWithFlags (int *, int *, CUfunction,
+		CUoccupancyB2DSize, size_t,
+		int, unsigned int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index b6ec5f8..2647af6 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -63,6 +63,7 @@ CUDA_ONE_CALL (cuCtxSynchronize)	\
 CUDA_ONE_CALL (cuDeviceGet)		\
 CUDA_ONE_CALL (cuDeviceGetAttribute)	\
 CUDA_ONE_CALL (cuDeviceGetCount)	\
+CUDA_ONE_CALL (cuDriverGetVersion)	\
 CUDA_ONE_CALL (cuEventCreate)		\
 CUDA_ONE_CALL (cuEventDestroy)		\
 CUDA_ONE_CALL (cuEventElapsedTime)	\
@@ -94,6 +95,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)	\
 CUDA_ONE_CALL (cuModuleLoad)		\
 CUDA_ONE_CALL (cuModuleLoadData)	\
 CUDA_ONE_CALL (cuModuleUnload)		\
+CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \
 CUDA_ONE_CALL (cuStreamCreate)		\
 CUDA_ONE_CALL (cuStreamDestroy)		\
 CUDA_ONE_CALL (cuStreamQuery)		\
@@ -423,6 +425,7 @@ struct ptx_device
   int max_threads_per_block;
   int max_threads_per_multiprocessor;
   int default_dims[GOMP_DIM_MAX];
+  int driver_version;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock; /* Lock for above list.  */
@@ -734,6 +737,7 @@ nvptx_open_device (int n)
   ptx_dev->ord = n;
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
+  ptx_dev->driver_version = 0;
 
   r = CUDA_

Re: [libgomp, nvptx, committed] Calculate default dims per device

2018-07-30 Thread Cesar Philippidis
On 07/30/2018 03:19 AM, Tom de Vries wrote:
> 
> [libgomp, nvptx] Calculate default dims per device
> 
> The default dimensions are calculated using per-device properties, but
> initialized once and used on all devices.
> 
> This patch fixes this problem by introducing per-device default dimensions.

Neat, thanks!

I wonder if it's worthwhile to optimize the case where a system has more
than one identical GPU.

Cesar


Re: [PATCH 0/8] Reduce/remove dependencies on _GLIBCXX_USE_C99_STDINT_TR1

2018-07-26 Thread Cesar Philippidis
On 07/26/2018 07:01 AM, jwak...@redhat.com wrote:
> From: Jonathan Wakely 

It looks like you're using git send-email for this patch series. And it
seems like you made the same mistake that I did when you configured git
sendemail.from. According to the git send-email manpage, from should be
your email address, however, it really wants it to be of the form

  Full Name <email address>

This is not a huge deal because the email went through, but it was
something that wasn't immediately obvious to me.

Cesar


Re: [patch] adjust default nvptx launch geometry for OpenACC offloaded regions

2018-07-26 Thread Cesar Philippidis
Hi Tom,

I see that you're reviewing the libgomp changes. Please disregard the
following hunk:

On 07/11/2018 12:13 PM, Cesar Philippidis wrote:
> @@ -1199,12 +1202,59 @@ nvptx_exec (void (*fn), size_t mapnum, void 
> **hostaddrs, void **devaddrs,
>default_dims[GOMP_DIM_VECTOR]);
>   }
>pthread_mutex_unlock (&ptx_dev_lock);
> +  int vectors = default_dims[GOMP_DIM_VECTOR];
> +  int workers = default_dims[GOMP_DIM_WORKER];
> +  int gangs = default_dims[GOMP_DIM_GANG];
> +
> +  if (nvptx_thread()->ptx_dev->driver_version > 6050)
> + {
> +   int grids, blocks;
> +   CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
> + &blocks, function, NULL, 0,
> + dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
> +   GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
> +  "grid = %d, block = %d\n", grids, blocks);
> +
> +   gangs = grids * dev_size;
> +   workers = blocks / vectors;
> + }

I revisited this change yesterday and I noticed it was setting gangs
incorrectly. Basically, gangs should be set as follows

  gangs = grids * (blocks / warp_size);

or, to be closer to og8, as

  gangs = 2 * grids * (blocks / warp_size);

The use of that magic constant 2 is to prevent thread starvation. That's
a similar concept behind make -j<2*#threads>.

Anyway, I'm still experimenting with that change. There are still some
discrepancies between the way that I select num_workers and how the
driver does. The driver appears to be a little bit more conservative,
but according to the thread occupancy calculator, that should yield
greater performance on GPUs.

I just wanted to give you a heads up because you seem to be working on this.

Thanks for all of your reviews!

By the way, are you now maintainer of the libgomp nvptx plugin?

Cesar


Re: [PATCH 3/3] Add user-friendly OpenACC diagnostics regarding detected parallelism.

2018-07-26 Thread Cesar Philippidis
On 07/26/2018 01:33 AM, Richard Biener wrote:
> On Wed, Jul 25, 2018 at 5:30 PM Cesar Philippidis
>  wrote:
>>
>> This patch teaches GCC to inform the user how it assigned parallelism
>> to each OpenACC loop at compile time using the -fopt-info-note-omp
>> flag. For instance, given the acc parallel loop nest:
>>
>>   #pragma acc parallel loop
>>   for (...)
>> #pragma acc loop vector
>> for (...)
>>
>> GCC will report something like
>>
>>   foo.c:4:0: note: Detected parallelism <acc loop gang worker>
>>   foo.c:6:0: note: Detected parallelism <acc loop vector>
>>
>> Note how only the inner loop specifies vector parallelism. In this
>> example, GCC automatically assigned gang and worker parallelism to the
>> outermost loop. Perhaps, going forward, it would be useful to
>> distinguish which parallelism was specified by the user and which was
>> assigned by the compiler. But that can be added in a follow up patch.
>>
>> Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
>> with nvptx offloading.
> 
> Shouldn't this use MSG_OPTIMIZED_LOCATIONS instead?  Are there
> any other optinfo notes emitted?  Like when despite pragmas loops
> are not handled or so?

Early on I was just using the diagnostics in omp-grid.c as a model, but
yes, it does make sense to use MSG_OPTIMIZED_LOCATIONS instead of
MSG_NOTE. And no, these are the only optinfo notes that we're emitting
at the moment. All of the other diagnostics are just errors and
warnings, although we probably should revisit that for some of the
forthcoming acc routine diagnostics. Going forward, now that there's
interest in automatic parallelism inside acc kernels, we do plan on
expanding the diagnostics.

The attached revised patch now uses MSG_OPTIMIZED_LOCATIONS for the
diagnostics. If this gets approved for trunk, I'll go ahead and backport
it to og8 and update the OpenACC wiki to change the usage of
-fopt-info-note-omp to -fopt-info-optimized-omp.

Is this OK for trunk?

Thanks,
Cesar
2018-XX-YY  Cesar Philippidis  

	gcc/
	* omp-offload.c (inform_oacc_loop): New function.
	(execute_oacc_device_lower): Use it to display loop parallelism.

	gcc/testsuite/
	* c-c++-common/goacc/note-parallelism.c: New test.
	* gfortran.dg/goacc/note-parallelism.f90: New test.

(cherry picked from gomp-4_0-branch r245683, and gcc/testsuite/ parts of
r245770)

use MSG_OPTIMIZED_LOCATIONS instead of MSG_NOTE
---
 gcc/omp-offload.c | 27 
 .../c-c++-common/goacc/note-parallelism.c | 61 ++
 .../gfortran.dg/goacc/note-parallelism.f90| 62 +++
 3 files changed, 150 insertions(+)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/note-parallelism.c
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/note-parallelism.f90

diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 0abf0283c9e..3582dda3d1a 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -866,6 +866,31 @@ debug_oacc_loop (oacc_loop *loop)
   dump_oacc_loop (stderr, loop, 0);
 }
 
+/* Provide diagnostics on OpenACC loops LOOP, its siblings and its
+   children.  */
+
+static void
+inform_oacc_loop (oacc_loop *loop)
+{
+  const char *seq = loop->mask == 0 ? " seq" : "";
+  const char *gang = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG)
+? " gang" : "";
+  const char *worker = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)
+? " worker" : "";
+  const char *vector = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+? " vector" : "";
+  dump_location_t loc = dump_location_t::from_location_t (loop->loc);
+
+  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
+		   "Detected parallelism <acc loop%s%s%s%s>\n", seq, gang,
+		   worker, vector);
+
+  if (loop->child)
+inform_oacc_loop (loop->child);
+  if (loop->sibling)
+inform_oacc_loop (loop->sibling);
+}
+
 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
structures as we go.  By construction these loops are properly
nested.  */
@@ -1533,6 +1558,8 @@ execute_oacc_device_lower ()
   dump_oacc_loop (dump_file, loops, 0);
   fprintf (dump_file, "\n");
 }
+  if (dump_enabled_p () && loops->child)
+inform_oacc_loop (loops->child);
 
   /* Offloaded targets may introduce new basic blocks, which require
  dominance information to update SSA.  */
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c
new file mode 100644
index 000..2e50d86cd23
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c
@@ -0,0 +1,61 @@
+/* Test the output of -fopt-info-note-omp.  */
+
+/* { dg-additional-options "-fopt-info-note-optimized" } */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc parallel

Re: [PATCH 00/11] [nvptx] Initial vector length changes

2018-07-25 Thread Cesar Philippidis
On 07/24/2018 01:47 PM, ce...@codesourcery.com wrote:
> From: Cesar Philippidis 
> 
> This patch series contains various cleanups and structural
> reorganizations to the NVPTX BE in preparation for the forthcoming
> variable length vector length enhancements. Tom, in order to make
> these changes easier for you to review, I broke these patches into
> logical components. If approved for trunk, would you like to see these
> patches committed individually, or all together in a single huge
> commit?
> 
> One notable change in this patch set is the partial inclusion of the
> PTX_DEFAULT_RUNTIME_DIM change that I previously placed with the
> libgomp default geometry update patch that I posted a couple of weeks
> ago. I don't want to block this patch series so I included the nvptx
> changes in patch 01.
> 
> Is this OK for trunk? I regtested both standalone and offloading
> compilers. I'm seeing some inconsistencies in the standalone compiler
> results, so I might rerun those just to be safe. But the results using
> nvptx as an offloading compiler came back clean.

On further inspection, the inconsistencies turned out to be isolated in
the c++ tests. The c tests results are clean.

Cesar


Re: [PATCH 1/3] Correct the reported line number in fortran combined OpenACC directives

2018-07-25 Thread Cesar Philippidis
On 07/25/2018 08:32 AM, Marek Polacek wrote:
> On Wed, Jul 25, 2018 at 08:29:17AM -0700, Cesar Philippidis wrote:
>> The fortran FE incorrectly records the line locations of combined acc
>> loop directives when it lowers the construct to gimple. Usually this
>> isn't a problem because the fortran FE is able to report problems with
>> acc loops itself. However, there will be inaccuracies if the ME tries
>> to use those locations.
>>
>> Note that test cases are conspicuously absent in this patch.
>> However, without this bug fix, -fopt-info-note-omp will report bogus
>> line numbers. This code patch will be tested in a later patch in
>> this series.
>>
>> Is this OK for trunk? I bootstrapped and regtested it on x86_64 with
>> nvptx offloading.
>>
>> Thanks,
>> Cesar
>>
>> 2018-XX-YY  Cesar Philippidis  
>>
>>  gcc/fortran/
>>  * trans-openmp.c (gfc_trans_oacc_combined_directive): Set the
>>  location of combined acc loops.
>>
>> (cherry picked from gomp-4_0-branch r245653)
>>
>> diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
>> index f038f4c..e7707d0 100644
>> --- a/gcc/fortran/trans-openmp.c
>> +++ b/gcc/fortran/trans-openmp.c
>> @@ -3869,6 +3869,7 @@ gfc_trans_oacc_combined_directive (gfc_code *code)
>>gfc_omp_clauses construct_clauses, loop_clauses;
>>tree stmt, oacc_clauses = NULL_TREE;
>>enum tree_code construct_code;
>> +  location_t loc = input_location;
>>  
>>switch (code->op)
>>  {
>> @@ -3930,12 +3931,16 @@ gfc_trans_oacc_combined_directive (gfc_code *code)
>>else
>>  pushlevel ();
>>stmt = gfc_trans_omp_do (code, EXEC_OACC_LOOP, pblock, _clauses, 
>> NULL);
>> +
>> +  if (CAN_HAVE_LOCATION_P (stmt))
>> +SET_EXPR_LOCATION (stmt, loc);
> 
> This is protected_set_expr_location.

Neat, thanks! This patch includes that correction. Is it ok for trunk
after bootstrapping and regression testing?

Thanks,
Cesar

2018-XX-YY  Cesar Philippidis  

	gcc/fortran/
	* trans-openmp.c (gfc_trans_oacc_combined_directive): Set the
	location of combined acc loops.

(cherry picked from gomp-4_0-branch r245653)
---
 gcc/fortran/trans-openmp.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index f038f4c5bf8..b549c682533 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -3869,6 +3869,7 @@ gfc_trans_oacc_combined_directive (gfc_code *code)
   gfc_omp_clauses construct_clauses, loop_clauses;
   tree stmt, oacc_clauses = NULL_TREE;
   enum tree_code construct_code;
+  location_t loc = input_location;
 
   switch (code->op)
 {
@@ -3929,13 +3930,16 @@ gfc_trans_oacc_combined_directive (gfc_code *code)
 pblock = 
   else
 pushlevel ();
+
   stmt = gfc_trans_omp_do (code, EXEC_OACC_LOOP, pblock, _clauses, NULL);
+  protected_set_expr_location (stmt, loc);
+
   if (TREE_CODE (stmt) != BIND_EXPR)
 stmt = build3_v (BIND_EXPR, NULL, stmt, poplevel (1, 0));
   else
 poplevel (0, 0);
-  stmt = build2_loc (input_location, construct_code, void_type_node, stmt,
-		 oacc_clauses);
+
+  stmt = build2_loc (loc, construct_code, void_type_node, stmt, oacc_clauses);
   gfc_add_expr_to_block (, stmt);
   return gfc_finish_block ();
 }
-- 
2.17.1



[PATCH 3/3] Add user-friendly OpenACC diagnostics regarding detected parallelism.

2018-07-25 Thread Cesar Philippidis
This patch teaches GCC to inform the user how it assigned parallelism
to each OpenACC loop at compile time using the -fopt-info-note-omp
flag. For instance, given the acc parallel loop nest:

  #pragma acc parallel loop
  for (...)
#pragma acc loop vector
for (...)

GCC will report something like

  foo.c:4:0: note: Detected parallelism <acc loop gang worker>
  foo.c:6:0: note: Detected parallelism <acc loop vector>

Note how only the inner loop specifies vector parallelism. In this
example, GCC automatically assigned gang and worker parallelism to the
outermost loop. Perhaps, going forward, it would be useful to
distinguish which parallelism was specified by the user and which was
assigned by the compiler. But that can be added in a follow up patch.

Is this patch OK for trunk? I bootstrapped and regtested it for x86_64
with nvptx offloading.

Thanks,
Cesar

2018-XX-YY  Cesar Philippidis  

gcc/
* omp-offload.c (inform_oacc_loop): New function.
(execute_oacc_device_lower): Use it to display loop parallelism.

gcc/testsuite/
* c-c++-common/goacc/note-parallelism.c: New test.
* gfortran.dg/goacc/note-parallelism.f90: New test.

(cherry picked from gomp-4_0-branch r245683, and gcc/testsuite/ parts of
r245770)

diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 0abf028..66b99bb 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -866,6 +866,31 @@ debug_oacc_loop (oacc_loop *loop)
   dump_oacc_loop (stderr, loop, 0);
 }
 
+/* Provide diagnostics on OpenACC loops LOOP, its siblings and its
+   children.  */
+
+static void
+inform_oacc_loop (oacc_loop *loop)
+{
+  const char *seq = loop->mask == 0 ? " seq" : "";
+  const char *gang = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG)
+? " gang" : "";
+  const char *worker = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)
+? " worker" : "";
+  const char *vector = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+? " vector" : "";
+  dump_location_t loc = dump_location_t::from_location_t (loop->loc);
+
+  dump_printf_loc (MSG_NOTE, loc,
+  "Detected parallelism \n", seq, gang,
+  worker, vector);
+
+  if (loop->child)
+inform_oacc_loop (loop->child);
+  if (loop->sibling)
+inform_oacc_loop (loop->sibling);
+}
+
 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
structures as we go.  By construction these loops are properly
nested.  */
@@ -1533,6 +1558,8 @@ execute_oacc_device_lower ()
   dump_oacc_loop (dump_file, loops, 0);
   fprintf (dump_file, "\n");
 }
+  if (dump_enabled_p () && loops->child)
+inform_oacc_loop (loops->child);
 
   /* Offloaded targets may introduce new basic blocks, which require
  dominance information to update SSA.  */
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism.c 
b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c
new file mode 100644
index 000..3ec794c
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c
@@ -0,0 +1,61 @@
+/* Test the output of -fopt-info-note-omp.  */
+
+/* { dg-additional-options "-fopt-info-note-omp" } */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc parallel loop seq /* { dg-message "note: Detected parallelism " } */
+  for (x = 0; x < 10; x++)
+;
+
+#pragma acc parallel loop gang /* { dg-message "note: Detected parallelism 
" } */
+  for (x = 0; x < 10; x++)
+;
+
+#pragma acc parallel loop worker /* { dg-message "note: Detected parallelism 
" } */
+  for (x = 0; x < 10; x++)
+;
+
+#pragma acc parallel loop vector /* { dg-message "note: Detected parallelism 
" } */
+  for (x = 0; x < 10; x++)
+;
+
+#pragma acc parallel loop gang vector /* { dg-message "note: Detected 
parallelism " } */
+  for (x = 0; x < 10; x++)
+;
+
+#pragma acc parallel loop gang worker /* { dg-message "note: Detected 
parallelism " } */
+  for (x = 0; x < 10; x++)
+;
+
+#pragma acc parallel loop worker vector /* { dg-message "note: Detected 
parallelism " } */
+  for (x = 0; x < 10; x++)
+;
+
+#pragma acc parallel loop gang worker vector /* { dg-message "note: Detected 
parallelism " } */
+  for (x = 0; x < 10; x++)
+;
+
+#pragma acc parallel loop /* { dg-message "note: Detected parallelism " } */
+  for (x = 0; x < 10; x++)
+;
+
+#pragma acc parallel loop /* { dg-message "note: Detected parallelism " } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop /* { dg-message "note: Detected parallelism " } */
+for (y = 0; y < 10; y++)
+  ;
+
+#pragma acc parallel loop gang /* { dg-message "note: Detected parallelism 
" } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop worker /* { dg-message "note: Detected parallelism " } */
+ 

[PATCH 2/3] Correct the reported line number in c++ combined OpenACC directives

2018-07-25 Thread Cesar Philippidis
Like the fortran FE, the C++ FE doesn't set the expr_location of the
split acc loop in combined acc parallel/kernels loop directives. This
only happens with combined directives, otherwise
cp_parser_omp_construct would be responsible for setting the
location. After fixing this bug, I was able to resolve a couple of
long standing diagnostics discrepancies between the c/c++ FEs in the
test suite.

Is this patch OK for trunk? I bootstrapped and regtested using x86_64
with nvptx offloading.

Thanks,
Cesar

2018-XX-YY  Cesar Philippidis  

gcc/cp/
* parser.c (cp_parser_oacc_kernels_parallel): Adjust EXPR_LOCATION
on the combined acc loop.

gcc/testsuite/
* c-c++-common/goacc/combined-directives-3.c: New test.
* c-c++-common/goacc/loop-2-kernels.c (void K): Adjust test.
* c-c++-common/goacc/loop-2-parallel.c (void P): Adjust test.
* c-c++-common/goacc/loop-3.c (void p2): Adjust test.

(cherry picked from gomp-4_0-branch r245673)

diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 90d5d00..52e61fc 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -37183,8 +37183,9 @@ cp_parser_oacc_kernels_parallel (cp_parser *parser, 
cp_token *pragma_tok,
  cp_lexer_consume_token (parser->lexer);
  tree block = begin_omp_parallel ();
  tree clauses;
- cp_parser_oacc_loop (parser, pragma_tok, p_name, mask, ,
-  if_p);
+ tree stmt = cp_parser_oacc_loop (parser, pragma_tok, p_name, mask,
+  , if_p);
+ protected_set_expr_location (stmt, pragma_tok->location);
  return finish_omp_construct (code, block, clauses);
}
 }
diff --git a/gcc/testsuite/c-c++-common/goacc/combined-directives-3.c 
b/gcc/testsuite/c-c++-common/goacc/combined-directives-3.c
new file mode 100644
index 000..77d4182
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/combined-directives-3.c
@@ -0,0 +1,24 @@
+/* Verify the accuracy of the line number associated with combined
+   constructs.  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc parallel loop seq auto /* { dg-error "'seq' overrides other 
OpenACC loop specifiers" } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop
+for (y = 0; y < 10; y++)
+  ;
+
+#pragma acc parallel loop gang auto /* { dg-error "'auto' conflicts with other 
OpenACC loop specifiers" } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop worker auto /* { dg-error "'auto' conflicts with other 
OpenACC loop specifiers" } */
+for (y = 0; y < 10; y++)
+#pragma acc loop vector
+  for (z = 0; z < 10; z++)
+   ;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c 
b/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c
index 01ad32d..3a11ef5f 100644
--- a/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c
+++ b/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c
@@ -145,8 +145,8 @@ void K(void)
 #pragma acc kernels loop worker(num:5)
   for (i = 0; i < 10; i++)
 { }
-#pragma acc kernels loop seq worker // { dg-error "'seq' overrides" "" { 
target c } }
-  for (i = 0; i < 10; i++) // { dg-error "'seq' overrides" "" { target c++ } }
+#pragma acc kernels loop seq worker // { dg-error "'seq' overrides" }
+  for (i = 0; i < 10; i++)
 { }
 #pragma acc kernels loop gang worker
   for (i = 0; i < 10; i++)
@@ -161,8 +161,8 @@ void K(void)
 #pragma acc kernels loop vector(length:5)
   for (i = 0; i < 10; i++)
 { }
-#pragma acc kernels loop seq vector // { dg-error "'seq' overrides" "" { 
target c } }
-  for (i = 0; i < 10; i++) // { dg-error "'seq' overrides" "" { target c++ } }
+#pragma acc kernels loop seq vector // { dg-error "'seq' overrides" }
+  for (i = 0; i < 10; i++)
 { }
 #pragma acc kernels loop gang vector
   for (i = 0; i < 10; i++)
@@ -174,16 +174,16 @@ void K(void)
 #pragma acc kernels loop auto
   for (i = 0; i < 10; i++)
 { }
-#pragma acc kernels loop seq auto // { dg-error "'seq' overrides" "" { target 
c } }
-  for (i = 0; i < 10; i++) // { dg-error "'seq' overrides" "" { target c++ } }
+#pragma acc kernels loop seq auto // { dg-error "'seq' overrides" }
+  for (i = 0; i < 10; i++)
 { }
-#pragma acc kernels loop gang auto // { dg-error "'auto' conflicts" "" { 
target c } }
-  for (i = 0; i < 10; i++) // { dg-error "'auto' conflicts" "" { target c++ } }
+#pragma acc kernels loop gang auto // { dg-error "'auto' conflicts" }
+  for (i = 0; i < 10; i++)
 { }
-#pragma acc kernels loop worker auto // { dg-error "'auto' conflicts" "" { 
target c } }
-  for (i = 0; i < 10; i++) // { dg-error "'auto' conflicts" "" { target c++ } }
+#prag

[PATCH 1/3] Correct the reported line number in fortran combined OpenACC directives

2018-07-25 Thread Cesar Philippidis
The fortran FE incorrectly records the line locations of combined acc
loop directives when it lowers the construct to gimple. Usually this
isn't a problem because the fortran FE is able to report problems with
acc loops itself. However, there will be inaccuracies if the ME tries
to use those locations.

Note that test cases are conspicuously absent in this patch.
However, without this bug fix, -fopt-info-note-omp will report bogus
line numbers. This code patch will be tested in a later patch in
this series.

Is this OK for trunk? I bootstrapped and regtested it on x86_64 with
nvptx offloading.

Thanks,
Cesar

2018-XX-YY  Cesar Philippidis  

gcc/fortran/
* trans-openmp.c (gfc_trans_oacc_combined_directive): Set the
location of combined acc loops.

(cherry picked from gomp-4_0-branch r245653)

diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index f038f4c..e7707d0 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -3869,6 +3869,7 @@ gfc_trans_oacc_combined_directive (gfc_code *code)
   gfc_omp_clauses construct_clauses, loop_clauses;
   tree stmt, oacc_clauses = NULL_TREE;
   enum tree_code construct_code;
+  location_t loc = input_location;
 
   switch (code->op)
 {
@@ -3930,12 +3931,16 @@ gfc_trans_oacc_combined_directive (gfc_code *code)
   else
 pushlevel ();
   stmt = gfc_trans_omp_do (code, EXEC_OACC_LOOP, pblock, _clauses, NULL);
+
+  if (CAN_HAVE_LOCATION_P (stmt))
+SET_EXPR_LOCATION (stmt, loc);
+
   if (TREE_CODE (stmt) != BIND_EXPR)
 stmt = build3_v (BIND_EXPR, NULL, stmt, poplevel (1, 0));
   else
 poplevel (0, 0);
-  stmt = build2_loc (input_location, construct_code, void_type_node, stmt,
-oacc_clauses);
+
+  stmt = build2_loc (loc, construct_code, void_type_node, stmt, oacc_clauses);
   gfc_add_expr_to_block (, stmt);
   return gfc_finish_block ();
 }
-- 
2.7.4



[PATCH 0/3] Add OpenACC diagnostics to -fopt-info-note-omp

2018-07-25 Thread Cesar Philippidis
This patch series extends -fopt-info-note-omp to include OpenACC loop
diagnostics when it is used in conjunction with -fopenacc. At present,
the diagnostics are limited to reporting how OpenACC loops are
partitioned, e.g., seq, gang, worker or vector. The major advantage of
these diagnostics is that they inform the user how GCC automatically
partitions independent loops, i.e., acc loops without any parallelism
clauses inside acc parallel regions. This information provides the
user with insights on how to select num_gangs, num_workers and
vector_length for their application.

All three patches in this series are independent from one
another. Patches 1 and 2 fix diagnostics bugs involving incorrect line
numbers. Patch 3 is responsible for generating the actual diagnostics.

Cesar


[PATCH] Adjust offsets for present data clauses

2018-07-20 Thread Cesar Philippidis
This is another old gomp4 patch that corrects a bug where the runtime
was passing the wrong offset for subarray data to the accelerator. The
original description of this patch can be found here
<https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01676.html>

I bootstrapped and regtested on x86_64/nvptx. Is it OK for trunk?

Thanks,
Cesar
>From fb743d8a45193c177cb0082400d140949e8c1e6d Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Wed, 24 Aug 2016 00:02:50 +
Subject: [PATCH 5/5] [libgomp, OpenACC] Adjust offsets for present data
 clauses

2018-XX-YY  Cesar Philippidis  

	libgomp/
	* oacc-parallel.c (GOACC_parallel_keyed): Add offset to devaddrs.
	* testsuite/libgomp.oacc-c-c++-common/data_offset.c: New test.
	* testsuite/libgomp.oacc-fortran/data_offset.f90: New test.

(cherry picked from gomp-4_0-branch r239723, 00c2585)
---
 libgomp/oacc-parallel.c   | 10 -
 .../libgomp.oacc-c-c++-common/data_offset.c   | 41 ++
 .../libgomp.oacc-fortran/data_offset.f90  | 43 +++
 3 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/data_offset.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/data_offset.f90

diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index b80ace58590..20e9ab2e251 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -231,8 +231,14 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
 
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
-devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
-			+ tgt->list[i].key->tgt_offset);
+{
+  if (tgt->list[i].key != NULL)
+	devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
++ tgt->list[i].key->tgt_offset
++ tgt->list[i].offset);
+  else
+	devaddrs[i] = NULL;
+}
 
   acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
 			  async, dims, tgt);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data_offset.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data_offset.c
new file mode 100644
index 000..ccbbfcab87b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data_offset.c
@@ -0,0 +1,41 @@
+/* Test present data clauses in acc offloaded regions when the
+   subarray inside the present clause does not have the same base
+   offset value as the subarray in the enclosing acc data or acc enter
+   data variable.  */
+
+#include 
+
+void
+offset (int *data, int n)
+{
+  int i;
+
+#pragma acc parallel loop present (data[0:n])
+  for (i = 0; i < n; i++)
+data[i] = n;
+}
+
+int
+main ()
+{
+  const int n = 30;
+  int data[n], i;
+
+  for (i = 0; i < n; i++)
+data[i] = -1;
+
+#pragma acc data copy(data[0:n])
+  {
+offset (data+10, 10);
+  }
+
+  for (i = 0; i < n; i++)
+{
+  if (i < 10 || i >= 20)
+	assert (data[i] == -1);
+  else
+	assert (data[i] == 10);
+}
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/data_offset.f90 b/libgomp/testsuite/libgomp.oacc-fortran/data_offset.f90
new file mode 100644
index 000..ff8ee39f964
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/data_offset.f90
@@ -0,0 +1,43 @@
+! Test present data clauses in acc offloaded regions when the subarray
+! inside the present clause does not have the same base offset value
+! as the subarray in the enclosing acc data or acc enter data variable.
+
+program test
+  implicit none
+
+  integer, parameter :: n = 30, m = 10
+  integer :: i
+  integer, allocatable :: data(:)
+  logical bounded
+
+  allocate (data(n))
+
+  data(:) = -1
+
+  !$acc data copy (data(5:20))
+  call test_data (data, n, m)
+  !$acc end data
+
+  do i = 1, n
+ bounded = i < m .or. i >= m+m
+ if (bounded .and. (data(i) /= -1)) then
+call abort
+ else if (.not. bounded .and. data(i) /= 10) then
+call abort
+ end if
+  end do
+
+  deallocate (data)
+end program test
+
+subroutine test_data (data, n, m)
+  implicit none
+
+  integer :: n, m, data(n), i
+
+  !$acc parallel loop present (data(m:m))
+  do i = m, m+m-1
+ data(i) = m
+  end do
+  !$acc end parallel loop
+end subroutine test_data
-- 
2.17.1



[PATCH] Enable firstprivate OpenACC reductions

2018-07-20 Thread Cesar Philippidis
At present, all reduction variables are transferred via an implicit
'copy' clause. As shown in the recent patches I've been posting, that
causes a lot of problems when the reduction variables are used by
multiple workers or vectors. This patch teaches the gimplifier to
transfer reduction variables as firstprivate in OpenACC parallel regions,
if they are in an inner loop. This matches the behavior of reductions in
OpenACC 2.6.

Is this patch OK for trunk? I bootstrapped and regtested on x86_64/nvptx.

Thanks,
Cesar
>From 035be51a795ad8bed5342ba181220bf3102bcd6d Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Wed, 31 Jan 2018 07:21:53 -0800
Subject: [PATCH 4/5] Enable firstprivate OpenACC reductions

2018-XX-YY  Cesar Philippidis  

	gcc/
	* gimplify.c (omp_add_variable): Allow certain OpenACC reduction
	variables to remain firstprivate.

	gcc/testsuite/
	* c-c++-common/goacc/reduction-8.c: New test.

(cherry picked from openacc-gcc-7-branch commit
441621739e2a067c97409f8b0e3e30362a7905be, cec00212ad8)
---
 gcc/gimplify.c| 30 --
 .../c-c++-common/goacc/reduction-8.c  | 94 +++
 2 files changed, 117 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/reduction-8.c

diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 737a280cfe9..bcfb029275c 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -6858,9 +6858,16 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
   else
 splay_tree_insert (ctx->variables, (splay_tree_key)decl, flags);
 
-  /* For reductions clauses in OpenACC loop directives, by default create a
- copy clause on the enclosing parallel construct for carrying back the
- results.  */
+  /* For OpenACC loop directives, when a reduction is immediately
+ enclosed within an acc parallel or kernels construct, it must
+ have an implied copy data mapping. E.g.
+
+   #pragma acc parallel
+	 {
+	   #pragma acc loop reduction (+:sum)
+
+ a copy clause for sum should be added on the enclosing parallel
+ construct for carrying back the results.  */
   if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION))
 {
   struct gimplify_omp_ctx *outer_ctx = ctx->outer_context;
@@ -6876,8 +6883,11 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
 	vector = true;
 	}
 
-  /* Set new copy map as 'private' if sure we're not gang-partitioning.  */
-  bool map_private;
+  /* Reduction data maps need to be marked as private for worker
+	 and vector loops, in order to ensure that value of the
+	 reduction carried back to the host.  Set new copy map as
+	 'private' if sure we're not gang-partitioning.  */
+  bool map_private, update_data_map = false;
 
   if (gang)
 	map_private = false;
@@ -6886,6 +6896,10 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
   else
 	map_private = oacc_privatize_reduction (ctx->outer_context);
 
+  if (ctx->outer_context
+	  && ctx->outer_context->region_type == ORT_ACC_PARALLEL)
+	update_data_map = true;
+
   while (outer_ctx)
 	{
 	  n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl);
@@ -6902,7 +6916,8 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
 		  gcc_assert (!(n->value & GOVD_FIRSTPRIVATE)
 			  && (n->value & GOVD_MAP));
 		}
-	  else if (outer_ctx->region_type == ORT_ACC_PARALLEL)
+	  else if (update_data_map
+		   && outer_ctx->region_type == ORT_ACC_PARALLEL)
 		{
 		  /* Remove firstprivate and make it a copy map.  */
 		  n->value &= ~GOVD_FIRSTPRIVATE;
@@ -6914,7 +6929,8 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
 		n->value |= GOVD_MAP_PRIVATE;
 		}
 	}
-	  else if (outer_ctx->region_type == ORT_ACC_PARALLEL)
+	  else if (update_data_map
+		   && outer_ctx->region_type == ORT_ACC_PARALLEL)
 	{
 	  unsigned f = GOVD_MAP | GOVD_SEEN;
 
diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-8.c b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
new file mode 100644
index 000..8a0283f4ac3
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
@@ -0,0 +1,94 @@
+/* { dg-additional-options "-fdump-tree-gimple" } */
+
+#define n 1000
+
+int
+main(void)
+{
+  int i, j;
+  int result, array[n];
+
+#pragma acc parallel loop reduction (+:result)
+  for (i = 0; i < n; i++)
+result ++;
+
+#pragma acc parallel
+#pragma acc loop reduction (+:result)
+  for (i = 0; i < n; i++)
+result ++;
+
+#pragma acc parallel
+#pragma acc loop
+  for (i = 0; i < n; i++)
+{
+  result = i;
+
+#pragma acc loop reduction(+:result)
+  for (j = 0; j < n; j++)
+	result ++;
+
+  array[i] = result;
+}
+
+#pragma acc parallel
+#pragma acc loop
+  fo

[PATCH] Privatize independent OpenACC reductions

2018-07-20 Thread Cesar Philippidis
This is another OpenACC reduction patch to privatize reduction variables
used inside inner acc loops. For some reason, I can't find the original
email announcement on the gcc-patches mailing list. But according to the
ChangeLog, I committed that change to og7 back on Jan 26, 2018.

I bootstrapped and regtested on x86_64/nvptx. Is it OK for trunk?

Thanks,
Cesar
>From a4753e2b40cf3d707aabd7c9d5bad7d8f9be8b6f Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Fri, 26 Jan 2018 08:30:13 -0800
Subject: [PATCH 3/5] Privatize independent OpenACC reductions

2018-XX-YY  Cesar Philippidis  

	gcc/
	* gimplify.c (oacc_privatize_reduction): New function.
	(omp_add_variable): Use it to determine if a reduction variable
	needs to be privatized.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/inner-reduction.c: New test.

(cherry picked from openacc-gcc-7-branch commit
330ba2316fabd0e5525c99fdacedb0bfae270244, 133f3a8fb5c)
---
 gcc/gimplify.c| 35 ++-
 .../inner-reduction.c | 23 
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/inner-reduction.c

diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 7dadf69b758..737a280cfe9 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -6722,6 +6722,32 @@ omp_firstprivatize_type_sizes (struct gimplify_omp_ctx *ctx, tree type)
   lang_hooks.types.omp_firstprivatize_type_sizes (ctx, type);
 }
 
+/* Determine if CTX might contain any gang partitioned loops.  During
+   oacc_dev_low, independent loops are assign gangs at the outermost
+   level, and vectors in the innermost.  */
+
+static bool
+oacc_privatize_reduction (struct gimplify_omp_ctx *ctx)
+{
+  if (ctx == NULL)
+return false;
+
+  if (ctx->region_type != ORT_ACC)
+return false;
+
+  for (tree c = ctx->clauses; c; c = OMP_CLAUSE_CHAIN (c))
+switch (OMP_CLAUSE_CODE (c))
+  {
+  case OMP_CLAUSE_SEQ:
+	return oacc_privatize_reduction (ctx->outer_context);
+  case OMP_CLAUSE_GANG:
+	return true;
+  default:;
+  }
+
+  return true;
+}
+
 /* Add an entry for DECL in the OMP context CTX with FLAGS.  */
 
 static void
@@ -6851,7 +6877,14 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
 	}
 
   /* Set new copy map as 'private' if sure we're not gang-partitioning.  */
-  bool map_private = !gang && (worker || vector);
+  bool map_private;
+
+  if (gang)
+	map_private = false;
+  else if (worker || vector)
+	map_private = true;
+  else
+	map_private = oacc_privatize_reduction (ctx->outer_context);
 
   while (outer_ctx)
 	{
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/inner-reduction.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/inner-reduction.c
new file mode 100644
index 000..0c317dcf8a6
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/inner-reduction.c
@@ -0,0 +1,23 @@
+#include 
+
+int
+main ()
+{
+  const int n = 1000;
+  int i, j, temp, a[n];
+
+#pragma acc parallel loop
+  for (i = 0; i < n; i++)
+{
+  temp = i;
+#pragma acc loop reduction (+:temp)
+  for (j = 0; j < n; j++)
+	temp ++;
+  a[i] = temp;
+}
+
+  for (i = 0; i < n; i++)
+assert (a[i] == i+n);
+
+  return 0;
+}
-- 
2.17.1



[PATCH] Add support for making maps 'private' inside OpenACC offloaded regions

2018-07-20 Thread Cesar Philippidis
Due to the different levels of parallelism available in OpenACC, it is
useful to mark certain variables as GOMP_MAP_PRIVATE so that they can be
used in reductions. This patch was introduced in openacc-gcc-7-branch
here <https://gcc.gnu.org/ml/gcc-patches/2017-09/msg00274.html>.


I bootstrapped and regtested on x86_64/nvptx. Is it OK for trunk?

Thanks,
Cesar

>From b0e7fb09bf3a3f853e77c2712b6f85ad21472e72 Mon Sep 17 00:00:00 2001
From: Chung-Lin Tang 
Date: Tue, 5 Sep 2017 22:09:34 +0800
Subject: [PATCH 2/5] [OpenACC] Add support for making maps 'private' inside
 offloaded regions

2018-XX-YY Chung-Lin Tang  
	   Cesar Philippidis  

	gcc/
	* tree.h (OMP_CLAUSE_MAP_PRIVATE): Define macro.
	* gimplify.c (enum gimplify_omp_var_data): Add GOVD_MAP_PRIVATE enum value.
	(omp_add_variable): Add GOVD_MAP_PRIVATE to reduction clause flags if
	not a gang-partitioned loop directive.
	(gimplify_adjust_omp_clauses_1): Set OMP_CLAUSE_MAP_PRIVATE of new map
	clause to 1 if GOVD_MAP_PRIVATE flag is present.
	* omp-low.c (lower_oacc_reductions): Handle map clauses with
	OMP_CLAUSE_MAP_PRIVATE set in the same manner as firstprivate/private.
	(lower_omp_target): Likewise. Add copy back code for map clauses with
	OMP_CLAUSE_MAP_PRIVATE set.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/reduction-9.c: New test.

(cherry picked from openacc-gcc-7-branch commit
2dc21f336368889c1ebf031801a7613f65899ef1, e17bb2068f9)
---
 gcc/gimplify.c| 34 ++-
 gcc/omp-low.c | 28 +++--
 gcc/tree.h|  3 ++
 .../libgomp.oacc-c-c++-common/reduction-9.c   | 41 +++
 4 files changed, 101 insertions(+), 5 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-9.c

diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index cf8977c8508..7dadf69b758 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -105,6 +105,9 @@ enum gimplify_omp_var_data
   /* Flag for GOVD_MAP: must be present already.  */
   GOVD_MAP_FORCE_PRESENT = 524288,
 
+  /* Flag for GOVD_MAP, copy to/from private storage inside offloaded region.  */
+  GOVD_MAP_PRIVATE = 1048576,
+
   GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE
 			   | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR
 			   | GOVD_LOCAL)
@@ -6835,6 +6838,21 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
   if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION))
 {
   struct gimplify_omp_ctx *outer_ctx = ctx->outer_context;
+
+  bool gang = false, worker = false, vector = false;
+  for (tree c = ctx->clauses; c; c = OMP_CLAUSE_CHAIN (c))
+	{
+	  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_GANG)
+	gang = true;
+	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_WORKER)
+	worker = true;
+	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR)
+	vector = true;
+	}
+
+  /* Set new copy map as 'private' if sure we're not gang-partitioning.  */
+  bool map_private = !gang && (worker || vector);
+
   while (outer_ctx)
 	{
 	  n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl);
@@ -6856,12 +6874,21 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
 		  /* Remove firstprivate and make it a copy map.  */
 		  n->value &= ~GOVD_FIRSTPRIVATE;
 		  n->value |= GOVD_MAP;
+
+		  /* If not gang-partitioned, add MAP_PRIVATE on the map
+		 clause.  */
+		  if (map_private)
+		n->value |= GOVD_MAP_PRIVATE;
 		}
 	}
 	  else if (outer_ctx->region_type == ORT_ACC_PARALLEL)
 	{
-	  splay_tree_insert (outer_ctx->variables, (splay_tree_key)decl,
- GOVD_MAP | GOVD_SEEN);
+	  unsigned f = GOVD_MAP | GOVD_SEEN;
+
+	  /* If not gang-partitioned, add MAP_PRIVATE on the map clause.  */
+	  if (map_private)
+		f |= GOVD_MAP_PRIVATE;
+	  splay_tree_insert (outer_ctx->variables, (splay_tree_key)decl, f);
 	  break;
 	}
 	  outer_ctx = outer_ctx->outer_context;
@@ -8904,6 +8931,9 @@ gimplify_adjust_omp_clauses_1 (splay_tree_node n, void *data)
 	  gcc_unreachable ();
 	}
   OMP_CLAUSE_SET_MAP_KIND (clause, kind);
+  if ((flags & GOVD_MAP_PRIVATE)
+	  && TREE_CODE (OMP_CLAUSE_DECL (clause)) == VAR_DECL)
+	OMP_CLAUSE_MAP_PRIVATE (clause) = 1;
   tree c2 = gomp_needs_data_present (decl);
   /* Handle OpenACC pointers that were declared inside acc data
 	 regions.  */
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 714490d6921..ef3c7651c74 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -4907,7 +4907,9 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
 		  goto has_outer_reduction;
 		}
 		  else if ((OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_FIRSTPRIVATE
-			|| OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_PRIVATE)
+			|| OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_PRIVATE
+			|

[PATCH] Fix PR70828 - broken array-type subarrays inside acc data, in OpenACC

2018-07-20 Thread Cesar Philippidis
Attached is an old gomp-4_0-branch patch that fixes PR70828. Besides
fixing the PR, it also introduces some changes which will enable the
forthcoming nvptx vector length enhancements. More details on the patch
can be found here <https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01293.html>

I bootstrapped and regtested on x86_64/nvptx. Is it OK for trunk?

Thanks,
Cesar
>From 3a58144cfaca8f6e3a889346e736e68a9ed17e6a Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Thu, 18 Aug 2016 01:12:15 +
Subject: [PATCH 1/5] Fix PR70828s "broken array-type subarrays inside acc data
 in openacc"

2018-XX-YY  Cesar Philippidis  

	gcc/
	* gimplify.c (struct gimplify_omp_ctx): Add tree clauses member.
	(new_omp_context): Initialize clauses to NULL_TREE.
	(gimplify_scan_omp_clauses): Set clauses in the gimplify_omp_ctx.
	(omp_clause_matching_array_ref): New function.
	(gomp_needs_data_present): New function.
	(gimplify_adjust_omp_clauses_1): Use preset or pointer omp clause map
	kinds when creating implicit data clauses for OpenACC offloaded
	variables defined used an acc data region as necessary.  Link ACC
	new clauses with the old ones.

	gcc/testsuite/
	* c-c++-common/goacc/acc-data-chain.c: New test.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/pr70828.c: New test.
	* testsuite/libgomp.oacc-fortran/pr70828.f90: New test.
	* testsuite/libgomp.oacc-fortran/lib-13.f90: Remove XFAIL.
---
 gcc/gimplify.c| 101 +-
 .../c-c++-common/goacc/acc-data-chain.c   |  24 +
 .../libgomp.oacc-c-c++-common/pr70828.c   |  25 +
 .../testsuite/libgomp.oacc-fortran/lib-13.f90 |   1 -
 .../libgomp.oacc-fortran/pr70828.f90  |  24 +
 5 files changed, 173 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/acc-data-chain.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/pr70828.f90

diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 4a109aee27a..cf8977c8508 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -191,6 +191,7 @@ struct gimplify_omp_ctx
   bool target_map_scalars_firstprivate;
   bool target_map_pointers_as_0len_arrays;
   bool target_firstprivatize_array_bases;
+  tree clauses;
 };
 
 static struct gimplify_ctx *gimplify_ctxp;
@@ -409,6 +410,7 @@ new_omp_context (enum omp_region_type region_type)
   c->privatized_types = new hash_set<tree>;
   c->location = input_location;
   c->region_type = region_type;
+  c->clauses = NULL_TREE;
   if ((region_type & ORT_TASK) == 0)
 c->default_kind = OMP_CLAUSE_DEFAULT_SHARED;
   else
@@ -7501,6 +7503,7 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p,
   tree *prev_list_p = NULL;
 
   ctx = new_omp_context (region_type);
+  ctx->clauses = *list_p;
   outer_ctx = ctx->outer_context;
   if (code == OMP_TARGET)
 {
@@ -8696,6 +8699,58 @@ struct gimplify_adjust_omp_clauses_data
   gimple_seq *pre_p;
 };
 
+/* Return true if clause contains an array_ref of DECL.  */
+
+static bool
+omp_clause_matching_array_ref (tree clause, tree decl)
+{
+  tree cdecl = OMP_CLAUSE_DECL (clause);
+
+  if (TREE_CODE (cdecl) != ARRAY_REF)
+return false;
+
+  return TREE_OPERAND (cdecl, 0) == decl;
+}
+
+/* Inside OpenACC parallel and kernels regions, the implicit data
+   clauses for arrays must respect the explicit data clauses set by a
+   containing acc data region.  Specifically, care must be taken
+   pointers or if an subarray of a local array is specified in an acc
+   data region, so that the referenced array inside the offloaded
+   region has a present data clasue for that array with an
+   approporiate subarray argument.  This function returns the tree
+   node of the acc data clause that utilizes DECL as an argument.  */
+
+static tree
+gomp_needs_data_present (tree decl)
+{
+  gimplify_omp_ctx *ctx = NULL;
+  bool found_match = false;
+  tree c = NULL_TREE;
+
+  if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
+return NULL_TREE;
+
+  if (gimplify_omp_ctxp->region_type != ORT_ACC_PARALLEL
+  && gimplify_omp_ctxp->region_type != ORT_ACC_KERNELS)
+return NULL_TREE;
+
+  for (ctx = gimplify_omp_ctxp->outer_context; !found_match && ctx;
+   ctx = ctx->outer_context)
+{
+  if (ctx->region_type != ORT_ACC_DATA)
+	break;
+
+  for (c = ctx->clauses; c; c = OMP_CLAUSE_CHAIN (c))
+	if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+	&& (omp_clause_matching_array_ref (c, decl)
+		|| OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_POINTER))
+	  return c;
+}
+
+  return NULL_TREE;
+}
+
 /* For all variables that were not actually used within the context,
remove PRIVATE, SHARED, and FIRSTPRIVATE clauses.  */
 
@@ -8849,7 +8904,51 @@ gimplify_adjust_omp_clauses_1 (splay_tree_node n, void *data)
 	  gcc_unreachable ();
 	}
   OMP_CLAUSE_SET_MAP_KIND (clause, kind);
-  if (DECL_SIZE (de

Re: [patch] adjust default nvptx launch geometry for OpenACC offloaded regions

2018-07-11 Thread Cesar Philippidis
On 07/02/2018 07:14 AM, Tom de Vries wrote:
> On 06/21/2018 03:58 PM, Cesar Philippidis wrote:
>> On 06/20/2018 03:15 PM, Tom de Vries wrote:
>>> On 06/20/2018 11:59 PM, Cesar Philippidis wrote:
>>>> Now it follows the formula contained in
>>>> the "CUDA Occupancy Calculator" spreadsheet that's distributed with CUDA.
>>>
>>> Any reason we're not using the cuda runtime functions to get the
>>> occupancy (see PR85590 - [nvptx, libgomp, openacc] Use cuda runtime fns
>>> to determine launch configuration in nvptx ) ?
>>
>> There are two reasons:
>>
>>   1) cuda_occupancy.h depends on the CUDA runtime to extract the device
>>  properties instead of the CUDA driver API. However, we can always
>>  teach libgomp how to populate the cudaDeviceProp struct using the
>>  driver API.
>>
>>   2) CUDA is not always present on the build host, and that's why
>>  libgomp maintains its own cuda.h. So at the very least, this
>>  functionality would be good to have in libgomp as a fallback
>>  implementation;
> 
> Libgomp maintains its own cuda.h to "allow building GCC with PTX
> offloading even without CUDA being installed" (
> https://gcc.gnu.org/ml/gcc-patches/2017-01/msg00980.html ).
> 
> The libgomp nvptx plugin however uses the cuda driver API to launch
> kernels etc, so we can assume that's always available at launch time.
> And according to the "CUDA Pro Tip: Occupancy API Simplifies Launch
> Configuration", the occupancy API is also available in the driver API.
> 
> What we cannot assume to be available is the occupancy API pre cuda-6.5.
> So it's fine to have a fallback for that (properly isolated in utility
> functions), but for cuda 6.5 and up we want to use the occupancy API.

Here's revision 2 to the patch. I replaced all of my thread occupancy
heuristics with calls to the CUDA driver as you suggested. The
performance is worse than my heuristics, but that's to be expected
because the CUDA driver only guarantees the minimal launch geometry to
fully utilize the hardware, and not the optimal value. I'll
reintroduce my heuristics later as a follow up patch. The major
advantage of the CUDA thread occupancy calculator is that it allows the
runtime to select sensible default num_workers to avoid those annoying
runtime failures due to insufficient GPU hardware resources.

One thing that may stick out in this patch is how it probes for the
driver version instead of the API version. It turns out that the API
version corresponds to the SM version declared in the PTX sources,
whereas the driver version corresponds to the latest version of CUDA
supported by the driver. At least that's the case with driver version
396.24.

>>  its not good to have program fail due to
>>  insufficient hardware resources errors when it is avoidable.
>>
> 
> Right, in fact there are two separate things you're trying to address
> here: launch failure and occupancy heuristic, so split the patch.

That hunk was small, so I included it with this patch. Although if you
insist, I can remove it.

Is this patch OK for trunk? I tested it on x86_64 with nvptx offloading.

Cesar
2018-07-XX  Cesar Philippidis  
	Tom de Vries  

	gcc/
	* config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Rename to ...
	(PTX_DEFAULT_RUNTIME_DIM): ... this.
	(nvptx_goacc_validate_dims): Set default worker and gang dims to
	PTX_DEFAULT_RUNTIME_DIM.
	(nvptx_dim_limit): Ignore GOMP_DIM_WORKER;

	libgomp/
	* plugin/cuda/cuda.h (CUoccupancyB2DSize): Declare.
	(cuOccupancyMaxPotentialBlockSizeWithFlags): Likewise.
	* plugin/plugin-nvptx.c (struct ptx_device): Add driver_version member.
	(nvptx_open_device): Set it.
	(nvptx_exec): Use the CUDA driver to both determine default num_gangs
	and num_workers, and error if the hardware doesn't have sufficient
	resources to launch a kernel.


diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 5608bee8a8d..c1946e75f42 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -5165,7 +5165,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 /* Define dimension sizes for known hardware.  */
 #define PTX_VECTOR_LENGTH 32
 #define PTX_WORKER_LENGTH 32
-#define PTX_GANG_DEFAULT  0 /* Defer to runtime.  */
+#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
 
 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
 
@@ -5214,9 +5214,9 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
 {
   dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
   if (dims[GOMP_DIM_WORKER] < 0)
-	dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
+	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
   if (dims[GOMP_DIM_GANG] < 0)
-	dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
+	dims[GOMP_DIM_GANG] = P

Re: [patch] adjust default nvptx launch geometry for OpenACC offloaded regions

2018-07-02 Thread Cesar Philippidis
On 07/02/2018 07:14 AM, Tom de Vries wrote:
> On 06/21/2018 03:58 PM, Cesar Philippidis wrote:
>> On 06/20/2018 03:15 PM, Tom de Vries wrote:
>>> On 06/20/2018 11:59 PM, Cesar Philippidis wrote:
>>>> Now it follows the formula contained in
>>>> the "CUDA Occupancy Calculator" spreadsheet that's distributed with CUDA.
>>>
>>> Any reason we're not using the cuda runtime functions to get the
>>> occupancy (see PR85590 - [nvptx, libgomp, openacc] Use cuda runtime fns
>>> to determine launch configuration in nvptx ) ?
>>
>> There are two reasons:
>>
>>   1) cuda_occupancy.h depends on the CUDA runtime to extract the device
>>  properties instead of the CUDA driver API. However, we can always
>>  teach libgomp how to populate the cudaDeviceProp struct using the
>>  driver API.
>>
>>   2) CUDA is not always present on the build host, and that's why
>>  libgomp maintains its own cuda.h. So at the very least, this
>>  functionality would be good to have in libgomp as a fallback
>>  implementation;
> 
> Libgomp maintains its own cuda.h to "allow building GCC with PTX
> offloading even without CUDA being installed" (
> https://gcc.gnu.org/ml/gcc-patches/2017-01/msg00980.html ).
> 
> The libgomp nvptx plugin however uses the cuda driver API to launch
> kernels etc, so we can assume that's always available at launch time.
> And according to the "CUDA Pro Tip: Occupancy API Simplifies Launch
> Configuration", the occupancy API is also available in the driver API.

Thanks for the info. I was not aware that the CUDA driver API had a
thread occupancy calculator (it's described in section 4.18).

> What we cannot assume to be available is the occupancy API pre cuda-6.5.
> So it's fine to have a fallback for that (properly isolated in utility
> functions), but for cuda 6.5 and up we want to use the occupancy API.

That seems reasonable. I'll run some experiments with that. In the
meantime, would it be OK to make this fallback the default, then add
support for the driver occupancy calculator as a follow up?

>>  its not good to have program fail due to
>>  insufficient hardware resources errors when it is avoidable.
>>
> 
> Right, in fact there are two separate things you're trying to address
> here: launch failure and occupancy heuristic, so split the patch.

ACK. I'll split those changes into separate patches.

By the way, do you have any preferences on how to break up the nvptx
vector length changes for trunk submission? I was planning on breaking
it down into four components - generic ME changes, tests, nvptx
reductions and the rest. Those two nvptx components are large, so I'll
probably break them down to smaller patches, but I'm not sure if it's
worthwhile to make them independent from one another with the use of a
lot of stub functions.

Cesar


Re: [patch] adjust default nvptx launch geometry for OpenACC offloaded regions

2018-06-29 Thread Cesar Philippidis
On 06/29/2018 10:12 AM, Cesar Philippidis wrote:
> Ping.

While porting the vector length patches to trunk, I realized that I
mistakenly removed support for the environment variable GOMP_OPENACC_DIM
in this patch (thanks for adding those test cases, Tom!). I'll post an
updated version of this patch once I get the vector length patches
working with it.

Cesar

> On 06/20/2018 02:59 PM, Cesar Philippidis wrote:
>> At present, the nvptx libgomp plugin does not take into account the
>> amount of shared resources on GPUs (mostly shared-memory are register
>> usage) when selecting the default num_gangs and num_workers. In certain
>> situations, an OpenACC offloaded function can fail to launch if the GPU
>> does not have sufficient shared resources to accommodate all of the
>> threads in a CUDA block. This typically manifests when a PTX function
>> uses a lot of registers and num_workers is set too large, although it
>> can also happen if the shared-memory has been exhausted by the threads
>> in a vector.
>>
>> This patch resolves that issue by adjusting num_workers based the amount
>> of shared resources used by each threads. If worker parallelism has been
>> requested, libgomp will spawn as many workers as possible up to 32.
>> Without this patch, libgomp would always default to launching 32 workers
>> when worker parallelism is used.
>>
>> Besides for the worker parallelism, this patch also includes some
>> heuristics on selecting num_gangs. Before, the plugin would launch two
>> gangs per GPU multiprocessor. Now it follows the formula contained in
>> the "CUDA Occupancy Calculator" spreadsheet that's distributed with CUDA.
>>
>> Is this patch OK for trunk?
>>
>> Thanks,
>> Cesar
>>
> 



[patch] Add OpenACC Fortran support for deviceptr and variable in common blocks

2018-06-29 Thread Cesar Philippidis
The attached patch adds Fortran support for OpenACC deviceptr
and the use of common block variables in data clauses (both implicit and
explicit). This patch also relaxes the Fortran parser to not error on
certain types of integral expressions and assumed-sized arrays.

With respect to those errors, I removed them because a lot of working
applications do not explicitly use type attributes (like contiguous).
Perhaps it would be better to reduce them to a warning. Any thoughts on
that? My argument for their removal is that, while the standard states
that, say, arrays must be contiguous or bad things will happen, it does
not necessarily mandate that the compiler enforces it. I.e., the intent is
to set the user's expectation that things will go bad if garbage input
is fed to the accelerator. If necessary, I can push back on the OpenACC
standards committee on these issues, but don't expect a quick resolution.

In hindsight, I probably should have kept the error relaxation patches
separate. This patch includes the following patches from og8:

  * (dd8b75a) [OpenACC] Update deviceptr handling
  * (634727d) [OpenACC] Handle Fortran deviceptr clause
  * (d50862a) [Fortran] Remove pointer check in check_array_not_assumed
  * (0793cef) [OpenACC] add support for fortran common blocks
  * (bdc1acc) [Fortran] update gfortran's tile clause error handling
  * (5dc4968) Fix PR72715 "ICE in gfc_trans_omp_do, at
  fortran/trans-openmp.c:3164"

Is this patch OK for trunk? It bootstrapped / regression tested cleanly
for x86_64 with nvptx offloading.

Thanks,
Cesar
2018-06-29  Cesar Philippidis  
	James Norris  

	gcc/fortran/
	* openmp.c (gfc_match_omp_map_clause): Re-write handling of the
	deviceptr clause.  Add new common_blocks argument.  Propagate it to
	gfc_match_omp_variable_list.
	(gfc_match_omp_clauses): Update calls to gfc_match_omp_map_clauses.
	(resolve_positive_int_expr): Promote the warning to an error.
	(check_array_not_assumed): Remove pointer check.
	(resolve_oacc_nested_loops): Error on do concurrent loops.
	* trans-openmp.c (gfc_omp_finish_clause): Don't create pointer data
	mappings for deviceptr clauses.
	(gfc_trans_omp_clauses): Likewise.

	gcc/
	* gimplify.c (enum gimplify_omp_var_data): Add GOVD_DEVICEPTR.
	(oacc_default_clause): Privatize fortran common blocks.
	(omp_notice_variable): Add GOVD_DEVICEPTR attribute when appropriate.
	Defer the expansion of DECL_VALUE_EXPR for common block decls.
	(gimplify_scan_omp_clauses): Add GOVD_DEVICEPTR attribute when
	appropriate.
	(gimplify_adjust_omp_clauses_1): Set GOMP_MAP_FORCE_DEVICEPTR for
	implicit deviceptr mappings.

	gcc/testsuite/
	* c-c++-common/goacc/deviceptr-4.c: Update.
	* gfortran.dg/goacc/common-block-1.f90: New test.
	* gfortran.dg/goacc/common-block-2.f90: New test.
	* gfortran.dg/goacc/loop-2.f95: Update.
	* gfortran.dg/goacc/loop-3-2.f95: Update.
	* gfortran.dg/goacc/loop-3.f95: Update.
	* gfortran.dg/goacc/loop-5.f95: Update.
	* gfortran.dg/goacc/pr72715.f90: New test.
	* gfortran.dg/goacc/sie.f95: Update.
	* gfortran.dg/goacc/tile-1.f90: Update.
	* gfortran.dg/gomp/pr77516.f90: Update.

	libgomp/
	* oacc-parallel.c (GOACC_parallel_keyed): Handle Fortran deviceptr
	clause.
	(GOACC_data_start): Likewise.
	* testsuite/libgomp.oacc-fortran/common-block-1.f90: New test.
	* testsuite/libgomp.oacc-fortran/common-block-2.f90: New test.
	* testsuite/libgomp.oacc-fortran/common-block-3.f90: New test.
	* testsuite/libgomp.oacc-fortran/deviceptr-1.f90: New test.


>From 09c1aa87d9a7db2e08384bb47c80b4a61d218a99 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Mon, 25 Jun 2018 13:10:13 -0700
Subject: [PATCH] fortran deviceptr

dd8b75 [OpenACC] Update deviceptr handling
634727 [OpenACC] Handle Fortran deviceptr clause
0793ce [OpenACC] add support for fortran common blocks
bdc1ac [Fortran] update gfortran's tile clause error handling
d50862 [Fortran] Remove pointer check in check_array_not_assumed
5dc496 Fix PR72715 "ICE in gfc_trans_omp_do, at fortran/trans-openmp.c:3164"

---
 gcc/fortran/openmp.c  |  57 ++---
 gcc/fortran/trans-openmp.c|   9 +
 gcc/gimplify.c|  35 +++-
 .../c-c++-common/goacc/deviceptr-4.c  |   2 +-
 .../gfortran.dg/goacc/common-block-1.f90  |  69 ++
 .../gfortran.dg/goacc/common-block-2.f90  |  49 +
 gcc/testsuite/gfortran.dg/goacc/loop-2.f95|   8 +-
 gcc/testsuite/gfortran.dg/goacc/loop-3-2.f95  |   4 +-
 gcc/testsuite/gfortran.dg/goacc/loop-3.f95|   4 +-
 gcc/testsuite/gfortran.dg/goacc/loop-5.f95|  12 --
 gcc/testsuite/gfortran.dg/goacc/pr72715.f90   |   6 +
 gcc/testsuite/gfortran.dg/goacc/sie.f95   |  36 ++--
 gcc/testsuite/gfortran.dg/goacc/tile-1.f90|  16 +-
 gcc/testsuite/gfortran.dg/gomp/pr77516.f90|   2 +-
 libgomp/oacc-parallel.c   |  11 +-
 .../libgomp.oacc-fortran/common-block-1.f90   | 105 ++
 .../libgomp.oacc-fortran/com

Re: [patch] various OpenACC reduction enhancements - test cases

2018-06-29 Thread Cesar Philippidis
Attached are the updated reduction test cases. Again, these have been
bootstrapped and regression tested cleanly for x86_64 with nvptx
offloading. Is it OK for trunk?

Thanks,
Cesar
2018-06-29  Cesar Philippidis  
	Nathan Sidwell  

	gcc/testsuite/
	* c-c++-common/goacc/orphan-reductions-1.c: New test.
	* c-c++-common/goacc/reduction-7.c: New test.
	* c-c++-common/goacc/routine-4.c: Update.
	* g++.dg/goacc/reductions-1.C: New test.
	* gcc.dg/goacc/loop-processing-1.c: Update.
	* gfortran.dg/goacc/orphan-reductions-1.f90: New test.

	libgomp/
	* libgomp.oacc-c-c++-common/par-reduction-3.c: New test.
	* libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c: New test.
	* libgomp.oacc-fortran/reduction-9.f90: New test.


From b128e80be7cd2c81171fbd9c8b23e786bb832633 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Thu, 21 Jun 2018 11:37:56 -0700
Subject: [PATCH] Trunk reductions patches

OG8 Reduction patches

4469fc4 [Fortran] Permit reductions in gfc_omp_clause_copy_ctor
704f1a2 [nxptx, OpenACC] vector reductions
8a35c89 [OpenACC] Fix a reduction bug involving GOMP_MAP_FIRSTPRIVATE_POINTER variables
16ead33 [OpenACC] Update error messages for c and c++ reductions
65dd9cf Make OpenACC orphan gang reductions errors
5d60102 [PR80547] Handle parallel reductions explicitly initialized by the user
---
 gcc/c/c-parser.c  |  46 +-
 gcc/c/c-typeck.c  |   8 +
 gcc/config/nvptx/nvptx.c  | 233 +++-
 gcc/config/nvptx/nvptx.md |   7 +
 gcc/cp/parser.c   |  27 +-
 gcc/cp/semantics.c|   8 +
 gcc/fortran/openmp.c  |  12 +
 gcc/fortran/trans-openmp.c|   3 +-
 gcc/omp-general.h |   5 +-
 gcc/omp-low.c |  33 +-
 gcc/omp-offload.c |  18 +
 .../c-c++-common/goacc/orphan-reductions-1.c  |  56 ++
 .../c-c++-common/goacc/reduction-7.c  | 111 
 gcc/testsuite/c-c++-common/goacc/routine-4.c  |   8 +-
 gcc/testsuite/g++.dg/goacc/reductions-1.C | 548 ++
 .../gcc.dg/goacc/loop-processing-1.c  |   3 +-
 .../gfortran.dg/goacc/orphan-reductions-1.f90 | 204 +++
 .../par-reduction-3.c |  29 +
 .../reduction-cplx-flt-2.c|  32 +
 .../libgomp.oacc-fortran/reduction-9.f90  |  54 ++
 20 files changed, 1396 insertions(+), 49 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/reduction-7.c
 create mode 100644 gcc/testsuite/g++.dg/goacc/reductions-1.C
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/orphan-reductions-1.f90
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/reduction-9.f90

diff --git a/gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c b/gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c
new file mode 100644
index 000..b0bd4a7de05
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c
@@ -0,0 +1,56 @@
+/* Test orphan reductions.  */
+
+#include 
+
+#pragma acc routine seq
+int
+seq_reduction (int n)
+{
+  int i, sum = 0;
+#pragma acc loop seq reduction(+:sum)
+  for (i = 0; i < n; i++)
+sum = sum + 1;
+
+  return sum;
+}
+
+#pragma acc routine gang
+int
+gang_reduction (int n)
+{
+  int i, s1 = 0, s2 = 0;
+#pragma acc loop gang reduction(+:s1) /* { dg-error "gang reduction on an orphan loop" } */
+  for (i = 0; i < n; i++)
+s1 = s1 + 2;
+
+#pragma acc loop gang reduction(+:s2) /* { dg-error "gang reduction on an orphan loop" } */
+  for (i = 0; i < n; i++)
+s2 = s2 + 2;
+
+
+  return s1 + s2;
+}
+
+#pragma acc routine worker
+int
+worker_reduction (int n)
+{
+  int i, sum = 0;
+#pragma acc loop worker reduction(+:sum)
+  for (i = 0; i < n; i++)
+sum = sum + 3;
+
+  return sum;
+}
+
+#pragma acc routine vector
+int
+vector_reduction (int n)
+{
+  int i, sum = 0;
+#pragma acc loop vector reduction(+:sum)
+  for (i = 0; i < n; i++)
+sum = sum + 4;
+
+  return sum;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-7.c b/gcc/testsuite/c-c++-common/goacc/reduction-7.c
new file mode 100644
index 000..245c848d509
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/reduction-7.c
@@ -0,0 +1,111 @@
+/* Exercise invalid reductions on array and struct members.  */
+
+void
+test_parallel ()
+{
+  struct {
+int a;
+float b[5];
+  } s1, s2[10];
+
+  int i;
+  double z[100];
+
+#pragma acc parallel reduction(+:s1.a) /* { dg-error "invalid reduction variable" } */
+  for (i = 0; i < 10; i++)
+s1.a += 1;
+
+#pragma acc parallel reduction(+:s1.b[3]) /* { dg-error "inva

Re: [patch] various OpenACC reduction enhancements - FE changes

2018-06-29 Thread Cesar Philippidis
Attached are the FE changes for the OpenACC reduction enhancements. It
depends on the ME patch.

Is this patch OK for trunk? It bootstrapped / regression tested cleanly
for x86_64 with nvptx offloading.

Thanks,
Cesar
2018-06-29  Cesar Philippidis  
	Nathan Sidwell  

	gcc/c/
	* c-parser.c (c_parser_omp_variable_list): New c_omp_region_type
	argument.  Use it to specialize handling of OMP_CLAUSE_REDUCTION for
	OpenACC.
	(c_parser_omp_clause_reduction): Update call to
	c_parser_omp_variable_list.  Propagate OpenACC errors as necessary.
	(c_parser_oacc_all_clauses): Update call to
	c_parser_omp_clause_reduction.
	(c_parser_omp_all_clauses): Likewise.
	* c-typeck.c (c_finish_omp_clauses): Emit an error on orphan OpenACC
	gang reductions.

	gcc/cp/
	* parser.c (cp_parser_omp_var_list_no_open):  New c_omp_region_type
	argument.  Use it to specialize handling of OMP_CLAUSE_REDUCTION for
	OpenACC.
	(cp_parser_omp_clause_reduction): Update call to
	cp_parser_omp_variable_list.  Propagate OpenACC errors as necessary.
	(cp_parser_oacc_all_clauses): Update call to
	cp_parser_omp_clause_reduction.
	(cp_parser_omp_all_clauses): Likewise.
	* semantics.c (finish_omp_clauses): Emit an error on orphan OpenACC
	gang reductions.

	gcc/fortran/
	* openmp.c (resolve_oacc_loop_blocks): Emit an error on orphan OpenACC
	gang reductions.
	* trans-openmp.c (gfc_omp_clause_copy_ctor): Permit reductions.

---
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 7a926285f3a..a6f453dae54 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -965,12 +965,13 @@ class token_pair
 
   /* Like token_pair::require_close, except that tokens will be skipped
  until the desired token is found.  An error message is still produced
- if the next token is not as expected.  */
+ if the next token is not as expected, unless QUIET is set.  */
 
-  void skip_until_found_close (c_parser *parser) const
+  void skip_until_found_close (c_parser *parser, bool quiet = false) const
   {
 c_parser_skip_until_found (parser, traits_t::close_token_type,
-			   traits_t::close_gmsgid, m_open_loc);
+			   quiet ? NULL : traits_t::close_gmsgid,
+			   m_open_loc);
   }
 
  private:
@@ -11498,7 +11499,8 @@ c_parser_oacc_wait_list (c_parser *parser, location_t clause_loc, tree list)
 static tree
 c_parser_omp_variable_list (c_parser *parser,
 			location_t clause_loc,
-			enum omp_clause_code kind, tree list)
+			enum omp_clause_code kind, tree list,
+			enum c_omp_region_type ort = C_ORT_OMP)
 {
   if (c_parser_next_token_is_not (parser, CPP_NAME)
   || c_parser_peek_token (parser)->id_kind != C_ID_ID)
@@ -11557,6 +11559,22 @@ c_parser_omp_variable_list (c_parser *parser,
 	  /* FALLTHROUGH  */
 	case OMP_CLAUSE_DEPEND:
 	case OMP_CLAUSE_REDUCTION:
+	  if (kind == OMP_CLAUSE_REDUCTION && ort == C_ORT_ACC)
+		{
+		  switch (c_parser_peek_token (parser)->type)
+		{
+		case CPP_OPEN_PAREN:
+		case CPP_OPEN_SQUARE:
+		case CPP_DOT:
+		case CPP_DEREF:
+		  error ("invalid reduction variable");
+		  t = error_mark_node;
+		default:;
+		  break;
+		}
+		  if (t == error_mark_node)
+		break;
+		}
 	  while (c_parser_next_token_is (parser, CPP_OPEN_SQUARE))
 		{
 		  tree low_bound = NULL_TREE, length = NULL_TREE;
@@ -12789,9 +12807,12 @@ c_parser_omp_clause_private (c_parser *parser, tree list)
  identifier  */
 
 static tree
-c_parser_omp_clause_reduction (c_parser *parser, tree list)
+c_parser_omp_clause_reduction (c_parser *parser, tree list,
+			   enum c_omp_region_type ort)
 {
   location_t clause_loc = c_parser_peek_token (parser)->location;
+  bool seen_error = false;
+
   matching_parens parens;
   if (parens.require_open (parser))
 {
@@ -12855,7 +12876,13 @@ c_parser_omp_clause_reduction (c_parser *parser, tree list)
 	  tree nl, c;
 
 	  nl = c_parser_omp_variable_list (parser, clause_loc,
-	   OMP_CLAUSE_REDUCTION, list);
+	   OMP_CLAUSE_REDUCTION, list, ort);
+	  if (c_parser_peek_token (parser)->type != CPP_CLOSE_PAREN)
+	{
+	  seen_error = true;
+	  goto cleanup;
+	}
+
 	  for (c = nl; c != list; c = OMP_CLAUSE_CHAIN (c))
 	{
 	  tree d = OMP_CLAUSE_DECL (c), type;
@@ -12891,7 +12918,8 @@ c_parser_omp_clause_reduction (c_parser *parser, tree list)
 
 	  list = nl;
 	}
-  parens.skip_until_found_close (parser);
+cleanup:
+  parens.skip_until_found_close (parser, seen_error);
 }
   return list;
 }
@@ -13998,7 +14026,7 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  c_name = "private";
 	  break;
 	case PRAGMA_OACC_CLAUSE_REDUCTION:
-	  clauses = c_parser_omp_clause_reduction (parser, clauses);
+	  clauses = c_parser_omp_clause_reduction (parser, clauses, C_ORT_ACC);
 	  c_name = "reduction";
 	  break;
 	case PRAGMA_OACC_CLAUSE_SEQ:
@@ -14157,7 +14185,7 @@ c_parser_omp_all_clauses (c_parser *parser, omp_clause_ma

Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes

2018-06-29 Thread Cesar Philippidis
The attached patch includes the nvptx and GCC ME reductions enhancements.

Is this patch OK for trunk? It bootstrapped / regression tested cleanly
for x86_64 with nvptx offloading.

Thanks,
Cesar
2018-06-29  Cesar Philippidis  
	Nathan Sidwell  

	gcc/
	* config/nvptx/nvptx.c (nvptx_propagate_unified): New.
	(nvptx_split_blocks): Call it for cond_uni insn.
	(nvptx_expand_cond_uni): New.
	(enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
	(nvptx_init_builtins): Initialize it.
	(nvptx_expand_builtin):
	(nvptx_generate_vector_shuffle): Change integral SHIFT operand to
	tree BITS operand.
	(nvptx_vector_reduction): New.
	(nvptx_adjust_reduction_type): New.
	(nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
	(nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
	Use it to adjust the type of ref_to_res.
	(nvptx_goacc_reduction_teardown):
	* config/nvptx/nvptx.md (cond_uni): New pattern.
	* omp-general.h (enum oacc_loop_flags): Add OLF_REDUCTION enum.
	* omp-low.c (lower_oacc_reductions): Handle reduction decls mapped
	with GOMP_MAP_FIRSTPRIVATE_POINTER.
	(lower_oacc_head_mark): Use OLF_REDUCTION to mark OpenACC reductions.
	* omp-offload.c (oacc_loop_auto_partitions): Don't assign gang
	level parallelism to orphan reductions.
	(default_goacc_reduction): Retype ref_to_res as necessary.

---
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 5608bee8a8d..33ec3db1153 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt ()
 }
 }
 
+/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
+   mark that as unified.  We expect to be in a single block.  */
+
+static void
+nvptx_propagate_unified (rtx_insn *unified)
+{
+  rtx_insn *probe = unified;
+  rtx cond_reg = SET_DEST (PATTERN (unified));
+  rtx pat = NULL_RTX;
+
+  /* Find the comparison.  (We could skip this and simply scan to the
+ block's terminating branch, if we didn't care for self
+ checking.)  */
+  for (;;)
+{
+  probe = next_real_insn (probe);
+  if (!probe)
+	break;
+  pat = PATTERN (probe);
+
+  if (GET_CODE (pat) == SET
+	  && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
+	  && XEXP (SET_SRC (pat), 0) == cond_reg)
+	break;
+  gcc_assert (NONJUMP_INSN_P (probe));
+}
+  gcc_assert (pat);
+  rtx pred_reg = SET_DEST (pat);
+
+  /* Find the branch.  */
+  do
+probe = NEXT_INSN (probe);
+  while (!JUMP_P (probe));
+
+  pat = PATTERN (probe);
+  rtx itec = XEXP (SET_SRC (pat), 0);
+  gcc_assert (XEXP (itec, 0) == pred_reg);
+
+  /* Mark the branch's condition as unified.  */
+  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
+			   UNSPEC_BR_UNIFIED);
+  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
+
+  gcc_assert (ok);
+}
+
 /* Loop structure of the function.  The entire function is described as
a NULL loop.  */
 
@@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
 	continue;
 	  switch (recog_memoized (insn))
 	{
+	case CODE_FOR_cond_uni:
+	  nvptx_propagate_unified (insn);
+	  /* FALLTHROUGH */
 	default:
 	  seen_insn = true;
 	  continue;
@@ -5080,6 +5129,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target,
   return target;
 }
 
+/* Expander for the compare unified builtin.  */
+
+static rtx
+nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
+{
+  if (ignore)
+return target;
+  
+  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
+			 NULL_RTX, mode, EXPAND_NORMAL);
+
+  emit_insn (gen_cond_uni (target, src));
+
+  return target;
+}
 
 /* Codes for all the NVPTX builtins.  */
 enum nvptx_builtins
@@ -5089,6 +5153,7 @@ enum nvptx_builtins
   NVPTX_BUILTIN_WORKER_ADDR,
   NVPTX_BUILTIN_CMP_SWAP,
   NVPTX_BUILTIN_CMP_SWAPLL,
+  NVPTX_BUILTIN_COND_UNI,
   NVPTX_BUILTIN_MAX
 };
 
@@ -5126,6 +5191,7 @@ nvptx_init_builtins (void)
(PTRVOID, ST, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
+  DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE));
 
 #undef DEF
 #undef ST
@@ -5158,6 +5224,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 case NVPTX_BUILTIN_CMP_SWAPLL:
   return nvptx_expand_cmp_swap (exp, target, mode, ignore);
 
+case NVPTX_BUILTIN_COND_UNI:
+  return nvptx_expand_cond_uni (exp, target, mode, ignore);
+
 default: gcc_unreachable ();
 }
 }
@@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset)
 
 static void
 nvptx_generate_vector_shuffle (location_t loc,
-			   tree dest_var, tree var, unsigned shift,
+			   tree dest_var, tree var, tree bits,
 			   gimple_seq *seq)
 {
   unsigned 

  1   2   3   4   5   6   7   >