Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes
On 10/5/18 07:07, Tom de Vries wrote: > On 6/29/18 8:19 PM, Cesar Philippidis wrote: >> The attached patch includes the nvptx and GCC ME reductions enhancements. >> >> Is this patch OK for trunk? It bootstrapped / regression tested cleanly >> for x86_64 with nvptx offloading. >> > > These need fixing: > ... > === ERROR type #5: trailing whitespace (4 error(s)) === > gcc/config/nvptx/nvptx.c:5139:0:██ > gcc/config/nvptx/nvptx.c:5660:8: do█ > gcc/config/nvptx/nvptx.c:5702:0:██ > gcc/config/nvptx/nvptx.c:5726:0:██ > ... Sorry. The attached patch fixes that. > Otherwise, nvptx part LGTM. Tomorrow's my last day at Mentor, so either Thomas or Julian will need to commit it once the other patches get approved. Thanks, Cesar gcc/ * config/nvptx/nvptx.c (nvptx_propagate_unified): New. (nvptx_split_blocks): Call it for cond_uni insn. (nvptx_expand_cond_uni): New. (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI. (nvptx_init_builtins): Initialize it. (nvptx_expand_builtin): (nvptx_generate_vector_shuffle): Change integral SHIFT operand to tree BITS operand. (nvptx_vector_reduction): New. (nvptx_adjust_reduction_type): New. (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res. (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist. (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector. Use it to adjust the type of ref_to_res. (nvptx_goacc_reduction_teardown): * config/nvptx/nvptx.md (cond_uni): New pattern. diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 9903a273863..acb490a9a90 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt () } } +/* UNIFIED is a cond_uni insn. Find the branch insn it affects, and + mark that as unified. We expect to be in a single block. 
*/ + +static void +nvptx_propagate_unified (rtx_insn *unified) +{ + rtx_insn *probe = unified; + rtx cond_reg = SET_DEST (PATTERN (unified)); + rtx pat = NULL_RTX; + + /* Find the comparison. (We could skip this and simply scan to he + blocks' terminating branch, if we didn't care for self + checking.) */ + for (;;) +{ + probe = next_real_insn (probe); + if (!probe) + break; + pat = PATTERN (probe); + + if (GET_CODE (pat) == SET + && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE + && XEXP (SET_SRC (pat), 0) == cond_reg) + break; + gcc_assert (NONJUMP_INSN_P (probe)); +} + gcc_assert (pat); + rtx pred_reg = SET_DEST (pat); + + /* Find the branch. */ + do +probe = NEXT_INSN (probe); + while (!JUMP_P (probe)); + + pat = PATTERN (probe); + rtx itec = XEXP (SET_SRC (pat), 0); + gcc_assert (XEXP (itec, 0) == pred_reg); + + /* Mark the branch's condition as unified. */ + rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg), + UNSPEC_BR_UNIFIED); + bool ok = validate_change (probe, (itec, 0), unspec, false); + + gcc_assert (ok); +} + /* Loop structure of the function. The entire function is described as a NULL loop. */ @@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map) continue; switch (recog_memoized (insn)) { + case CODE_FOR_cond_uni: + nvptx_propagate_unified (insn); + /* FALLTHROUGH */ default: seen_insn = true; continue; @@ -5083,6 +5132,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target, return target; } +/* Expander for the compare unified builtin. */ + +static rtx +nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore) +{ + if (ignore) +return target; + + rtx src = expand_expr (CALL_EXPR_ARG (exp, 0), + NULL_RTX, mode, EXPAND_NORMAL); + + emit_insn (gen_cond_uni (target, src)); + + return target; +} /* Codes for all the NVPTX builtins. 
*/ enum nvptx_builtins @@ -5092,6 +5156,7 @@ enum nvptx_builtins NVPTX_BUILTIN_WORKER_ADDR, NVPTX_BUILTIN_CMP_SWAP, NVPTX_BUILTIN_CMP_SWAPLL, + NVPTX_BUILTIN_COND_UNI, NVPTX_BUILTIN_MAX }; @@ -5129,6 +5194,7 @@ nvptx_init_builtins (void) (PTRVOID, ST, UINT, UINT, NULL_TREE)); DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); + DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE)); #undef DEF #undef ST @@ -5161,6 +5227,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), case NVPTX_BUILTIN_CMP_SWAPLL: return nvptx_expand_cmp_swap (exp, target, mode, ignore); +case NVPTX_BUILTIN_COND_UNI: + return nvptx_expand_cond_uni (exp, target, mode, ignore); + default: gcc_unreachable (); } } @@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset) static void nvptx_generate_vector_shuffle (location_t loc, - tree
[PATCH 4/4] [og8] Attach / Detach compiler tests
This patch introduces a couple of compiler tests for the OpenACC attach and detach clauses. I've committed it to openacc-gcc-8-branch. Cesar 2018-10-30 Cesar Philippidis gcc/testsuite/ * c-c++-common/goacc/mdc-1.c: New test. * c-c++-common/goacc/mdc-2.c: New test. * g++.dg/goacc/mdc.C: New test. --- gcc/testsuite/c-c++-common/goacc/mdc-1.c | 54 +++ gcc/testsuite/c-c++-common/goacc/mdc-2.c | 62 + gcc/testsuite/g++.dg/goacc/mdc.C | 68 3 files changed, 184 insertions(+) create mode 100644 gcc/testsuite/c-c++-common/goacc/mdc-1.c create mode 100644 gcc/testsuite/c-c++-common/goacc/mdc-2.c create mode 100644 gcc/testsuite/g++.dg/goacc/mdc.C diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-1.c b/gcc/testsuite/c-c++-common/goacc/mdc-1.c new file mode 100644 index 000..c20b94ddbdc --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/mdc-1.c @@ -0,0 +1,54 @@ +/* Test OpenACC's support for manual deep copy, including the attach + and detach clauses. */ + +/* { dg-additional-options "-fdump-tree-omplower" } */ + +void +t1 () +{ + struct foo { +int *a, *b, c, d, *e; + } s; + + int *a, *z; + +#pragma acc enter data copyin(s) + { +#pragma acc data copy(s.a[0:10]) copy(z[0:10]) +{ + s.e = z; +#pragma acc parallel loop attach(s.e) + for (int i = 0; i < 10; i++) +s.a[i] = s.e[i]; + + + a = s.e; +#pragma acc enter data attach(a) +#pragma acc exit data detach(a) +} + +#pragma acc enter data copyin(a) +#pragma acc acc enter data attach(s.e) +#pragma acc exit data detach(s.e) + +#pragma acc data attach(s.e) +{ +} +#pragma acc exit data delete(a) + +#pragma acc exit data detach(a) finalize +#pragma acc exit data detach(s.a) finalize + } +} + +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:s .len: 32.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.tofrom:.z .len: 40.. map.struct:s .len: 1.. map.alloc:s.a .len: 8.. map.tofrom:._1 .len: 40.. map.always_pointer:s.a .pointer assign, bias: 0.." 
1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_parallel map.struct:s .len: 1.. map.attach:s.e .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.attach:a .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:a .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:a .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:s.e .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.struct:s .len: 1.. map.attach:s.e .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.release:a .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:a .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:s.a .len: 8.." 1 "omplower" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-2.c b/gcc/testsuite/c-c++-common/goacc/mdc-2.c new file mode 100644 index 000..ebfb99d4caf --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/mdc-2.c @@ -0,0 +1,62 @@ +/* Test OpenACC's support for manual deep copy, including the attach + and detach clauses. */ + +void +t1 () +{ + struct foo { +int *a, *b, c, d, *e; + } s; + + int *a, *z, scalar, **y; + +#pragma acc enter data copyin(s) detach(z) /* { dg-error ".detach. is not valid for" } */ + { +#pragma acc data copy(s.a[0:10]) copy(z[0:10]) +{ + s.e = z; +#pragma acc parallel loop attach(s.e) detach(s.b) /* { dg-error ".detach. is not valid for" } */ + for (int i = 0; i < 10; i++) +s.a[i] = s.e[i]; + + a = s.e; +#pragma acc enter data attach(a) detach(s.c) /* { dg-error ".detach. 
is not valid for" } */ +#pragma acc exit data detach(a) +} + +#pragma acc enter data attach(z[:5]) /* { dg-error "array section in .attach. clause" } */ +/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */ +#pragma acc exit data detach(z[:5]) /* { dg-error "array section in .detach. clause" } */ +/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */ +#pragma acc enter data attach(z[1:]) /* { dg-error "array section in .attach. clause" } */ +/* { dg-error "
[PATCH 3/4] [og8] Attach / Detach C++ FE changes
As noted here <https://gcc.gnu.org/ml/gcc-patches/2018-10/msg01643.html>, this patch adds support for attach and detach in the C++ front end. Unlike trunk, OG8 has some preliminary support for the `this` pointer. Consequently, finish_omp_clauses had to take care of a couple more cases in order to get libgomp.oacc-c++/this.C to work. I've committed this patch to openacc-gcc-8-branch. Cesar 2018-10-30 Cesar Philippidis gcc/cp/ * parser.c (cp_parser_omp_clause_name): Scan for attach and detach. (cp_parser_oacc_data_clause): Handle PRAGMA_OACC_CLAUSE_{ATTACH, DETACH}. (cp_parser_oacc_all_clauses): Likewise. (OACC_DATA_CLAUSE_MASK): Add support for attach and detach. (OACC_ENTER_DATA_CLAUSE_MASK): Likewise. (cp_parser_oacc_declare): Likewise. (OACC_KERNELS_CLAUSE_MASK): Likewise. (OACC_PARALLEL_CLAUSE_MASK): Likewise. * semantics.c (handle_omp_array_sections_1): Reject subarrays for attach and detach. (cp_oacc_check_attachments): New function. (finish_omp_clauses): Use it. Also, allow structure fields and class members to appear in OpenACC data clauses. 
--- gcc/cp/parser.c| 28 +- gcc/cp/semantics.c | 71 +- 2 files changed, 91 insertions(+), 8 deletions(-) diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 9a8ec70bb17..8161d6301df 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -31266,6 +31266,8 @@ cp_parser_omp_clause_name (cp_parser *parser, bool consume_token = true) result = PRAGMA_OMP_CLAUSE_ALIGNED; else if (!strcmp ("async", p)) result = PRAGMA_OACC_CLAUSE_ASYNC; + else if (!strcmp ("attach", p)) + result = PRAGMA_OACC_CLAUSE_ATTACH; break; case 'b': if (!strcmp ("bind", p)) @@ -31290,6 +31292,8 @@ cp_parser_omp_clause_name (cp_parser *parser, bool consume_token = true) result = PRAGMA_OMP_CLAUSE_DEFAULTMAP; else if (!strcmp ("depend", p)) result = PRAGMA_OMP_CLAUSE_DEPEND; + else if (!strcmp ("detach", p)) + result = PRAGMA_OACC_CLAUSE_DETACH; else if (!strcmp ("device", p)) result = PRAGMA_OMP_CLAUSE_DEVICE; else if (!strcmp ("deviceptr", p)) @@ -31679,11 +31683,13 @@ cp_parser_omp_var_list (cp_parser *parser, enum omp_clause_code kind, tree list) } /* OpenACC 2.5: + attach ( variable-list ) copy ( variable-list ) copyin ( variable-list ) copyout ( variable-list ) create ( variable-list ) delete ( variable-list ) + detach ( variable-list ) present ( variable-list ) */ static tree @@ -31693,6 +31699,9 @@ cp_parser_oacc_data_clause (cp_parser *parser, pragma_omp_clause c_kind, enum gomp_map_kind kind; switch (c_kind) { +case PRAGMA_OACC_CLAUSE_ATTACH: + kind = GOMP_MAP_ATTACH; + break; case PRAGMA_OACC_CLAUSE_COPY: kind = GOMP_MAP_TOFROM; break; @@ -31708,6 +31717,9 @@ cp_parser_oacc_data_clause (cp_parser *parser, pragma_omp_clause c_kind, case PRAGMA_OACC_CLAUSE_DELETE: kind = GOMP_MAP_RELEASE; break; +case PRAGMA_OACC_CLAUSE_DETACH: + kind = GOMP_MAP_DETACH; + break; case PRAGMA_OACC_CLAUSE_DEVICE: kind = GOMP_MAP_FORCE_TO; break; @@ -33851,6 +33863,10 @@ cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask, clauses, here); c_name = "auto"; break; + case PRAGMA_OACC_CLAUSE_ATTACH: + 
clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses); + c_name = "attach"; + break; case PRAGMA_OACC_CLAUSE_BIND: clauses = cp_parser_oacc_clause_bind (parser, clauses); c_name = "bind"; @@ -33883,6 +33899,10 @@ cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask, clauses = cp_parser_omp_clause_default (parser, clauses, here, true); c_name = "default"; break; + case PRAGMA_OACC_CLAUSE_DETACH: + clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses); + c_name = "detach"; + break; case PRAGMA_OACC_CLAUSE_DEVICE: clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses); c_name = "device"; @@ -36904,10 +36924,12 @@ cp_parser_oacc_cache (cp_parser *parser, cp_token *pragma_tok) structured-block */ #define OACC_DATA_CLAUSE_MASK \ - ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY) \ + ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH) \ + | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE) \ + | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DETACH) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT) ) @@ -37107,6 +37129,7 @@ cp_parser_oacc_dec
[PATCH 2/4] [og8] Attach / Detach C FE changes
As noted here <https://gcc.gnu.org/ml/gcc-patches/2018-10/msg01642.html>, this patch adds support for attach and detach in the C front end. The only major difference between this and the trunk patch is that OG8 supports the acc routine bind clause, so the trunk patch didn't apply cleanly. Other than that, these patches are identical. I've committed this patch to openacc-gcc-8-branch. Cesar 2018-10-30 Cesar Philippidis gcc/c/ * c-parser.c (c_parser_omp_clause_name): Scan for attach and detach. (c_parser_oacc_data_clause): Handle PRAGMA_OACC_CLAUSE_{ATTACH, DETACH}. (c_parser_oacc_all_clauses): Likewise. (OACC_DATA_CLAUSE_MASK): Add support for attach and detach. (OACC_ENTER_DATA_CLAUSE_MASK): Likewise. (OACC_KERNELS_CLAUSE_MASK): Likewise. (OACC_PARALLEL_CLAUSE_MASK): Likewise. * c-typeck.c (handle_omp_array_sections_1): Reject subarrays for attach and detach. (c_oacc_check_attachments): New function. (c_finish_omp_clauses): Use it. --- gcc/c/c-parser.c | 27 +++- gcc/c/c-typeck.c | 55 +--- 2 files changed, 78 insertions(+), 4 deletions(-) diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 578c0660c54..ffc5fe9b0d3 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -11226,6 +11226,8 @@ c_parser_omp_clause_name (c_parser *parser, bool consume_token = true) result = PRAGMA_OMP_CLAUSE_ALIGNED; else if (!strcmp ("async", p)) result = PRAGMA_OACC_CLAUSE_ASYNC; + else if (!strcmp ("attach", p)) + result = PRAGMA_OACC_CLAUSE_ATTACH; break; case 'b': if (!strcmp ("bind", p)) @@ -11252,6 +11254,8 @@ c_parser_omp_clause_name (c_parser *parser, bool consume_token = true) result = PRAGMA_OACC_CLAUSE_DELETE; else if (!strcmp ("depend", p)) result = PRAGMA_OMP_CLAUSE_DEPEND; + else if (!strcmp ("detach", p)) + result = PRAGMA_OACC_CLAUSE_DETACH; else if (!strcmp ("device", p)) result = PRAGMA_OMP_CLAUSE_DEVICE; else if (!strcmp ("deviceptr", p)) @@ -11675,11 +11679,13 @@ c_parser_omp_var_list_parens (c_parser *parser, enum omp_clause_code kind, } /* OpenACC 2.5: + attach 
(variable-list ) copy ( variable-list ) copyin ( variable-list ) copyout ( variable-list ) create ( variable-list ) delete ( variable-list ) + detach ( variable-list ) present ( variable-list ) */ static tree @@ -11689,6 +11695,9 @@ c_parser_oacc_data_clause (c_parser *parser, pragma_omp_clause c_kind, enum gomp_map_kind kind; switch (c_kind) { +case PRAGMA_OACC_CLAUSE_ATTACH: + kind = GOMP_MAP_ATTACH; + break; case PRAGMA_OACC_CLAUSE_COPY: kind = GOMP_MAP_TOFROM; break; @@ -11704,6 +11713,9 @@ c_parser_oacc_data_clause (c_parser *parser, pragma_omp_clause c_kind, case PRAGMA_OACC_CLAUSE_DELETE: kind = GOMP_MAP_RELEASE; break; +case PRAGMA_OACC_CLAUSE_DETACH: + kind = GOMP_MAP_DETACH; + break; case PRAGMA_OACC_CLAUSE_DEVICE: kind = GOMP_MAP_FORCE_TO; break; @@ -14083,6 +14095,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, clauses); c_name = "auto"; break; + case PRAGMA_OACC_CLAUSE_ATTACH: + clauses = c_parser_oacc_data_clause (parser, c_kind, clauses); + c_name = "attach"; + break; case PRAGMA_OACC_CLAUSE_BIND: clauses = c_parser_oacc_clause_bind (parser, clauses); c_name = "bind"; @@ -14115,6 +14131,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, clauses = c_parser_omp_clause_default (parser, clauses, true); c_name = "default"; break; + case PRAGMA_OACC_CLAUSE_DETACH: + clauses = c_parser_oacc_data_clause (parser, c_kind, clauses); + c_name = "detach"; + break; case PRAGMA_OACC_CLAUSE_DEVICE: clauses = c_parser_oacc_data_clause (parser, c_kind, clauses); c_name = "device"; @@ -14589,7 +14609,8 @@ c_parser_oacc_cache (location_t loc, c_parser *parser) */ #define OACC_DATA_CLAUSE_MASK \ - ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY) \ + ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH) \ + | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE) \ @@ -14773,6 
+14794,7 @@ c_parser_oacc_declare (c_parser *parser) #define OACC_ENTER_DATA_CLAUSE_MASK \ ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC) \ + | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE
[PATCH 1/4] [og8] Attach / Detach generic infrastructure
As mentioned here <https://gcc.gnu.org/ml/gcc-patches/2018-10/msg01641.html>, this patch series adds support for the new attach / detach clauses introduced in OpenACC 2.6 to the C and C++ front ends. There is one notable difference between this patch and the one I posted for trunk. This patch tweaks GOMP_MAP_DEEP_COPY because OG8 has a lot of other map types for acc declare and dynamic arrays. I suspect that change would be required for trunk too, eventually. I've committed this patch to openacc-gcc-8-branch. Cesar 2018-10-30 Cesar Philippidis gcc/ * gimplify.c (gimplify_adjust_omp_clauses): Filter out GOMP_MAP_STRUCT for acc exit data. (gimplify_omp_target_update): Promote GOMP_MAP_DETACH to GOMP_MAP_FORCE_DETACH when the finalize clause is present. * omp-low.c (lower_omp_target): Add support for GOMP_MAP_{ATTACH, DETACH, FORCE_DETACH}. * tree-pretty-print.c (dump_omp_clause): Likewise. gcc/c-family/ * c-pragma.h (enum pragma_omp_clause): Define PRAGMA_OACC_CLAUSE_{ATTACH,DETACH}. include/ * gomp-constants.h (GOMP_MAP_DEEP_COPY): Define. (enum gomp_map_kind): Add GOMP_MAP_{ATTACH, DETACH, FORCE_DETACH}. --- gcc/c-family/c-pragma.h | 2 ++ gcc/gimplify.c | 12 +--- gcc/omp-low.c| 3 +++ gcc/tree-pretty-print.c | 9 + include/gomp-constants.h | 9 + 5 files changed, 32 insertions(+), 3 deletions(-) diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h index 8b392486615..bce915187c1 100644 --- a/gcc/c-family/c-pragma.h +++ b/gcc/c-family/c-pragma.h @@ -131,12 +131,14 @@ enum pragma_omp_clause { /* Clauses for OpenACC. 
*/ PRAGMA_OACC_CLAUSE_ASYNC, + PRAGMA_OACC_CLAUSE_ATTACH, PRAGMA_OACC_CLAUSE_AUTO, PRAGMA_OACC_CLAUSE_BIND, PRAGMA_OACC_CLAUSE_COPY, PRAGMA_OACC_CLAUSE_COPYOUT, PRAGMA_OACC_CLAUSE_CREATE, PRAGMA_OACC_CLAUSE_DELETE, + PRAGMA_OACC_CLAUSE_DETACH, PRAGMA_OACC_CLAUSE_DEVICEPTR, PRAGMA_OACC_CLAUSE_DEVICE_RESIDENT, PRAGMA_OACC_CLAUSE_DEVICE_TYPE, diff --git a/gcc/gimplify.c b/gcc/gimplify.c index fda0d69caf7..9be0b70fc7f 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -9468,7 +9468,8 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p, } } else if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_STRUCT - && code == OMP_TARGET_EXIT_DATA) + && (code == OMP_TARGET_EXIT_DATA + || code == OACC_EXIT_DATA)) remove = true; else if (DECL_SIZE (decl) && TREE_CODE (DECL_SIZE (decl)) != INTEGER_CST @@ -11156,8 +11157,9 @@ gimplify_omp_target_update (tree *expr_p, gimple_seq *pre_p) && omp_find_clause (OMP_STANDALONE_CLAUSES (expr), OMP_CLAUSE_FINALIZE)) { - /* Use GOMP_MAP_DELETE/GOMP_MAP_FORCE_FROM to denote that "finalize" - semantics apply to all mappings of this OpenACC directive. */ + /* Use GOMP_MAP_DELETE, GOMP_MAP_FORCE_DETACH, and + GOMP_MAP_FORCE_FROM to denote that "finalize" semantics apply + to all mappings of this OpenACC directive. 
*/ bool finalize_marked = false; for (tree c = OMP_STANDALONE_CLAUSES (expr); c; c = OMP_CLAUSE_CHAIN (c)) if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP) @@ -11171,6 +11173,10 @@ gimplify_omp_target_update (tree *expr_p, gimple_seq *pre_p) OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_DELETE); finalize_marked = true; break; + case GOMP_MAP_DETACH: + OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_DETACH); + finalize_marked = true; + break; default: /* Check consistency: libgomp relies on the very first data mapping clause being marked, so make sure we did that before diff --git a/gcc/omp-low.c b/gcc/omp-low.c index a219b825488..e559211f413 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -8185,6 +8185,9 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) case GOMP_MAP_DYNAMIC_ARRAY_FORCE_ALLOC: case GOMP_MAP_DYNAMIC_ARRAY_FORCE_PRESENT: case GOMP_MAP_LINK: + case GOMP_MAP_ATTACH: + case GOMP_MAP_DETACH: + case GOMP_MAP_FORCE_DETACH: gcc_assert (is_gimple_omp_oacc (stmt)); break; default: diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c index 05a163d8956..ecbb51646b0 100644 --- a/gcc/tree-pretty-print.c +++ b/gcc/tree-pretty-print.c @@ -778,6 +778,15 @@ dump_omp_clause (pretty_printer *pp, tree clause, int spc, dump_flags_t flags) case GOMP_MAP_DECLARE_DEALLOCATE: pp_string (pp, "declare_deallocate"); break; + case GOMP_MAP_ATTACH: + pp_string (pp, "attach"); + break; + case GOMP_MAP_DETACH: + pp_string (pp, "detach"); + break; + case GOMP_MAP_FORCE_DETACH: + pp_string (pp, "force_detach"); + break; default: gcc_unreachable (); } diff --git a/include/gomp-constants.h b/include/gomp-constants.h index 9ef51c04994..c6cd48805e0 100644 --- a/include/gomp-constants.h +++ b/include/gomp-const
Re: [nvptx] vector length patch series
On 10/5/18 23:22, Tom de Vries wrote: > On 9/18/18 10:04 PM, Cesar Philippidis wrote: >> 591973d3c3a [nvptx] use user-defined vectors when possible > > If I drop this patch, I get the same test results. Can you find a > testcase for which this patch has an effect? I just revisited the vector length patch series, and that patch in particular is bogus and can be safely dropped. From what I can remember, the intent behind that patch is to allow the user to override the default vector length using GOMP_OPENACC_DIM. E.g., #pragma acc parallel loop vector for (...) Here, the nvptx BE defaults to vector length = 32. But I had originally wanted to allow something like GOMP_OPENACC_DIM="1:1:128" ./a.out to use a vector length of 128 in that parallel region. After looking at the rest of the patch series, that's not possible because the nvptx BE hard-codes the vector length to 128 at compile time. This was done because large, multi-warp vector reductions are slow (O(n) vs O(ln n)). Is this patch series OK without that patch? And if so, because that patch series depends on other patches, can the following patches be committed independently? 91e5c13b462 [nvptx] Generalize state propagation and synchronization cb4b27a93e0 [nvptx] Use MAX, MIN, ROUND_UP macros 0af782ae93c [nvptx] Use TARGET_SET_CURRENT_FUNCTION 87cfb384dbe [nvptx] Add axis_dim d1783939d98 [nvptx] Add thread count parm to bar.sync 47e80fa77a5 [nvptx] only use one bar.sync barriers in OpenACC offloaded code dafc9957ee7 [nvptx] Fix whitespace in nvptx_single and nvptx_neuter_pars a4857b94879 [nvptx] make nvptx state propagation function names more generic b4b85f6e0b5 [nvptx] consolidate offloaded function attributes into struct offload_attrs bcdb1e8afac [nvptx] Rename worker_bcast variables oacc_bcast. 34958a0904d [nvptx] update openacc dim macros These patches just refactor code in the nvptx BE. Thanks, Cesar nvptx-vl.tar.gz Description: application/gzip
[PATCH 4/4] [OpenACC] Attach / Detach compiler tests
This patch introduces a couple of compiler tests for the OpenACC attach and detach clauses. Is this OK for trunk after the other patches get approved? Thanks, Cesar 2018-XX-YY Cesar Philippidis gcc/testsuite/ * c-c++-common/goacc/mdc-1.c: New test. * c-c++-common/goacc/mdc-2.c: New test. * g++.dg/goacc/mdc.C: New test. diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-1.c b/gcc/testsuite/c-c++-common/goacc/mdc-1.c new file mode 100644 index 000..c20b94ddbdc --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/mdc-1.c @@ -0,0 +1,54 @@ +/* Test OpenACC's support for manual deep copy, including the attach + and detach clauses. */ + +/* { dg-additional-options "-fdump-tree-omplower" } */ + +void +t1 () +{ + struct foo { +int *a, *b, c, d, *e; + } s; + + int *a, *z; + +#pragma acc enter data copyin(s) + { +#pragma acc data copy(s.a[0:10]) copy(z[0:10]) +{ + s.e = z; +#pragma acc parallel loop attach(s.e) + for (int i = 0; i < 10; i++) +s.a[i] = s.e[i]; + + + a = s.e; +#pragma acc enter data attach(a) +#pragma acc exit data detach(a) +} + +#pragma acc enter data copyin(a) +#pragma acc acc enter data attach(s.e) +#pragma acc exit data detach(s.e) + +#pragma acc data attach(s.e) +{ +} +#pragma acc exit data delete(a) + +#pragma acc exit data detach(a) finalize +#pragma acc exit data detach(s.a) finalize + } +} + +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:s .len: 32.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.tofrom:.z .len: 40.. map.struct:s .len: 1.. map.alloc:s.a .len: 8.. map.tofrom:._1 .len: 40.. map.always_pointer:s.a .pointer assign, bias: 0.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_parallel map.struct:s .len: 1.. map.attach:s.e .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.attach:a .len: 8.." 
1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:a .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:a .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:s.e .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.struct:s .len: 1.. map.attach:s.e .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.release:a .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:a .len: 8.." 1 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:s.a .len: 8.." 1 "omplower" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-2.c b/gcc/testsuite/c-c++-common/goacc/mdc-2.c new file mode 100644 index 000..ebfb99d4caf --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/mdc-2.c @@ -0,0 +1,62 @@ +/* Test OpenACC's support for manual deep copy, including the attach + and detach clauses. */ + +void +t1 () +{ + struct foo { +int *a, *b, c, d, *e; + } s; + + int *a, *z, scalar, **y; + +#pragma acc enter data copyin(s) detach(z) /* { dg-error ".detach. is not valid for" } */ + { +#pragma acc data copy(s.a[0:10]) copy(z[0:10]) +{ + s.e = z; +#pragma acc parallel loop attach(s.e) detach(s.b) /* { dg-error ".detach. is not valid for" } */ + for (int i = 0; i < 10; i++) +s.a[i] = s.e[i]; + + a = s.e; +#pragma acc enter data attach(a) detach(s.c) /* { dg-error ".detach. is not valid for" } */ +#pragma acc exit data detach(a) +} + +#pragma acc enter data attach(z[:5]) /* { dg-error "array section in .attach. 
clause" } */ +/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */ +#pragma acc exit data detach(z[:5]) /* { dg-error "array section in .detach. clause" } */ +/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */ +#pragma acc enter data attach(z[1:]) /* { dg-error "array section in .attach. clause" } */ +/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */ +#pragma acc exit data detach(z[1:]) /* { dg-error "array section in .detach. clause" } */ +/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */ +#pragma acc enter data attach(z[:]) /* { dg-error "array section in .attach. clause" } */ +/* { dg-error "h
[PATCH 3/4] [OpenACC] Attach / Detach C++ FE changes
This patch adds support for attach and detach in the C++ front end. All of the comments for the C FE patch apply here. Arguably, there's not a significant difference between cp_oacc_check_attachments and its C counterpart. However, I decided to keep them separate in case the standard gets updated in the future to support more complicated C++ functionality. Is this patch OK for trunk? I bootstrapped and regression tested it for x86_64 Linux with nvptx offloading. Thanks, Cesar 2018-XX-YY Cesar Philippidis gcc/cp/ * parser.c (cp_parser_omp_clause_name): Scan for attach and detach. (cp_parser_oacc_data_clause): Handle PRAGMA_OACC_CLAUSE_{ATTACH, DETACH}. (cp_parser_oacc_all_clauses): Likewise. (OACC_DATA_CLAUSE_MASK): Add support for attach and detach. (OACC_ENTER_DATA_CLAUSE_MASK): Likewise. (cp_parser_oacc_declare): Likewise. (OACC_KERNELS_CLAUSE_MASK): Likewise. (OACC_PARALLEL_CLAUSE_MASK): Likewise. * semantics.c (handle_omp_array_sections_1): Reject subarrays for attach and detach. (cp_oacc_check_attachments): New function. (finish_omp_clauses): Use it. Also, allow structure fields and class members to appear in OpenACC data clauses. 
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 2533871fb28..a1b6244483b 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -31381,6 +31381,8 @@ cp_parser_omp_clause_name (cp_parser *parser) result = PRAGMA_OMP_CLAUSE_ALIGNED; else if (!strcmp ("async", p)) result = PRAGMA_OACC_CLAUSE_ASYNC; + else if (!strcmp ("attach", p)) + result = PRAGMA_OACC_CLAUSE_ATTACH; break; case 'c': if (!strcmp ("collapse", p)) @@ -31401,6 +31403,8 @@ cp_parser_omp_clause_name (cp_parser *parser) result = PRAGMA_OMP_CLAUSE_DEFAULTMAP; else if (!strcmp ("depend", p)) result = PRAGMA_OMP_CLAUSE_DEPEND; + else if (!strcmp ("detach", p)) + result = PRAGMA_OACC_CLAUSE_DETACH; else if (!strcmp ("device", p)) result = PRAGMA_OMP_CLAUSE_DEVICE; else if (!strcmp ("deviceptr", p)) @@ -31767,11 +31771,13 @@ cp_parser_omp_var_list (cp_parser *parser, enum omp_clause_code kind, tree list) } /* OpenACC 2.0: + attach ( variable-list ) copy ( variable-list ) copyin ( variable-list ) copyout ( variable-list ) create ( variable-list ) delete ( variable-list ) + detach ( variable-list ) present ( variable-list ) */ static tree @@ -31781,6 +31787,9 @@ cp_parser_oacc_data_clause (cp_parser *parser, pragma_omp_clause c_kind, enum gomp_map_kind kind; switch (c_kind) { +case PRAGMA_OACC_CLAUSE_ATTACH: + kind = GOMP_MAP_ATTACH; + break; case PRAGMA_OACC_CLAUSE_COPY: kind = GOMP_MAP_TOFROM; break; @@ -31796,6 +31805,9 @@ cp_parser_oacc_data_clause (cp_parser *parser, pragma_omp_clause c_kind, case PRAGMA_OACC_CLAUSE_DELETE: kind = GOMP_MAP_RELEASE; break; +case PRAGMA_OACC_CLAUSE_DETACH: + kind = GOMP_MAP_DETACH; + break; case PRAGMA_OACC_CLAUSE_DEVICE: kind = GOMP_MAP_FORCE_TO; break; @@ -33776,6 +33788,10 @@ cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask, clauses, here); c_name = "auto"; break; + case PRAGMA_OACC_CLAUSE_ATTACH: + clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses); + c_name = "attach"; + break; case PRAGMA_OACC_CLAUSE_COLLAPSE: clauses = 
cp_parser_omp_clause_collapse (parser, clauses, here); c_name = "collapse"; @@ -33804,6 +33820,10 @@ cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask, clauses = cp_parser_omp_clause_default (parser, clauses, here, true); c_name = "default"; break; + case PRAGMA_OACC_CLAUSE_DETACH: + clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses); + c_name = "detach"; + break; case PRAGMA_OACC_CLAUSE_DEVICE: clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses); c_name = "device"; @@ -36809,10 +36829,12 @@ cp_parser_oacc_cache (cp_parser *parser, cp_token *pragma_tok) structured-block */ #define OACC_DATA_CLAUSE_MASK \ - ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY) \ + ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH) \ + | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE) \ + | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DETACH) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT) ) @@ -37012,6 +37034,7 @@ cp_parser_oacc_declare (cp_parser *parser, cp_token *pragma_tok) #define OACC_ENTER_DATA_CLAUSE_MASK \ ( (OMP_CLAUSE_MASK_1 &l
[PATCH 2/4] [OpenACC] Attach / Detach C FE changes
This patch adds support for attach and detach in the C front end. Both attach and detach are a little different from the other data clauses because they require variables that are pointers. Consequently, this patch teaches handle_omp_array_sections_1 to bail out if it detects a subarray argument for attach or detach. Likewise, c_finish_omp_clauses calls c_oacc_check_attachments to ensure that the variable is a pointer. Is this patch OK for trunk? I bootstrapped and regression tested it for x86_64 Linux with nvptx offloading. Thanks, Cesar 2018-XX-YY Cesar Philippidis gcc/c/ * c-parser.c (c_parser_omp_clause_name): Scan for attach and detach. (c_parser_oacc_data_clause): Handle PRAGMA_OACC_CLAUSE_{ATTACH, DETACH}. (c_parser_oacc_all_clauses): Likewise. (OACC_DATA_CLAUSE_MASK): Add support for attach and detach. (OACC_ENTER_DATA_CLAUSE_MASK): Likewise. (OACC_KERNELS_CLAUSE_MASK): Likewise. (OACC_PARALLEL_CLAUSE_MASK): Likewise. * c-typeck.c (handle_omp_array_sections_1): Reject subarrays for attach and detach. (c_oacc_check_attachments): New function. (c_finish_omp_clauses): Use it. 
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index ee66ce89b5d..749a7f946ce 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -11396,6 +11396,8 @@ c_parser_omp_clause_name (c_parser *parser) result = PRAGMA_OMP_CLAUSE_ALIGNED; else if (!strcmp ("async", p)) result = PRAGMA_OACC_CLAUSE_ASYNC; + else if (!strcmp ("attach", p)) + result = PRAGMA_OACC_CLAUSE_ATTACH; break; case 'c': if (!strcmp ("collapse", p)) @@ -11418,6 +11420,8 @@ c_parser_omp_clause_name (c_parser *parser) result = PRAGMA_OACC_CLAUSE_DELETE; else if (!strcmp ("depend", p)) result = PRAGMA_OMP_CLAUSE_DEPEND; + else if (!strcmp ("detach", p)) + result = PRAGMA_OACC_CLAUSE_DETACH; else if (!strcmp ("device", p)) result = PRAGMA_OMP_CLAUSE_DEVICE; else if (!strcmp ("deviceptr", p)) @@ -11818,11 +11822,13 @@ c_parser_omp_var_list_parens (c_parser *parser, enum omp_clause_code kind, } /* OpenACC 2.0: + attach (variable-list ) copy ( variable-list ) copyin ( variable-list ) copyout ( variable-list ) create ( variable-list ) delete ( variable-list ) + detach ( variable-list ) present ( variable-list ) */ static tree @@ -11832,6 +11838,9 @@ c_parser_oacc_data_clause (c_parser *parser, pragma_omp_clause c_kind, enum gomp_map_kind kind; switch (c_kind) { +case PRAGMA_OACC_CLAUSE_ATTACH: + kind = GOMP_MAP_ATTACH; + break; case PRAGMA_OACC_CLAUSE_COPY: kind = GOMP_MAP_TOFROM; break; @@ -11847,6 +11856,9 @@ c_parser_oacc_data_clause (c_parser *parser, pragma_omp_clause c_kind, case PRAGMA_OACC_CLAUSE_DELETE: kind = GOMP_MAP_RELEASE; break; +case PRAGMA_OACC_CLAUSE_DETACH: + kind = GOMP_MAP_DETACH; + break; case PRAGMA_OACC_CLAUSE_DEVICE: kind = GOMP_MAP_FORCE_TO; break; @@ -14072,6 +14084,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, clauses); c_name = "auto"; break; + case PRAGMA_OACC_CLAUSE_ATTACH: + clauses = c_parser_oacc_data_clause (parser, c_kind, clauses); + c_name = "attach"; + break; case PRAGMA_OACC_CLAUSE_COLLAPSE: clauses = c_parser_omp_clause_collapse (parser, 
clauses); c_name = "collapse"; @@ -14100,6 +14116,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, clauses = c_parser_omp_clause_default (parser, clauses, true); c_name = "default"; break; + case PRAGMA_OACC_CLAUSE_DETACH: + clauses = c_parser_oacc_data_clause (parser, c_kind, clauses); + c_name = "detach"; + break; case PRAGMA_OACC_CLAUSE_DEVICE: clauses = c_parser_oacc_data_clause (parser, c_kind, clauses); c_name = "device"; @@ -14558,7 +14578,8 @@ c_parser_oacc_cache (location_t loc, c_parser *parser) */ #define OACC_DATA_CLAUSE_MASK \ - ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY) \ + ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH) \ + | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE) \ @@ -14741,6 +14762,7 @@ c_parser_oacc_declare (c_parser *parser) #define OACC_ENTER_DATA_CLAUSE_MASK \ ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC) \ + | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) ) @@ -14750,6 +14772,7 @@ c_parser_oacc_declare (c_parse
[PATCH 1/4] [OpenACC] Attach / Detach generic infrastructure
This patch series adds support for the new attach / detach clauses introduced in OpenACC 2.6 to the C and C++ front ends. Julian is working on patches for the Fortran front end along with the runtime. As their names somewhat imply, attach and detach are new data clauses that are used to support manual deep copy in OpenACC. Specifically, OpenACC 2.6 allows users to specify individual structure fields inside data clauses, whereas before that would only work inside the update directive. The attach and detach clauses allow users to update the pointers in structure fields with their on-device counterparts. As an example, consider the following code: struct { int *a, b } s; int *z = ... #pragma acc enter data copyin(a[:N], s) ... s.a = z; #pragma acc enter data attach(s.a) ... #pragma acc exit data detach(s.a) #pragma acc exit data copyout(s) Because the attach clause updates field s.a with the device address, "acc exit data detach" must be used to restore the host pointer contents before that value is copied back to the host. This patch in particular adds the generic infrastructure for the attach and detach clauses. All of the front ends lower the attach and detach clauses as GOMP_MAP_ATTACH and GOMP_MAP_DETACH data mappings, respectively. However, if a detachment is finalized, e.g. #pragma acc exit data finalize detach(ptr) the gimplifier will promote it to GOMP_MAP_FORCE_DETACH. Also, this patch teaches the gimplifier how to ignore GOMP_MAP_STRUCT for the target update constructs. Is this patch OK for trunk? I bootstrapped and regression tested it for x86_64 Linux with nvptx offloading. Thanks, Cesar 2018-XX-YY Cesar Philippidis gcc/ * gimplify.c (gimplify_adjust_omp_clauses): Filter out GOMP_MAP_STRUCT for acc exit data. (gimplify_omp_target_update): Promote GOMP_MAP_DETACH to GOMP_MAP_FORCE_DETACH when the finalize clause is present. * omp-low.c (lower_omp_target): Add support for GOMP_MAP_{ATTACH, DETACH, FORCE_DETACH}. * tree-pretty-print.c (dump_omp_clause): Likewise. 
gcc/c-family/ * c-pragma.h (enum pragma_omp_clause): Define PRAGMA_OACC_CLAUSE_{ATTACH,DETACH}. include/ * gomp-constants.h (GOMP_MAP_DEEP_COPY): Define. (enum gomp_map_kind): Add GOMP_MAP_{ATTACH, DETACH, FORCE_DETACH}. diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h index b322547b11a..ab4c03b21f1 100644 --- a/gcc/c-family/c-pragma.h +++ b/gcc/c-family/c-pragma.h @@ -131,11 +131,13 @@ enum pragma_omp_clause { /* Clauses for OpenACC. */ PRAGMA_OACC_CLAUSE_ASYNC, + PRAGMA_OACC_CLAUSE_ATTACH, PRAGMA_OACC_CLAUSE_AUTO, PRAGMA_OACC_CLAUSE_COPY, PRAGMA_OACC_CLAUSE_COPYOUT, PRAGMA_OACC_CLAUSE_CREATE, PRAGMA_OACC_CLAUSE_DELETE, + PRAGMA_OACC_CLAUSE_DETACH, PRAGMA_OACC_CLAUSE_DEVICEPTR, PRAGMA_OACC_CLAUSE_DEVICE_RESIDENT, PRAGMA_OACC_CLAUSE_FINALIZE, diff --git a/gcc/gimplify.c b/gcc/gimplify.c index 509fc2f3f5b..ead412e3f6f 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -9145,7 +9145,8 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p, } } else if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_STRUCT - && code == OMP_TARGET_EXIT_DATA) + && (code == OMP_TARGET_EXIT_DATA + || code == OACC_EXIT_DATA)) remove = true; else if (DECL_SIZE (decl) && TREE_CODE (DECL_SIZE (decl)) != INTEGER_CST @@ -11001,8 +11002,9 @@ gimplify_omp_target_update (tree *expr_p, gimple_seq *pre_p) && omp_find_clause (OMP_STANDALONE_CLAUSES (expr), OMP_CLAUSE_FINALIZE)) { - /* Use GOMP_MAP_DELETE/GOMP_MAP_FORCE_FROM to denote that "finalize" - semantics apply to all mappings of this OpenACC directive. */ + /* Use GOMP_MAP_DELETE, GOMP_MAP_FORCE_DETACH, and + GOMP_MAP_FORCE_FROM to denote that "finalize" semantics apply + to all mappings of this OpenACC directive. 
*/ bool finalize_marked = false; for (tree c = OMP_STANDALONE_CLAUSES (expr); c; c = OMP_CLAUSE_CHAIN (c)) if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP) @@ -11016,6 +11018,10 @@ gimplify_omp_target_update (tree *expr_p, gimple_seq *pre_p) OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_DELETE); finalize_marked = true; break; + case GOMP_MAP_DETACH: + OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_DETACH); + finalize_marked = true; + break; default: /* Check consistency: libgomp relies on the very first data mapping clause being marked, so make sure we did that before diff --git a/gcc/omp-low.c b/gcc/omp-low.c index bbcbc121bae..f5ee117887f 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -7581,6 +7581,9 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) case GOMP_MAP_FORCE_DEVICEPTR: case GOMP_MAP_DEVICE_RESIDENT: case GOMP_MAP_LINK: + case GOMP_MAP_ATTACH: + case GOMP_MAP_DETACH: + case GOMP_MAP_FORCE_DETACH: gcc_assert (is_gimple_omp_oacc (stmt)); break; default: diff --git a/gcc/t
[OpenACC] initial manual deep copy in c
I've pushed the attach patch to my github trunk-acc-mdc branch which enables OpenMP 4.5 deep copy semantics in OpenACC data clauses in C. Now GCC accepts data clauses of the form #pragma acc data copy(v.a[:n], v.b) I think there are a couple of limitations in OpenMP that are going to force me to introduce a new GOMP_MAP_ACC_STRUCT map kind. Basically, GOMP_MAP_STRUCT reserves the minimum amount of device storage to the member actually used in a struct. OpenACC allows the users to dynamically attach and detach struct members, so GOMP_MAP_ACC_STRUCT would need to reserve enough memory for the entire struct. This is also necessary for cases like this struct { int *a, b, *c; } v; #pragma acc data copy(v.b) { #pragma acc parallel copy(v.a[:n], v.c[:n]) } If the acc data directive is replaced with omp target data, and the acc parallel replaced with omp target something, then the runtime would crash because struct v has been partially mapped already. Going forward, OpenACC 2.6 requires the runtime to maintain an attachment counter to keep track of whether struct fields have been mapped. So that's another justification for the GOMP_MAP_ACC_STRUCT type. This is all an early work in progress. I'm still experimenting with some other functionality. If you check out that branch, beware it may be rebased. Cesar [OpenACC] Initial Manual Deep Copy 2018-10-02 Cesar Philippidis gcc/c/ * c-typeck.c (handle_omp_array_sections_1): Enable structs in acc data clauses. (c_finish_omp_clauses): Likewise. libgomp/ * libgomp.h: Declare gomp_map_val. * oacc-parallel.c (GOACC_parallel_keyed): Use it to set devaddrs. * target.c (gomp_map_val): Remove static inline. * testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c: New test. 
diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index 9d09b8d65fd..0428f48952a 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -12605,7 +12605,6 @@ handle_omp_array_sections_1 (tree c, tree t, vec , return error_mark_node; } if (TREE_CODE (t) == COMPONENT_REF - && ort == C_ORT_OMP && (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_TO || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FROM)) @@ -13799,7 +13798,6 @@ c_finish_omp_clauses (tree clauses, enum c_omp_region_type ort) break; } if (TREE_CODE (t) == COMPONENT_REF - && (ort & C_ORT_OMP) && OMP_CLAUSE_CODE (c) != OMP_CLAUSE__CACHE_) { if (DECL_BIT_FIELD (TREE_OPERAND (t, 1))) diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 3a8cc2bd7d6..553d1bb81ba 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -996,6 +996,7 @@ extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *); extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int); extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *, unsigned short *); +extern uintptr_t gomp_map_val (struct target_mem_desc *, void **, size_t); extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *, size_t, void **, void **, diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c index b80ace58590..fd5bbfbdf7d 100644 --- a/libgomp/oacc-parallel.c +++ b/libgomp/oacc-parallel.c @@ -231,8 +231,7 @@ GOACC_parallel_keyed (int device, void (*fn) (void *), devaddrs = gomp_alloca (sizeof (void *) * mapnum); for (i = 0; i < mapnum; i++) -devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start - + tgt->list[i].key->tgt_offset); +devaddrs[i] = (void *) gomp_map_val (tgt, hostaddrs, i); acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, async, dims, tgt); diff --git a/libgomp/target.c b/libgomp/target.c index dda041cdbef..a87ba7cad0e 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -457,7 +457,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, 
splay_tree_key n, (void *) cur_node.host_end); } -static inline uintptr_t +uintptr_t gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i) { if (tgt->list[i].key != NULL) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c new file mode 100644 index 000..d489cc645cd --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c @@ -0,0 +1,25 @@ +#include +#include + +struct dc +{ + int a; + int *b; +}; + +int +main () +{ + int n = 100, i; + struct dc v = { .a = 3, .b = (int *) malloc (sizeof (int) * n) }; + +#pragma omp target teams distribute parallel for map(tofrom:v.a, v.b[:n]) +#pragma acc parallel loop copy(v.a, v.b[:n]) + for (i = 0; i < n; i++) +v.b[i] = v.a; + + for (i = 0; i < 10; i++) +printf ("%d: %d\n", i, v.b[i]); + + return 0; +}
[patch,openacc] Add warning for unused acc routine parallelism
This patch teaches omp-general to be a little more verbose when it comes to reporting the missing usage of gang, worker, and vector clauses on acc routines. As before, the Fortran FE does this directly so that it can handle modules. Therefore, this primarily handles the C and C++ cases (although certain Fortran routines fall through to this). Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. This patch only touches the OpenACC code path. Thanks, Cesar [OpenACC] Add warning for unused acc routine parallelism (was [OpenACC] Don't error on implicitly private induction variables in gfortran) 2018-XX-YY Cesar Philippidis gcc/ * omp-general.c (oacc_verify_routine_clauses): New warning. gcc/testsuite/ * c-c++-common/goacc-gomp/nesting-fail-1.c: Update test. * c-c++-common/goacc/Wparentheses-1.c: Likewise. * c-c++-common/goacc/builtin-goacc-parlevel-id-size-2.c: Likewise. * c-c++-common/goacc/builtin-goacc-parlevel-id-size.c: Likewise. * c-c++-common/goacc/nesting-fail-1.c: Likewise. * c-c++-common/goacc/routine-1.c: Likewise. * c-c++-common/goacc/routine-level-of-parallelism-1.c: Likewise. * c-c++-common/goacc/routine-level-of-parallelism-2.c: Likewise. * c-c++-common/goacc/routine-nohost-1.c: Likewise. * c-c++-common/goacc/routine-nohost-2.c: Likewise. * g++.dg/goacc/routine-1.C: Likewise. * g++.dg/goacc/routine-2.C: Likewise. * gfortran.dg/goacc/pr72741-2.f: Likewise. * gfortran.dg/goacc/routine-9.f90: Likewise. * gfortran.dg/goacc/routine-without-clauses.f90: New test. libgomp/ * testsuite/libgomp.oacc-c-c++-common/declare-2.c: Update test. * testsuite/libgomp.oacc-c-c++-common/declare-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/declare-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/host_data-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/mode-transitions.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-loop-2.h: Likewise. 
* testsuite/libgomp.oacc-c-c++-common/routine-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-3.c: Likewise. (cherry picked from gomp-4_0-branch r244980) --- gcc/omp-general.c | 6 ++- .../c-c++-common/goacc-gomp/nesting-fail-1.c | 4 +- .../c-c++-common/goacc/Wparentheses-1.c | 4 +- .../goacc/builtin-goacc-parlevel-id-size-2.c | 2 + .../goacc/builtin-goacc-parlevel-id-size.c| 2 + .../c-c++-common/goacc/nesting-fail-1.c | 2 +- gcc/testsuite/c-c++-common/goacc/routine-1.c | 4 ++ .../goacc/routine-level-of-parallelism-1.c| 8 ++-- .../goacc/routine-level-of-parallelism-2.c| 34 .../c-c++-common/goacc/routine-nohost-1.c | 20 +- .../c-c++-common/goacc/routine-nohost-2.c | 40 +-- gcc/testsuite/g++.dg/goacc/routine-1.C| 6 +-- gcc/testsuite/g++.dg/goacc/routine-2.C| 10 ++--- gcc/testsuite/gfortran.dg/goacc/pr72741-2.f | 4 +- gcc/testsuite/gfortran.dg/goacc/routine-9.f90 | 22 +- .../goacc/routine-without-clauses.f90 | 34 .../libgomp.oacc-c-c++-common/declare-2.c | 4 +- .../libgomp.oacc-c-c++-common/declare-3.c | 2 +- .../libgomp.oacc-c-c++-common/declare-4.c | 2 +- .../libgomp.oacc-c-c++-common/host_data-1.c | 2 +- .../loop-dim-default.c| 2 +- .../mode-transitions.c| 2 +- .../parallel-loop-2.h | 2 +- .../libgomp.oacc-c-c++-common/routine-1.c | 2 +- .../libgomp.oacc-c-c++-common/routine-3.c | 2 +- 25 files changed, 132 insertions(+), 90 deletions(-) create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-without-clauses.f90 diff --git a/gcc/omp-general.c b/gcc/omp-general.c index 5c91ce73a50..d290766329f 100644 --- a/gcc/omp-general.c +++ b/gcc/omp-general.c @@ -613,8 +613,10 @@ oacc_verify_routine_clauses (tree fndecl, tree *clauses, location_t loc, } if (c_level == NULL_TREE) { - /* OpenACC 2.5 makes this an error; for the current OpenACC 2.0a - implementation add an implicit "seq" clause. */ + /* OpenACC 2.5 expects the user to supply one parallelism clause. 
*/ + warning_at (loc, 0, "expecting one of %, %, % " + "or % clauses"); + inform (loc, "assigning % parallelism to this routine"); c_level = build_omp_clause (loc, OMP_CLAUSE_SEQ); OMP_CLAUSE_CHAIN (c_level) = *clauses; *clauses = c_level; diff --git a/gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c b/gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c index 1a3324200e2..57eaa0296d6 100644 --- a/gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c +++ b/gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c @@ -362,7 +362,7 @@ f_acc_data (void) } } -#pragma acc routine +#pragma acc routine seq void f_acc_loop (void)
[patch,openacc] Check for sufficient parallelism when calling acc routines in Fortran
This patch updates the Fortran FE OpenACC routine parser to enforce the new OpenACC 2.5 routine directive semantics. In addition to emitting a warning when the user doesn't specify a gang, worker or vector clause, it also clarifies some error messages and introduces a new error when the user tries to use an acc routine with insufficient parallelism, e.g., calling a gang routine from a vector loop. Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar [OpenACC] Check for sufficient parallelism when calling acc routines in fortran 2018-XX-YY Cesar Philippidis gcc/fortran/ * gfortran.h (gfc_resolve_oacc_routine_call): Declare. (gfc_resolve_oacc_routines): Declare. * openmp.c (gfc_match_oacc_routine): Make error reporting more precise. Defer rejection of non-function and subroutine symbols until gfc_resolve_oacc_routines. (struct fortran_omp_context): Add a dims member. (gfc_resolve_oacc_blocks): Update ctx->dims. (gfc_resolve_oacc_routine_call): New function. (gfc_resolve_oacc_routines): New function. * resolve.c (resolve_function): Call gfc_resolve_oacc_routine_call. (resolve_call): Likewise. (resolve_codes): Call gfc_resolve_oacc_routines. gcc/testsuite/ * gfortran.dg/goacc/routine-10.f90: New test. * gfortran.dg/goacc/routine-9.f90: New test. * gfortran.dg/goacc/routine-nested-parallelism.f: New test. * gfortran.dg/goacc/routine-nested-parallelism.f90: New test. 
(cherry picked from gomp-4_0-branch r239784) (cherry picked from gomp-4_0-branch r247353) --- gcc/fortran/gfortran.h| 2 + gcc/fortran/openmp.c | 108 +- gcc/fortran/resolve.c | 11 + .../gfortran.dg/goacc/routine-10.f90 | 6 + gcc/testsuite/gfortran.dg/goacc/routine-9.f90 | 96 + .../goacc/routine-nested-parallelism.f| 340 ++ .../goacc/routine-nested-parallelism.f90 | 340 ++ 7 files changed, 887 insertions(+), 16 deletions(-) create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-10.f90 create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-9.f90 create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-nested-parallelism.f create mode 100644 gcc/testsuite/gfortran.dg/goacc/routine-nested-parallelism.f90 diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h index 781dc2a7d17..87f98bbd110 100644 --- a/gcc/fortran/gfortran.h +++ b/gcc/fortran/gfortran.h @@ -3166,6 +3166,8 @@ void gfc_resolve_oacc_directive (gfc_code *, gfc_namespace *); void gfc_resolve_oacc_declare (gfc_namespace *); void gfc_resolve_oacc_parallel_loop_blocks (gfc_code *, gfc_namespace *); void gfc_resolve_oacc_blocks (gfc_code *, gfc_namespace *); +void gfc_resolve_oacc_routine_call (gfc_symbol *, locus *); +void gfc_resolve_oacc_routines (gfc_namespace *); /* expr.c */ void gfc_free_actual_arglist (gfc_actual_arglist *); diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c index 58cbe0ae90c..5850538c1f0 100644 --- a/gcc/fortran/openmp.c +++ b/gcc/fortran/openmp.c @@ -2319,7 +2319,13 @@ gfc_match_oacc_routine (void) { if ((isym = gfc_find_function (buffer)) == NULL && (isym = gfc_find_subroutine (buffer)) == NULL) - st = gfc_find_symtree (gfc_current_ns->sym_root, buffer); + { + st = gfc_find_symtree (gfc_current_ns->sym_root, buffer); + if (st == NULL && gfc_current_ns->proc_name->attr.contained + && gfc_current_ns->parent) + st = gfc_find_symtree (gfc_current_ns->parent->sym_root, + buffer); + } if (st) { sym = st->n.sym; @@ -2327,18 +2333,12 @@ gfc_match_oacc_routine (void) && 
strcmp (sym->name, gfc_current_ns->proc_name->name) == 0) sym = NULL; } - - if ((isym == NULL && st == NULL) - || (sym - && !sym->attr.external - && !sym->attr.function - && !sym->attr.subroutine)) + else if (isym == NULL) { - gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C, " - "invalid function name %s", - (sym) ? sym->name : buffer); - gfc_current_locus = old_loc; - return MATCH_ERROR; + gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %L, " + "invalid function name %qs", _loc, buffer);\ + goto cleanup; + } /* Set sym to NULL if it matches the current procedure's @@ -2371,20 +2371,27 @@ gfc_match_oacc_routine (void) dims = gfc_oacc_routine_dims (c); if (dims == OACC_FUNCTION_NONE) { - gfc_error ("Multiple loop axes specified in !$ACC ROUTINE at %C"); + gfc_error ("Multiple loop axes specified in !$ACC ROUTINE at %L", + _loc); /* Don't abort early, because it's important to let the user know of any potential duplicate routine directives
[patch,openacc] Check clauses with intrinsic function specified in !$ACC ROUTINE ( NAME )
This patch allows Fortran intrinsic functions to be declared as acc routines. For instance, abort can now be called from acc within offloaded regions. Given that intrinsic functions like sin and cos are important for offloaded functions, I wonder if there is a better way to accomplish this. Maybe certain intrinsic functions should default to having an implied acc routine directive. But I suppose that's something for another patch. Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar [PR fortran/72741] Check clauses with intrinsic function specified in !$ACC ROUTINE ( NAME ) 2018-XX-YY Cesar Philippidis gcc/fortran/ * openmp.c (gfc_match_oacc_routine): Check clauses of intrinsic functions. gcc/testsuite/ * gfortran.dg/goacc/fixed-1.f: Update test. * gfortran.dg/goacc/pr72741-2.f: New test. * gfortran.dg/goacc/pr72741-intrinsic-1.f: New test. * gfortran.dg/goacc/pr72741-intrinsic-2.f: New test. * gfortran.dg/goacc/pr72741.f90: Update test. libgomp/ * testsuite/libgomp.oacc-fortran/abort-1.f90: Update test. * testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f: Update test. 
(cherry picked from gomp-4_0-branch r239422) (cherry picked from gomp-4_0-branch r239515, and r247954) --- gcc/fortran/openmp.c | 41 +++ gcc/testsuite/gfortran.dg/goacc/fixed-1.f | 2 + gcc/testsuite/gfortran.dg/goacc/pr72741-2.f | 39 ++ .../gfortran.dg/goacc/pr72741-intrinsic-1.f | 16 .../gfortran.dg/goacc/pr72741-intrinsic-2.f | 22 ++ gcc/testsuite/gfortran.dg/goacc/pr72741.f90 | 20 +++-- .../libgomp.oacc-fortran/abort-1.f90 | 1 + .../libgomp.oacc-fortran/acc_on_device-1-2.f | 1 + 8 files changed, 130 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gfortran.dg/goacc/pr72741-2.f create mode 100644 gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-1.f create mode 100644 gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-2.f diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c index 60ecaf54523..58cbe0ae90c 100644 --- a/gcc/fortran/openmp.c +++ b/gcc/fortran/openmp.c @@ -2288,8 +2288,9 @@ match gfc_match_oacc_routine (void) { locus old_loc; - gfc_symbol *sym = NULL; match m; + gfc_intrinsic_sym *isym = NULL; + gfc_symbol *sym = NULL; gfc_omp_clauses *c = NULL; gfc_oacc_routine_name *n = NULL; oacc_function dims; @@ -2311,12 +2312,14 @@ gfc_match_oacc_routine (void) if (m == MATCH_YES) { char buffer[GFC_MAX_SYMBOL_LEN + 1]; - gfc_symtree *st; + gfc_symtree *st = NULL; m = gfc_match_name (buffer); if (m == MATCH_YES) { - st = gfc_find_symtree (gfc_current_ns->sym_root, buffer); + if ((isym = gfc_find_function (buffer)) == NULL + && (isym = gfc_find_subroutine (buffer)) == NULL) + st = gfc_find_symtree (gfc_current_ns->sym_root, buffer); if (st) { sym = st->n.sym; @@ -2325,7 +2328,7 @@ gfc_match_oacc_routine (void) sym = NULL; } - if (st == NULL + if ((isym == NULL && st == NULL) || (sym && !sym->attr.external && !sym->attr.function @@ -2337,6 +2340,13 @@ gfc_match_oacc_routine (void) gfc_current_locus = old_loc; return MATCH_ERROR; } + + /* Set sym to NULL if it matches the current procedure's + name. 
This will simplify the check for duplicate ACC + ROUTINE attributes. */ + if (gfc_current_ns->proc_name + && !strcmp (buffer, gfc_current_ns->proc_name->name)) + sym = NULL; } else { @@ -2357,15 +2367,30 @@ gfc_match_oacc_routine (void) != MATCH_YES)) return MATCH_ERROR; + /* Scan for invalid routine geometry. */ dims = gfc_oacc_routine_dims (c); if (dims == OACC_FUNCTION_NONE) { - gfc_error ("Multiple loop axes specified for routine %C"); - gfc_current_locus = old_loc; - return MATCH_ERROR; + gfc_error ("Multiple loop axes specified in !$ACC ROUTINE at %C"); + + /* Don't abort early, because it's important to let the user + know of any potential duplicate routine directives. */ + seen_error = true; } - if (sym != NULL) + if (isym != NULL) +{ + if (c && (c->gang || c->worker || c->vector)) + { + gfc_error ("Intrinsic symbol specified in !$ACC ROUTINE ( NAME ) " + "at %C, with incompatible clauses specifying the level " + "of parallelism"); + goto cleanup; + } + /* The intrinsic symbol has been marked with a SEQ, or with no clause at + all, which is OK. */ +} + else if (sym != NULL) { bool needs_entry = true; diff --git a/gcc/testsuite/gfortran.dg/goacc/fixed-1.f b/gcc/testsuite/gfortran.dg/goacc/fixed-1.f index 974f2702260..3a900c5b4e6 100644 --- a/gcc/testsuite/gfortran.dg/goacc/fixed
[patch,openacc] Repeated use of the OpenACC routine directive
This is another patch that teaches the C and C++ front ends to emit more errors involving acc routine clauses. In retrospect, I should have merged it together with the patch I posted here <https://gcc.gnu.org/ml/gcc-patches/2018-10/msg00089.html>, however at the time I thought it would make the patch too large. Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. This patch is also self-contained to the OpenACC code path. Thanks, Cesar [OpenACC] Repeated use of the OpenACC routine directive 2018-XX-YY Thomas Schwinge Cesar Philippidis gcc/ * omp-general.h (oacc_verify_routine_clauses): Declare. * omp-general.c (oacc_verify_routine_clauses): Change formal parameters. Add checking if already marked as an accelerator routine. Adjust all users. gcc/c/ * c-parser.c (c_finish_oacc_routine): Rework checking if already marked as an accelerator routine. gcc/cp/ * parser.c (cp_finalize_oacc_routine): Rework checking if already marked as an accelerator routine. gcc/testsuite/ * c-c++-common/goacc/routine-1.c: Update tests. * c-c++-common/goacc/routine-5.c: Likewise. * c-c++-common/goacc/routine-level-of-parallelism-1.c: Likewise. * c-c++-common/goacc/routine-level-of-parallelism-2.c: New test. * c-c++-common/goacc/routine-nohost-1.c: Update tests. * c-c++-common/goacc/routine-nohost-2.c: New test. 
(cherry picked from gomp-4_0-branch r239521) remove bind clause support --- gcc/c/c-parser.c | 46 ++-- gcc/cp/parser.c | 50 ++-- gcc/omp-general.c | 105 +++- gcc/omp-general.h | 3 +- gcc/testsuite/c-c++-common/goacc/routine-1.c | 10 +- gcc/testsuite/c-c++-common/goacc/routine-5.c | 4 +- .../goacc/routine-level-of-parallelism-1.c| 233 -- .../goacc/routine-level-of-parallelism-2.c| 73 ++ .../c-c++-common/goacc/routine-nohost-1.c | 20 ++ .../c-c++-common/goacc/routine-nohost-2.c | 97 10 files changed, 566 insertions(+), 75 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/routine-level-of-parallelism-2.c create mode 100644 gcc/testsuite/c-c++-common/goacc/routine-nohost-2.c diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 187a2dec999..3d5cbe76acf 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -15090,35 +15090,39 @@ c_finish_oacc_routine (struct oacc_routine_data *data, tree fndecl, return; } - oacc_verify_routine_clauses (>clauses, data->loc); - - if (oacc_get_fn_attrib (fndecl)) + int compatible += oacc_verify_routine_clauses (fndecl, >clauses, data->loc, + "#pragma acc routine"); + if (compatible < 0) { - error_at (data->loc, - "%<#pragma acc routine%> already applied to %qD", fndecl); data->error_seen = true; return; } - - if (TREE_USED (fndecl) || (!is_defn && DECL_SAVED_TREE (fndecl))) + if (compatible > 0) { - error_at (data->loc, - TREE_USED (fndecl) - ? G_("%<#pragma acc routine%> must be applied before use") - : G_("%<#pragma acc routine%> must be applied before " - "definition")); - data->error_seen = true; - return; } + else +{ + if (TREE_USED (fndecl) || (!is_defn && DECL_SAVED_TREE (fndecl))) + { + error_at (data->loc, + TREE_USED (fndecl) + ? G_("%<#pragma acc routine%> must be applied before use") + : G_("%<#pragma acc routine%> must be applied before" + " definition")); + data->error_seen = true; + return; + } - /* Process the routine's dimension clauses. 
*/ - tree dims = oacc_build_routine_dims (data->clauses); - oacc_replace_fn_attrib (fndecl, dims); + /* Set the routine's level of parallelism. */ + tree dims = oacc_build_routine_dims (data->clauses); + oacc_replace_fn_attrib (fndecl, dims); - /* Add an "omp declare target" attribute. */ - DECL_ATTRIBUTES (fndecl) -= tree_cons (get_identifier ("omp declare target"), - data->clauses, DECL_ATTRIBUTES (fndecl)); + /* Add an "omp declare target" attribute. */ + DECL_ATTRIBUTES (fndecl) + = tree_cons (get_identifier ("omp declare target"), + data->clauses, DECL_ATTRIBUTES (fndecl)); +} /* Remember that we've used this "#pragma acc routine". */ data->fndecl_seen = true; diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index d56105ca177..0d314d63cfd 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -38260,36 +38260,42 @@ cp_finalize_oacc_routine (cp_parser *parser, tree fndecl, bool is_defn) return; } - oacc_verify_routine_clauses (>oacc_routine->clauses, - parser->oacc_routine->loc); - - if (oacc_get_fn_attr
[patch,openacc] Add support for OpenACC routine nohost clause
Attached is a patch that introduces support for the acc routine nohost clause. Basically, if an acc routine function is marked as nohost, then the compiler does not generate code for the host. It's kind of strange to test for. Basically, we had to use acc_on_device at -O2 so that the host references to the dead function get optimized away. I believe that the nohost clause was added for acc routines to allow offloaded acc code to call vendor libraries, such as cuBLAS, which are only available for specific accelerators. I haven't seen it used much in practice though. Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks Cesar [OpenACC] Add support for OpenACC routine nohost clause (was OpenACC bind, nohost changes) 2018-XX-YY Thomas Schwinge Cesar Philippidis gcc/ * tree-core.h (omp_clause_code): Add OMP_CLAUSE_NOHOST. * tree.c (omp_clause_num_ops, omp_clause_code_name, walk_tree_1): Update for these. * tree-pretty-print.c (dump_omp_clause): Handle OMP_CLAUSE_NOHOST. * gimplify.c (gimplify_scan_omp_clauses) (gimplify_adjust_omp_clauses): Handle OMP_CLAUSE_NOHOST. * tree-nested.c (convert_nonlocal_omp_clauses) (convert_local_omp_clauses): Likewise. * omp-low.c (scan_sharing_clauses): Likewise. * omp-offload.c (maybe_discard_oacc_function): New function. (execute_oacc_device_lower) [!ACCEL_COMPILER]: Handle OpenACC nohost clauses. gcc/c-family/ * c-attribs.c (c_common_attribute_table): Set min_len to -1 for "omp declare target". * c-pragma.h (pragma_omp_clause): Add PRAGMA_OACC_CLAUSE_NOHOST. gcc/c/ * c-parser.c (c_parser_omp_clause_name): Handle "nohost". (c_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_NOHOST. (c_parser_oacc_routine, c_finish_oacc_routine): Update. * c-typeck.c (c_finish_omp_clauses): Handle OMP_CLAUSE_NOHOST. gcc/cp/ * parser.c (cp_parser_omp_clause_name): Handle "nohost". (cp_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_NOHOST. (cp_parser_oacc_routine, cp_finalize_oacc_routine): Update. 
* pt.c (tsubst_omp_clauses): Handle OMP_CLAUSE_NOHOST. * semantics.c (finish_omp_clauses): Handle OMP_CLAUSE_NOHOST. gcc/fortran/ * gfortran.h (gfc_omp_clauses): Add nohost members. * openmp.c (omp_mask2): Add OMP_CLAUSE_NOHOST. (gfc_match_omp_clauses): Handle OMP_CLAUSE_NOHOST. (gfc_match_oacc_routine): Set oacc_function_nohost when appropriate. * gfortran.h (symbol_attribute): Add oacc_function_nohost member. * trans-openmp.c (gfc_add_omp_offload_attributes): Use it to decide whether to generate an OMP_CLAUSE_NOHOST clause. (gfc_trans_omp_clauses_1): Unreachable code to generate an OMP_CLAUSE_NOHOST clause. gcc/testsuite/ * c-c++-common/goacc/classify-routine.c: Adjust test. * c-c++-common/goacc/routine-1.c: Likewise. * c-c++-common/goacc/routine-2.c: Likewise. * c-c++-common/goacc/routine-nohost-1.c: New test. * g++.dg/goacc/routine-2.C: Adjust test. * gfortran.dg/goacc/pr72741.f90: New test. libgomp/ * testsuite/libgomp.oacc-c-c++-common/routine-3.c: New test. * testsuite/libgomp.oacc-c-c++-common/routine-nohost-1.c: New test. * testsuite/libgomp.oacc-c-c++-common/routine-bind-nohost-1.c: Update test. * testsuite/libgomp.oacc-fortran/routine-8.f90: Likewise. 
(cherry picked from gomp-4_0-branch r223007, r226192, r226259, r228915, r228916, and r231423) (cherry picked from gomp-4_0-branch r231973 and r231979) (cherry picked from gomp-4_0-branch r238847) --- gcc/c-family/c-attribs.c | 2 +- gcc/c-family/c-pragma.h | 1 + gcc/c/c-parser.c | 12 +- gcc/c/c-typeck.c | 1 + gcc/cp/parser.c | 13 +-- gcc/cp/pt.c | 1 + gcc/cp/semantics.c| 1 + gcc/fortran/gfortran.h| 3 +- gcc/fortran/openmp.c | 29 +++--- gcc/fortran/trans-openmp.c| 15 +++- gcc/gimplify.c| 2 + gcc/lto/lto.c | 1 + gcc/omp-low.c | 2 + gcc/omp-offload.c | 38 --- .../c-c++-common/goacc/classify-routine.c | 4 +- gcc/testsuite/c-c++-common/goacc/routine-1.c | 8 gcc/testsuite/c-c++-common/goacc/routine-2.c | 8 ++-- .../c-c++-common/goacc/routine-nohost-1.c | 28 ++ gcc/testsuite/g++.dg/goacc/routine-2.C| 9 + gcc/testsuite/gfortran.dg/goacc/pr72741.f90 | 30 +++ gcc/tree-core.h | 3 ++ gcc/tree-nested.c | 4 ++ gcc/tree-pretty-print.c | 3 ++ gcc/tree.c| 3 ++ .../libgomp.oacc-c-c++-common/routine-3.c | 33 .../routine-nohost-1.c| 18 +
[patch,openacc] Use oacc_verify_routine_clauses for C/C++
This patch introduces a new oacc_verify_routine_clauses function that reports errors if the user abuses the gang, worker and vector clauses for acc routine directives in C/C++. Fortran is a little different, because the FE has its own IR. So, while it would be possible to defer checking for gang, worker, vector parallelism until a tree node is created for a function, we'd still have problems of verifying the parallelism for functions and subroutines defined and declared inside modules. The C and C++ FEs are similar enough that they can share a common function. Is this OK for trunk? I bootstrapped and regression tested it for x86_64 Linux with nvptx offloading. This only touches the OpenACC code path. Cesar [OpenACC] Use oacc_verify_routine_clauses for C/C++ 2018-XX-YY Thomas Schwinge Cesar Philippidis gcc/ * omp-general.c (oacc_build_routine_dims): Move some of its processing into... (oacc_verify_routine_clauses): ... this new function. * omp-general.h (oacc_verify_routine_clauses): New prototype. gcc/c/ * c-parser.c (c_parser_oacc_routine): Normalize order of clauses. (c_finish_oacc_routine): Call oacc_verify_routine_clauses. gcc/cp/ * parser.c (cp_parser_oacc_routine) (cp_parser_late_parsing_oacc_routine): Normalize order of clauses. (cp_finalize_oacc_routine): Call oacc_verify_routine_clauses. gcc/testsuite/ * c-c++-common/goacc/routine-level-of-parallelism-1.c: New test. 
(cherry picked from gomp-4_0-branch r239520) --- gcc/c/c-parser.c | 8 + gcc/cp/parser.c | 9 + gcc/omp-general.c | 69 - gcc/omp-general.h | 1 + .../goacc/routine-level-of-parallelism-1.c| 265 ++ 5 files changed, 342 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/routine-level-of-parallelism-1.c diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 3ca8fe71cc4..3517cb783d9 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -14999,6 +14999,9 @@ c_parser_oacc_routine (c_parser *parser, enum pragma_context context) data.clauses = c_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK, "#pragma acc routine"); + /* The clauses are in reverse order; fix that to make later diagnostic + emission easier. */ + data.clauses = nreverse (data.clauses); if (TREE_CODE (decl) != FUNCTION_DECL) { @@ -15013,6 +15016,9 @@ c_parser_oacc_routine (c_parser *parser, enum pragma_context context) data.clauses = c_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK, "#pragma acc routine"); + /* The clauses are in reverse order; fix that to make later diagnostic + emission easier. */ + data.clauses = nreverse (data.clauses); /* Emit a helpful diagnostic if there's another pragma following this one. Also don't allow a static assertion declaration, as in the @@ -15076,6 +15082,8 @@ c_finish_oacc_routine (struct oacc_routine_data *data, tree fndecl, return; } + oacc_verify_routine_clauses (>clauses, data->loc); + if (oacc_get_fn_attrib (fndecl)) { error_at (data->loc, diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 241226d8c21..fa7ee7798ae 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -38117,6 +38117,9 @@ cp_parser_oacc_routine (cp_parser *parser, cp_token *pragma_tok, = cp_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK, "#pragma acc routine", cp_lexer_peek_token (parser->lexer)); + /* The clauses are in reverse order; fix that to make later diagnostic + emission easier. 
*/ + data.clauses = nreverse (data.clauses); if (decl && is_overloaded_fn (decl) && (TREE_CODE (decl) != FUNCTION_DECL @@ -38213,6 +38216,9 @@ cp_parser_late_parsing_oacc_routine (cp_parser *parser, tree attrs) parser->oacc_routine->clauses = cp_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK, "#pragma acc routine", pragma_tok); + /* The clauses are in reverse order; fix that to make later diagnostic + emission easier. */ + parser->oacc_routine->clauses = nreverse (parser->oacc_routine->clauses); cp_parser_pop_lexer (parser); /* Later, cp_finalize_oacc_routine will process the clauses, and then set fndecl_seen. */ @@ -38247,6 +38253,9 @@ cp_finalize_oacc_routine (cp_parser *parser, tree fndecl, bool is_defn) return; } + oacc_verify_routine_clauses (>oacc_routine->clauses, + parser->oacc_routine->loc); + if (oacc_get_fn_attrib (fndecl)) { error_at (parser->oacc_routine->loc, diff --git a/gcc/omp-general.c b/gcc/omp-general.c index cac6de2..3ea2224957d 100644 --- a/gcc/omp-general.c +++ b/gcc/omp-general.c @@ -559,9 +559,64 @@ oacc_set_fn_attrib (tree fn, tree clauses, vec *args) }
Re: [patch,openacc] C, C++ OpenACC wait diagnostic change
On 09/26/2018 12:50 PM, Joseph Myers wrote: > On Wed, 26 Sep 2018, Cesar Philippidis wrote: > >> Attached is an old patch which updated the C and C++ FEs to use %<)%> >> for the right ')' symbol. It's mostly a cosmetic change. All of the >> changes are self-contained to the OpenACC code path. > > Why is the "before ')'" included in the call to c_parser_error at all? > c_parser_error calls c_parse_error which adds its own " before " and token > description or expansion, so I'd expect the current error to result in a > message ending in something of the form "before X before Y". On closer inspection #pragma acc parallel copyin (a[0:N]) copy (b[0:N]) wait (1 /* { dg-error "expected '\\\)' before end of line" } */ -/* { dg-error "expected integer expression before '\\\)'" "" { target c++ } .-1 } */ +/* { dg-error "expected integer expression list before" "" { target c++ } .-1 } */ so this is only applicable to c++. But in C++ I see duplicate errors like this wait.c:29:29: error: expected ‘)’ before end of line #pragma acc parallel wait (1 ~ ^ ) wait.c:29:29: error: expected integer expression list before ‘)’ before end of line I suppose for C++ that's an improvement over wait.c:29:29: error: expected integer expression before ')' before end of line Julian, I need to start working on deep copy in OpenACC. Can you take over this patch? The error handling code in the C FE needs to be removed because it's dead. Thanks, Cesar
Re: [patch,wip] warn on noncontiguous pointers
On 09/26/2018 01:49 PM, Thomas Koenig wrote: > Hi Cesar, > >> As of GCC 8, gfortran now errors when a pointer with a contiguous >> attribute is set to point to a target without a contiguous attribute. I >> think this is overly strict, and should probably be demoted to a >> pedantic warning as I've done in the attached patch. > > We had a lengthy discussion on that one. Still, we can dig into the > standard for that one. > > J3/10-007 says in 7.2.2.3 Data pointer assignment > > # 7 If the pointer object has the CONTIGUOUS attribute, the pointer > # target shall be contiguous. > > # 9 If bounds-remapping-list is specified, the pointer target shall > # be simply contiguous (6.5.4) or of rank one > > program test > implicit none > real,pointer :: fptr1(:) > real,pointer,contiguous :: fptr3(:,:,:) > > allocate(fptr1(12)) > call random_number(fptr1) > > !Test pointer reshape II > > fptr3(1:2,1:2,1:2) => fptr1(4:) > > end program > > So, by paragraph 9, this would be OK. Let's see what paragraph 7 > means when it says "contiguous". 
5.3.7 says > > An object is contiguous if it is > > # (1) an object with the CONTIGUOUS attribute, > # (2) a nonpointer whole array that is not assumed-shape, > # (3) an assumed-shape array that is argument associated with an > array that is contiguous, > # (4) an array allocated by an ALLOCATE statement, > # (5) a pointer associated with a contiguous target, or > # (6) a nonzero-sized array section (6.5.3) provided that > # (a) its base object is contiguous, > # (b) it does not have a vector subscript, > # (c) the elements of the section, in array element order, are a > # subset of the base object elements that are consecutive in > # array element order, > # (d) if the array is of type character and a substring-range appears, > # the substring-range specifies all of the characters of the > # parent string (6.4.1), > # (e) only its final part-ref has nonzero rank, and > # (f) it is not the real or imaginary part (6.4.4) of an array of type > # complex. > > An object is not contiguous if it is an array subobject, and > > [conditions not relevant elided] > > # It is processor dependent whether any other object is contiguous. > > If we go down the list, we see that fptr1(4:) is not contiguous; it > is not an array (it is a pointer), so (4) also does not apply. > > So, we are in the realm of processor dependent behavior, so we can > chose what to do. > > The last time we discussed this, we agreed on a hard error. One > important argument is that a mistakenly applied contiguous > attribute will lead to wrong code, and that it is quite easy > to check this, as we do now. > > So, I think we should leave the behavior as it is now, and Thank you for the explanation. That all seems reasonable. >> Maybe the ScaTeLib code needs to be updated. > > sounds like a good idea to me. ACK. Thanks, Cesar
[patch,wip] warn on noncontiguous pointers
As of GCC 8, gfortran now errors when a pointer with a contiguous attribute is set to point to a target without a contiguous attribute. I think this is overly strict, and should probably be demoted to a pedantic warning as I've done in the attached patch. I ran into this issue while I was tuning GCC for lsdalton. Specifically, CMake generates (not exactly because I reduced it) the following test case for ScaTeLib to determine if that library can be enabled: program test implicit none real,pointer :: fptr1(:) real,pointer,contiguous :: fptr3(:,:,:) allocate(fptr1(12)) call random_number(fptr1) !Test pointer reshape II fptr3(1:2,1:2,1:2) => fptr1(4:) end program Note how fptr1 doesn't have a contiguous attribute. Does anyone have thoughts on this? Maybe the ScaTeLib code needs to be updated. Thanks, Cesar Disable "Assignment to contiguous pointer from non-contiguous target" error 2018-XX-YY Cesar Philippidis gcc/fortran/ * expr.c (gfc_check_pointer_assign): Demote "Assignment to contiguous pointer from non-contiguous target" to a warning. --- diff --git a/gcc/fortran/expr.c b/gcc/fortran/expr.c index 3315bb840af..74caa4f2d59 100644 --- a/gcc/fortran/expr.c +++ b/gcc/fortran/expr.c @@ -3957,13 +3957,13 @@ gfc_check_pointer_assign (gfc_expr *lvalue, gfc_expr *rvalue) } } - /* Error for assignments of contiguous pointers to targets which is not + /* Warn for assignments of contiguous pointers to targets which is not contiguous. Be lenient in the definition of what counts as contiguous. */ if (lhs_attr.contiguous && !gfc_is_simply_contiguous (rvalue, false, true)) -gfc_error ("Assignment to contiguous pointer from non-contiguous " - "target at %L", >where); +gfc_warning (OPT_Wpedantic, "Assignment to contiguous pointer from " + "non-contiguous target at %L", >where); /* Warn if it is the LHS pointer may lives longer than the RHS target. */ if (warn_target_lifetime -- 2.17.1
[patch,openacc] Use correct location information for OpenACC shape and simple clauses in C/C++
Thomas, this is your old gomp4 patch that updates the error locations for gang, worker and vector clauses. Those functions are parsed in {c,cp}_parser_oacc_shape_clause. I'm not sure how much of an impact this patch has given that it does not require any test suite changes. However, we do have a couple of tests in og8 that haven't been merged to trunk, so perhaps this functionality will be exercised in a later patch series. Unfortunately, there are a lot inter-dependencies between all of the pending og8->trunk patches. Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Cesar [OpenACC] Use correct location information for OpenACC shape and simple clauses in C/C++ 2018-XX-YY Thomas Schwinge Cesar Philippidis gcc/c/ * c-parser.c (c_parser_oacc_shape_clause) (c_parser_oacc_simple_clause): Add loc formal parameter. Adjust all users. gcc/cp/ * parser.c (cp_parser_oacc_shape_clause): Add loc formal parameter. Adjust all users. (cherry picked from gomp-4_0-branch r239519) update fallout for acc finalize / if_present --- diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index b8fc000b50d..d1e9876065a 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -12602,12 +12602,12 @@ c_parser_oacc_single_int_clause (c_parser *parser, omp_clause_code code, */ static tree -c_parser_oacc_shape_clause (c_parser *parser, omp_clause_code kind, +c_parser_oacc_shape_clause (c_parser *parser, location_t loc, + omp_clause_code kind, const char *str, tree list) { const char *id = "num"; tree ops[2] = { NULL_TREE, NULL_TREE }, c; - location_t loc = c_parser_peek_token (parser)->location; if (kind == OMP_CLAUSE_VECTOR) id = "length"; @@ -12739,12 +12739,12 @@ c_parser_oacc_shape_clause (c_parser *parser, omp_clause_code kind, seq */ static tree -c_parser_oacc_simple_clause (c_parser *parser, enum omp_clause_code code, - tree list) +c_parser_oacc_simple_clause (c_parser * /* parser */, location_t loc, + enum omp_clause_code code, tree list) { 
check_no_duplicate_clause (list, code, omp_clause_code_name[code]); - tree c = build_omp_clause (c_parser_peek_token (parser)->location, code); + tree c = build_omp_clause (loc, code); OMP_CLAUSE_CHAIN (c) = list; return c; @@ -14046,8 +14046,8 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, c_name = "async"; break; case PRAGMA_OACC_CLAUSE_AUTO: - clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_AUTO, - clauses); + clauses = c_parser_oacc_simple_clause (parser, here, OMP_CLAUSE_AUTO, + clauses); c_name = "auto"; break; case PRAGMA_OACC_CLAUSE_COLLAPSE: @@ -14091,8 +14091,8 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, c_name = "device_resident"; break; case PRAGMA_OACC_CLAUSE_FINALIZE: - clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_FINALIZE, - clauses); + clauses = c_parser_oacc_simple_clause (parser, here, + OMP_CLAUSE_FINALIZE, clauses); c_name = "finalize"; break; case PRAGMA_OACC_CLAUSE_FIRSTPRIVATE: @@ -14101,7 +14101,7 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, break; case PRAGMA_OACC_CLAUSE_GANG: c_name = "gang"; - clauses = c_parser_oacc_shape_clause (parser, OMP_CLAUSE_GANG, + clauses = c_parser_oacc_shape_clause (parser, here, OMP_CLAUSE_GANG, c_name, clauses); break; case PRAGMA_OACC_CLAUSE_HOST: @@ -14113,13 +14113,15 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, c_name = "if"; break; case PRAGMA_OACC_CLAUSE_IF_PRESENT: - clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_IF_PRESENT, + clauses = c_parser_oacc_simple_clause (parser, here, + OMP_CLAUSE_IF_PRESENT, clauses); c_name = "if_present"; break; case PRAGMA_OACC_CLAUSE_INDEPENDENT: - clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_INDEPENDENT, - clauses); + clauses = c_parser_oacc_simple_clause (parser, here, + OMP_CLAUSE_INDEPENDENT, + clauses); c_name = "independent"; break; case PRAGMA_OACC_CLAUSE_LINK: @@ -14151,7 +14153,7 @@ c_parser_oacc_all_clauses 
(c_parser *parser, omp_clause_mask mask, c_name = "reduction"; break; case PRAGMA_OACC_CLAUSE_SEQ: - clauses = c_parser_oacc_simple_clause (parser, OMP_CLAUSE_SEQ, + clauses = c_parser_oacc_simple_clause (parser, here, OMP_CLAUSE_SEQ, clauses); c_name = "seq"; break; @@ -14165,7 +14167,7 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, break; case PRAGMA_OACC_CLAUSE_VECTOR: c_name = "vector"; - clauses = c_parser_oacc_shape_clause (parser, OMP_CLAUSE_VECTOR, + clauses = c_parser_oacc_shape_clause (parser, here, OMP_CLAUSE_VECTOR,
[patch] nvptx libgcc atomic routines
This patch adds nvptx support for the atomic FETCH_AND_OP functions. I recall that this used to be important for OpenACC reductions back in the GCC 5.0 days before Nathan split reductions into four phases. Nowadays, atomic reductions use a spin lock that's implemented directly by the nvptx BE. Therefore, I'm not sure if the nvptx port still needs support for atomic fetch_and_*. Tom and Thomas, do either of you have any thoughts on this? Should I commit it to trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar nvptx libgcc atomic routines 2018-XX-YY Cesar Philippidis libgcc/ * config/nvptx/atomic.c: New file. * config/nvptx/t-nvptx (LIB2ADD): Include it. (cherry picked from gomp-4_0-branch r223177) --- libgcc/config/nvptx/atomic.c | 279 +++ libgcc/config/nvptx/t-nvptx | 3 +- 2 files changed, 281 insertions(+), 1 deletion(-) create mode 100644 libgcc/config/nvptx/atomic.c diff --git a/libgcc/config/nvptx/atomic.c b/libgcc/config/nvptx/atomic.c new file mode 100644 index 000..ab6cf23ef9d --- /dev/null +++ b/libgcc/config/nvptx/atomic.c @@ -0,0 +1,279 @@ +/* Atomic operations for PTX. + Copyright (C) 2015-2018 Free Software Foundation, Inc. + Contributed by Mentor Graphics. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. 
+ +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +/* Kernel helper for compare-and-exchange. */ +static int +nvidia_cas (int oldval, int newval, int *ptr) +{ + int ret; + + asm volatile ("atom.cas.b32 %0, [%1], %2, %3;" : "=r"(ret) : "r"(ptr), + "r"(oldval), "r"(newval)); + + return ret; +} + +#define __kernel_cmpxchg (nvidia_cas) + +/* Kernel helper for memory barrier. */ +static void +__threadfence_block (void) +{ + asm volatile ("membar.cta;"); +} + +#define __kernel_dmb (__threadfence_block) + +#define HIDDEN + +/* Warning: this assumes that all nvptx targets are little endian. */ + +#define INVERT_MASK_1 0 +#define INVERT_MASK_2 0 + +#define MASK_1 0xffu +#define MASK_2 0xu + +#define FETCH_AND_OP_WORD(OP, PFX_OP, INF_OP)\ + int HIDDEN\ + __sync_fetch_and_##OP##_4 (int *ptr, int val)\ + { \ +int failure, tmp; \ + \ +do {\ + tmp = *ptr; \ + failure = __kernel_cmpxchg (tmp, PFX_OP (tmp INF_OP val), ptr); \ +} while (failure != 0); \ + \ +return tmp;\ + } + +FETCH_AND_OP_WORD (add, , +) +FETCH_AND_OP_WORD (sub, , -) +FETCH_AND_OP_WORD (or,, |) +FETCH_AND_OP_WORD (and, , &) +FETCH_AND_OP_WORD (xor, , ^) +FETCH_AND_OP_WORD (nand, ~, &) + +#define NAME_oldval(OP, WIDTH) __sync_fetch_and_##OP##_##WIDTH +#define NAME_newval(OP, WIDTH) __sync_##OP##_and_fetch_##WIDTH + +/* Implement both __sync__and_fetch and __sync_fetch_and_ for + subword-sized quantities. 
*/ + +#define SUBWORD_SYNC_OP(OP, PFX_OP, INF_OP, TYPE, WIDTH, RETURN) \ + TYPE HIDDEN\ + NAME##_##RETURN (OP, WIDTH) (TYPE *ptr, TYPE val) \ + { \ +int *wordptr = (int *) ((unsigned long) ptr & ~3); \ +unsigned int mask, shift, oldval, newval;\ +int failure; \ + \ +shift = (((unsigned long) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH; \ +mask = MASK_##WIDTH << shift; \ + \ +do {\ + oldval = *wordptr; \ + newval = ((PFX_OP (((oldval & mask) >> shift) \ + INF_OP (unsigned int) val)) << shift) & mask; \ + newval |= oldval & ~mask; \ + failure = __kernel_cmpxchg (oldval, newval, wordptr); \ +} while (failure != 0); \ + \ +return (RETURN & mask) >> shift; \ + } + +SUBWORD_SYNC_OP (add, , +, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (sub, , -, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (or,, |, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (and, , &, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (xor, , ^, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (nand, ~, &, unsigned short, 2, oldval) + +SUBWORD_SYNC_OP (add, , +,
[patch,openacc] Don't gimplify in ssa mode if seen_error in oacc_xform_loop
Again, this is another old gomp4 patch without a corresponding test case. I'm not familiar enough with the parloops kernels implementation to know if this patch is important. However, I'm somewhat inclined to drop patches from OG8 that don't impact correctness in the test suite. What do you want to do with this patch Thomas? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Cesar [OpenACC] Don't gimplify in ssa mode if seen_error in oacc_xform_loop 2018-XX-YY Tom de Vries Cesar Philippidis gcc/ PR tree-optimization/68977 * omp-offload.c (oacc_xform_loop): Handle seen_error () == true. (cherry picked from gomp-4_0-branch r232343 and r232344) --- diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c index 3582dda3d1a..dae284fe890 100644 --- a/gcc/omp-offload.c +++ b/gcc/omp-offload.c @@ -335,7 +335,12 @@ oacc_xform_loop (gcall *call) -> chunks=ceil (range/(chunksize*threads*step)) striding=false,chunking=false -> chunk_size=ceil(range/(threads*step)),chunks=1 */ - push_gimplify_context (true); + + /* If seen_error (), we may introduce an uninitialized var due to + gimplification bailing out. If we gimplify in ssa mode, that will cause an + ICE. If we gimplify in non-ssa mode, then ssa updating will turn it into a + default definition, and we avoid the ICE. */ + push_gimplify_context (!seen_error ()); switch (code) { -- 2.17.1
[patch,openacc] use existing local variable in cp_parser_oacc_enter_exit_data
This is an old gomp4 patch that updates the location of the clause for acc enter/exit data. Apparently, it didn't impact any test cases. Is this OK for trunk or should we drop it from OG8? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar [OpenACC] Use existing local variable in cp_parser_oacc_enter_exit_data 2018-XX-YY James Norris Cesar Philippidis gcc/cp/ * parser.c (cp_parser_oacc_enter_exit_data): Use existing local variable. (cherry picked from gomp-4_0-branch r223007) --- diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 89f239e0f20..c6ebc494e59 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -37064,7 +37064,7 @@ cp_parser_oacc_enter_exit_data (cp_parser *parser, cp_token *pragma_tok, stmt = enter ? make_node (OACC_ENTER_DATA) : make_node (OACC_EXIT_DATA); TREE_TYPE (stmt) = void_type_node; OMP_STANDALONE_CLAUSES (stmt) = clauses; - SET_EXPR_LOCATION (stmt, pragma_tok->location); + SET_EXPR_LOCATION (stmt, loc); add_stmt (stmt); return stmt; } -- 2.17.1
[patch,openacc] C, C++ OpenACC wait diagnostic change
Attached is an old patch which updated the C and C++ FEs to use %<)%> for the right ')' symbol. It's mostly a cosmetic change. All of the changes are self-contained to the OpenACC code path. Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar [OpenACC] C, C++ OpenACC wait diagnostic change 2018-XX-YY James Norris Cesar Philippidis gcc/c/ * c-parser.c (c_parser_oacc_wait_list): Change error message. gcc/cp/ * parser.c (cp_parser_oacc_wait_list): Change error message. gcc/testsuite/ * c-c++-common/goacc/asyncwait-1: Update messages. (cherry picked from gomp-4_0-branch r223007, e4ea0a3) diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 1766a256633..b8fc000b50d 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -11597,7 +11597,8 @@ c_parser_oacc_wait_list (c_parser *parser, location_t clause_loc, tree list) if (args->length () == 0) { - c_parser_error (parser, "expected integer expression before ')'"); + c_parser_error (parser, + "expected integer expression list before %<)%>"); release_tree_vector (args); return list; } diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index c6ebc494e59..e80c1fba670 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -32094,7 +32094,8 @@ cp_parser_oacc_wait_list (cp_parser *parser, location_t clause_loc, tree list) if (args == NULL || args->length () == 0) { - cp_parser_error (parser, "expected integer expression before ')'"); + cp_parser_error (parser, + "expected integer expression list before %<)%>"); if (args != NULL) release_tree_vector (args); return list; diff --git a/gcc/testsuite/c-c++-common/goacc/asyncwait-1.c b/gcc/testsuite/c-c++-common/goacc/asyncwait-1.c index e1840af5d70..2fc89486ee5 100644 --- a/gcc/testsuite/c-c++-common/goacc/asyncwait-1.c +++ b/gcc/testsuite/c-c++-common/goacc/asyncwait-1.c @@ -116,7 +116,7 @@ f (int N, float *a, float *b) } #pragma acc parallel copyin (a[0:N]) copy (b[0:N]) wait (1 /* { dg-error "expected '\\\)' before end of line" } 
*/ -/* { dg-error "expected integer expression before '\\\)'" "" { target c++ } .-1 } */ +/* { dg-error "expected integer expression list before" "" { target c++ } .-1 } */ { for (ii = 0; ii < N; ii++) b[ii] = a[ii]; @@ -171,7 +171,7 @@ f (int N, float *a, float *b) #pragma acc wait (1,2,,) /* { dg-error "expected (primary-|)expression before" } */ #pragma acc wait (1 /* { dg-error "expected '\\\)' before end of line" } */ -/* { dg-error "expected integer expression before '\\\)'" "" { target c++ } .-1 } */ +/* { dg-error "expected integer expression list before" "" { target c++ } .-1 } */ #pragma acc wait (1,*) /* { dg-error "expected (primary-|)expression before" } */ -- 2.17.1
Re: [PATCH][OpenACC] Update deviceptr handling during gimplification
On 09/25/2018 05:55 PM, Julian Brown wrote: > On Tue, 7 Aug 2018 15:09:38 -0700 > Cesar Philippidis wrote: > >> I had previously posted this patch as part of a monster deviceptr >> patch here >> <https://gcc.gnu.org/ml/gcc-patches/2018-06/msg01911.html>. This >> patch breaks out the generic gimplifier changes. Essentially, with >> this patch, the gimplifier will now transfer deviceptr data clauses >> using GOMP_MAP_FORCE_DEVICEPTR. >> >> Is this patch OK for trunk? It bootstrapped / regression tested >> cleanly for x86_64 with nvptx offloading. > > This patch also appears to fix the attached test case, which had been > associated with a different deviceptr-related patch on the og8 branch > (the other parts of which are upstream already). Perhaps you'd like to > incorporate this test into your patch? It was by James Norris > originally, IIUC. Ok, I'll do that. Thanks for updating those tests. Cesar
[patch,openacc] update fortran nested parallelism error messages
Bernhard noticed a typo in one of the OpenACC parallelism error messages. The error should have reported that gang loops cannot be nested inside vector loops, not worker loops. I'll commit the attached patch to trunk as obvious. I bootstrapped and regtested it against x86_64 Linux with nvptx offloading. Cesar [OpenACC] update fortran nested parallelism error messages 2018-09-24 Bernhard Reuther-Fischer Cesar Philippidis gcc/fortran/ * openmp.c (resolve_oacc_loop_blocks): gcc/testsuite/ * gfortran.dg/goacc/nested-parallelism.f90: New test. --- gcc/fortran/openmp.c | 2 +- .../gfortran.dg/goacc/nested-parallelism.f90 | 51 +++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gfortran.dg/goacc/nested-parallelism.f90 diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c index ac1923ea06b..a046863445d 100644 --- a/gcc/fortran/openmp.c +++ b/gcc/fortran/openmp.c @@ -5913,7 +5913,7 @@ resolve_oacc_loop_blocks (gfc_code *code) >loc); if (c->code->ext.omp_clauses->vector) gfc_error ("Loop parallelized across gangs is not allowed " - "inside loop parallelized across workers at %L", + "inside loop parallelized across vectors at %L", >loc); } if (code->ext.omp_clauses->worker) diff --git a/gcc/testsuite/gfortran.dg/goacc/nested-parallelism.f90 b/gcc/testsuite/gfortran.dg/goacc/nested-parallelism.f90 new file mode 100644 index 000..6ebef6a4547 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/nested-parallelism.f90 @@ -0,0 +1,51 @@ +! Verify the invalid gang, worker, vector parallelism error messages. + +program np + integer, parameter :: n = 100 + integer :: i, j, k + + !$acc parallel loop gang + do i = 1, n + !$acc loop gang ! { dg-error "gangs is not allowed inside another loop parallelized across gangs" } + do j = 1, n + end do + + !$acc loop worker + do j = 1, n + end do + + !$acc loop vector + do j = 1, n + end do + end do + + !$acc parallel loop worker + do i = 1, n + !$acc loop gang ! 
{ dg-error "gangs is not allowed inside loop parallelized across workers" } + do j = 1, n + end do + + !$acc loop worker ! { dg-error "workers is not allowed inside another loop parallelized across workers" } + do j = 1, n + end do + + !$acc loop vector + do j = 1, n + end do + end do + + !$acc parallel loop vector + do i = 1, n + !$acc loop gang ! { dg-error "gangs is not allowed inside loop parallelized across vectors" } + do j = 1, n + end do + + !$acc loop worker ! { dg-error "workers is not allowed inside another loop parallelized across vectors" } + do j = 1, n + end do + + !$acc loop vector ! { dg-error "vectors is not allowed inside another loop parallelized across vectors" } + do j = 1, n + end do + end do +end program np -- 2.17.1
Re: [openacc] Teach gfortran to lower OpenACC routine dims
On 09/20/2018 09:10 AM, Bernhard Reutner-Fischer wrote: > On Thu, 20 Sep 2018 07:41:08 -0700 > Cesar Philippidis wrote: > >> On 09/19/2018 03:27 PM, Bernhard Reutner-Fischer wrote: >>> On Wed, 5 Sep 2018 12:52:03 -0700 >>> Cesar Philippidis wrote: > >>>> diff --git a/gcc/fortran/trans-decl.c b/gcc/fortran/trans-decl.c >>>> index eea6b81ebfa..eed868f475b 100644 >>>> --- a/gcc/fortran/trans-decl.c >>>> +++ b/gcc/fortran/trans-decl.c >>>> @@ -46,6 +46,7 @@ along with GCC; see the file COPYING3. If not >>>> see #include "trans-stmt.h" >>>> #include "gomp-constants.h" >>>> #include "gimplify.h" >>>> +#include "omp-general.h" >>> >>> hmz. so the gomp-constants.h include would be redundant, but do we >>> really need omp-general.h? >> >> Good point. omp-general.h is required for oacc_build_routine_dims. >> >>> Doesn't this suggest to move this oacc dims lowering to >>> trans-openmp.c instead, please? >> >> So something like adding a new gfc_add_omp_offload_attributes to >> trans-openmp.c and call it from add_attributes_to_decl? > > yes. > >> On a related note, I noticed that I forgot to incorporate this change >> in gfortran.h: >> >> @@ -902,7 +912,7 @@ typedef struct >>unsigned oacc_declare_link:1; >> >>/* This is an OpenACC acclerator function at level N - 1 */ >> - unsigned oacc_function:3; >> + ENUM_BITFIELD (oacc_function) oacc_function:3; >> >> It's probably not huge, but I noticed that some other enum bitfields >> are declared that way. > > yea, some compilers had trouble with enum bitfields (where plain int > bitfields like here worked fine, IIRC) but i'm not sure if it's > considered legacy these days. Fine with me to be safe. I updated the patch by incorporating all of those changes. Is it OK for trunk? Thanks, Cesar [openacc] Make GFC default to -1 for OpenACC routine dims 2018-09-24 Cesar Philippidis * gfortran.h (enum oacc_function): New enum. (gfc_oacc_routine_name): Add locus loc field. (symbol_attribute): Update type of oacc_function field. 
* openmp.c (gfc_oacc_routine_dims): Return oacc_function. (gfc_match_oacc_routine): Update routine clause syntax checking. Populate oacc_function attribute with dims. * trans-decl.c (add_attributes_to_decl): Use oacc_build_routine_dims to construct routine dims. * trans.h (gfc_add_omp_offload_attributes): Declare. * trans-decl.c (add_attributes_to_decl): Use it to set OMP and ACC offload function attributes. * trans-openmp.c (gfc_add_omp_offload_attributes): New function. gcc/testsuite/ * gfortran.dg/goacc/classify-routine.f95: Adjust test. * gfortran.dg/goacc/pr71704.f90: Likewise. * gfortran.dg/goacc/routine-6.f90: Likewise. * gfortran.dg/goacc/routine-8.f90: Likewise. * gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise. libgomp/ * testsuite/libgomp.oacc-fortran/routine-1.f90: Adjust test. * testsuite/libgomp.oacc-fortran/routine-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-5.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-7.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-9.f90: Likewise. * libgomp.oacc-fortran/host_data-2.f90: Likewise. * libgomp.oacc-fortran/host_data-3.f: Likewise. * libgomp.oacc-fortran/host_data-4.f90: Likewise. diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h index 04b0024a992..3efd59c95f7 100644 --- a/gcc/fortran/gfortran.h +++ b/gcc/fortran/gfortran.h @@ -316,6 +316,16 @@ enum save_state { SAVE_NONE = 0, SAVE_EXPLICIT, SAVE_IMPLICIT }; +/* Flags to keep track of ACC routine states. */ +enum oacc_function +{ OACC_FUNCTION_NONE = 0, + OACC_FUNCTION_GANG, + OACC_FUNCTION_WORKER, + OACC_FUNCTION_VECTOR, + OACC_FUNCTION_SEQ, + OACC_FUNCTION_AUTO +}; + /* Strings for all symbol attributes. We use these for dumping the parse tree, in error messages, and also when reading and writing modules. In symbol.c. 
*/ @@ -902,7 +912,7 @@ typedef struct unsigned oacc_declare_link:1; /* This is an OpenACC acclerator function at level N - 1 */ - unsigned oacc_function:3; + ENUM_BITFIELD (oacc_function) oacc_function:3; /* Attributes set by compiler extensions (!GCC$ ATTRIBUTES). */ unsigned ext_attr:EXT_ATTR_NUM; @@ -1726,6 +1736,7 @@ typedef struct gfc_oacc_routine_name struct gfc_symbol *sym; struct gfc_omp_clauses *clauses; struct gfc_oacc_routine_name *next; + locus loc; } gfc_oacc_routine_name; diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c index 94a7f7eaa50..ac1923ea06b 100644 --- a/gcc/for
Re: [patch,openacc] Generate sequential loop for OpenACC loop directive inside kernels
On 09/20/2018 10:14 AM, Cesar Philippidis wrote: > As Chung-Lin noted here > <https://gcc.gnu.org/ml/gcc-patches/2015-06/msg01079.html>: > > This patch adjusts omp-low.c:expand_omp_for_generic() to expand to a > "sequential" loop form (without the OMP runtime calls), used for loop > directives inside OpenACC kernels constructs. Tom mentions that this > allows the kernels parallelization to work when '#pragma acc loop' > makes the front-ends create OMP_FOR, which the loop analysis phases > don't understand. > > I bootstrapped and regtested it on x86_64 Linux with nvptx offloading. > Is this patch OK for trunk? I forgot to mention how that patch depends on the omp_target_base_pointers_restrict_p functionality from omp lowering that I removed back in June when I added support for the OpenACC 2.5 data clause semantics. It turned out that I was too aggressive when I was removing unused code. That's because, at least initially, there were no test cases that exercised that functionality in trunk until Chung-Lin's kernels patch goes in. Anyway, this patch is specifically required to get kernels-acc-loop-reduction.c working. Is this OK for trunk? I bootstrapped and regression tested it on x86_64 Linux with nvptx offloading. Thanks, Cesar [OpenACC] Reintroduce omp_target_base_pointers_restrict_p It turns out that existing acc kernels infrastructure based on parloops will benefit if the variables used in OpenACC data clauses maintained the restrict pointer qualifier. This code is present in GCC 8, but I removed it back in June when I committed a patch to update the behavior of the data clauses to match the semantics in OpenACC 2.5. Is this patch OK for trunk? A forthcoming acc kernels patch depends on it. 2018-XX-YY Cesar Philippidis * omp-low.c (install_var_field): New base_pointers_restrict argument. (scan_sharing_clauses): Update call to install_var_field. (omp_target_base_pointers_restrict_p): New function. (scan_omp_target): Update call to install_var_field.
--- gcc/omp-low.c | 89 +++ 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 24685fd012c..a59c15ae5fd 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -642,7 +642,8 @@ build_sender_ref (tree var, omp_context *ctx) BASE_POINTERS_RESTRICT, declare the field with restrict. */ static void -install_var_field (tree var, bool by_ref, int mask, omp_context *ctx) +install_var_field (tree var, bool by_ref, int mask, omp_context *ctx, + bool base_pointers_restrict = false) { tree field, type, sfield = NULL_TREE; splay_tree_key key = (splay_tree_key) var; @@ -673,7 +674,11 @@ install_var_field (tree var, bool by_ref, int mask, omp_context *ctx) type = build_pointer_type (build_pointer_type (type)); } else if (by_ref) -type = build_pointer_type (type); +{ + type = build_pointer_type (type); + if (base_pointers_restrict) + type = build_qualified_type (type, TYPE_QUAL_RESTRICT); +} else if ((mask & 3) == 1 && omp_is_reference (var)) type = TREE_TYPE (type); @@ -987,10 +992,12 @@ fixup_child_record_type (omp_context *ctx) } /* Instantiate decls as necessary in CTX to satisfy the data sharing - specified by CLAUSES. */ + specified by CLAUSES. If BASE_POINTERS_RESTRICT, install var field with + restrict. 
*/ static void -scan_sharing_clauses (tree clauses, omp_context *ctx) +scan_sharing_clauses (tree clauses, omp_context *ctx, + bool base_pointers_restrict = false) { tree c, decl; bool scan_array_reductions = false; @@ -1252,7 +1259,8 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) && TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE) install_var_field (decl, true, 7, ctx); else - install_var_field (decl, true, 3, ctx); + install_var_field (decl, true, 3, ctx, + base_pointers_restrict); if (is_gimple_omp_offloaded (ctx->stmt) && !OMP_CLAUSE_MAP_IN_REDUCTION (c)) install_var_local (decl, ctx); @@ -2265,6 +2273,68 @@ scan_omp_single (gomp_single *stmt, omp_context *outer_ctx) layout_type (ctx->record_type); } +/* Return true if the CLAUSES of an omp target guarantee that the base pointers + used in the corresponding offloaded function are restrict. */ + +static bool +omp_target_base_pointers_restrict_p (tree clauses) +{ + /* The analysis relies on the GOMP_MAP_FORCE_* mapping kinds, which are only + used by OpenACC. */ + if (flag_openacc == 0) +return false; + + /* I. Basic example: + + void foo (void) + { + unsigned int a[2], b[2]; + + #pragma acc kernels \ + copyout (a) \ + copyout (b) + { + a[0] = 0; + b[0] = 1; + } + } + + After gimplification, we have: + + #pragma omp target oacc_kernels \ + map(force_from:a [len: 8]) \ + map(force_from:b
Re: [patch,openacc] handle missing OMP_LIST_ clauses in fortran's parse tree debugger
On 09/20/2018 11:22 AM, Paul Richard Thomas wrote: > Hi Cesar, > > It looks OK to me. > > Thanks for the patch. > > Paul Thanks! Committed in r264446. Cesar > On 20 September 2018 at 18:21, Cesar Philippidis > wrote: >> This patch updates Fortran's parse tree printer to print the names of >> new OpenACC data clauses. I'm not sure if this functionality is widely used >> or not, but from a standpoint of correctness, this patch would probably >> be nice to have. >> >> Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 >> Linux with nvptx offloading. >> >> Thanks, >> Cesar > > >
[patch,openacc] Update _OPENACC value and documentation for OpenACC 2.5
This patch formally introduces OpenACC 2.5 functionality in various GCC documentation sources, along with the updated _OPENACC value in the various offloading header files. As of right now, GCC trunk already supports the updated OpenACC 2.5 data clause semantics. Julian, Chung-Lin and I have been working on pushing our remaining og8 patches to trunk (which we're down to under 30 now from 170+). But a number of those changes involve performance tuning, rather than new OpenACC functionality. Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar [OpenACC] Update _OPENACC value and documentation for OpenACC 2.5 2018-XX-YY Thomas Schwinge Cesar Philippidis gcc/c-family/ * c-cppbuiltin.c (c_cpp_builtins): Update "_OPENACC" to "201510". gcc/fortran/ * cpp.c (cpp_define_builtins): Update "_OPENACC" to "201510". * gfortran.texi: Update for OpenACC 2.5. * intrinsic.texi: Likewise. * invoke.texi: Likewise. gcc/testsuite/ * c-c++-common/cpp/openacc-define-3.c: Update. * gfortran.dg/openacc-define-3.f90: Likewise. gcc/ * doc/invoke.texi: Update for OpenACC 2.5. libgomp/ * libgomp.texi: Update for OpenACC 2.5. * openacc.f90 (openacc_version): Update to "201510". * openacc_lib.h (openacc_version): Likewise. * testsuite/libgomp.oacc-fortran/openacc_version-1.f: Update. * testsuite/libgomp.oacc-fortran/openacc_version-2.f90: Update.
(cherry picked from gomp-4_0-branch r248057, ccbbcb70569) --- gcc/c-family/c-cppbuiltin.c | 2 +- gcc/doc/invoke.texi | 4 +++- gcc/fortran/cpp.c | 2 +- gcc/fortran/gfortran.texi | 16 +- gcc/fortran/intrinsic.texi| 6 +++--- gcc/fortran/invoke.texi | 4 +--- .../c-c++-common/cpp/openacc-define-3.c | 2 +- .../gfortran.dg/openacc-define-3.f90 | 2 +- libgomp/libgomp.texi | 21 ++- libgomp/openacc.f90 | 2 +- libgomp/openacc_lib.h | 2 +- .../libgomp.oacc-fortran/openacc_version-1.f | 2 +- .../openacc_version-2.f90 | 2 +- 13 files changed, 31 insertions(+), 36 deletions(-) diff --git a/gcc/c-family/c-cppbuiltin.c b/gcc/c-family/c-cppbuiltin.c index 96a6b4dfd2b..f2a273b6ac7 100644 --- a/gcc/c-family/c-cppbuiltin.c +++ b/gcc/c-family/c-cppbuiltin.c @@ -1391,7 +1391,7 @@ c_cpp_builtins (cpp_reader *pfile) cpp_define (pfile, "__SSP__=1"); if (flag_openacc) -cpp_define (pfile, "_OPENACC=201306"); +cpp_define (pfile, "_OPENACC=201510"); if (flag_openmp) cpp_define (pfile, "_OPENMP=201511"); diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 94304c314cf..34d7ff71512 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -2161,10 +2161,12 @@ freestanding and hosted environments. Enable handling of OpenACC directives @code{#pragma acc} in C/C++ and @code{!$acc} in Fortran. When @option{-fopenacc} is specified, the compiler generates accelerated code according to the OpenACC Application -Programming Interface v2.0 @w{@uref{https://www.openacc.org}}. This option +Programming Interface v2.5 @w{@uref{https://www.openacc.org}}. This option implies @option{-pthread}, and thus is only supported on targets that have support for @option{-pthread}. +See @uref{https://gcc.gnu.org/wiki/OpenACC} for more information. 
+ @item -fopenacc-dim=@var{geom} @opindex fopenacc-dim @cindex OpenACC accelerator programming diff --git a/gcc/fortran/cpp.c b/gcc/fortran/cpp.c index 0b3de42e832..14871129ff6 100644 --- a/gcc/fortran/cpp.c +++ b/gcc/fortran/cpp.c @@ -165,7 +165,7 @@ cpp_define_builtins (cpp_reader *pfile) cpp_define (pfile, "_LANGUAGE_FORTRAN=1"); if (flag_openacc) -cpp_define (pfile, "_OPENACC=201306"); +cpp_define (pfile, "_OPENACC=201510"); if (flag_openmp) cpp_define (pfile, "_OPENMP=201511"); diff --git a/gcc/fortran/gfortran.texi b/gcc/fortran/gfortran.texi index 30934046a49..59a69457fe0 100644 --- a/gcc/fortran/gfortran.texi +++ b/gcc/fortran/gfortran.texi @@ -476,9 +476,7 @@ used on real-world programs. In particular, the supported extensions include OpenMP, Cray-style pointers, some old vendor extensions, and several Fortran 2003 and Fortran 2008 features, including TR 15581. However, it is still under development and has a few remaining rough edges. -There also is initial support for OpenACC. -Note that this is an experimental feature, incomplete, and subject to -change in future versions of GCC. See +There also is support for OpenACC. See @uref{https://gcc.gnu.org/wiki/OpenACC} for more information. At present, the GNU Fortran compiler passes the @@ -538,10 +536,8 @@ status} and @ref{Fortran 2018 status
[patch,openacc] Set safelen to INT_MAX for oacc independent pragma
This is another old gomp4 OpenACC patch which impacts targets that use simd vectorization, such as the host and AMD GCN, rather than nvptx. Basically, as the subject states, it sets safelen to INT_MAX for independent acc loops, which I believe is already being done for OpenMP in certain situations. The original discussion for this patch can be found here <https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01872.html>. Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar [OpenACC] Set safelen to INT_MAX for oacc independent pragma 2018-XX-YY Tom de Vries Cesar Philippidis gcc/ * omp-expand.c (expand_omp_for): Set loop->safelen to INT_MAX if marked_independent. (cherry picked from gomp-4_0-branch r226079) --- gcc/omp-expand.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index 427f329d35f..ee147f10826 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -5718,6 +5718,7 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt) { struct loop *loop = region->cont->loop_father; loop->marked_independent = true; + loop->safelen = INT_MAX; } } else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD) -- 2.17.1
[patch,openacc] Propagate independent clause for OpenACC kernels pass
This is another old patch that teaches the omp expansion pass how to propagate the acc loop independent clause to the later stages throughout compilation. Unfortunately, it didn't include any test cases. I'm not sure how effective this will be with the existing kernel parloops pass. But as I noted in my Cauldron talk, we would like to convert acc kernels regions to acc parallel regions, and this patch could help in that regard. Chung-Lin, do you have any more state on this patch? Anyway, I bootstrapped and regtested it for x86_64 Linux with nvptx offloading and it didn't introduce any regressions. We do have a couple of other standalone kernels patches in og8, but those depend on other patches. Thanks, Cesar [OpenACC] Propagate independent clause for OpenACC kernels pass 2018-XX-YY Chung-Lin Tang Cesar Philippidis gcc/ * cfgloop.h (struct loop): Add 'bool marked_independent' field. * omp-expand.c (struct omp_region): Add 'int kind' and 'bool independent' fields. (expand_omp_for): Set 'marked_independent' field for loop corresponding to region. (find_omp_for_region_data): New function. (build_omp_regions_1): Set kind field. Call find_omp_for_region_data for GIMPLE_OMP_FOR statements. (cherry picked from gomp-4_0-branch r225759) --- gcc/cfgloop.h| 4 gcc/omp-expand.c | 46 -- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index 80a31c416ca..7928681b514 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -221,6 +221,10 @@ struct GTY ((chain_next ("%h.next"))) loop { /* True if the loop is part of an oacc kernels region. */ unsigned in_oacc_kernels_region : 1; + /* True if loop is tagged as having independent iterations by user, + e.g. the OpenACC independent clause. */ + bool marked_independent; + /* The number of times to unroll the loop. 0 means no information given, just do what we always do. A value of 1 means do not unroll the loop. A value of USHRT_MAX means unroll with no specific unrolling factor.
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index 9b03f62e065..427f329d35f 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -107,6 +107,12 @@ struct omp_region /* True if this is nested inside an OpenACC kernels construct. */ bool inside_kernels_p; + + /* Records a generic kind field. */ + int kind; + + /* For an OpenACC loop directive, true if has the 'independent' clause. */ + bool independent; }; static struct omp_region *root_omp_region; @@ -5705,8 +5711,15 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt) loops_state_set (LOOPS_NEED_FIXUP); if (region->inside_kernels_p) -expand_omp_for_generic (region, , BUILT_IN_NONE, BUILT_IN_NONE, - inner_stmt); +{ + expand_omp_for_generic (region, , BUILT_IN_NONE, BUILT_IN_NONE, + inner_stmt); + if (region->independent && region->cont->loop_father) + { + struct loop *loop = region->cont->loop_father; + loop->marked_independent = true; + } +} else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD) expand_omp_simd (region, ); else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP) @@ -7887,6 +7900,31 @@ expand_omp (struct omp_region *region) } } +/* Fill in additional data for a region REGION associated with an + OMP_FOR STMT. */ + +static void +find_omp_for_region_data (struct omp_region *region, gomp_for *stmt) +{ + region->kind = gimple_omp_for_kind (stmt); + + if (region->kind == GF_OMP_FOR_KIND_OACC_LOOP) +{ + struct omp_region *target_region = region->outer; + while (target_region + && target_region->type != GIMPLE_OMP_TARGET) + target_region = target_region->outer; + if (!target_region) + return; + + tree clauses = gimple_omp_for_clauses (stmt); + + if (target_region->kind == GF_OMP_TARGET_KIND_OACC_KERNELS + && omp_find_clause (clauses, OMP_CLAUSE_INDEPENDENT)) + region->independent = true; +} +} + /* Helper for build_omp_regions. Scan the dominator tree starting at block BB. PARENT is the region that contains BB. 
If SINGLE_TREE is true, the function ends once a single tree is built (otherwise, whole @@ -7953,6 +7991,8 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent, case GF_OMP_TARGET_KIND_OACC_KERNELS: case GF_OMP_TARGET_KIND_OACC_DATA: case GF_OMP_TARGET_KIND_OACC_HOST_DATA: + if (is_gimple_omp_oacc (stmt)) + region->kind = gimple_omp_target_kind (stmt); break; case GF_OMP_TARGET_KIND_UPDATE: case GF_OMP_TARGET_KIND_ENTER_DATA: @@ -7974,6 +8014,8 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent, /* #pragma omp ordered depend is also just a stand-alone directive. */ region = NULL; + else if (code == GIMPLE_OMP_FOR) + find_omp_for_region_data (re
[patch,openacc] Fix PR71959: lto dump of callee counts
This is another old gomp4 patch that demotes an ICE in PR71959 to a linker warning. One problem here is that it is not clear if OpenACC allows individual member functions in C++ classes to be marked as acc routines. There's another issue accessing member data inside offloaded regions. We'll add some support for member data in OpenACC 2.6, but some of the OpenACC C++ semantics are still unclear. Is this OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar [PR71959] lto dump of callee counts 2018-XX-YY Nathan Sidwell Cesar Philippidis gcc/ * ipa-fnsummary.c (ipa_fn_summary_write): Only dump callee counts when dumping the function's body. libgomp/ * testsuite/libgomp.oacc-c++/pr71959.C: New. * testsuite/libgomp.oacc-c++/pr71959-a.C: New. (cherry picked from gomp-4_0-branch r239788) --- gcc/ipa-fnsummary.c | 18 --- .../testsuite/libgomp.oacc-c++/pr71959-a.C| 31 +++ libgomp/testsuite/libgomp.oacc-c++/pr71959.C | 31 +++ 3 files changed, 75 insertions(+), 5 deletions(-) create mode 100644 libgomp/testsuite/libgomp.oacc-c++/pr71959-a.C create mode 100644 libgomp/testsuite/libgomp.oacc-c++/pr71959.C diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c index 62095c6cf6f..e796b085e14 100644 --- a/gcc/ipa-fnsummary.c +++ b/gcc/ipa-fnsummary.c @@ -3409,8 +3409,10 @@ ipa_fn_summary_write (void) int i; size_time_entry *e; struct condition *c; + int index = lto_symtab_encoder_encode (encoder, cnode); + bool body = encoder->nodes[index].body; - streamer_write_uhwi (ob, lto_symtab_encoder_encode (encoder, cnode)); + streamer_write_uhwi (ob, index); streamer_write_hwi (ob, info->estimated_self_stack_size); streamer_write_hwi (ob, info->self_size); info->time.stream_out (ob); @@ -3453,10 +3455,16 @@ ipa_fn_summary_write (void) info->array_index->stream_out (ob); else streamer_write_uhwi (ob, 0); - for (edge = cnode->callees; edge; edge = edge->next_callee) - write_ipa_call_summary (ob, edge); - for (edge = cnode->indirect_calls;
edge; edge = edge->next_callee) - write_ipa_call_summary (ob, edge); + if (body) + { + /* Only write callee counts when we're emitting the + body, as the reader only knows about the callees when + the body's emitted. */ + for (edge = cnode->callees; edge; edge = edge->next_callee) + write_ipa_call_summary (ob, edge); + for (edge = cnode->indirect_calls; edge; edge = edge->next_callee) + write_ipa_call_summary (ob, edge); + } } } streamer_write_char_stream (ob->main_stream, 0); diff --git a/libgomp/testsuite/libgomp.oacc-c++/pr71959-a.C b/libgomp/testsuite/libgomp.oacc-c++/pr71959-a.C new file mode 100644 index 000..9486512d0e7 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c++/pr71959-a.C @@ -0,0 +1,31 @@ +// { dg-do compile } + +struct Iter +{ + int *cursor; + + void ctor (int *cursor_) asm("_ZN4IterC1EPi"); + int *point () const asm("_ZNK4Iter5pointEv"); +}; + +#pragma acc routine +void Iter::ctor (int *cursor_) +{ + cursor = cursor_; +} + +#pragma acc routine +int *Iter::point () const +{ + return cursor; +} + +void apply (int (*fn)(), Iter out) asm ("_ZN5Apply5applyEPFivE4Iter"); + +#pragma acc routine +void apply (int (*fn)(), struct Iter out) +{ *out.point() = fn (); } + +extern "C" void __gxx_personality_v0 () +{ +} diff --git a/libgomp/testsuite/libgomp.oacc-c++/pr71959.C b/libgomp/testsuite/libgomp.oacc-c++/pr71959.C new file mode 100644 index 000..169bf4aad17 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c++/pr71959.C @@ -0,0 +1,31 @@ +// { dg-additional-sources "pr71959-a.C" } + +// pr lto/71959 ICEd LTO due to mismatch between writing & reading behaviour + +struct Iter +{ + int *cursor; + + Iter(int *cursor_) : cursor(cursor_) {} + + int *point() const { return cursor; } +}; + +#pragma acc routine seq +int one () { return 1; } + +struct Apply +{ + static void apply (int (*fn)(), Iter out) + { *out.point() = fn (); } +}; + +int main () +{ + int x; + +#pragma acc parallel copyout(x) + Apply::apply (one, Iter ()); + + return x != 1; +} -- 2.17.1
[patch,openacc] Fix hang when running oacc exec with CUDA 9.0 nvprof
While tuning the performance of nvptx OpenACC offloading earlier this year, Tom fixed a bug in og7 that prevented Nvidia's nvprof profiling tool from working with CUDA 9. Tom posted more details on the patch here <https://gcc.gnu.org/ml/gcc-patches/2018-02/msg01269.html>, which is still relevant here. Note that this issue was triggered by the new OpenACC profiling API in og7, which has not landed in trunk yet. However, it's probably a good idea to get this patch committed independently from that huge profiling patch series. Is this OK for trunk? I bootstrapped and regtested this for x86_64 Linux with nvptx offloading. Thanks, Cesar [OpenACC] Fix hang when running oacc exec with CUDA 9.0 nvprof 2018-XX-YY Tom de Vries Cesar Philippidis libgomp/ * oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread): New variable. (acc_init_1): Set acc_init_thread to pthread_self (). Set acc_init_state to initializing at the start, and to initialized at the end. (self_initializing_p): New function. (acc_get_device_type): Return acc_device_none if called by thread that is currently executing acc_init_1. (cherry picked from openacc-gcc-7-branch commit 81904b675f6298a9c26c71391909ce362990a11f, bfc999c) --- libgomp/oacc-init.c | 34 ++ 1 file changed, 34 insertions(+) diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c index 8db24b17d29..8842e7218cb 100644 --- a/libgomp/oacc-init.c +++ b/libgomp/oacc-init.c @@ -40,6 +40,11 @@ static gomp_mutex_t acc_device_lock; +static gomp_mutex_t acc_init_state_lock; +static enum { uninitialized, initializing, initialized } acc_init_state + = uninitialized; +static pthread_t acc_init_thread; + /* A cached version of the dispatcher for the global "current" accelerator type, e.g. used as the default when creating new host threads. 
This is the device-type equivalent of goacc_device_num (which specifies which device to @@ -215,6 +220,11 @@ acc_init_1 (acc_device_t d) struct gomp_device_descr *base_dev, *acc_dev; int ndevs; + gomp_mutex_lock (&acc_init_state_lock); + acc_init_state = initializing; + acc_init_thread = pthread_self (); + gomp_mutex_unlock (&acc_init_state_lock); + base_dev = resolve_device (d, true); ndevs = base_dev->get_num_devices_func (); @@ -234,6 +244,10 @@ acc_init_1 (acc_device_t d) gomp_init_device (acc_dev); gomp_mutex_unlock (&acc_dev->lock); + gomp_mutex_lock (&acc_init_state_lock); + acc_init_state = initialized; + gomp_mutex_unlock (&acc_init_state_lock); + return base_dev; } @@ -528,6 +542,17 @@ acc_set_device_type (acc_device_t d) ialias (acc_set_device_type) +static bool +self_initializing_p (void) +{ + bool res; + gomp_mutex_lock (&acc_init_state_lock); + res = (acc_init_state == initializing + && pthread_equal (acc_init_thread, pthread_self ())); + gomp_mutex_unlock (&acc_init_state_lock); + return res; +} + acc_device_t acc_get_device_type (void) { @@ -537,6 +562,15 @@ acc_get_device_type (void) if (thr && thr->base_dev) res = acc_device_type (thr->base_dev->type); + else if (self_initializing_p ()) +/* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during the + acc_ev_device_init_start event callback, which is dispatched during + acc_init_1. Trying to lock acc_device_lock during such a call (as we do + in the else clause below), will result in deadlock, since the lock has + already been taken by the acc_init_1 caller. We work around this problem + by using the acc_get_device_type property "If the device type has not yet + been selected, the value acc_device_none may be returned". */ +; else { gomp_init_targets_once (); -- 2.17.1
[patch,openacc] handle missing OMP_LIST_ clauses in fortran's parse tree debugger
This patch updates Fortran's parse tree printer to print the names of new OpenACC data clauses. I'm not sure if this functionality is widely used or not, but from a standpoint of correctness, this patch would probably be nice to have. Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar [OpenACC] handle missing OMP_LIST_ clauses in fortran's parse tree debugger 2018-XX-YY Cesar Philippidis gcc/fortran/ * dump-parse-tree.c (show_omp_clauses): Add missing omp list_types and reorder the switch cases to match the enum in gfortran.h. (cherry picked from gomp-4_0-branch r228355, 159518d) --- gcc/fortran/dump-parse-tree.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/gcc/fortran/dump-parse-tree.c b/gcc/fortran/dump-parse-tree.c index 2a28fa30986..f1be5a67a26 100644 --- a/gcc/fortran/dump-parse-tree.c +++ b/gcc/fortran/dump-parse-tree.c @@ -1384,21 +1384,26 @@ show_omp_clauses (gfc_omp_clauses *omp_clauses) const char *type = NULL; switch (list_type) { - case OMP_LIST_USE_DEVICE: type = "USE_DEVICE"; break; - case OMP_LIST_DEVICE_RESIDENT: type = "USE_DEVICE"; break; - case OMP_LIST_CACHE: type = ""; break; case OMP_LIST_PRIVATE: type = "PRIVATE"; break; case OMP_LIST_FIRSTPRIVATE: type = "FIRSTPRIVATE"; break; case OMP_LIST_LASTPRIVATE: type = "LASTPRIVATE"; break; + case OMP_LIST_COPYPRIVATE: type = "COPYPRIVATE"; break; case OMP_LIST_SHARED: type = "SHARED"; break; case OMP_LIST_COPYIN: type = "COPYIN"; break; case OMP_LIST_UNIFORM: type = "UNIFORM"; break; case OMP_LIST_ALIGNED: type = "ALIGNED"; break; case OMP_LIST_LINEAR: type = "LINEAR"; break; + case OMP_LIST_DEPEND: type = "DEPEND"; break; + case OMP_LIST_MAP: type = "MAP"; break; + case OMP_LIST_TO: type = "TO"; break; + case OMP_LIST_FROM: type = "FROM"; break; case OMP_LIST_REDUCTION: type = "REDUCTION"; break; + case OMP_LIST_DEVICE_RESIDENT: type = "DEVICE_RESIDENT"; break; + case OMP_LIST_LINK: type = "LINK"; break; + case
OMP_LIST_USE_DEVICE: type = "USE_DEVICE"; break; + case OMP_LIST_CACHE: type = "CACHE"; break; case OMP_LIST_IS_DEVICE_PTR: type = "IS_DEVICE_PTR"; break; case OMP_LIST_USE_DEVICE_PTR: type = "USE_DEVICE_PTR"; break; - case OMP_LIST_DEPEND: type = "DEPEND"; break; default: gcc_unreachable (); } -- 2.17.1
[patch,openacc] Generate sequential loop for OpenACC loop directive inside kernels
As Chung-Lin noted here <https://gcc.gnu.org/ml/gcc-patches/2015-06/msg01079.html>: This patch adjusts omp-low.c:expand_omp_for_generic() to expand to a "sequential" loop form (without the OMP runtime calls), used for loop directives inside OpenACC kernels constructs. Tom mentions that this allows the kernels parallelization to work when '#pragma acc loop' makes the front-ends create OMP_FOR, which the loop analysis phases don't understand. I bootstrapped and regtested it on x86_64 Linux with nvptx offloading. Is this patch OK for trunk? Thanks, Cesar [OpenACC] Generate sequential loop for OpenACC loop directive inside kernels 2018-XX-YY Chung-Lin Tang Cesar Philippidis gcc/ * omp-expand.c (struct omp_region): Add inside_kernels_p field. (expand_omp_for_generic): Adjust to generate a 'sequential' loop when GOMP builtin arguments are BUILT_IN_NONE. (expand_omp_for): Use expand_omp_for_generic to generate a non-parallelized loop for OMP_FORs inside OpenACC kernels regions. (expand_omp): Mark inside_kernels_p field true for regions nested inside OpenACC kernels constructs. gcc/testsuite/ * c-c++-common/goacc/kernels-loop-acc-loop.c: New test. * c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test. * c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test. * c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test. * c-c++-common/goacc/kernels-acc-loop-reduction.c: New test. * c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test. 
(cherry picked from gomp-4_0-branch r224505, r224837, r228232, r228233, r231461, and r247958) --- gcc/omp-expand.c | 136 -- .../goacc/kernels-acc-loop-reduction.c| 23 +++ .../goacc/kernels-acc-loop-smaller-equal.c| 23 +++ .../goacc/kernels-loop-2-acc-loop.c | 18 +++ .../goacc/kernels-loop-3-acc-loop.c | 15 ++ .../goacc/kernels-loop-acc-loop.c | 15 ++ .../goacc/kernels-loop-n-acc-loop.c | 15 ++ 7 files changed, 204 insertions(+), 41 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index d2a77c067c6..9b03f62e065 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -104,6 +104,9 @@ struct omp_region /* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has a depend clause. */ gomp_ordered *ord_stmt; + + /* True if this is nested inside an OpenACC kernels construct. 
*/ + bool inside_kernels_p; }; static struct omp_region *root_omp_region; @@ -2509,6 +2512,7 @@ expand_omp_for_generic (struct omp_region *region, gassign *assign_stmt; bool in_combined_parallel = is_combined_parallel (region); bool broken_loop = region->cont == NULL; + bool seq_loop = (start_fn == BUILT_IN_NONE || next_fn == BUILT_IN_NONE); edge e, ne; tree *counts = NULL; int i; @@ -2606,8 +2610,12 @@ expand_omp_for_generic (struct omp_region *region, type = TREE_TYPE (fd->loop.v); istart0 = create_tmp_var (fd->iter_type, ".istart0"); iend0 = create_tmp_var (fd->iter_type, ".iend0"); - TREE_ADDRESSABLE (istart0) = 1; - TREE_ADDRESSABLE (iend0) = 1; + + if (!seq_loop) +{ + TREE_ADDRESSABLE (istart0) = 1; + TREE_ADDRESSABLE (iend0) = 1; +} /* See if we need to bias by LLONG_MIN. */ if (fd->iter_type == long_long_unsigned_type_node @@ -2637,7 +2645,25 @@ expand_omp_for_generic (struct omp_region *region, gsi_prev (); tree arr = NULL_TREE; - if (in_combined_parallel) + if (seq_loop) +{ + tree n1 = fold_convert (fd->iter_type, fd->loop.n1); + tree n2 = fold_convert (fd->iter_type, fd->loop.n2); + + n1 = force_gimple_operand_gsi_1 (, n1, is_gimple_reg, NULL_TREE, true, + GSI_SAME_STMT); + n2 = force_gimple_operand_gsi_1 (, n2, is_gimple_reg, NULL_TREE, true, + GSI_SAME_STMT); + + assign_stmt = gimple_build_assign (istart0, n1); + gsi_insert_before (, assign_stmt, GSI_SAME_STMT); + + assign_stmt = gimple_build_assign (iend0, n2); + gsi_insert_before (, assign_stmt, GSI_SAME_STMT); + + t = fold_build2 (NE_EXPR, boolean_type_node, istart0, iend0); +} + else if (in_combined_parallel) { gcc_assert (fd->ordered == 0); /* In a combined parallel loop, emit a call to @@ -3059,39 +3085,45 @@ expand_omp_for_generic (struct omp_region *region, collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb); /* Emit code to get the next
[patch,openacc] Fix infinite recursion in OMP clause pretty-printing, default label
Apparently, Tom ran into an ICE when we were adding support for new clauses back in the gomp-4_0-branch days. This patch shouldn't be necessary because all of the clauses are fully implemented now, but it may prevent similar bugs from occurring in the future at least during development. Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading. Thanks, Cesar Fix infinite recursion in OMP clause pretty-printing, default label Apparently, Tom ran into an ICE when we were adding support for new clauses back in the gomp-4_0-branch days. This patch shouldn't be necessary because all of the clauses are fully implemented now, but it may prevent similar bugs from occurring in the future at least during development. 2018-XX-YY Tom de Vries Cesar Philippidis gcc/ * tree-pretty-print.c (dump_omp_clause): Fix infinite recursion in default label. (cherry picked from gomp-4_0-branch r228915, 2e4d930) --- gcc/tree-pretty-print.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c index 2c089b11751..031afbb49e4 100644 --- a/gcc/tree-pretty-print.c +++ b/gcc/tree-pretty-print.c @@ -1063,8 +1063,7 @@ dump_omp_clause (pretty_printer *pp, tree clause, int spc, dump_flags_t flags) break; default: - /* Should never happen. */ - dump_generic_node (pp, clause, spc, flags, false); + pp_string (pp, "unknown"); break; } } -- 2.17.1
[patch,openacc] Fix acc_shutdown issue
Attached is an old gomp4 patch that allegedly fixes a shutdown runtime issue involving OpenACC accelerators. Unfortunately, the original patch didn't include a test case, nor did it generate any regressions in the libgomp testsuite when I reverted it in og8. With that said, I like how this patch eliminates the redundant use of gomp_mutex_lock to unmap variables (because gomp_unmap_vars already acquires a lock). However, the trade-off is that it does increase tgt->list_count to num_funcs + num_vars. Does anyone have any strong opinion on this patch and is it OK for trunk? I bootstrapped and regtested it for x86_64 Linux with nvptx offloading and I didn't encounter any regressions. Thanks, Cesar [OpenACC] Fix acc_shutdown issue 2018-XX-YY James Norris Cesar Philippidis libgomp/ * oacc-init.c (acc_shutdown_1): Replace use of gomp_free_memmap with gomp_unmap_vars. * target.c (gomp_load_image_to_device): Fix initialization. (gomp_free_memmap): Remove. (cherry picked from gomp-4_0-branch r226045) --- libgomp/libgomp.h | 1 - libgomp/oacc-init.c | 9 ++--- libgomp/target.c| 27 +-- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 3a8cc2bd7d6..5c11e97616d 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1003,7 +1003,6 @@ extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *, enum gomp_map_vars_kind); extern void gomp_unmap_vars (struct target_mem_desc *, bool); extern void gomp_init_device (struct gomp_device_descr *); -extern void gomp_free_memmap (struct splay_tree_s *); extern void gomp_unload_device (struct gomp_device_descr *); extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key); diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c index 8842e7218cb..957bb9f31f9 100644 --- a/libgomp/oacc-init.c +++ b/libgomp/oacc-init.c @@ -303,9 +303,12 @@ acc_shutdown_1 (acc_device_t d) if (walk->dev) { - gomp_mutex_lock (>dev->lock); - gomp_free_memmap (>dev->mem_map); - 
gomp_mutex_unlock (>dev->lock); + while (walk->dev->mem_map.root) + { + struct target_mem_desc *tgt = walk->dev->mem_map.root->key.tgt; + + gomp_unmap_vars (tgt, false); + } walk->dev = NULL; walk->base_dev = NULL; diff --git a/libgomp/target.c b/libgomp/target.c index dda041cdbef..9ddc8d6c038 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -1184,14 +1184,17 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version, } /* Insert host-target address mapping into splay tree. */ - struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt)); + struct target_mem_desc *tgt = + gomp_malloc (sizeof (*tgt) + + sizeof (tgt->list[0]) + * (num_funcs + num_vars) * sizeof (*tgt->array)); tgt->array = gomp_malloc ((num_funcs + num_vars) * sizeof (*tgt->array)); tgt->refcount = REFCOUNT_INFINITY; tgt->tgt_start = 0; tgt->tgt_end = 0; tgt->to_free = NULL; tgt->prev = NULL; - tgt->list_count = 0; + tgt->list_count = num_funcs + num_vars; tgt->device_descr = devicep; splay_tree_node array = tgt->array; @@ -1204,6 +1207,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version, k->tgt_offset = target_table[i].start; k->refcount = REFCOUNT_INFINITY; k->link_key = NULL; + tgt->list[i].key = k; + tgt->refcount++; array->left = NULL; array->right = NULL; splay_tree_insert (>mem_map, array); @@ -1236,6 +1241,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version, k->tgt_offset = target_var->start; k->refcount = target_size & link_bit ? REFCOUNT_LINK : REFCOUNT_INFINITY; k->link_key = NULL; + tgt->list[i].key = k; + tgt->refcount++; array->left = NULL; array->right = NULL; splay_tree_insert (>mem_map, array); @@ -1454,22 +1461,6 @@ gomp_unload_device (struct gomp_device_descr *devicep) } } -/* Free address mapping tables. MM must be locked on entry, and remains locked - on return. 
*/ - -attribute_hidden void -gomp_free_memmap (struct splay_tree_s *mem_map) -{ - while (mem_map->root) -{ - struct target_mem_desc *tgt = mem_map->root->key.tgt; - - splay_tree_remove (mem_map, _map->root->key); - free (tgt->array); - free (tgt); -} -} - /* Host fallback for GOMP_target{,_ext} routines. */ static void -- 2.17.1
[patch,openacc] Don't mark OpenACC auto loops as independent inside acc parallel regions
OpenACC has a concept of loop independence, in which independent loops may be executed in parallel across gangs, workers and vectors. Inside acc parallel regions, if a loop isn't explicitly marked seq or auto, it is predetermined to be independent. This patch corrects a bug where acc loops marked as auto were being mistakenly promoted to independent. That's bad because it can generate bogus results if a dependency exists. Note that this patch depends on the following patches for -fnote-info-omp-optimized which is used in a test case. * Add user-friendly OpenACC diagnostics regarding detected parallelism. https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01652.html * Correct the reported line number in fortran combined OpenACC directives https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01554.html * Correct the reported line number in c++ combined OpenACC directives https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01552.html Is this OK for trunk? I bootstrapped and regtested on x86_64 Linux with nvptx offloading. Thanks, Cesar [OpenACC] Don't mark OpenACC auto loops as independent inside acc parallel regions 2018-XX-YY Cesar Philippidis gcc/ * omp-low.c (lower_oacc_head_mark): Don't mark OpenACC auto loops as independent inside acc parallel regions. gcc/testsuite/ * c-c++-common/goacc/loop-auto-1.c: Adjust test case to conform to the new behavior of the auto clause in OpenACC 2.5. * c-c++-common/goacc/loop-auto-2.c: Likewise. * gcc.dg/goacc/loop-processing-1.c: Likewise. * c-c++-common/goacc/loop-auto-3.c: New test. * gfortran.dg/goacc/loop-auto-1.f90: New test. libgomp/ * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust test case to conform to the new behavior of the auto clause in OpenACC 2.5. 
(cherry picked from gomp-4_0-branch r247569, 6d30b542f29) --- gcc/omp-low.c | 5 +- .../c-c++-common/goacc/loop-auto-1.c | 50 +-- .../c-c++-common/goacc/loop-auto-2.c | 4 +- .../c-c++-common/goacc/loop-auto-3.c | 78 .../gcc.dg/goacc/loop-processing-1.c | 2 +- .../gfortran.dg/goacc/loop-auto-1.f90 | 88 +++ .../libgomp.oacc-c-c++-common/loop-auto-1.c | 20 ++--- 7 files changed, 207 insertions(+), 40 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/loop-auto-3.c create mode 100644 gcc/testsuite/gfortran.dg/goacc/loop-auto-1.f90 diff --git a/gcc/omp-low.c b/gcc/omp-low.c index fdabf67249b..24685fd012c 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -5647,9 +5647,10 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses, tag |= OLF_GANG_STATIC; } - /* In a parallel region, loops are implicitly INDEPENDENT. */ + /* In a parallel region, loops without auto and seq clauses are + implicitly INDEPENDENT. */ omp_context *tgt = enclosing_target_ctx (ctx); - if (!tgt || is_oacc_parallel (tgt)) + if ((!tgt || is_oacc_parallel (tgt)) && !(tag & (OLF_SEQ | OLF_AUTO))) tag |= OLF_INDEPENDENT; if (tag & OLF_TILE) diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c index 124befc4002..dcad07f11c8 100644 --- a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c +++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c @@ -10,7 +10,7 @@ void Foo () #pragma acc loop seq for (int jx = 0; jx < 10; jx++) {} -#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ +#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */ for (int jx = 0; jx < 10; jx++) {} } @@ -20,7 +20,7 @@ void Foo () #pragma acc loop auto for (int jx = 0; jx < 10; jx++) {} -#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ +#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */ for (int jx = 0; jx < 10; jx++) { #pragma acc loop vector @@ -51,7 +51,7 @@ void 
Foo () #pragma acc loop vector for (int jx = 0; jx < 10; jx++) { -#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ +#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */ for (int kx = 0; kx < 10; kx++) {} } @@ -64,27 +64,27 @@ void Foo () } -#pragma acc loop auto +#pragma acc loop auto independent for (int ix = 0; ix < 10; ix++) { -#pragma acc loop auto +#pragma acc loop auto independent for (int jx = 0; jx < 10; jx++) { -#pragma acc loop auto +#pragma acc loop auto independent for (int kx = 0; kx < 10; kx++) {} } } -#pragma acc loop auto +#pragma acc loop auto independent for (int ix = 0; ix < 10; ix++) { -#pragma acc loop auto +#pragma acc loop auto independent for (int jx = 0; jx < 10; jx++) { -#pragma acc loop auto /* { dg-warning "insufficient partitioning&q
[patch,openacc] Better distinguish OpenACC and OpenMP sections in libgomp.texi
This patch updates the libgomp documentation to more clearly identify OpenMP-specific sections. Specifically, the sections "Runtime Library Routine" and "Environment Variables" are now prefixed by OpenMP, because those sections are applicable to OpenACC. Is this OK for trunk? I verified that libgomp.pdf looks ok. Thanks, Cesar [OpenACC] Update _OPENACC value and documentation for OpenACC 2.5 2018-XX-YY Thomas Schwinge Cesar Philippidis gcc/c-family/ * c-cppbuiltin.c (c_cpp_builtins): Update "_OPENACC" to "201510". gcc/fortran/ * cpp.c (cpp_define_builtins): Update "_OPENACC" to "201510". * gfortran.texi: Update for OpenACC 2.5. * Intrinsic.texi: Likewise. * invoke.texi: Likewise. gcc/testsuite/ * c-c++-common/cpp/openacc-define-3.c: Update. * gfortran.dg/openacc-define-3.f90: Likewise. gcc/ * doc/invoke.texi: Update for OpenACC 2.5. libgomp/ * libgomp.texi: Update for OpenACC 2.5. * openacc.f90 (openacc_version): Update to "201510". * openacc_lib.h (openacc_version): Likewise. * testsuite/libgomp.oacc-fortran/openacc_version-1.f: Update. * testsuite/libgomp.oacc-fortran/openacc_version-2.f90: Update. 
(cherry picked from gomp-4_0-branch r248057, ccbbcb70569) --- gcc/c-family/c-cppbuiltin.c | 2 +- gcc/doc/invoke.texi | 4 +++- gcc/fortran/cpp.c | 2 +- gcc/fortran/gfortran.texi | 16 +- gcc/fortran/intrinsic.texi| 6 +++--- gcc/fortran/invoke.texi | 4 +--- .../c-c++-common/cpp/openacc-define-3.c | 2 +- .../gfortran.dg/openacc-define-3.f90 | 2 +- libgomp/libgomp.texi | 21 ++- libgomp/openacc.f90 | 2 +- libgomp/openacc_lib.h | 2 +- .../libgomp.oacc-fortran/openacc_version-1.f | 2 +- .../openacc_version-2.f90 | 2 +- 13 files changed, 31 insertions(+), 36 deletions(-) diff --git a/gcc/c-family/c-cppbuiltin.c b/gcc/c-family/c-cppbuiltin.c index 96a6b4dfd2b..f2a273b6ac7 100644 --- a/gcc/c-family/c-cppbuiltin.c +++ b/gcc/c-family/c-cppbuiltin.c @@ -1391,7 +1391,7 @@ c_cpp_builtins (cpp_reader *pfile) cpp_define (pfile, "__SSP__=1"); if (flag_openacc) -cpp_define (pfile, "_OPENACC=201306"); +cpp_define (pfile, "_OPENACC=201510"); if (flag_openmp) cpp_define (pfile, "_OPENMP=201511"); diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 94304c314cf..34d7ff71512 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -2161,10 +2161,12 @@ freestanding and hosted environments. Enable handling of OpenACC directives @code{#pragma acc} in C/C++ and @code{!$acc} in Fortran. When @option{-fopenacc} is specified, the compiler generates accelerated code according to the OpenACC Application -Programming Interface v2.0 @w{@uref{https://www.openacc.org}}. This option +Programming Interface v2.5 @w{@uref{https://www.openacc.org}}. This option implies @option{-pthread}, and thus is only supported on targets that have support for @option{-pthread}. +See @uref{https://gcc.gnu.org/wiki/OpenACC} for more information. 
+ @item -fopenacc-dim=@var{geom} @opindex fopenacc-dim @cindex OpenACC accelerator programming diff --git a/gcc/fortran/cpp.c b/gcc/fortran/cpp.c index 0b3de42e832..14871129ff6 100644 --- a/gcc/fortran/cpp.c +++ b/gcc/fortran/cpp.c @@ -165,7 +165,7 @@ cpp_define_builtins (cpp_reader *pfile) cpp_define (pfile, "_LANGUAGE_FORTRAN=1"); if (flag_openacc) -cpp_define (pfile, "_OPENACC=201306"); +cpp_define (pfile, "_OPENACC=201510"); if (flag_openmp) cpp_define (pfile, "_OPENMP=201511"); diff --git a/gcc/fortran/gfortran.texi b/gcc/fortran/gfortran.texi index 30934046a49..59a69457fe0 100644 --- a/gcc/fortran/gfortran.texi +++ b/gcc/fortran/gfortran.texi @@ -476,9 +476,7 @@ used on real-world programs. In particular, the supported extensions include OpenMP, Cray-style pointers, some old vendor extensions, and several Fortran 2003 and Fortran 2008 features, including TR 15581. However, it is still under development and has a few remaining rough edges. -There also is initial support for OpenACC. -Note that this is an experimental feature, incomplete, and subject to -change in future versions of GCC. See +There also is support for OpenACC. See @uref{https://gcc.gnu.org/wiki/OpenACC} for more information. At present, the GNU Fortran compiler passes the @@ -538,10 +536,8 @@ status} and @ref{Fortran 2018 status} sections of the documentation. Additionally, the GNU Fortran compilers supports the OpenMP specification (version 4.0 and most of the features of the 4.5 version, @url{http://openmp.org/@/wp/@/openmp-specifications/}). -There also is initial support for t
Re: [openacc] Teach gfortran to lower OpenACC routine dims
On 09/19/2018 03:27 PM, Bernhard Reutner-Fischer wrote: > On Wed, 5 Sep 2018 12:52:03 -0700 > Cesar Philippidis wrote: > >> At present, gfortran does not encode the gang, worker or vector >> parallelism clauses when it creates acc routines dim attribute for >> subroutines and functions. While support for acc routine is lacking in >> other areas in gfortran (including modules), this patch is important >> because it encodes the parallelism attributes using the same function >> as the C and C++ FEs. This will become important with the forthcoming >> nvptx vector length extensions, because large vectors are not >> supported in acc routines yet. >> >> Is this OK for trunk? I regtested and bootstrapped for x86_64 with >> nvptx offloading. > >> diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c >> index 94a7f7eaa50..d48c9351e25 100644 >> --- a/gcc/fortran/openmp.c >> +++ b/gcc/fortran/openmp.c >> @@ -2234,34 +2234,45 @@ gfc_match_oacc_cache (void) >>return MATCH_YES; >> } >> >> -/* Determine the loop level for a routine. */ >> +/* Determine the loop level for a routine. Returns >> OACC_FUNCTION_NONE >> + if any error is detected. 
*/ >> >> -static int >> +static oacc_function >> gfc_oacc_routine_dims (gfc_omp_clauses *clauses) >> { >>int level = -1; >> + oacc_function ret = OACC_FUNCTION_AUTO; >> >>if (clauses) >> { >>unsigned mask = 0; >> >>if (clauses->gang) >> -level = GOMP_DIM_GANG, mask |= GOMP_DIM_MASK (level); >> +{ >> + level = GOMP_DIM_GANG, mask |= GOMP_DIM_MASK (level); >> + ret = OACC_FUNCTION_GANG; >> +} >>if (clauses->worker) >> -level = GOMP_DIM_WORKER, mask |= GOMP_DIM_MASK (level); >> +{ >> + level = GOMP_DIM_WORKER, mask |= GOMP_DIM_MASK (level); >> + ret = OACC_FUNCTION_WORKER; >> +} >>if (clauses->vector) >> -level = GOMP_DIM_VECTOR, mask |= GOMP_DIM_MASK (level); >> +{ >> + level = GOMP_DIM_VECTOR, mask |= GOMP_DIM_MASK (level); >> + ret = OACC_FUNCTION_VECTOR; >> +} >>if (clauses->seq) >> -level = GOMP_DIM_MAX, mask |= GOMP_DIM_MASK (level); >> +{ >> + level = GOMP_DIM_MAX, mask |= GOMP_DIM_MASK (level); >> + ret = OACC_FUNCTION_SEQ; >> +} >> >>if (mask != (mask & -mask)) >> -gfc_error ("Multiple loop axes specified for routine"); >> +ret = OACC_FUNCTION_NONE; >> } >> >> - if (level < 0) >> -level = GOMP_DIM_MAX; >> - >> - return level; >> + return ret; >> } >> >> match >> @@ -2272,6 +2283,8 @@ gfc_match_oacc_routine (void) >>match m; >>gfc_omp_clauses *c = NULL; >>gfc_oacc_routine_name *n = NULL; >> + oacc_function dims = OACC_FUNCTION_NONE; > > Unneeded initialisation of dims. ACK. 
>> + bool seen_error = false; >> >>old_loc = gfc_current_locus; >> >> @@ -2318,17 +2331,15 @@ gfc_match_oacc_routine (void) >> } >>else >> { >> - gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C"); >> - gfc_current_locus = old_loc; >> - return MATCH_ERROR; >> + gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %L", >> _loc); >> + goto cleanup; >> } >> >>if (gfc_match_char (')') != MATCH_YES) >> { >> - gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C, >> expecting" >> - " ')' after NAME"); >> - gfc_current_locus = old_loc; >> - return MATCH_ERROR; >> + gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %L, >> expecting" >> + " ')' after NAME", _loc); >> + goto cleanup; >> } >> } >> >> @@ -2337,26 +2348,83 @@ gfc_match_oacc_routine (void) >>!= MATCH_YES)) >> return MATCH_ERROR; >> >> + /* Scan for invalid routine geometry. */ >> + dims = gfc_oacc_routine_dims (c); >> + if (dims == OACC_FUNCTION_NONE) >> +{ >> + gfc_error ("Multiple loop axes specified in !$ACC ROUTINE at >> %L", >> + _loc); >> + >> + /* Don't abort early, because it's important to let the user >> + know of any potential duplicate routine directives. */ >> + seen_error =
[nvptx] vector length patch series
Hi Tom, Here is a link to our nvptx vector length patches on github: https://github.com/cesarjp/gcc/tree/trunk-og8-vl-private Specifically, the code lives in the trunk-og8-vl-private branch. There are a couple of outstanding dependency patches: * Teach gfortran to lower OpenACC routine dims https://gcc.gnu.org/ml/gcc-patches/2018-09/msg00368.html b186c651f37 [openacc] Make GFC default to -1 for OpenACC routine dims * Add target hook TARGET_GOACC_ADJUST_PARALLELISM https://gcc.gnu.org/ml/gcc-patches/2018-09/msg00369.html 49b2039013e [openacc] Add target hook TARGET_GOACC_ADJUST_PARALLELISM * Enable firstprivate OpenACC reductions https://gcc.gnu.org/ml/gcc-patches/2018-09/msg00370.html 1f70cdb7cf0 (HEAD -> trunk-og8-vl-private, github/trunk-og8-vl-private) [OpenACC] Enable firstprivate OpenACC reductions * Adjust offsets for present data clauses https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01213.html 8bcda2f1a2b [libgomp, OpenACC] Adjust offsets for present data clauses Of the patches in trunk-og8-vl-private, the following are just general refactors and cleanups which do not change any functionality: 7eb378e9b0c [nvptx] Generalize state propagation and synchronization 10aa1f74d5a [nvptx] Use MAX, MIN, ROUND_UP macros 9dfe611f3d8 [nvptx] Use TARGET_SET_CURRENT_FUNCTION 4fbe0e812bd [nvptx] Add axis_dim fbe43dac79f [nvptx] Add thread count parm to bar.sync 57d3f8c88ff [nvptx] only use one bar.sync barriers in OpenACC offloaded code f14d0e882eb [nvptx] Fix whitespace in nvptx_single and nvptx_neuter_pars 82d81fffb0f [nvptx] make nvptx state propagation function names more generic 95703737e09 [nvptx] consolidate offloaded function attributes into struct offload_attrs 8c9e897c36d [nvptx] Rename worker_bcast variables oacc_bcast. 45147e7e3f3 [nvptx] update openacc dim macros caa641ecfb4 [nvptx] Update insufficient launch message to accommodate large vectors The following patches actually implement the new vector length functionality. 
Note that trunk doesn't support missing arguments between colons in -fopenacc-dim like -fopenacc-dim=::64, so I had to remove or adjust a couple of your test cases from og8. 591973d3c3a [nvptx] use user-defined vectors when possible fb9cefa5b17 [nvptx] Handle large vector reductions 5154d363d07 [nvptx] Force vl32 if calling vector-partitionable routines f62e3afcf6a [nvptx, openacc] Don't emit barriers for empty loops 4cc408658fb [PR85246] [nvptx] Fix propagation of branch cond in vw-neutered code d97ed5fc580 [nvptx] Simplifly logic in nvptx_single 62f0c5df3dd [nvptx] Enable worker partitioning with warp-sized vector_length f2cf96b0df3 [nvptx] Handle large vectors in libgomp eba014c260c [nvptx] Enable large vectors f31d8b98ca1 [nvptx] Add vector_length 128 testcases Let me know if you encounter any problems with that github branch. This branch has recently been rebased against trunk. Further, I bootstrapped and regtested it on x86_64 Linux target with nvptx offloading. Thanks, Cesar
Re: [PATCH,nvptx] Remove use of CUDA unified memory in libgomp
On 08/01/2018 04:12 AM, Tom de Vries wrote: > On 07/31/2018 05:27 PM, Cesar Philippidis wrote: >>/* Copy the (device) pointers to arguments to the device (dp and hp might >> in >> fact have the same value on a unified-memory system). */ > > This comment needs to be updated, right? > >> - CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, >> + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, >> mapnum * sizeof (void *)); >>GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" >> " gangs=%u, workers=%u, vectors=%u\n", >> -- 2.7.4 >> > > Otherwise OK. Thanks. I've committed the attached patch to trunk. Cesar [nvptx] Remove use of CUDA unified memory in libgomp 2018-09-18 Cesar Philippidis libgomp/ * plugin/plugin-nvptx.c (struct cuda_map): New. (struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev, h_tail with (cuda_map *) map. (cuda_map_create): New function. (cuda_map_destroy): New function. (map_init): Update to use a linked list of cuda_map objects. (map_fini): Likewise. (map_pop): Likewise. (map_push): Likewise. Return CUdeviceptr instead of void. (init_streams_for_device): Remove stales references to ptx_stream members. (select_stream_for_async): Likewise. (nvptx_exec): Update call to map_init. 
(cherry picked from gomp-4_0-branch r242614) --- libgomp/plugin/plugin-nvptx.c | 170 ++ 1 file changed, 91 insertions(+), 79 deletions(-) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index bae1b05..6492e5f 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -192,20 +192,20 @@ cuda_error (CUresult r) static unsigned int instantiated_devices = 0; static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; +struct cuda_map +{ + CUdeviceptr d; + size_t size; + bool active; + struct cuda_map *next; +}; + struct ptx_stream { CUstream stream; pthread_t host_thread; bool multithreaded; - - CUdeviceptr d; - void *h; - void *h_begin; - void *h_end; - void *h_next; - void *h_prev; - void *h_tail; - + struct cuda_map *map; struct ptx_stream *next; }; @@ -217,101 +217,114 @@ struct nvptx_thread struct ptx_device *ptx_dev; }; +static struct cuda_map * +cuda_map_create (size_t size) +{ + struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); + + assert (map); + + map->next = NULL; + map->size = size; + map->active = false; + + CUDA_CALL_ERET (NULL, cuMemAlloc, >d, size); + assert (map->d); + + return map; +} + +static void +cuda_map_destroy (struct cuda_map *map) +{ + CUDA_CALL_ASSERT (cuMemFree, map->d); + free (map); +} + +/* The following map_* routines manage the CUDA device memory that + contains the data mapping arguments for cuLaunchKernel. Each + asynchronous PTX stream may have multiple pending kernel + invocations, which are launched in a FIFO order. As such, the map + routines maintains a queue of cuLaunchKernel arguments. + + Calls to map_push and map_pop must be guarded by ptx_event_lock. + Likewise, calls to map_init and map_fini are guarded by + ptx_dev_lock inside GOMP_OFFLOAD_init_device and + GOMP_OFFLOAD_fini_device, respectively. 
*/ + static bool map_init (struct ptx_stream *s) { int size = getpagesize (); assert (s); - assert (!s->d); - assert (!s->h); - - CUDA_CALL (cuMemAllocHost, >h, size); - CUDA_CALL (cuMemHostGetDevicePointer, >d, s->h, 0); - assert (s->h); + s->map = cuda_map_create (size); - s->h_begin = s->h; - s->h_end = s->h_begin + size; - s->h_next = s->h_prev = s->h_tail = s->h_begin; - - assert (s->h_next); - assert (s->h_end); return true; } static bool map_fini (struct ptx_stream *s) { - CUDA_CALL (cuMemFreeHost, s->h); + assert (s->map->next == NULL); + assert (!s->map->active); + + cuda_map_destroy (s->map); + return true; } static void map_pop (struct ptx_stream *s) { - assert (s != NULL); - assert (s->h_next); - assert (s->h_prev); - assert (s->h_tail); - - s->h_tail = s->h_next; - - if (s->h_tail >= s->h_end) -s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); + struct cuda_map *next; - if (s->h_next == s->h_tail) -s->h_prev = s->h_next; + assert (s != NULL); - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); + if (s->map->next == NULL) +{ + s->map->active = false; + return; +} - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); + next = s->map->next; + cuda_map_destroy (s->map); + s->map = nex
[patch,nvptx] Add atomic_fetch* support for SImode arguments.
I've committed this patch, which extends the nvptx atomic_fetch_ pattern to accept SImode arguments regardless of the -misa argument supplied. Tom had pre-approved this patch a while ago. As the test case demonstrates, it only works with 32-bit pointers. While adding the new test case, I noticed that I named atomic-fetch-2.c incorrectly; there should be an underscore between atomic and fetch. This patch also fixes that. I tested this patch using both a standalone nvptx compiler and x86_64 Linux with nvptx offloading. Cesar [nvptx] Add atomic_fetch* support for SImode arguments. 2018-09-17 Cesar Philippidis Bernd Schmidt gcc/ * config/nvptx/nvptx.md (atomic_fetch_): Enable with SImode args. gcc/testsuite/ * gcc.target/nvptx/atomic-fetch-2.c: Rename to ... * gcc.target/nvptx/atomic_fetch-2.c: ... this. * gcc.target/nvptx/atomic_fetch-3.c: New test. --- gcc/config/nvptx/nvptx.md | 2 +- .../{atomic-fetch-2.c => atomic_fetch-2.c}| 0 .../gcc.target/nvptx/atomic_fetch-3.c | 24 +++ 3 files changed, 25 insertions(+), 1 deletion(-) rename gcc/testsuite/gcc.target/nvptx/{atomic-fetch-2.c => atomic_fetch-2.c} (100%) create mode 100644 gcc/testsuite/gcc.target/nvptx/atomic_fetch-3.c diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index dd6032d021b..ca00b1d8073 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1449,7 +1449,7 @@ UNSPECV_LOCK)) (set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (match_dup 1))] - "TARGET_SM35" + "mode == SImode || TARGET_SM35" "%.\\tatom%A1.b%T0.\\t%0, %1, %2;" [(set_attr "atomic" "true")]) diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-2.c similarity index 100% rename from gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c rename to gcc/testsuite/gcc.target/nvptx/atomic_fetch-2.c diff --git a/gcc/testsuite/gcc.target/nvptx/atomic_fetch-3.c b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-3.c new file mode 100644 index 000..36a83ebba9b --- /dev/null +++ 
b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-3.c @@ -0,0 +1,24 @@ +/* Test the nvptx atomic instructions for __atomic_fetch_OP for + SImode arguments. */ + +/* { dg-do compile } */ +/* { dg-options "-O2 -m32" } */ + +int +main() +{ + unsigned long a = ~0; + unsigned b = 0xa; + + __atomic_fetch_add (, b, 0); + __atomic_fetch_and (, b, 0); + __atomic_fetch_or (, b, 0); + __atomic_fetch_xor (, b, 0); + + return a; +} + +/* { dg-final { scan-assembler "atom.add.u32" } } */ +/* { dg-final { scan-assembler "atom.b32.and" } } */ +/* { dg-final { scan-assembler "atom.b32.or" } } */ +/* { dg-final { scan-assembler "atom.b32.xor" } } */ -- 2.17.1
Re: [PATCH, OpenACC] C++ reference mapping (PR middle-end/86336)
On 09/10/2018 10:37 AM, Jason Merrill wrote: > On Mon, Sep 10, 2018 at 4:05 AM, Julian Brown wrote: >> This patch (by Cesar) changes the way C++ references are mapped in >> OpenACC regions, fixing an ICE in the non-scalar-data.C testcase. >> >> Post-patch, references are mapped like this (from the omplower dump): >> >> map(force_present:*x [len: 4]) map(firstprivate ref:x [pointer assign, bias: >> 0]) >> >> Tested with offloading to NVPTX and bootstrapped. OK for trunk? >> >> Thanks, >> >> Julian >> >> ChangeLog >> >> 2018-09-09 Cesar Philippidis >> Julian Brown >> >> PR middle-end/86336 >> >> (gimplify_adjust_omp_clauses_1): Update handling of mapping of C++ >> references. > > How is reference handling specified differently between OpenMP and > OpenACC? It seems strange for them to differ. Both OpenACC and OpenMP privatize mapped array pointers on the accelerator for subarrays in the same way. However, for pointers without subarrays, OpenMP treats them as zero-length arrays, whereas OpenACC treats them as ordinary scalars so that the pointer target will not get remapped on the accelerator (which is odd because there's a deviceptr clause for that). Scalars in C++ are special, because references must be treated like an array of length one, for lack of better terminology. > In any case, you shouldn't need to check lang_GNU_CXX since we're > already calling the langhook. Julian, can you look into this? I'm traveling tomorrow. Cesar
Re: [PATCH, OpenACC 2.5, libgomp] Add *_async versions of runtime library API functions
On 09/10/2018 08:04 AM, Chung-Lin Tang wrote: > GOACC_2.0 { > Index: libgomp/oacc-mem.c > === > --- libgomp/oacc-mem.c(revision 264192) > +++ libgomp/oacc-mem.c(working copy) > @@ -153,8 +153,9 @@ acc_free (void *d) > gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); > } > > -void > -acc_memcpy_to_device (void *d, void *h, size_t s) > +static void > +memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, > + const char *libfnname) This showed up oddly in the diff, but memcpy_tofrom_device is a new internal function that's not part of the public API. It's nice that you were able to merge the to/from functions together. I think this is safe in terms of backwards compatibility. > { >/* No need to call lazy open here, as the device pointer must have > been obtained from a routine that did that. */ > @@ -164,31 +165,49 @@ acc_free (void *d) > >if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) > { > - memmove (d, h, s); > + if (from) > + memmove (h, d, s); > + else > + memmove (d, h, s); >return; > } > > - if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) > -gomp_fatal ("error in %s", __FUNCTION__); > + if (async > acc_async_sync) > +thr->dev->openacc.async_set_async_func (async); > + > + bool ret = (from > + ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) > + : thr->dev->host2dev_func (thr->dev->target_id, d, h, s)); > + > + if (async > acc_async_sync) > +thr->dev->openacc.async_set_async_func (acc_async_sync); > + > + if (!ret) > +gomp_fatal ("error in %s", libfnname); > } > > void > -acc_memcpy_from_device (void *h, void *d, size_t s) > +acc_memcpy_to_device (void *d, void *h, size_t s) > { > - /* No need to call lazy open here, as the device pointer must have > - been obtained from a routine that did that. 
*/ > - struct goacc_thread *thr = goacc_thread (); > + memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); > +} > > - assert (thr && thr->dev); > +void > +acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) > +{ > + memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); > +} > > - if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) > -{ > - memmove (h, d, s); > - return; > -} > +void > +acc_memcpy_from_device (void *h, void *d, size_t s) > +{ > + memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); > +} > > - if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) > -gomp_fatal ("error in %s", __FUNCTION__); > +void > +acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) > +{ > + memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); > } > > /* Return the device pointer that corresponds to host data H. Or NULL > @@ -428,7 +447,7 @@ acc_unmap_data (void *h) > #define FLAG_COPY (1 << 2) > > static void * > -present_create_copy (unsigned f, void *h, size_t s) > +present_create_copy (unsigned f, void *h, size_t s, int async) Likewise, this is another internal function, so it shouldn't break anything. > { >void *d; >splay_tree_key n; > @@ -490,11 +509,17 @@ static void * > >gomp_mutex_unlock (_dev->lock); > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (async); > + >tgt = gomp_map_vars (acc_dev, mapnum, , NULL, , , > true, > GOMP_MAP_VARS_OPENACC); >/* Initialize dynamic refcount. 
*/ >tgt->list[0].key->dynamic_refcount = 1; > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (acc_async_sync); > + >gomp_mutex_lock (_dev->lock); > >d = tgt->to_free; > @@ -510,19 +535,32 @@ static void * > void * > acc_create (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, > acc_async_sync); > } > > +void > +acc_create_async (void *h, size_t s, int async) > +{ > + present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async); > +} > + > void * > acc_copyin (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, > + acc_async_sync); > } > > +void > +acc_copyin_async (void *h, size_t s, int async) > +{ > + present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async); > +} > + > void * > acc_present_or_create (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, > acc_async_sync); > } > > /* acc_pcreate is acc_present_or_create by a different name. */ > @@ -539,7
Re: [patch,nvptx] Basic -misa support for nvptx
On 09/05/2018 07:30 AM, Tom de Vries wrote: > On 09/05/2018 12:19 AM, Cesar Philippidis wrote: >> On 09/02/2018 07:57 AM, Cesar Philippidis wrote: >>> On 09/01/2018 12:04 PM, Tom de Vries wrote: >>>> On 08/31/2018 04:14 PM, Cesar Philippidis wrote: >>> >>>>> Is this patch OK for trunk? >>>>> >>>> >>>> Well, how did you test this ( >>>> https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and >>>> testing. State the host and target combinations you used to do proper >>>> testing as described above, and the results of your testing.") ? >>> >>> I tested the standalone nvptx compiler. I'll retest with libgomp with >>> -misa=sm_35. Bootstrapping won't help much here, unfortunately. >>>>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c >>>>> @@ -0,0 +1,24 @@ >>>>> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35 >>>>> + targets. */ >>>>> + >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -misa=sm_35" } */ >>>>> + >>>>> +int >>>>> +main() >>>>> +{ >>>>> + unsigned long long a = ~0; >>>>> + unsigned b = 0xa; >>>>> + >>>>> + __atomic_fetch_add (, b, 0); >>>>> + __atomic_fetch_and (, b, 0); >>>>> + __atomic_fetch_or (, b, 0); >>>>> + __atomic_fetch_xor (, b, 0); >>>>> + >>>>> + return a; >>>>> +} >>>>> + >>>>> +/* { dg-final { scan-assembler "atom.add.u64" } } */ >>>>> +/* { dg-final { scan-assembler "atom.b64.and" } } */ >>>>> +/* { dg-final { scan-assembler "atom.b64.or" } } */ >>>>> +/* { dg-final { scan-assembler "atom.b64.xor" } } */ >>>>> -- 2.17.1 >>>>> >>>> >>>> Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not >>>> testcase does not use this difference, so that needs to be fixed, or for >>>> bonus points, changed into a scan-assembler testcase). >>>> >>>> The documentation uses "op.type", we should fix the compiler to emit >>>> that consistently. Separate patch that fixes that pre-approved. >>> >>> ACK. I think there are a lot of other cases like that in the BE. 
>>> >>>> This is ok (with, as I mentioned above, the SI part split off into a >>>> separate patch), on the condition that you test libgomp with >>>> -foffload=-misa=sm_35. >> >> Adding -foffload=misa=sm_35 didn't work because the host gcc doesn't >> support the -misa flag. > > That doesn't make sense to me. For me this works without any problems. > Have you tried a clean build? I was incorrectly setting ALWAYS_CFLAGS to use -foffload=-misa=sm_35. That didn't work on the host. But lappend'ing tagopt did work. >> When I forced the nvptx BE to set TARGET_SM35 to >> always be true, I ran into problems with SM_30 code linking against >> SM_35 code. > > I also cannot reproduce this, works for me. I found the problem. I wasn't using a clean build. Besides, with the tagopt change in libgomp, I didn't need to force the -misa=sm_35 flag everywhere. >> Therefore, I don't think this patch is ready for trunk yet. >>> By the way, is -misa really necessary for atomic_fetch_? >> Looking at the PTX documentation I see >> <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#changes-in-ptx-isa-version-3-1>: >> >> PTX ISA version 3.1 introduces the following new features: >> >> * Support for sm_35 target architecture. >> * Extends atomic and reduction instructions to perform 64-bit {and, or, >> xor} operations, and 64-bit integer {min, max} operations. >> >> Is there a table for which list which GPUs are compatible with which >> instructions? > > Yes, every instruction has a table in the ptx manual, and there's a "PTX > ISA Notes" entry. > > For the atom instruction in ptx isa 3.1 manual, we have "PTX ISA Notes": > ... > atom.global requires sm_11 or higher. > atom.shared requires sm_12 or higher. > 64-bit atom.global.{add,cas,exch} require sm_12 or higher. > 64-bit atom.shared.{add,cas,exch} require sm_20 or higher. > 64-bit atom.{and,or.xor,min,max} require sm_35 or higher. > atom.add.f32 requires sm_20 or higher. > Use of generic addressing requires sm_20 or higher. 
> ... Thanks! I'll com
[OpenACC] Enable firstprivate OpenACC reductions
This patch teaches the gimplifier how to pass certain OpenACC reduction variables as firstprivate, and not with an implicit copy directive. This matches the default behavior for the implicit data mappings of scalar variables inside OpenACC parallel regions. It should be noted that the gimplifier will still implicitly map reduction variables on loops immediately enclosed inside a parallel region, like so #pragma acc parallel #pragma acc loop reduction(+:sum) as copy. This change only impacts reduction variables inside nested acc loops like #pragma acc parallel #pragma acc loop for (...) { #pragma acc loop reduction(+:s2) Here s2 will be transferred into the accelerator as firstprivate instead of copy. Is this OK for trunk? I regtested and bootstrapped for x86_64 with nvptx offloading. Cesar [OpenACC] Enable firstprivate OpenACC reductions 2018-XX-YY Cesar Philippidis Chung-Lin Tang gcc/ * gimplify.c (omp_add_variable): Enable firstprivate reduction variables. gcc/testsuite/ * c-c++-common/goacc/reduction-8.c: New test. libgomp/ * testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c: New test. * testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c: New test. diff --git a/gcc/gimplify.c b/gcc/gimplify.c index dbd0f0ebd0c..4d954e20788 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -6823,20 +6823,27 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) else splay_tree_insert (ctx->variables, (splay_tree_key)decl, flags); - /* For reductions clauses in OpenACC loop directives, by default create a - copy clause on the enclosing parallel construct for carrying back the - results. */ + /* For OpenACC loop directives, when a reduction clause is placed on + the outermost acc loop within an acc parallel or kernels + construct, it must have an implied copy data mapping. E.g.
+ + #pragma acc parallel + { + #pragma acc loop reduction (+:sum) + + a copy clause for sum should be added on the enclosing parallel + construct for carrying back the results. */ if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION)) { struct gimplify_omp_ctx *outer_ctx = ctx->outer_context; - while (outer_ctx) + if (outer_ctx) { n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl); if (n != NULL) { /* Ignore local variables and explicitly declared clauses. */ if (n->value & (GOVD_LOCAL | GOVD_EXPLICIT)) - break; + ; else if (outer_ctx->region_type == ORT_ACC_KERNELS) { /* According to the OpenACC spec, such a reduction variable @@ -6856,9 +6863,7 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) { splay_tree_insert (outer_ctx->variables, (splay_tree_key)decl, GOVD_MAP | GOVD_SEEN); - break; } - outer_ctx = outer_ctx->outer_context; } } } diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-8.c b/gcc/testsuite/c-c++-common/goacc/reduction-8.c new file mode 100644 index 000..8a0283f4ac3 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/reduction-8.c @@ -0,0 +1,94 @@ +/* { dg-additional-options "-fdump-tree-gimple" } */ + +#define n 1000 + +int +main(void) +{ + int i, j; + int result, array[n]; + +#pragma acc parallel loop reduction (+:result) + for (i = 0; i < n; i++) +result ++; + +#pragma acc parallel +#pragma acc loop reduction (+:result) + for (i = 0; i < n; i++) +result ++; + +#pragma acc parallel +#pragma acc loop + for (i = 0; i < n; i++) +{ + result = i; + +#pragma acc loop reduction(+:result) + for (j = 0; j < n; j++) + result ++; + + array[i] = result; +} + +#pragma acc parallel +#pragma acc loop + for (i = 0; i < n; i++) +{ + result = i; + +#pragma acc loop worker vector reduction(+:result) + for (j = 0; j < n; j++) + result ++; + + array[i] = result; +} + +#pragma acc parallel +#pragma acc loop // { dg-warning "insufficient partitioning" } + for (i = 0; i < n; i++) +{ + result = i; + +#pragma acc 
loop gang reduction(+:result) + for (j = 0; j < n; j++) + result ++; + + array[i] = result; +} + +#pragma acc parallel copy(result) +#pragma acc loop // { dg-warning "insufficient partitioning" } + for (i = 0; i < n; i++) +{ + result = i; + +#pragma acc loop gang reduction(+:result) + for (j = 0; j < n; j++) + result ++; + + array[i] = result; +} + +#pragma acc kernels +#pragma acc loop + for (i = 0; i < n; i++) +{ + result = i; + +#pragma acc loop reduction(+:result) + for (j = 0; j < n; j++) + result ++; + + array[i] = result; +} + + return 0; +} + +/* Check that default copy maps are generated for loop reductions. */ +/* { dg-final { scan-tree-dump-times "reductio
[patch][OpenACC] Add target hook TARGET_GOACC_ADJUST_PARALLELISM
At present, GCC fixes the vector length on all targets. However, that is an artificial restriction. This patch introduces a new TARGET_GOACC_ADJUST_PARALLELISM hook that enables the runtime to correct the default number of acc workers and vectors. Extra care needs to be taken to ensure that large vectors fit inside workers. The target hook itself doesn't do anything for the host, but the nvptx BE will make use of it. Is this patch OK for trunk? I regtested and bootstrapped for x86_64 with nvptx offloading. Thanks, Cesar [openacc] Add target hook TARGET_GOACC_ADJUST_PARALLELISM gcc/ * doc/tm.texi.in: Add placeholder for TARGET_GOACC_ADJUST_PARALLELISM. * doc/tm.texi: Regenerate. * omp-offload.c (oacc_loop_fixed_partitions): Use the adjust_parallelism hook to modify this_mask. (oacc_loop_auto_partitions): Use the adjust_parallelism hook to modify this_mask and loop->mask. (default_goacc_adjust_parallelism): New function. * target.def (adjust_parallelism): New hook. * targhooks.h (default_goacc_adjust_parallelism): Declare. diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index a40f45ade07..365a7bbec90 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6029,6 +6029,12 @@ This hook should return the maximum size of a particular dimension, or zero if unbounded. @end deftypefn +@deftypefn {Target Hook} unsigned TARGET_GOACC_ADJUST_PARALLELISM (unsigned @var{this_mask}, unsigned @var{outer_mask}) +This hook allows the accelerator compiler to remove any unused +parallelism exposed in the current loop @var{THIS_MASK}, and the +enclosing loop @var{OUTER_MASK}. It returns an adjusted mask.
+@end deftypefn + @deftypefn {Target Hook} bool TARGET_GOACC_FORK_JOIN (gcall *@var{call}, const int *@var{dims}, bool @var{is_fork}) This hook can be used to convert IFN_GOACC_FORK and IFN_GOACC_JOIN function calls to target-specific gimple, or indicate whether they diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 39a214e9b2c..9edd2e7ecaf 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4145,6 +4145,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_GOACC_DIM_LIMIT +@hook TARGET_GOACC_ADJUST_PARALLELISM + @hook TARGET_GOACC_FORK_JOIN @hook TARGET_GOACC_REDUCTION diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c index 0abf0283c9e..1659febd2b1 100644 --- a/gcc/omp-offload.c +++ b/gcc/omp-offload.c @@ -1218,6 +1218,13 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) } } + /* Ideally, we should be coalescing parallelism here if the + hardware supports it. E.g. Instead of partitioning a loop + across worker and vector axes, sometimes the hardware can + execute those loops together without resorting to placing + extra thread barriers. */ + this_mask = targetm.goacc.adjust_parallelism (this_mask, outer_mask); + mask_all |= this_mask; if (loop->flags & OLF_TILE) @@ -1302,6 +1309,7 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask, this_mask ^= loop->e_mask; } + this_mask = targetm.goacc.adjust_parallelism (this_mask, outer_mask); loop->mask |= this_mask; } @@ -1350,6 +1358,8 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask, } loop->mask |= this_mask; + loop->mask = targetm.goacc.adjust_parallelism (loop->mask, outer_mask); + if (!loop->mask && noisy) warning_at (loop->loc, 0, tiling @@ -1684,6 +1694,15 @@ default_goacc_dim_limit (int ARG_UNUSED (axis)) #endif } +/* Default adjustment of loop parallelism is not required. 
*/ + +unsigned +default_goacc_adjust_parallelism (unsigned this_mask, + unsigned ARG_UNUSED (outer_mask)) +{ + return this_mask; +} + namespace { const pass_data pass_data_oacc_device_lower = diff --git a/gcc/target.def b/gcc/target.def index c570f3825a5..401d681fc42 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1678,6 +1678,14 @@ or zero if unbounded.", int, (int axis), default_goacc_dim_limit) +DEFHOOK +(adjust_parallelism, +"This hook allows the accelerator compiler to remove any unused\n\ +parallelism exposed in the current loop @var{THIS_MASK}, and the\n\ +enclosing loop @var{OUTER_MASK}. It returns an adjusted mask.", +unsigned, (unsigned this_mask, unsigned outer_mask), +default_goacc_adjust_parallelism) + DEFHOOK (fork_join, "This hook can be used to convert IFN_GOACC_FORK and IFN_GOACC_JOIN\n\ diff --git a/gcc/targhooks.h b/gcc/targhooks.h index f92ca5ca997..38e024b13de 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -125,6 +125,7 @@ extern bool default_goacc_validate_dims (tree, int [], int); extern int default_goacc_dim_limit (int); extern bool default_goacc_fork_join (gcall *, const int [], bool); extern void default_goacc_reduction (gcall *); +extern unsigned default_goacc_adjust_parallelism (unsigned, unsigned); /* These are here, and not in hooks.[ch], because not all users of hooks.h include tm.h, and thus we don't
[openacc] Teach gfortran to lower OpenACC routine dims
At present, gfortran does not encode the gang, worker or vector parallelism clauses when it creates acc routines dim attribute for subroutines and functions. While support for acc routine is lacking in other areas in gfortran (including modules), this patch is important because it encodes the parallelism attributes using the same function as the C and C++ FEs. This will become important with the forthcoming nvptx vector length extensions, because large vectors are not supported in acc routines yet. Is this OK for trunk? I regtested and bootstrapped for x86_64 with nvptx offloading. Thanks, Cesar [openacc] Teach gfortran to lower OpenACC routine dims gcc/fortran/ * gfortran.h (oacc_function): New enum. (gfc_oacc_routine_name): Add locus loc field. * openmp.c (gfc_oacc_routine_dims): Return oacc_function. (gfc_match_oacc_routine): Update routine clause syntax checking. Populate oacc_function attribute with dims. * trans-decl.c (add_attributes_to_decl): Use oacc_build_routine_dims to construct routine dims. gcc/testsuite/ * gfortran.dg/goacc/classify-routine.f95: Adjust test. * gfortran.dg/goacc/pr71704.f90: Likewise. * gfortran.dg/goacc/routine-6.f90: Likewise. * gfortran.dg/goacc/routine-8.f90: Likewise. * gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise. libgomp/ * testsuite/libgomp.oacc-fortran/routine-1.f90: Adjust test. * testsuite/libgomp.oacc-fortran/routine-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-5.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-7.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-9.f90: Likewise. * libgomp.oacc-fortran/host_data-2.f90: Likewise. * libgomp.oacc-fortran/host_data-3.f: Likewise. * libgomp.oacc-fortran/host_data-4.f90: Likewise. 
diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h index 04b0024a992..3675f2e8d52 100644 --- a/gcc/fortran/gfortran.h +++ b/gcc/fortran/gfortran.h @@ -316,6 +316,16 @@ enum save_state { SAVE_NONE = 0, SAVE_EXPLICIT, SAVE_IMPLICIT }; +/* Flags to keep track of ACC routine states. */ +enum oacc_function +{ OACC_FUNCTION_NONE = 0, + OACC_FUNCTION_GANG, + OACC_FUNCTION_WORKER, + OACC_FUNCTION_VECTOR, + OACC_FUNCTION_SEQ, + OACC_FUNCTION_AUTO +}; + /* Strings for all symbol attributes. We use these for dumping the parse tree, in error messages, and also when reading and writing modules. In symbol.c. */ @@ -1726,6 +1736,7 @@ typedef struct gfc_oacc_routine_name struct gfc_symbol *sym; struct gfc_omp_clauses *clauses; struct gfc_oacc_routine_name *next; + locus loc; } gfc_oacc_routine_name; diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c index 94a7f7eaa50..d48c9351e25 100644 --- a/gcc/fortran/openmp.c +++ b/gcc/fortran/openmp.c @@ -2234,34 +2234,45 @@ gfc_match_oacc_cache (void) return MATCH_YES; } -/* Determine the loop level for a routine. */ +/* Determine the loop level for a routine. Returns OACC_FUNCTION_NONE + if any error is detected. 
*/ -static int +static oacc_function gfc_oacc_routine_dims (gfc_omp_clauses *clauses) { int level = -1; + oacc_function ret = OACC_FUNCTION_AUTO; if (clauses) { unsigned mask = 0; if (clauses->gang) - level = GOMP_DIM_GANG, mask |= GOMP_DIM_MASK (level); + { + level = GOMP_DIM_GANG, mask |= GOMP_DIM_MASK (level); + ret = OACC_FUNCTION_GANG; + } if (clauses->worker) - level = GOMP_DIM_WORKER, mask |= GOMP_DIM_MASK (level); + { + level = GOMP_DIM_WORKER, mask |= GOMP_DIM_MASK (level); + ret = OACC_FUNCTION_WORKER; + } if (clauses->vector) - level = GOMP_DIM_VECTOR, mask |= GOMP_DIM_MASK (level); + { + level = GOMP_DIM_VECTOR, mask |= GOMP_DIM_MASK (level); + ret = OACC_FUNCTION_VECTOR; + } if (clauses->seq) - level = GOMP_DIM_MAX, mask |= GOMP_DIM_MASK (level); + { + level = GOMP_DIM_MAX, mask |= GOMP_DIM_MASK (level); + ret = OACC_FUNCTION_SEQ; + } if (mask != (mask & -mask)) - gfc_error ("Multiple loop axes specified for routine"); + ret = OACC_FUNCTION_NONE; } - if (level < 0) -level = GOMP_DIM_MAX; - - return level; + return ret; } match @@ -2272,6 +2283,8 @@ gfc_match_oacc_routine (void) match m; gfc_omp_clauses *c = NULL; gfc_oacc_routine_name *n = NULL; + oacc_function dims = OACC_FUNCTION_NONE; + bool seen_error = false; old_loc = gfc_current_locus; @@ -2318,17 +2331,15 @@ gfc_match_oacc_routine (void) } else { - gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C"); - gfc_current_locus = old_loc; - return MATCH_ERROR; + gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %L", _loc); + goto cleanup; } if (gfc_match_char (')') != MATCH_YES) { - gfc_error ("Syntax error in !$ACC ROUTINE ( NAME ) at %C, expecting" - " ')' after NAME"); - gfc_current_locus = old_loc; - return
Re: [patch,nvptx] Basic -misa support for nvptx
On 09/02/2018 07:57 AM, Cesar Philippidis wrote: > On 09/01/2018 12:04 PM, Tom de Vries wrote: >> On 08/31/2018 04:14 PM, Cesar Philippidis wrote: > >>> Is this patch OK for trunk? >>> >> >> Well, how did you test this ( >> https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and >> testing. State the host and target combinations you used to do proper >> testing as described above, and the results of your testing.") ? > > I tested the standalone nvptx compiler. I'll retest with libgomp with > -misa=sm_35. Bootstrapping won't help much here, unfortunately. >>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c >>> @@ -0,0 +1,24 @@ >>> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35 >>> + targets. */ >>> + >>> +/* { dg-do compile } */ >>> +/* { dg-options "-O2 -misa=sm_35" } */ >>> + >>> +int >>> +main() >>> +{ >>> + unsigned long long a = ~0; >>> + unsigned b = 0xa; >>> + >>> + __atomic_fetch_add (, b, 0); >>> + __atomic_fetch_and (, b, 0); >>> + __atomic_fetch_or (, b, 0); >>> + __atomic_fetch_xor (, b, 0); >>> + >>> + return a; >>> +} >>> + >>> +/* { dg-final { scan-assembler "atom.add.u64" } } */ >>> +/* { dg-final { scan-assembler "atom.b64.and" } } */ >>> +/* { dg-final { scan-assembler "atom.b64.or" } } */ >>> +/* { dg-final { scan-assembler "atom.b64.xor" } } */ >>> -- 2.17.1 >>> >> >> Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not >> testcase does not use this difference, so that needs to be fixed, or for >> bonus points, changed into a scan-assembler testcase). >> >> The documentation uses "op.type", we should fix the compiler to emit >> that consistently. Separate patch that fixes that pre-approved. > > ACK. I think there are a lot of other cases like that in the BE. > >> This is ok (with, as I mentioned above, the SI part split off into a >> separate patch), on the condition that you test libgomp with >> -foffload=-misa=sm_35. 
Adding -foffload=misa=sm_35 didn't work because the host gcc doesn't support the -misa flag. When I forced the nvptx BE to set TARGET_SM35 to always be true, I ran into problems with SM_30 code linking against SM_35 code. Therefore, I don't think this patch is ready for trunk yet. By the way, is -misa really necessary for atomic_fetch_? Looking at the PTX documentation I see <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#changes-in-ptx-isa-version-3-1>: PTX ISA version 3.1 introduces the following new features: * Support for sm_35 target architecture. * Extends atomic and reduction instructions to perform 64-bit {and, or, xor} operations, and 64-bit integer {min, max} operations. Is there a table that lists which GPUs are compatible with which instructions? Thanks, Cesar
Re: [patch,nvptx] Basic -misa support for nvptx
On 09/01/2018 12:04 PM, Tom de Vries wrote: > On 08/31/2018 04:14 PM, Cesar Philippidis wrote: >> Is this patch OK for trunk? >> > > Well, how did you test this ( > https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and > testing. State the host and target combinations you used to do proper > testing as described above, and the results of your testing.") ? I tested the standalone nvptx compiler. I'll retest with libgomp with -misa=sm_35. Bootstrapping won't help much here, unfortunately. >> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c >> @@ -0,0 +1,24 @@ >> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35 >> + targets. */ >> + >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -misa=sm_35" } */ >> + >> +int >> +main() >> +{ >> + unsigned long long a = ~0; >> + unsigned b = 0xa; >> + >> + __atomic_fetch_add (, b, 0); >> + __atomic_fetch_and (, b, 0); >> + __atomic_fetch_or (, b, 0); >> + __atomic_fetch_xor (, b, 0); >> + >> + return a; >> +} >> + >> +/* { dg-final { scan-assembler "atom.add.u64" } } */ >> +/* { dg-final { scan-assembler "atom.b64.and" } } */ >> +/* { dg-final { scan-assembler "atom.b64.or" } } */ >> +/* { dg-final { scan-assembler "atom.b64.xor" } } */ >> -- 2.17.1 >> > > Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not > testcase does not use this difference, so that needs to be fixed, or for > bonus points, changed into a scan-assembler testcase). > > The documentation uses "op.type", we should fix the compiler to emit > that consistently. Separate patch that fixes that pre-approved. ACK. I think there are a lot of other cases like that in the BE. > This is ok (with, as I mentioned above, the SI part split off into a > separate patch), on the condition that you test libgomp with > -foffload=-misa=sm_35. Thanks, Cesar
[patch,nvptx] Basic -misa support for nvptx
Attached is an nvptx patch that adds support for a new, albeit rarely used, compiler option -misa. At present, there are only two valid ISA arguments, SM_30 and SM_35. Without that flag, GCC will default to SM_30. The major advantage of using the SM_35 ISA is that it enables the use of PTX atom instructions for __atomic_fetch_{add,and,or,xor} for DI integers. Without -misa, GCC would use an atomic CAS loop for them. As an aside, this patch also enables PTX atom instructions for those aforementioned functions for SI integers. Is this patch OK for trunk? Thanks, Cesar Basic -misa support for nvptx 2018-XX-YY Cesar Philippidis Bernd Schmidt gcc/ * config/nvptx/nvptx-opts.h: New file. * config/nvptx/nvptx.c (nvptx_file_start): Print the correct .target. * config/nvptx/nvptx.h: Include "nvptx-opts.h". (ASM_SPEC): Define. (TARGET_SM35): New macro. * config/nvptx/nvptx.md (atomic_fetch_): Enable with the correct predicate. * config/nvptx/nvptx.opt (ptx_isa, sm_30, sm_35): New enum and its values. (misa=): New option. * doc/invoke.texi (Nvidia PTX Options): Document -misa. gcc/testsuite/ * gcc.target/nvptx/atomic_fetch-1.c: New test. * gcc.target/nvptx/atomic_fetch-1.c: New test. diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h new file mode 100644 index 000..55d9599917e --- /dev/null +++ b/gcc/config/nvptx/nvptx-opts.h @@ -0,0 +1,30 @@ +/* Definitions for the NVPTX port needed for option handling. + Copyright (C) 2015-2018 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details.
+ + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef NVPTX_OPTS_H +#define NVPTX_OPTS_H + +enum ptx_isa +{ + PTX_ISA_SM30, + PTX_ISA_SM35 +}; + +#endif + diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index c0b0a2ec3ab..9903a273863 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -4931,7 +4931,10 @@ nvptx_file_start (void) { fputs ("// BEGIN PREAMBLE\n", asm_out_file); fputs ("\t.version\t3.1\n", asm_out_file); - fputs ("\t.target\tsm_30\n", asm_out_file); + if (TARGET_SM35) +fputs ("\t.target\tsm_35\n", asm_out_file); + else +fputs ("\t.target\tsm_30\n", asm_out_file); fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); fputs ("// END PREAMBLE\n", asm_out_file); } diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index dfa1e9aa859..a2fe8b68b22 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -21,10 +21,16 @@ #ifndef GCC_NVPTX_H #define GCC_NVPTX_H +#ifndef NVPTX_OPTS_H +#include "config/nvptx/nvptx-opts.h" +#endif + /* Run-time Target. */ #define STARTFILE_SPEC "%{mmainkernel:crt0.o}" +#define ASM_SPEC "%{misa=*:-m %*}" + #define TARGET_CPU_CPP_BUILTINS() \ do \ { \ @@ -87,6 +93,8 @@ #define Pmode (TARGET_ABI64 ? DImode : SImode) #define STACK_SIZE_MODE Pmode +#define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35) + /* Registers. Since ptx is a virtual target, we just define a few hard registers for special purposes and leave pseudos unallocated. 
We have to have some available hard registers, to keep gcc setup diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 2988f5dfa91..ca00b1d8073 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1440,7 +1440,6 @@ (define_code_iterator any_logic [and ior xor]) (define_code_attr logic [(and "and") (ior "or") (xor "xor")]) -;; Currently disabled until we add better subtarget support - requires sm_32. (define_insn "atomic_fetch_" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM @@ -1450,7 +1449,7 @@ UNSPECV_LOCK)) (set (match_operand:SDIM 0 "nvptx_register_operand" "=R") (match_dup 1))] - "0" + "mode == SImode || TARGET_SM35" "%.\\tatom%A1.b%T0.\\t%0, %1, %2;" [(set_attr "atomic" "true")]) diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 04277d1d98e..8194c0324d6 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -48,3 +48,17 @@ Generate code that can keep local state uniform across all l
Re: [PATCH, OpenACC] (2/2) Fix implicit mapping for array slices on lexically-enclosing data constructs (PR70828)
On 08/28/2018 02:32 PM, Julian Brown wrote: > On Tue, 28 Aug 2018 12:23:22 -0700 > Cesar Philippidis wrote: >> This is specific to OpenACC, and needs to be guarded as such. > > Are you sure that condition can be true for OpenMP? I'd assumed not... My bad, you're correct. OMP doesn't use those GOMP_MAP_FORCE map types anymore. Cesar
Re: [PATCH, OpenACC] (2/2) Fix implicit mapping for array slices on lexically-enclosing data constructs (PR70828)
On 08/28/2018 12:19 PM, Julian Brown wrote: > diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c > index f038f4c..86be407 100644 > --- a/gcc/fortran/trans-openmp.c > +++ b/gcc/fortran/trans-openmp.c > @@ -1045,9 +1045,13 @@ gfc_omp_finish_clause (tree c, gimple_seq *pre_p) > >tree decl = OMP_CLAUSE_DECL (c); > > - /* Assumed-size arrays can't be mapped implicitly, they have to be > - mapped explicitly using array sections. */ > - if (TREE_CODE (decl) == PARM_DECL > + /* Assumed-size arrays can't be mapped implicitly, they have to be mapped > + explicitly using array sections. An exception is if the array is > + mapped explicitly in an enclosing data construct for OpenACC, in which > + case we see GOMP_MAP_FORCE_PRESENT here and do not need to raise an > + error. */ > + if (OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_FORCE_PRESENT > + && TREE_CODE (decl) == PARM_DECL >&& GFC_ARRAY_TYPE_P (TREE_TYPE (decl)) >&& GFC_TYPE_ARRAY_AKIND (TREE_TYPE (decl)) == GFC_ARRAY_UNKNOWN >&& GFC_TYPE_ARRAY_UBOUND (TREE_TYPE (decl), This is specific to OpenACC, and needs to be guarded as such. Cesar
Re: [PATCH, OpenACC] Add support for gang local storage allocation in shared memory
On 08/13/2018 11:42 AM, Cesar Philippidis wrote: > On 08/13/2018 09:21 AM, Julian Brown wrote: > >> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c >> b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c >> new file mode 100644 >> index 000..2fa708a >> --- /dev/null >> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c >> @@ -0,0 +1,106 @@ >> +/* { dg-xfail-run-if "gangprivate failure" { openacc_nvidia_accel_selected >> } { "-O0" } { "" } } */ > > As a quick comment, I like the approach that you've taken with this > patch, but the og8 patch only applies the gangprivate attribute in the > c/c++ FE. I'd have to review the notes, but I seem to recall that > excluding that clause in fortran was deliberate. Chung-Lin, do you > recall the rationale behind that? I found this in an old email: The older version of fortran that OpenACC supports doesn't have a concept of lexically scoped blocks like c/c++, so this isn't relevant except for explicit gang private variables. So in other words, this is safe for fortran. It probably could use a fortran test, because that functionality wasn't explicitly exercised in og7/og8. Cesar
Re: [PATCH, OpenACC] Add support for gang local storage allocation in shared memory
On 08/13/2018 09:21 AM, Julian Brown wrote: > diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c > b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c > new file mode 100644 > index 000..2fa708a > --- /dev/null > +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c > @@ -0,0 +1,106 @@ > +/* { dg-xfail-run-if "gangprivate failure" { openacc_nvidia_accel_selected } > { "-O0" } { "" } } */ As a quick comment, I like the approach that you've taken with this patch, but the og8 patch only applies the gangprivate attribute in the c/c++ FE. I'd have to review the notes, but I seem to recall that excluding that clause in fortran was deliberate. Chung-Lin, do you recall the rationale behind that? With that aside, is the above xfail still necessary? It seems to xpass for me on nvptx. However, I see this regression on the host: FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/loop-gwv-2.c -DACC_DEVICE_TYPE_host=1 -DACC_MEM_SHARED=1 -O2 execution test There could be other regressions, but I only tested the new tests introduced by the patch so far. Cesar
Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry
On 08/13/2018 08:08 AM, Tom de Vries wrote: > On 08/13/2018 04:54 PM, Cesar Philippidis wrote: >> Going >> forward, how would you like to proceed with the nvptx BE vector length >> changes. > > Do you have a branch available on github containing the patch series > you've submitted? Yes, https://github.com/cesarjp/gcc/tree/trunk-og8-vl-private Beware that I'm constantly rebasing that branch to keep my patches up to date. All of the commit subject lines prefixed with [nvptx] touch the nvptx BE. The [OpenACC] patches involve either platform-independent code or libgomp. Cesar
Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry
On 08/13/2018 05:04 AM, Tom de Vries wrote: > On 08/10/2018 08:39 PM, Cesar Philippidis wrote: >> is that I modified the default value for vectors as follows >> >> +int vectors = default_dim_p[GOMP_DIM_VECTOR] >> + ? 0 : dims[GOMP_DIM_VECTOR]; >> >> Technically, trunk only supports warp-sized vectors, but the fallback >> code is already checking for the presence of vectors as so >> >> +if (default_dim_p[GOMP_DIM_VECTOR]) >> + dims[GOMP_DIM_VECTOR] >> += MIN (dims[GOMP_DIM_VECTOR], >> + (targ_fn->max_threads_per_block / warp_size >> +* warp_size)); >> > > That code handles the case that the default vector size is bigger than > the function being launched allows, independent from whether that > default is calculated by the runtime, or set by GOMP_OPENACC_DIM. > > The GOMP_OPENACC_DIM part is forward compatible, given that currently > the compiler doesn't allow the runtime to choose the vector length, and > AFAICT that will remain the same after application of the submitted set > of vector_length patches. > >> therefore, I had the cuOccupancyMaxPotentialBlockSize code path behave >> the same. > > They don't behave the same. What you add here is ignoring > GOMP_OPENACC_DIM[GOMP_DIM_VECTOR], not handling it. That requires a comment. I meant, same in the sense that it inspects for a pre-defined value of vector length; not the application of vector length. I should have been more clear. > Furthermore, by assigning dims[GOMP_DIM_VECTOR] at the start you break > the pattern of the code, which: > - first applies GOMP_OPENACC_DIM > - then further fills in defaults as required > - then applies defaults > I've rewritten this bit to fit the pattern. This result is not pretty, > but it'll do for now. Changing the pattern may make things better > structured, but this is something we can do in a follow up patch, and > want to do for all dimensions at once, not just for vector, otherwise > the code will become too convoluted. 
> > Btw, I've also noticed that we don't handle a too high > GOMP_OPENACC_DIM[GOMP_DIM_WORKER], I've added a TODO comment for this. That's why I set vectors to dims[GOMP_DIM_VECTOR] when set. However, I do agree that this is a task for a follow up patch. > Committed as attached. Thank you Tom! Looking at my patch queue, there's only one more non-vector length related patch in there - Remove use of CUDA unified memory in libgomp <https://gcc.gnu.org/ml/gcc-patches/2018-07/msg01970.html>. Going forward, how would you like to proceed with the nvptx BE vector length changes? Cesar
Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry
On 08/08/2018 08:19 AM, Tom de Vries wrote: > On Wed, Aug 08, 2018 at 07:09:16AM -0700, Cesar Philippidis wrote: >> On 08/07/2018 06:52 AM, Cesar Philippidis wrote: Thanks for review. This version should address all of the following remarks. However, one thing to note ... >> [nvptx] Use CUDA driver API to select default runtime launch geometry >> >> 2018-08-YY Cesar Philippidis >> >> libgomp/ >> plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef. >> (cuDriverGetVersion): Declare. >> (cuOccupancyMaxPotentialBlockSizeWithFlags): Declare. >> plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for >> cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize. >> (ptx_device): Add driver_version member. >> (nvptx_open_device): Initialize it. >> (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the >> default num_gangs and num_workers when the driver supports it. >> --- >> libgomp/plugin/cuda-lib.def | 2 ++ >> libgomp/plugin/cuda/cuda.h| 4 >> libgomp/plugin/plugin-nvptx.c | 40 +++- >> 3 files changed, 45 insertions(+), 1 deletion(-) >> >> diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def >> index be8e3b3..f2433e1 100644 >> --- a/libgomp/plugin/cuda-lib.def >> +++ b/libgomp/plugin/cuda-lib.def >> @@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate) >> CUDA_ONE_CALL (cuCtxDestroy) >> CUDA_ONE_CALL (cuCtxGetCurrent) >> CUDA_ONE_CALL (cuCtxGetDevice) >> +CUDA_ONE_CALL (cuDriverGetVersion) > > Don't use cuDriverGetVersion. > >> CUDA_ONE_CALL (cuCtxPopCurrent) >> CUDA_ONE_CALL (cuCtxPushCurrent) >> CUDA_ONE_CALL (cuCtxSynchronize) >> @@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal) >> CUDA_ONE_CALL (cuModuleLoad) >> CUDA_ONE_CALL (cuModuleLoadData) >> CUDA_ONE_CALL (cuModuleUnload) >> +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) > > Use CUDA_ONE_CALL_MAYBE_NULL. 
> >> CUDA_ONE_CALL (cuStreamCreate) >> CUDA_ONE_CALL (cuStreamDestroy) >> CUDA_ONE_CALL (cuStreamQuery) >> diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h >> index 4799825..3a790e6 100644 >> --- a/libgomp/plugin/cuda/cuda.h >> +++ b/libgomp/plugin/cuda/cuda.h >> @@ -44,6 +44,7 @@ typedef void *CUevent; >> typedef void *CUfunction; >> typedef void *CUlinkState; >> typedef void *CUmodule; >> +typedef size_t (*CUoccupancyB2DSize)(int); >> typedef void *CUstream; >> >> typedef enum { >> @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void); >> CUresult cuDeviceGet (CUdevice *, int); >> CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice); >> CUresult cuDeviceGetCount (int *); >> +CUresult cuDriverGetVersion(int *); >> CUresult cuEventCreate (CUevent *, unsigned); >> #define cuEventDestroy cuEventDestroy_v2 >> CUresult cuEventDestroy (CUevent); >> @@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, >> CUmodule, const char *); >> CUresult cuModuleLoad (CUmodule *, const char *); >> CUresult cuModuleLoadData (CUmodule *, const void *); >> CUresult cuModuleUnload (CUmodule); >> +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, >> + CUoccupancyB2DSize, size_t, int); >> CUresult cuStreamCreate (CUstream *, unsigned); >> #define cuStreamDestroy cuStreamDestroy_v2 >> CUresult cuStreamDestroy (CUstream); >> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c >> index 825470a..b0ccf0b 100644 >> --- a/libgomp/plugin/plugin-nvptx.c >> +++ b/libgomp/plugin/plugin-nvptx.c >> @@ -376,6 +376,7 @@ struct ptx_device >>int max_threads_per_block; >>int max_threads_per_multiprocessor; >>int default_dims[GOMP_DIM_MAX]; >> + int driver_version; >> >>struct ptx_image_data *images; /* Images loaded on device. */ >>pthread_mutex_t image_lock; /* Lock for above list. 
*/ >> @@ -687,6 +688,7 @@ nvptx_open_device (int n) >>ptx_dev->ord = n; >>ptx_dev->dev = dev; >>ptx_dev->ctx_shared = false; >> + ptx_dev->driver_version = 0; >> >>r = CUDA_CALL_NOCHECK (cuCtxGetDevice, _dev); >>if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) >> @@ -780,6 +782,9 @@ nvptx_open_device (int n) >>for (int i = 0; i != GOMP_DIM_MAX; i++) >> ptx_dev->default_dims[i] = 0; >> >> + CUDA_CALL_ERET (NULL, cuDriverGetVersion,
Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry
On 08/07/2018 06:52 AM, Cesar Philippidis wrote: > I attached an updated version of the CUDA driver patch, although I > haven't rebased it against your changes yet. It still needs to be tested > against CUDA 5.5 using the systems/Nvidia's cuda.h. But I wanted to give > you an update. > > Does this patch look OK, at least after testing completes? I removed the > tests for CUDA_ONE_CALL_MAYBE_NULL, because the newer CUDA API isn't > supported in the older drivers. I've finally finished testing this patch. Apart from a couple of regressions with CUDA 5.5 in libgomp.oacc-c-c++-common/lib-75.c, lib-76.c and lib-79.c, the results came back clean. This patch has been tested in the following ways using a K40 GPU: * Using GCC's cuda.h with CUDA 9.2 drivers. * Using cuda.h from CUDA 5.5 and Nvidia drivers 331.133 (supports CUDA 6.0) and the driver from CUDA 8.0. * Using cuda.h from CUDA 8.0. As mentioned before, because GCC's cuda.h defines CUDA_VERSION as 8000, there was a conflict with using it against CUDA 5.5, because of the missing cuLinkAddData_v2 symbol. Note how the usage of cuOccupancyMaxPotentialBlockSize is guarded by a check on CUDA_VERSION. I don't really like this, but it's a necessary evil of maintaining backwards compatibility. Is this patch OK for trunk? Thanks, Cesar [nvptx] Use CUDA driver API to select default runtime launch geometry 2018-08-YY Cesar Philippidis libgomp/ plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef. (cuDriverGetVersion): Declare. (cuOccupancyMaxPotentialBlockSize): Declare. plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize. (ptx_device): Add driver_version member. (nvptx_open_device): Initialize it. (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the default num_gangs and num_workers when the driver supports it. 
--- libgomp/plugin/cuda-lib.def | 2 ++ libgomp/plugin/cuda/cuda.h| 4 libgomp/plugin/plugin-nvptx.c | 40 +++- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index be8e3b3..f2433e1 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate) CUDA_ONE_CALL (cuCtxDestroy) CUDA_ONE_CALL (cuCtxGetCurrent) CUDA_ONE_CALL (cuCtxGetDevice) +CUDA_ONE_CALL (cuDriverGetVersion) CUDA_ONE_CALL (cuCtxPopCurrent) CUDA_ONE_CALL (cuCtxPushCurrent) CUDA_ONE_CALL (cuCtxSynchronize) @@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal) CUDA_ONE_CALL (cuModuleLoad) CUDA_ONE_CALL (cuModuleLoadData) CUDA_ONE_CALL (cuModuleUnload) +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) CUDA_ONE_CALL (cuStreamCreate) CUDA_ONE_CALL (cuStreamDestroy) CUDA_ONE_CALL (cuStreamQuery) diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h index 4799825..3a790e6 100644 --- a/libgomp/plugin/cuda/cuda.h +++ b/libgomp/plugin/cuda/cuda.h @@ -44,6 +44,7 @@ typedef void *CUevent; typedef void *CUfunction; typedef void *CUlinkState; typedef void *CUmodule; +typedef size_t (*CUoccupancyB2DSize)(int); typedef void *CUstream; typedef enum { @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void); CUresult cuDeviceGet (CUdevice *, int); CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice); CUresult cuDeviceGetCount (int *); +CUresult cuDriverGetVersion(int *); CUresult cuEventCreate (CUevent *, unsigned); #define cuEventDestroy cuEventDestroy_v2 CUresult cuEventDestroy (CUevent); @@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *); CUresult cuModuleLoad (CUmodule *, const char *); CUresult cuModuleLoadData (CUmodule *, const void *); CUresult cuModuleUnload (CUmodule); +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, int); CUresult cuStreamCreate (CUstream *, unsigned); 
#define cuStreamDestroy cuStreamDestroy_v2 CUresult cuStreamDestroy (CUstream); diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 825470a..b0ccf0b 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -376,6 +376,7 @@ struct ptx_device int max_threads_per_block; int max_threads_per_multiprocessor; int default_dims[GOMP_DIM_MAX]; + int driver_version; struct ptx_image_data *images; /* Images loaded on device. */ pthread_mutex_t image_lock; /* Lock for above list. */ @@ -687,6 +688,7 @@ nvptx_open_device (int n) ptx_dev->ord = n; ptx_dev->dev = dev; ptx_dev->ctx_shared = false; + ptx_dev->driver_version = 0; r = CUDA_CALL_NOCHECK (cuCtxGetDevice, _dev); if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) @@ -780,6 +782,9 @@ nvptx_open_device (int n) for (int i = 0; i != GOMP_DIM_MAX; i++) ptx_dev->default_dims[i] = 0
[PATCH][OpenACC] Update deviceptr handling during gimplification
I had previously posted this patch as part of a monster deviceptr patch here <https://gcc.gnu.org/ml/gcc-patches/2018-06/msg01911.html>. This patch breaks out the generic gimplifier changes. Essentially, with this patch, the gimplifier will now transfer deviceptr data clauses using GOMP_MAP_FORCE_DEVICEPTR. Is this patch OK for trunk? It bootstrapped / regression tested cleanly for x86_64 with nvptx offloading. Thanks, Cesar >From b5cf37b795ce78c78f3f434ac6999f7094bd86aa Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Mon, 7 May 2018 08:23:48 -0700 Subject: [PATCH] [OpenACC] Update deviceptr handling 2018-XX-YY Cesar Philippidis gcc/fortran/ * trans-openmp.c (gfc_omp_finish_clause): Don't create pointer data mappings for deviceptr clauses. (gfc_trans_omp_clauses): Likewise. gcc/ * gimplify.c (enum gimplify_omp_var_data): Add GOVD_DEVICEPTR. (omp_notice_variable): Add GOVD_DEVICEPTR attribute when appropriate. (gimplify_scan_omp_clauses): Likewise. (gimplify_adjust_omp_clauses_1): Set GOMP_MAP_FORCE_DEVICEPTR for implicit deviceptr mappings. gcc/testsuite/ * c-c++-common/goacc/deviceptr-4.c: Update expected data mapping. 
(cherry picked from openacc-gcc-7-branch commit d3de16b461545aac1925f0d7c2851c8c49a07d06 and commit f0514fe1899666bb5b8ee52601f5d4263d4c4646) --- gcc/fortran/trans-openmp.c | 9 + gcc/gimplify.c | 12 +++- gcc/testsuite/c-c++-common/goacc/deviceptr-4.c | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c index f038f4c..ca31c88 100644 --- a/gcc/fortran/trans-openmp.c +++ b/gcc/fortran/trans-openmp.c @@ -1060,6 +1060,8 @@ gfc_omp_finish_clause (tree c, gimple_seq *pre_p) } tree c2 = NULL_TREE, c3 = NULL_TREE, c4 = NULL_TREE; + if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FORCE_DEVICEPTR) +return; if (POINTER_TYPE_P (TREE_TYPE (decl))) { if (!gfc_omp_privatize_by_reference (decl) @@ -2111,6 +2113,12 @@ gfc_trans_omp_clauses (stmtblock_t *block, gfc_omp_clauses *clauses, if (n->expr == NULL || n->expr->ref->u.ar.type == AR_FULL) { if (POINTER_TYPE_P (TREE_TYPE (decl)) + && n->u.map_op == OMP_MAP_FORCE_DEVICEPTR) + { + OMP_CLAUSE_DECL (node) = decl; + goto finalize_map_clause; + } + else if (POINTER_TYPE_P (TREE_TYPE (decl)) && (gfc_omp_privatize_by_reference (decl) || GFC_DECL_GET_SCALAR_POINTER (decl) || GFC_DECL_GET_SCALAR_ALLOCATABLE (decl) @@ -2282,6 +2290,7 @@ gfc_trans_omp_clauses (stmtblock_t *block, gfc_omp_clauses *clauses, ptr2 = fold_convert (sizetype, ptr2); OMP_CLAUSE_SIZE (node3) = fold_build2 (MINUS_EXPR, sizetype, ptr, ptr2); + finalize_map_clause:; } switch (n->u.map_op) { diff --git a/gcc/gimplify.c b/gcc/gimplify.c index 4a109ae..bcf862f 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -105,6 +105,9 @@ enum gimplify_omp_var_data /* Flag for GOVD_MAP: must be present already. */ GOVD_MAP_FORCE_PRESENT = 524288, + /* Flag for OpenACC deviceptrs. 
*/ + GOVD_DEVICEPTR = (1<<21), + GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR | GOVD_LOCAL) @@ -7232,6 +7235,7 @@ omp_notice_variable (struct gimplify_omp_ctx *ctx, tree decl, bool in_code) error ("variable %qE declared in enclosing " "% region", DECL_NAME (decl)); nflags |= GOVD_MAP; + nflags |= (n2->value & GOVD_DEVICEPTR); if (octx->region_type == ORT_ACC_DATA && (n2->value & GOVD_MAP_0LEN_ARRAY)) nflags |= GOVD_MAP_0LEN_ARRAY; @@ -8213,6 +8217,8 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p, if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_TO || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_TOFROM) flags |= GOVD_MAP_ALWAYS_TO; + else if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FORCE_DEVICEPTR) + flags |= GOVD_DEVICEPTR; goto do_add; case OMP_CLAUSE_DEPEND: @@ -8828,7 +8834,8 @@ gimplify_adjust_omp_clauses_1 (splay_tree_node n, void *data) /* Not all combinations of these GOVD_MAP flags are actually valid. */ switch (flags & (GOVD_MAP_TO_ONLY | GOVD_MAP_FORCE - | GOVD_MAP_FORCE_PRESENT)) + | GOVD_MAP_FORCE_PRESENT + | GOVD_DEVICEPTR)) { case 0: kind = GOMP_MAP_TOFROM; @@ -8845,6 +8852,9 @@ gimplify_adjust_omp_clauses_1 (splay_tree_node n, void *data) case GOVD_MAP_FORCE_PRESENT: kind = GOMP_MAP_FORCE_PRESENT; break; + case GOVD_DEVICEPTR: + kind = GOMP_MAP_FORCE_DEVICEPTR; + break; default: gcc_unreachable (); } diff --git a/gcc/testsuite/c-c++-common/goacc/deviceptr-4.c b/gcc/testsuite/c-c++-common/goacc/deviceptr-4.c index db1b916..79a5162 100644 --- a/g
[PATCH][OpenACC] Don't error on implicitly private induction variables in gfortran
At present, the fortran FE reports an error if the user adds an explicit private clause to an induction variable used by an acc loop. This patch teaches the fortran acc block resolver how to cope with "duplicate" private clauses, so that it doesn't error anymore. Is this patch OK for trunk? I bootstrapped and regression tested it for x86_64 with nvptx offloading. Thanks, Cesar >From 576b2a7d5574400f067ec309929b38b324d8c6f6 Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Fri, 27 Jan 2017 14:58:16 + Subject: [PATCH] [OpenACC] Don't error on implicitly private induction variables in gfortran 2018-XX-YY Cesar Philippidis gcc/fortran/ * openmp.c (gfc_resolve_oacc_blocks): Populate list of private variables. gcc/testsuite/ * gfortran.dg/goacc/implicitly-private.f90: New test. --- gcc/fortran/openmp.c | 5 + gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90 | 12 2 files changed, 17 insertions(+) create mode 100644 gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90 diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c index b346b51..798c5fa 100644 --- a/gcc/fortran/openmp.c +++ b/gcc/fortran/openmp.c @@ -5951,6 +5951,7 @@ void gfc_resolve_oacc_blocks (gfc_code *code, gfc_namespace *ns) { fortran_omp_context ctx; + gfc_omp_namelist *n; resolve_oacc_loop_blocks (code); @@ -5961,6 +5962,10 @@ gfc_resolve_oacc_blocks (gfc_code *code, gfc_namespace *ns) ctx.is_openmp = false; omp_current_ctx = &ctx; + if (code->ext.omp_clauses) +for (n = code->ext.omp_clauses->lists[OMP_LIST_PRIVATE]; n; n = n->next) + ctx.private_iterators->add (n->sym); + gfc_resolve_blocks (code->block, ns); omp_current_ctx = ctx.previous; diff --git a/gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90 b/gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90 new file mode 100644 index 000..a687d8a --- /dev/null +++ b/gcc/testsuite/gfortran.dg/goacc/implicitly-private.f90 @@ -0,0 +1,12 @@ +! Ensure that implicitly private variables do not clash with those +! that are explicitly private. 
+ +program main + implicit none + + integer i + + !$acc parallel loop private(i) + do i = 1, 100 + end do +end program main -- 2.7.4
[PATCH][OpenACC] Add support for firstprivate Fortran allocatable scalars
This patch updates the way that lower_omp_target uses firstprivate pointers in OpenACC offloaded regions. On the host side, when preparing firstprivate data mapping for pointer type objects, not to be confused with GOMP_MAP_FIRSTPRIVATE_POINTER, the compiler passes the address of the value being pointed to and not the address of the pointer itself to the runtime. Correspondingly, on the device side, the compiler generates code to dereference the remapped pointer once to copy the data to a local buffer. While this behavior looks like it would break things, it will not affect C or C++ data mappings, because those languages transfer pointers via GOMP_MAP_FIRSTPRIVATE_POINTER. In addition, this will not cause problems with array types, because the default remapping rule for OpenACC is to transfer them in via copy. Besides, it really doesn't make sense to allow arrays to be transferred in via firstprivate because that would use up a lot of memory on the accelerator. Is this OK for trunk? I bootstrapped and regtested it for x86_64 with nvptx offloading. Thanks, Cesar >From b8fb83b36d0f96b12af9a1f5596f31b3c6b72ef0 Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Mon, 6 Aug 2018 09:19:28 -0700 Subject: [PATCH] [OpenACC] Add support for firstprivate Fortran allocatable scalars This patch updates the way that lower_omp_target uses firstprivate pointers in OpenACC offloaded regions. On the host side, when preparing pointer type firstprivate data mapping, not to be confused with GOMP_MAP_FIRSTPRIVATE_POINTER, the compiler passes the address of the value being pointed to, not the address of the pointer itself. Correspondingly, on the device side, the compiler generates code to dereference the remapped pointer once and copy the data to a local buffer. While this behavior looks like it would break things, it will not affect C or C++ data mappings, because those languages transfer pointers via GOMP_MAP_FIRSTPRIVATE_POINTER. 
In addition, this will not cause problems with array types, because the default remapping rules for OpenACC is to transfer them in via copy. Besides it really doesn't make sense to allow arrays to be transferred in via firstprivate because that would use up a lot of memory on the accelerator. 2018-XX-YY Cesar Philippidis gcc/ omp-low.c (lower_omp_target): Update OpenACC handling of pointer variables with GOMP_MAP_FIRSTPRIVATE mappings. libgomp/ testsuite/libgomp.oacc-fortran/allocatable-scalar.f90: New test. --- gcc/omp-low.c | 18 .../libgomp.oacc-fortran/allocatable-scalar.f90| 33 ++ 2 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90 diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 843c66f..47603c4 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -7643,15 +7643,21 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FIRSTPRIVATE) { gcc_assert (is_gimple_omp_oacc (ctx->stmt)); - if (omp_is_reference (new_var) - && TREE_CODE (TREE_TYPE (new_var)) != POINTER_TYPE) + if (omp_is_reference (new_var)) { /* Create a local object to hold the instance value. */ - tree type = TREE_TYPE (TREE_TYPE (new_var)); + tree type = TREE_TYPE (new_var); + /* Pointer types are mapped onto the device via a + single level of indirection. 
*/ + if (TREE_CODE (type) != POINTER_TYPE) + type = TREE_TYPE (type); const char *id = IDENTIFIER_POINTER (DECL_NAME (new_var)); tree inst = create_tmp_var (type, id); - gimplify_assign (inst, fold_indirect_ref (x), ); + if (TREE_CODE (TREE_TYPE (new_var)) == POINTER_TYPE) + gimplify_assign (inst, fold_indirect_ref (x), ); + else + gimplify_assign (inst, fold_indirect_ref (x), ); x = build_fold_addr_expr (inst); } gimplify_assign (new_var, x, ); @@ -7879,7 +7885,9 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FIRSTPRIVATE) { gcc_assert (is_gimple_omp_oacc (ctx->stmt)); - if (!omp_is_reference (var)) + /* Handle Fortran allocatable scalars. */ + if (!omp_is_reference (var) + && TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE) { if (is_gimple_reg (var) && OMP_CLAUSE_FIRSTPRIVATE_IMPLICIT (c)) diff --git a/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90 b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90 new file mode 100644 index 000..be86d14 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90 @@ -0,0 +1,33 @@ +! Test non-declared allocatable scalars in OpenACC data clauses. + +! { dg-do run } + +program main + implicit none + integer, parameter :: n = 100 + integer, allocatable :: a, c + integer :: i, b(n) + + al
[PATCH][OpenACC] update gfortran's tile clause error handling
This patch updates how the OpenACC tile clause is handled in the Fortran FE to match it's behavior in C/C++. Specifically, the tile clause now errors on negative integer arguments, instead of emitting a warning. Is this OK for trunk? Thanks, Cesar >From af39a6d65cfb46397fa62c88521189002fb3d705 Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Mon, 3 Oct 2016 13:58:59 + Subject: [PATCH] [OpenACC] update gfortran's tile clause error handling 2018-XX-YY Cesar Philippidis gcc/fortran/ * openmp.c (resolve_positive_int_expr): Promote the warning to an error. gcc/testsuite/ * gfortran.dg/goacc/loop-2.f95: Change expected tile clause warnings to errors. * gfortran.dg/goacc/loop-5.f95: Likewise. * gfortran.dg/goacc/sie.f95: Likewise. * gfortran.dg/goacc/tile-1.f90: New test. * gfortran.dg/goacc/tile-2.f90: New test. --- gcc/fortran/openmp.c | 4 ++-- gcc/testsuite/gfortran.dg/goacc/loop-2.f95 | 8 +++ gcc/testsuite/gfortran.dg/goacc/loop-5.f95 | 12 -- gcc/testsuite/gfortran.dg/goacc/sie.f95| 36 +++--- gcc/testsuite/gfortran.dg/goacc/tile-1.f90 | 16 ++--- gcc/testsuite/gfortran.dg/gomp/pr77516.f90 | 2 +- 6 files changed, 33 insertions(+), 45 deletions(-) diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c index 5c0ae45..b346b51 100644 --- a/gcc/fortran/openmp.c +++ b/gcc/fortran/openmp.c @@ -3719,8 +3719,8 @@ resolve_positive_int_expr (gfc_expr *expr, const char *clause) if (expr->expr_type == EXPR_CONSTANT && expr->ts.type == BT_INTEGER && mpz_sgn (expr->value.integer) <= 0) -gfc_warning (0, "INTEGER expression of %s clause at %L must be positive", - clause, >where); +gfc_error ("INTEGER expression of %s clause at %L must be positive", + clause, >where); } static void diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-2.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-2.f95 index 0c902b2..d4c6273 100644 --- a/gcc/testsuite/gfortran.dg/goacc/loop-2.f95 +++ b/gcc/testsuite/gfortran.dg/goacc/loop-2.f95 @@ -143,7 +143,7 @@ program test DO j = 1,10 ENDDO ENDDO -!$acc loop 
tile(-1) ! { dg-warning "must be positive" } +!$acc loop tile(-1) ! { dg-error "must be positive" } do i = 1,10 enddo !$acc loop tile(i) ! { dg-error "constant expression" } @@ -307,7 +307,7 @@ program test DO j = 1,10 ENDDO ENDDO -!$acc loop tile(-1) ! { dg-warning "must be positive" } +!$acc loop tile(-1) ! { dg-error "must be positive" } do i = 1,10 enddo !$acc loop tile(i) ! { dg-error "constant expression" } @@ -460,7 +460,7 @@ program test DO j = 1,10 ENDDO ENDDO - !$acc kernels loop tile(-1) ! { dg-warning "must be positive" } + !$acc kernels loop tile(-1) ! { dg-error "must be positive" } do i = 1,10 enddo !$acc kernels loop tile(i) ! { dg-error "constant expression" } @@ -612,7 +612,7 @@ program test DO j = 1,10 ENDDO ENDDO - !$acc parallel loop tile(-1) ! { dg-warning "must be positive" } + !$acc parallel loop tile(-1) ! { dg-error "must be positive" } do i = 1,10 enddo !$acc parallel loop tile(i) ! { dg-error "constant expression" } diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-5.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-5.f95 index d059cf7..fe137d5 100644 --- a/gcc/testsuite/gfortran.dg/goacc/loop-5.f95 +++ b/gcc/testsuite/gfortran.dg/goacc/loop-5.f95 @@ -93,9 +93,6 @@ program test DO j = 1,10 ENDDO ENDDO -!$acc loop tile(-1) ! { dg-warning "must be positive" } -do i = 1,10 -enddo !$acc loop vector tile(*) DO i = 1,10 ENDDO @@ -129,9 +126,6 @@ program test DO j = 1,10 ENDDO ENDDO -!$acc loop tile(-1) ! { dg-warning "must be positive" } -do i = 1,10 -enddo !$acc loop vector tile(*) DO i = 1,10 ENDDO @@ -242,9 +236,6 @@ program test DO j = 1,10 ENDDO ENDDO - !$acc kernels loop tile(-1) ! { dg-warning "must be positive" } - do i = 1,10 - enddo !$acc kernels loop vector tile(*) DO i = 1,10 ENDDO @@ -333,9 +324,6 @@ program test DO j = 1,10 ENDDO ENDDO - !$acc parallel loop tile(-1) ! 
{ dg-warning "must be positive" } - do i = 1,10 - enddo !$acc parallel loop vector tile(*) DO i = 1,10 ENDDO diff --git a/gcc/testsuite/gfortran.dg/goacc/sie.f95 b/gcc/testsuite/gfortran.dg/goacc/sie.f95 index abfe28b..3abf2c8 100644 --- a/gcc/testsuite/gfortran.dg/goacc/sie.f95 +++ b/gcc/testsuite/gfortran.dg/goacc/sie.f95 @@ -78,10 +78,10 @@ program test !$acc parallel num_gangs(i+1) !$acc end parallel - !$acc parallel num_gangs(-1) ! { dg-warning "must be positive" } + !$acc parallel num_gangs(-1) ! { dg-error &
[PATCH][OpenACC] cleanup trans-stmt.h
This patch removes a stale reference to trans-openacc.c in gcc/fortran/trans-stmt.h. I'll apply it to trunk as obvious shortly. Cesar >From a08fe168c3f3ca4d446915ad26027786cda58394 Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Tue, 14 Mar 2017 22:33:00 + Subject: [PATCH] [OpenACC] cleanup trans-stmt.h 2018-08-07 Cesar Philippidis gcc/fortran/ * trans-stmt.h: Remove stale reference to trans-openacc.c. --- gcc/fortran/trans-stmt.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/gcc/fortran/trans-stmt.h b/gcc/fortran/trans-stmt.h index c798c80..848c7d9 100644 --- a/gcc/fortran/trans-stmt.h +++ b/gcc/fortran/trans-stmt.h @@ -70,8 +70,6 @@ tree gfc_trans_deallocate_array (tree); /* trans-openmp.c */ tree gfc_trans_omp_directive (gfc_code *); void gfc_trans_omp_declare_simd (gfc_namespace *); - -/* trans-openacc.c */ tree gfc_trans_oacc_directive (gfc_code *); tree gfc_trans_oacc_declare (gfc_namespace *); -- 2.7.4
Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry
On 08/06/2018 11:08 PM, Tom de Vries wrote: > On 08/01/2018 12:18 PM, Tom de Vries wrote: > >> I think we need to add and handle: >> ... >> CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) >> ... >> > > I realized that the patch I posted introducing CUDA_ONE_CALL_MAYBE_NULL > was incomplete, and needed to use the weak attribute in case of linking > against a concrete libcuda.so. > > So, I've now committed a patch implementing just CUDA_ONE_CALL_MAYBE_NULL: > "[libgomp, nvptx] Handle CUDA_ONE_CALL_MAYBE_NULL" @ > https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00447.html . You can use > "CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize)" to test for > existence of the function in the cuda driver API. Sorry for taking so long getting this patch updated. It's a slow build and test cycle getting older versions of cuda to play nicely. So far, I've managed to get CUDA 5.5 partially working with Nvidia driver 331.113 (which supports CUDA 6.0) in the sense that I spotted an error with the patch; I realized that the cuda.h that ships with libgomp emulates version CUDA 8.0. That lead to problems using cuLinkAddData, because that function gets remapped to cuLinkAddData_v2 in CUDA 6.5 and newer. That leads me to a question, do we really want to support older versions of CUDA without using the system's CUDA header files? >> The patch doesn't build in a setup with >> --enable-offload-targets=nvptx-none and without cuda, that enables usage >> of plugin/cuda/cuda.h: >> ... >> /data/offload-nvptx/src/libgomp/plugin/plugin-nvptx.c:98:16: error: >> ‘cuOccupancyMaxPotentialBlockSize’ undeclared here (not in a function); >> did you mean ‘cuOccupancyMaxPotentialBlockSizeWithFlags’? >> CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \ >> ... >> > > I've committed a patch "[libgomp, nvptx, --without-cuda-driver] Don't > use system cuda driver" @ > https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00348.html . 
> > Using --without-cuda-driver should make it easy to build using the > dlopen interface without having to de-install the system libcuda.so. I attached an updated version of the CUDA driver patch, although I haven't rebased it against your changes yet. It still needs to be tested against CUDA 5.5 using the systems/Nvidia's cuda.h. But I wanted to give you an update. Does this patch look OK, at least after testing competes? I removed the tests for CUDA_ONE_CALL_MAYBE_NULL, because the newer CUDA API isn't supported in the older drivers. Cesar >From 7fc093da173543b43e1d83dd5fb9e00e2b92eb09 Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Thu, 26 Jul 2018 11:47:35 -0700 Subject: [PATCH] [nvptx] Use CUDA driver API to select default runtime launch geometry libgomp/ plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef. (cuDriverGetVersion): Declare. (cuOccupancyMaxPotentialBlockSizeWithFlags): Declare. plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize. (ptx_device): Add driver_version member. (nvptx_open_device): Initialize it. (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the default num_gangs and num_workers when the driver supports it. 
--- libgomp/plugin/cuda-lib.def | 2 ++ libgomp/plugin/cuda/cuda.h| 4 libgomp/plugin/plugin-nvptx.c | 41 +-- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index be8e3b3ec4d..f2433e1f0a9 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate) CUDA_ONE_CALL (cuCtxDestroy) CUDA_ONE_CALL (cuCtxGetCurrent) CUDA_ONE_CALL (cuCtxGetDevice) +CUDA_ONE_CALL (cuDriverGetVersion) CUDA_ONE_CALL (cuCtxPopCurrent) CUDA_ONE_CALL (cuCtxPushCurrent) CUDA_ONE_CALL (cuCtxSynchronize) @@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal) CUDA_ONE_CALL (cuModuleLoad) CUDA_ONE_CALL (cuModuleLoadData) CUDA_ONE_CALL (cuModuleUnload) +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) CUDA_ONE_CALL (cuStreamCreate) CUDA_ONE_CALL (cuStreamDestroy) CUDA_ONE_CALL (cuStreamQuery) diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h index 4799825bda2..3a790e688e0 100644 --- a/libgomp/plugin/cuda/cuda.h +++ b/libgomp/plugin/cuda/cuda.h @@ -44,6 +44,7 @@ typedef void *CUevent; typedef void *CUfunction; typedef void *CUlinkState; typedef void *CUmodule; +typedef size_t (*CUoccupancyB2DSize)(int); typedef void *CUstream; typedef enum { @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void); CUresult cuDeviceGet (CUdevice *, int); CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice); CUresult cuDeviceGetCount (int *); +CUresult cuDriverGetVersion(int *); CUresult cuEventCreate (CUevent *, unsigned); #def
Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry
On 08/03/2018 08:22 AM, Tom de Vries wrote: > On 08/01/2018 09:11 PM, Cesar Philippidis wrote: >> On 08/01/2018 07:12 AM, Tom de Vries wrote: >> >>>>>> + gangs = grids * (blocks / warp_size); >>>>> >>>>> So, we launch with gangs == grids * workers ? Is that intentional? >>>> >>>> Yes. At least that's what I've been using in og8. Setting num_gangs = >>>> grids alone caused significant slow downs. >>>> >>> >>> Well, what you're saying here is: increasing num_gangs increases >>> performance. >>> >>> You don't explain why you multiply with workers specifically. >> >> I set it that way because I think the occupancy calculator is >> determining the occupancy of a single multiprocessor unit, rather than >> the entire GPU. Looking at the og8 code again, I had >> >>num_gangs = 2 * threads_per_sm / warp_size * dev_size >> >> which corresponds to >> >>2 * grids * blocks / warp_size >> > > I've done an experiment using the sample simpleOccupancy. The kernel is > small, so the blocks returned is the maximum: max_threads_per_block (1024). > > The grids returned is 10, which I tentatively interpret as num_dev * > (max_threads_per_multi_processor / blocks). [ Where num_dev == 5, and > max_threads_per_multi_processor == 2048. ] > > Substituting that into the og8 code, and equating > max_threads_per_multi_processor with threads_per_sm, I indeed get > > num_gangs = 2 * grids * blocks / warp_size. > > So with this extra information I see how you got there. > > But I still see no rationale why blocks is used here, and I wonder > whether something like num_gangs = grids * 64 would give similar results. My original intent was to keep the load proportional to the block size. So, in the case were a block size is limited by shared-memory or the register file capacity, the runtime wouldn't excessively over assign gangs to the multiprocessor units if their state is going to be swapped out even more than necessary. With that said, I could be wrong here. 
It would be nice if Nvidia provided us with more insights into their hardware. > Anyway, given that this is what is used on og8, I'm ok with using that, > so let's go with: > ... > gangs = 2 * grids * (blocks / warp_size); > ... > [ so, including the factor two you explicitly left out from the original > patch. Unless you see a pressing reason not to include it. ] > > Can you repost after retesting? [ note: the updated patch I posted > earlier doesn't apply on trunk anymore due to the cuda-lib.def change. ] Thanks for looking into this. I got bogged down tracking a problem with allocatable scalars in fortran. I'll repost this patch after I've tested it with an older version of CUDA (probably CUDA 5.5 using the Nvidia driver 331.113 on a K40). Cesar
Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry
On 08/01/2018 07:12 AM, Tom de Vries wrote: +gangs = grids * (blocks / warp_size); >>> >>> So, we launch with gangs == grids * workers ? Is that intentional? >> >> Yes. At least that's what I've been using in og8. Setting num_gangs = >> grids alone caused significant slow downs. >> > > Well, what you're saying here is: increasing num_gangs increases > performance. > > You don't explain why you multiply with workers specifically. I set it that way because I think the occupancy calculator is determining the occupancy of a single multiprocessor unit, rather than the entire GPU. Looking at the og8 code again, I had num_gangs = 2 * threads_per_sm / warp_size * dev_size which corresponds to 2 * grids * blocks / warp_size Because blocks is generally smaller than threads_per_block, the driver occupancy calculator ends up launching fewer gangs. I don't have a firm position with this default behavior. Perhaps we should just set gang = grids That's probably an improvement over what's there now. Cesar
Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry
On 08/01/2018 03:18 AM, Tom de Vries wrote: > On 07/31/2018 04:58 PM, Cesar Philippidis wrote: >> The attached patch teaches libgomp how to use the CUDA thread occupancy >> calculator built into the CUDA driver. Despite both being based off the >> CUDA thread occupancy spreadsheet distributed with CUDA, the built in >> occupancy calculator differs from the occupancy calculator in og8 in two >> key ways. First, og8 launches twice the number of gangs as the driver >> thread occupancy calculator. This was my attempt at preventing threads >> from idling, and it operating on a similar principle of running 'make >> -jN', where N is twice the number of CPU threads. > > You're saying the two methods are different, and that the difference > between the two methods is a factor two, which is a heuristic you added > yourself on top of one of the methods, which implies that in fact the > two methods are identical. Is my understanding correct here? With the exception being that og8 multiples num_gangs by a factor of two, those two algorithms are identical, at least with respect to gangs. >> Second, whereas og8 >> always attempts to maximize the CUDA block size, the driver may select a >> smaller block, which effectively decreases num_workers. >> > > So, do I understand it correctly that using the function > cuOccupancyMaxPotentialBlockSize gives us "minimum block size that can > achieve the maximum occupancy" or some such and og8 gives us "maximum > block size"? Correct. >> In terms of performance, there really isn't that much of a difference >> between the CUDA driver's occupancy calculator and og8's. However, on >> the tests that are impacted, they are generally within a factor of two >> from one another, with some tests running faster with the driver >> occupancy calculator and others with og8's. >> > > Ack. Well, until we understand that in more detail, going with the > driver's occupancy calculator seems the right thing to do. 
> >> Unfortunately, support for the CUDA driver API isn't universal; it's >> only available in CUDA version 6.5 (or 6050) and newer. In this patch, >> I'm exploiting the fact that init_cuda_lib only checks for errors on the >> last library function initialized. > > That sounds incorrect to me. In init_cuda_lib I see: > ... > # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call) > # define CUDA_ONE_CALL_1(call) \ > cuda_lib.call = dlsym (h, #call); \ > if (cuda_lib.call == NULL)\ > return false; > CUDA_CALLS > ... > so in fact every library function is checked. Have you tested this with > pre 6-5 cuda? I misread that. You're correct. So far, I've only tested this out with CUDA 9. > I think we need to add and handle: > ... > CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) > ... > >> Therefore it guards the usage of >> >> cuOccupancyMaxPotentialBlockSizeWithFlags >> >> by checking driver_version. > > If we allow the cuOccupancyMaxPotentialBlockSize field to be NULL, we > can test for NULL, which seems a simpler solution than testing the version. > >> If the driver occupancy calculator isn't >> available, it falls back to the existing defaults. Maybe the og8 thread >> occupancy would make a better default for older versions of CUDA, but >> that's a patch for another day. >> > > Agreed. > >> Is this patch OK for trunk? > > The patch doesn't build in a setup with > --enable-offload-targets=nvptx-none and without cuda, that enables usage > of plugin/cuda/cuda.h: > ... > /data/offload-nvptx/src/libgomp/plugin/plugin-nvptx.c:98:16: error: > ‘cuOccupancyMaxPotentialBlockSize’ undeclared here (not in a function); > did you mean ‘cuOccupancyMaxPotentialBlockSizeWithFlags’? > CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \ > ... 
> >> @@ -1220,11 +1227,39 @@ nvptx_exec (void (*fn), size_t mapnum, void >> **hostaddrs, void **devaddrs, >> >>{ >> bool default_dim_p[GOMP_DIM_MAX]; >> +int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR]; >> +int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER]; >> +int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG]; >> + >> +/* The CUDA driver occupancy calculator is only available on >> + CUDA version 6.5 (6050) and newer. */ >> +if (nvthd->ptx_dev->driver_version > 6050) >> + { >> +int grids, blocks; >> +CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, , >> + , function, NULL, 0, >> +
Re: [PATCH,nvptx] Remove use of 'struct map' from plugin (nvptx)
On 08/01/2018 04:01 AM, Tom de Vries wrote: > On 07/31/2018 05:12 PM, Cesar Philippidis wrote: >> This is an old patch which removes the struct map from the nvptx plugin. >> I believe at one point this was supposed to be used to manage async data >> mappings, but in practice that never worked out. > > I don't quite understand what rationale you're trying to present here. > > Is this dead code? It's dead code. Cesar
[og8] More goacc_parlevel enhancements
I've committed this patch which contains all of the remaining goacc_parlevel bug fixes present in trunk to og8. The goal of the goacc parlevel changes is replace the use of inline ptx code with builtin functions so that the certain OpenACC execution tests that exercise the execution model can be target independent. For the most part, these patches applied cleanly to og8, however, as I noted in PR86757, there were a couple of og8-specific regressions involving tests that started to fail when built -O0. I believe that problem is caused by the ganglocal memory changes. Chung-Lin, we'll need to fix PR86757 before we push the gangprivate changes upstream. Julian, I'm not sure if the GCN port supports gangprivate memory. If it does, you might be hit by this failure at -O0. But those tests have already been xfailed, so you should be OK. Cesar [og8] More goacc_parlevel enhancements 2018-07-31 Cesar Philippidis libgomp/ * testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust test. * testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-g-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise. Backport from mainline: 2018-05-02 Tom de Vries PR libgomp/85411 libgomp/ * plugin/plugin-nvptx.c (nvptx_exec): Move parsing of GOMP_OPENACC_DIM ... * env.c (parse_gomp_openacc_dim): ... here. New function. (initialize_env): Call parse_gomp_openacc_dim. (goacc_default_dims): Define. * libgomp.h (goacc_default_dims): Declare. * oacc-plugin.c (GOMP_PLUGIN_acc_default_dim): New function. * oacc-plugin.h (GOMP_PLUGIN_acc_default_dim): Declare. * libgomp.map: New version "GOMP_PLUGIN_1.2". 
Add GOMP_PLUGIN_acc_default_dim. * testsuite/libgomp.oacc-c-c++-common/loop-default-runtime.c: New test. * testsuite/libgomp.oacc-c-c++-common/loop-default.h: New test. 2018-05-04 Tom de Vries PR libgomp/85639 gcc/ * builtins.c (expand_builtin_goacc_parlevel_id_size): Handle null target if ignore == 0. 2018-05-07 Tom de Vries PR testsuite/85677 libgomp/ * testsuite/lib/libgomp.exp (libgomp_init): Move inclusion of top-level include directory in ALWAYS_CFLAGS out of $blddir != "" condition. [openacc] Move GOMP_OPENACC_DIM parsing out of nvptx plugin git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@259852 138bc75d-0d04-0410-961f-82ee72b054a4 [expand] Handle null target in expand_builtin_goacc_parlevel_id_size git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@259927 138bc75d-0d04-0410-961f-82ee72b054a4 [openacc, testsuite] Allow installed testing of libgomp to find gomp-constants.h git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@259992 138bc75d-0d04-0410-961f-82ee72b054a4 diff --git a/gcc/builtins.c b/gcc/builtins.c index 300e13c..0097d5b 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -6682,6 +6682,9 @@ expand_builtin_goacc_parlevel_id_size (tree exp, rtx target, int ignore) if (ignore) return target; + if (target == NULL_RTX) +target = gen_reg_rtx (TYPE_MODE (TREE_TYPE (exp))); + if (!targetm.have_oacc_dim_size ()) { emit_move_insn (target, fallback_retval); diff --git a/libgomp/env.c b/libgomp/env.c index c99ba85..fab35b7 100644 --- a/libgomp/env.c +++ b/libgomp/env.c @@ -90,6 +90,7 @@ int gomp_debug_var; unsigned int gomp_num_teams_var; char *goacc_device_type; int goacc_device_num; +int goacc_default_dims[GOMP_DIM_MAX]; #ifndef LIBGOMP_OFFLOADED_ONLY @@ -1066,6 +1067,36 @@ parse_acc_device_type (void) } static void +parse_gomp_openacc_dim (void) +{ + /* The syntax is the same as for the -fopenacc-dim compilation option. 
*/ + const char *var_name = "GOMP_OPENACC_DIM"; + const char *env_var = getenv (var_name); + if (!env_var) +return; + + const char *pos = env_var; + int i; + for (i = 0; *pos && i != GOMP_DIM_MAX; i++) +{ + if (i && *pos++ != ':') + break; + + if (*pos == ':') + continue; + + const char *eptr; + errno = 0; + long val = strtol (pos, (char **), 10); + if (errno || val < 0 || (unsigned)val != val) + break; + + goacc_default_dims[i] = (int)val; + pos = eptr; +} +} + +static void handle_omp_display_env (unsigned long stacksize, int wait_policy) { const char *env; @@ -1336,6 +1367,7 @@ initialize_env (void) goacc_device_num = 0; parse_acc_device_type (); + parse_gomp_openacc_dim (); goacc_runtime_initialize (); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index a9aca74..607f4c2 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -44,6 +44,7 @@ #include "config.h" #include "gst
[og8] Add __builtin_goacc_parlevel_{id,size}
I've committed this patch to og8 which backports the first of Tom's goacc_parlevel patches from mainline. I'll post of a followup patch which contains various bug fixes. I believe that this patch was originally introduced in PR82428, or at least it resolves that PR. Cesar [og8] Add __builtin_goacc_parlevel_{id,size} 2018-07-31 Cesar Philippidis Backport from mainline: 2018-05-02 Tom de Vries PR libgomp/82428 gcc/ * builtins.def (DEF_GOACC_BUILTIN_ONLY): Define. * omp-builtins.def (BUILT_IN_GOACC_PARLEVEL_ID) (BUILT_IN_GOACC_PARLEVEL_SIZE): New builtin. * builtins.c (expand_builtin_goacc_parlevel_id_size): New function. (expand_builtin): Call expand_builtin_goacc_parlevel_id_size. * doc/extend.texi (Other Builtins): Add __builtin_goacc_parlevel_id and __builtin_goacc_parlevel_size. gcc/fortran/ * f95-lang.c (DEF_GOACC_BUILTIN_ONLY): Define. gcc/testsuite/ * c-c++-common/goacc/builtin-goacc-parlevel-id-size-2.c: New test. * c-c++-common/goacc/builtin-goacc-parlevel-id-size.c: New test. libgomp/ * testsuite/libgomp.oacc-c-c++-common/gang-static-2.c: Use __builtin_goacc_parlevel_{id,size}. * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Same. 
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Same. * testsuite/libgomp.oacc-c-c++-common/routine-g-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/routine-v-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Same. * testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Same. * testsuite/libgomp.oacc-c-c++-common/tile-1.c: Same. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@259850 138bc75d-0d04-0410-961f-82ee72b054a4 diff --git a/gcc/builtins.c b/gcc/builtins.c index a71555e..300e13c 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -71,6 +71,8 @@ along with GCC; see the file COPYING3. If not see #include "gimple-fold.h" #include "intl.h" #include "file-prefix-map.h" /* remap_macro_filename() */ +#include "gomp-constants.h" +#include "omp-general.h" struct target_builtins default_target_builtins; #if SWITCHABLE_TARGET @@ -6628,6 +6630,71 @@ expand_stack_save (void) return ret; } +/* Emit code to get the openacc gang, worker or vector id or size. 
*/ + +static rtx +expand_builtin_goacc_parlevel_id_size (tree exp, rtx target, int ignore) +{ + const char *name; + rtx fallback_retval; + rtx_insn *(*gen_fn) (rtx, rtx); + switch (DECL_FUNCTION_CODE (get_callee_fndecl (exp))) +{ +case BUILT_IN_GOACC_PARLEVEL_ID: + name = "__builtin_goacc_parlevel_id"; + fallback_retval = const0_rtx; + gen_fn = targetm.gen_oacc_dim_pos; + break; +case BUILT_IN_GOACC_PARLEVEL_SIZE: + name = "__builtin_goacc_parlevel_size"; + fallback_retval = const1_rtx; + gen_fn = targetm.gen_oacc_dim_size; + break; +default: + gcc_unreachable (); +} + + if (oacc_get_fn_attrib (current_function_decl) == NULL_TREE) +{ + error ("%qs only supported in OpenACC code", name); + return const0_rtx; +} + + tree arg = CALL_EXPR_ARG (exp, 0); + if (TREE_CODE (arg) != INTEGER_CST) +{ + error ("non-constant argument 0 to %qs", name); + return const0_rtx; +} + + int dim = TREE_INT_CST_LOW (arg); + switch (dim) +{ +case GOMP_DIM_GANG: +case GOMP_DIM_WORKER: +case GOMP_DIM_VECTOR: + break; +default: + error ("illegal argument 0 to %qs", name); + return const0_rtx; +} + + if (ignore) +return target; + + if (!targetm.have_oacc_dim_size ()) +{ + emit_move_insn (target, fallback_retval); + return target; +} + + rtx reg = MEM_P (target) ? gen_reg_rtx (GET_MODE (target)) : target; + emit_insn (gen_fn (reg, GEN_INT (dim))); + if (reg != target) +emit_move_insn (target, reg); + + return target; +} /* Expand an expression EXP that calls a built-in function, with result going to TARGET if that's convenient @@ -7
[PATCH,nvptx] Truncate config/nvptx/oacc-parallel.c
Way back in the GCC 5 days when support for OpenACC was in its infancy, we used to rely on having various GOACC_ thread functions in the runtime to implement the execution model, or the lack thereof (that version of GCC only supported vector level parallelism). However, beginning with GCC 6, those external functions were replaced with internal functions that get expanded by the nvptx BE directly. This patch removes those stale libgomp functions from the nvptx libgomp target. Is this OK for trunk, or does libgomp still need to maintain backwards compatibility with GCC 5? This patch has been bootstrapped and regtested for x86_64 with nvptx offloading. Thanks, Cesar [PATCH] [libgomp] Truncate config/nvptx/oacc-parallel.c 2018-XX-YY Cesar Philippidis Thomas Schwinge libgomp/ * config/nvptx/oacc-parallel.c: Truncate. (cherry picked from gomp-4_0-branch r228836) --- libgomp/config/nvptx/oacc-parallel.c | 358 --- 1 file changed, 358 deletions(-) diff --git a/libgomp/config/nvptx/oacc-parallel.c b/libgomp/config/nvptx/oacc-parallel.c index 5dc53da..e69de29 100644 --- a/libgomp/config/nvptx/oacc-parallel.c +++ b/libgomp/config/nvptx/oacc-parallel.c @@ -1,358 +0,0 @@ -/* OpenACC constructs - - Copyright (C) 2014-2018 Free Software Foundation, Inc. - - Contributed by Mentor Embedded. - - This file is part of the GNU Offloading and Multi Processing Library - (libgomp). - - Libgomp is free software; you can redistribute it and/or modify it - under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3, or (at your option) - any later version. - - Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - FOR A PARTICULAR PURPOSE. See the GNU General Public License for - more details. 
- - Under Section 7 of GPL version 3, you are granted additional - permissions described in the GCC Runtime Library Exception, version - 3.1, as published by the Free Software Foundation. - - You should have received a copy of the GNU General Public License and - a copy of the GCC Runtime Library Exception along with this program; - see the files COPYING3 and COPYING.RUNTIME respectively. If not, see - <http://www.gnu.org/licenses/>. */ - -#include "libgomp_g.h" - -__asm__ (".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n" - ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n" - ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n" - ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n" - "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_num_threads\n" - ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n" - "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_thread_num\n" - ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n" - "// BEGIN GLOBAL FUNCTION DECL: abort\n" - ".extern .func abort;\n" - ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n" - "{\n" - ".reg .u32 %ar1;\n" - ".reg .u32 %retval;\n" - ".reg .u64 %hr10;\n" - ".reg .u32 %r22;\n" - ".reg .u32 %r23;\n" - ".reg .u32 %r24;\n" - ".reg .u32 %r25;\n" - ".reg .u32 %r26;\n" - ".reg .u32 %r27;\n" - ".reg .u32 %r28;\n" - ".reg .u32 %r29;\n" - ".reg .pred %r30;\n" - ".reg .u32 %r31;\n" - ".reg .pred %r32;\n" - ".reg .u32 %r33;\n" - ".reg .pred %r34;\n" - ".local .align 8 .b8 %frame[4];\n" - "ld.param.u32 %ar1,[%in_ar1];\n" - "mov.u32 %r27,%ar1;\n" - "st.local.u32 [%frame],%r27;\n" - "ld.local.u32 %r28,[%frame];\n" - "mov.u32 %r29,1;\n" - "setp.eq.u32 %r30,%r28,%r29;\n" - "@%r30 bra $L4;\n" - "mov.u32 %r31,2;\n" - "setp.eq.u32 %r32,%r28,%r31;\n" - "@%r32 bra $L5;\n" - "mov.u32 %r33,0;\n" - "setp.eq.u32 %r34,%r28,%r33;\n" - "@!%r34 bra $L8;\n" - "mov.u32 %r23,%tid.x;\n" 
- "mov.u32 %r22,%r23;\n" - "bra $L7;\n" - "$L4:\n" - "mov.u32 %r24,%tid.y;\n" - "mov.u32 %r22,%r24;\n" - "bra $L7;\n" - "$L5:\n" - "mov.u32 %r25,%tid.z;\n" - "mov.u32 %r22,%r25;\n" - "bra $L7;\n" - "$L8:\n" - "{\n" - "{\n" - "call abort;\n" - "}\n" - "}\n" - "$L7:\n" - "mov.u32 %r26,%r22;\n" - "mov.u32 %retval,%r26;\n" - "st.param.u32 [%ou
[PATCH,nvptx] Remove use of CUDA unified memory in libgomp
At present, libgomp is using CUDA unified memory only as a buffer to pass the struct containing the pointers to the data mappings to the offloaded functions. I'm not sure why unified memory is needed here if it is still being managed explicitly by the driver. This patch removes the use of CUDA unified memory from the driver. I don't recall observing any reduction in performance. Besides, eventually, we'd like to eliminate the struct containing all pointers to the offloaded data mappings and pass those pointers as individual function arguments to cuLaunchKernel directly. Is this patch OK for trunk? I bootstrapped and regression tested it for x86_64 with nvptx offloading. Thanks, Cesar [PATCH] [nvptx] Remove use of CUDA unified memory in libgomp 2018-XX-YY Cesar Philippidis libgomp/ * plugin/plugin-nvptx.c (struct cuda_map): New. (struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev, h_tail with (cuda_map *) map. (cuda_map_create): New function. (cuda_map_destroy): New function. (map_init): Update to use a linked list of cuda_map objects. (map_fini): Likewise. (map_pop): Likewise. (map_push): Likewise. Return CUdeviceptr instead of void. (init_streams_for_device): Remove stale references to ptx_stream members. (select_stream_for_async): Likewise. (nvptx_exec): Update call to map_init. 
(cherry picked from gomp-4_0-branch r242614) --- libgomp/plugin/plugin-nvptx.c | 167 +++--- 1 file changed, 90 insertions(+), 77 deletions(-) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 1237ea10..d79ddf1 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -200,20 +200,20 @@ cuda_error (CUresult r) static unsigned int instantiated_devices = 0; static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; +struct cuda_map +{ + CUdeviceptr d; + size_t size; + bool active; + struct cuda_map *next; +}; + struct ptx_stream { CUstream stream; pthread_t host_thread; bool multithreaded; - - CUdeviceptr d; - void *h; - void *h_begin; - void *h_end; - void *h_next; - void *h_prev; - void *h_tail; - + struct cuda_map *map; struct ptx_stream *next; }; @@ -225,101 +225,114 @@ struct nvptx_thread struct ptx_device *ptx_dev; }; +static struct cuda_map * +cuda_map_create (size_t size) +{ + struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); + + assert (map); + + map->next = NULL; + map->size = size; + map->active = false; + + CUDA_CALL_ERET (NULL, cuMemAlloc, >d, size); + assert (map->d); + + return map; +} + +static void +cuda_map_destroy (struct cuda_map *map) +{ + CUDA_CALL_ASSERT (cuMemFree, map->d); + free (map); +} + +/* The following map_* routines manage the CUDA device memory that + contains the data mapping arguments for cuLaunchKernel. Each + asynchronous PTX stream may have multiple pending kernel + invocations, which are launched in a FIFO order. As such, the map + routines maintains a queue of cuLaunchKernel arguments. + + Calls to map_push and map_pop must be guarded by ptx_event_lock. + Likewise, calls to map_init and map_fini are guarded by + ptx_dev_lock inside GOMP_OFFLOAD_init_device and + GOMP_OFFLOAD_fini_device, respectively. 
*/ + static bool map_init (struct ptx_stream *s) { int size = getpagesize (); assert (s); - assert (!s->d); - assert (!s->h); - - CUDA_CALL (cuMemAllocHost, >h, size); - CUDA_CALL (cuMemHostGetDevicePointer, >d, s->h, 0); - assert (s->h); + s->map = cuda_map_create (size); - s->h_begin = s->h; - s->h_end = s->h_begin + size; - s->h_next = s->h_prev = s->h_tail = s->h_begin; - - assert (s->h_next); - assert (s->h_end); return true; } static bool map_fini (struct ptx_stream *s) { - CUDA_CALL (cuMemFreeHost, s->h); + assert (s->map->next == NULL); + assert (!s->map->active); + + cuda_map_destroy (s->map); + return true; } static void map_pop (struct ptx_stream *s) { - assert (s != NULL); - assert (s->h_next); - assert (s->h_prev); - assert (s->h_tail); - - s->h_tail = s->h_next; - - if (s->h_tail >= s->h_end) -s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); + struct cuda_map *next; - if (s->h_next == s->h_tail) -s->h_prev = s->h_next; + assert (s != NULL); - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); + if (s->map->next == NULL) +{ + s->map->active = false; + return; +} - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); + next = s->map->next; + cuda_map_destroy (s->map); + s->map = next; } -static void -map_push (struct ptx_stream *s, size_t size,
[PATCH,nvptx] Remove use of 'struct map' from plugin (nvptx)
This is an old patch which removes the struct map from the nvptx plugin. I believe at one point this was supposed to be used to manage async data mappings, but in practice that never worked out. Is this OK for trunk? I bootstrapped and regtested on x86_64 with nvptx offloading. Thanks, Cesar [PATCH] Remove use of 'struct map' from plugin (nvptx) 2018-XX-YY Cesar Philippidis James Norris libgomp/ * plugin/plugin-nvptx.c (struct map): Removed. (map_init, map_pop): Remove use of struct map. (map_push): Likewise and change argument list. * testsuite/libgomp.oacc-c-c++-common/mapping-1.c: New (cherry picked from gomp-4_0-branch r231616) --- libgomp/plugin/plugin-nvptx.c | 33 +++- .../libgomp.oacc-c-c++-common/mapping-1.c | 63 ++ 2 files changed, 69 insertions(+), 27 deletions(-) create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/mapping-1.c diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index a92f054..1237ea10 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -225,13 +225,6 @@ struct nvptx_thread struct ptx_device *ptx_dev; }; -struct map -{ - int async; - size_t size; - charmappings[0]; -}; - static bool map_init (struct ptx_stream *s) { @@ -265,16 +258,12 @@ map_fini (struct ptx_stream *s) static void map_pop (struct ptx_stream *s) { - struct map *m; - assert (s != NULL); assert (s->h_next); assert (s->h_prev); assert (s->h_tail); - m = s->h_tail; - - s->h_tail += m->size; + s->h_tail = s->h_next; if (s->h_tail >= s->h_end) s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); @@ -292,37 +281,27 @@ map_pop (struct ptx_stream *s) } static void -map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d) +map_push (struct ptx_stream *s, size_t size, void **h, void **d) { int left; int offset; - struct map *m; assert (s != NULL); left = s->h_end - s->h_next; - size += sizeof (struct map); assert (s->h_prev); assert (s->h_next); if (size >= left) { - m = s->h_prev; - m->size += left; 
- s->h_next = s->h_begin; - - if (s->h_next + size > s->h_end) - GOMP_PLUGIN_fatal ("unable to push map"); + assert (s->h_next == s->h_prev); + s->h_next = s->h_prev = s->h_tail = s->h_begin; } assert (s->h_next); - m = s->h_next; - m->async = async; - m->size = size; - - offset = (void *)>mappings[0] - s->h; + offset = s->h_next - s->h; *d = (void *)(s->d + offset); *h = (void *)(s->h + offset); @@ -1291,7 +1270,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ - map_push (dev_str, async, mapnum * sizeof (void *), , ); + map_push (dev_str, mapnum * sizeof (void *), , ); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/mapping-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/mapping-1.c new file mode 100644 index 000..593e7d4 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/mapping-1.c @@ -0,0 +1,63 @@ +/* { dg-do run } */ + +#include +#include +#include + +/* Exercise the kernel launch argument mapping. */ + +int +main (int argc, char **argv) +{ + int a[256], b[256], c[256], d[256], e[256], f[256]; + int i; + int n; + + /* 48 is the size of the mappings for the first parallel construct. 
*/ + n = sysconf (_SC_PAGESIZE) / 48 - 1; + + i = 0; + + for (i = 0; i < n; i++) +{ + #pragma acc parallel copy (a, b, c, d) + { + int j; + + for (j = 0; j < 256; j++) + { + a[j] = j; + b[j] = j; + c[j] = j; + d[j] = j; + } + } +} + +#pragma acc parallel copy (a, b, c, d, e, f) + { +int j; + +for (j = 0; j < 256; j++) + { + a[j] = j; + b[j] = j; + c[j] = j; + d[j] = j; + e[j] = j; + f[j] = j; + } + } + + for (i = 0; i < 256; i++) + { + if (a[i] != i) abort(); + if (b[i] != i) abort(); + if (c[i] != i) abort(); + if (d[i] != i) abort(); + if (e[i] != i) abort(); + if (f[i] != i) abort(); + } + + exit (0); +} -- 2.7.4
[PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry
The attached patch teaches libgomp how to use the CUDA thread occupancy calculator built into the CUDA driver. Despite both being based off the CUDA thread occupancy spreadsheet distributed with CUDA, the built-in occupancy calculator differs from the occupancy calculator in og8 in two key ways. First, og8 launches twice as many gangs as the driver thread occupancy calculator. This was my attempt at preventing threads from idling, and it operates on a principle similar to running 'make -jN', where N is twice the number of CPU threads. Second, whereas og8 always attempts to maximize the CUDA block size, the driver may select a smaller block, which effectively decreases num_workers. In terms of performance, there really isn't that much of a difference between the CUDA driver's occupancy calculator and og8's. However, on the tests that are impacted, they are generally within a factor of two from one another, with some tests running faster with the driver occupancy calculator and others with og8's. Unfortunately, support for this CUDA driver API isn't universal; it's only available in CUDA version 6.5 (or 6050) and newer. In this patch, I'm exploiting the fact that init_cuda_lib only checks for errors on the last library function initialized. Therefore, the patch guards the usage of cuOccupancyMaxPotentialBlockSizeWithFlags by checking driver_version. If the driver occupancy calculator isn't available, it falls back to the existing defaults. Maybe the og8 thread occupancy would make a better default for older versions of CUDA, but that's a patch for another day. Is this patch OK for trunk? I bootstrapped and regression tested it using x86_64 with nvptx offloading. Thanks, Cesar [nvptx] Use CUDA driver API to select default runtime launch geometry 2018-XX-YY Cesar Philippidis libgomp/ plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef. (cuDriverGetVersion): Declare. (cuOccupancyMaxPotentialBlockSizeWithFlags): Declare. 
plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize. (ptx_device): Add driver_version member. (nvptx_open_device): Initialize it. (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the default num_gangs and num_workers when the driver supports it. --- libgomp/plugin/cuda/cuda.h| 5 + libgomp/plugin/plugin-nvptx.c | 37 - 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h index 4799825..1fc694d 100644 --- a/libgomp/plugin/cuda/cuda.h +++ b/libgomp/plugin/cuda/cuda.h @@ -44,6 +44,7 @@ typedef void *CUevent; typedef void *CUfunction; typedef void *CUlinkState; typedef void *CUmodule; +typedef size_t (*CUoccupancyB2DSize)(int); typedef void *CUstream; typedef enum { @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void); CUresult cuDeviceGet (CUdevice *, int); CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice); CUresult cuDeviceGetCount (int *); +CUresult cuDriverGetVersion (int *); CUresult cuEventCreate (CUevent *, unsigned); #define cuEventDestroy cuEventDestroy_v2 CUresult cuEventDestroy (CUevent); @@ -170,6 +172,9 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *); CUresult cuModuleLoad (CUmodule *, const char *); CUresult cuModuleLoadData (CUmodule *, const void *); CUresult cuModuleUnload (CUmodule); +CUresult cuOccupancyMaxPotentialBlockSizeWithFlags (int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, + int, unsigned int); CUresult cuStreamCreate (CUstream *, unsigned); #define cuStreamDestroy cuStreamDestroy_v2 CUresult cuStreamDestroy (CUstream); diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index b6ec5f8..2647af6 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -63,6 +63,7 @@ CUDA_ONE_CALL (cuCtxSynchronize) \ CUDA_ONE_CALL (cuDeviceGet) \ CUDA_ONE_CALL (cuDeviceGetAttribute) \ CUDA_ONE_CALL (cuDeviceGetCount) \ 
+CUDA_ONE_CALL (cuDriverGetVersion) \ CUDA_ONE_CALL (cuEventCreate) \ CUDA_ONE_CALL (cuEventDestroy) \ CUDA_ONE_CALL (cuEventElapsedTime) \ @@ -94,6 +95,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal) \ CUDA_ONE_CALL (cuModuleLoad) \ CUDA_ONE_CALL (cuModuleLoadData) \ CUDA_ONE_CALL (cuModuleUnload) \ +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \ CUDA_ONE_CALL (cuStreamCreate) \ CUDA_ONE_CALL (cuStreamDestroy) \ CUDA_ONE_CALL (cuStreamQuery) \ @@ -423,6 +425,7 @@ struct ptx_device int max_threads_per_block; int max_threads_per_multiprocessor; int default_dims[GOMP_DIM_MAX]; + int driver_version; struct ptx_image_data *images; /* Images loaded on device. */ pthread_mutex_t image_lock; /* Lock for above list. */ @@ -734,6 +737,7 @@ nvptx_open_device (int n) ptx_dev->ord = n; ptx_dev->dev = dev; ptx_dev->ctx_shared = false; + ptx_dev->driver_version = 0; r = CUDA_
Re: [libgomp, nvptx, committed] Calculate default dims per device
On 07/30/2018 03:19 AM, Tom de Vries wrote: > > [libgomp, nvptx] Calculate default dims per device > > The default dimensions are calculated using per-device properties, but > initialized once and used on all devices. > > This patch fixes this problem by introducing per-device default dimensions. Neat, thanks! I wonder if it's worthwhile to optimize the case where a system has more than one identical GPU. Cesar
Re: [PATCH 0/8] Reduce/remove dependencies on _GLIBCXX_USE_C99_STDINT_TR1
On 07/26/2018 07:01 AM, jwak...@redhat.com wrote: > From: Jonathan Wakely It looks like you're using git send-email for this patch series. And it seems like you made the same mistake that I did when you configured git sendemail.from. According to the git send-email manpage, from should be your email address; however, it really wants it to be of the form "Full Name <email address>". This is not a huge deal because the email went through, but it was something that wasn't immediately obvious to me. Cesar
Re: [patch] adjust default nvptx launch geometry for OpenACC offloaded regions
Hi Tom, I see that you're reviewing the libgomp changes. Please disregard the following hunk: On 07/11/2018 12:13 PM, Cesar Philippidis wrote: > @@ -1199,12 +1202,59 @@ nvptx_exec (void (*fn), size_t mapnum, void > **hostaddrs, void **devaddrs, >default_dims[GOMP_DIM_VECTOR]); > } >pthread_mutex_unlock (&ptx_dev_lock); > + int vectors = default_dims[GOMP_DIM_VECTOR]; > + int workers = default_dims[GOMP_DIM_WORKER]; > + int gangs = default_dims[GOMP_DIM_GANG]; > + > + if (nvptx_thread()->ptx_dev->driver_version > 6050) > + { > + int grids, blocks; > + CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids, > + &blocks, function, NULL, 0, > + dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]); > + GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: " > + "grid = %d, block = %d\n", grids, blocks); > + > + gangs = grids * dev_size; > + workers = blocks / vectors; > + } I revisited this change yesterday and I noticed it was setting gangs incorrectly. Basically, gangs should be set as follows: gangs = grids * (blocks / warp_size); or, to be closer to og8, as gangs = 2 * grids * (blocks / warp_size); The use of that magic constant 2 is to prevent thread starvation. That's the same idea behind make -j<2*#threads>. Anyway, I'm still experimenting with that change. There are still some discrepancies between the way that I select num_workers and how the driver does. The driver appears to be a little bit more conservative, but according to the thread occupancy calculator, that should yield greater performance on GPUs. I just wanted to give you a heads up because you seem to be working on this. Thanks for all of your reviews! By the way, are you now the maintainer of the libgomp nvptx plugin? Cesar
Re: [PATCH 3/3] Add user-friendly OpenACC diagnostics regarding detected parallelism.
On 07/26/2018 01:33 AM, Richard Biener wrote: > On Wed, Jul 25, 2018 at 5:30 PM Cesar Philippidis > wrote: >> >> This patch teaches GCC to inform the user how it assigned parallelism >> to each OpenACC loop at compile time using the -fopt-info-note-omp >> flag. For instance, given the acc parallel loop nest: >> >> #pragma acc parallel loop >> for (...) >> #pragma acc loop vector >> for (...) >> >> GCC will report somthing like >> >> foo.c:4:0: note: Detected parallelism >> foo.c:6:0: note: Detected parallelism >> >> Note how only the inner loop specifies vector parallelism. In this >> example, GCC automatically assigned gang and worker parallelism to the >> outermost loop. Perhaps, going forward, it would be useful to >> distinguish which parallelism was specified by the user and which was >> assigned by the compiler. But that can be added in a follow up patch. >> >> Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 >> with nvptx offloading. > > Shouldn't this use MSG_OPTIMIZED_LOCATIONS instead? Are there > any other optinfo notes emitted? Like when despite pragmas loops > are not handled or so? Early on I was just using the diagnostics in omp-grid.c as a model, but yes, it does make sense to use MSG_OPTIMIZED_LOCATIONS instead of MSG_NOTE. And no, these are the only optinfo notes that we're emitting at the moment. All of the other diagnostics are just errors and warnings, although we probably should revisit that for some of the forthcoming acc routine diagnostics. Going forward, now that there's in interest in automatic parallelism inside acc kernels, we do plan on expanding the diagnostics. The attached revised patch now uses MSG_OPTIMIZED_LOCATIONS for the diagnostics. If this gets approved for trunk, I'll go ahead and backport it to og8 and update the OpenACC wiki to change the usage of -fopt-info-note-omp to -fopt-info-optimized-omp. Is this OK for trunk? 
Thanks, Cesar 2018-XX-YY Cesar Philippidis gcc/ * omp-offload.c (inform_oacc_loop): New function. (execute_oacc_device_lower): Use it to display loop parallelism. gcc/testsuite/ * c-c++-common/goacc/note-parallelism.c: New test. * gfortran.dg/goacc/note-parallelism.f90: New test. (cherry picked from gomp-4_0-branch r245683, and gcc/testsuite/ parts of r245770) use MSG_OPTIMIZED_LOCATIONS instead of MSG_NOTE --- gcc/omp-offload.c | 27 .../c-c++-common/goacc/note-parallelism.c | 61 ++ .../gfortran.dg/goacc/note-parallelism.f90| 62 +++ 3 files changed, 150 insertions(+) create mode 100644 gcc/testsuite/c-c++-common/goacc/note-parallelism.c create mode 100644 gcc/testsuite/gfortran.dg/goacc/note-parallelism.f90 diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c index 0abf0283c9e..3582dda3d1a 100644 --- a/gcc/omp-offload.c +++ b/gcc/omp-offload.c @@ -866,6 +866,31 @@ debug_oacc_loop (oacc_loop *loop) dump_oacc_loop (stderr, loop, 0); } +/* Provide diagnostics on OpenACC loops LOOP, its siblings and its + children. */ + +static void +inform_oacc_loop (oacc_loop *loop) +{ + const char *seq = loop->mask == 0 ? " seq" : ""; + const char *gang = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) +? " gang" : ""; + const char *worker = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) +? " worker" : ""; + const char *vector = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) +? " vector" : ""; + dump_location_t loc = dump_location_t::from_location_t (loop->loc); + + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, + "Detected parallelism \n", seq, gang, + worker, vector); + + if (loop->child) +inform_oacc_loop (loop->child); + if (loop->sibling) +inform_oacc_loop (loop->sibling); +} + /* DFS walk of basic blocks BB onwards, creating OpenACC loop structures as we go. By construction these loops are properly nested. 
*/ @@ -1533,6 +1558,8 @@ execute_oacc_device_lower () dump_oacc_loop (dump_file, loops, 0); fprintf (dump_file, "\n"); } + if (dump_enabled_p () && loops->child) +inform_oacc_loop (loops->child); /* Offloaded targets may introduce new basic blocks, which require dominance information to update SSA. */ diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c new file mode 100644 index 000..2e50d86cd23 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c @@ -0,0 +1,61 @@ +/* Test the output of -fopt-info-note-omp. */ + +/* { dg-additional-options "-fopt-info-note-optimized" } */ + +int +main () +{ + int x, y, z; + +#pragma acc parallel
Re: [PATCH 00/11] [nvptx] Initial vector length changes
On 07/24/2018 01:47 PM, ce...@codesourcery.com wrote: > From: Cesar Philippidis > > This patch series contains various cleanups and structural > reorganizations to the NVPTX BE in preparation for the forthcoming > variable length vector length enhancements. Tom, in order to make > these changes easier for you to review, I broke these patches into > logical components. If approved for trunk, would you like to see these > patches committed individually, or all together in a single huge > commit? > > One notable change in this patch set is the partial inclusion of the > PTX_DEFAULT_RUNTIME_DIM change that I previously placed with the > libgomp default geometry update patch that I posted a couple of weeks > ago. I don't want to block this patch series so I included the nvptx > changes in patch 01. > > Is this OK for trunk? I regtested both standalone and offloading > compilers. I'm seeing some inconsistencies in the standalone compiler > results, so I might rerun those just to be safe. But the results using > nvptx as an offloading compiler came back clean. On further inspection, the inconsistencies turned out to be isolated to the C++ tests. The C test results are clean. Cesar
Re: [PATCH 1/3] Correct the reported line number in fortran combined OpenACC directives
On 07/25/2018 08:32 AM, Marek Polacek wrote: > On Wed, Jul 25, 2018 at 08:29:17AM -0700, Cesar Philippidis wrote: >> The fortran FE incorrectly records the line locations of combined acc >> loop directives when it lowers the construct to gimple. Usually this >> isn't a problem because the fortran FE is able to report problems with >> acc loops itself. However, there will be inaccuracies if the ME tries >> to use those locations. >> >> Note that test cases are inconspicuously absent in this patch. >> However, without this bug fix, -fopt-info-note-omp will report bogus >> line numbers. This code patch will be tested in a later patch in >> this series. >> >> Is this OK for trunk? I bootstrapped and regtested it on x86_64 with >> nvptx offloading. >> >> Thanks, >> Cesar >> >> 2018-XX-YY Cesar Philippidis >> >> gcc/fortran/ >> * trans-openmp.c (gfc_trans_oacc_combined_directive): Set the >> location of combined acc loops. >> >> (cherry picked from gomp-4_0-branch r245653) >> >> diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c >> index f038f4c..e7707d0 100644 >> --- a/gcc/fortran/trans-openmp.c >> +++ b/gcc/fortran/trans-openmp.c >> @@ -3869,6 +3869,7 @@ gfc_trans_oacc_combined_directive (gfc_code *code) >>gfc_omp_clauses construct_clauses, loop_clauses; >>tree stmt, oacc_clauses = NULL_TREE; >>enum tree_code construct_code; >> + location_t loc = input_location; >> >>switch (code->op) >> { >> @@ -3930,12 +3931,16 @@ gfc_trans_oacc_combined_directive (gfc_code *code) >>else >> pushlevel (); >>stmt = gfc_trans_omp_do (code, EXEC_OACC_LOOP, pblock, _clauses, >> NULL); >> + >> + if (CAN_HAVE_LOCATION_P (stmt)) >> +SET_EXPR_LOCATION (stmt, loc); > > This is protected_set_expr_location. Neat, thanks! This patch includes that correction. Is it ok for trunk after bootstrapping and regression testing? Thanks, Cesar 2018-XX-YY Cesar Philippidis gcc/fortran/ * trans-openmp.c (gfc_trans_oacc_combined_directive): Set the location of combined acc loops. 
(cherry picked from gomp-4_0-branch r245653) --- gcc/fortran/trans-openmp.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c index f038f4c5bf8..b549c682533 100644 --- a/gcc/fortran/trans-openmp.c +++ b/gcc/fortran/trans-openmp.c @@ -3869,6 +3869,7 @@ gfc_trans_oacc_combined_directive (gfc_code *code) gfc_omp_clauses construct_clauses, loop_clauses; tree stmt, oacc_clauses = NULL_TREE; enum tree_code construct_code; + location_t loc = input_location; switch (code->op) { @@ -3929,13 +3930,16 @@ gfc_trans_oacc_combined_directive (gfc_code *code) pblock = else pushlevel (); + stmt = gfc_trans_omp_do (code, EXEC_OACC_LOOP, pblock, _clauses, NULL); + protected_set_expr_location (stmt, loc); + if (TREE_CODE (stmt) != BIND_EXPR) stmt = build3_v (BIND_EXPR, NULL, stmt, poplevel (1, 0)); else poplevel (0, 0); - stmt = build2_loc (input_location, construct_code, void_type_node, stmt, - oacc_clauses); + + stmt = build2_loc (loc, construct_code, void_type_node, stmt, oacc_clauses); gfc_add_expr_to_block (, stmt); return gfc_finish_block (); } -- 2.17.1
[PATCH 3/3] Add user-friendly OpenACC diagnostics regarding detected parallelism.
This patch teaches GCC to inform the user how it assigned parallelism to each OpenACC loop at compile time using the -fopt-info-note-omp flag. For instance, given the acc parallel loop nest: #pragma acc parallel loop for (...) #pragma acc loop vector for (...) GCC will report something like foo.c:4:0: note: Detected parallelism <acc loop gang worker> foo.c:6:0: note: Detected parallelism <acc loop vector> Note how only the inner loop specifies vector parallelism. In this example, GCC automatically assigned gang and worker parallelism to the outermost loop. Perhaps, going forward, it would be useful to distinguish which parallelism was specified by the user and which was assigned by the compiler. But that can be added in a follow-up patch. Is this patch OK for trunk? I bootstrapped and regtested it for x86_64 with nvptx offloading. Thanks, Cesar 2018-XX-YY Cesar Philippidis gcc/ * omp-offload.c (inform_oacc_loop): New function. (execute_oacc_device_lower): Use it to display loop parallelism. gcc/testsuite/ * c-c++-common/goacc/note-parallelism.c: New test. * gfortran.dg/goacc/note-parallelism.f90: New test. (cherry picked from gomp-4_0-branch r245683, and gcc/testsuite/ parts of r245770) diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c index 0abf028..66b99bb 100644 --- a/gcc/omp-offload.c +++ b/gcc/omp-offload.c @@ -866,6 +866,31 @@ debug_oacc_loop (oacc_loop *loop) dump_oacc_loop (stderr, loop, 0); } +/* Provide diagnostics on OpenACC loops LOOP, its siblings and its + children. */ + +static void +inform_oacc_loop (oacc_loop *loop) +{ + const char *seq = loop->mask == 0 ? " seq" : ""; + const char *gang = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) +? " gang" : ""; + const char *worker = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) +? " worker" : ""; + const char *vector = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) +? 
" vector" : ""; + dump_location_t loc = dump_location_t::from_location_t (loop->loc); + + dump_printf_loc (MSG_NOTE, loc, + "Detected parallelism \n", seq, gang, + worker, vector); + + if (loop->child) +inform_oacc_loop (loop->child); + if (loop->sibling) +inform_oacc_loop (loop->sibling); +} + /* DFS walk of basic blocks BB onwards, creating OpenACC loop structures as we go. By construction these loops are properly nested. */ @@ -1533,6 +1558,8 @@ execute_oacc_device_lower () dump_oacc_loop (dump_file, loops, 0); fprintf (dump_file, "\n"); } + if (dump_enabled_p () && loops->child) +inform_oacc_loop (loops->child); /* Offloaded targets may introduce new basic blocks, which require dominance information to update SSA. */ diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c new file mode 100644 index 000..3ec794c --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c @@ -0,0 +1,61 @@ +/* Test the output of -fopt-info-note-omp. 
*/ + +/* { dg-additional-options "-fopt-info-note-omp" } */ + +int +main () +{ + int x, y, z; + +#pragma acc parallel loop seq /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +; + +#pragma acc parallel loop gang /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +; + +#pragma acc parallel loop worker /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +; + +#pragma acc parallel loop vector /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +; + +#pragma acc parallel loop gang vector /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +; + +#pragma acc parallel loop gang worker /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +; + +#pragma acc parallel loop worker vector /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +; + +#pragma acc parallel loop gang worker vector /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +; + +#pragma acc parallel loop /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +; + +#pragma acc parallel loop /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +#pragma acc loop /* { dg-message "note: Detected parallelism " } */ +for (y = 0; y < 10; y++) + ; + +#pragma acc parallel loop gang /* { dg-message "note: Detected parallelism " } */ + for (x = 0; x < 10; x++) +#pragma acc loop worker /* { dg-message "note: Detected parallelism " } */ +
[PATCH 2/3] Correct the reported line number in c++ combined OpenACC directives
Like the fortran FE, the C++ FE doesn't set the expr_location of the split acc loop in combined acc parallel/kernels loop directives. This only happens with combined directives; otherwise, cp_parser_omp_construct would be responsible for setting the location. After fixing this bug, I was able to resolve a couple of long-standing diagnostic discrepancies between the C and C++ FEs in the test suite. Is this patch OK for trunk? I bootstrapped and regtested using x86_64 with nvptx offloading. Thanks, Cesar 2018-XX-YY Cesar Philippidis gcc/cp/ * parser.c (cp_parser_oacc_kernels_parallel): Adjust EXPR_LOCATION on the combined acc loop. gcc/testsuite/ * c-c++-common/goacc/combined-directives-3.c: New test. * c-c++-common/goacc/loop-2-kernels.c (void K): Adjust test. * c-c++-common/goacc/loop-2-parallel.c (void P): Adjust test. * c-c++-common/goacc/loop-3.c (void p2): Adjust test. (cherry picked from gomp-4_0-branch r245673) diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 90d5d00..52e61fc 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -37183,8 +37183,9 @@ cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok, cp_lexer_consume_token (parser->lexer); tree block = begin_omp_parallel (); tree clauses; - cp_parser_oacc_loop (parser, pragma_tok, p_name, mask, &clauses, - if_p); + tree stmt = cp_parser_oacc_loop (parser, pragma_tok, p_name, mask, + &clauses, if_p); + protected_set_expr_location (stmt, pragma_tok->location); return finish_omp_construct (code, block, clauses); } } diff --git a/gcc/testsuite/c-c++-common/goacc/combined-directives-3.c b/gcc/testsuite/c-c++-common/goacc/combined-directives-3.c new file mode 100644 index 000..77d4182 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/combined-directives-3.c @@ -0,0 +1,24 @@ +/* Verify the accuracy of the line number associated with combined + constructs. 
*/ + +int +main () +{ + int x, y, z; + +#pragma acc parallel loop seq auto /* { dg-error "'seq' overrides other OpenACC loop specifiers" } */ + for (x = 0; x < 10; x++) +#pragma acc loop +for (y = 0; y < 10; y++) + ; + +#pragma acc parallel loop gang auto /* { dg-error "'auto' conflicts with other OpenACC loop specifiers" } */ + for (x = 0; x < 10; x++) +#pragma acc loop worker auto /* { dg-error "'auto' conflicts with other OpenACC loop specifiers" } */ +for (y = 0; y < 10; y++) +#pragma acc loop vector + for (z = 0; z < 10; z++) + ; + + return 0; +} diff --git a/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c b/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c index 01ad32d..3a11ef5f 100644 --- a/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c +++ b/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c @@ -145,8 +145,8 @@ void K(void) #pragma acc kernels loop worker(num:5) for (i = 0; i < 10; i++) { } -#pragma acc kernels loop seq worker // { dg-error "'seq' overrides" "" { target c } } - for (i = 0; i < 10; i++) // { dg-error "'seq' overrides" "" { target c++ } } +#pragma acc kernels loop seq worker // { dg-error "'seq' overrides" } + for (i = 0; i < 10; i++) { } #pragma acc kernels loop gang worker for (i = 0; i < 10; i++) @@ -161,8 +161,8 @@ void K(void) #pragma acc kernels loop vector(length:5) for (i = 0; i < 10; i++) { } -#pragma acc kernels loop seq vector // { dg-error "'seq' overrides" "" { target c } } - for (i = 0; i < 10; i++) // { dg-error "'seq' overrides" "" { target c++ } } +#pragma acc kernels loop seq vector // { dg-error "'seq' overrides" } + for (i = 0; i < 10; i++) { } #pragma acc kernels loop gang vector for (i = 0; i < 10; i++) @@ -174,16 +174,16 @@ void K(void) #pragma acc kernels loop auto for (i = 0; i < 10; i++) { } -#pragma acc kernels loop seq auto // { dg-error "'seq' overrides" "" { target c } } - for (i = 0; i < 10; i++) // { dg-error "'seq' overrides" "" { target c++ } } +#pragma acc kernels loop seq auto // { dg-error "'seq' 
overrides" } + for (i = 0; i < 10; i++) { } -#pragma acc kernels loop gang auto // { dg-error "'auto' conflicts" "" { target c } } - for (i = 0; i < 10; i++) // { dg-error "'auto' conflicts" "" { target c++ } } +#pragma acc kernels loop gang auto // { dg-error "'auto' conflicts" } + for (i = 0; i < 10; i++) { } -#pragma acc kernels loop worker auto // { dg-error "'auto' conflicts" "" { target c } } - for (i = 0; i < 10; i++) // { dg-error "'auto' conflicts" "" { target c++ } } +#prag
[PATCH 1/3] Correct the reported line number in fortran combined OpenACC directives
The fortran FE incorrectly records the line locations of combined acc loop directives when it lowers the construct to gimple. Usually this isn't a problem because the fortran FE is able to report problems with acc loops itself. However, there will be inaccuracies if the ME tries to use those locations. Note that test cases are conspicuously absent from this patch. However, without this bug fix, -fopt-info-note-omp will report bogus line numbers. This code change will be tested by a later patch in this series. Is this OK for trunk? I bootstrapped and regtested it on x86_64 with nvptx offloading. Thanks, Cesar 2018-XX-YY Cesar Philippidis gcc/fortran/ * trans-openmp.c (gfc_trans_oacc_combined_directive): Set the location of combined acc loops. (cherry picked from gomp-4_0-branch r245653) diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c index f038f4c..e7707d0 100644 --- a/gcc/fortran/trans-openmp.c +++ b/gcc/fortran/trans-openmp.c @@ -3869,6 +3869,7 @@ gfc_trans_oacc_combined_directive (gfc_code *code) gfc_omp_clauses construct_clauses, loop_clauses; tree stmt, oacc_clauses = NULL_TREE; enum tree_code construct_code; + location_t loc = input_location; switch (code->op) { @@ -3930,12 +3931,16 @@ gfc_trans_oacc_combined_directive (gfc_code *code) else pushlevel (); stmt = gfc_trans_omp_do (code, EXEC_OACC_LOOP, pblock, &loop_clauses, NULL); + + if (CAN_HAVE_LOCATION_P (stmt)) +SET_EXPR_LOCATION (stmt, loc); + if (TREE_CODE (stmt) != BIND_EXPR) stmt = build3_v (BIND_EXPR, NULL, stmt, poplevel (1, 0)); else poplevel (0, 0); - stmt = build2_loc (input_location, construct_code, void_type_node, stmt, -oacc_clauses); + + stmt = build2_loc (loc, construct_code, void_type_node, stmt, oacc_clauses); gfc_add_expr_to_block (&block, stmt); return gfc_finish_block (&block); } -- 2.7.4
[PATCH 0/3] Add OpenACC diagnostics to -fopt-info-note-omp
This patch series extends -fopt-info-note-omp to include OpenACC loop diagnostics when it is used in conjunction with -fopenacc. At present, the diagnostics are limited to reporting how OpenACC loops are partitioned, e.g., seq, gang, worker or vector. The major advantage of these diagnostics is that they inform the user how GCC automatically partitions independent loops, i.e., acc loops without any parallelism clauses inside acc parallel regions. This information provides the user with insights on how to select num_gangs, num_workers and vector_length for their application. All three patches in this series are independent from one another. Patches 1 and 2 fix diagnostics bugs involving incorrect line numbers. Patch 3 is responsible for generating the actual diagnostics. Cesar
[PATCH] Adjust offsets for present data clauses
This is another old gomp4 patch that corrects a bug where the runtime was passing the wrong offset for subarray data to the accelerator. The original description of this patch can be found here <https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01676.html> I bootstrapped and regtested on x86_64/nvptx. Is it OK for trunk? Thanks, Cesar >From fb743d8a45193c177cb0082400d140949e8c1e6d Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Wed, 24 Aug 2016 00:02:50 + Subject: [PATCH 5/5] [libgomp, OpenACC] Adjust offsets for present data clauses 2018-XX-YY Cesar Philippidis libgomp/ * oacc-parallel.c (GOACC_parallel_keyed): Add offset to devaddrs. * testsuite/libgomp.oacc-c-c++-common/data_offset.c: New test. * testsuite/libgomp.oacc-fortran/data_offset.f90: New test. (cherry picked from gomp-4_0-branch r239723, 00c2585) --- libgomp/oacc-parallel.c | 10 - .../libgomp.oacc-c-c++-common/data_offset.c | 41 ++ .../libgomp.oacc-fortran/data_offset.f90 | 43 +++ 3 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/data_offset.c create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/data_offset.f90 diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c index b80ace58590..20e9ab2e251 100644 --- a/libgomp/oacc-parallel.c +++ b/libgomp/oacc-parallel.c @@ -231,8 +231,14 @@ GOACC_parallel_keyed (int device, void (*fn) (void *), devaddrs = gomp_alloca (sizeof (void *) * mapnum); for (i = 0; i < mapnum; i++) -devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start - + tgt->list[i].key->tgt_offset); +{ + if (tgt->list[i].key != NULL) + devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start ++ tgt->list[i].key->tgt_offset ++ tgt->list[i].offset); + else + devaddrs[i] = NULL; +} acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, async, dims, tgt); diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data_offset.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data_offset.c new file mode 100644 index 
000..ccbbfcab87b --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data_offset.c @@ -0,0 +1,41 @@ +/* Test present data clauses in acc offloaded regions when the + subarray inside the present clause does not have the same base + offset value as the subarray in the enclosing acc data or acc enter + data variable. */ + +#include + +void +offset (int *data, int n) +{ + int i; + +#pragma acc parallel loop present (data[0:n]) + for (i = 0; i < n; i++) +data[i] = n; +} + +int +main () +{ + const int n = 30; + int data[n], i; + + for (i = 0; i < n; i++) +data[i] = -1; + +#pragma acc data copy(data[0:n]) + { +offset (data+10, 10); + } + + for (i = 0; i < n; i++) +{ + if (i < 10 || i >= 20) + assert (data[i] == -1); + else + assert (data[i] == 10); +} + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-fortran/data_offset.f90 b/libgomp/testsuite/libgomp.oacc-fortran/data_offset.f90 new file mode 100644 index 000..ff8ee39f964 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/data_offset.f90 @@ -0,0 +1,43 @@ +! Test present data clauses in acc offloaded regions when the subarray +! inside the present clause does not have the same base offset value +! as the subarray in the enclosing acc data or acc enter data variable. + +program test + implicit none + + integer, parameter :: n = 30, m = 10 + integer :: i + integer, allocatable :: data(:) + logical bounded + + allocate (data(n)) + + data(:) = -1 + + !$acc data copy (data(5:20)) + call test_data (data, n, m) + !$acc end data + + do i = 1, n + bounded = i < m .or. i >= m+m + if (bounded .and. (data(i) /= -1)) then +call abort + else if (.not. bounded .and. data(i) /= 10) then +call abort + end if + end do + + deallocate (data) +end program test + +subroutine test_data (data, n, m) + implicit none + + integer :: n, m, data(n), i + + !$acc parallel loop present (data(m:m)) + do i = m, m+m-1 + data(i) = m + end do + !$acc end parallel loop +end subroutine test_data -- 2.17.1
[PATCH] Enable firstprivate OpenACC reductions
At present, all reduction variables are transferred via an implicit 'copy' clause. As shown in the recent patches I've been posting, that causes a lot of problems when the reduction variables are used by multiple workers or vectors. This patch teaches the gimplifier to transfer reduction variables as firstprivate in OpenACC parallel regions, if they are in an inner loop. This matches the behavior of reductions in OpenACC 2.6. Is this patch OK for trunk? I bootstrapped and regtested on x86_64/nvptx. Thanks, Cesar >From 035be51a795ad8bed5342ba181220bf3102bcd6d Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Wed, 31 Jan 2018 07:21:53 -0800 Subject: [PATCH 4/5] Enable firstprivate OpenACC reductions 2018-XX-YY Cesar Philippidis gcc/ * gimplify.c (omp_add_variable): Allow certain OpenACC reduction variables to remain firstprivate. gcc/testsuite/ * c-c++-common/goacc/reduction-8.c: New test. (cherry picked from openacc-gcc-7-branch commit 441621739e2a067c97409f8b0e3e30362a7905be, cec00212ad8) --- gcc/gimplify.c| 30 -- .../c-c++-common/goacc/reduction-8.c | 94 +++ 2 files changed, 117 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/reduction-8.c diff --git a/gcc/gimplify.c b/gcc/gimplify.c index 737a280cfe9..bcfb029275c 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -6858,9 +6858,16 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) else splay_tree_insert (ctx->variables, (splay_tree_key)decl, flags); - /* For reductions clauses in OpenACC loop directives, by default create a - copy clause on the enclosing parallel construct for carrying back the - results. */ + /* For OpenACC loop directives, when a reduction is immediately + enclosed within an acc parallel or kernels construct, it must + have an implied copy data mapping. E.g. 
+ + #pragma acc parallel + { + #pragma acc loop reduction (+:sum) + + a copy clause for sum should be added on the enclosing parallel + construct for carrying back the results. */ if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION)) { struct gimplify_omp_ctx *outer_ctx = ctx->outer_context; @@ -6876,8 +6883,11 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) vector = true; } - /* Set new copy map as 'private' if sure we're not gang-partitioning. */ - bool map_private; + /* Reduction data maps need to be marked as private for worker + and vector loops, in order to ensure that value of the + reduction carried back to the host. Set new copy map as + 'private' if sure we're not gang-partitioning. */ + bool map_private, update_data_map = false; if (gang) map_private = false; @@ -6886,6 +6896,10 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) else map_private = oacc_privatize_reduction (ctx->outer_context); + if (ctx->outer_context + && ctx->outer_context->region_type == ORT_ACC_PARALLEL) + update_data_map = true; + while (outer_ctx) { n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl); @@ -6902,7 +6916,8 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) gcc_assert (!(n->value & GOVD_FIRSTPRIVATE) && (n->value & GOVD_MAP)); } - else if (outer_ctx->region_type == ORT_ACC_PARALLEL) + else if (update_data_map + && outer_ctx->region_type == ORT_ACC_PARALLEL) { /* Remove firstprivate and make it a copy map. 
*/ n->value &= ~GOVD_FIRSTPRIVATE; @@ -6914,7 +6929,8 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) n->value |= GOVD_MAP_PRIVATE; } } - else if (outer_ctx->region_type == ORT_ACC_PARALLEL) + else if (update_data_map + && outer_ctx->region_type == ORT_ACC_PARALLEL) { unsigned f = GOVD_MAP | GOVD_SEEN; diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-8.c b/gcc/testsuite/c-c++-common/goacc/reduction-8.c new file mode 100644 index 000..8a0283f4ac3 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/reduction-8.c @@ -0,0 +1,94 @@ +/* { dg-additional-options "-fdump-tree-gimple" } */ + +#define n 1000 + +int +main(void) +{ + int i, j; + int result, array[n]; + +#pragma acc parallel loop reduction (+:result) + for (i = 0; i < n; i++) +result ++; + +#pragma acc parallel +#pragma acc loop reduction (+:result) + for (i = 0; i < n; i++) +result ++; + +#pragma acc parallel +#pragma acc loop + for (i = 0; i < n; i++) +{ + result = i; + +#pragma acc loop reduction(+:result) + for (j = 0; j < n; j++) + result ++; + + array[i] = result; +} + +#pragma acc parallel +#pragma acc loop + fo
[PATCH] Privatize independent OpenACC reductions
This is another OpenACC reduction patch to privatize reduction variables used inside inner acc loops. For some reason, I can't find the original email announcement on the gcc-patches mailing list. But according to the ChangeLog, I committed that change to og7 back on Jan 26, 2018. I bootstrapped and regtested on x86_64/nvptx. Is it OK for trunk? Thanks, Cesar >From a4753e2b40cf3d707aabd7c9d5bad7d8f9be8b6f Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Fri, 26 Jan 2018 08:30:13 -0800 Subject: [PATCH 3/5] Privatize independent OpenACC reductions 2018-XX-YY Cesar Philippidis gcc/ * gimplify.c (oacc_privatize_reduction): New function. (omp_add_variable): Use it to determine if a reduction variable needs to be privatized. libgomp/ * testsuite/libgomp.oacc-c-c++-common/inner-reduction.c: New test. (cherry picked from openacc-gcc-7-branch commit 330ba2316fabd0e5525c99fdacedb0bfae270244, 133f3a8fb5c) --- gcc/gimplify.c| 35 ++- .../inner-reduction.c | 23 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/inner-reduction.c diff --git a/gcc/gimplify.c b/gcc/gimplify.c index 7dadf69b758..737a280cfe9 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -6722,6 +6722,32 @@ omp_firstprivatize_type_sizes (struct gimplify_omp_ctx *ctx, tree type) lang_hooks.types.omp_firstprivatize_type_sizes (ctx, type); } +/* Determine if CTX might contain any gang partitioned loops. During + oacc_dev_low, independent loops are assign gangs at the outermost + level, and vectors in the innermost. 
*/ + +static bool +oacc_privatize_reduction (struct gimplify_omp_ctx *ctx) +{ + if (ctx == NULL) +return false; + + if (ctx->region_type != ORT_ACC) +return false; + + for (tree c = ctx->clauses; c; c = OMP_CLAUSE_CHAIN (c)) +switch (OMP_CLAUSE_CODE (c)) + { + case OMP_CLAUSE_SEQ: + return oacc_privatize_reduction (ctx->outer_context); + case OMP_CLAUSE_GANG: + return true; + default:; + } + + return true; +} + /* Add an entry for DECL in the OMP context CTX with FLAGS. */ static void @@ -6851,7 +6877,14 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) } /* Set new copy map as 'private' if sure we're not gang-partitioning. */ - bool map_private = !gang && (worker || vector); + bool map_private; + + if (gang) + map_private = false; + else if (worker || vector) + map_private = true; + else + map_private = oacc_privatize_reduction (ctx->outer_context); while (outer_ctx) { diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/inner-reduction.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/inner-reduction.c new file mode 100644 index 000..0c317dcf8a6 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/inner-reduction.c @@ -0,0 +1,23 @@ +#include + +int +main () +{ + const int n = 1000; + int i, j, temp, a[n]; + +#pragma acc parallel loop + for (i = 0; i < n; i++) +{ + temp = i; +#pragma acc loop reduction (+:temp) + for (j = 0; j < n; j++) + temp ++; + a[i] = temp; +} + + for (i = 0; i < n; i++) +assert (a[i] == i+n); + + return 0; +} -- 2.17.1
[PATCH] Add support for making maps 'private' inside OpenACC offloaded regions
Due to the different levels of parallelism available in OpenACC, it is useful to mark certain variables as GOMP_MAP_PRIVATE so that they can be used in reductions. This patch was introduced in openacc-gcc-7-branch here <https://gcc.gnu.org/ml/gcc-patches/2017-09/msg00274.html>. I bootstrapped and regtested on x86_64/nvptx. Is it OK for trunk? Thanks, Cesar >From b0e7fb09bf3a3f853e77c2712b6f85ad21472e72 Mon Sep 17 00:00:00 2001 From: Chung-Lin Tang Date: Tue, 5 Sep 2017 22:09:34 +0800 Subject: [PATCH 2/5] [OpenACC] Add support for making maps 'private' inside offloaded regions 2018-XX-YY Chung-Lin Tang Cesar Philippidis gcc/ * tree.h (OMP_CLAUSE_MAP_PRIVATE): Define macro. * gimplify.c (enum gimplify_omp_var_data): Add GOVD_MAP_PRIVATE enum value. (omp_add_variable): Add GOVD_MAP_PRIVATE to reduction clause flags if not a gang-partitioned loop directive. (gimplify_adjust_omp_clauses_1): Set OMP_CLAUSE_MAP_PRIVATE of new map clause to 1 if GOVD_MAP_PRIVATE flag is present. * omp-low.c (lower_oacc_reductions): Handle map clauses with OMP_CLAUSE_MAP_PRIVATE set in same matter as firstprivate/private. (lower_omp_target): Likewise. Add copy back code for map clauses with OMP_CLAUSE_MAP_PRIVATE set. libgomp/ * testsuite/libgomp.oacc-c-c++-common/reduction-9.c: New test. (cherry picked from openacc-gcc-7-branch commit 2dc21f336368889c1ebf031801a7613f65899ef1, e17bb2068f9) --- gcc/gimplify.c| 34 ++- gcc/omp-low.c | 28 +++-- gcc/tree.h| 3 ++ .../libgomp.oacc-c-c++-common/reduction-9.c | 41 +++ 4 files changed, 101 insertions(+), 5 deletions(-) create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-9.c diff --git a/gcc/gimplify.c b/gcc/gimplify.c index cf8977c8508..7dadf69b758 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -105,6 +105,9 @@ enum gimplify_omp_var_data /* Flag for GOVD_MAP: must be present already. */ GOVD_MAP_FORCE_PRESENT = 524288, + /* Flag for GOVD_MAP, copy to/from private storage inside offloaded region. 
*/ + GOVD_MAP_PRIVATE = 1048576, + GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR | GOVD_LOCAL) @@ -6835,6 +6838,21 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION)) { struct gimplify_omp_ctx *outer_ctx = ctx->outer_context; + + bool gang = false, worker = false, vector = false; + for (tree c = ctx->clauses; c; c = OMP_CLAUSE_CHAIN (c)) + { + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_GANG) + gang = true; + else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_WORKER) + worker = true; + else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR) + vector = true; + } + + /* Set new copy map as 'private' if sure we're not gang-partitioning. */ + bool map_private = !gang && (worker || vector); + while (outer_ctx) { n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl); @@ -6856,12 +6874,21 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags) /* Remove firstprivate and make it a copy map. */ n->value &= ~GOVD_FIRSTPRIVATE; n->value |= GOVD_MAP; + + /* If not gang-partitioned, add MAP_PRIVATE on the map + clause. */ + if (map_private) + n->value |= GOVD_MAP_PRIVATE; } } else if (outer_ctx->region_type == ORT_ACC_PARALLEL) { - splay_tree_insert (outer_ctx->variables, (splay_tree_key)decl, - GOVD_MAP | GOVD_SEEN); + unsigned f = GOVD_MAP | GOVD_SEEN; + + /* If not gang-partitioned, add MAP_PRIVATE on the map clause. 
*/ + if (map_private) + f |= GOVD_MAP_PRIVATE; + splay_tree_insert (outer_ctx->variables, (splay_tree_key)decl, f); break; } outer_ctx = outer_ctx->outer_context; @@ -8904,6 +8931,9 @@ gimplify_adjust_omp_clauses_1 (splay_tree_node n, void *data) gcc_unreachable (); } OMP_CLAUSE_SET_MAP_KIND (clause, kind); + if ((flags & GOVD_MAP_PRIVATE) + && TREE_CODE (OMP_CLAUSE_DECL (clause)) == VAR_DECL) + OMP_CLAUSE_MAP_PRIVATE (clause) = 1; tree c2 = gomp_needs_data_present (decl); /* Handle OpenACC pointers that were declared inside acc data regions. */ diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 714490d6921..ef3c7651c74 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -4907,7 +4907,9 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner, goto has_outer_reduction; } else if ((OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_FIRSTPRIVATE - || OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_PRIVATE) + || OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_PRIVATE + |
[PATCH] Fix PR70828 - broken array-type subarrays inside acc data, in OpenACC
Attached is an old gomp-4_0-branch patch that fixes PR70828. Besides fixing the PR, it also introduces some changes which will enable the forthcoming nvptx vector length enhancements. More details on the patch can be found here <https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01293.html> I bootstrapped and regtested on x86_64/nvptx. Is it OK for trunk? Thanks, Cesar >From 3a58144cfaca8f6e3a889346e736e68a9ed17e6a Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Thu, 18 Aug 2016 01:12:15 + Subject: [PATCH 1/5] Fix PR70828s "broken array-type subarrays inside acc data in openacc" 2018-XX-YY Cesar Philippidis gcc/ * gimplify.c (struct gimplify_omp_ctx): Add tree clauses member. (new_omp_context): Initialize clauses to NULL_TREE. (gimplify_scan_omp_clauses): Set clauses in the gimplify_omp_ctx. (omp_clause_matching_array_ref): New function. (gomp_needs_data_present): New function. (gimplify_adjust_omp_clauses_1): Use present or pointer omp clause map kinds when creating implicit data clauses for OpenACC offloaded variables defined used an acc data region as necessary. Link ACC new clauses with the old ones. gcc/testsuite/ * c-c++-common/goacc/acc-data-chain.c: New test. libgomp/ * testsuite/libgomp.oacc-c-c++-common/pr70828.c: New test. * testsuite/libgomp.oacc-fortran/pr70828.f90: New test. * testsuite/libgomp.oacc-fortran/lib-13.f90: Remove XFAIL. 
--- gcc/gimplify.c| 101 +- .../c-c++-common/goacc/acc-data-chain.c | 24 + .../libgomp.oacc-c-c++-common/pr70828.c | 25 + .../testsuite/libgomp.oacc-fortran/lib-13.f90 | 1 - .../libgomp.oacc-fortran/pr70828.f90 | 24 + 5 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/acc-data-chain.c create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828.c create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/pr70828.f90 diff --git a/gcc/gimplify.c b/gcc/gimplify.c index 4a109aee27a..cf8977c8508 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -191,6 +191,7 @@ struct gimplify_omp_ctx bool target_map_scalars_firstprivate; bool target_map_pointers_as_0len_arrays; bool target_firstprivatize_array_bases; + tree clauses; }; static struct gimplify_ctx *gimplify_ctxp; @@ -409,6 +410,7 @@ new_omp_context (enum omp_region_type region_type) c->privatized_types = new hash_set; c->location = input_location; c->region_type = region_type; + c->clauses = NULL_TREE; if ((region_type & ORT_TASK) == 0) c->default_kind = OMP_CLAUSE_DEFAULT_SHARED; else @@ -7501,6 +7503,7 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p, tree *prev_list_p = NULL; ctx = new_omp_context (region_type); + ctx->clauses = *list_p; outer_ctx = ctx->outer_context; if (code == OMP_TARGET) { @@ -8696,6 +8699,58 @@ struct gimplify_adjust_omp_clauses_data gimple_seq *pre_p; }; +/* Return true if clause contains an array_ref of DECL. */ + +static bool +omp_clause_matching_array_ref (tree clause, tree decl) +{ + tree cdecl = OMP_CLAUSE_DECL (clause); + + if (TREE_CODE (cdecl) != ARRAY_REF) +return false; + + return TREE_OPERAND (cdecl, 0) == decl; +} + +/* Inside OpenACC parallel and kernels regions, the implicit data + clauses for arrays must respect the explicit data clauses set by a + containing acc data region. 
Specifically, care must be taken + pointers or if an subarray of a local array is specified in an acc + data region, so that the referenced array inside the offloaded + region has a present data clasue for that array with an + approporiate subarray argument. This function returns the tree + node of the acc data clause that utilizes DECL as an argument. */ + +static tree +gomp_needs_data_present (tree decl) +{ + gimplify_omp_ctx *ctx = NULL; + bool found_match = false; + tree c = NULL_TREE; + + if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE) +return NULL_TREE; + + if (gimplify_omp_ctxp->region_type != ORT_ACC_PARALLEL + && gimplify_omp_ctxp->region_type != ORT_ACC_KERNELS) +return NULL_TREE; + + for (ctx = gimplify_omp_ctxp->outer_context; !found_match && ctx; + ctx = ctx->outer_context) +{ + if (ctx->region_type != ORT_ACC_DATA) + break; + + for (c = ctx->clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP + && (omp_clause_matching_array_ref (c, decl) + || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_POINTER)) + return c; +} + + return NULL_TREE; +} + /* For all variables that were not actually used within the context, remove PRIVATE, SHARED, and FIRSTPRIVATE clauses. */ @@ -8849,7 +8904,51 @@ gimplify_adjust_omp_clauses_1 (splay_tree_node n, void *data) gcc_unreachable (); } OMP_CLAUSE_SET_MAP_KIND (clause, kind); - if (DECL_SIZE (de
Re: [patch] adjust default nvptx launch geometry for OpenACC offloaded regions
On 07/02/2018 07:14 AM, Tom de Vries wrote: > On 06/21/2018 03:58 PM, Cesar Philippidis wrote: >> On 06/20/2018 03:15 PM, Tom de Vries wrote: >>> On 06/20/2018 11:59 PM, Cesar Philippidis wrote: >>>> Now it follows the formula contained in >>>> the "CUDA Occupancy Calculator" spreadsheet that's distributed with CUDA. >>> >>> Any reason we're not using the cuda runtime functions to get the >>> occupancy (see PR85590 - [nvptx, libgomp, openacc] Use cuda runtime fns >>> to determine launch configuration in nvptx ) ? >> >> There are two reasons: >> >> 1) cuda_occupancy.h depends on the CUDA runtime to extract the device >> properties instead of the CUDA driver API. However, we can always >> teach libgomp how to populate the cudaDeviceProp struct using the >> driver API. >> >> 2) CUDA is not always present on the build host, and that's why >> libgomp maintains its own cuda.h. So at the very least, this >> functionality would be good to have in libgomp as a fallback >> implementation; > > Libgomp maintains its own cuda.h to "allow building GCC with PTX > offloading even without CUDA being installed" ( > https://gcc.gnu.org/ml/gcc-patches/2017-01/msg00980.html ). > > The libgomp nvptx plugin however uses the cuda driver API to launch > kernels etc, so we can assume that's always available at launch time. > And according to the "CUDA Pro Tip: Occupancy API Simplifies Launch > Configuration", the occupancy API is also available in the driver API. > > What we cannot assume to be available is the occupancy API pre cuda-6.5. > So it's fine to have a fallback for that (properly isolated in utility > functions), but for cuda 6.5 and up we want to use the occupancy API. Here's revision 2 to the patch. I replaced all of my thread occupancy heuristics with calls to the CUDA driver as you suggested. 
The performance is worse than my heuristics, but that's to be expected because the CUDA driver only guarantees the minimal launch geometry to fully utilize the hardware, and not the optimal value. I'll reintroduce my heuristics later as a follow up patch. The major advantage of the CUDA thread occupancy calculator is that it allows the runtime to select sensible default num_workers to avoid those annoying runtime failures due to insufficient GPU hardware resources. One thing that may stick out in this patch is how it probes for the driver version instead of the API version. It turns out that the API version corresponds to the SM version declared in the PTX sources, whereas the driver version corresponds to the latest version of CUDA supported by the driver. At least that's the case with driver version 396.24. >> its not good to have program fail due to >> insufficient hardware resources errors when it is avoidable. >> > > Right, in fact there are two separate things you're trying to address > here: launch failure and occupancy heuristic, so split the patch. That hunk was small, so I included it with this patch. Although if you insist, I can remove it. Is this patch OK for trunk? I tested it on x86_64 with nvptx offloading. Cesar 2018-07-XX Cesar Philippidis Tom de Vries gcc/ * config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Rename to ... (PTX_DEFAULT_RUNTIME_DIM): ... this. (nvptx_goacc_validate_dims): Set default worker and gang dims to PTX_DEFAULT_RUNTIME_DIM. (nvptx_dim_limit): Ignore GOMP_DIM_WORKER; libgomp/ * plugin/cuda/cuda.h (CUoccupancyB2DSize): Declare. (cuOccupancyMaxPotentialBlockSizeWithFlags): Likewise. * plugin/plugin-nvptx.c (struct ptx_device): Add driver_version member. (nvptx_open_device): Set it. (nvptx_exec): Use the CUDA driver to both determine default num_gangs and num_workers, and error if the hardware doesn't have sufficient resources to launch a kernel. 
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 5608bee8a8d..c1946e75f42 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -5165,7 +5165,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), /* Define dimension sizes for known hardware. */ #define PTX_VECTOR_LENGTH 32 #define PTX_WORKER_LENGTH 32 -#define PTX_GANG_DEFAULT 0 /* Defer to runtime. */ +#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */ /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ @@ -5214,9 +5214,9 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level) { dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; if (dims[GOMP_DIM_WORKER] < 0) - dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH; + dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM; if (dims[GOMP_DIM_GANG] < 0) - dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT; + dims[GOMP_DIM_GANG] = P
Re: [patch] adjust default nvptx launch geometry for OpenACC offloaded regions
On 07/02/2018 07:14 AM, Tom de Vries wrote: > On 06/21/2018 03:58 PM, Cesar Philippidis wrote: >> On 06/20/2018 03:15 PM, Tom de Vries wrote: >>> On 06/20/2018 11:59 PM, Cesar Philippidis wrote: >>>> Now it follows the formula contained in >>>> the "CUDA Occupancy Calculator" spreadsheet that's distributed with CUDA. >>> >>> Any reason we're not using the cuda runtime functions to get the >>> occupancy (see PR85590 - [nvptx, libgomp, openacc] Use cuda runtime fns >>> to determine launch configuration in nvptx ) ? >> >> There are two reasons: >> >> 1) cuda_occupancy.h depends on the CUDA runtime to extract the device >> properties instead of the CUDA driver API. However, we can always >> teach libgomp how to populate the cudaDeviceProp struct using the >> driver API. >> >> 2) CUDA is not always present on the build host, and that's why >> libgomp maintains its own cuda.h. So at the very least, this >> functionality would be good to have in libgomp as a fallback >> implementation; > > Libgomp maintains its own cuda.h to "allow building GCC with PTX > offloading even without CUDA being installed" ( > https://gcc.gnu.org/ml/gcc-patches/2017-01/msg00980.html ). > > The libgomp nvptx plugin however uses the cuda driver API to launch > kernels etc, so we can assume that's always available at launch time. > And according to the "CUDA Pro Tip: Occupancy API Simplifies Launch > Configuration", the occupancy API is also available in the driver API. Thanks for the info. I was not aware that the CUDA driver API had a thread occupancy calculator (it's described in section 4.18). > What we cannot assume to be available is the occupancy API pre cuda-6.5. > So it's fine to have a fallback for that (properly isolated in utility > functions), but for cuda 6.5 and up we want to use the occupancy API. That seems reasonable. I'll run some experiments with that. 
In the meantime, would it be OK to make this fallback the default, then add support for the driver occupancy calculator as a follow up? >> its not good to have program fail due to >> insufficient hardware resources errors when it is avoidable. >> > > Right, in fact there are two separate things you're trying to address > here: launch failure and occupancy heuristic, so split the patch. ACK. I'll split those changes into separate patches. By the way, do you have any preferences on how to break up the nvptx vector length changes for trunk submission? I was planning on breaking it down into four components - generic ME changes, tests, nvptx reductions and the rest. Those two nvptx components are large, so I'll probably break them down into smaller patches, but I'm not sure if it's worthwhile to make them independent from one another with the use of a lot of stub functions. Cesar
Re: [patch] adjust default nvptx launch geometry for OpenACC offloaded regions
On 06/29/2018 10:12 AM, Cesar Philippidis wrote: > Ping. While porting the vector length patches to trunk, I realized that I mistakenly removed support for the environment variable GOMP_OPENACC_DIM in this patch (thanks for adding those test cases, Tom!). I'll post an updated version of this patch once I get the vector length patches working with it. Cesar > On 06/20/2018 02:59 PM, Cesar Philippidis wrote: >> At present, the nvptx libgomp plugin does not take into account the >> amount of shared resources on GPUs (mostly shared-memory are register >> usage) when selecting the default num_gangs and num_workers. In certain >> situations, an OpenACC offloaded function can fail to launch if the GPU >> does not have sufficient shared resources to accommodate all of the >> threads in a CUDA block. This typically manifests when a PTX function >> uses a lot of registers and num_workers is set too large, although it >> can also happen if the shared-memory has been exhausted by the threads >> in a vector. >> >> This patch resolves that issue by adjusting num_workers based the amount >> of shared resources used by each threads. If worker parallelism has been >> requested, libgomp will spawn as many workers as possible up to 32. >> Without this patch, libgomp would always default to launching 32 workers >> when worker parallelism is used. >> >> Besides for the worker parallelism, this patch also includes some >> heuristics on selecting num_gangs. Before, the plugin would launch two >> gangs per GPU multiprocessor. Now it follows the formula contained in >> the "CUDA Occupancy Calculator" spreadsheet that's distributed with CUDA. >> >> Is this patch OK for trunk? >> >> Thanks, >> Cesar >> >
[patch] Add OpenACC Fortran support for deviceptr and variable in common blocks
The attached patch adds Fortran support for OpenACC deviceptr and the use of common block variables in data clauses (both implicit and explicit). This patch also relaxes the Fortran parser to not error on certain types of integral expressions and assumed-sized arrays. With respect to those errors, I removed them because a lot of working applications do not explicitly use type attributes (like contiguous). Perhaps it would be better to reduce them to a warning. Any thoughts on that? My argument for their removal is that, while the standard states that, say, arrays must be contiguous or bad things will happen, it does not necessarily mandate that the compiler enforce it. I.e., the intent is to set the user's expectation that things will go bad if garbage input is fed to the accelerator. If necessary, I can push back on the OpenACC standards committee on these issues, but don't expect a quick resolution. In hindsight, I probably should have kept the error relaxation patches separate. This patch includes the following patches from og8: * (dd8b75a) [OpenACC] Update deviceptr handling * (634727d) [OpenACC] Handle Fortran deviceptr clause * (d50862a) [Fortran] Remove pointer check in check_array_not_assumed * (0793cef) [OpenACC] add support for fortran common blocks * (bdc1acc) [Fortran] update gfortran's tile clause error handling * (5dc4968) Fix PR72715 "ICE in gfc_trans_omp_do, at fortran/trans-openmp.c:3164" Is this patch OK for trunk? It bootstrapped / regression tested cleanly for x86_64 with nvptx offloading. Thanks, Cesar 2018-06-29 Cesar Philippidis James Norris gcc/fortran/ * openmp.c (gfc_match_omp_map_clause): Re-write handling of the deviceptr clause. Add new common_blocks argument. Propagate it to gfc_match_omp_variable_list. (gfc_match_omp_clauses): Update calls to gfc_match_omp_map_clauses. (resolve_positive_int_expr): Promote the warning to an error. (check_array_not_assumed): Remove pointer check. 
(resolve_oacc_nested_loops): Error on do concurrent loops. * trans-openmp.c (gfc_omp_finish_clause): Don't create pointer data mappings for deviceptr clauses. (gfc_trans_omp_clauses): Likewise. gcc/ * gimplify.c (enum gimplify_omp_var_data): Add GOVD_DEVICEPTR. (oacc_default_clause): Privatize fortran common blocks. (omp_notice_variable): Add GOVD_DEVICEPTR attribute when appropriate. Defer the expansion of DECL_VALUE_EXPR for common block decls. (gimplify_scan_omp_clauses): Add GOVD_DEVICEPTR attribute when appropriate. (gimplify_adjust_omp_clauses_1): Set GOMP_MAP_FORCE_DEVICEPTR for implicit deviceptr mappings. gcc/testsuite/ * c-c++-common/goacc/deviceptr-4.c: Update. * gfortran.dg/goacc/common-block-1.f90: New test. * gfortran.dg/goacc/common-block-2.f90: New test. * gfortran.dg/goacc/loop-2.f95: Update. * gfortran.dg/goacc/loop-3-2.f95: Update. * gfortran.dg/goacc/loop-3.f95: Update. * gfortran.dg/goacc/loop-5.f95: Update. * gfortran.dg/goacc/pr72715.f90: New test. * gfortran.dg/goacc/sie.f95: Update. * gfortran.dg/goacc/tile-1.f90: Update. * gfortran.dg/gomp/pr77516.f90: Update. libgomp/ * oacc-parallel.c (GOACC_parallel_keyed): Handle Fortran deviceptr clause. (GOACC_data_start): Likewise. * testsuite/libgomp.oacc-fortran/common-block-1.f90: New test. * testsuite/libgomp.oacc-fortran/common-block-2.f90: New test. * testsuite/libgomp.oacc-fortran/common-block-3.f90: New test. * testsuite/libgomp.oacc-fortran/deviceptr-1.f90: New test. 
>From 09c1aa87d9a7db2e08384bb47c80b4a61d218a99 Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Mon, 25 Jun 2018 13:10:13 -0700 Subject: [PATCH] fortran deviceptr dd8b75 [OpenACC] Update deviceptr handling 634727 [OpenACC] Handle Fortran deviceptr clause 0793ce [OpenACC] add support for fortran common blocks bdc1ac [Fortran] update gfortran's tile clause error handling d50862 [Fortran] Remove pointer check in check_array_not_assumed 5dc496 Fix PR72715 "ICE in gfc_trans_omp_do, at fortran/trans-openmp.c:3164" --- gcc/fortran/openmp.c | 57 ++--- gcc/fortran/trans-openmp.c| 9 + gcc/gimplify.c| 35 +++- .../c-c++-common/goacc/deviceptr-4.c | 2 +- .../gfortran.dg/goacc/common-block-1.f90 | 69 ++ .../gfortran.dg/goacc/common-block-2.f90 | 49 + gcc/testsuite/gfortran.dg/goacc/loop-2.f95| 8 +- gcc/testsuite/gfortran.dg/goacc/loop-3-2.f95 | 4 +- gcc/testsuite/gfortran.dg/goacc/loop-3.f95| 4 +- gcc/testsuite/gfortran.dg/goacc/loop-5.f95| 12 -- gcc/testsuite/gfortran.dg/goacc/pr72715.f90 | 6 + gcc/testsuite/gfortran.dg/goacc/sie.f95 | 36 ++-- gcc/testsuite/gfortran.dg/goacc/tile-1.f90| 16 +- gcc/testsuite/gfortran.dg/gomp/pr77516.f90| 2 +- libgomp/oacc-parallel.c | 11 +- .../libgomp.oacc-fortran/common-block-1.f90 | 105 ++ .../libgomp.oacc-fortran/com
Re: [patch] various OpenACC reduction enhancements - test cases
Attached are the updated reduction test cases. Again, these have been bootstrapped and regression tested cleanly for x86_64 with nvptx offloading. Is it OK for trunk? Thanks, Cesar 2018-06-29 Cesar Philippidis Nathan Sidwell gcc/testsuite/ * c-c++-common/goacc/orphan-reductions-1.c: New test. * c-c++-common/goacc/reduction-7.c: New test. * c-c++-common/goacc/routine-4.c: Update. * g++.dg/goacc/reductions-1.C: New test. * gcc.dg/goacc/loop-processing-1.c: Update. * gfortran.dg/goacc/orphan-reductions-1.f90: New test. libgomp/ * libgomp.oacc-c-c++-common/par-reduction-3.c: New test. * libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c: New test. * libgomp.oacc-fortran/reduction-9.f90: New test. From b128e80be7cd2c81171fbd9c8b23e786bb832633 Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Thu, 21 Jun 2018 11:37:56 -0700 Subject: [PATCH] Trunk reductions patches OG8 Reduction patches 4469fc4 [Fortran] Permit reductions in gfc_omp_clause_copy_ctor 704f1a2 [nxptx, OpenACC] vector reductions 8a35c89 [OpenACC] Fix a reduction bug involving GOMP_MAP_FIRSTPRIVATE_POINTER variables 16ead33 [OpenACC] Update error messages for c and c++ reductions 65dd9cf Make OpenACC orphan gang reductions errors 5d60102 [PR80547] Handle parallel reductions explicitly initialized by the user --- gcc/c/c-parser.c | 46 +- gcc/c/c-typeck.c | 8 + gcc/config/nvptx/nvptx.c | 233 +++- gcc/config/nvptx/nvptx.md | 7 + gcc/cp/parser.c | 27 +- gcc/cp/semantics.c| 8 + gcc/fortran/openmp.c | 12 + gcc/fortran/trans-openmp.c| 3 +- gcc/omp-general.h | 5 +- gcc/omp-low.c | 33 +- gcc/omp-offload.c | 18 + .../c-c++-common/goacc/orphan-reductions-1.c | 56 ++ .../c-c++-common/goacc/reduction-7.c | 111 gcc/testsuite/c-c++-common/goacc/routine-4.c | 8 +- gcc/testsuite/g++.dg/goacc/reductions-1.C | 548 ++ .../gcc.dg/goacc/loop-processing-1.c | 3 +- .../gfortran.dg/goacc/orphan-reductions-1.f90 | 204 +++ .../par-reduction-3.c | 29 + .../reduction-cplx-flt-2.c| 32 + .../libgomp.oacc-fortran/reduction-9.f90 | 54 
++ 20 files changed, 1396 insertions(+), 49 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c create mode 100644 gcc/testsuite/c-c++-common/goacc/reduction-7.c create mode 100644 gcc/testsuite/g++.dg/goacc/reductions-1.C create mode 100644 gcc/testsuite/gfortran.dg/goacc/orphan-reductions-1.f90 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/reduction-9.f90 diff --git a/gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c b/gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c new file mode 100644 index 000..b0bd4a7de05 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c @@ -0,0 +1,56 @@ +/* Test orphan reductions. */ + +#include + +#pragma acc routine seq +int +seq_reduction (int n) +{ + int i, sum = 0; +#pragma acc loop seq reduction(+:sum) + for (i = 0; i < n; i++) +sum = sum + 1; + + return sum; +} + +#pragma acc routine gang +int +gang_reduction (int n) +{ + int i, s1 = 0, s2 = 0; +#pragma acc loop gang reduction(+:s1) /* { dg-error "gang reduction on an orphan loop" } */ + for (i = 0; i < n; i++) +s1 = s1 + 2; + +#pragma acc loop gang reduction(+:s2) /* { dg-error "gang reduction on an orphan loop" } */ + for (i = 0; i < n; i++) +s2 = s2 + 2; + + + return s1 + s2; +} + +#pragma acc routine worker +int +worker_reduction (int n) +{ + int i, sum = 0; +#pragma acc loop worker reduction(+:sum) + for (i = 0; i < n; i++) +sum = sum + 3; + + return sum; +} + +#pragma acc routine vector +int +vector_reduction (int n) +{ + int i, sum = 0; +#pragma acc loop vector reduction(+:sum) + for (i = 0; i < n; i++) +sum = sum + 4; + + return sum; +} diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-7.c b/gcc/testsuite/c-c++-common/goacc/reduction-7.c new file mode 100644 index 000..245c848d509 --- /dev/null +++ 
b/gcc/testsuite/c-c++-common/goacc/reduction-7.c @@ -0,0 +1,111 @@ +/* Exercise invalid reductions on array and struct members. */ + +void +test_parallel () +{ + struct { +int a; +float b[5]; + } s1, s2[10]; + + int i; + double z[100]; + +#pragma acc parallel reduction(+:s1.a) /* { dg-error "invalid reduction variable" } */ + for (i = 0; i < 10; i++) +s1.a += 1; + +#pragma acc parallel reduction(+:s1.b[3]) /* { dg-error "inva
Re: [patch] various OpenACC reduction enhancements - FE changes
Attached are the FE changes for the OpenACC reduction enhancements. It depends on the ME patch. Is this patch OK for trunk? It bootstrapped / regression tested cleanly for x86_64 with nvptx offloading. Thanks, Cesar 2018-06-29 Cesar Philippidis Nathan Sidwell gcc/c/ * c-parser.c (c_parser_omp_variable_list): New c_omp_region_type argument. Use it to specialize handling of OMP_CLAUSE_REDUCTION for OpenACC. (c_parser_omp_clause_reduction): Update call to c_parser_omp_variable_list. Propagate OpenACC errors as necessary. (c_parser_oacc_all_clauses): Update call to c_parser_omp_clause_reduction. (c_parser_omp_all_clauses): Likewise. * c-typeck.c (c_finish_omp_clauses): Emit an error on orphan OpenACC gang reductions. gcc/cp/ * parser.c (cp_parser_omp_var_list_no_open): New c_omp_region_type argument. Use it to specialize handling of OMP_CLAUSE_REDUCTION for OpenACC. (cp_parser_omp_clause_reduction): Update call to cp_parser_omp_variable_list. Propagate OpenACC errors as necessary. (cp_parser_oacc_all_clauses): Update call to cp_parser_omp_clause_reduction. (cp_parser_omp_all_clauses): Likewise. * semantics.c (finish_omp_clauses): Emit an error on orphan OpenACC gang reductions. gcc/fortran/ * openmp.c (resolve_oacc_loop_blocks): Emit an error on orphan OpenACC gang reductions. * trans-openmp.c (gfc_omp_clause_copy_ctor): Permit reductions. --- diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 7a926285f3a..a6f453dae54 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -965,12 +965,13 @@ class token_pair /* Like token_pair::require_close, except that tokens will be skipped until the desired token is found. An error message is still produced - if the next token is not as expected. */ + if the next token is not as expected, unless QUIET is set. 
*/ - void skip_until_found_close (c_parser *parser) const + void skip_until_found_close (c_parser *parser, bool quiet = false) const { c_parser_skip_until_found (parser, traits_t::close_token_type, - traits_t::close_gmsgid, m_open_loc); + quiet ? NULL : traits_t::close_gmsgid, + m_open_loc); } private: @@ -11498,7 +11499,8 @@ c_parser_oacc_wait_list (c_parser *parser, location_t clause_loc, tree list) static tree c_parser_omp_variable_list (c_parser *parser, location_t clause_loc, - enum omp_clause_code kind, tree list) + enum omp_clause_code kind, tree list, + enum c_omp_region_type ort = C_ORT_OMP) { if (c_parser_next_token_is_not (parser, CPP_NAME) || c_parser_peek_token (parser)->id_kind != C_ID_ID) @@ -11557,6 +11559,22 @@ c_parser_omp_variable_list (c_parser *parser, /* FALLTHROUGH */ case OMP_CLAUSE_DEPEND: case OMP_CLAUSE_REDUCTION: + if (kind == OMP_CLAUSE_REDUCTION && ort == C_ORT_ACC) + { + switch (c_parser_peek_token (parser)->type) + { + case CPP_OPEN_PAREN: + case CPP_OPEN_SQUARE: + case CPP_DOT: + case CPP_DEREF: + error ("invalid reduction variable"); + t = error_mark_node; + default:; + break; + } + if (t == error_mark_node) + break; + } while (c_parser_next_token_is (parser, CPP_OPEN_SQUARE)) { tree low_bound = NULL_TREE, length = NULL_TREE; @@ -12789,9 +12807,12 @@ c_parser_omp_clause_private (c_parser *parser, tree list) identifier */ static tree -c_parser_omp_clause_reduction (c_parser *parser, tree list) +c_parser_omp_clause_reduction (c_parser *parser, tree list, + enum c_omp_region_type ort) { location_t clause_loc = c_parser_peek_token (parser)->location; + bool seen_error = false; + matching_parens parens; if (parens.require_open (parser)) { @@ -12855,7 +12876,13 @@ c_parser_omp_clause_reduction (c_parser *parser, tree list) tree nl, c; nl = c_parser_omp_variable_list (parser, clause_loc, - OMP_CLAUSE_REDUCTION, list); + OMP_CLAUSE_REDUCTION, list, ort); + if (c_parser_peek_token (parser)->type != CPP_CLOSE_PAREN) + { + seen_error = true; 
+ goto cleanup; + } + for (c = nl; c != list; c = OMP_CLAUSE_CHAIN (c)) { tree d = OMP_CLAUSE_DECL (c), type; @@ -12891,7 +12918,8 @@ c_parser_omp_clause_reduction (c_parser *parser, tree list) list = nl; } - parens.skip_until_found_close (parser); +cleanup: + parens.skip_until_found_close (parser, seen_error); } return list; } @@ -13998,7 +14026,7 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, c_name = "private"; break; case PRAGMA_OACC_CLAUSE_REDUCTION: - clauses = c_parser_omp_clause_reduction (parser, clauses); + clauses = c_parser_omp_clause_reduction (parser, clauses, C_ORT_ACC); c_name = "reduction"; break; case PRAGMA_OACC_CLAUSE_SEQ: @@ -14157,7 +14185,7 @@ c_parser_omp_all_clauses (c_parser *parser, omp_clause_ma
Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes
The attached patch includes the nvptx and GCC ME reductions enhancements. Is this patch OK for trunk? It bootstrapped / regression tested cleanly for x86_64 with nvptx offloading. Thanks, Cesar 2018-06-29 Cesar Philippidis Nathan Sidwell gcc/ * config/nvptx/nvptx.c (nvptx_propagate_unified): New. (nvptx_split_blocks): Call it for cond_uni insn. (nvptx_expand_cond_uni): New. (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI. (nvptx_init_builtins): Initialize it. (nvptx_expand_builtin): (nvptx_generate_vector_shuffle): Change integral SHIFT operand to tree BITS operand. (nvptx_vector_reduction): New. (nvptx_adjust_reduction_type): New. (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res. (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist. (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector. Use it to adjust the type of ref_to_res. (nvptx_goacc_reduction_teardown): * config/nvptx/nvptx.md (cond_uni): New pattern. * omp-general.h (enum oacc_loop_flags): Add OLF_REDUCTION enum. * omp-low.c (lower_oacc_reductions): Handle reduction decls mapped with GOMP_MAP_FIRSTPRIVATE_POINTER. (lower_oacc_head_mark): Use OLF_REDUCTION to mark OpenACC reductions. * omp-offload.c (oacc_loop_auto_partitions): Don't assign gang level parallelism to orphan reductions. (default_goacc_reduction): Retype ref_to_res as necessary. --- diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 5608bee8a8d..33ec3db1153 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt () } } +/* UNIFIED is a cond_uni insn. Find the branch insn it affects, and + mark that as unified. We expect to be in a single block. */ + +static void +nvptx_propagate_unified (rtx_insn *unified) +{ + rtx_insn *probe = unified; + rtx cond_reg = SET_DEST (PATTERN (unified)); + rtx pat = NULL_RTX; + + /* Find the comparison. 
(We could skip this and simply scan to he + blocks' terminating branch, if we didn't care for self + checking.) */ + for (;;) +{ + probe = next_real_insn (probe); + if (!probe) + break; + pat = PATTERN (probe); + + if (GET_CODE (pat) == SET + && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE + && XEXP (SET_SRC (pat), 0) == cond_reg) + break; + gcc_assert (NONJUMP_INSN_P (probe)); +} + gcc_assert (pat); + rtx pred_reg = SET_DEST (pat); + + /* Find the branch. */ + do +probe = NEXT_INSN (probe); + while (!JUMP_P (probe)); + + pat = PATTERN (probe); + rtx itec = XEXP (SET_SRC (pat), 0); + gcc_assert (XEXP (itec, 0) == pred_reg); + + /* Mark the branch's condition as unified. */ + rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg), + UNSPEC_BR_UNIFIED); + bool ok = validate_change (probe, (itec, 0), unspec, false); + + gcc_assert (ok); +} + /* Loop structure of the function. The entire function is described as a NULL loop. */ @@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map) continue; switch (recog_memoized (insn)) { + case CODE_FOR_cond_uni: + nvptx_propagate_unified (insn); + /* FALLTHROUGH */ default: seen_insn = true; continue; @@ -5080,6 +5129,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target, return target; } +/* Expander for the compare unified builtin. */ + +static rtx +nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore) +{ + if (ignore) +return target; + + rtx src = expand_expr (CALL_EXPR_ARG (exp, 0), + NULL_RTX, mode, EXPAND_NORMAL); + + emit_insn (gen_cond_uni (target, src)); + + return target; +} /* Codes for all the NVPTX builtins. 
*/ enum nvptx_builtins @@ -5089,6 +5153,7 @@ enum nvptx_builtins NVPTX_BUILTIN_WORKER_ADDR, NVPTX_BUILTIN_CMP_SWAP, NVPTX_BUILTIN_CMP_SWAPLL, + NVPTX_BUILTIN_COND_UNI, NVPTX_BUILTIN_MAX }; @@ -5126,6 +5191,7 @@ nvptx_init_builtins (void) (PTRVOID, ST, UINT, UINT, NULL_TREE)); DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); + DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE)); #undef DEF #undef ST @@ -5158,6 +5224,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), case NVPTX_BUILTIN_CMP_SWAPLL: return nvptx_expand_cmp_swap (exp, target, mode, ignore); +case NVPTX_BUILTIN_COND_UNI: + return nvptx_expand_cond_uni (exp, target, mode, ignore); + default: gcc_unreachable (); } } @@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset) static void nvptx_generate_vector_shuffle (location_t loc, - tree dest_var, tree var, unsigned shift, + tree dest_var, tree var, tree bits, gimple_seq *seq) { unsigned