On Mon, Dec 07, 2015 at 12:22:43PM +0100, Martin Jambor wrote:
> it creates a copy of the entire target body and expands it slightly
> differently for concurrent execution on a GPU. Note that both teams
> and distribute constructs are mandatory. Moreover, currently the
> distribute has to be in a combined statement with the inner for
> construct. And there are quite a few other restrictions which I hope
The standard calls those composite constructs, and I bet for gridification
you want that restriction always, without composite distribute parallel for
there are two different unrelated loops.
> * builtin-types.def (BT_FN_VOID_UINT_PTR_INT_PTR): New.
> (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT): Removed.
> (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR): New.
> * fortran/types.def (BT_FN_VOID_UINT_PTR_INT_PTR): New.
> (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT): Removed.
> (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR): New.
Fortran has its own ChangeLog file.
> @@ -556,9 +558,9 @@ DEF_FUNCTION_TYPE_9
> (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_BOOL_UINT_PTR_INT,
> BT_PTR_FN_VOID_PTR_PTR, BT_LONG, BT_LONG,
> BT_BOOL, BT_UINT, BT_PTR, BT_INT)
>
> -DEF_FUNCTION_TYPE_10 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT,
> - BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR,
> - BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_INT, BT_INT)
> +DEF_FUNCTION_TYPE_9 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR,
> + BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR,
> + BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_PTR)
There shouldn't be an empty line in between this DEF_FUNCTION_TYPE_9 and the
previous one.
> @@ -221,9 +223,9 @@ DEF_FUNCTION_TYPE_9
> (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_BOOL_UINT_PTR_INT,
> BT_PTR_FN_VOID_PTR_PTR, BT_LONG, BT_LONG,
> BT_BOOL, BT_UINT, BT_PTR, BT_INT)
>
> -DEF_FUNCTION_TYPE_10 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT,
> +DEF_FUNCTION_TYPE_9 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR,
> BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR,
> - BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_INT, BT_INT)
> + BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_PTR)
>
> DEF_FUNCTION_TYPE_11
> (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_UINT_LONG_INT_LONG_LONG_LONG,
> BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR,
Ditto.
> --- a/gcc/gimple.def
> +++ b/gcc/gimple.def
> @@ -369,13 +369,17 @@ DEFGSCODE(GIMPLE_OMP_TARGET, "gimple_omp_target",
> GSS_OMP_PARALLEL_LAYOUT)
> /* GIMPLE_OMP_TEAMS <BODY, CLAUSES> represents #pragma omp teams
> BODY is the sequence of statements inside the single section.
> CLAUSES is an OMP_CLAUSE chain holding the associated clauses. */
> -DEFGSCODE(GIMPLE_OMP_TEAMS, "gimple_omp_teams", GSS_OMP_SINGLE_LAYOUT)
> +DEFGSCODE(GIMPLE_OMP_TEAMS, "gimple_omp_teams", GSS_OMP_TEAMS_LAYOUT)
Why?
> +/* GIMPLE_OMP_GPUKERNEL <BODY> represents a parallel loop lowered for
> execution
> + on a GPU. It is an artificial statement created by omp lowering. */
> +DEFGSCODE(GIMPLE_OMP_GPUKERNEL, "gimple_omp_gpukernel", GSS_OMP)
Why do you call it GPUKERNEL or KERNEL_BODY when you really mean gridified
body and gridified loop? I mean, what is GPU specific about it? PTX is
unlikely going to use that. And kernel is a wide term.
> @@ -622,8 +623,14 @@ struct GTY((tag("GSS_OMP_FOR")))
> /* [ WORD 11 ]
> Pre-body evaluated before the loop body begins. */
> gimple_seq pre_body;
> +
> + /* [ WORD 12 ]
> + If set, this statement is part of a gridified kernel, its clauses need
> to
> + be scanned and lowered but the statement should be discarded after
> + lowering. */
> + bool kernel_phony;
Ugh no, flags should go into GF_OMP_*.
> @@ -643,6 +660,12 @@ struct GTY((tag("GSS_OMP_PARALLEL_LAYOUT")))
> /* [ WORD 10 ]
> Shared data argument. */
> tree data_arg;
> +
> + /* [ WORD 11 ] */
> + /* If set, this statement is part of a gridified kernel, its clauses need
> to
> + be scanned and lowered but the statement should be discarded after
> + lowering. */
> + bool kernel_phony;
> };
Likewise.
As for omp-low.c changes, the file is already large enough that it would be
nice if it is easy to find out what routines are for gridification purposes
only, use some special prefix (grid_*, ompgrid_*, ...) for all such
functions?
> @@ -1761,6 +1786,8 @@ fixup_child_record_type (omp_context *ctx)
> {
> tree f, type = ctx->record_type;
>
> + if (!ctx->receiver_decl)
> + return;
So when is receiver_decl NULL?
> @@ -2113,6 +2140,14 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
> }
> break;
>
> + case OMP_CLAUSE__GRIDDIM_:
> + if (ctx->outer)
> + {
> + scan_omp_op (&OMP_CLAUSE_GRIDDIM_SIZE (c), ctx->outer);
> + scan_omp_op (&OMP_CLAUSE_GRIDDIM_GROUP (c), ctx->outer);
These should be OMP_CLAUSE__GRIDDIM__{SIZE,GROUP}. See
OMP_CLAUSE__SIMDUID__DECL for another similar macro.
> @@ -6252,6 +6302,37 @@ gimple_build_cond_empty (tree cond)
> return gimple_build_cond (pred_code, lhs, rhs, NULL_TREE, NULL_TREE);
> }
>
> +/* Return true if a parallel REGION is within a declare target function or
> + within a target region and is not a part of a gridified kernel. */
> +
> +static bool
> +region_needs_kernel_p (struct omp_region *region)
> +{
> + bool indirect = false;
> + for (region = region->outer; region; region = region->outer)
> + {
> + if (region->type == GIMPLE_OMP_PARALLEL)
> + indirect = true;
> + else if (region->type == GIMPLE_OMP_TARGET)
> + {
> + gomp_target *tgt_stmt;
> + tgt_stmt = as_a <gomp_target *> (last_stmt (region->entry));
gomp_target *tgt_stmt
= as_a <gomp_target *> (last_stmt (region->entry));
?
> +static GTY(()) tree kernel_dim_array_type;
> +static GTY(()) tree kernel_lattrs_dimnum_decl;
> +static GTY(()) tree kernel_lattrs_grid_decl;
> +static GTY(()) tree kernel_lattrs_group_decl;
> +static GTY(()) tree kernel_launch_attributes_type;
Turn this at least into either a struct or array of trees, so that it is not
5 separate GC roots?
> + tree dim_arr_index_type;
> + dim_arr_index_type = build_index_type (build_int_cst (integer_type_node,
> 2));
See above for formatting; even if you don't have the declaration
one line above it, putting = in 5th column of next line will be often
beneficial for the formatting:
> + kernel_dim_array_type = build_array_type (uint32_type_node,
> + dim_arr_index_type);
> +
> + kernel_launch_attributes_type = make_node (RECORD_TYPE);
> + kernel_lattrs_dimnum_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
> + get_identifier ("ndim"),
> + uint32_type_node);
> + DECL_CHAIN (kernel_lattrs_dimnum_decl) = NULL_TREE;
> +
> + kernel_lattrs_grid_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
> + get_identifier ("grid_size"),
> + kernel_dim_array_type);
> + DECL_CHAIN (kernel_lattrs_grid_decl) = kernel_lattrs_dimnum_decl;
> + kernel_lattrs_group_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
> + get_identifier ("group_size"),
> + kernel_dim_array_type);
> + DECL_CHAIN (kernel_lattrs_group_decl) = kernel_lattrs_grid_decl;
> + finish_builtin_struct (kernel_launch_attributes_type,
> + "__gomp_kernel_launch_attributes",
> + kernel_lattrs_group_decl, NULL_TREE);
> +static tree
> +get_target_arguments (gimple_stmt_iterator *gsi, gomp_target *tgt_stmt)
> +{
> + auto_vec <tree, 4> args;
> + tree clauses = gimple_omp_target_clauses (tgt_stmt);
> + tree t, c = find_omp_clause (clauses, OMP_CLAUSE_NUM_TEAMS);
> + if (c)
> + t = OMP_CLAUSE_NUM_TEAMS_EXPR (c);
> + else
> + t = integer_minus_one_node;
> + t = get_target_argument_value (gsi, GOMP_TARGET_ARG_DEVICE_ALL,
> + GOMP_TARGET_ARG_NUM_TEAMS, t);
> + args.quick_push (t);
This is what I've talked about in review of another patch. num_teams
is int, for 32-bit targets trying to encode it into 16 bits is not going to
work.
> +
> + c = find_omp_clause (clauses, OMP_CLAUSE_THREAD_LIMIT);
> + if (c)
> + t = OMP_CLAUSE_THREAD_LIMIT_EXPR (c);
> + else
> + t = integer_minus_one_node;
> + t = get_target_argument_value (gsi, GOMP_TARGET_ARG_DEVICE_ALL,
> + GOMP_TARGET_ARG_THREAD_LIMIT, t);
Ditto.
> @@ -14872,6 +15392,14 @@ lower_omp_taskreg (gimple_stmt_iterator *gsi_p,
> omp_context *ctx)
> par_olist = NULL;
> par_ilist = NULL;
> par_rlist = NULL;
> + bool phony_construct = is_a <gomp_parallel *> (stmt)
> + && gimple_omp_parallel_kernel_phony (as_a <gomp_parallel *> (stmt));
I'm not a big fan of the is_a mess. gimple_code (stmt) == GIMPLE_OMP_PARALLEL
is what is used elsewhere.
> + if (phony_construct && ctx->record_type)
> + {
> + gcc_checking_assert (!ctx->receiver_decl);
> + ctx->receiver_decl = create_tmp_var
> + (build_reference_type (ctx->record_type), ".omp_rec");
Formatting.
> @@ -400,7 +401,8 @@ const char * const omp_clause_code_name[] =
> "num_gangs",
> "num_workers",
> "vector_length",
> - "tile"
> + "tile",
> + "griddim"
The clause is "_griddim_".
Jakub