expansion changes (gridification)

Jakub Jelinek Wed, 09 Dec 2015 05:20:12 -0800

On Mon, Dec 07, 2015 at 12:22:43PM +0100, Martin Jambor wrote:
> it creates a copy of the entire target body and expands it slightly
> differently for concurrent execution on a GPU.  Note that both teams
> and distribute constructs are mandatory.  Moreover, currently the
> distribute has to be in a combined statement with the inner for
> construct.  And there are quite a few other restrictions which I hope


The standard calls those composite constructs, and I bet for gridification
you want that restriction always, without composite distribute parallel for
there are two different unrelated loops.

>       * builtin-types.def (BT_FN_VOID_UINT_PTR_INT_PTR): New.
>       (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT): Removed.
>       (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR): New.
>       * fortran/types.def (BT_FN_VOID_UINT_PTR_INT_PTR): New.
>       (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT): Removed.
>       (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR): New.

Fortran has its own ChangeLog file.

> @@ -556,9 +558,9 @@ DEF_FUNCTION_TYPE_9 
> (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_BOOL_UINT_PTR_INT,
>                    BT_PTR_FN_VOID_PTR_PTR, BT_LONG, BT_LONG,
>                    BT_BOOL, BT_UINT, BT_PTR, BT_INT)
>  
> -DEF_FUNCTION_TYPE_10 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT,
> -                   BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR,
> -                   BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_INT, BT_INT)
> +DEF_FUNCTION_TYPE_9 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR,
> +                  BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR,
> +                  BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_PTR)

There shouldn't be an empty line in between this DEF_FUNCTION_TYPE_9 and the
previous one.

> @@ -221,9 +223,9 @@ DEF_FUNCTION_TYPE_9 
> (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_BOOL_UINT_PTR_INT,
>                    BT_PTR_FN_VOID_PTR_PTR, BT_LONG, BT_LONG,
>                    BT_BOOL, BT_UINT, BT_PTR, BT_INT)
>  
> -DEF_FUNCTION_TYPE_10 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT,
> +DEF_FUNCTION_TYPE_9 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR,
>                     BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR,
> -                   BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_INT, BT_INT)
> +                   BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_PTR)
>  
>  DEF_FUNCTION_TYPE_11 
> (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_UINT_LONG_INT_LONG_LONG_LONG,
>                     BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR,

Ditto.

> --- a/gcc/gimple.def
> +++ b/gcc/gimple.def
> @@ -369,13 +369,17 @@ DEFGSCODE(GIMPLE_OMP_TARGET, "gimple_omp_target", 
> GSS_OMP_PARALLEL_LAYOUT)
>  /* GIMPLE_OMP_TEAMS <BODY, CLAUSES> represents #pragma omp teams
>     BODY is the sequence of statements inside the single section.
>     CLAUSES is an OMP_CLAUSE chain holding the associated clauses.  */
> -DEFGSCODE(GIMPLE_OMP_TEAMS, "gimple_omp_teams", GSS_OMP_SINGLE_LAYOUT)
> +DEFGSCODE(GIMPLE_OMP_TEAMS, "gimple_omp_teams", GSS_OMP_TEAMS_LAYOUT)

Why?

> +/* GIMPLE_OMP_GPUKERNEL <BODY> represents a parallel loop lowered for 
> execution
> +   on a GPU.  It is an artificial statement created by omp lowering.  */
> +DEFGSCODE(GIMPLE_OMP_GPUKERNEL, "gimple_omp_gpukernel", GSS_OMP)

Why do you call it GPUKERNEL or KERNEL_BODY when you really mean gridified
body and gridified loop?  I mean, what is GPU specific about it?  PTX is
unlikely going to use that.  And kernel is a wide term.

> @@ -622,8 +623,14 @@ struct GTY((tag("GSS_OMP_FOR")))
>    /* [ WORD 11 ]
>       Pre-body evaluated before the loop body begins.  */
>    gimple_seq pre_body;
> +
> +  /* [ WORD 12 ]
> +     If set, this statement is part of a gridified kernel, its clauses need 
> to
> +     be scanned and lowered but the statement should be discarded after
> +     lowering.  */
> +  bool kernel_phony;

Ugh no, flags should go into GF_OMP_*.

> @@ -643,6 +660,12 @@ struct GTY((tag("GSS_OMP_PARALLEL_LAYOUT")))
>    /* [ WORD 10 ]
>       Shared data argument.  */
>    tree data_arg;
> +
> +  /* [ WORD 11 ] */
> +  /* If set, this statement is part of a gridified kernel, its clauses need 
> to
> +     be scanned and lowered but the statement should be discarded after
> +     lowering.  */
> +  bool kernel_phony;
>  };

Likewise.

As for omp-low.c changes, the file is already large enough that it would be
nice if it is easy to find out what routines are for gridification purposes
only, use some special prefix (grid_*, ompgrid_*, ...) for all such
functions?

> @@ -1761,6 +1786,8 @@ fixup_child_record_type (omp_context *ctx)
>  {
>    tree f, type = ctx->record_type;
>  
> +  if (!ctx->receiver_decl)
> +    return;

So when is receiver_decl NULL?

> @@ -2113,6 +2140,14 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
>           }
>         break;
>  
> +     case OMP_CLAUSE__GRIDDIM_:
> +       if (ctx->outer)
> +         {
> +           scan_omp_op (&OMP_CLAUSE_GRIDDIM_SIZE (c), ctx->outer);
> +           scan_omp_op (&OMP_CLAUSE_GRIDDIM_GROUP (c), ctx->outer);

These should be OMP_CLAUSE__GRIDDIM__{SIZE,GROUP}.  See
OMP_CLAUSE__SIMDUID__DECL for another similar macro.

> @@ -6252,6 +6302,37 @@ gimple_build_cond_empty (tree cond)
>    return gimple_build_cond (pred_code, lhs, rhs, NULL_TREE, NULL_TREE);
>  }
>  
> +/* Return true if a parallel REGION is within a declare target function or
> +   within a target region and is not a part of a gridified kernel.  */
> +
> +static bool
> +region_needs_kernel_p (struct omp_region *region)
> +{
> +  bool indirect = false;
> +  for (region = region->outer; region; region = region->outer)
> +    {
> +      if (region->type == GIMPLE_OMP_PARALLEL)
> +     indirect = true;
> +      else if (region->type == GIMPLE_OMP_TARGET)
> +     {
> +       gomp_target *tgt_stmt;
> +       tgt_stmt = as_a <gomp_target *> (last_stmt (region->entry));

          gomp_target *tgt_stmt
            = as_a <gomp_target *> (last_stmt (region->entry));
?

> +static GTY(()) tree kernel_dim_array_type;
> +static GTY(()) tree kernel_lattrs_dimnum_decl;
> +static GTY(()) tree kernel_lattrs_grid_decl;
> +static GTY(()) tree kernel_lattrs_group_decl;
> +static GTY(()) tree kernel_launch_attributes_type;

Turn this at least into either a struct or array of trees, so that it is not
5 separate GC roots?

> +  tree dim_arr_index_type;
> +  dim_arr_index_type = build_index_type (build_int_cst (integer_type_node, 
> 2));

See above for formatting; even if you don't have the declaration
one line above it, putting = in 5th column of next line will be often
beneficial for the formatting:

> +  kernel_dim_array_type = build_array_type (uint32_type_node,
> +                                         dim_arr_index_type);
> +
> +  kernel_launch_attributes_type = make_node (RECORD_TYPE);
> +  kernel_lattrs_dimnum_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
> +                                    get_identifier ("ndim"),
> +                                    uint32_type_node);
> +  DECL_CHAIN (kernel_lattrs_dimnum_decl) = NULL_TREE;
> +
> +  kernel_lattrs_grid_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
> +                                  get_identifier ("grid_size"),
> +                                  kernel_dim_array_type);
> +  DECL_CHAIN (kernel_lattrs_grid_decl) = kernel_lattrs_dimnum_decl;
> +  kernel_lattrs_group_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
> +                                  get_identifier ("group_size"),
> +                                  kernel_dim_array_type);
> +  DECL_CHAIN (kernel_lattrs_group_decl) = kernel_lattrs_grid_decl;
> +  finish_builtin_struct (kernel_launch_attributes_type,
> +                      "__gomp_kernel_launch_attributes",
> +                      kernel_lattrs_group_decl, NULL_TREE);

> +static tree
> +get_target_arguments (gimple_stmt_iterator *gsi, gomp_target *tgt_stmt)
> +{
> +  auto_vec <tree, 4> args;
> +  tree clauses = gimple_omp_target_clauses (tgt_stmt);
> +  tree t, c = find_omp_clause (clauses, OMP_CLAUSE_NUM_TEAMS);
> +  if (c)
> +    t = OMP_CLAUSE_NUM_TEAMS_EXPR (c);
> +  else
> +    t = integer_minus_one_node;
> +  t = get_target_argument_value (gsi, GOMP_TARGET_ARG_DEVICE_ALL,
> +                              GOMP_TARGET_ARG_NUM_TEAMS, t);
> +  args.quick_push (t);

This is what I've talked about in review of another patch.  num_teams
is int, for 32-bit targets trying to encode it into 16 bits is not going to
work.
> +
> +  c = find_omp_clause (clauses, OMP_CLAUSE_THREAD_LIMIT);
> +  if (c)
> +    t = OMP_CLAUSE_THREAD_LIMIT_EXPR (c);
> +  else
> +    t = integer_minus_one_node;
> +  t = get_target_argument_value (gsi, GOMP_TARGET_ARG_DEVICE_ALL,
> +                              GOMP_TARGET_ARG_THREAD_LIMIT, t);

Ditto.

> @@ -14872,6 +15392,14 @@ lower_omp_taskreg (gimple_stmt_iterator *gsi_p, 
> omp_context *ctx)
>    par_olist = NULL;
>    par_ilist = NULL;
>    par_rlist = NULL;
> +  bool phony_construct = is_a <gomp_parallel *> (stmt)
> +    && gimple_omp_parallel_kernel_phony (as_a <gomp_parallel *> (stmt));

I'm not a big fan of the is_a mess.  gimple_code (stmt) == GIMPLE_OMP_PARALLEL
is what is used elsewhere.

> +  if (phony_construct && ctx->record_type)
> +    {
> +      gcc_checking_assert (!ctx->receiver_decl);
> +      ctx->receiver_decl = create_tmp_var
> +     (build_reference_type (ctx->record_type), ".omp_rec");

Formatting.
> @@ -400,7 +401,8 @@ const char * const omp_clause_code_name[] =
>    "num_gangs",
>    "num_workers",
>    "vector_length",
> -  "tile"
> +  "tile",
> +  "griddim"

The clause is "_griddim_".

        Jakub

Re: [hsa 5/10] OpenMP lowering/expansion changes (gridification)

Reply via email to