On 2019/9/24 6:43 PM, Chung-Lin Tang wrote:
--- gcc/config/nvptx/nvptx.c (revision 275493)
+++ gcc/config/nvptx/nvptx.c (working copy)
+static void
+nvptx_expand_to_rtl_hook (void)
+{
+ /* For utilizing CUDA .param kernel arguments, we detect and modify
+ the gimple of offloaded child functions, here before RTL expansion,
+ starting with standard OMP form:
+ foo._omp_fn.0 (const struct .omp_data_t.8 & restrict .omp_data_i) { ... }
+
+ and transform it into a style where the OMP data record fields are
+ "exploded" into individual scalar arguments:
+ foo._omp_fn.0 (int * a, int * b, int * c) { ... }
+
+ Note that there are implicit assumptions of how OMP lowering (and/or other
+ intervening passes) behaves contained in this transformation code;
+ if those passes change in their output, this code may possibly need
+ updating. */
+
+ if (lookup_attribute ("omp target entrypoint",
+ DECL_ATTRIBUTES (current_function_decl))
+ /* The rather indirect manner in which OpenMP target functions are
+ launched makes this transformation only valid for OpenACC currently.
+ TODO: e.g. write_omp_entry(), nvptx_declare_function_name(), etc.
+ needs changes for this to work with OpenMP. */
+ && lookup_attribute ("oacc function",
+ DECL_ATTRIBUTES (current_function_decl))
+ && VOID_TYPE_P (TREE_TYPE (DECL_RESULT (current_function_decl))))
Why the 'void' return conditional? (Or, should that rather be a
'gcc_checking_assert' at the top of the following block?)
That's the shape of the child functions that omp-low generates. Maybe that should
be an assertion, though here I'm just doing sanity checking and ignoring them otherwise.
Come to think of it, maybe I should try using the assertion to check if
I'm unintentionally ignoring transforming some cases...
I've updated the patch to use an assertion for those convention checks. I think
it's better to leave a level of checking in place, so gcc_assert() instead of
gcc_checking_assert(). Also tested with no regressions.
Thanks,
Chung-Lin
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c (revision 276406)
+++ gcc/config/nvptx/nvptx.c (working copy)
@@ -68,6 +68,10 @@
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "tree-cfg.h"
+#include "gimple-ssa.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
@@ -6437,6 +6441,226 @@ nvptx_set_current_function (tree fndecl)
oacc_bcast_partition = 0;
}
+static void
+nvptx_expand_to_rtl_hook (void)
+{
+  /* For utilizing CUDA .param kernel arguments, we detect and modify
+     the gimple of offloaded child functions, here before RTL expansion,
+     starting with standard OMP form:
+       foo._omp_fn.0 (const struct .omp_data_t.8 & restrict .omp_data_i) { ... }
+
+     and transform it into a style where the OMP data record fields are
+     "exploded" into individual scalar arguments:
+       foo._omp_fn.0 (int * a, int * b, int * c) { ... }
+
+     Note that there are implicit assumptions of how OMP lowering (and/or other
+     intervening passes) behaves contained in this transformation code;
+     if those passes change in their output, this code may possibly need
+     updating.  */
+
+  if (lookup_attribute ("omp target entrypoint",
+                        DECL_ATTRIBUTES (current_function_decl))
+      /* The rather indirect manner in which OpenMP target functions are
+         launched makes this transformation only valid for OpenACC currently.
+         TODO: e.g. write_omp_entry(), nvptx_declare_function_name(), etc.
+         need changes for this to work with OpenMP.  */
+      && lookup_attribute ("oacc function",
+                           DECL_ATTRIBUTES (current_function_decl)))
+    {
+      tree omp_data_arg = DECL_ARGUMENTS (current_function_decl);
+      tree argtype = TREE_TYPE (omp_data_arg);
+
+      /* Ensure this function is of the form of a single reference argument
+         to the OMP data record, or a single void* argument (when no values
+         are passed).  */
+      gcc_assert (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (current_function_decl)))
+                  && (DECL_CHAIN (omp_data_arg) == NULL_TREE
+                      && ((TREE_CODE (argtype) == REFERENCE_TYPE
+                           && TREE_CODE (TREE_TYPE (argtype)) == RECORD_TYPE)
+                          || (TREE_CODE (argtype) == POINTER_TYPE
+                              && TREE_TYPE (argtype) == void_type_node))));
+      if (dump_file)
+        {
+          fprintf (dump_file, "Detected offloaded child function %s, "
+                   "starting parameter conversion\n",
+                   print_generic_expr_to_str (current_function_decl));
+          fprintf (dump_file, "OMP data record argument: %s (tree type: %s)\n",
+                   print_generic_expr_to_str (omp_data_arg),
+                   print_generic_expr_to_str (argtype));
+          fprintf (dump_file, "Data record fields:\n");
+        }
+
+      hash_map<tree, tree> fld_to_args;
+      tree fld, rectype = TREE_TYPE (argtype);
+      tree arglist = NULL_TREE, argtypelist = NULL_TREE;
+
+      if (TREE_CODE (rectype) == RECORD_TYPE)
+        {
+          /* For each field in the OMP data record type, create a
+             corresponding PARM_DECL, and map field -> parm using the
+             fld_to_args hash_map.  Also create the tree chains for creating
+             the function type and DECL_ARGUMENTS below.  */
+          for (fld = TYPE_FIELDS (rectype); fld; fld = DECL_CHAIN (fld))
+            {
+              tree narg = build_decl (DECL_SOURCE_LOCATION (fld), PARM_DECL,
+                                      DECL_NAME (fld), TREE_TYPE (fld));
+              DECL_ARTIFICIAL (narg) = 1;
+              DECL_ARG_TYPE (narg) = TREE_TYPE (fld);
+              DECL_CONTEXT (narg) = current_function_decl;
+              TREE_USED (narg) = 1;
+              TREE_READONLY (narg) = 1;
+
+              if (dump_file)
+                fprintf (dump_file,
+                         "\t%s, type: %s, offset: %s bytes + %s bits\n",
+                         print_generic_expr_to_str (fld),
+                         print_generic_expr_to_str (TREE_TYPE (fld)),
+                         print_generic_expr_to_str (DECL_FIELD_OFFSET (fld)),
+                         print_generic_expr_to_str (DECL_FIELD_BIT_OFFSET (fld)));
+              fld_to_args.put (fld, narg);
+
+              /* Build ARGLIST in reverse, nreverse'd after the loop.  */
+              TREE_CHAIN (narg) = arglist;
+              arglist = narg;
+              argtypelist = tree_cons (NULL_TREE, TREE_TYPE (narg),
+                                       argtypelist);
+            }
+          arglist = nreverse (arglist);
+          argtypelist = nreverse (argtypelist);
+        }
+      /* This is needed to not be mistaken for a stdarg function.  */
+      argtypelist = chainon (argtypelist, void_list_node);
+
+      if (dump_file)
+        {
+          fprintf (dump_file, "Function before OMP data arg replaced:\n");
+          dump_function_to_file (current_function_decl, dump_file, dump_flags);
+        }
+
+      /* Actually modify the tree type and DECL_ARGUMENTS here.  */
+      TREE_TYPE (current_function_decl)
+        = build_function_type (void_type_node, argtypelist);
+      DECL_ARGUMENTS (current_function_decl) = arglist;
+
+      /* Remove local decls which correspond to *.omp_data_i->FIELD entries,
+         by scanning and skipping those entries, creating a new local_decls
+         list.  We assume a very specific MEM_REF tree expression shape.  */
+      tree decl;
+      unsigned int i;
+      vec<tree, va_gc> *new_local_decls = NULL;
+      FOR_EACH_VEC_SAFE_ELT (cfun->local_decls, i, decl)
+        {
+          if (DECL_HAS_VALUE_EXPR_P (decl))
+            {
+              tree t = DECL_VALUE_EXPR (decl);
+              if (TREE_CODE (t) == MEM_REF
+                  && TREE_CODE (TREE_OPERAND (t, 0)) == COMPONENT_REF
+                  && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t, 0), 0))
+                      == MEM_REF)
+                  && (TREE_OPERAND (TREE_OPERAND (TREE_OPERAND (t, 0), 0), 0)
+                      == omp_data_arg))
+                continue;
+            }
+          vec_safe_push (new_local_decls, decl);
+        }
+      vec_free (cfun->local_decls);
+      cfun->local_decls = new_local_decls;
+
+      /* Scan function body for assignments from .omp_data_i->FIELD, and
+         using the above created fld_to_args hash map, convert them to reads
+         of function arguments.  */
+      basic_block bb;
+      gimple_stmt_iterator gsi;
+      FOR_EACH_BB_FN (bb, cfun)
+        for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+          {
+            tree val, *val_ptr = NULL;
+            gimple *stmt = gsi_stmt (gsi);
+            if (is_gimple_assign (stmt)
+                && gimple_assign_rhs_class (stmt) == GIMPLE_SINGLE_RHS)
+              val_ptr = gimple_assign_rhs1_ptr (stmt);
+            else if (is_gimple_debug (stmt) && gimple_debug_bind_p (stmt))
+              val_ptr = gimple_debug_bind_get_value_ptr (stmt);
+
+            if (val_ptr == NULL || (val = *val_ptr) == NULL_TREE)
+              continue;
+
+            tree new_val = NULL_TREE;
+
+            if (TREE_CODE (val) == COMPONENT_REF
+                && TREE_CODE (TREE_OPERAND (val, 0)) == MEM_REF
+                && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (val, 0), 0))
+                    == SSA_NAME)
+                && (SSA_NAME_VAR (TREE_OPERAND (TREE_OPERAND (val, 0), 0))
+                    == omp_data_arg))
+              {
+                /* .omp_data_i->FIELD case: direct lookup of the field in the
+                   map built above.  Assert the field was actually mapped, so
+                   we don't silently dereference a NULL hash_map slot.  */
+                tree *parm = fld_to_args.get (TREE_OPERAND (val, 1));
+                gcc_assert (parm != NULL);
+                new_val = *parm;
+              }
+            else if (TREE_CODE (val) == MEM_REF
+                     && TREE_CODE (TREE_OPERAND (val, 0)) == SSA_NAME
+                     && SSA_NAME_VAR (TREE_OPERAND (val, 0)) == omp_data_arg)
+              {
+                /* This case may happen in the final tree level optimization
+                   output, due to SLP:
+                     vect.XX = MEM <vector(1) unsigned long>
+                               [(void *).omp_data_i_5(D) + 8B]
+
+                   Therefore here we need a more elaborate search of the
+                   field list to reverse map to which field the offset is
+                   referring to.  */
+                unsigned HOST_WIDE_INT offset
+                  = tree_to_uhwi (TREE_OPERAND (val, 1));
+
+                /* NB: named 'iter', not 'i', to avoid shadowing the outer
+                   loop counter used by FOR_EACH_SSA_NAME below.  */
+                for (hash_map<tree, tree>::iterator iter = fld_to_args.begin ();
+                     iter != fld_to_args.end (); ++iter)
+                  {
+                    tree cur_fld = (*iter).first;
+                    tree cur_arg = (*iter).second;
+                    gcc_assert (TREE_CODE (cur_arg) == PARM_DECL);
+
+                    unsigned HOST_WIDE_INT cur_offset
+                      = (tree_to_uhwi (DECL_FIELD_OFFSET (cur_fld))
+                         + (tree_to_uhwi (DECL_FIELD_BIT_OFFSET (cur_fld))
+                            / BITS_PER_UNIT));
+
+                    if (offset == cur_offset)
+                      {
+                        new_val = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (val),
+                                          cur_arg);
+                        break;
+                      }
+                  }
+              }
+
+            /* If we found the corresponding OMP data record field, replace
+               the RHS with the newly created PARM_DECL.  */
+            if (new_val != NULL_TREE)
+              {
+                if (dump_file)
+                  {
+                    fprintf (dump_file, "For gimple stmt: ");
+                    print_gimple_stmt (dump_file, stmt, 0);
+                    fprintf (dump_file,
+                             "\tReplacing OMP recv ref %s with %s\n",
+                             print_generic_expr_to_str (val),
+                             print_generic_expr_to_str (new_val));
+                  }
+                /* Write in looked up ARG as new RHS value.  */
+                *val_ptr = new_val;
+              }
+          }
+
+      /* Delete SSA_NAMEs of .omp_data_i by setting them to NULL_TREE.  */
+      tree name;
+      FOR_EACH_SSA_NAME (i, name, cfun)
+        if (SSA_NAME_VAR (name) == omp_data_arg)
+          (*SSANAMES (cfun))[SSA_NAME_VERSION (name)] = NULL_TREE;
+
+      if (dump_file)
+        {
+          fprintf (dump_file, "Function after OMP data arg replaced:\n");
+          dump_function_to_file (current_function_decl, dump_file, dump_flags);
+        }
+    }
+}
+
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override
@@ -6576,6 +6800,9 @@ nvptx_set_current_function (tree fndecl)
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
+#undef TARGET_EXPAND_TO_RTL_HOOK
+#define TARGET_EXPAND_TO_RTL_HOOK nvptx_expand_to_rtl_hook
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-nvptx.h"
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c (revision 276406)
+++ libgomp/plugin/plugin-nvptx.c (working copy)
@@ -695,16 +695,24 @@ link_ptx (CUmodule *module, const struct targ_ptx_
static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
- unsigned *dims, void *targ_mem_desc,
- CUdeviceptr dp, CUstream stream)
+ unsigned *dims, CUstream stream)
{
struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
CUfunction function;
int i;
- void *kargs[1];
struct nvptx_thread *nvthd = nvptx_thread ();
int warp_size = nvthd->ptx_dev->warp_size;
+ void **kernel_args = NULL;
+ GOMP_PLUGIN_debug (0, "prepare mappings (mapnum: %u)\n", (unsigned) mapnum);
+
+ if (mapnum > 0)
+ {
+ kernel_args = alloca (mapnum * sizeof (void *));
+ for (int i = 0; i < mapnum; i++)
+ kernel_args[i] = (devaddrs[i] ? &devaddrs[i] : &hostaddrs[i]);
+ }
+
function = targ_fn->fn;
/* Initialize the launch dimensions. Typically this is constant,
@@ -936,11 +944,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
api_info);
}
- kargs[0] = &dp;
CUDA_CALL_ASSERT (cuLaunchKernel, function,
dims[GOMP_DIM_GANG], 1, 1,
dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
- 0, stream, kargs, 0);
+ 0, stream, kernel_args, 0);
if (profiling_p)
{
@@ -1349,67 +1356,8 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), si
void **hostaddrs, void **devaddrs,
unsigned *dims, void *targ_mem_desc)
{
- GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
+ nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, NULL);
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- acc_prof_info *prof_info = thr->prof_info;
- acc_event_info data_event_info;
- acc_api_info *api_info = thr->api_info;
- bool profiling_p = __builtin_expect (prof_info != NULL, false);
-
- void **hp = NULL;
- CUdeviceptr dp = 0;
-
- if (mapnum > 0)
- {
- size_t s = mapnum * sizeof (void *);
- hp = alloca (s);
- for (int i = 0; i < mapnum; i++)
- hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
- CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
- if (profiling_p)
- goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
- }
-
- /* Copy the (device) pointers to arguments to the device (dp and hp might in
- fact have the same value on a unified-memory system). */
- if (mapnum > 0)
- {
- if (profiling_p)
- {
- prof_info->event_type = acc_ev_enqueue_upload_start;
-
- data_event_info.data_event.event_type = prof_info->event_type;
- data_event_info.data_event.valid_bytes
- = _ACC_DATA_EVENT_INFO_VALID_BYTES;
- data_event_info.data_event.parent_construct
- = acc_construct_parallel;
- data_event_info.data_event.implicit = 1; /* Always implicit. */
- data_event_info.data_event.tool_info = NULL;
- data_event_info.data_event.var_name = NULL;
- data_event_info.data_event.bytes = mapnum * sizeof (void *);
- data_event_info.data_event.host_ptr = hp;
- data_event_info.data_event.device_ptr = (const void *) dp;
-
- api_info->device_api = acc_device_api_cuda;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
- CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
- mapnum * sizeof (void *));
- if (profiling_p)
- {
- prof_info->event_type = acc_ev_enqueue_upload_end;
- data_event_info.data_event.event_type = prof_info->event_type;
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
- }
-
- nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
- dp, NULL);
-
CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
const char *maybe_abort_msg = "(perhaps abort was called)";
if (r == CUDA_ERROR_LAUNCH_FAILED)
@@ -1417,20 +1365,8 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), si
maybe_abort_msg);
else if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
-
- CUDA_CALL_ASSERT (cuMemFree, dp);
- if (profiling_p)
- goacc_profiling_acc_ev_free (thr, (void *) dp);
}
-static void
-cuda_free_argmem (void *ptr)
-{
- void **block = (void **) ptr;
- nvptx_free (block[0], (struct ptx_device *) block[1]);
- free (block);
-}
-
void
GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
void **hostaddrs, void **devaddrs,
@@ -1437,78 +1373,7 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void
unsigned *dims, void *targ_mem_desc,
struct goacc_asyncqueue *aq)
{
- GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
-
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- acc_prof_info *prof_info = thr->prof_info;
- acc_event_info data_event_info;
- acc_api_info *api_info = thr->api_info;
- bool profiling_p = __builtin_expect (prof_info != NULL, false);
-
- void **hp = NULL;
- CUdeviceptr dp = 0;
- void **block = NULL;
-
- if (mapnum > 0)
- {
- size_t s = mapnum * sizeof (void *);
- block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
- hp = block + 2;
- for (int i = 0; i < mapnum; i++)
- hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
- CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
- if (profiling_p)
- goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
- }
-
- /* Copy the (device) pointers to arguments to the device (dp and hp might in
- fact have the same value on a unified-memory system). */
- if (mapnum > 0)
- {
- if (profiling_p)
- {
- prof_info->event_type = acc_ev_enqueue_upload_start;
-
- data_event_info.data_event.event_type = prof_info->event_type;
- data_event_info.data_event.valid_bytes
- = _ACC_DATA_EVENT_INFO_VALID_BYTES;
- data_event_info.data_event.parent_construct
- = acc_construct_parallel;
- data_event_info.data_event.implicit = 1; /* Always implicit. */
- data_event_info.data_event.tool_info = NULL;
- data_event_info.data_event.var_name = NULL;
- data_event_info.data_event.bytes = mapnum * sizeof (void *);
- data_event_info.data_event.host_ptr = hp;
- data_event_info.data_event.device_ptr = (const void *) dp;
-
- api_info->device_api = acc_device_api_cuda;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
-
- CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
- mapnum * sizeof (void *), aq->cuda_stream);
- block[0] = (void *) dp;
-
- struct nvptx_thread *nvthd =
- (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
- block[1] = (void *) nvthd->ptx_dev;
-
- if (profiling_p)
- {
- prof_info->event_type = acc_ev_enqueue_upload_end;
- data_event_info.data_event.event_type = prof_info->event_type;
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
- }
-
- nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
- dp, aq->cuda_stream);
-
- if (mapnum > 0)
- GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
+ nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, aq->cuda_stream);
}
void *