If requested, return the vectorization factor appropriate for the offload
device, if any.
This change gives a significant speedup in the BabelStream "dot" benchmark on
amdgcn.
The omp_adjust_chunk_size use case passes "false" for now, but I intend to
change that in a follow-up patch.
Note that NVPTX SIMT offload does not use this code-path.
gcc/ChangeLog:
* gimple-loop-versioning.cc (loop_versioning::loop_versioning): Pass
false for the new omp_max_vf "offload" parameter.
* omp-expand.cc (omp_adjust_chunk_size): Likewise.
* omp-general.cc (omp_max_vf): Add "offload" parameter, and detect
amdgcn offload devices.
* omp-general.h (omp_max_vf): Likewise.
* omp-low.cc (lower_rec_simd_input_clauses): Pass offload state to
omp_max_vf.
---
gcc/gimple-loop-versioning.cc | 2 +-
gcc/omp-expand.cc | 2 +-
gcc/omp-general.cc | 17 +++++++++++++++--
gcc/omp-general.h | 2 +-
gcc/omp-low.cc | 3 ++-
5 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc
index 107b0020024..2968c929d04 100644
--- a/gcc/gimple-loop-versioning.cc
+++ b/gcc/gimple-loop-versioning.cc
@@ -554,7 +554,7 @@ loop_versioning::loop_versioning (function *fn)
handled efficiently by scalar code. omp_max_vf calculates the
maximum number of bytes in a vector, when such a value is relevant
to loop optimization. */
- m_maximum_scale = estimated_poly_value (omp_max_vf ());
+ m_maximum_scale = estimated_poly_value (omp_max_vf (false));
m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
}
diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc
index b0b4ddf5dbc..907fd46a5b2 100644
--- a/gcc/omp-expand.cc
+++ b/gcc/omp-expand.cc
@@ -212,7 +212,7 @@ omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
if (!simd_schedule || integer_zerop (chunk_size))
return chunk_size;
- poly_uint64 vf = omp_max_vf ();
+ poly_uint64 vf = omp_max_vf (false);
if (known_eq (vf, 1U))
return chunk_size;
diff --git a/gcc/omp-general.cc b/gcc/omp-general.cc
index f74b9bf5e96..1ae575ee181 100644
--- a/gcc/omp-general.cc
+++ b/gcc/omp-general.cc
@@ -987,10 +987,11 @@ find_combined_omp_for (tree *tp, int *walk_subtrees, void *data)
return NULL_TREE;
}
-/* Return maximum possible vectorization factor for the target. */
+/* Return maximum possible vectorization factor for the target, or for
+ the OpenMP offload target if one exists. */
poly_uint64
-omp_max_vf (void)
+omp_max_vf (bool offload)
{
if (!optimize
|| optimize_debug
@@ -999,6 +1000,18 @@ omp_max_vf (void)
&& OPTION_SET_P (flag_tree_loop_vectorize)))
return 1;
+ if (ENABLE_OFFLOADING && offload)
+ {
+ for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c;)
+ {
+ if (startswith (c, "amdgcn"))
+ return ordered_max (64, omp_max_vf (false));
+ else if ((c = strchr (c, ':')))
+ c++;
+ }
+ /* Otherwise, fall through to host VF. */
+ }
+
auto_vector_modes modes;
targetm.vectorize.autovectorize_vector_modes (&modes, true);
if (!modes.is_empty ())
diff --git a/gcc/omp-general.h b/gcc/omp-general.h
index f3778131626..70f78d2055b 100644
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -162,7 +162,7 @@ extern void omp_extract_for_data (gomp_for *for_stmt,
struct omp_for_data *fd,
struct omp_for_data_loop *loops);
extern gimple *omp_build_barrier (tree lhs);
extern tree find_combined_omp_for (tree *, int *, void *);
-extern poly_uint64 omp_max_vf (void);
+extern poly_uint64 omp_max_vf (bool);
extern int omp_max_simt_vf (void);
extern const char *omp_context_name_list_prop (tree);
extern void omp_construct_traits_to_codes (tree, int, enum tree_code *);
diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc
index 44c4310075b..70a2c108fbc 100644
--- a/gcc/omp-low.cc
+++ b/gcc/omp-low.cc
@@ -4589,7 +4589,8 @@ lower_rec_simd_input_clauses (tree new_var, omp_context *ctx,
{
if (known_eq (sctx->max_vf, 0U))
{
- sctx->max_vf = sctx->is_simt ? omp_max_simt_vf () : omp_max_vf ();
+ sctx->max_vf = (sctx->is_simt ? omp_max_simt_vf ()
+ : omp_max_vf (omp_maybe_offloaded_ctx (ctx)));
if (maybe_gt (sctx->max_vf, 1U))
{
tree c = omp_find_clause (gimple_omp_for_clauses (ctx->stmt),
--
2.46.0