https://gcc.gnu.org/g:36c5a7aa9a6dbaed07e3a2482c66743ddcb3e776

commit r16-2638-g36c5a7aa9a6dbaed07e3a2482c66743ddcb3e776
Author: Andrew Stubbs <a...@baylibre.com>
Date:   Mon Jul 28 13:58:03 2025 +0000

    vect: Add target hook to prefer gather/scatter instructions
    
    For AMD GCN, the instructions available for loading/storing vectors are
    always scatter/gather operations (i.e. there are separate addresses for
    each vector lane), so the current heuristic to avoid gather/scatter
    operations with too many elements in get_group_load_store_type is
    counterproductive. Rejecting such operations in that function can
    lead to a missed vectorization opportunity: later analyses in the
    vectorizer then try to use a very wide array type that is not
    available on this target, and so vectorization fails.
    
    This patch adds a target hook that overrides the "single_element_p"
    heuristic in that function, and activates the hook for GCN. This
    allows much better code to be generated for affected loops.
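
    For illustration (the loop below is hypothetical, not taken from the
    patch), consider a grouped access with two elements per scalar
    iteration. Previously the single_element_p test blocked the
    gather/scatter path for such groups, even though GCN computes each
    lane's address separately in any case:

        /* Illustrative only: a two-element load group with a runtime
           stride.  With the new hook returning true, the vectorizer may
           use gather loads here instead of giving up on the wide
           elementwise accesses it would otherwise need.  */
        void
        sum_pairs (float *restrict out, const float *restrict in,
                   int stride, int n)
        {
          for (int i = 0; i < n; i++)
            out[i] = in[i * stride] + in[i * stride + 1];
        }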
    
    Co-authored-by: Julian Brown <jul...@codesourcery.com>
    
    gcc/
            * doc/tm.texi.in (TARGET_VECTORIZE_PREFER_GATHER_SCATTER): Add
            documentation for the new hook.
            * doc/tm.texi: Regenerate.
            * target.def (prefer_gather_scatter): Add target hook under
            vectorizer.
            * hooks.cc (hook_bool_mode_int_unsigned_false): New function.
            * hooks.h (hook_bool_mode_int_unsigned_false): New prototype.
            * tree-vect-stmts.cc (vect_use_strided_gather_scatters_p): Add
            parameters group_size and single_element_p, and rework to use
            targetm.vectorize.prefer_gather_scatter.
            (get_group_load_store_type): Move some of the condition into
            vect_use_strided_gather_scatters_p.
            * config/gcn/gcn.cc (gcn_prefer_gather_scatter): New function.
            (TARGET_VECTORIZE_PREFER_GATHER_SCATTER): Define hook.
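
    A port where gathers and scatters are not uniformly cheap could
    implement the hook more selectively than GCN's unconditional true.
    The sketch below is hypothetical and not part of this patch; only
    the hook's signature and the macro name come from it:

        /* Hypothetical port code: prefer gather/scatter only for
           unscaled addressing and small groups.  Everything here except
           the macro name and signature is illustrative.  */
        static bool
        example_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
                                       int scale, unsigned int group_size)
        {
          return scale == 1 && group_size <= 4;
        }

        #undef  TARGET_VECTORIZE_PREFER_GATHER_SCATTER
        #define TARGET_VECTORIZE_PREFER_GATHER_SCATTER \
          example_prefer_gather_scatter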

Diff:
---
 gcc/config/gcn/gcn.cc  | 12 ++++++++++++
 gcc/doc/tm.texi        |  9 +++++++++
 gcc/doc/tm.texi.in     |  2 ++
 gcc/hooks.cc           |  7 +++++++
 gcc/hooks.h            |  1 +
 gcc/target.def         | 14 ++++++++++++++
 gcc/tree-vect-stmts.cc | 33 +++++++++++++++++++++++----------
 7 files changed, 68 insertions(+), 10 deletions(-)

diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 803ffa791d5e..5ffeb23dbaa6 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5795,6 +5795,16 @@ gcn_libc_has_function (enum function_class fn_class,
   return bsd_libc_has_function (fn_class, type);
 }
 
+/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER. */
+
+static bool
+gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
+                          int ARG_UNUSED (scale),
+                          unsigned int ARG_UNUSED (group_size))
+{
+  return true;
+}
+
 /* }}}  */
 /* {{{ md_reorg pass.  */
 
@@ -8146,6 +8156,8 @@ gcn_dwarf_register_span (rtx rtl)
   gcn_vectorize_builtin_vectorized_function
 #undef  TARGET_VECTORIZE_GET_MASK_MODE
 #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
+#undef  TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter
 #undef  TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
 #undef  TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 928578bcc032..215552ceed96 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6513,6 +6513,15 @@ The default is @code{NULL_TREE} which means to not vectorize scatter
 stores.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFER_GATHER_SCATTER (machine_mode @var{mode}, int @var{scale}, unsigned int @var{group_size})
+This hook returns TRUE if gather loads or scatter stores are cheaper on
+this target than a sequence of elementwise loads or stores.  The @var{mode}
+and @var{scale} correspond to the @code{gather_load} and
+@code{scatter_store} instruction patterns.  The @var{group_size} is the
+number of scalar elements in each scalar loop iteration that are to be
+combined into the vector.
+@end deftypefn
+
 @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int}, @var{bool})
 This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float}
 fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index eccc4d884938..b03ad4c97c60 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4311,6 +4311,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_BUILTIN_SCATTER
 
+@hook TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
 
 @hook TARGET_SIMD_CLONE_ADJUST
diff --git a/gcc/hooks.cc b/gcc/hooks.cc
index 951825d4cf66..76cb593103dd 100644
--- a/gcc/hooks.cc
+++ b/gcc/hooks.cc
@@ -117,6 +117,13 @@ hook_bool_mode_const_rtx_true (machine_mode, const_rtx)
   return true;
 }
 
+/* Generic hook that takes (machine_mode, int, unsigned) and returns false.  */
+bool
+hook_bool_mode_int_unsigned_false (machine_mode, int, unsigned)
+{
+  return false;
+}
+
 /* Generic hook that takes (machine_mode, rtx) and returns false.  */
 bool
 hook_bool_mode_rtx_false (machine_mode, rtx)
diff --git a/gcc/hooks.h b/gcc/hooks.h
index c0663bf44558..e95bd11aca86 100644
--- a/gcc/hooks.h
+++ b/gcc/hooks.h
@@ -36,6 +36,7 @@ extern bool hook_bool_mode_true (machine_mode);
 extern bool hook_bool_mode_mode_true (machine_mode, machine_mode);
 extern bool hook_bool_mode_const_rtx_false (machine_mode, const_rtx);
 extern bool hook_bool_mode_const_rtx_true (machine_mode, const_rtx);
+extern bool hook_bool_mode_int_unsigned_false (machine_mode, int, unsigned);
 extern bool hook_bool_mode_rtx_false (machine_mode, rtx);
 extern bool hook_bool_mode_rtx_true (machine_mode, rtx);
 extern bool hook_bool_const_rtx_insn_const_rtx_insn_true (const rtx_insn *,
diff --git a/gcc/target.def b/gcc/target.def
index 427dc40075a5..5dd8f253ef6e 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2060,6 +2060,20 @@ all zeros.  GCC can then try to branch around the instruction instead.",
  (unsigned ifn),
  default_empty_mask_is_expensive)
 
+/* Prefer gather/scatter loads/stores to e.g. elementwise accesses if\n\
+we cannot use a contiguous access.  */
+DEFHOOK
+(prefer_gather_scatter,
+ "This hook returns TRUE if gather loads or scatter stores are cheaper on\n\
+this target than a sequence of elementwise loads or stores.  The @var{mode}\n\
+and @var{scale} correspond to the @code{gather_load} and\n\
+@code{scatter_store} instruction patterns.  The @var{group_size} is the\n\
+number of scalar elements in each scalar loop iteration that are to be\n\
+combined into the vector.",
+ bool,
+ (machine_mode mode, int scale, unsigned int group_size),
+ hook_bool_mode_int_unsigned_false)
+
 /* Target builtin that implements vector gather operation.  */
 DEFHOOK
 (builtin_gather,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 9ac1f91f1674..88a12a1e3189 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1702,19 +1702,32 @@ static bool
 vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info, tree vectype,
                                    loop_vec_info loop_vinfo, bool masked_p,
                                    gather_scatter_info *gs_info,
-                                   vec<int> *elsvals)
+                                   vec<int> *elsvals,
+                                   unsigned int group_size,
+                                   bool single_element_p)
 {
   if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info, elsvals)
       || gs_info->ifn == IFN_LAST)
-    return vect_truncate_gather_scatter_offset (stmt_info, vectype, loop_vinfo,
-                                               masked_p, gs_info, elsvals);
+    {
+      if (!vect_truncate_gather_scatter_offset (stmt_info, vectype, loop_vinfo,
+                                               masked_p, gs_info, elsvals))
+       return false;
+    }
+  else
+    {
+      tree old_offset_type = TREE_TYPE (gs_info->offset);
+      tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
 
-  tree old_offset_type = TREE_TYPE (gs_info->offset);
-  tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
+      gcc_assert (TYPE_PRECISION (new_offset_type)
+                 >= TYPE_PRECISION (old_offset_type));
+      gs_info->offset = fold_convert (new_offset_type, gs_info->offset);
+    }
 
-  gcc_assert (TYPE_PRECISION (new_offset_type)
-             >= TYPE_PRECISION (old_offset_type));
-  gs_info->offset = fold_convert (new_offset_type, gs_info->offset);
+  if (!single_element_p
+      && !targetm.vectorize.prefer_gather_scatter (TYPE_MODE (vectype),
+                                                  gs_info->scale,
+                                                  group_size))
+    return false;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -2262,11 +2275,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   if ((*memory_access_type == VMAT_ELEMENTWISE
        || *memory_access_type == VMAT_STRIDED_SLP)
       && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
-      && single_element_p
       && SLP_TREE_LANES (slp_node) == 1
       && loop_vinfo
       && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
-                                            masked_p, gs_info, elsvals))
+                                            masked_p, gs_info, elsvals,
+                                            group_size, single_element_p))
     *memory_access_type = VMAT_GATHER_SCATTER;
 
   if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
