https://gcc.gnu.org/g:664e0ce580a8f20a78aa355c42222e4647841f77

commit r15-3735-g664e0ce580a8f20a78aa355c42222e4647841f77
Author: Richard Biener <rguent...@suse.de>
Date:   Fri Sep 20 12:17:22 2024 +0200

    Fall back to elementwise access for too widely spaced SLP single-element interleaving
    
    gcc.dg/vect/vect-pr111779.c is a case where non-SLP manages to vectorize
    using VMAT_ELEMENTWISE but SLP currently refuses, because a regular
    access with permutes would cause excess vector loads, each with at most
    one element used.  The following makes SLP fall back to elementwise
    accesses in that case, too.
    
            * tree-vect-stmts.cc (get_group_load_store_type): Fall back
            to VMAT_ELEMENTWISE for single-element interleaving when the
            group size exceeds the number of vector elements.
            (vectorizable_load): Do not try to verify load permutations
            when using VMAT_ELEMENTWISE for single-lane SLP and fix code
            generation for this case.
    
            * gfortran.dg/vect/vect-8.f90: Allow one more vectorized loop.

Diff:
---
 gcc/testsuite/gfortran.dg/vect/vect-8.f90 |  2 +-
 gcc/tree-vect-stmts.cc                    | 37 ++++++++++++++++++-------------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 b/gcc/testsuite/gfortran.dg/vect/vect-8.f90
index 2a3fa90740e3..918eddee292f 100644
--- a/gcc/testsuite/gfortran.dg/vect/vect-8.f90
+++ b/gcc/testsuite/gfortran.dg/vect/vect-8.f90
@@ -708,5 +708,5 @@ END SUBROUTINE kernel
 
 ! { dg-final { scan-tree-dump-times "vectorized 2\[56\] loops" 1 "vect" { 
target aarch64_sve } } }
 ! { dg-final { scan-tree-dump-times "vectorized 2\[45\] loops" 1 "vect" { 
target { aarch64*-*-* && { ! aarch64_sve } } } } }
-! { dg-final { scan-tree-dump-times "vectorized 2\[345\] loops" 1 "vect" { 
target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } }
+! { dg-final { scan-tree-dump-times "vectorized 2\[3456\] loops" 1 "vect" { 
target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } }
 ! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { 
{ ! vect_intdouble_cvt } && { ! aarch64*-*-* } } } } }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 33cdccae7849..45003f762ddf 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2190,11 +2190,12 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
              && single_element_p
              && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
            {
+             *memory_access_type = VMAT_ELEMENTWISE;
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "single-element interleaving not supported "
-                                "for not adjacent vector loads\n");
-             return false;
+                                "for not adjacent vector loads, using "
+                                "elementwise access\n");
            }
        }
     }
@@ -10039,7 +10040,23 @@ vectorizable_load (vec_info *vinfo,
   else
     group_size = 1;
 
-  if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+  vect_memory_access_type memory_access_type;
+  enum dr_alignment_support alignment_support_scheme;
+  int misalignment;
+  poly_int64 poffset;
+  internal_fn lanes_ifn;
+  if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, 
VLS_LOAD,
+                           ncopies, &memory_access_type, &poffset,
+                           &alignment_support_scheme, &misalignment, &gs_info,
+                           &lanes_ifn))
+    return false;
+
+  /* ???  The following checks should really be part of
+     get_group_load_store_type.  */
+  if (slp
+      && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+      && !(memory_access_type == VMAT_ELEMENTWISE
+          && SLP_TREE_LANES (slp_node) == 1))
     {
       slp_perm = true;
 
@@ -10079,17 +10096,6 @@ vectorizable_load (vec_info *vinfo,
        }
     }
 
-  vect_memory_access_type memory_access_type;
-  enum dr_alignment_support alignment_support_scheme;
-  int misalignment;
-  poly_int64 poffset;
-  internal_fn lanes_ifn;
-  if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, 
VLS_LOAD,
-                           ncopies, &memory_access_type, &poffset,
-                           &alignment_support_scheme, &misalignment, &gs_info,
-                           &lanes_ifn))
-    return false;
-
   if (slp_node
       && slp_node->ldst_lanes
       && memory_access_type != VMAT_LOAD_STORE_LANES)
@@ -10292,7 +10298,8 @@ vectorizable_load (vec_info *vinfo,
          first_dr_info = dr_info;
        }
 
-      if (slp && grouped_load)
+      if (slp && grouped_load
+         && memory_access_type == VMAT_STRIDED_SLP)
        {
          group_size = DR_GROUP_SIZE (first_stmt_info);
          ref_type = get_group_alias_ptr_type (first_stmt_info);

Reply via email to