Hi all,

Recently I have been investigating an issue related to the choice between
D-form and X-form vector memory accesses.  It is the same issue that the patch
https://gcc.gnu.org/ml/gcc-patches/2019-10/msg01879.html
was intended to deal with.  Power9 introduces DQ-form instructions for vector
memory access, and we prefer to use DQ-form when unrolling loops.  As the
example in the above link shows, this reduces the number of ADDI instructions
and the number of GPRs needed for indexing.

Or for below example:

        extern void dummy (double, unsigned n);

        /* For each j in [1, m), sum x[i] * y[i] over i in [1, n) and pass
           the sum together with n to dummy.  */
        void
        func (double *x, double *y, unsigned m, unsigned n)
        {
          double sacc;
          for (unsigned j = 1; j < m; j++)
          {
            /* Restart the accumulation for every outer iteration.  */
            sacc = 0.0;
            for (unsigned i = 1; i < n; i++)
              sacc = sacc + x[i] * y[i];
            dummy (sacc, n);
          }
        }

Core loop with X-form (lxvx):
/*
        mtctr   r10
        lxvx    vs12,r31,r9
        lxvx    vs0,r30,r9
        addi    r10,r9,16
        addi    r9,r9,32
        xvmaddadp vs32,vs12,vs0
        lxvx    vs12,r31,r10
        lxvx    vs0,r30,r10
        xvmaddadp vs11,vs12,vs0
        lxvx    vs12,r31,r9
        lxvx    vs0,r30,r9
        addi    r9,r10,32
        xvmaddadp vs32,vs12,vs0
        lxvx    vs12,r31,r9
        lxvx    vs0,r30,r9
        addi    r9,r10,48
        xvmaddadp vs11,vs12,vs0
        lxvx    vs12,r31,r9
        lxvx    vs0,r30,r9
        addi    r9,r10,64
        xvmaddadp vs32,vs12,vs0
        lxvx    vs12,r31,r9
        lxvx    vs0,r30,r9
        addi    r9,r10,80
        xvmaddadp vs11,vs12,vs0
        lxvx    vs12,r31,r9
        lxvx    vs0,r30,r9
        addi    r9,r10,96
        xvmaddadp vs32,vs12,vs0
        lxvx    vs12,r31,r9
        lxvx    vs0,r30,r9
        addi    r9,r10,112
        xvmaddadp vs11,vs12,vs0
        bdnz    190 <func+0x190>
*/

vs.

Core loop with D-form (lxv):
/*
        mtctr   r8
        lxv     vs12,0(r9)
        lxv     vs0,0(r10)
        addi    r7,r9,16  // r7, r8 can be eliminated further with r9, r10
        addi    r8,r10,16 // 2 or 4 addi vs. 8 addi above
        addi    r9,r9,128    
        addi    r10,r10,128  
        xvmaddadp vs32,vs12,vs0
        lxv     vs12,-112(r9)
        lxv     vs0,-112(r10)
        xvmaddadp vs11,vs12,vs0
        lxv     vs12,16(r7)
        lxv     vs0,16(r8)
        xvmaddadp vs32,vs12,vs0
        lxv     vs12,32(r7)
        lxv     vs0,32(r8)
        xvmaddadp vs11,vs12,vs0
        lxv     vs12,48(r7)
        lxv     vs0,48(r8)
        xvmaddadp vs32,vs12,vs0
        lxv     vs12,64(r7)
        lxv     vs0,64(r8)
        xvmaddadp vs11,vs12,vs0
        lxv     vs12,80(r7)
        lxv     vs0,80(r8)
        xvmaddadp vs32,vs12,vs0
        lxv     vs12,96(r7)
        lxv     vs0,96(r8)
        xvmaddadp vs11,vs12,vs0
        bdnz    1b0 <func+0x1b0>
*/

We are wondering whether this can be handled in IVOPTs instead of in a
separate RTL pass.

When IVOPTs selects IV candidates, it doesn't know that the loop will be
unrolled, so it doesn't account for the extra step cost incurred with X-form.
If we can teach it to consider this case, the IV candidates that work well
with D-form can be preferred.  Currently (incomplete) unrolling happens in RTL,
so it looks like we have to predict in IVOPTs whether the loop will be
unrolled.  Since the unrolling decision involves some parameter checks on RTL
insn counts and target hooks, that seems hard to predict accurately.  Besides,
we need to check that the step is valid for the D-form offset field (e.g.
DQ-form requires the offset to be a multiple of 16), to ensure that no extra
ADDIs are needed.

I'm not sure whether it's a good idea to implement this in IVOPTs, but I made
some changes in IVOPTs to prove that it's possible to get the expected code; the
patch is attached.

Any comments/suggestions are highly appreciated!

BR,
Kewen
diff --git a/gcc/common.opt b/gcc/common.opt
index 404b6aa..0d3f8f8 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1465,6 +1465,10 @@ ffinite-loops
 Common Report Var(flag_finite_loops) Optimization
 Assume that loops with an exit will terminate and not loop indefinitely.
 
+fivopts-dform
+Common Report Var(flag_ivopts_dform) Init(1) Optimization
+Assume D-form is preferred in IVOPTS like unrolling.
+
 ffixed-
 Common Joined RejectNegative Var(common_deferred_options) Defer
 -ffixed-<register>     Mark <register> as being unavailable to the compiler.
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 2995348..588feac 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1654,6 +1654,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_PREDICT_DOLOOP_P
 #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
 
+#undef TARGET_D_FORM_SUITABLE_P
+#define TARGET_D_FORM_SUITABLE_P rs6000_d_form_suitable_p
+
 #undef TARGET_HAVE_COUNT_REG_DECR_P
 #define TARGET_HAVE_COUNT_REG_DECR_P true
 
@@ -26258,6 +26261,28 @@ rs6000_predict_doloop_p (struct loop *loop)
   return true;
 }
 
+/* Implement TARGET_D_FORM_SUITABLE_P.  */
+
+static bool
+rs6000_d_form_suitable_p (machine_mode mode, signed HOST_WIDE_INT val,
+                          bool is_sig, bool is_store)
+{
+  /* Only Power9 and above supports DQ form; VAL must be a multiple of 16.  */
+  if (VECTOR_MODE_P (mode))
+    return mode_supports_dq_form (mode) && !(val & 0xF);
+
+  if (mode == QImode || mode == HImode || mode == SFmode || mode == DFmode)
+    return true;
+
+  /* A signed access that is not a store needs VAL to be a multiple of 4.  */
+  if (mode == SImode)
+    return !is_sig || is_store || !(val & 0x3);
+  /* VAL must be a multiple of 4.  */
+  if (mode == DImode)
+    return !(val & 0x3);
+  return false;
+}
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-rs6000.h"
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 2244df4..f5a86e3 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11677,6 +11677,13 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_D_FORM_SUITABLE_P (machine_mode @var{mode}, signed HOST_WIDE_INT @var{val}, bool @var{is_sig}, bool @var{is_store})
+Return true if we find the given memory access with mode @var{mode} offset
+@var{val} and flag signedness @var{is_sig} flag is_store @var{is_store} have
+D-form support.
+The default version of this hook returns false.
+@end deftypefn
+
 @deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
 Return true if the target supports hardware count register for decrement
 and branch.
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 52cd603..d79c2d7 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7955,6 +7955,8 @@ to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_D_FORM_SUITABLE_P
+
 @hook TARGET_HAVE_COUNT_REG_DECR_P
 
 @hook TARGET_DOLOOP_COST_FOR_GENERIC
diff --git a/gcc/target.def b/gcc/target.def
index e705c5d..3fc4b28 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4292,7 +4292,17 @@ DEFHOOK
  emits a @code{speculation_barrier} instruction if that is defined.",
 rtx, (machine_mode mode, rtx result, rtx val, rtx failval),
  default_speculation_safe_value)
- 
+
+DEFHOOK
+(d_form_suitable_p,
+ "Return true if we find the given memory access with mode @var{mode} offset\n\
+@var{val} and flag signedness @var{is_sig} flag is_store @var{is_store} have\n\
+D-form support.\n\
+The default version of this hook returns false.",
+ bool, (machine_mode mode, signed HOST_WIDE_INT val, bool is_sig,
+ bool is_store),
+ default_d_form_suitable_p)
+
 DEFHOOK
 (predict_doloop_p,
  "Return true if we can predict it is possible to use a low-overhead loop\n\
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index fd8e435..7872d4d 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -650,6 +650,15 @@ default_predict_doloop_p (class loop *loop ATTRIBUTE_UNUSED)
   return false;
 }
 
+/* Default implementation of TARGET_D_FORM_SUITABLE_P: no memory access is
+   reported as suitable for D-form.  */
+
+bool
+default_d_form_suitable_p (machine_mode, signed HOST_WIDE_INT, bool, bool)
+{
+  return false;
+}
+
 /* NULL if INSN insn is valid within a low-overhead loop, otherwise returns
    an error message.
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index e041291..590cf8a 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -88,6 +88,8 @@ extern bool default_fixed_point_supported_p (void);
 extern bool default_has_ifunc_p (void);
 
 extern bool default_predict_doloop_p (class loop *);
+extern bool default_d_form_suitable_p (machine_mode, signed HOST_WIDE_INT, bool,
+                                      bool);
 extern const char * default_invalid_within_doloop (const rtx_insn *);
 
 extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index ab52cbe..526fc08 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -429,6 +429,8 @@ struct iv_group
   struct iv_cand *selected;
   /* To indicate this is a doloop use group.  */
   bool doloop_p;
+  /* To indicate this group prefer D-form.  */
+  bool d_form_p;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -650,6 +652,11 @@ struct ivopts_data
 
   /* Whether the loop has doloop comparison use.  */
   bool doloop_use_p;
+
+  /* Whether the loop is likely to unrolling and its unroll factor, consider
+     it won't unroll if it's less than two.  */
+  unsigned short unroll_factor;
+
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -2724,6 +2731,70 @@ split_address_groups (struct ivopts_data *data)
     }
 }
 
+/* Go through all address type groups and mark as D-form preferred those
+   whose uses all have a constant step that the target accepts for D-form
+   (see targetm.d_form_suitable_p).  */
+
+static void
+mark_d_form_groups (struct ivopts_data *data)
+{
+  auto_bitmap cands;
+  bool dump_details = (dump_file && (dump_flags & TDF_DETAILS));
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (!address_p (group->type))
+        continue;
+      bool found = true;
+      for (unsigned j = 0; j < group->vuses.length (); j++)
+        {
+          struct iv_use *use = group->vuses[j];
+          gcc_assert (use->mem_type);
+          /* Ensure the step fits into the D-form field.  */
+          if (TREE_CODE (use->iv->step) != INTEGER_CST
+              || !tree_fits_shwi_p (use->iv->step))
+            {
+              found = false;
+              if (dump_details)
+                fprintf (dump_file, " Group use %u.%u doesn't have constant"
+                         " step for D-form.\n", i, j);
+              break;
+            }
+          /* A load defines an SSA name; any other LHS is a store.  The
+             hook documents IS_SIG as the signedness flag: true if signed.  */
+          tree lhs = gimple_get_lhs (use->stmt);
+          bool is_store = lhs && TREE_CODE (lhs) != SSA_NAME;
+          if (!targetm.d_form_suitable_p (TYPE_MODE (use->mem_type),
+                                          tree_to_shwi (use->iv->step),
+                                          !TYPE_UNSIGNED (use->mem_type),
+                                          is_store))
+            {
+              found = false;
+              if (dump_details)
+                fprintf (dump_file, " Group use %u.%u isn't suitable"
+                         " for D-form.\n", i, j);
+              break;
+            }
+        }
+      if (found)
+        bitmap_set_bit (cands, i);
+    }
+
+  /* If there are more groups than number of unroll factors, we need more base
+     update instructions.  */
+  if (bitmap_count_bits (cands) <= data->unroll_factor)
+    {
+      unsigned i;
+      bitmap_iterator bi;
+      EXECUTE_IF_SET_IN_BITMAP (cands, 0, i, bi)
+      {
+        data->vgroups[i]->d_form_p = true;
+        if (dump_details)
+          fprintf (dump_file, "Mark group %u as D-form preferred.\n", i);
+      }
+    }
+}
+
 /* Finds uses of the induction variables that are interesting.  */
 
 static void
@@ -2755,6 +2826,9 @@ find_interesting_uses (struct ivopts_data *data)
 
   split_address_groups (data);
 
+  if (data->unroll_factor >= 2)
+    mark_d_form_groups (data);
+
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
       fprintf (dump_file, "\n<IV Groups>:\n");
@@ -3788,6 +3862,39 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
   return NULL_TREE;
 }
 
+/* Predict whether the given loop will be unrolled and its unroll factor if so.
+   Return value is the unroll factor, zero or one means it's unlikely to
+   unroll.  */
+
+static unsigned short
+predict_unroll (struct ivopts_data *data)
+{
+  class loop *loop = data->current_loop;
+
+  /* FIXME: call target hook for target dependent unroll checks.  */
+
+  /* Have explicit unroll factor.  USHRT_MAX in loop->unroll requests
+     unrolling without naming a factor, so that value falls through to
+     the estimate below instead of being returned as the factor.  */
+  if (loop->unroll && loop->unroll != USHRT_MAX)
+    return loop->unroll;
+
+  /* No unrolls specified.  */
+  if (!loop->unroll && !flag_unroll_loops && !flag_unroll_all_loops)
+    return 1;
+
+  /* Something stops unrolling.  */
+  if (optimize_loop_for_size_p (loop)
+      || !can_duplicate_loop_p (loop)
+      || loop->inner != NULL)
+    return 1;
+
+  /* FIXME: check the loop instructions, average instructions.  */
+
+  /* Simple return 8 for now.  */
+  return 8;
+}
+
 /* Predict whether the given loop will be transformed in the RTL
    doloop_optimize pass.  Attempt to duplicate some doloop_optimize checks.
    This is only for target independent checks, see targetm.predict_doloop_p
@@ -4982,6 +5089,14 @@ determine_group_iv_cost_address (struct ivopts_data *data,
        sum_cost = infinite_cost;
     }
 
+  /* If it isn't address based iv candidate, it's impossible to play with
+     D-form, increase the cost by considering step updates in unroll body.
+     FIXME: for the same iv candidate, this cost should be computed once across
+     multiple D-form groups.
+   */
+  if (group->d_form_p && !sum_cost.infinite_cost_p () && !cand->iv->base_object)
+    sum_cost += (data->unroll_factor - 1) * cand->cost_step;
+
   /* Uses in a group can share setup code, so only add setup cost once.  */
   cost -= cost.scratch;
   /* Compute and add costs for rest uses of this group.  */
@@ -7984,6 +8099,9 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, class loop *loop,
   if (!find_induction_variables (data))
     goto finish;
 
+  if (flag_ivopts_dform)
+    data->unroll_factor = predict_unroll (data);
+
   /* Finds interesting uses (item 1).  */
   find_interesting_uses (data);
   if (data->vgroups.length () > MAX_CONSIDERED_GROUPS)

Reply via email to