On Mon, Dec 4, 2023 at 3:51 PM Uros Bizjak <ubiz...@gmail.com> wrote:
>
> On Mon, Dec 4, 2023 at 8:11 AM Hongtao Liu <crazy...@gmail.com> wrote:
> >
> > On Fri, Dec 1, 2023 at 10:26 PM Richard Biener
> > <richard.guent...@gmail.com> wrote:
> > >
> > > On Fri, Dec 1, 2023 at 3:39 AM liuhongt <hongtao....@intel.com> wrote:
> > > >
> > > > > Hmm, I would suggest you put reg_needed into the class and accumulate
> > > > > over all vec_construct, with your patch you pessimize a single v32qi
> > > > > over two separate v16qi for example.  Also currently the whole block 
> > > > > is
> > > > > gated with INTEGRAL_TYPE_P but register pressure would be also
> > > > > a concern for floating point vectors.  finish_cost would then apply an
> > > > > adjustment.
> > > >
> > > > Changed.
> > > >
> > > > > 'target_avail_regs' is for GENERAL_REGS, does that include APX regs?
> > > > > I don't see anything similar for FP regs, but I guess the target 
> > > > > should know
> > > > > or maybe there's a #regs in regclass query already.
> > > > Haven't see any, use below setting.
> > > >
> > > > unsigned target_avail_sse = TARGET_64BIT ? (TARGET_AVX512F ? 32 : 16) : 
> > > > 8;
> > > >
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > No big impact on SPEC2017.
> > > > Observe 1 big improvement from other benchmark by avoiding 
> > > > vectorization with
> > > > vec_construct v32qi which caused lots of spills.
> > > >
> > > > Ok for trunk?
> > >
> > > LGTM, let's see what x86 maintainers think.
> > +Honza and Uros.
> > Any comments?
>
> I have no comment on vector stuff, I think you are the most
> experienced developer in this area.
Thanks, committed.
>
> Uros.
>
> > >
> > > Richard.
> > >
> > > > For vec_contruct, the components must be live at the same time if
> > > > they're not loaded from memory, when the number of those components
> > > > exceeds available registers, spill happens. Try to account that with a
> > > > rough estimation.
> > > > ??? Ideally, we should have an overall estimation of register pressure
> > > > if we know the live range of all variables.
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
> > > >         Count sse_reg/gpr_regs for components not loaded from memory.
> > > >         (ix86_vector_costs:ix86_vector_costs): New constructor.
> > > >         (ix86_vector_costs::m_num_gpr_needed[3]): New private memeber.
> > > >         (ix86_vector_costs::m_num_sse_needed[3]): Ditto.
> > > >         (ix86_vector_costs::finish_cost): Estimate overall register
> > > >         pressure cost.
> > > >         (ix86_vector_costs::ix86_vect_estimate_reg_pressure): New
> > > >         function.
> > > > ---
> > > >  gcc/config/i386/i386.cc | 54 ++++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 50 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > > index 9390f525b99..dcaea6c2096 100644
> > > > --- a/gcc/config/i386/i386.cc
> > > > +++ b/gcc/config/i386/i386.cc
> > > > @@ -24562,15 +24562,34 @@ ix86_noce_conversion_profitable_p (rtx_insn 
> > > > *seq, struct noce_if_info *if_info)
> > > >  /* x86-specific vector costs.  */
> > > >  class ix86_vector_costs : public vector_costs
> > > >  {
> > > > -  using vector_costs::vector_costs;
> > > > +public:
> > > > +  ix86_vector_costs (vec_info *, bool);
> > > >
> > > >    unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
> > > >                               stmt_vec_info stmt_info, slp_tree node,
> > > >                               tree vectype, int misalign,
> > > >                               vect_cost_model_location where) override;
> > > >    void finish_cost (const vector_costs *) override;
> > > > +
> > > > +private:
> > > > +
> > > > +  /* Estimate register pressure of the vectorized code.  */
> > > > +  void ix86_vect_estimate_reg_pressure ();
> > > > +  /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used 
> > > > for
> > > > +     estimation of register pressure.
> > > > +     ??? Currently it's only used by vec_construct/scalar_to_vec
> > > > +     where we know it's not loaded from memory.  */
> > > > +  unsigned m_num_gpr_needed[3];
> > > > +  unsigned m_num_sse_needed[3];
> > > >  };
> > > >
> > > > +ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool 
> > > > costing_for_scalar)
> > > > +  : vector_costs (vinfo, costing_for_scalar),
> > > > +    m_num_gpr_needed (),
> > > > +    m_num_sse_needed ()
> > > > +{
> > > > +}
> > > > +
> > > >  /* Implement targetm.vectorize.create_costs.  */
> > > >
> > > >  static vector_costs *
> > > > @@ -24748,8 +24767,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
> > > > vect_cost_for_stmt kind,
> > > >      }
> > > >    else if ((kind == vec_construct || kind == scalar_to_vec)
> > > >            && node
> > > > -          && SLP_TREE_DEF_TYPE (node) == vect_external_def
> > > > -          && INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
> > > > +          && SLP_TREE_DEF_TYPE (node) == vect_external_def)
> > > >      {
> > > >        stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, 
> > > > misalign);
> > > >        unsigned i;
> > > > @@ -24785,7 +24803,15 @@ ix86_vector_costs::add_stmt_cost (int count, 
> > > > vect_cost_for_stmt kind,
> > > >                   && (gimple_assign_rhs_code (def) != BIT_FIELD_REF
> > > >                       || !VECTOR_TYPE_P (TREE_TYPE
> > > >                                 (TREE_OPERAND (gimple_assign_rhs1 
> > > > (def), 0))))))
> > > > -           stmt_cost += ix86_cost->sse_to_integer;
> > > > +           {
> > > > +             if (fp)
> > > > +               m_num_sse_needed[where]++;
> > > > +             else
> > > > +               {
> > > > +                 m_num_gpr_needed[where]++;
> > > > +                 stmt_cost += ix86_cost->sse_to_integer;
> > > > +               }
> > > > +           }
> > > >         }
> > > >        FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
> > > >         if (TREE_CODE (op) == SSA_NAME)
> > > > @@ -24821,6 +24847,24 @@ ix86_vector_costs::add_stmt_cost (int count, 
> > > > vect_cost_for_stmt kind,
> > > >    return retval;
> > > >  }
> > > >
> > > > +void
> > > > +ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
> > > > +{
> > > > +  unsigned gpr_spill_cost = COSTS_N_INSNS (ix86_cost->int_store [2]) / 
> > > > 2;
> > > > +  unsigned sse_spill_cost = COSTS_N_INSNS (ix86_cost->sse_store[0]) / 
> > > > 2;
> > > > +
> > > > +  /* Any better way to have target available fp registers, currently 
> > > > use SSE_REGS.  */
> > > > +  unsigned target_avail_sse = TARGET_64BIT ? (TARGET_AVX512F ? 32 : 
> > > > 16) : 8;
> > > > +  for (unsigned i = 0; i != 3; i++)
> > > > +    {
> > > > +      if (m_num_gpr_needed[i] > target_avail_regs)
> > > > +       m_costs[i] += gpr_spill_cost * (m_num_gpr_needed[i] - 
> > > > target_avail_regs);
> > > > +      /* Only measure sse registers pressure.  */
> > > > +      if (TARGET_SSE && (m_num_sse_needed[i] > target_avail_sse))
> > > > +       m_costs[i] += sse_spill_cost * (m_num_sse_needed[i] - 
> > > > target_avail_sse);
> > > > +    }
> > > > +}
> > > > +
> > > >  void
> > > >  ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
> > > >  {
> > > > @@ -24843,6 +24887,8 @@ ix86_vector_costs::finish_cost (const 
> > > > vector_costs *scalar_costs)
> > > >         m_costs[vect_body] = INT_MAX;
> > > >      }
> > > >
> > > > +  ix86_vect_estimate_reg_pressure ();
> > > > +
> > > >    vector_costs::finish_cost (scalar_costs);
> > > >  }
> > > >
> > > > --
> > > > 2.31.1
> > > >
> >
> >
> >
> > --
> > BR,
> > Hongtao



-- 
BR,
Hongtao

Reply via email to