On Wed, Jul 14, 2021 at 8:38 PM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> On Tue, Jul 13, 2021 at 9:35 PM Hongtao Liu <crazy...@gmail.com> wrote:
> >
> > On Wed, Jul 14, 2021 at 10:34 AM liuhongt <hongtao....@intel.com> wrote:
> > >
> > > By optimizing vector movement to broadcast in ix86_expand_vector_move
> > > during pass_expand, pass_reload/LRA can automatically generate an avx512
> > > embedded broadcast, pass_cpb is not needed.
> > >
> > > Considering that in the absence of avx512f, broadcast from memory is
> > > still slightly faster than loading the entire memory, so always enable
> > > broadcast.
> > >
> > > benchmark:
> > > https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast
> > >
> > > The performance diff
> > >
> > > strategy    : cycles
> > > memory      : 1046611188
> > > memory      : 1255420817
> > > memory      : 1044720793
> > > memory      : 1253414145
> > > average     : 1097868397
> > >
> > > broadcast   : 1044430688
> > > broadcast   : 1044477630
> > > broadcast   : 1253554603
> > > broadcast   : 1044561934
> > > average     : 1096756213
> > >
> > > But however broadcast has larger size.
> > >
> > > the size diff
> > >
> > > size broadcast.o
> > >    text    data     bss     dec     hex filename
> > >     137       0       0     137      89 broadcast.o
> > >
> > > size memory.o
> > >    text    data     bss     dec     hex filename
> > >     115       0       0     115      73 memory.o
> > >
> > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}
> > >
> > > gcc/ChangeLog:
> > >
> > >         * config/i386/i386-expand.c
> > >         (ix86_broadcast_from_integer_constant): Rename to ..
> > >         (ix86_broadcast_from_constant): .. this, and extend it to
> > >         handle float mode.
> > >         (ix86_expand_vector_move): Extend to float mode.
> > >         * config/i386/i386-features.c
> > >         (replace_constant_pool_with_broadcast): Remove.
> > >         (remove_partial_avx_dependency_gate): Ditto.
> > >         (constant_pool_broadcast): Ditto.
> > >         (class pass_constant_pool_broadcast): Ditto.
> > >         (make_pass_constant_pool_broadcast): Ditto.
> > >         (remove_partial_avx_dependency): Adjust gate.
> > >         * config/i386/i386-passes.def: Remove 
> > > pass_constant_pool_broadcast.
> > >         * config/i386/i386-protos.h
> > >         (make_pass_constant_pool_broadcast): Remove.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase.
> > > ---
> > >  gcc/config/i386/i386-expand.c                 |  29 +++-
> > >  gcc/config/i386/i386-features.c               | 157 +-----------------
> > >  gcc/config/i386/i386-passes.def               |   1 -
> > >  gcc/config/i386/i386-protos.h                 |   1 -
> > >  .../gcc.target/i386/fuse-caller-save-xmm.c    |   2 +-
> > >  5 files changed, 26 insertions(+), 164 deletions(-)
> > >
> > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> > > index 69ea79e6123..ba870145acd 100644
> > > --- a/gcc/config/i386/i386-expand.c
> > > +++ b/gcc/config/i386/i386-expand.c
> > > @@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[])
> > >    emit_insn (gen_rtx_SET (op0, op1));
> > >  }
> > >
> > > +/* OP is a memref of CONST_VECTOR, return scalar constant mem
> > > +   if CONST_VECTOR is a vec_duplicate, else return NULL.  */
> > >  static rtx
> > > -ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
> > > +ix86_broadcast_from_constant (machine_mode mode, rtx op)
> > >  {
> > >    int nunits = GET_MODE_NUNITS (mode);
> > >    if (nunits < 2)
> > > @@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode 
> > > mode, rtx op)
> > >
> > >    /* Don't use integer vector broadcast if we can't move from GPR to SSE
> > >       register directly.  */
> > > -  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
> > > +  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
> > > +      && INTEGRAL_MODE_P (mode))
> > >      return nullptr;
> > >
> > >    /* Convert CONST_VECTOR to a non-standard SSE constant integer
> > > @@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode 
> > > mode, rtx op)
> > >    if (!(TARGET_AVX2
> > >         || (TARGET_AVX
> > >             && (GET_MODE_INNER (mode) == SImode
> > > -               || GET_MODE_INNER (mode) == DImode)))
> > > +               || GET_MODE_INNER (mode) == DImode))
> > > +       || FLOAT_MODE_P (mode))
> > >        || standard_sse_constant_p (op, mode))
> > >      return nullptr;
> > >
> > > -  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.  */
> > > -  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT)
> > > +  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
> > > +     We can still put 64-bit integer constant in memory when
> > > +     avx512 embed broadcast is available.  */
> > > +  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
> > > +      && (!TARGET_AVX512F
> > > +         || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
> > >      return nullptr;
> > >
> > >    if (GET_MODE_INNER (mode) == TImode)
> > > @@ -561,17 +569,20 @@ ix86_expand_vector_move (machine_mode mode, rtx 
> > > operands[])
> > >
> > >    if (can_create_pseudo_p ()
> > >        && GET_MODE_SIZE (mode) >= 16
> > > -      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > > +      && VECTOR_MODE_P (mode)
> > >        && (MEM_P (op1)
> > >           && SYMBOL_REF_P (XEXP (op1, 0))
> > >           && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
> > >      {
> > > -      rtx first = ix86_broadcast_from_integer_constant (mode, op1);
> > > +      rtx first = ix86_broadcast_from_constant (mode, op1);
> > >        if (first != nullptr)
> > >         {
> > >           /* Broadcast to XMM/YMM/ZMM register from an integer
> > > -            constant.  */
> > > -         op1 = ix86_gen_scratch_sse_rtx (mode);
> > > +            constant or scalar mem.  */
> > > +         op1 = gen_reg_rtx (mode);
> > I've changed ix86_gen_scratch_sse_rtx to gen_reg_rtx to let LRA/reload
> > make the final decision for register usage, would that make sense?
>
> Hard registers are used for 2 purposes:
>
> 1.  Prevent stack realignment when the original code doesn't use vector
> registers, which is the same for memcpy and memset.
> 2.  Prevent combine to convert constant broadcast to load from constant
> pool.
>
W/ gcc version 12.0.0 20210806 (experimental) (GCC) and replaced
ix86_gen_scratch_sse_rtx with gen_reg_rtx i didn't observe the failure
related to upper cases.
only failure like
gcc.target/i386/pr100865-10b.c scan-assembler-not vzeroupper
gcc.target/i386/pr100865-4b.c scan-assembler-not vzeroupper
gcc.target/i386/pr100865-6b.c scan-assembler-not vzeroupper
I guess it's related to xmm31 usage.

Have you check in all patches rely on this part?

> Please add an argument to ix86_gen_scratch_sse_rtx to use hard registers
> for the above 2 cases.   You can get pseudo registers for your use cases.
>
> > w/o that, hard register used here will prevent LRA/reload combine (set
> > op1 (vec_duplicate: mem)) and (add (op1, op2)) to (add (op1,
> > vec_duplicate: mem))
> >
> > > +         if (FLOAT_MODE_P (mode)
> > > +             || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
> > > +           first = force_const_mem (GET_MODE_INNER (mode), first);
> > >           bool ok = ix86_expand_vector_init_duplicate (false, mode,
> > >                                                        op1, first);
> > >           gcc_assert (ok);
> > > diff --git a/gcc/config/i386/i386-features.c 
> > > b/gcc/config/i386/i386-features.c
> > > index cbd430a2ecf..d9c6652d1c9 100644
> > > --- a/gcc/config/i386/i386-features.c
> > > +++ b/gcc/config/i386/i386-features.c
> > > @@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area 
> > > (gcc::context *ctxt)
> > >    return new pass_insert_endbr_and_patchable_area (ctxt);
> > >  }
> > >
> > > -/* Replace all one-value const vector that are referenced by SYMBOL_REFs 
> > > in x
> > > -   with embedded broadcast. i.e.transform
> > > -
> > > -     vpaddq .LC0(%rip), %zmm0, %zmm0
> > > -     ret
> > > -  .LC0:
> > > -    .quad 3
> > > -    .quad 3
> > > -    .quad 3
> > > -    .quad 3
> > > -    .quad 3
> > > -    .quad 3
> > > -    .quad 3
> > > -    .quad 3
> > > -
> > > -    to
> > > -
> > > -     vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0
> > > -     ret
> > > -  .LC0:
> > > -    .quad 3  */
> > > -static void
> > > -replace_constant_pool_with_broadcast (rtx_insn *insn)
> > > -{
> > > -  subrtx_ptr_iterator::array_type array;
> > > -  FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL)
> > > -    {
> > > -      rtx *loc = *iter;
> > > -      rtx x = *loc;
> > > -      rtx broadcast_mem, vec_dup, constant, first;
> > > -      machine_mode mode;
> > > -
> > > -      /* Constant pool.  */
> > > -      if (!MEM_P (x)
> > > -         || !SYMBOL_REF_P (XEXP (x, 0))
> > > -         || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)))
> > > -       continue;
> > > -
> > > -      /* Const vector.  */
> > > -      mode = GET_MODE (x);
> > > -      if (!VECTOR_MODE_P (mode))
> > > -       return;
> > > -      constant = get_pool_constant (XEXP (x, 0));
> > > -      if (GET_CODE (constant) != CONST_VECTOR)
> > > -       return;
> > > -
> > > -      /* There could be some rtx like
> > > -        (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
> > > -        but with "*.LC1" refer to V2DI constant vector.  */
> > > -      if (GET_MODE (constant) != mode)
> > > -       {
> > > -         constant = simplify_subreg (mode, constant, GET_MODE 
> > > (constant), 0);
> > > -         if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR)
> > > -           return;
> > > -       }
> > > -      first = XVECEXP (constant, 0, 0);
> > > -
> > > -      for (int i = 1; i < GET_MODE_NUNITS (mode); ++i)
> > > -       {
> > > -         rtx tmp = XVECEXP (constant, 0, i);
> > > -         /* Vector duplicate value.  */
> > > -         if (!rtx_equal_p (tmp, first))
> > > -           return;
> > > -       }
> > > -
> > > -      /* Replace with embedded broadcast.  */
> > > -      broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first);
> > > -      vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem);
> > > -      validate_change (insn, loc, vec_dup, 0);
> > > -
> > > -      /* At most 1 memory_operand in an insn.  */
> > > -      return;
> > > -    }
> > > -}
> > > -
> > >  /* At entry of the nearest common dominator for basic blocks with
> > >     conversions, generate a single
> > >         vxorps %xmmN, %xmmN, %xmmN
> > > @@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void)
> > >           if (!NONDEBUG_INSN_P (insn))
> > >             continue;
> > >
> > > -         /* Handle AVX512 embedded broadcast here to save compile time.  
> > > */
> > > -         if (TARGET_AVX512F)
> > > -           replace_constant_pool_with_broadcast (insn);
> > > -
> > >           set = single_set (insn);
> > >           if (!set)
> > >             continue;
> > > @@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void)
> > >    return 0;
> > >  }
> > >
> > > -static bool
> > > -remove_partial_avx_dependency_gate ()
> > > -{
> > > -  return (TARGET_AVX
> > > -         && TARGET_SSE_PARTIAL_REG_DEPENDENCY
> > > -         && TARGET_SSE_MATH
> > > -         && optimize
> > > -         && optimize_function_for_speed_p (cfun));
> > > -}
> > > -
> > >  namespace {
> > >
> > >  const pass_data pass_data_remove_partial_avx_dependency =
> > > @@ -2419,7 +2330,11 @@ public:
> > >    /* opt_pass methods: */
> > >    virtual bool gate (function *)
> > >      {
> > > -      return remove_partial_avx_dependency_gate ();
> > > +      return (TARGET_AVX
> > > +             && TARGET_SSE_PARTIAL_REG_DEPENDENCY
> > > +             && TARGET_SSE_MATH
> > > +             && optimize
> > > +             && optimize_function_for_speed_p (cfun));
> > >      }
> > >
> > >    virtual unsigned int execute (function *)
> > > @@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency 
> > > (gcc::context *ctxt)
> > >    return new pass_remove_partial_avx_dependency (ctxt);
> > >  }
> > >
> > > -/* For const vector having one duplicated value, there's no need to put
> > > -   whole vector in the constant pool when target supports embedded 
> > > broadcast. */
> > > -static unsigned int
> > > -constant_pool_broadcast (void)
> > > -{
> > > -  timevar_push (TV_MACH_DEP);
> > > -  rtx_insn *insn;
> > > -
> > > -  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
> > > -    {
> > > -      if (INSN_P (insn))
> > > -       replace_constant_pool_with_broadcast (insn);
> > > -    }
> > > -  timevar_pop (TV_MACH_DEP);
> > > -  return 0;
> > > -}
> > > -
> > > -namespace {
> > > -
> > > -const pass_data pass_data_constant_pool_broadcast =
> > > -{
> > > -  RTL_PASS, /* type */
> > > -  "cpb", /* name */
> > > -  OPTGROUP_NONE, /* optinfo_flags */
> > > -  TV_MACH_DEP, /* tv_id */
> > > -  0, /* properties_required */
> > > -  0, /* properties_provided */
> > > -  0, /* properties_destroyed */
> > > -  0, /* todo_flags_start */
> > > -  TODO_df_finish, /* todo_flags_finish */
> > > -};
> > > -
> > > -class pass_constant_pool_broadcast : public rtl_opt_pass
> > > -{
> > > -public:
> > > -  pass_constant_pool_broadcast (gcc::context *ctxt)
> > > -    : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt)
> > > -  {}
> > > -
> > > -  /* opt_pass methods: */
> > > -  virtual bool gate (function *)
> > > -    {
> > > -      /* Return false if rpad pass gate is true.
> > > -        replace_constant_pool_with_broadcast is called
> > > -        from both this pass and rpad pass.  */
> > > -      return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ());
> > > -    }
> > > -
> > > -  virtual unsigned int execute (function *)
> > > -    {
> > > -      return constant_pool_broadcast ();
> > > -    }
> > > -}; // class pass_cpb
> > > -
> > > -} // anon namespace
> > > -
> > > -rtl_opt_pass *
> > > -make_pass_constant_pool_broadcast (gcc::context *ctxt)
> > > -{
> > > -  return new pass_constant_pool_broadcast (ctxt);
> > > -}
> > > -
> > >  /* This compares the priority of target features in function DECL1
> > >     and DECL2.  It returns positive value if DECL1 is higher priority,
> > >     negative value if DECL2 is higher priority and 0 if they are the
> > > diff --git a/gcc/config/i386/i386-passes.def 
> > > b/gcc/config/i386/i386-passes.def
> > > index 44df00e94ac..29baf8acd0b 100644
> > > --- a/gcc/config/i386/i386-passes.def
> > > +++ b/gcc/config/i386/i386-passes.def
> > > @@ -33,4 +33,3 @@ along with GCC; see the file COPYING3.  If not see
> > >    INSERT_PASS_BEFORE (pass_shorten_branches, 1, 
> > > pass_insert_endbr_and_patchable_area);
> > >
> > >    INSERT_PASS_AFTER (pass_combine, 1, 
> > > pass_remove_partial_avx_dependency);
> > > -  INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast);
> > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > > index 51376fcc454..07ac02aff69 100644
> > > --- a/gcc/config/i386/i386-protos.h
> > > +++ b/gcc/config/i386/i386-protos.h
> > > @@ -395,4 +395,3 @@ extern rtl_opt_pass 
> > > *make_pass_insert_endbr_and_patchable_area
> > >    (gcc::context *);
> > >  extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
> > >    (gcc::context *);
> > > -extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *);
> > > diff --git a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c 
> > > b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c
> > > index 4deff93c1e8..b0d3dc38a0c 100644
> > > --- a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c
> > > +++ b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c
> > > @@ -6,7 +6,7 @@ typedef double v2df __attribute__((vector_size (16)));
> > >  static v2df __attribute__((noinline))
> > >  bar (v2df a)
> > >  {
> > > -  return a + (v2df){ 3.0, 3.0 };
> > > +  return a + (v2df){ 3.0, 4.0 };
> > >  }
> > >
> > >  v2df __attribute__((noinline))
> > > --
> > > 2.18.1
> > >
> >
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> H.J.



-- 
BR,
Hongtao

Reply via email to